In [3]:
import pyspark.pandas as ps
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the merged, pre-processed CSV (stored GBK-encoded), keep a pickle copy
# next to it, and preview the first rows.
source_csv = "../data/fdj_data_merge_processed.csv"
pickle_copy = "../data/fdj_data_merge_processed.pkl"

data = pd.read_csv(source_csv, encoding="gbk")
data.to_pickle(pickle_copy)
data.head(5)

Unnamed: 0,id,10MKA01CT303,10MKA01CT365,10MKA01CT353,10MKA01CT329,10MKA01CT319,TIME,10MKA01CT612,10MKA01CT335,10MKA10CE301XQ02,...,10MKA01CT359,10MKA01CT379,10MKA01CT613,10MKA01CT323,10MKA01CT615,10MKA01CT603,10MKA01CT618,10MKA10CE101XQ01,10MKA01CT315,10MKA01CT616
0,1,49.430996,49.470055,49.723679,48.836071,49.343246,2021-01-01 00:00:00.0,45.218296,48.531802,30.699219,...,48.836071,48.836071,42.985345,48.455738,43.124226,45.648443,45.444761,8.833629,48.962856,45.571431
1,2,49.557922,49.596863,49.850502,48.962856,49.470055,2021-01-01 00:00:00.0,45.103046,48.582512,35.494482,...,48.962856,48.962856,43.575311,48.582512,43.517502,45.425684,45.778702,8.947504,49.216446,46.491872
2,3,49.938732,49.977325,50.230991,49.343246,49.799773,2021-01-01 00:00:00.0,44.845448,48.962856,10.618408,...,49.216446,49.343246,43.35163,48.962856,43.3632,45.736603,45.740217,8.936005,49.596863,46.556899
3,4,49.81179,49.850502,50.230991,49.343246,49.723679,2021-01-01 00:00:00.0,45.08757,48.836071,7.450049,...,49.216446,49.216446,43.525094,48.836071,43.617587,45.525391,45.740217,8.643852,49.596863,46.303973
4,5,49.050243,49.089649,49.470055,48.582512,48.962856,2021-01-01 00:00:00.0,44.495407,48.202209,-6.679443,...,48.582512,48.582512,42.641618,48.202209,42.769016,45.295003,44.975907,7.387082,48.836071,45.632652


In [None]:
def convert_sliding_window_data(data, target_column, window_size):
    """Turn a time-ordered frame into lag, rolling-statistic and calendar features.

    For every column except ``TIME``:

    * one lagged copy per entry of ``window_size`` is added, named
      ``<column>_<lag>``;
    * for every entry of ``window_size`` except the last two, the rolling
      min / max / mean / std / median over that window is added, shifted by one
      row so that a row only sees earlier rows, named
      ``<column>_rolling_<window>_<stat>``.

    ``Day``, ``DayOfWeek``, ``Hour`` and ``minute`` are derived from ``TIME``.
    Finally every row containing a missing value is dropped and the original
    (un-lagged) columns are removed, except ``TIME`` and the target.

    Args:
        data: pandas DataFrame with a ``TIME`` column. Rows are used in their
            existing order, so the caller must sort by time beforehand. The
            frame is copied and never modified.
        target_column: sequence whose first element is the name of the target
            column to keep in the result.
        window_size: sequence of integer lags / window lengths.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.

    Raises:
        ValueError: if ``data`` has no ``TIME`` column, or ``target_column[0]``
            is not one of the non-``TIME`` columns.

    Note:
        With pandas' default sample standard deviation, a rolling window of
        length 1 yields an all-NaN ``std`` column, so every row is dropped.
    """
    df_sliding_window = data.copy()
    new_elements = df_sliding_window.columns.tolist()
    new_elements.remove("TIME")
    features = df_sliding_window[new_elements]

    # Lag features: shift every variable by every requested lag.
    shift_cols = []
    shift_col_names = []
    for column in new_elements:
        for window in window_size:
            shift_cols.append(features[column].shift(window))
            shift_col_names.append(f'{column}_{window}')

    # Rolling statistics per window. The last two entries of window_size are
    # only used as lags, never as rolling windows.
    stat_names = ('min', 'max', 'mean', 'std', 'median')
    stats_cols = []
    stats_col_names = []
    for window in window_size[:-2]:
        for stat in stat_names:
            rolled = features.apply(
                # Bind the loop values as defaults so the lambda is self-contained.
                lambda series, w=window, name=stat: getattr(series.rolling(window=w), name)()
            )
            # shift(1) excludes the current row from its own statistic.
            stats_cols.append(rolled.shift(1))
            stats_col_names.extend(
                f'{column}_rolling_{window}_{stat}' for column in new_elements
            )

    # Assemble all feature groups in one concat. Empty groups are skipped
    # because pd.concat raises on an empty list (e.g. when window_size has two
    # or fewer entries there are no rolling statistics at all).
    parts = [df_sliding_window]
    for group, names in ((shift_cols, shift_col_names), (stats_cols, stats_col_names)):
        if group:
            block = pd.concat(group, axis=1)
            block.columns = names
            parts.append(block)
    df_sliding_window = pd.concat(parts, axis=1)

    # Calendar features. TIME read from CSV arrives as text, which has no
    # ``.dt`` accessor, so parse it first; the TIME column itself is left as-is.
    time_values = df_sliding_window['TIME']
    if pd.api.types.is_object_dtype(time_values) or pd.api.types.is_string_dtype(time_values):
        time_values = pd.to_datetime(time_values)
    df_sliding_window['Day'] = time_values.dt.day
    df_sliding_window['DayOfWeek'] = time_values.dt.dayofweek
    df_sliding_window['Hour'] = time_values.dt.hour
    df_sliding_window['minute'] = time_values.dt.minute

    # Drop incomplete rows, then the raw variables (the target is kept).
    columns_to_drop = list(new_elements)
    columns_to_drop.remove(target_column[0])
    df_sliding_window = df_sliding_window.dropna(axis=0, how='any').drop(columns_to_drop, axis=1)
    return df_sliding_window.reset_index(drop=True)

# ---------------------------------------------------------------------------
# Build the sliding-window training frame.
# `df1` and `params` are not defined anywhere in this notebook; presumably the
# hosting environment injects them (df1 exposes .toPandas(), params is indexed
# by string keys) -- TODO confirm. `df1` is both the input and, on the last
# line, the output of this cell, so re-running the cell on its own output is
# not meaningful.
# ---------------------------------------------------------------------------
df1 = df1.toPandas()
df1 = df1.sort_values('TIME',  ascending=True)

# SECURITY NOTE(review): eval() executes whatever expression the parameter
# strings contain. If params can come from an untrusted source, switch to
# ast.literal_eval once it is confirmed the values are plain literals.
target_column = eval(params['target_column'])   # ['10MKA01CT301']
window_size = eval(params['window_size'])
df_train_window = convert_sliding_window_data(data=df1, target_column=target_column, window_size=window_size)
print(df_train_window.shape)

## Feature selection
RANDOM_STATE = 42  # fixed seed so the selected feature set is reproducible

train_cols = df_train_window.columns.tolist()
train_cols.remove('TIME')
train_cols.remove(target_column[0])

x_train, y_train = df_train_window[train_cols], df_train_window[target_column]

# For a single target, scikit-learn expects a 1-D y; an (n, 1) column vector is
# flattened here. A multi-column target is passed through unchanged.
y_fit = y_train.values.ravel() if y_train.shape[1] == 1 else y_train.values

reg = RandomForestRegressor(n_jobs=-1, random_state=RANDOM_STATE)
reg.fit(x_train.values, y_fit)
# Keep at most 80% of the candidate features, ranked by forest importance.
feature_model = SelectFromModel(reg, prefit=True, max_features=int(x_train.shape[1]*0.8))
# Transform the same ndarray representation the forest was fitted on.
x_train_new = feature_model.transform(x_train.values)
model_selected_columns = x_train.columns[feature_model.get_support()]

x_train_new = pd.DataFrame(x_train_new, columns=model_selected_columns)
# Both frames carry a default 0..n-1 index, so the target aligns row by row.
x_train_new[target_column[0]] = df_train_window[target_column[0]]

# pipe = Pipeline(steps=[
# ('imputer', SimpleImputer(strategy='constant', fill_value=-999)),
# ('model', RandomForestRegressor())])
# param_grid = {'model__n_estimators':[*range(10, 20, 10)],
#               'model__max_depth':[*range(5, 10, 5)]}
# # grid search
# grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=3, n_jobs=-1) 
# grid.fit(x_train_new, y_train.values)

# model_rdd = sc.parallelize([grid.best_estimator_], 1)
# model_rdd.saveAsPickleFile('hdfs://192.168.13.36:8020/ywj/12')

print(x_train_new.shape)
# Hand the selected features (plus target) back as a Spark DataFrame.
df1 = ps.from_pandas(x_train_new).to_spark()