### XGBoost best-parameter search
- train a separate model for each `turbine_id`
- grid-search over `max_depth`

In [2]:
import xgboost as xgb
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8*" and later
# releases removed the old aliases, so use whichever name this install provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')
# Render floats in DataFrame output with thousands separators and 5 decimals.
pd.options.display.float_format = '{:,.5f}'.format

In [3]:
# Wall-clock start of the notebook run; the final cell prints dt.now() - time_start.
time_start = dt.now()

In [4]:
# Load the train and test CSVs and stack them into a single frame `df` so the
# preprocessing below is applied to both in the same way.

# read train dataset
df1 = pd.read_csv('data/train.csv')
df1['row_id'] = range(len(df1))

# read test dataset; its row ids are shifted by 1,000,000, which keeps them
# distinct from the train ids as long as train has fewer than 1,000,000 rows
df2 = pd.read_csv('data/new/test.csv')
df2['row_id'] = range(len(df2))
df2['row_id'] = df2['row_id'] + 1000000

# merge
# ignore_index=True: both inputs carry their own 0..n-1 index, so a plain concat
# would leave duplicate index labels in `df`; `row_id` remains the row identity.
df = pd.concat([df1, df2], ignore_index=True)
del df1, df2
gc.collect()

# add fold for splitting: a seeded random fold id in {0, 1, 2} for every row
np.random.seed(1234)
df['fold'] = np.random.randint(0, 3, len(df))

# drop some features (names that are not present in the data are skipped)
feats_drop = ['timestamp','active_power_calculated_by_converter','reactice_power_calculated_by_converter']
df = df.drop(columns=feats_drop, errors='ignore')

# label encoder of categorical feats; the fitted encoders are kept in `list_lbl`
# (same order as `feats_cat`) so the integer codes can be mapped back later
feats_cat = ['turbine_id']
list_lbl = []
for f in feats_cat:
    lbl = preprocessing.LabelEncoder()
    df[f] = lbl.fit_transform(df[f])
    list_lbl.append(lbl)

In [5]:
# Columns of `df` used as model inputs; prepare_sets() selects exactly these
# columns and passes the same list to xgboost as feature names.
feats_used = [   
    "active_power_raw",
    "ambient_temperature",
    "generator_speed",
    "generator_winding_temp_max",
    "grid_power10min_average",
    "nc1_inside_temp",
    "nacelle_temp",
    "reactive_power",
    "wind_direction_raw",
    "wind_speed_raw",
    "wind_speed_turbulence",  
]

### Functions

In [4]:
def ts_metrics(y_true, y_pred):
    """Compute regression error metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, same length as ``y_true``.

    Returns
    -------
    dict
        Keys ``mae``, ``mse``, ``mape`` and ``smape``. ``mape`` and ``smape``
        are returned as fractions (they are not multiplied by 100).

    Notes
    -----
    Both inputs are converted to plain float arrays first, so lists are
    accepted and pandas index labels never influence the element-wise
    arithmetic. ``mape`` divides by ``y_true``, so a zero target yields
    ``inf``/``nan`` for that element.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    err = y_true - y_pred
    return {
        'mae': metrics.mean_absolute_error(y_true, y_pred),
        'mse': metrics.mean_squared_error(y_true, y_pred),
        'mape': np.mean(np.abs(err / y_true)),
        # The SMAPE denominator is |y_true| + |y_pred|; using the signed y_true
        # distorts (or zeroes) the denominator whenever a target is negative.
        'smape': np.mean(2 * np.abs(err) / (np.abs(y_true) + np.abs(y_pred))),
    }

In [5]:
def prepare_sets(turbine=None):
    """Build the training and validation sets for a single turbine.

    Reads the module-level ``df`` and ``feats_used``. Rows with ``fold == 0``
    form the validation set and all other folds the training set; rows whose
    ``Target`` is null are excluded from both.

    Parameters
    ----------
    turbine : optional
        Value of ``df.turbine_id`` to select. When omitted, the module-level
        ``turb`` is used, so existing ``prepare_sets()`` calls keep working.

    Returns
    -------
    tuple
        ``(x, y, x_, xv, yv, xv_)`` where ``x``/``y`` are the training
        features/target, ``xv``/``yv`` the validation features/target, and
        ``x_``/``xv_`` the corresponding ``xgb.DMatrix`` objects.
    """
    if turbine is None:
        # Backward-compatible fallback to the notebook's loop variable.
        turbine = turb

    feat_target = 'Target'

    has_target = df[feat_target].notnull()
    in_valid_fold = df.fold == 0
    is_turbine = df.turbine_id == turbine

    filt_train = ~in_valid_fold & has_target & is_turbine
    filt_valid = in_valid_fold & has_target & is_turbine

    # Select rows and columns in one .loc step instead of df[mask][cols], which
    # first copies every column of the matching rows and then slices again.
    x, y = df.loc[filt_train, feats_used], df.loc[filt_train, feat_target]
    xv, yv = df.loc[filt_valid, feats_used], df.loc[filt_valid, feat_target]

    x_ = xgb.DMatrix(x.values, label=y, feature_names=feats_used)
    xv_ = xgb.DMatrix(xv.values, label=yv, feature_names=feats_used)
    return x, y, x_, xv, yv, xv_

In [6]:
def xgb_search(max_depths=(5, 6, 7, 8, 9, 10), subsamples=(1,), colsample_bytrees=(1,)):
    """Grid-search xgboost tree parameters on the current train/validation sets.

    Reads the module-level ``x_`` (training DMatrix), ``xv_`` (validation
    DMatrix) and ``yv`` (validation target), i.e. the values last produced by
    ``prepare_sets()``. One model is trained per parameter combination with
    early stopping on the validation set, then scored with ``ts_metrics``.

    Parameters
    ----------
    max_depths : iterable of int, optional
        ``max_depth`` values to try. Defaults to the original grid 5..10.
    subsamples : iterable, optional
        ``subsample`` values to try. Defaults to ``(1,)``.
    colsample_bytrees : iterable, optional
        ``colsample_bytree`` values to try. Defaults to ``(1,)``.

    Returns
    -------
    pandas.DataFrame
        One row per combination: the ``ts_metrics`` columns followed by
        ``max_depth``, ``subsample`` and ``colsample_bytree``.
    """
    # Settings shared by every run; the three searched parameters are added per run.
    base_params = {
        'booster': 'gbtree',
        'tree_method': 'hist',
        'objective': 'reg:squarederror',
        'eta': 0.1,
        'reg_alpha': 0,  # L1 regularization term on weights
        'reg_lambda': 0,  # L2 regularization term on weights
        'nthread': -1,
        'verbosity': 0,
    }

    records = []
    for max_depth in max_depths:
        for subsample in subsamples:
            for colsample_bytree in colsample_bytrees:
                params = dict(
                    base_params,
                    max_depth=max_depth,
                    subsample=subsample,  # row subsample ratio per tree
                    colsample_bytree=colsample_bytree,  # column subsample ratio per tree
                )
                evals_results = {}
                # `feval=None` is no longer passed: it only restated the default
                # and newer xgboost releases reject the deprecated keyword.
                model_xgb = xgb.train(
                    params,
                    x_,
                    evals=[(x_, 'train'), (xv_, 'valid')],
                    evals_result=evals_results,
                    num_boost_round=10000,
                    early_stopping_rounds=50,
                    verbose_eval=50000,  # large period keeps the training log short
                )
                # NOTE(review): depending on the xgboost version, predict() may use
                # every boosted round rather than the best early-stopped iteration -
                # confirm against the installed release.
                pred = model_xgb.predict(xv_)

                record = dict(ts_metrics(yv, pred))
                record['max_depth'] = max_depth
                record['subsample'] = subsample
                record['colsample_bytree'] = colsample_bytree
                records.append(record)

    return pd.DataFrame(records)

### Main loop

In [7]:
dt_start = dt.now()
# Collect the per-turbine result frames and concatenate once after the loop:
# growing a DataFrame with pd.concat on every iteration re-copies all earlier
# rows each time, and concatenating onto an empty frame is deprecated behaviour
# in recent pandas.
list_res = []
for turb in range(16):  # NOTE(review): 16 is presumably the number of encoded turbine ids - confirm
    print('------------')
    print('turb =', turb)
    print(dt.now() - dt_start)

    # prepare_sets() selects rows for the global `turb`; xgb_search() then reads
    # x_, xv_ and yv as globals, so these names must be rebound on every pass.
    x,y,x_,xv,yv,xv_ = prepare_sets()
    df_res0 = xgb_search()
    df_res0['turb'] = turb
    list_res.append(df_res0)

df_res3 = pd.concat(list_res)

------------
turb = 0
0:00:00.001000
[0]	train-rmse:44.02285	valid-rmse:44.02081
[2018]	train-rmse:0.66254	valid-rmse:1.40305
[0]	train-rmse:44.02179	valid-rmse:44.02030
[1362]	train-rmse:0.53467	valid-rmse:1.38593
[0]	train-rmse:44.02081	valid-rmse:44.01949
[1033]	train-rmse:0.39374	valid-rmse:1.37285
[0]	train-rmse:44.01970	valid-rmse:44.01837
[744]	train-rmse:0.31343	valid-rmse:1.37153
[0]	train-rmse:44.01889	valid-rmse:44.01692
[975]	train-rmse:0.08956	valid-rmse:1.35853
[0]	train-rmse:44.01802	valid-rmse:44.01544
[1404]	train-rmse:0.00894	valid-rmse:1.35585
------------
turb = 1
0:01:12.149195
[0]	train-rmse:42.07700	valid-rmse:42.05416
[1104]	train-rmse:0.63386	valid-rmse:1.01880
[0]	train-rmse:42.07668	valid-rmse:42.05430
[1228]	train-rmse:0.42514	valid-rmse:0.99243
[0]	train-rmse:42.07645	valid-rmse:42.05403
[832]	train-rmse:0.36406	valid-rmse:0.97959
[0]	train-rmse:42.07620	valid-rmse:42.05423
[1257]	train-rmse:0.11825	valid-rmse:0.95686
[0]	train-rmse:42.07595	valid-rmse:42.0

### The best results

In [8]:
# Swap the repeated per-turbine row labels for a fresh 0..n-1 index, then
# persist the search results to disk.
df_res3 = df_res3.reset_index(drop=True)
df_res3.to_pickle('data/df_xgb.pkl')

In [10]:
# Reload the saved search results and keep, for every turbine, the row(s)
# whose MAPE equals that turbine's minimum (ties are all kept).
df_res3 = pd.read_pickle('data/df_xgb.pkl')
# Pass the reducer by name: handing the builtin `min` to transform() raises a
# FutureWarning in recent pandas, which asks for the string 'min' instead.
df_res3['mape_min'] = df_res3.groupby('turb')['mape'].transform('min')
f1 = df_res3.mape_min == df_res3.mape
df_res4 = df_res3[f1]
df_res4

Unnamed: 0,mae,mse,mape,smape,max_depth,subsample,colsample_bytree,turb,mape_min
5,0.88855,1.83833,0.01774,0.01774,10,1,1,0,0.01774
11,0.62698,0.90229,0.01345,0.01338,10,1,1,1,0.01345
17,0.46768,0.45043,0.01025,0.01023,10,1,1,2,0.01025
23,0.46803,0.54502,0.01019,0.01014,10,1,1,3,0.01019
29,0.60402,0.77163,0.01302,0.01298,10,1,1,4,0.01302
35,0.6746,0.85772,0.01479,0.01477,10,1,1,5,0.01479
41,0.50903,0.49806,0.01136,0.01135,10,1,1,6,0.01136
47,0.52972,0.58368,0.01137,0.01134,10,1,1,7,0.01137
53,0.53146,0.59858,0.0117,0.01167,10,1,1,8,0.0117
59,0.4838,0.47027,0.01019,0.01017,10,1,1,9,0.01019


In [11]:
# Average of the per-turbine best MAPE values selected into df_res4.
df_res4.mape.mean()

0.013346417779854871

In [12]:
# Total elapsed time since `time_start` was recorded near the top of the notebook.
print('Working time: ', dt.now() - time_start) 

Working time:  0:26:02.917938
