### Random forest: best-parameter search
- a separate model is trained for each turbine_id
- the only hyperparameter searched is max_depth

In [1]:
# All names used by later cells are imported here so the notebook runs on a fresh kernel.
import gc
from datetime import datetime as dt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import ensemble, metrics, preprocessing

# The seaborn style sheets were renamed to 'seaborn-v0_8' in matplotlib 3.6;
# use whichever name the installed matplotlib provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
pd.options.display.float_format = '{:,.5f}'.format

In [2]:
# Record the notebook start time; the final cell subtracts it from the current time.
time_start = dt.now()

In [3]:
# Offset added to test row ids so they do not collide with train row ids
# (assumes the train set has fewer rows than this value).
TEST_ROW_ID_OFFSET = 1000000

# read train dataset
df1 = pd.read_csv('data/train.csv')
df1['row_id'] = range(len(df1))

# read test dataset
df2 = pd.read_csv('data/new/test.csv')
df2['row_id'] = range(TEST_ROW_ID_OFFSET, TEST_ROW_ID_OFFSET + len(df2))

# merge; ignore_index gives every row a unique label instead of repeating
# the 0..n-1 labels of both source frames
df = pd.concat([df1, df2], ignore_index=True)
del df1, df2
gc.collect()

# add fold for splitting (seeded, so the assignment is the same on every run)
np.random.seed(1234)
df['fold'] = np.random.randint(0, 3, len(df))

# drop some features; errors='ignore' skips names that are not present
# NOTE(review): 'reactice_power_calculated_by_converter' may be a misspelling of
# 'reactive_...'; if so that column is silently kept - confirm against the csv header.
feats_drop = ['timestamp','active_power_calculated_by_converter','reactice_power_calculated_by_converter']
df = df.drop(columns=feats_drop, errors='ignore')

# label encoder of categorical feats; the fitted encoders are kept in list_lbl
feats_cat = ['turbine_id']
list_lbl  = []
for f in feats_cat:
    lbl = preprocessing.LabelEncoder()
    df[f] = lbl.fit_transform(df[f])
    list_lbl.append(lbl)

# model input columns
feats_used = [
    "active_power_raw",
    "ambient_temperature",
    "generator_speed",
    "generator_winding_temp_max",
    "grid_power10min_average",
    "nc1_inside_temp",
    "nacelle_temp",
    "reactive_power",
    "wind_direction_raw",
    "wind_speed_raw",
    "wind_speed_turbulence",
]

### Functions

In [4]:
def ts_metrics(y_true, y_pred):
    """Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    dict
        Keys ``'mae'``, ``'mse'``, ``'mape'`` and ``'smape'``. MAPE and sMAPE
        are returned as fractions, not percentages.

    Notes
    -----
    Both inputs are converted to float arrays, so values are paired by
    position (never by pandas index label). MAPE divides by ``y_true``, so a
    zero target produces an inf/nan result.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    err = y_true - y_pred
    return {
        'mae': metrics.mean_absolute_error(y_true, y_pred),
        'mse': metrics.mean_squared_error(y_true, y_pred),
        'mape': np.mean(np.abs(err / y_true)),
        # denominator uses |y_true| so negative targets cannot shrink or flip it
        'smape': np.mean(2 * np.abs(err) / (np.abs(y_true) + np.abs(y_pred))),
    }

In [5]:
def prepare_sets(turbine=None):
    """Build the train and validation sets for one turbine.

    Reads the notebook-level ``df`` and ``feats_used``. Rows with a non-null
    'Target' are split by ``fold``: fold 0 is validation, every other fold is
    training. Rows with a null 'Target' are excluded from both sets.

    Parameters
    ----------
    turbine : optional
        Value of ``turbine_id`` to select. When omitted, the notebook-level
        variable ``turb`` is used, so existing ``prepare_sets()`` calls
        behave as before.

    Returns
    -------
    tuple
        ``(x, y, xv, yv)``: training features, training target, validation
        features, validation target.
    """
    feat_target = 'Target'
    if turbine is None:
        turbine = turb  # fall back to the loop variable set by the calling cell

    is_valid_fold = df['fold'] == 0
    has_target = df[feat_target].notnull()
    is_turbine = df['turbine_id'] == turbine

    filt_train = ~is_valid_fold & has_target & is_turbine
    filt_valid = is_valid_fold & has_target & is_turbine

    # single .loc selection per set instead of chained row-then-column indexing
    x, y = df.loc[filt_train, feats_used], df.loc[filt_train, feat_target]
    xv, yv = df.loc[filt_valid, feats_used], df.loc[filt_valid, feat_target]

    return x, y, xv, yv

In [6]:
def rf_search(max_depths=(5, 10, None), n_estimators=500, random_state=1234):
    """Fit one random forest per ``max_depth`` value and score each on the validation set.

    Uses the notebook-level ``x``, ``y`` (training set) and ``xv``, ``yv``
    (validation set).

    Parameters
    ----------
    max_depths : iterable
        ``max_depth`` values to try; ``None`` means no depth limit.
    n_estimators : int
        Number of trees in every forest.
    random_state : int or None
        Seed passed to each forest so repeated runs give the same scores.

    Returns
    -------
    pandas.DataFrame
        One row per tried depth: the columns returned by ``ts_metrics`` plus
        a ``max_depth`` column.
    """
    depths = list(max_depths)
    scores = []
    for max_depth in depths:
        rf = ensemble.RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            n_jobs=-1,
            random_state=random_state,
        )
        rf.fit(x, y)
        scores.append(ts_metrics(yv, rf.predict(xv)))

    df_out = pd.DataFrame(scores)
    df_out['max_depth'] = depths
    return df_out

### Main loop

In [7]:
dt_start = dt.now()

# Iterate over the turbine ids actually present in the data instead of a fixed count.
turbine_ids = sorted(int(t) for t in df['turbine_id'].unique())

list_res = []
for turb in turbine_ids:
    print('------------')
    print('turb =', turb)
    print(dt.now() - dt_start)

    # prepare_sets() reads the loop variable `turb`; rf_search() reads x, y, xv, yv
    x,y,xv,yv = prepare_sets()
    df_res0 = rf_search()
    df_res0['turb'] = turb
    list_res.append(df_res0)

# Concatenate once at the end rather than growing a frame inside the loop.
df_res3 = pd.concat(list_res) if list_res else pd.DataFrame()

------------
turb = 0
0:00:00.001000
------------
turb = 1
0:00:56.001115
------------
turb = 2
0:01:54.487196
------------
turb = 3
0:02:56.073701
------------
turb = 4
0:03:58.144630
------------
turb = 5
0:05:00.184277
------------
turb = 6
0:06:01.517457
------------
turb = 7
0:07:05.109727
------------
turb = 8
0:08:08.732025
------------
turb = 9
0:09:15.039057
------------
turb = 10
0:10:23.722612
------------
turb = 11
0:11:31.923064
------------
turb = 12
0:12:38.363359
------------
turb = 13
0:13:46.705399
------------
turb = 14
0:14:54.705902
------------
turb = 15
0:16:03.129373


### The best results

In [8]:
# Renumber the stacked per-turbine rows 0..n-1, then save the search results.
df_res3 = df_res3.reset_index(drop=True)
df_res3.to_pickle('data/df_rf.pkl')

In [9]:
# Reload the search results from the pickle written by the previous cell.
df_res3 = pd.read_pickle('data/df_rf.pkl')
# Lowest validation MAPE per turbine, repeated on every row of that turbine.
# The string 'min' is passed instead of the builtin, which newer pandas warns about.
df_res3['mape_min'] = df_res3.groupby('turb')['mape'].transform('min')
# Keep the row(s) that reach the per-turbine minimum; an exact tie keeps more than one row.
f1 = df_res3['mape_min'] == df_res3['mape']
df_res4 = df_res3[f1]
df_res4

Unnamed: 0,mae,mse,mape,smape,max_depth,turb,mape_min
2,0.88718,1.89118,0.0177,0.01769,,0,0.0177
5,0.62344,0.908,0.0134,0.01332,,1,0.0134
8,0.47958,0.47869,0.01052,0.0105,,2,0.01052
11,0.45597,0.54031,0.00996,0.00989,,3,0.00996
14,0.593,0.77769,0.01279,0.01274,,4,0.01279
17,0.65569,0.82683,0.01438,0.01436,,5,0.01438
20,0.50494,0.49859,0.01128,0.01126,,6,0.01128
23,0.53469,0.61333,0.01149,0.01146,,7,0.01149
26,0.51963,0.59295,0.01147,0.01142,,8,0.01147
29,0.47788,0.47668,0.01009,0.01006,,9,0.01009


In [10]:
# Mean of the MAPE values over the rows kept in df_res4.
df_res4.mape.mean()

0.013842084914805278

In [11]:
# Elapsed time since time_start was recorded.
print('Working time: ', dt.now() - time_start) 

Working time:  0:17:09.377572
