### KNN: searching for the best parameters
- train separately for each turbine_id
- find the best feature combination by recursive feature elimination
- input data scaling strategy (StandardScaler, MinMaxScaler, QuantileTransformer, RobustScaler)
- number of neighbors (K = 1...20)
- Minkowski distance degree (p = 1,2)
- Neighbor weighting - none (plain mean), linear (1/distance), quadratic (1/distance**2)

In [1]:
from sklearn import neighbors 
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and the
# old names were removed afterwards; use whichever name this installation has.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
# Render floats in DataFrame output with thousands separators and 5 decimals.
pd.options.display.float_format = '{:,.5f}'.format

In [2]:
# start of the whole run; the final cell prints the time elapsed since this point
# (presumably `dt` is datetime.datetime - its import is not visible here)
time_start = dt.now()

In [3]:
# read train dataset; row_id records the original row order
df1 = pd.read_csv('data/train.csv')
df1['row_id'] = range(len(df1))

# read test dataset; its row ids start at 1,000,000 to keep them apart from
# the train ids (assumes train has fewer than 1,000,000 rows)
df2 = pd.read_csv('data/new/test.csv')
df2['row_id'] = range(1000000, 1000000 + len(df2))

# merge; ignore_index gives every row a unique label - a plain concat would
# repeat the labels 0..n-1 for the test rows and make label-based access ambiguous
df = pd.concat([df1, df2], ignore_index=True)
del df1, df2
gc.collect()

# add fold for splitting; seeding the global generator makes the assignment reproducible
np.random.seed(1234)
df['fold'] = np.random.randint(0, 3, len(df))
# drop some features (names that are not present in the data are skipped)
feats_drop = ['timestamp','active_power_calculated_by_converter','reactice_power_calculated_by_converter']
df.drop(columns=feats_drop, errors='ignore', inplace=True)
# label encoder of categorical feats; the fitted encoders are kept in list_lbl
# (same order as feats_cat) so the integer codes can be mapped back later
feats_cat = ['turbine_id']
list_lbl = []
for f in feats_cat:
    lbl = preprocessing.LabelEncoder()
    df[f] = lbl.fit_transform(df[f])
    list_lbl.append(lbl)

# starting feature set for the recursive elimination
feats_init = [
    "active_power_raw",
    "ambient_temperature",
    "generator_speed",
    "generator_winding_temp_max",
    "grid_power10min_average",
    "nc1_inside_temp",
    "nacelle_temp",
    "reactive_power",
    "wind_direction_raw",
    "wind_speed_raw",
    "wind_speed_turbulence",
]

### Functions

In [4]:
def ts_metrics(y_true, y_pred):
    """Return regression error metrics for true vs. predicted values.

    Parameters
    ----------
    y_true, y_pred : equally long 1-D numeric sequences. They are combined
        element-wise with ``-`` and ``/``, so pass NumPy arrays or one pandas
        Series plus an array; two Series with different indexes would be
        aligned by label by pandas rather than by position.

    Returns
    -------
    dict with keys
        'mae'   - mean absolute error,
        'mse'   - mean squared error,
        'mape'  - mean of |y_true - y_pred| / |y_true| (a fraction, not %),
        'smape' - mean of 2*|y_true - y_pred| / (|y_true| + |y_pred|).

    Notes
    -----
    'mape' is not guarded against y_true == 0 (the division yields inf/NaN).
    """
    return {
        'mae': metrics.mean_absolute_error(y_true, y_pred),
        'mse': metrics.mean_squared_error(y_true, y_pred),
        'mape': np.mean(np.abs((y_true - y_pred) / y_true)),
        # the denominator must use |y_true|: with the signed value a negative
        # target shrinks (or zeroes) the denominator and distorts the score
        'smape': np.mean(np.abs(2*(y_true - y_pred)) / (np.abs(y_true) + np.abs(y_pred))),
    }

In [5]:
def prepare_sets(data=None, turbine=None, features=None, feat_target='Target'):
    """Build the train and validation sets for one turbine.

    Rows of the selected turbine that have a target value are split by the
    ``fold`` column: fold 0 is the validation set, every other fold is the
    training set. Rows without a target are left out of both sets.

    Parameters
    ----------
    data : DataFrame with columns ``fold``, ``turbine_id``, the target column
        and the feature columns. Defaults to the module-level ``df``.
    turbine : value of ``turbine_id`` to select. Defaults to the module-level
        ``turb``.
    features : list of feature column names. Defaults to the module-level
        ``feats_used``.
    feat_target : name of the target column.

    Returns
    -------
    (x, y, xv, yv) : train features, train target, validation features,
        validation target.
    """
    # Called without arguments this reads the notebook globals, as before.
    data = df if data is None else data
    turbine = turb if turbine is None else turbine
    features = feats_used if features is None else features

    is_turbine = data.turbine_id == turbine
    has_target = data[feat_target].notnull()
    is_valid_fold = data.fold == 0

    mask_train = is_turbine & has_target & ~is_valid_fold
    mask_valid = is_turbine & has_target & is_valid_fold

    # one .loc call per slice instead of chained df[mask][cols] indexing
    x = data.loc[mask_train, features]
    y = data.loc[mask_train, feat_target]
    xv = data.loc[mask_valid, features]
    yv = data.loc[mask_valid, feat_target]
    return x, y, xv, yv

In [6]:
def knn_search():
    """Grid-search KNN regression settings on the current train/validation split.

    Reads the module-level ``x``, ``y`` (train) and ``xv``, ``yv`` (validation).
    For every combination of input scaler, Minkowski degree ``p`` (1, 2),
    neighbour weighting ('no', 'linear' = 1/distance, 'square' = 1/distance**2)
    and K = 1..k_max, the validation target is predicted as the (weighted)
    mean of the targets of the K nearest training rows and scored with
    ``ts_metrics``.

    The neighbours are searched once per (scaler, p) with k_max neighbours;
    every smaller K reuses the first K columns of that result.

    Returns
    -------
    df_best : one-row DataFrame - the configuration with the lowest 'mape'.
    df_out : DataFrame with one row per evaluated configuration (the metric
        columns followed by 'weight', 'K', 'p', 'scaler').
    """
    dict_scalers = {
        'z_score': preprocessing.StandardScaler(),
        'min_max': preprocessing.MinMaxScaler(),
        'quantile': preprocessing.QuantileTransformer(),
        'robust': preprocessing.RobustScaler(),
    }
    # cannot ask for more neighbours than there are training rows
    k_max = min(20, len(x))
    # Floor for the distances before inverting them. A validation row that
    # coincides with a training row has distance 0; 1/0 is inf, the weighted
    # mean becomes inf/inf = NaN, and the NaN was then replaced by a
    # prediction of 0. With the floor, the coinciding neighbour simply
    # dominates the weighted mean.
    min_dist = 1e-12
    y_train = np.asarray(y)

    records = []
    # find the best k, weights and data scaler
    for scaler_name, scaler in dict_scalers.items():
        scaler.fit(x)
        x1 = scaler.transform(x)
        xv1 = scaler.transform(xv)
        for p in [1, 2]:
            # unsupervised neighbour search: no target is needed for fitting
            knn = neighbors.NearestNeighbors(n_neighbors=k_max, n_jobs=-1, p=p)
            knn.fit(x1)
            # both arrays are (n_validation_rows, k_max), nearest neighbour first
            dist, idx = knn.kneighbors(xv1, return_distance=True)
            y_nb = y_train[idx]

            inv_dist = 1.0 / np.maximum(dist, min_dist)
            schemes = (
                ('no', np.ones_like(inv_dist)),
                ('linear', inv_dist),
                ('square', inv_dist ** 2),
            )
            for weight_name, w in schemes:
                # independent of k, so computed once per weighting scheme
                weighted = y_nb * w
                for k in range(1, k_max + 1):
                    pred = weighted[:, :k].sum(axis=1) / w[:, :k].sum(axis=1)
                    # same fallback as before for any remaining undefined prediction
                    pred = np.where(np.isnan(pred), 0.0, pred)
                    records.append({
                        **ts_metrics(yv, pred),
                        'weight': weight_name,
                        'K': k,
                        'p': p,
                        'scaler': scaler_name,
                    })

    df_out = pd.DataFrame(records)
    # best by mape; .copy() so callers can add columns to the result freely
    df_best = df_out.iloc[[df_out.mape.argmin()]].copy()
    return df_best, df_out

### Main loop with recursive feature elimination

In [7]:
# Backward feature elimination, run separately for every turbine.
# Each round scores the current feature set with one feature left out at a
# time (knn_search returns the best KNN configuration for that set), removes
# the feature whose absence gives the lowest MAPE, and stops as soon as a
# round no longer improves on the best MAPE found so far.
# prepare_sets() and knn_search() communicate through the module-level names
# turb, feats_used, x, y, xv, yv assigned below.
dt_start = dt.now()
list_res_turb = []
# turbine_id is label-encoded, so the ids run 0..n_turbines-1
for turb in range(df.turbine_id.nunique()):
    print('------------')
    print('turb =', turb)
    print(dt.now() - dt_start)
    feats_best = feats_init
    best_mape = 1

    list_res_it = []
    for it, _ in enumerate(feats_init):
        # '' means "remove nothing": the all-features baseline, scored once in round 0
        feats_loop = [''] if it == 0 else []
        # never try to drop the last remaining feature - the scalers and the
        # neighbour search cannot be fitted on a 0-column matrix
        if len(feats_best) > 1:
            feats_loop = feats_loop + feats_best
        if not feats_loop:
            break

        list_res_feat = []
        for f in feats_loop:
            feats_used = [f1 for f1 in feats_best if f1 != f]
            x, y, xv, yv = prepare_sets()
            df_res0, df_all = knn_search()
            list_res_feat.append(df_res0.assign(feat=f))
        df_res1 = pd.concat(list_res_feat)

        # collect scores
        df_res1['it'] = it
        df_res1['feats_best'] = [feats_best] * len(df_res1)
        list_res_it.append(df_res1)

        # best candidate of this round (argmin is positional, hence iloc)
        best_step_i = df_res1.mape.argmin()
        best_step_feat = df_res1.iloc[best_step_i].feat
        best_step_mape = df_res1.iloc[best_step_i].mape
        print(it, best_step_feat, best_step_mape)

        # compare with current best: a worse round ends the search
        if best_step_mape > best_mape:
            break

        # update current best
        best_mape = best_step_mape
        feats_best = [f for f in feats_best if f != best_step_feat]

        # the untouched baseline won round 0: no single removal helps
        if best_step_feat == '':
            break

    df_res2 = pd.concat(list_res_it) if list_res_it else pd.DataFrame()
    df_res2['turb'] = turb
    list_res_turb.append(df_res2)

df_res3 = pd.concat(list_res_turb) if list_res_turb else pd.DataFrame()

------------
turb = 0
0:00:00.000999
0 nc1_inside_temp 0.017128855211556764
1 generator_speed 0.01679630530536154
2 wind_speed_raw 0.01665115162016674
3 active_power_raw 0.01659617178156569
4 ambient_temperature 0.016979193334731885
------------
turb = 1
0:09:36.257129
0 active_power_raw 0.013294973431941902
1 generator_speed 0.01276703395143814
2 wind_speed_raw 0.01225195063824249
3 reactive_power 0.011518702308365705
4 wind_speed_turbulence 0.01082971864911946
5 nc1_inside_temp 0.010585917961203582
6 grid_power10min_average 0.012611117520177679
------------
turb = 2
0:20:59.222782
0 active_power_raw 0.010465637182887663
1 nc1_inside_temp 0.010019283567249938
2 reactive_power 0.009503167139994132
3 wind_speed_raw 0.009137045522279764
4 generator_speed 0.008436722657023754
5 wind_speed_turbulence 0.007895417690188514
6 nacelle_temp 0.008403031963397287
------------
turb = 3
0:33:52.686052
0 active_power_raw 0.008677736058400165
1 generator_speed 0.007804016274468731
2 reactive_power 0.

### The best results

In [8]:
# Replace the repeated per-search row labels with a fresh 0..n-1 index,
# then persist the collected search results.
df_res3 = df_res3.reset_index(drop=True)
df_res3.to_pickle('data/df_knn.pkl')

In [9]:
# Reload the saved search results.
# NOTE: unpickling can execute arbitrary code - only load files written by this notebook.
df_res3 = pd.read_pickle('data/df_knn.pkl')
# lowest MAPE reached for each turbine, repeated on all of its rows
df_res3['mape_min'] = df_res3.groupby('turb').mape.transform('min')
f1 = df_res3.mape_min == df_res3.mape
# keep the winning row(s) per turbine; .copy() so the columns added below go
# into an independent frame instead of a slice of df_res3
df_res4 = df_res3[f1].copy()
# final feature set = features entering the winning round minus the one removed in it
df_res4['feats'] = [
    [x1 for x1 in feats if x1 != feat]
    for feats, feat in zip(df_res4.feats_best, df_res4.feat)
]
del df_res4['feats_best']
# KNN settings of the winning row: [scaler, weight, K, p]
df_res4['params'] = [
    list(row) for row in zip(df_res4.scaler, df_res4.weight, df_res4.K, df_res4.p)
]
df_res4

Unnamed: 0,mae,mse,mape,smape,weight,K,p,scaler,feat,it,turb,mape_min,feats,params
31,0.83258,2.46082,0.0166,0.01657,square,2,1,min_max,active_power_raw,3,0,0.0166,"[ambient_temperature, generator_winding_temp_m...","[min_max, square, 2, 1]"
95,0.49381,1.09166,0.01059,0.01053,linear,1,1,z_score,nc1_inside_temp,5,1,0.01059,"[ambient_temperature, generator_winding_temp_m...","[z_score, linear, 1, 1]"
154,0.36182,0.54143,0.0079,0.00788,linear,1,1,z_score,wind_speed_turbulence,5,2,0.0079,"[ambient_temperature, generator_winding_temp_m...","[z_score, linear, 1, 1]"
205,0.23746,0.41979,0.00516,0.00514,square,1,1,robust,wind_speed_turbulence,4,3,0.00516,"[ambient_temperature, generator_winding_temp_m...","[robust, square, 1, 1]"
249,0.52953,0.75386,0.01139,0.01135,square,3,1,min_max,wind_speed_raw,3,4,0.01139,"[ambient_temperature, generator_winding_temp_m...","[min_max, square, 3, 1]"
295,0.59299,0.86108,0.01297,0.01296,square,3,1,min_max,wind_speed_raw,3,5,0.01297,"[ambient_temperature, generator_winding_temp_m...","[min_max, square, 3, 1]"
346,0.44312,0.47984,0.00986,0.00985,square,3,1,min_max,nc1_inside_temp,4,6,0.00986,"[ambient_temperature, generator_winding_temp_m...","[min_max, square, 3, 1]"
401,0.34642,0.59092,0.00742,0.0074,square,1,1,robust,wind_speed_turbulence,4,7,0.00742,"[ambient_temperature, generator_winding_temp_m...","[robust, square, 1, 1]"
440,0.44389,0.59006,0.00972,0.00969,square,2,1,min_max,generator_speed,3,8,0.00972,"[ambient_temperature, generator_winding_temp_m...","[min_max, square, 2, 1]"
499,0.31975,0.46636,0.00671,0.0067,square,1,1,z_score,wind_speed_turbulence,4,9,0.00671,"[ambient_temperature, generator_winding_temp_m...","[z_score, square, 1, 1]"


In [10]:
# dict with best features (for main script): {turbine id: list of selected feature names}
df_res4.set_index('turb')['feats'].to_dict()

{0: ['ambient_temperature',
  'generator_winding_temp_max',
  'grid_power10min_average',
  'nacelle_temp',
  'reactive_power',
  'wind_direction_raw',
  'wind_speed_turbulence'],
 1: ['ambient_temperature',
  'generator_winding_temp_max',
  'grid_power10min_average',
  'nacelle_temp',
  'wind_direction_raw'],
 2: ['ambient_temperature',
  'generator_winding_temp_max',
  'grid_power10min_average',
  'nacelle_temp',
  'wind_direction_raw'],
 3: ['ambient_temperature',
  'generator_winding_temp_max',
  'grid_power10min_average',
  'nc1_inside_temp',
  'nacelle_temp',
  'wind_direction_raw'],
 4: ['ambient_temperature',
  'generator_winding_temp_max',
  'grid_power10min_average',
  'nc1_inside_temp',
  'nacelle_temp',
  'wind_direction_raw',
  'wind_speed_turbulence'],
 5: ['ambient_temperature',
  'generator_winding_temp_max',
  'grid_power10min_average',
  'nc1_inside_temp',
  'nacelle_temp',
  'wind_direction_raw',
  'wind_speed_turbulence'],
 6: ['ambient_temperature',
  'generator_win

In [11]:
# dict with best params (for main script): {turbine id: [scaler, weight, K, p]}
df_res4.set_index('turb')['params'].to_dict()

{0: ['min_max', 'square', 2, 1],
 1: ['z_score', 'linear', 1, 1],
 2: ['z_score', 'linear', 1, 1],
 3: ['robust', 'square', 1, 1],
 4: ['min_max', 'square', 3, 1],
 5: ['min_max', 'square', 3, 1],
 6: ['min_max', 'square', 3, 1],
 7: ['robust', 'square', 1, 1],
 8: ['min_max', 'square', 2, 1],
 9: ['z_score', 'square', 1, 1],
 10: ['z_score', 'square', 2, 1],
 11: ['min_max', 'square', 3, 1],
 12: ['z_score', 'linear', 1, 1],
 13: ['z_score', 'square', 1, 1],
 14: ['min_max', 'square', 2, 1],
 15: ['z_score', 'square', 2, 1]}

In [18]:
# mean of the best validation MAPE over the selected rows (one per turbine unless there are ties)
df_res4.mape.mean()

0.011296764026636761

In [12]:
# total time elapsed since time_start was set at the beginning of the run
print('Working time: ', dt.now() - time_start) 

Working time:  3:04:42.040468
