### LightGBM hyperparameter search
- a separate model is trained for each turbine_id
- parameters searched:
  - num_leaves
  - subsample
  - colsample_bytree

In [1]:
# All imports used anywhere in the notebook live here so that
# "Restart Kernel -> Run All" works on a fresh kernel.
# Standard library
import gc
from datetime import datetime as dt

# Third-party
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics, preprocessing

# The bundled seaborn style was renamed to 'seaborn-v0_8' in matplotlib 3.6 and
# the old name was removed later; use whichever name this installation offers.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
pd.options.display.float_format = '{:,.5f}'.format

In [2]:
# Wall-clock start of the notebook run; the final cell prints the time elapsed since this point.
time_start = dt.now()

In [3]:
# Load the train and test CSVs and stack them into a single frame `df`.
# Every row gets a `row_id`: 0..len(train)-1 for train rows and
# 1000000..1000000+len(test)-1 for test rows.
# NOTE(review): the two id ranges only stay disjoint while train.csv has at
# most 1,000,000 rows - confirm against the data.

# read train dataset
df_train = pd.read_csv('data/train.csv')
df_train['row_id'] = range(len(df_train))

# read test dataset (ids offset so they do not start at 0 like the train ids)
df_test = pd.read_csv('data/new/test.csv')
df_test['row_id'] = range(1000000, 1000000 + len(df_test))

# merge; ignore_index=True gives the combined frame a unique 0..n-1 index
# instead of keeping each file's own labels, which would repeat after stacking.
df = pd.concat([df_train, df_test], ignore_index=True)

# free the per-file frames, only `df` is used from here on
del df_train, df_test
gc.collect()

# Seeded random fold label (0, 1 or 2) for every row, so the split is
# reproducible; prepare_sets() treats fold 0 as the validation fold.
np.random.seed(1234)
df['fold'] = np.random.randint(low=0, high=3, size=len(df))

# Columns removed before modelling; names that are not present are skipped.
feats_drop = [
    'timestamp',
    'active_power_calculated_by_converter',
    'reactice_power_calculated_by_converter',
]
df = df.drop(columns=feats_drop, errors='ignore')

# Replace each categorical column by integer codes and keep the fitted
# encoders (same order as feats_cat) in list_lbl.
feats_cat = ['turbine_id']
list_lbl = []
for col in feats_cat:
    lbl = preprocessing.LabelEncoder()
    df[col] = lbl.fit_transform(df[col])
    list_lbl.append(lbl)
    
# Columns used as model inputs: prepare_sets() selects exactly these columns
# from `df` and passes them to LightGBM as the feature names.
feats_used = [   
    "active_power_raw",
    "ambient_temperature",
    "generator_speed",
    "generator_winding_temp_max",
    "grid_power10min_average",
    "nc1_inside_temp",
    "nacelle_temp",
    "reactive_power",
    "wind_direction_raw",
    "wind_speed_raw",
    "wind_speed_turbulence",  
]

### Functions

In [4]:
def ts_metrics(y_true, y_pred):
    """Compute regression error metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    dict
        Keys ``'mae'``, ``'mse'``, ``'mape'`` and ``'smape'``. MAPE and SMAPE
        are returned as fractions (they are not multiplied by 100).

    Notes
    -----
    There is no guard against zero denominators: a zero in ``y_true`` makes
    ``'mape'`` inf/nan, and a pair where both values are zero makes
    ``'smape'`` nan.
    """
    # Work on plain positional arrays: a pandas Series would otherwise be
    # aligned on its index labels instead of being compared element by element.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    err = y_true - y_pred
    return {
        'mae': metrics.mean_absolute_error(y_true, y_pred),
        'mse': metrics.mean_squared_error(y_true, y_pred),
        'mape': np.mean(np.abs(err / y_true)),
        # SMAPE is normalised by |y_true| + |y_pred|; the signed y_true would
        # shrink (or flip the sign of) the denominator for negative targets.
        'smape': np.mean(np.abs(2 * err) / (np.abs(y_true) + np.abs(y_pred))),
    }

In [5]:
def prepare_sets(turbine=None):
    """Build the training and validation sets for one turbine.

    Reads the module-level ``df`` and ``feats_used``. Only rows with a
    non-null ``Target`` are used: rows with ``fold == 0`` form the validation
    set, rows of the other folds form the training set.

    Parameters
    ----------
    turbine : optional
        Encoded ``turbine_id`` value to select. When omitted, the module-level
        variable ``turb`` is used, so existing ``prepare_sets()`` calls keep
        working unchanged.

    Returns
    -------
    tuple
        ``(x, y, x_, xv, yv, xv_)``: training features / target, the
        ``lgb.Dataset`` built from them, then the same three objects for the
        validation rows.
    """
    feat_target = 'Target'
    if turbine is None:
        # Legacy behaviour: the turbine is taken from the global loop variable.
        turbine = turb

    has_target = df[feat_target].notnull()
    is_turbine = df.turbine_id == turbine
    is_valid_fold = df.fold == 0

    filt_train = has_target & is_turbine & ~is_valid_fold
    filt_valid = has_target & is_turbine & is_valid_fold

    # A single .loc selection per subset avoids the intermediate copy made by
    # df[mask][cols]; the unlabelled (test) rows are not needed here.
    x, y = df.loc[filt_train, feats_used], df.loc[filt_train, feat_target]
    xv, yv = df.loc[filt_valid, feats_used], df.loc[filt_valid, feat_target]

    x_ = lgb.Dataset(x.values,
                     label=y,
                     feature_name=feats_used,
                     free_raw_data=False)
    xv_ = lgb.Dataset(xv.values,
                      label=yv,
                      feature_name=feats_used,
                      free_raw_data=False)
    return x, y, x_, xv, yv, xv_

In [6]:
def lgb_search(num_leaves_grid=(31, 63, 127, 255, 511, 1023),
               subsample_grid=(1,),
               colsample_bytree_grid=(1,)):
    """Grid-search LightGBM parameters on the currently prepared sets.

    Uses the module-level objects returned by ``prepare_sets``: the model is
    trained on ``x_`` with ``xv_`` as validation set, then its predictions
    for ``xv`` are scored against ``yv`` with ``ts_metrics``.

    Note that the scores are computed on the same validation rows that drive
    early stopping, so they are optimistic estimates.

    Parameters
    ----------
    num_leaves_grid, subsample_grid, colsample_bytree_grid : iterable
        Candidate values for each parameter; every combination is trained.
        The defaults reproduce the original hard-coded grid.

    Returns
    -------
    pandas.DataFrame
        One row per combination with the ``ts_metrics`` columns followed by
        ``leaves``, ``subsample`` and ``colsample_bytree``.
    """
    base_params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
    #     'metric': 'auc',
        'learning_rate': 0.1,
        'num_boost_round': 10000,
        'early_stopping_rounds': 50,

        'nthread': -1,
        'verbose': 0,

        # fitting params
        'boost_from_average': True,
        'scale_pos_weight': 1,

        'max_bin': 255,

        'num_leaves': 63,
        'max_depth': -1,
        'min_child_samples': 20,
        'min_child_weight': 1e-3,
        'min_split_gain': 0,

        'subsample': 1,
        # Row subsampling is only applied when a bagging frequency is set;
        # without it any subsample < 1 in the grid would silently do nothing.
        # With subsample == 1 this setting changes nothing.
        'subsample_freq': 1,
        'colsample_bytree': 1,

        'reg_alpha': 0,
        'reg_lambda': 0,
    }

    records = []
    for num_leaves in num_leaves_grid:
        for subsample in subsample_grid:
            for colsample_bytree in colsample_bytree_grid:
                # Fresh dict per combination so the base settings are never mutated.
                params = dict(base_params,
                              num_leaves=num_leaves,
                              subsample=subsample,
                              colsample_bytree=colsample_bytree)
                # NOTE(review): `evals_result` / `verbose_eval` are keyword
                # arguments of older LightGBM releases; LightGBM 4.x replaced
                # them with callbacks - pin the version or migrate.
                evals_results = {}
                model_lgb = lgb.train(params,
                                      x_,
                                      valid_sets=[x_, xv_],
                                      valid_names=['train', 'valid'],
                                      evals_result=evals_results,
                                      verbose_eval=50000,
                                      feval=None)
                pred = model_lgb.predict(xv)

                # One record per combination: metrics first, then the
                # parameters, which fixes the column order of the result.
                record = ts_metrics(yv, pred)
                record['leaves'] = num_leaves
                record['subsample'] = subsample
                record['colsample_bytree'] = colsample_bytree
                records.append(record)

    return pd.DataFrame(records)

### Main loop

In [7]:
# Run the parameter search for every turbine and stack the per-turbine results.
# NOTE(review): 16 is hard-coded; presumably the number of distinct encoded
# turbine_id values - confirm against the data.
N_TURBINES = 16

dt_start = dt.now()
list_res = []
# `turb` and the six objects returned by prepare_sets() are deliberately kept
# as module-level names: prepare_sets() and lgb_search() read them as globals.
for turb in range(N_TURBINES):
    print('------------')
    print('turb =', turb)
    print(dt.now() - dt_start)

    x, y, x_, xv, yv, xv_ = prepare_sets()
    df_res0 = lgb_search()
    df_res0['turb'] = turb
    list_res.append(df_res0)

# Concatenate once at the end instead of growing the frame inside the loop.
df_res3 = pd.concat(list_res) if list_res else pd.DataFrame()

------------
turb = 0
0:00:00.001012
You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1094]	train's l2: 0.619471	valid's l2: 1.89582
You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1991]	train's l2: 0.076811	valid's l2: 1.81091
You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[965]	train's l2: 0.071851	valid's l2: 1.78434
You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[533]	train's l2: 0.0515228	valid's l2: 1.75217
You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[7

### The best results

In [8]:
# Give the stacked per-turbine results a fresh 0..n-1 index (the old one is
# discarded) and persist them for the analysis cells below.
df_res3 = df_res3.reset_index(drop=True)
df_res3.to_pickle('data/df_lgb.pkl')

In [12]:
# Reload the saved search results and keep, for every turbine, the parameter
# combination(s) with the lowest MAPE (rows tied for the minimum are all kept).
# Only unpickle files written by this notebook; never load untrusted pickles.
df_res3 = pd.read_pickle('data/df_lgb.pkl')
# The string alias 'min' selects pandas' own groupby reduction; passing the
# Python builtin `min` raises a FutureWarning on recent pandas releases.
df_res3['mape_min'] = df_res3.groupby('turb')['mape'].transform('min')
df_res4 = df_res3[df_res3['mape'] == df_res3['mape_min']]
df_res4

Unnamed: 0,mae,mse,mape,smape,leaves,subsample,colsample_bytree,turb,mape_min
5,0.86888,1.77134,0.01736,0.01735,1023,1,1,0,0.01736
11,0.62209,0.84761,0.01334,0.01327,1023,1,1,1,0.01334
17,0.45891,0.42907,0.01006,0.01004,1023,1,1,2,0.01006
23,0.4545,0.49963,0.0099,0.00986,1023,1,1,3,0.0099
28,0.60037,0.77105,0.01295,0.0129,511,1,1,4,0.01295
35,0.65623,0.81861,0.01439,0.01438,1023,1,1,5,0.01439
41,0.50113,0.48208,0.01119,0.01118,1023,1,1,6,0.01119
47,0.51864,0.55482,0.01113,0.01111,1023,1,1,7,0.01113
53,0.51668,0.56886,0.01138,0.01135,1023,1,1,8,0.01138
59,0.47514,0.44594,0.01,0.00999,1023,1,1,9,0.01


In [10]:
# Average of the best (lowest) MAPE over the rows selected in df_res4.
df_res4.mape.mean()

0.013066885387846559

In [11]:
# Total wall-clock time elapsed since `time_start` was recorded.
print('Working time: ', dt.now() - time_start) 

Working time:  0:19:09.073529
