In [56]:
# basic
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pprint

# plot
import seaborn as sns
import matplotlib.pyplot as plt

# preprocess
#from sklearn.preprocessing import LabelEncoder

# model
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score

import lightgbm as lgb

# optimization
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.metrics import mean_squared_error as MSE

# importance
import eli5
from eli5.sklearn import PermutationImportance

# other
import pickle
import time
import datetime
#import sys
#import gc

In [57]:
train = pd.read_pickle('../features/feature_train_2020-11-07-08-04-37_treated.pkl')
test = pd.read_pickle('../features/feature_test_2020-11-07-08-04-37_treated.pkl')
sample_sub = pd.read_csv('../data/input/sample_submit.csv', header=None, names=['id', 'mpg'])

In [58]:
train

Unnamed: 0,id,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car_name_code,car_brand_code
0,0,23.059782,6,140,110.0,2815,17.977429,80,1,33,6
1,3,17.674521,8,350,150.0,4456,13.514535,72,1,39,6
2,4,17.136353,8,302,140.0,2774,13.209912,79,1,59,11
3,7,22.664666,6,400,85.0,2190,15.196381,71,1,72,16
4,9,17.872018,8,429,220.0,2245,9.621400,70,1,47,8
...,...,...,...,...,...,...,...,...,...,...,...
495,981,22.798447,4,140,148.0,2835,13.477573,82,1,25,5
496,983,35.173640,4,97,67.0,2234,17.542681,80,3,69,15
497,994,17.825448,8,302,220.0,2774,15.177189,76,1,81,20
498,995,28.545147,4,97,150.0,2130,13.324669,70,1,31,5


In [120]:
def score(params):
    params['max_depth'] = int(params['max_depth'])
    params['num_leaves'] = int(params['num_leaves'])
    params['min_data_in_leaf'] = int(params['min_data_in_leaf'])
    
    model = lgb.LGBMRegressor(**params, random_state=0, n_jobs=-1)
    model.fit(X=X_train, y=Y_train, eval_set=(X_valid, Y_valid))
    Y_pred = model.predict(X_valid)
    score = np.sqrt(MSE(y_true=Y_valid, y_pred=Y_pred))
    print(f'params: {params}, score:{score:.4f}')
    
    history.append((model, score))
    return {'loss':score, 'status':STATUS_OK}

In [118]:
param_space = {
    'alpha' : hp.loguniform(
        label='alpha', 
        low=np.log(1e-8), 
        high=np.log(1.0)
    ),
    'bagging_fraction': hp.quniform(
        label='bagging_fraction', low=0.6, high=0.95, q=0.05
    ),
    'feature_fraction': hp.quniform(
        label='feature_fraction', low=0.6, high=0.95, q=0.05
    ),
    'gamma': hp.quniform(
        label='gamma', low=0.1, high=0.4, q=0.1
    ),
    'lambda' : hp.loguniform(
        label='lambda', 
        low=np.log(1e-6), 
        high=np.log(10.0)
    ),
    'max_depth': hp.quniform(
        label='max_depth', low=3, high=9, q=1
    ),
    'min_child_weight': hp.quniform(
        label='min_child_weight', low=1, high=5, q=1
    ),
    'min_data_in_leaf': hp.quniform(
        label='min_data_in_leaf', low=5, high=20, q=2
    ),
    'num_leaves': hp.quniform(
        label='num_leaves', low=50, high=200, q=10
    )
}

In [121]:
history = []
best = []

train_X = train.drop(['id', 'mpg'], axis=1)
train_y = train['mpg']

for i in range(5):
    kf = KFold(n_splits=4, shuffle=True, random_state=i)
    for tr_idx, va_idx in kf.split(train_X):
        X_train = train_X.loc[tr_idx]
        Y_train = train_y.loc[tr_idx]
        X_valid = train_X.loc[va_idx]
        Y_valid = train_y.loc[va_idx]

        fmin(
            fn=score,
            space=param_space, 
            algo=tpe.suggest, 
            trials=Trials(),
            max_evals=10
        )

        history = sorted(history, key=lambda tpl:tpl[1])
        best.append(history[0])

[1]	valid_0's l2: 50.6247                                                                                              
[2]	valid_0's l2: 44.316                                                                                               
[3]	valid_0's l2: 38.7051                                                                                              
[4]	valid_0's l2: 34.6982                                                                                              
[5]	valid_0's l2: 31.4515                                                                                              
[6]	valid_0's l2: 28.6101                                                                                              
[7]	valid_0's l2: 26.2716                                                                                              
[8]	valid_0's l2: 24.2767                                                                                              
[9]	valid_0's l2: 22.6696               

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[11]	valid_0's l2: 14.9098
[12]	valid_0's l2: 13.8239                                                                                             
[13]	valid_0's l2: 13.0619                                                                                             
[14]	valid_0's l2: 12.5428                                                                                             
[15]	valid_0's l2: 11.957                                                                                              
[16]	valid_0's l2: 11.4538                                                                                             
[17]	valid_0's l2: 11.0156                                                                                             
[18]	valid_0's l2: 10.7588                                                                                             
[19]	valid_0's l2: 10.5014                                                                                             
[20]	valid_0'

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[70]	valid_0's l2: 5.52697
[71]	valid_0's l2: 5.54032                                                                                             
[72]	valid_0's l2: 5.52624                                                                                             
[73]	valid_0's l2: 5.54423                                                                                             
[74]	valid_0's l2: 5.545                                                                                               
[75]	valid_0's l2: 5.52838                                                                                             
[76]	valid_0's l2: 5.51943                                                                                             
[77]	valid_0's l2: 5.51161                                                                                             
[78]	valid_0's l2: 5.49743                                                                                             
[79]	valid_0'

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[68]	valid_0's l2: 7.65193                                                                                             
[69]	valid_0's l2: 7.63729                                                                                             
[70]	valid_0's l2: 7.62955                                                                                             
[71]	valid_0's l2: 7.63944                                                                                             
[72]	valid_0's l2: 7.65457                                                                                             
[73]	valid_0's l2: 7.6487                                                                                              
[74]	valid_0's l2: 7.61652                                                                                             
[75]	valid_0's l2: 7.60172                                                                                             
[76]	valid_0's l2: 7.60659              

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[95]	valid_0's l2: 12.9128                                                                                             
[96]	valid_0's l2: 12.9338                                                                                             
[97]	valid_0's l2: 12.8642                                                                                             
[98]	valid_0's l2: 12.9324                                                                                             
[99]	valid_0's l2: 12.9488                                                                                             
[100]	valid_0's l2: 12.9388                                                                                            
params: {'alpha': 0.75357358939517, 'bagging_fraction': 0.8, 'feature_fraction': 0.7000000000000001, 'gamma': 0.30000000000000004, 'lambda': 7.979305601149363e-05, 'max_depth': 3, 'min_child_weight': 2.0, 'min_data_in_leaf': 10, 'num_leaves': 90}, score:3.5971
[1]	valid_0's l2: 5

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[89]	valid_0's l2: 7.64867                                                                                             
[90]	valid_0's l2: 7.63003                                                                                             
[91]	valid_0's l2: 7.61541                                                                                             
[92]	valid_0's l2: 7.60591                                                                                             
[93]	valid_0's l2: 7.59129                                                                                             
[94]	valid_0's l2: 7.58071                                                                                             
[95]	valid_0's l2: 7.55596                                                                                             
[96]	valid_0's l2: 7.52211                                                                                             
[97]	valid_0's l2: 7.53136              

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[14]	valid_0's l2: 16.7742
[15]	valid_0's l2: 16.1012                                                                                             
[16]	valid_0's l2: 15.3912                                                                                             
[17]	valid_0's l2: 14.9336                                                                                             
[18]	valid_0's l2: 14.2799                                                                                             
[19]	valid_0's l2: 13.7561                                                                                             
[20]	valid_0's l2: 13.3192                                                                                             
[21]	valid_0's l2: 12.9357                                                                                             
[22]	valid_0's l2: 12.6959                                                                                             
[23]	valid_0'

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[31]	valid_0's l2: 9.90264
[32]	valid_0's l2: 9.88056                                                                                             
[33]	valid_0's l2: 9.81326                                                                                             
[34]	valid_0's l2: 9.71351                                                                                             
[35]	valid_0's l2: 9.56109                                                                                             
[36]	valid_0's l2: 9.48744                                                                                             
[37]	valid_0's l2: 9.455                                                                                               
[38]	valid_0's l2: 9.43573                                                                                             
[39]	valid_0's l2: 9.46348                                                                                             
[40]	valid_0'

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[35]	valid_0's l2: 7.2155
[36]	valid_0's l2: 7.18987                                                                                             
[37]	valid_0's l2: 7.19629                                                                                             
[38]	valid_0's l2: 7.23816                                                                                             
[39]	valid_0's l2: 7.17542                                                                                             
[40]	valid_0's l2: 7.17167                                                                                             
[41]	valid_0's l2: 7.18183                                                                                             
[42]	valid_0's l2: 7.20723                                                                                             
[43]	valid_0's l2: 7.20393                                                                                             
[44]	valid_0's

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[29]	valid_0's l2: 10.5638                                                                                             
[30]	valid_0's l2: 10.6471                                                                                             
[31]	valid_0's l2: 10.6616                                                                                             
[32]	valid_0's l2: 10.6722                                                                                             
[33]	valid_0's l2: 10.6599                                                                                             
[34]	valid_0's l2: 10.7021                                                                                             
[35]	valid_0's l2: 10.6684                                                                                             
[36]	valid_0's l2: 10.6265                                                                                             
[37]	valid_0's l2: 10.6146              

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[80]	valid_0's l2: 8.6515                                                                                              
[81]	valid_0's l2: 8.63702                                                                                             
[82]	valid_0's l2: 8.60541                                                                                             
[83]	valid_0's l2: 8.57217                                                                                             
[84]	valid_0's l2: 8.56759                                                                                             
[85]	valid_0's l2: 8.58264                                                                                             
[86]	valid_0's l2: 8.55174                                                                                             
[87]	valid_0's l2: 8.56113                                                                                             
[88]	valid_0's l2: 8.54749              

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[55]	valid_0's l2: 7.01778
[56]	valid_0's l2: 6.94848                                                                                             
[57]	valid_0's l2: 6.90167                                                                                             
[58]	valid_0's l2: 6.8642                                                                                              
[59]	valid_0's l2: 6.83717                                                                                             
[60]	valid_0's l2: 6.83057                                                                                             
[61]	valid_0's l2: 6.82072                                                                                             
[62]	valid_0's l2: 6.79834                                                                                             
[63]	valid_0's l2: 6.79266                                                                                             
[64]	valid_0'

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [127]:
for i, model_i in enumerate(best):
    print(i)
    print(model_i[0].get_params)

0
<bound method LGBMModel.get_params of LGBMRegressor(alpha=1.2151384107122742e-05, bagging_fraction=0.6000000000000001,
              feature_fraction=0.8, gamma=0.2, lambda=4.422392617877761e-05,
              max_depth=5, min_child_weight=3.0, min_data_in_leaf=12,
              num_leaves=190, random_state=0)>
1
<bound method LGBMModel.get_params of LGBMRegressor(alpha=1.2633460083202999e-07, bagging_fraction=0.9,
              feature_fraction=0.65, gamma=0.30000000000000004,
              lambda=2.8557142749719215, max_depth=9, min_child_weight=2.0,
              min_data_in_leaf=6, num_leaves=100, random_state=0)>
2
<bound method LGBMModel.get_params of LGBMRegressor(alpha=1.2633460083202999e-07, bagging_fraction=0.9,
              feature_fraction=0.65, gamma=0.30000000000000004,
              lambda=2.8557142749719215, max_depth=9, min_child_weight=2.0,
              min_data_in_leaf=6, num_leaves=100, random_state=0)>
3
<bound method LGBMModel.get_params of LGBMRegressor(alpha

In [191]:
dt = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
submission = pd.DataFrame({'id':sample_sub.id})
importance_list = []

for i, model_i in enumerate(best):
    submission = pd.concat(
        [submission, pd.Series(
            model_i[0].predict(test.drop(['id', 'mpg'], axis=1)),
            name='model_'+str(i)+'_pred'
        )],
        axis=1
    )
    
    pickle.dump(
        model_i[0],
        open('../models/model_' + dt + '_lightgbm_num' + str(i).zfill(2) + '.pickle','wb')
    )
    
    pickle.dump(
        model_i[0].get_params,
        open('../logs/params_' + dt + '_num' + str(i).zfill(2) + '.pickle', 'wb')
    )
    
    pickle.dump(
        model_i[1],
        open('../logs/train_score_' + dt + '_num' + str(i).zfill(2) + '.pickle', 'wb')
    )
    importance_list.append(model_i[0].feature_importances_.tolist())

submission['mean'] = submission.iloc[:, 1:].mean(axis='columns')

# save predictions for an ensemble
pickle.dump(
    submission, 
    open('../logs/test_preds' + dt + '.pickle', 'wb')
)

# create submission file
submission[['id', 'mean']].to_csv('../data/output/sub_' + dt + '_lightgbm.csv', header=False, index=False)

# Check Importance

## lgb importance method

In [199]:
importance_df = pd.DataFrame(importance_list, columns=train_X.columns)
display(importance_df.mean().sort_values(ascending=False))

acceleration      328.75
car_name_code     216.60
displacement      215.70
horsepower        212.60
weight            197.75
model year        161.25
cylinders          65.20
car_brand_code     60.95
origin             53.00
dtype: float64

## eli5 permutation importance

In [369]:
perm_df = pd.DataFrame(train_X.columns.values, columns=['feature'])
for i, model_i in enumerate(best):
    cols = 'weight' + str(i).zfill(2)
    tmp = eli5.explain_weights_df(model_i[0]).rename(columns={'weight':cols})
    perm_df = pd.concat([perm_df, tmp[cols]], axis=1)

perm_df['mean'] = perm_df.mean(axis=1)
display(perm_df)
display(perm_df.set_index('feature')['mean'])

Unnamed: 0,feature,weight00,weight01,weight02,weight03,weight04,weight05,weight06,weight07,weight08,...,weight11,weight12,weight13,weight14,weight15,weight16,weight17,weight18,weight19,mean
0,cylinders,0.597284,0.523872,0.523872,0.595842,0.595842,0.595842,0.595842,0.595842,0.595842,...,0.595842,0.595842,0.595842,0.595842,0.595842,0.595842,0.595842,0.595842,0.595842,0.588717
1,displacement,0.08715,0.123921,0.123921,0.088772,0.088772,0.088772,0.088772,0.088772,0.088772,...,0.088772,0.088772,0.088772,0.088772,0.088772,0.088772,0.088772,0.088772,0.088772,0.092206
2,horsepower,0.083577,0.093699,0.093699,0.082258,0.082258,0.082258,0.082258,0.082258,0.082258,...,0.082258,0.082258,0.082258,0.082258,0.082258,0.082258,0.082258,0.082258,0.082258,0.083468
3,weight,0.05145,0.066017,0.066017,0.060099,0.060099,0.060099,0.060099,0.060099,0.060099,...,0.060099,0.060099,0.060099,0.060099,0.060099,0.060099,0.060099,0.060099,0.060099,0.060258
4,acceleration,0.048199,0.059052,0.059052,0.048394,0.048394,0.048394,0.048394,0.048394,0.048394,...,0.048394,0.048394,0.048394,0.048394,0.048394,0.048394,0.048394,0.048394,0.048394,0.04945
5,model year,0.046614,0.045003,0.045003,0.043314,0.043314,0.043314,0.043314,0.043314,0.043314,...,0.043314,0.043314,0.043314,0.043314,0.043314,0.043314,0.043314,0.043314,0.043314,0.043648
6,origin,0.045289,0.039402,0.039402,0.042756,0.042756,0.042756,0.042756,0.042756,0.042756,...,0.042756,0.042756,0.042756,0.042756,0.042756,0.042756,0.042756,0.042756,0.042756,0.042547
7,car_name_code,0.028379,0.034461,0.034461,0.031317,0.031317,0.031317,0.031317,0.031317,0.031317,...,0.031317,0.031317,0.031317,0.031317,0.031317,0.031317,0.031317,0.031317,0.031317,0.031485
8,car_brand_code,0.012058,0.014573,0.014573,0.007248,0.007248,0.007248,0.007248,0.007248,0.007248,...,0.007248,0.007248,0.007248,0.007248,0.007248,0.007248,0.007248,0.007248,0.007248,0.008221


feature
cylinders         0.588717
displacement      0.092206
horsepower        0.083468
weight            0.060258
acceleration      0.049450
model year        0.043648
origin            0.042547
car_name_code     0.031485
car_brand_code    0.008221
Name: mean, dtype: float64