In [1]:
# basic
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pprint

# plot
import seaborn as sns
import matplotlib.pyplot as plt

# model
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

import lightgbm as lgb

# optimization
from functools import partial
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate

# importance
import eli5
from eli5.sklearn import PermutationImportance

# other
import pickle
import time
import datetime
#import sys
#import gc



In [3]:
train = pd.read_pickle('../features/feature_train_2020-11-07-08-04-37_treated.pkl')
test = pd.read_pickle('../features/feature_test_2020-11-07-08-04-37_treated.pkl')
sample_sub = pd.read_csv('../data/input/sample_submit.csv', header=None, names=['id', 'mpg'])

train_X = train.drop(['id', 'mpg'], axis=1)
train_y = train['mpg']

In [6]:
def RMSE(y_true, y_pred):
    return(np.sqrt(MSE(y_true, y_pred)))

def objective(X, y, args):
    args['max_depth'] = int(args['max_depth'])
    args['num_leaves'] = int(args['num_leaves'])
    args['min_data_in_leaf'] = int(args['min_data_in_leaf'])
    
    model = lgb.LGBMRegressor(**args)
    
    kf = KFold(n_splits=4, shuffle=True, random_state=1)
    
    score_func = {
        'score':make_scorer(RMSE)
    }
    
    scores = cross_validate(model, X=X, y=y, cv=kf, return_estimator=True, scoring=score_func)
    
    #history.append((model, scores))
    
    return {
        'loss':scores['test_score'].mean(),
        'status':STATUS_OK,
        'params':args
    }

In [8]:
param_space = {
    'alpha' : hp.loguniform(
        label='alpha', low=np.log(1e-8), high=np.log(1.0)
    ),
    'bagging_fraction': hp.quniform(
        label='bagging_fraction', low=0.6, high=0.95, q=0.05
    ),
    'feature_fraction': hp.quniform(
        label='feature_fraction', low=0.6, high=0.95, q=0.05
    ),
    'gamma': hp.quniform(
        label='gamma', low=0.1, high=0.4, q=0.1
    ),
    'lambda' : hp.loguniform(label='lambda', low=np.log(1e-6), high=np.log(10.0)
    ),
    'max_depth': hp.quniform(
        label='max_depth', low=3, high=9, q=1
    ),
    'min_child_weight': hp.quniform(
        label='min_child_weight', low=1, high=5, q=1
    ),
    'min_data_in_leaf': hp.quniform(
        label='min_data_in_leaf', low=5, high=20, q=2
    ),
    'num_leaves': hp.quniform(
        label='num_leaves', low=20, high=100, q=10
    )
}

#history = []

f = partial(objective, train_X, train_y)
trials=Trials()

best = fmin(
    fn=f,
    space=param_space, 
    algo=tpe.suggest, 
    trials=trials,
    max_evals=100
)

pprint.pprint(space_eval(param_space, best))

100%|██████████████████████████████████████████████| 100/100 [00:12<00:00,  8.07trial/s, best loss: 2.9141414306080233]
{'alpha': 0.030336593233652006,
 'bagging_fraction': 0.8,
 'feature_fraction': 0.6000000000000001,
 'gamma': 0.30000000000000004,
 'lambda': 3.0593234609941795e-05,
 'max_depth': 6.0,
 'min_child_weight': 4.0,
 'min_data_in_leaf': 6.0,
 'num_leaves': 60.0}


In [10]:
result = [(i['loss'], i['params']) for i in trials.results]
result.sort()
pprint.pprint(result[:5])

[(2.9141414306080233,
  {'alpha': 0.030336593233652006,
   'bagging_fraction': 0.8,
   'feature_fraction': 0.6000000000000001,
   'gamma': 0.30000000000000004,
   'lambda': 3.0593234609941795e-05,
   'max_depth': 6,
   'min_child_weight': 4.0,
   'min_data_in_leaf': 6,
   'num_leaves': 60}),
 (2.9183548229001017,
  {'alpha': 2.8374896060867815e-05,
   'bagging_fraction': 0.75,
   'feature_fraction': 0.6000000000000001,
   'gamma': 0.1,
   'lambda': 6.234928963455349e-05,
   'max_depth': 7,
   'min_child_weight': 5.0,
   'min_data_in_leaf': 6,
   'num_leaves': 50}),
 (2.919663970552579,
  {'alpha': 0.15354533897545977,
   'bagging_fraction': 0.8,
   'feature_fraction': 0.6000000000000001,
   'gamma': 0.1,
   'lambda': 0.0007765396846388249,
   'max_depth': 7,
   'min_child_weight': 5.0,
   'min_data_in_leaf': 6,
   'num_leaves': 50}),
 (2.9259190299423343,
  {'alpha': 1.567470962361922e-08,
   'bagging_fraction': 0.75,
   'feature_fraction': 0.6000000000000001,
   'gamma': 0.1,
   'lamb

In [12]:
# 結果を出力する
print('best parameters:')
pprint.pprint(space_eval(param_space, best))

best parameters:
{'alpha': 0.030336593233652006,
 'bagging_fraction': 0.8,
 'feature_fraction': 0.6000000000000001,
 'gamma': 0.30000000000000004,
 'lambda': 3.0593234609941795e-05,
 'max_depth': 6.0,
 'min_child_weight': 4.0,
 'min_data_in_leaf': 6.0,
 'num_leaves': 60.0}


In [96]:
dt = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
submission = pd.DataFrame({'id':sample_sub.id})
importance_list = []

for i, model_i in enumerate(history):
    model_i[0].fit(train_X, train_y)
    
    submission = pd.concat(
        [submission, pd.Series(
            model_i[0].predict(test.drop(['id', 'mpg'], axis=1)),
            name='model_'+str(i)+'_pred'
        )],
        axis=1
    )
    
    pickle.dump(
        model_i[0],
        open('../models/model_' + dt + '_lightgbm_num' + str(i).zfill(2) + '.pickle','wb')
    )
    
    pickle.dump(
        model_i[0].get_params,
        open('../logs/params_' + dt + '_num' + str(i).zfill(2) + '.pickle', 'wb')
    )
    
    pickle.dump(
        model_i[1],
        open('../logs/train_score_' + dt + '_num' + str(i).zfill(2) + '.pickle', 'wb')
    )
    importance_list.append(model_i[0].feature_importances_.tolist())

submission['mean'] = submission.iloc[:, 1:].mean(axis='columns')

# save predictions for an ensemble
pickle.dump(
    submission, 
    open('../logs/test_preds' + dt + '.pickle', 'wb')
)

# create submission file
submission[['id', 'mean']].to_csv('../data/output/sub_' + dt + '_lightgbm.csv', header=False, index=False)

# Check Importance

## lgb importance method

In [97]:
importance_df = pd.DataFrame(importance_list, columns=train_X.columns)
display(importance_df.mean().sort_values(ascending=False))

acceleration      310.5
weight            182.6
car_name_code     181.8
horsepower        166.8
displacement      149.6
model year        124.2
cylinders          55.0
origin             46.3
car_brand_code     27.1
dtype: float64

## eli5 permutation importance

In [98]:
perm_df = pd.DataFrame(train_X.columns.values, columns=['feature'])
for i, model_i in enumerate(history):
    cols = 'weight' + str(i).zfill(2)
    tmp = eli5.explain_weights_df(model_i[0]).rename(columns={'weight':cols})
    perm_df = pd.concat([perm_df, tmp[cols]], axis=1)

perm_df['mean'] = perm_df.mean(axis=1)
display(perm_df)
display(perm_df.set_index('feature')['mean'])

Unnamed: 0,feature,weight00,weight01,weight02,weight03,weight04,weight05,weight06,weight07,weight08,weight09,mean
0,cylinders,0.500572,0.580648,0.532222,0.51003,0.572876,0.578269,0.576976,0.502479,0.518358,0.586519,0.545895
1,displacement,0.091122,0.094082,0.090519,0.093108,0.092991,0.087276,0.096944,0.085977,0.085938,0.095868,0.091383
2,horsepower,0.090948,0.08017,0.077069,0.09171,0.084955,0.08308,0.076874,0.082024,0.085481,0.068291,0.08206
3,weight,0.075611,0.065623,0.068483,0.071407,0.057428,0.061971,0.070866,0.074293,0.074717,0.065153,0.068555
4,acceleration,0.068425,0.050223,0.066388,0.065264,0.056718,0.057288,0.057465,0.071066,0.07235,0.052189,0.061738
5,model year,0.062984,0.049863,0.066161,0.062372,0.052632,0.053,0.041444,0.069171,0.065351,0.052116,0.057509
6,origin,0.062755,0.043239,0.062823,0.06047,0.040902,0.040096,0.037118,0.064444,0.049187,0.045438,0.050647
7,car_name_code,0.038518,0.029677,0.030111,0.034058,0.037962,0.037402,0.036314,0.040233,0.039639,0.034377,0.035829
8,car_brand_code,0.009065,0.006476,0.006224,0.011581,0.003535,0.001617,0.006,0.010311,0.008978,4.8e-05,0.006384


feature
cylinders         0.545895
displacement      0.091383
horsepower        0.082060
weight            0.068555
acceleration      0.061738
model year        0.057509
origin            0.050647
car_name_code     0.035829
car_brand_code    0.006384
Name: mean, dtype: float64