In [46]:
# basic
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pprint

# plot
import seaborn as sns
import matplotlib.pyplot as plt

# model
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

import lightgbm as lgb

# optimization
from functools import partial
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate

# importance
import eli5
from eli5.sklearn import PermutationImportance

# other
import pickle
import time
import datetime
#import sys
#import gc

In [47]:
train = pd.read_pickle('../features/feature_train_2020-11-09-06-43-14_treated.pkl')
test = pd.read_pickle('../features/feature_test_2020-11-09-06-43-14_treated.pkl')
sample_sub = pd.read_csv('../data/input/sample_submit.csv', header=None, names=['id', 'mpg'])

In [3]:
train

Unnamed: 0,id,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,pwr,car_name_code,car_brand_code
0,0,23.059782,6,140,110.0,2815,17.977429,80,1,25.590909,27,6
1,3,17.674521,8,350,150.0,4456,13.514535,72,1,29.706667,33,6
2,4,17.136353,8,302,140.0,2774,13.209912,79,1,19.814286,49,11
3,7,22.664666,6,400,85.0,2190,15.196381,71,1,25.764706,60,16
4,9,17.872018,8,429,220.0,2245,9.621400,70,1,10.204545,38,8
...,...,...,...,...,...,...,...,...,...,...,...,...
495,981,22.798447,4,140,148.0,2835,13.477573,82,1,19.155405,20,5
496,983,35.173640,4,97,67.0,2234,17.542681,80,3,33.343284,58,15
497,994,17.825448,8,302,220.0,2774,15.177189,76,1,12.609091,68,20
498,995,28.545147,4,97,150.0,2130,13.324669,70,1,14.200000,25,5


In [4]:
test

Unnamed: 0,id,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,pwr,car_name_code,car_brand_code
500,1,,4,98,67.0,2000,15.049795,81,1,29.850746,58,14
501,2,,4,97,90.0,2720,15.339172,79,1,30.222222,31,6
502,5,,4,90,75.0,2807,17.821599,77,1,37.426667,34,6
503,6,,4,140,110.0,2807,13.780354,82,2,25.518182,8,2
504,8,,4,121,85.0,3070,13.688921,73,2,36.117647,68,18
...,...,...,...,...,...,...,...,...,...,...,...,...
995,992,,4,225,105.0,3870,15.376311,76,1,36.857143,20,4
996,993,,4,140,112.0,2720,13.616843,82,1,24.285714,8,2
997,996,,8,318,110.0,2774,13.272636,75,1,25.218182,46,8
998,998,,4,90,75.0,2807,17.792560,77,1,37.426667,34,6


In [45]:
def RMSE(y_true, y_pred):
    return(np.sqrt(MSE(y_true, y_pred)))

In [64]:
def score(X, y, args):
    args['max_depth'] = int(args['max_depth'])
    args['num_leaves'] = int(args['num_leaves'])
    args['min_data_in_leaf'] = int(args['min_data_in_leaf'])
    
    model = lgb.LGBMRegressor(**args)
    
    kf = KFold(n_splits=4, shuffle=True, random_state=1)
    
    score_func = {
        'custom_rmse':make_scorer(RMSE)
    }
    
    scores = cross_validate(model, X=X, y=y, cv=kf, scoring=score_func)
    
    history.append((model, scores))
    
    return {'loss':scores['test_custom_rmse'].mean(), 'status':STATUS_OK, 'params':args}

# def score(params):
    params['max_depth'] = int(params['max_depth'])
    params['num_leaves'] = int(params['num_leaves'])
    params['min_data_in_leaf'] = int(params['min_data_in_leaf'])
    
    model = lgb.LGBMRegressor(**params, random_state=0, n_jobs=-1)
    model.fit(X=X_train, y=Y_train, eval_set=(X_valid, Y_valid))
    Y_pred = model.predict(X_valid)
    score = RMSE(y_true=Y_valid, y_pred=Y_pred))
    print(f'params: {params}, score:{score:.4f}')
    
    history.append((model, score))
    return {'loss':score, 'status':STATUS_OK}

In [57]:
param_space = {
    'alpha' : hp.loguniform(
        label='alpha', low=np.log(1e-8), high=np.log(1.0)
    ),
    'bagging_fraction': hp.quniform(
        label='bagging_fraction', low=0.6, high=0.95, q=0.05
    ),
    'feature_fraction': hp.quniform(
        label='feature_fraction', low=0.6, high=0.95, q=0.05
    ),
    'gamma': hp.quniform(
        label='gamma', low=0.1, high=0.4, q=0.1
    ),
    'lambda' : hp.loguniform(label='lambda', low=np.log(1e-6), high=np.log(10.0)
    ),
    'max_depth': hp.quniform(
        label='max_depth', low=3, high=9, q=1
    ),
    'min_child_weight': hp.quniform(
        label='min_child_weight', low=1, high=5, q=1
    ),
    'min_data_in_leaf': hp.quniform(
        label='min_data_in_leaf', low=5, high=20, q=2
    ),
    'num_leaves': hp.quniform(
        label='num_leaves', low=20, high=100, q=10
    )
}

In [76]:
history = []

train_X = train.drop(['id', 'mpg'], axis=1)
train_y = train['mpg']

f = partial(score, train_X, train_y)

best = fmin(
    fn=f,
    space=param_space, 
    algo=tpe.suggest, 
    trials=Trials(),
    max_evals=10
)

100%|████████████████████████████████████████████████| 10/10 [00:00<00:00, 10.92trial/s, best loss: 3.0892201390924496]


# history = []
best = []

train_X = train.drop(['id', 'mpg'], axis=1)
train_y = train['mpg']
for i in range(5):
    kf = KFold(n_splits=4, shuffle=True, random_state=i)
    for tr_idx, va_idx in kf.split(train_X):
        X_train = train_X.loc[tr_idx]
        Y_train = train_y.loc[tr_idx]
        X_valid = train_X.loc[va_idx]
        Y_valid = train_y.loc[va_idx]

        fmin(
            fn=score,
            space=param_space, 
            algo=tpe.suggest, 
            trials=Trials(),
            max_evals=10
        )

        history = sorted(history, key=lambda tpl:tpl[1])
        best.append(history[0])

In [77]:
for i, hist_i in enumerate(history):
    print("No.", i)
    print("model:", hist_i[0])
    print("score:", hist_i[1]['test_custom_rmse'], "mean:", hist_i[1]['test_custom_rmse'].mean())
    print("\n")

No. 0
model: LGBMRegressor(alpha=4.149281472843539e-05, bagging_fraction=0.75,
              feature_fraction=0.75, gamma=0.1, lambda=7.795095478892617e-05,
              max_depth=9, min_child_weight=1.0, min_data_in_leaf=18,
              num_leaves=50)
score: [3.2786825  2.80939554 3.44991633 2.95085521] mean: 3.122212396238354


No. 1
model: LGBMRegressor(alpha=2.9209154803266497e-08, bagging_fraction=0.8,
              feature_fraction=0.75, gamma=0.30000000000000004,
              lambda=7.230514090025771e-06, max_depth=6, min_child_weight=4.0,
              min_data_in_leaf=16, num_leaves=30)
score: [3.20166467 2.82994765 3.53111922 2.94883757] mean: 3.1278922754375715


No. 2
model: LGBMRegressor(alpha=0.0407356190781153, bagging_fraction=0.8500000000000001,
              feature_fraction=0.8500000000000001, gamma=0.2,
              lambda=0.021491881098921375, max_depth=7, min_child_weight=2.0,
              min_data_in_leaf=20, num_leaves=30)
score: [3.32103    2.68941548 3.6

In [78]:
# 結果を出力する
print('best parameters:', space_eval(param_space, best))

best parameters: {'alpha': 0.8128928554247173, 'bagging_fraction': 0.9, 'feature_fraction': 0.7000000000000001, 'gamma': 0.30000000000000004, 'lambda': 5.438027524724452e-06, 'max_depth': 9.0, 'min_child_weight': 5.0, 'min_data_in_leaf': 18.0, 'num_leaves': 40.0}


In [81]:
dt = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
submission = pd.DataFrame({'id':sample_sub.id})
importance_list = []

for i, model_i in enumerate(history):
    model_i[0].fit(train_X, train_y)
    
    submission = pd.concat(
        [submission, pd.Series(
            model_i[0].predict(test.drop(['id', 'mpg'], axis=1)),
            name='model_'+str(i)+'_pred'
        )],
        axis=1
    )
    
    pickle.dump(
        model_i[0],
        open('../models/model_' + dt + '_lightgbm_num' + str(i).zfill(2) + '.pickle','wb')
    )
    
    pickle.dump(
        model_i[0].get_params,
        open('../logs/params_' + dt + '_num' + str(i).zfill(2) + '.pickle', 'wb')
    )
    
    pickle.dump(
        model_i[1],
        open('../logs/train_score_' + dt + '_num' + str(i).zfill(2) + '.pickle', 'wb')
    )
    importance_list.append(model_i[0].feature_importances_.tolist())

submission['mean'] = submission.iloc[:, 1:].mean(axis='columns')

# save predictions for an ensemble
pickle.dump(
    submission, 
    open('../logs/test_preds' + dt + '.pickle', 'wb')
)

# create submission file
submission[['id', 'mean']].to_csv('../data/output/sub_' + dt + '_lightgbm.csv', header=False, index=False)

# Check Importance

## lgb importance method

In [55]:
importance_df = pd.DataFrame(importance_list, columns=train_X.columns)
display(importance_df.mean().sort_values(ascending=False))

acceleration      512.35
car_name_code     298.30
pwr               291.80
displacement      261.30
weight            259.75
horsepower        227.50
model year        179.55
cylinders          79.95
origin             61.45
car_brand_code     32.80
dtype: float64

## eli5 permutation importance

In [56]:
perm_df = pd.DataFrame(train_X.columns.values, columns=['feature'])
for i, model_i in enumerate(best):
    cols = 'weight' + str(i).zfill(2)
    tmp = eli5.explain_weights_df(model_i[0]).rename(columns={'weight':cols})
    perm_df = pd.concat([perm_df, tmp[cols]], axis=1)

perm_df['mean'] = perm_df.mean(axis=1)
display(perm_df)
display(perm_df.set_index('feature')['mean'])

Unnamed: 0,feature,weight00,weight01,weight02,weight03,weight04,weight05,weight06,weight07,weight08,...,weight11,weight12,weight13,weight14,weight15,weight16,weight17,weight18,weight19,mean
0,cylinders,0.484947,0.376307,0.376307,0.526398,0.526398,0.526398,0.526398,0.526398,0.526398,...,0.526398,0.526398,0.526398,0.526398,0.526398,0.526398,0.526398,0.526398,0.526398,0.509316
1,displacement,0.118834,0.14146,0.14146,0.123191,0.123191,0.123191,0.123191,0.123191,0.123191,...,0.123191,0.123191,0.123191,0.123191,0.123191,0.123191,0.123191,0.123191,0.123191,0.1248
2,horsepower,0.114668,0.10664,0.10664,0.073876,0.073876,0.073876,0.073876,0.073876,0.073876,...,0.073876,0.073876,0.073876,0.073876,0.073876,0.073876,0.073876,0.073876,0.073876,0.079192
3,weight,0.060543,0.094706,0.094706,0.061911,0.061911,0.061911,0.061911,0.061911,0.061911,...,0.061911,0.061911,0.061911,0.061911,0.061911,0.061911,0.061911,0.061911,0.061911,0.065122
4,acceleration,0.056553,0.092374,0.092374,0.053927,0.053927,0.053927,0.053927,0.053927,0.053927,...,0.053927,0.053927,0.053927,0.053927,0.053927,0.053927,0.053927,0.053927,0.053927,0.057903
5,model year,0.046974,0.067484,0.067484,0.053185,0.053185,0.053185,0.053185,0.053185,0.053185,...,0.053185,0.053185,0.053185,0.053185,0.053185,0.053185,0.053185,0.053185,0.053185,0.054304
6,origin,0.042203,0.039206,0.039206,0.039637,0.039637,0.039637,0.039637,0.039637,0.039637,...,0.039637,0.039637,0.039637,0.039637,0.039637,0.039637,0.039637,0.039637,0.039637,0.039722
7,pwr,0.03629,0.033729,0.033729,0.036614,0.036614,0.036614,0.036614,0.036614,0.036614,...,0.036614,0.036614,0.036614,0.036614,0.036614,0.036614,0.036614,0.036614,0.036614,0.036309
8,car_name_code,0.026963,0.027332,0.027332,0.029793,0.029793,0.029793,0.029793,0.029793,0.029793,...,0.029793,0.029793,0.029793,0.029793,0.029793,0.029793,0.029793,0.029793,0.029793,0.029405
9,car_brand_code,0.012025,0.020763,0.020763,0.001468,0.001468,0.001468,0.001468,0.001468,0.001468,...,0.001468,0.001468,0.001468,0.001468,0.001468,0.001468,0.001468,0.001468,0.001468,0.003925


feature
cylinders         0.509316
displacement      0.124800
horsepower        0.079192
weight            0.065122
acceleration      0.057903
model year        0.054304
origin            0.039722
pwr               0.036309
car_name_code     0.029405
car_brand_code    0.003925
Name: mean, dtype: float64