In [1]:
# basic
import numpy as np
import pandas as pd

# plot
import seaborn as sns
from matplotlib import cm
import matplotlib.pyplot as plt
import pprint

# model
from sklearn.svm import SVR
from lightgbm import LGBMRegressor as lgb

# metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error as MSE

# optimization
from functools import partial
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval

# importance
import eli5
from eli5.sklearn import PermutationImportance

# other
import pickle
import time
import datetime



In [6]:
train = pd.read_pickle('../features/feature_train_2020-11-07-08-04-37_treated.pkl')
test = pd.read_pickle('../features/feature_test_2020-11-07-08-04-37_treated.pkl')
sample_sub = pd.read_csv('../data/input/sample_submit.csv', header=None, names=['id', 'mpg'])

X_train = train.drop(['id', 'mpg'], axis=1)
y_train = train['mpg']

In [18]:
def RMSE(y_true, y_pred):
    return(np.sqrt(MSE(y_true, y_pred)))

def objective(X, y, args):
    reg_models = {
        'lgb':lgb,
        'svr':SVR
    }
    
    if args['model_type'] == 'lgb':
        args['max_depth'] = int(args['max_depth'])
        args['num_leaves'] = int(args['num_leaves'])
        args['min_data_in_leaf'] = int(args['min_data_in_leaf'])
    
    reg_model = reg_models.get(args['model_type'])
    del args['model_type']  
    model = reg_model(**args)
    
    kf = KFold(n_splits=4, shuffle=True, random_state=1)
    score_func = {'score':make_scorer(RMSE)}    
    cv_result = cross_validate(model, X=X, y=y, cv=kf, scoring=score_func)
    
    return {
        'loss':cv_result['test_score'].mean(),
        'status':STATUS_OK,
        'params':args
    }

In [19]:
# Setting Paarameter Space
params_svr = {
    'model_type': 'svr',
    'C': hp.loguniform(
        label='C', low=np.log(1e-8), high=np.log(1.0)
    ),
    'gamma': hp.loguniform(
        label='svr_gamma', low=np.log(1e-8), high=np.log(1.0)
    ),
    'epsilon': hp.loguniform(
        label='epsilon', low=np.log(1e-8), high=np.log(1.0)
    )
}

params_lgb = {
    'model_type': 'lgb',
    'alpha' : hp.loguniform(
        label='alpha', low=np.log(1e-8), high=np.log(1.0)
    ),
    'bagging_fraction': hp.quniform(
        label='bagging_fraction', low=0.6, high=0.95, q=0.05
    ),
    'feature_fraction': hp.quniform(
        label='feature_fraction', low=0.6, high=0.95, q=0.05
    ),
    'gamma': hp.quniform(
        label='lgb_gamma', low=0.1, high=0.4, q=0.1
    ),
    'lambda' : hp.loguniform(label='lambda', low=np.log(1e-6), high=np.log(10.0)
    ),
    'max_depth': hp.quniform(
        label='max_depth', low=3, high=9, q=1
    ),
    'min_child_weight': hp.quniform(
        label='min_child_weight', low=1, high=5, q=1
    ),
    'min_data_in_leaf': hp.quniform(
        label='min_data_in_leaf', low=5, high=20, q=2
    ),
    'num_leaves': hp.quniform(
        label='num_leaves', low=20, high=100, q=10
    )
}

In [29]:
# SVR
trials_svr = Trials()
f_svr = partial(objective, X_train, y_train)
best_svr = fmin(
    fn=f_svr,
    space=params_svr,
    algo=tpe.suggest,
    max_evals=10,
    trials=trials_svr
)

# LightGBM
trials_lgb = Trials()
f_lgb = partial(objective, X_train, y_train)
best_lgb = fmin(
    fn=f_lgb,
    space=params_lgb,
    algo=tpe.suggest,
    max_evals=10,
    trials=trials_lgb
)

100%|█████████████████████████████████████████████████| 10/10 [00:00<00:00, 25.39trial/s, best loss: 7.430264647900774]
100%|████████████████████████████████████████████████| 10/10 [00:00<00:00, 11.11trial/s, best loss: 3.0011088764279554]


In [30]:
print("best params")
print("SVR:")
pprint.pprint(space_eval(params_svr, best_svr))
print("LighGBM:")
pprint.pprint(space_eval(params_lgb, best_lgb))

best params
SVR:
{'C': 0.019857005939287863,
 'epsilon': 9.2896808168935e-08,
 'gamma': 3.968398310974546e-05,
 'model_type': 'svr'}
LighGBM:
{'alpha': 9.186499737361049e-07,
 'bagging_fraction': 0.6000000000000001,
 'feature_fraction': 0.8,
 'gamma': 0.1,
 'lambda': 0.004176487379111505,
 'max_depth': 8.0,
 'min_child_weight': 4.0,
 'min_data_in_leaf': 8.0,
 'model_type': 'lgb',
 'num_leaves': 30.0}


In [40]:
for result in trials_lgb.results:
    print(result['loss'])
    pprint.pprint(result['params'])

3.0011593756942307
{'alpha': 0.10437704251717303,
 'bagging_fraction': 0.8,
 'feature_fraction': 0.8,
 'gamma': 0.4,
 'lambda': 0.6979759218542592,
 'max_depth': 9,
 'min_child_weight': 5.0,
 'min_data_in_leaf': 10,
 'num_leaves': 80}
3.1441373781995305
{'alpha': 1.2911485785653844e-06,
 'bagging_fraction': 0.75,
 'feature_fraction': 0.65,
 'gamma': 0.1,
 'lambda': 0.000500040830988914,
 'max_depth': 5,
 'min_child_weight': 4.0,
 'min_data_in_leaf': 20,
 'num_leaves': 90}
3.0452659943672984
{'alpha': 0.01462462750837084,
 'bagging_fraction': 0.7000000000000001,
 'feature_fraction': 0.7000000000000001,
 'gamma': 0.2,
 'lambda': 0.05722226146366567,
 'max_depth': 6,
 'min_child_weight': 3.0,
 'min_data_in_leaf': 12,
 'num_leaves': 40}
3.2849306835529006
{'alpha': 3.0164362107101176e-06,
 'bagging_fraction': 0.65,
 'feature_fraction': 0.9,
 'gamma': 0.30000000000000004,
 'lambda': 0.005971648274270439,
 'max_depth': 3,
 'min_child_weight': 3.0,
 'min_data_in_leaf': 14,
 'num_leaves': 50}
