In [21]:
# basic
import numpy as np
import pandas as pd

# plot
import seaborn as sns
from matplotlib import cm
import matplotlib.pyplot as plt
import pprint

# model
from sklearn.svm import SVR
from lightgbm import LGBMRegressor as lgb

# metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error as MSE

# optimization
from functools import partial
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval

# importance
import eli5
from eli5.sklearn import PermutationImportance

# other
import pickle
import time
import datetime

In [2]:
train = pd.read_pickle('../features/feature_train_2020-11-07-08-04-37_treated.pkl')
test = pd.read_pickle('../features/feature_test_2020-11-07-08-04-37_treated.pkl')
sample_sub = pd.read_csv('../data/input/sample_submit.csv', header=None, names=['id', 'mpg'])

In [41]:
def RMSE(y_true, y_pred):
    return(np.sqrt(MSE(y_true, y_pred)))

def objective(X, y, args):
    reg_models = {
        'lgb':lgb,
        'svr':SVR
    }
    
    if args['model_type'] == 'lgb':
        args['max_depth'] = int(args['max_depth'])
        args['num_leaves'] = int(args['num_leaves'])
        args['min_data_in_leaf'] = int(args['min_data_in_leaf'])
    
    reg_model = reg_models.get(args['model_type'])
    del args['model_type']  
    model = reg_model(**args)
    
    kf = KFold(n_splits=4, shuffle=True, random_state=1)
    
    score_func = {'score':make_scorer(RMSE)}
    
    cv_result = cross_validate(model, X=X, y=y, cv=kf, scoring=score_func)
    
    return {
        'loss':cv_result['test_score'].mean(),
        'status':STATUS_OK,
        'params':args
    }

In [42]:
X_train = train.drop(['id', 'mpg'], axis=1)
y_train = train['mpg']

param_space = hp.choice(
    'algorithms', [
        {
            'model_type': 'svr',
            'C': hp.loguniform(
                label='C', low=np.log(1e-8), high=np.log(1.0)
            ),
            'gamma': hp.loguniform(
                label='svr_gamma', low=np.log(1e-8), high=np.log(1.0)
            ),
            'epsilon': hp.loguniform(
                label='epsilon', low=np.log(1e-8), high=np.log(1.0)
            )
        },
        {
            'model_type': 'lgb',
            'alpha' : hp.loguniform(
                label='alpha', low=np.log(1e-8), high=np.log(1.0)
            ),
            'bagging_fraction': hp.quniform(
                label='bagging_fraction', low=0.6, high=0.95, q=0.05
            ),
            'feature_fraction': hp.quniform(
                label='feature_fraction', low=0.6, high=0.95, q=0.05
            ),
            'gamma': hp.quniform(
                label='lgb_gamma', low=0.1, high=0.4, q=0.1
            ),
            'lambda' : hp.loguniform(label='lambda', low=np.log(1e-6), high=np.log(10.0)
            ),
            'max_depth': hp.quniform(
                label='max_depth', low=3, high=9, q=1
            ),
            'min_child_weight': hp.quniform(
                label='min_child_weight', low=1, high=5, q=1
            ),
            'min_data_in_leaf': hp.quniform(
                label='min_data_in_leaf', low=5, high=20, q=2
            ),
            'num_leaves': hp.quniform(
                label='num_leaves', low=20, high=100, q=10
            )
        }
    ]
)

trials = Trials()
f = partial(objective, X_train, y_train)

best = fmin(
    fn=f,
    space=param_space, 
    algo=tpe.suggest,
    max_evals=10,
    trials=trials
)

print(space_eval(param_space, best))

100%|██████████████████████████████████████████████████| 10/10 [00:00<00:00, 12.95trial/s, best loss: 2.93959953598193]
{'alpha': 9.559657348220766e-06, 'bagging_fraction': 0.8500000000000001, 'feature_fraction': 0.7000000000000001, 'gamma': 0.30000000000000004, 'lambda': 3.0931216523530467, 'max_depth': 9.0, 'min_child_weight': 4.0, 'min_data_in_leaf': 8.0, 'model_type': 'lgb', 'num_leaves': 30.0}


In [43]:
args = [result['params'] for result in trials.results]
values = [result['loss'] for result in trials.results]
pprint.pprint(args)
pprint.pprint(values)

[{'C': 0.0037033616329921653,
  'epsilon': 0.5401269308156852,
  'gamma': 0.018404794621152954},
 {'C': 0.08322459303972511,
  'epsilon': 0.0003926268822828087,
  'gamma': 0.021412003197720474},
 {'alpha': 2.0811811695554163e-06,
  'bagging_fraction': 0.75,
  'feature_fraction': 0.75,
  'gamma': 0.4,
  'lambda': 0.0013468748999240021,
  'max_depth': 6,
  'min_child_weight': 2.0,
  'min_data_in_leaf': 14,
  'num_leaves': 50},
 {'C': 0.00020151062709822948,
  'epsilon': 2.6871046389191748e-08,
  'gamma': 0.0008002411782641696},
 {'alpha': 7.70024365510558e-07,
  'bagging_fraction': 0.65,
  'feature_fraction': 0.65,
  'gamma': 0.30000000000000004,
  'lambda': 7.637199176300727e-05,
  'max_depth': 6,
  'min_child_weight': 5.0,
  'min_data_in_leaf': 6,
  'num_leaves': 60},
 {'alpha': 0.00012511657192064288,
  'bagging_fraction': 0.75,
  'feature_fraction': 0.8,
  'gamma': 0.2,
  'lambda': 5.84963371547183e-05,
  'max_depth': 8,
  'min_child_weight': 4.0,
  'min_data_in_leaf': 18,
  'num_lea