In [23]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler

import lightgbm as lgb

from tsfresh.examples import load_robot_execution_failures
from tsfresh import extract_features, select_features
import optuna

from common import EP
from dfdb import DFDB

import types
import copy

In [2]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [3]:
pd.set_option('display.max_rows', 2000)

In [4]:
df_train = pd.read_pickle('../feats/df_train.pkl')
df_test = pd.read_pickle('../feats/df_test.pkl')

In [5]:
df_train['label'] = df_train['y'].apply(lambda x:  int(x) if x<15 else 15)
group = df_train['season'].values
group[np.where(group==17)[0]] = 1
df_train['group'] = group
df_train = df_train.drop(columns=['season'])

In [6]:
test_X = df_train.drop(columns=['y','index','group','label']).copy()
test_X.index = df_train['index']
test_y = df_train['y'].copy()
test_y.index = df_train['index']
tsfresh_columns = select_features(test_X, test_y).columns.tolist()

In [7]:
original_columns = df_train.columns.drop(['index','y','label','group']).tolist()

In [10]:
#check feature_importances
# df_feature_importances = df_trial.loc[294]['df_feature_importances']
# sorted_columns = EP.evaluate(df_feature_importances, key='average_permutation_weight')
# df_feature_importances.sort_values(by=['average_permutation_weight'], ascending=False).head(100)
# len(sorted_columns)

In [9]:
# mytrial = []

db = DFDB('../trial/xgbm.pkl', auto_commit=False)

In [15]:
param = {'columns': tsfresh_columns,
 'kfold': {'n_splits': 3,
  'random_state': 1985,
  'shuffle': True,
  'type': 'group'},
 'scaler': {'cls': 'StandardScaler'},
 'algorithm': {'cls': 'xgb.XGBRegressor',
  'init': {'max_depth': 3,
   'max_bin': 38,
   'eta': 0.27801915385245873,
   'colsample_bytree': 0.9416983653127328,
   'min_child_weight': 238,
   'n_estimators': 165,
   'subsample': 0.7471829960670435,
   'reg_lambda': 0.6813060508093833,
   'reg_alpha': 0.36085980027529035,
   'n_jobs': 32},
  'fit': {}},
}

In [16]:
mytrial =[]
EP.select_features_(df_train, param, mytrial, nfeats_best=10, nfeats_removed_per_try=20, key='average_permutation_weight', remark='group3 RFE')
for trial_i in mytrial:
    db.insert(trial_i)

In [24]:
df_trial = db.select()
df_trial[['datetime','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
0,2019-05-17 07:57:55.200597,1071,1.693748,0.001137,2.137787,0.008434,0.444039
1,2019-05-17 08:18:19.365666,1051,1.695037,0.001234,2.132506,0.007373,0.437469
2,2019-05-17 08:38:19.077706,1031,1.697924,0.000936,2.126578,0.006796,0.428655
3,2019-05-17 08:57:35.931659,1011,1.698723,0.000863,2.126707,0.007976,0.427984
4,2019-05-17 09:16:11.257187,991,1.703763,0.001034,2.120791,0.006548,0.417028
5,2019-05-17 09:33:58.118784,971,1.715405,0.001243,2.107058,0.006566,0.391654
6,2019-05-17 09:52:15.762353,951,1.714124,0.001471,2.094354,0.008226,0.380229
7,2019-05-17 10:08:34.687486,931,1.725046,0.001683,2.087804,0.009568,0.362758
8,2019-05-17 10:24:08.021330,911,1.724862,0.00162,2.087341,0.010001,0.362479
9,2019-05-17 10:39:08.141808,891,1.72582,0.001462,2.077605,0.010868,0.351785


In [22]:
db.commit()

In [27]:
mytrial = []
#  tune hypterparameters
def objective(trial):
        
    max_depth = trial.suggest_int('max_depth', 2, 6)
    max_bin = trial.suggest_int('max_bin', 10, 100)
    eta = trial.suggest_uniform('eta', 0.01, 0.4)
    colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.6, 1)
    min_child_weight = trial.suggest_int('min_child_weight', 200, 600)
    n_estimators = trial.suggest_int('n_estimators', 100, 500)
    subsample = trial.suggest_uniform('subsample', 0.6, 1.0)
    reg_lambda = trial.suggest_uniform('reg_lambda', 0.000001, 1.0)
    reg_alpha = trial.suggest_uniform('reg_alpha', 0.000001, 1.0)
    random_state = trial.suggest_int('random_state', 1, 9999)
        
    args={
        'columns':copy.deepcopy(df_trial.loc[51]['param']['columns']),
        'kfold':{
            'n_splits': 3,
            'random_state': 1985,
            'shuffle': True,
            'type': 'group'
        },
        'scaler':{
            'cls':'StandardScaler',
        },
        'algorithm':{
            'cls':'xgb.XGBRegressor',
            'init':{
                "max_depth":max_depth,
                "max_bin":max_bin,
                "eta":eta,
                "colsample_bytree":colsample_bytree,
                "min_child_weight":min_child_weight,
                "n_estimators":n_estimators,
                "subsample":subsample,
                "reg_lambda":reg_lambda,
                "reg_alpha":reg_alpha,
                'n_jobs':32
            },
            'fit':{
            },
        },
    }
    
    df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train, args, df_test = df_test, trial=mytrial, remark='tune 51')
    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)
    
    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', val_mae_mean-train_mae_mean)
    trial.set_user_attr('val_mae_var', val_mae_var)

    return np.abs(val_mae_mean - train_mae_mean)*val_mae_mean

study = optuna.create_study()
study.optimize(objective, n_trials=200)

[I 2019-05-17 13:54:59,915] Finished trial#0 resulted in value: 0.5232776226557809. Current best value is 0.5232776226557809 with parameters: {'max_depth': 2, 'max_bin': 39, 'eta': 0.21282522960350567, 'colsample_bytree': 0.703711232816592, 'min_child_weight': 534, 'n_estimators': 497, 'subsample': 0.6506789037249469, 'reg_lambda': 0.8090706683463588, 'reg_alpha': 0.8995260266487242, 'random_state': 8464}.
[I 2019-05-17 13:55:03,288] Finished trial#1 resulted in value: 0.6296763956188193. Current best value is 0.5232776226557809 with parameters: {'max_depth': 2, 'max_bin': 39, 'eta': 0.21282522960350567, 'colsample_bytree': 0.703711232816592, 'min_child_weight': 534, 'n_estimators': 497, 'subsample': 0.6506789037249469, 'reg_lambda': 0.8090706683463588, 'reg_alpha': 0.8995260266487242, 'random_state': 8464}.
[I 2019-05-17 13:55:14,962] Finished trial#2 resulted in value: 1.9060395348687709. Current best value is 0.5232776226557809 with parameters: {'max_depth': 2, 'max_bin': 39, 'eta':

[I 2019-05-17 13:57:22,094] Finished trial#38 resulted in value: 0.9083804886898339. Current best value is 0.22783758625832246 with parameters: {'max_depth': 2, 'max_bin': 44, 'eta': 0.16935188599687143, 'colsample_bytree': 0.8433226106360044, 'min_child_weight': 432, 'n_estimators': 101, 'subsample': 0.8740912686634448, 'reg_lambda': 0.4553205099812844, 'reg_alpha': 0.3007988035057162, 'random_state': 4147}.
[I 2019-05-17 13:57:24,844] Finished trial#39 resulted in value: 0.7384698459360912. Current best value is 0.22783758625832246 with parameters: {'max_depth': 2, 'max_bin': 44, 'eta': 0.16935188599687143, 'colsample_bytree': 0.8433226106360044, 'min_child_weight': 432, 'n_estimators': 101, 'subsample': 0.8740912686634448, 'reg_lambda': 0.4553205099812844, 'reg_alpha': 0.3007988035057162, 'random_state': 4147}.
[I 2019-05-17 13:57:32,032] Finished trial#40 resulted in value: 1.06712752606034. Current best value is 0.22783758625832246 with parameters: {'max_depth': 2, 'max_bin': 44, 

[I 2019-05-17 13:59:23,959] Finished trial#76 resulted in value: 0.3745737543477224. Current best value is 0.22783758625832246 with parameters: {'max_depth': 2, 'max_bin': 44, 'eta': 0.16935188599687143, 'colsample_bytree': 0.8433226106360044, 'min_child_weight': 432, 'n_estimators': 101, 'subsample': 0.8740912686634448, 'reg_lambda': 0.4553205099812844, 'reg_alpha': 0.3007988035057162, 'random_state': 4147}.
[I 2019-05-17 13:59:25,630] Finished trial#77 resulted in value: 0.38701482137119947. Current best value is 0.22783758625832246 with parameters: {'max_depth': 2, 'max_bin': 44, 'eta': 0.16935188599687143, 'colsample_bytree': 0.8433226106360044, 'min_child_weight': 432, 'n_estimators': 101, 'subsample': 0.8740912686634448, 'reg_lambda': 0.4553205099812844, 'reg_alpha': 0.3007988035057162, 'random_state': 4147}.
[I 2019-05-17 13:59:27,439] Finished trial#78 resulted in value: 0.30710311650110017. Current best value is 0.22783758625832246 with parameters: {'max_depth': 2, 'max_bin': 

[I 2019-05-17 14:01:20,149] Finished trial#114 resulted in value: 0.8986091806712417. Current best value is 0.22783758625832246 with parameters: {'max_depth': 2, 'max_bin': 44, 'eta': 0.16935188599687143, 'colsample_bytree': 0.8433226106360044, 'min_child_weight': 432, 'n_estimators': 101, 'subsample': 0.8740912686634448, 'reg_lambda': 0.4553205099812844, 'reg_alpha': 0.3007988035057162, 'random_state': 4147}.
[I 2019-05-17 14:01:23,834] Finished trial#115 resulted in value: 0.789268346573917. Current best value is 0.22783758625832246 with parameters: {'max_depth': 2, 'max_bin': 44, 'eta': 0.16935188599687143, 'colsample_bytree': 0.8433226106360044, 'min_child_weight': 432, 'n_estimators': 101, 'subsample': 0.8740912686634448, 'reg_lambda': 0.4553205099812844, 'reg_alpha': 0.3007988035057162, 'random_state': 4147}.
[I 2019-05-17 14:01:25,540] Finished trial#116 resulted in value: 0.29889706599204396. Current best value is 0.22783758625832246 with parameters: {'max_depth': 2, 'max_bin':

[I 2019-05-17 14:02:56,891] Finished trial#152 resulted in value: 0.32495653310728495. Current best value is 0.22121668620719104 with parameters: {'max_depth': 2, 'max_bin': 35, 'eta': 0.0932003472753572, 'colsample_bytree': 0.784639839428384, 'min_child_weight': 365, 'n_estimators': 100, 'subsample': 0.6502017167178653, 'reg_lambda': 0.30072802701797746, 'reg_alpha': 0.4657245495732788, 'random_state': 5895}.
[I 2019-05-17 14:02:58,129] Finished trial#153 resulted in value: 0.23122367743262698. Current best value is 0.22121668620719104 with parameters: {'max_depth': 2, 'max_bin': 35, 'eta': 0.0932003472753572, 'colsample_bytree': 0.784639839428384, 'min_child_weight': 365, 'n_estimators': 100, 'subsample': 0.6502017167178653, 'reg_lambda': 0.30072802701797746, 'reg_alpha': 0.4657245495732788, 'random_state': 5895}.
[I 2019-05-17 14:03:00,007] Finished trial#154 resulted in value: 0.309905258187627. Current best value is 0.22121668620719104 with parameters: {'max_depth': 2, 'max_bin': 

[I 2019-05-17 14:04:27,174] Finished trial#190 resulted in value: 0.47238339800433954. Current best value is 0.22121668620719104 with parameters: {'max_depth': 2, 'max_bin': 35, 'eta': 0.0932003472753572, 'colsample_bytree': 0.784639839428384, 'min_child_weight': 365, 'n_estimators': 100, 'subsample': 0.6502017167178653, 'reg_lambda': 0.30072802701797746, 'reg_alpha': 0.4657245495732788, 'random_state': 5895}.
[I 2019-05-17 14:04:28,952] Finished trial#191 resulted in value: 0.2765353224833394. Current best value is 0.22121668620719104 with parameters: {'max_depth': 2, 'max_bin': 35, 'eta': 0.0932003472753572, 'colsample_bytree': 0.784639839428384, 'min_child_weight': 365, 'n_estimators': 100, 'subsample': 0.6502017167178653, 'reg_lambda': 0.30072802701797746, 'reg_alpha': 0.4657245495732788, 'random_state': 5895}.
[I 2019-05-17 14:04:30,774] Finished trial#192 resulted in value: 0.3553107148493051. Current best value is 0.22121668620719104 with parameters: {'max_depth': 2, 'max_bin': 

In [29]:
for trial_i in mytrial:
    db.insert(trial_i)

In [42]:
df_trial = db.select()
df_trial[(df_trial['remark']=='tune 51')&(df_trial['mae_diff']<.11)][['datetime','remark', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
183,2019-05-17 14:02:01.242754,tune 51,51,1.949187,0.003206,2.056744,0.01394,0.107557


In [43]:
db.commit()

In [45]:
mytrial =[]
EP.select_features_(df_train, df_trial.loc[183]['param'], mytrial, nfeats_best=20, nfeats_removed_per_try=2, key='average_permutation_weight', remark='group3 RFE2')
for trial_i in mytrial:
    db.insert(trial_i)

In [46]:
for trial_i in mytrial:
    db.insert(trial_i)
df_trial = db.select()

In [47]:
df_trial[(df_trial['remark']=='group3 RFE2')][['datetime','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
254,2019-05-17 14:24:24.128242,51,1.949187,0.003206,2.056743,0.01394,0.107555
255,2019-05-17 14:24:27.555252,49,1.946199,0.002874,2.057028,0.013795,0.11083
256,2019-05-17 14:24:31.013552,47,1.947283,0.003228,2.057456,0.014132,0.110173
257,2019-05-17 14:24:34.883254,45,1.949163,0.002955,2.059381,0.014876,0.110218
258,2019-05-17 14:24:38.997272,43,1.949182,0.002927,2.057389,0.013903,0.108206
259,2019-05-17 14:24:42.916866,41,1.948917,0.003035,2.057362,0.014602,0.108445
260,2019-05-17 14:24:47.887916,39,1.950225,0.002925,2.055703,0.014057,0.105478
261,2019-05-17 14:24:51.469544,37,1.95252,0.002985,2.058316,0.014218,0.105796
262,2019-05-17 14:24:55.307709,35,1.950406,0.003251,2.058409,0.014612,0.108003
263,2019-05-17 14:24:58.793753,33,1.95234,0.003117,2.056627,0.015279,0.104287


In [48]:
db.commit()

In [49]:
mytrial = []
#  tune hypterparameters
def objective(trial):
        
    max_depth = trial.suggest_int('max_depth', 2, 6)
    max_bin = trial.suggest_int('max_bin', 10, 100)
    eta = trial.suggest_uniform('eta', 0.01, 0.4)
    colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.6, 1)
    min_child_weight = trial.suggest_int('min_child_weight', 200, 600)
    n_estimators = trial.suggest_int('n_estimators', 100, 500)
    subsample = trial.suggest_uniform('subsample', 0.6, 1.0)
    reg_lambda = trial.suggest_uniform('reg_lambda', 0.000001, 1.0)
    reg_alpha = trial.suggest_uniform('reg_alpha', 0.000001, 1.0)
    random_state = trial.suggest_int('random_state', 1, 9999)
        
    args={
        'columns':copy.deepcopy(df_trial.loc[286]['param']['columns']),
        'kfold':{
            'n_splits': 3,
            'random_state': 1985,
            'shuffle': True,
            'type': 'group'
        },
        'scaler':{
            'cls':'StandardScaler',
        },
        'algorithm':{
            'cls':'xgb.XGBRegressor',
            'init':{
                "max_depth":max_depth,
                "max_bin":max_bin,
                "eta":eta,
                "colsample_bytree":colsample_bytree,
                "min_child_weight":min_child_weight,
                "n_estimators":n_estimators,
                "subsample":subsample,
                "reg_lambda":reg_lambda,
                "reg_alpha":reg_alpha,
                'n_jobs':32
            },
            'fit':{
            },
        },
    }
    
    df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train, args, df_test = df_test, trial=mytrial, remark='tune 286')
    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)
    
    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', val_mae_mean-train_mae_mean)
    trial.set_user_attr('val_mae_var', val_mae_var)

    return np.abs(val_mae_mean - train_mae_mean)*val_mae_mean

study = optuna.create_study()
study.optimize(objective, n_trials=200)

[I 2019-05-17 14:36:47,011] Finished trial#0 resulted in value: 0.21444472453785562. Current best value is 0.21444472453785562 with parameters: {'max_depth': 2, 'max_bin': 91, 'eta': 0.3160394887219425, 'colsample_bytree': 0.6137382709510724, 'min_child_weight': 595, 'n_estimators': 129, 'subsample': 0.9926974167070457, 'reg_lambda': 0.09915788918128646, 'reg_alpha': 0.6665740414596795, 'random_state': 7818}.
[I 2019-05-17 14:36:49,854] Finished trial#1 resulted in value: 0.4764664635744477. Current best value is 0.21444472453785562 with parameters: {'max_depth': 2, 'max_bin': 91, 'eta': 0.3160394887219425, 'colsample_bytree': 0.6137382709510724, 'min_child_weight': 595, 'n_estimators': 129, 'subsample': 0.9926974167070457, 'reg_lambda': 0.09915788918128646, 'reg_alpha': 0.6665740414596795, 'random_state': 7818}.
[I 2019-05-17 14:36:56,212] Finished trial#2 resulted in value: 0.8165070274377284. Current best value is 0.21444472453785562 with parameters: {'max_depth': 2, 'max_bin': 91, 

[I 2019-05-17 14:38:42,302] Finished trial#38 resulted in value: 0.3603775563203622. Current best value is 0.1859468507189565 with parameters: {'max_depth': 2, 'max_bin': 48, 'eta': 0.25510728449397224, 'colsample_bytree': 0.6079441662413729, 'min_child_weight': 442, 'n_estimators': 100, 'subsample': 0.7314592410070677, 'reg_lambda': 0.29888673193601645, 'reg_alpha': 0.6963573826736154, 'random_state': 8557}.
[I 2019-05-17 14:38:45,845] Finished trial#39 resulted in value: 0.40760368067577174. Current best value is 0.1859468507189565 with parameters: {'max_depth': 2, 'max_bin': 48, 'eta': 0.25510728449397224, 'colsample_bytree': 0.6079441662413729, 'min_child_weight': 442, 'n_estimators': 100, 'subsample': 0.7314592410070677, 'reg_lambda': 0.29888673193601645, 'reg_alpha': 0.6963573826736154, 'random_state': 8557}.
[I 2019-05-17 14:38:47,716] Finished trial#40 resulted in value: 0.32707107485758163. Current best value is 0.1859468507189565 with parameters: {'max_depth': 2, 'max_bin': 4

[I 2019-05-17 14:40:19,498] Finished trial#76 resulted in value: 0.23756868948988583. Current best value is 0.1859468507189565 with parameters: {'max_depth': 2, 'max_bin': 48, 'eta': 0.25510728449397224, 'colsample_bytree': 0.6079441662413729, 'min_child_weight': 442, 'n_estimators': 100, 'subsample': 0.7314592410070677, 'reg_lambda': 0.29888673193601645, 'reg_alpha': 0.6963573826736154, 'random_state': 8557}.
[I 2019-05-17 14:40:21,071] Finished trial#77 resulted in value: 0.2867403974491734. Current best value is 0.1859468507189565 with parameters: {'max_depth': 2, 'max_bin': 48, 'eta': 0.25510728449397224, 'colsample_bytree': 0.6079441662413729, 'min_child_weight': 442, 'n_estimators': 100, 'subsample': 0.7314592410070677, 'reg_lambda': 0.29888673193601645, 'reg_alpha': 0.6963573826736154, 'random_state': 8557}.
[I 2019-05-17 14:40:23,106] Finished trial#78 resulted in value: 0.265173434327502. Current best value is 0.1859468507189565 with parameters: {'max_depth': 2, 'max_bin': 48,

[I 2019-05-17 14:41:44,555] Finished trial#114 resulted in value: 0.2937150764409372. Current best value is 0.184129283781799 with parameters: {'max_depth': 2, 'max_bin': 56, 'eta': 0.3101823248252045, 'colsample_bytree': 0.6329839474310256, 'min_child_weight': 379, 'n_estimators': 100, 'subsample': 0.7497788098393146, 'reg_lambda': 0.37363299011564066, 'reg_alpha': 0.6041039833046937, 'random_state': 8747}.
[I 2019-05-17 14:41:46,624] Finished trial#115 resulted in value: 0.3448813538321294. Current best value is 0.184129283781799 with parameters: {'max_depth': 2, 'max_bin': 56, 'eta': 0.3101823248252045, 'colsample_bytree': 0.6329839474310256, 'min_child_weight': 379, 'n_estimators': 100, 'subsample': 0.7497788098393146, 'reg_lambda': 0.37363299011564066, 'reg_alpha': 0.6041039833046937, 'random_state': 8747}.
[I 2019-05-17 14:41:49,864] Finished trial#116 resulted in value: 0.3590681023760682. Current best value is 0.184129283781799 with parameters: {'max_depth': 2, 'max_bin': 56, '

[I 2019-05-17 14:43:10,352] Finished trial#152 resulted in value: 0.2501876421320501. Current best value is 0.184129283781799 with parameters: {'max_depth': 2, 'max_bin': 56, 'eta': 0.3101823248252045, 'colsample_bytree': 0.6329839474310256, 'min_child_weight': 379, 'n_estimators': 100, 'subsample': 0.7497788098393146, 'reg_lambda': 0.37363299011564066, 'reg_alpha': 0.6041039833046937, 'random_state': 8747}.
[I 2019-05-17 14:43:11,783] Finished trial#153 resulted in value: 0.23269868027749924. Current best value is 0.184129283781799 with parameters: {'max_depth': 2, 'max_bin': 56, 'eta': 0.3101823248252045, 'colsample_bytree': 0.6329839474310256, 'min_child_weight': 379, 'n_estimators': 100, 'subsample': 0.7497788098393146, 'reg_lambda': 0.37363299011564066, 'reg_alpha': 0.6041039833046937, 'random_state': 8747}.
[I 2019-05-17 14:43:13,428] Finished trial#154 resulted in value: 0.3061869135996969. Current best value is 0.184129283781799 with parameters: {'max_depth': 2, 'max_bin': 56, 

[I 2019-05-17 14:44:27,725] Finished trial#190 resulted in value: 0.5103476254598938. Current best value is 0.184129283781799 with parameters: {'max_depth': 2, 'max_bin': 56, 'eta': 0.3101823248252045, 'colsample_bytree': 0.6329839474310256, 'min_child_weight': 379, 'n_estimators': 100, 'subsample': 0.7497788098393146, 'reg_lambda': 0.37363299011564066, 'reg_alpha': 0.6041039833046937, 'random_state': 8747}.
[I 2019-05-17 14:44:29,238] Finished trial#191 resulted in value: 0.23026297697574175. Current best value is 0.184129283781799 with parameters: {'max_depth': 2, 'max_bin': 56, 'eta': 0.3101823248252045, 'colsample_bytree': 0.6329839474310256, 'min_child_weight': 379, 'n_estimators': 100, 'subsample': 0.7497788098393146, 'reg_lambda': 0.37363299011564066, 'reg_alpha': 0.6041039833046937, 'random_state': 8747}.
[I 2019-05-17 14:44:31,672] Finished trial#192 resulted in value: 0.40565609601208874. Current best value is 0.184129283781799 with parameters: {'max_depth': 2, 'max_bin': 56,

In [50]:
for trial_i in mytrial:
    db.insert(trial_i)
df_trial = db.select()

In [58]:
df_trial[(df_trial['remark']=='tune 286')&(df_trial['mae_diff']<.1)].sort_values(by=['val_mae'])[['datetime','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
378,2019-05-17 14:40:44.908569,21,1.962614,0.003184,2.052332,0.015719,0.089717
349,2019-05-17 14:39:51.779407,21,1.954242,0.002858,2.052374,0.016091,0.098132
443,2019-05-17 14:43:14.544540,21,1.956987,0.002879,2.05276,0.016121,0.095773
434,2019-05-17 14:43:02.160114,21,1.95875,0.002963,2.053259,0.015189,0.094509
444,2019-05-17 14:43:15.768467,21,1.9572,0.002828,2.053493,0.016109,0.096293
433,2019-05-17 14:43:01.011091,21,1.962648,0.002973,2.053622,0.0153,0.090974
453,2019-05-17 14:43:29.747895,21,1.956381,0.002806,2.053638,0.015548,0.097258
362,2019-05-17 14:40:16.980469,21,1.958315,0.002771,2.053681,0.016213,0.095366
387,2019-05-17 14:41:04.083168,21,1.962674,0.003001,2.054025,0.015768,0.091351
305,2019-05-17 14:37:39.517349,21,1.956644,0.002804,2.054043,0.015813,0.0974


In [59]:
db.commit()

In [60]:
param = copy.deepcopy(df_trial.loc[378]['param'])
param['kfold']['type'] = 'stratified'
# run one try
mytrial=[]
df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train, param, df_test = df_test, trial=mytrial, remark='remodel 378 use stratified')

In [61]:
for trial_i in mytrial:
    db.insert(trial_i)
df_trial = db.select()

In [62]:
df_trial.tail(1)[['datetime','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
488,2019-05-17 18:23:35.551786,21,1.977546,1.8e-05,2.000048,6.6e-05,0.022502


In [63]:
mytrial = []
#  tune hypterparameters
def objective(trial):
        
    max_depth = trial.suggest_int('max_depth', 2, 6)
    max_bin = trial.suggest_int('max_bin', 10, 100)
    eta = trial.suggest_uniform('eta', 0.01, 0.4)
    colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.6, 1)
    min_child_weight = trial.suggest_int('min_child_weight', 200, 600)
    n_estimators = trial.suggest_int('n_estimators', 100, 500)
    subsample = trial.suggest_uniform('subsample', 0.6, 1.0)
    reg_lambda = trial.suggest_uniform('reg_lambda', 0.000001, 1.0)
    reg_alpha = trial.suggest_uniform('reg_alpha', 0.000001, 1.0)
    random_state = trial.suggest_int('random_state', 1, 9999)
        
    args={
        'columns':copy.deepcopy(df_trial.loc[286]['param']['columns']),
        'kfold':{
            'n_splits': 3,
            'random_state': 1985,
            'shuffle': True,
            'type': 'stratified'
        },
        'scaler':{
            'cls':'StandardScaler',
        },
        'algorithm':{
            'cls':'xgb.XGBRegressor',
            'init':{
                "max_depth":max_depth,
                "max_bin":max_bin,
                "eta":eta,
                "colsample_bytree":colsample_bytree,
                "min_child_weight":min_child_weight,
                "n_estimators":n_estimators,
                "subsample":subsample,
                "reg_lambda":reg_lambda,
                "reg_alpha":reg_alpha,
                'n_jobs':32
            },
            'fit':{
            },
        },
    }
    
    df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train, args, df_test = df_test, trial=mytrial, remark='tune 286 by stratified')
    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)
    
    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', val_mae_mean-train_mae_mean)
    trial.set_user_attr('val_mae_var', val_mae_var)

    return np.abs(val_mae_mean - train_mae_mean)*val_mae_mean

study = optuna.create_study()
study.optimize(objective, n_trials=200)

[I 2019-05-17 18:26:46,968] Finished trial#0 resulted in value: 0.31003840568843716. Current best value is 0.31003840568843716 with parameters: {'max_depth': 6, 'max_bin': 81, 'eta': 0.28139520212171565, 'colsample_bytree': 0.8283617914283987, 'min_child_weight': 500, 'n_estimators': 424, 'subsample': 0.8192376118516976, 'reg_lambda': 0.29870555870910903, 'reg_alpha': 0.4528537321072557, 'random_state': 8479}.
[I 2019-05-17 18:26:50,553] Finished trial#1 resulted in value: 0.19028793240028655. Current best value is 0.19028793240028655 with parameters: {'max_depth': 4, 'max_bin': 55, 'eta': 0.21237504248254957, 'colsample_bytree': 0.6840784519339266, 'min_child_weight': 279, 'n_estimators': 243, 'subsample': 0.9815866359528702, 'reg_lambda': 0.40742601724873206, 'reg_alpha': 0.8691034919171355, 'random_state': 6005}.
[I 2019-05-17 18:26:55,519] Finished trial#2 resulted in value: 0.22164729793776625. Current best value is 0.19028793240028655 with parameters: {'max_depth': 4, 'max_bin': 

[I 2019-05-17 18:28:35,810] Finished trial#38 resulted in value: 0.16809440951006716. Current best value is 0.04877502955054448 with parameters: {'max_depth': 2, 'max_bin': 85, 'eta': 0.22928925176206189, 'colsample_bytree': 0.6546320556199996, 'min_child_weight': 501, 'n_estimators': 108, 'subsample': 0.9857713487337425, 'reg_lambda': 0.8782435002372866, 'reg_alpha': 0.273707299933715, 'random_state': 3712}.
[I 2019-05-17 18:28:41,889] Finished trial#39 resulted in value: 0.2307114771695182. Current best value is 0.04877502955054448 with parameters: {'max_depth': 2, 'max_bin': 85, 'eta': 0.22928925176206189, 'colsample_bytree': 0.6546320556199996, 'min_child_weight': 501, 'n_estimators': 108, 'subsample': 0.9857713487337425, 'reg_lambda': 0.8782435002372866, 'reg_alpha': 0.273707299933715, 'random_state': 3712}.
[I 2019-05-17 18:28:43,476] Finished trial#40 resulted in value: 0.0537803966030586. Current best value is 0.04877502955054448 with parameters: {'max_depth': 2, 'max_bin': 85,

[I 2019-05-17 18:30:18,744] Finished trial#76 resulted in value: 0.10209773310991796. Current best value is 0.04577550368240165 with parameters: {'max_depth': 2, 'max_bin': 93, 'eta': 0.34160229696327965, 'colsample_bytree': 0.6423440331530649, 'min_child_weight': 252, 'n_estimators': 101, 'subsample': 0.7729063795205674, 'reg_lambda': 0.720958499732286, 'reg_alpha': 0.44906471570915896, 'random_state': 8984}.
[I 2019-05-17 18:30:20,195] Finished trial#77 resulted in value: 0.05294864569173404. Current best value is 0.04577550368240165 with parameters: {'max_depth': 2, 'max_bin': 93, 'eta': 0.34160229696327965, 'colsample_bytree': 0.6423440331530649, 'min_child_weight': 252, 'n_estimators': 101, 'subsample': 0.7729063795205674, 'reg_lambda': 0.720958499732286, 'reg_alpha': 0.44906471570915896, 'random_state': 8984}.
[I 2019-05-17 18:30:21,849] Finished trial#78 resulted in value: 0.06466833192065946. Current best value is 0.04577550368240165 with parameters: {'max_depth': 2, 'max_bin':

[I 2019-05-17 18:31:51,531] Finished trial#114 resulted in value: 0.08854939148842077. Current best value is 0.04577550368240165 with parameters: {'max_depth': 2, 'max_bin': 93, 'eta': 0.34160229696327965, 'colsample_bytree': 0.6423440331530649, 'min_child_weight': 252, 'n_estimators': 101, 'subsample': 0.7729063795205674, 'reg_lambda': 0.720958499732286, 'reg_alpha': 0.44906471570915896, 'random_state': 8984}.
[I 2019-05-17 18:31:53,528] Finished trial#115 resulted in value: 0.07253936101330331. Current best value is 0.04577550368240165 with parameters: {'max_depth': 2, 'max_bin': 93, 'eta': 0.34160229696327965, 'colsample_bytree': 0.6423440331530649, 'min_child_weight': 252, 'n_estimators': 101, 'subsample': 0.7729063795205674, 'reg_lambda': 0.720958499732286, 'reg_alpha': 0.44906471570915896, 'random_state': 8984}.
[I 2019-05-17 18:31:56,454] Finished trial#116 resulted in value: 0.09921764750068192. Current best value is 0.04577550368240165 with parameters: {'max_depth': 2, 'max_bi

[I 2019-05-17 18:33:22,144] Finished trial#152 resulted in value: 0.049864004245648026. Current best value is 0.04577550368240165 with parameters: {'max_depth': 2, 'max_bin': 93, 'eta': 0.34160229696327965, 'colsample_bytree': 0.6423440331530649, 'min_child_weight': 252, 'n_estimators': 101, 'subsample': 0.7729063795205674, 'reg_lambda': 0.720958499732286, 'reg_alpha': 0.44906471570915896, 'random_state': 8984}.
[I 2019-05-17 18:33:23,520] Finished trial#153 resulted in value: 0.05338542340080287. Current best value is 0.04577550368240165 with parameters: {'max_depth': 2, 'max_bin': 93, 'eta': 0.34160229696327965, 'colsample_bytree': 0.6423440331530649, 'min_child_weight': 252, 'n_estimators': 101, 'subsample': 0.7729063795205674, 'reg_lambda': 0.720958499732286, 'reg_alpha': 0.44906471570915896, 'random_state': 8984}.
[I 2019-05-17 18:33:25,493] Finished trial#154 resulted in value: 0.09485214489369816. Current best value is 0.04577550368240165 with parameters: {'max_depth': 2, 'max_b

[I 2019-05-17 18:34:35,486] Finished trial#190 resulted in value: 0.08294091371866369. Current best value is 0.041270788878606905 with parameters: {'max_depth': 2, 'max_bin': 40, 'eta': 0.29791171043671494, 'colsample_bytree': 0.63947287836424, 'min_child_weight': 575, 'n_estimators': 100, 'subsample': 0.757898880002618, 'reg_lambda': 0.5186119274105667, 'reg_alpha': 0.6912095894576386, 'random_state': 3708}.
[I 2019-05-17 18:34:39,949] Finished trial#191 resulted in value: 0.17249897933586786. Current best value is 0.041270788878606905 with parameters: {'max_depth': 2, 'max_bin': 40, 'eta': 0.29791171043671494, 'colsample_bytree': 0.63947287836424, 'min_child_weight': 575, 'n_estimators': 100, 'subsample': 0.757898880002618, 'reg_lambda': 0.5186119274105667, 'reg_alpha': 0.6912095894576386, 'random_state': 3708}.
[I 2019-05-17 18:34:43,176] Finished trial#192 resulted in value: 0.1136313781871084. Current best value is 0.041270788878606905 with parameters: {'max_depth': 2, 'max_bin': 

In [64]:
for trial_i in mytrial:
    db.insert(trial_i)
df_trial = db.select()

In [66]:
df_trial[(df_trial['remark']=='tune 286 by stratified')&(df_trial['mae_diff']<.05)].sort_values(by=['val_mae'])[['datetime','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
667,2019-05-17 18:34:06.894757,21,1.895596,3.7e-05,1.945179,5.4e-05,0.049582
519,2019-05-17 18:28:12.520814,21,1.899979,1.4e-05,1.949615,0.000122,0.049636
643,2019-05-17 18:33:25.469423,21,1.902247,6.8e-05,1.950868,5.5e-05,0.04862
684,2019-05-17 18:34:51.311313,21,1.901141,3.4e-05,1.951058,6.3e-05,0.049917
648,2019-05-17 18:33:33.078151,21,1.904413,1.1e-05,1.952314,9.6e-05,0.047901
585,2019-05-17 18:31:04.190938,21,1.903414,1.4e-05,1.953113,9.6e-05,0.049699
564,2019-05-17 18:30:16.747078,21,1.90912,2.4e-05,1.953407,7.1e-05,0.044286
671,2019-05-17 18:34:13.430054,21,1.907021,2.6e-05,1.954875,6.4e-05,0.047853
662,2019-05-17 18:33:57.107964,21,1.911749,3.8e-05,1.955365,8.4e-05,0.043616
618,2019-05-17 18:32:31.268906,21,1.913476,5.1e-05,1.955903,4.1e-05,0.042426


In [67]:
db.commit()

In [68]:
param = copy.deepcopy(df_trial.loc[667]['param'])
param['kfold']['type'] = 'group'
# run one try
mytrial=[]
df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train, param, df_test = df_test, trial=mytrial, remark='remodel 667 use group')

In [69]:
for trial_i in mytrial:
    db.insert(trial_i)
df_trial = db.select()

In [70]:
df_trial.tail(1)[['datetime','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
689,2019-05-17 18:38:44.960095,21,1.866915,0.001973,2.045915,0.01453,0.178999


In [71]:
db.commit()