In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler

import lightgbm as lgb

from tsfresh.examples import load_robot_execution_failures
from tsfresh import extract_features, select_features
import optuna

from common import EP
from dfdb import DFDB

import types
import copy

Using TensorFlow backend.


In [2]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [3]:
# Notebook display settings: show full cell contents (the `param` dicts are long)
# and allow up to 2000 rows to be rendered.
# `None` is the supported "no truncation" value since pandas 1.0; the legacy `-1`
# sentinel is deprecated there and rejected by pandas 2.x, while pandas < 1.0 only
# accepts integers -- hence the fallback.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 2000)

In [4]:
# Load the pre-computed train / test feature tables.
# NOTE: unpickling can execute arbitrary code -- only load files produced by this project.
df_train, df_test = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('../feats/df_train.pkl', '../feats/df_test.pkl')
)

In [5]:
# Integer class label derived from the target: values below 15 are truncated to int,
# everything else (including NaN, for which `x < 15` is False) is pooled into class 15.
df_train['label'] = df_train['y'].apply(lambda x: int(x) if x < 15 else 15)

# Group id derived from `season`, with season 17 folded into group 1.
# Work on an explicit copy: writing into the array returned by `.values` would
# silently modify `df_train['season']` as well, and fails outright on the read-only
# arrays pandas returns when Copy-on-Write is enabled.
group = df_train['season'].values.copy()
group[group == 17] = 1
df_train['group'] = group

# `season` is now fully represented by `group`; drop it so it is not used as a feature.
df_train = df_train.drop(columns=['season'])

In [6]:
# Trial store used throughout the notebook. Auto-commit is switched off, so
# (presumably) inserts only become persistent through the explicit commit() cells.
db = DFDB(
    '../trial/lgbm.pkl',
    auto_commit=False,
)

In [7]:
# Run the tsfresh relevance filter on the training features against the target `y`.
# Despite the `test_` prefix, both objects are built from df_train; they are keyed
# by the `index` column so that features and target share the same index.
test_X = df_train.set_index('index').drop(columns=['y', 'group', 'label'])
test_y = df_train.set_index('index')['y'].copy()
tsfresh_columns = list(select_features(test_X, test_y).columns)

In [8]:
# Number of feature columns kept by the tsfresh relevance filter.
len(tsfresh_columns)

1071

In [9]:
# All feature columns of df_train, i.e. everything except the row id, the target
# and the two derived helper columns.
original_columns = (
    df_train.columns
    .drop(['index', 'y', 'label', 'group'])
    .tolist()
)

In [10]:
#check feature_importances
# df_feature_importances = df_trial.loc[1398]['df_feature_importances']
# if type(df_feature_importances)==pd.DataFrame:
#     sorted_columns = EP.evaluate(df_feature_importances, key='average_permutation_weight')
# else:
#     sorted_columns = df_trial.loc[1398]['param']['columns']
# df_feature_importances.sort_values(by=['average_permutation_weight'], ascending=False)
# len(sorted_columns)

In [11]:
# param = {'columns': tsfresh_columns,
#  'kfold': {'n_splits': 8,
#   'random_state': 1985,
#   'shuffle': True,
#   'type': 'group'},
#  'scaler': {'cls': 'StandardScaler'},
#  'algorithm': {'cls': 'lgb.LGBMRegressor',
#   'init': {'learning_rate': 0.19329183957823715,
#    'feature_fraction': 0.7363782645537933,
#    'bagging_fraction': 0.8585309819625903,
#    'min_data_in_leaf': 351,
#    'lambda_l1': 95.63411922439124,
#    'lambda_l2': 48.949784919880365,
#    'max_bin': 25,
#    'num_leaves': 8,
#    'random_state': 5040,
#    'n_jobs': 32},
#   'fit': {}}}

In [14]:
# run one try
# df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train, param, df_test = df_test, trial=mytrial)

In [13]:
#  select features by permutation_weight
# EP.select_features_(df_train, param, mytrial, nfeats_best=10, nfeats_removed_per_try=20, key='average_permutation_weight', remark='group3 RFE')

In [12]:
# Fetch the recorded trials and expose each trial's CV configuration
# (the 'kfold' entry of its `param` dict) as a separate column for easy inspection.
df_trial = db.select()
df_trial['kfold'] = df_trial['param'].map(lambda trial_param: trial_param['kfold'])

In [13]:
# Inspect trials 578..601 (label-based slice on the trial index, both ends inclusive).
# 'remark' is selected once only: listing it twice produced a duplicated column.
df_trial.loc[578:601, ['datetime', 'nfeatures', 'kfold', 'remark', 'train_mae', 'train_mae_var', 'val_mae', 'val_mae_var', 'mae_diff']]

Unnamed: 0,datetime,nfeatures,kfold,remark,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff,remark.1
578,2019-05-06 04:17:59.005928,1820,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",,1.893897,0.044238,2.168245,0.575238,0.274348,
579,2019-05-06 05:47:48.214510,21,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",,1.92768,0.032108,2.136739,0.568493,0.209059,
580,2019-05-06 05:47:51.248568,16,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",,1.964615,0.019045,2.134496,0.579854,0.169881,
581,2019-05-06 05:51:14.797628,120,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",,1.89826,0.044882,2.142732,0.564976,0.244472,
582,2019-05-06 05:51:29.188184,115,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",,1.891487,0.039969,2.124524,0.57102,0.233037,
583,2019-05-06 05:51:43.775254,110,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",,1.879317,0.044273,2.127104,0.5641,0.247787,
584,2019-05-06 05:51:57.339862,105,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",,1.893405,0.042054,2.124504,0.571893,0.231099,
585,2019-05-06 05:52:09.446663,100,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",,1.92878,0.038368,2.129189,0.579794,0.200409,
586,2019-05-06 05:52:41.505636,95,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",,1.904639,0.041249,2.113594,0.571713,0.208955,
587,2019-05-06 05:52:53.828660,90,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",,1.877943,0.044818,2.117913,0.570247,0.239971,


In [14]:
def revert_rfe(df_train, param, sorted_columns, df_test, trial, start_columns, limit=None, remark=None):
    """Greedy forward feature selection (the reverse of RFE) driven by the CV score.

    Starting from ``start_columns``, every candidate in ``sorted_columns`` is tried
    in the given order: the configuration is re-evaluated with the candidate added,
    and the candidate is kept only if the mean validation score becomes strictly
    lower than the best score seen so far.

    Parameters
    ----------
    df_train, df_test
        Passed through unchanged to ``EP.process``.
    param : dict
        Template configuration for ``EP.process``. It is deep-copied for every
        evaluation and only the ``'columns'`` entry of the copy is overridden, so
        the caller's dict is never modified.
    sorted_columns : sequence of str
        Candidate feature names, in the order in which they are tried.
    trial
        Passed through to ``EP.process`` (presumably a list that collects one
        record per evaluation -- confirm against ``EP``).
    start_columns : sequence of str
        Features included from the start; they are evaluated once as the baseline.
    limit : int, optional
        Maximum number of selected features. Defaults to ``len(sorted_columns)``.
    remark : str, optional
        Passed through to ``EP.process``.

    Returns
    -------
    list of str
        The selected features: ``start_columns`` followed by the accepted
        candidates in acceptance order.
    """
    selected_columns = list(start_columns)
    if limit is None:
        limit = len(sorted_columns)

    def _cv_score(columns):
        # Evaluate one feature set and return its mean validation score.
        args = copy.deepcopy(param)
        args['columns'] = list(columns)
        df_his, _, _, _ = EP.process(df_train, args, df_test=df_test, trial=trial, remark=remark)
        return np.mean(df_his.valid)

    # Baseline: score of the starting feature set on its own.
    cv_score = _cv_score(selected_columns)

    for col in sorted_columns:
        # Check the budget before training, so it is honoured even when the
        # starting set already fills it.
        if len(selected_columns) >= limit:
            break
        # A feature that is already selected cannot improve the score; skipping it
        # avoids a redundant CV run and a possible duplicate entry.
        if col in selected_columns:
            continue
        # Keep the column order deterministic (a set would reorder the features
        # differently from one interpreter session to the next).
        candidate_score = _cv_score(selected_columns + [col])
        if candidate_score < cv_score:
            selected_columns.append(col)
            cv_score = candidate_score

    return selected_columns


In [19]:
# Forward-select features: the hyper-parameters come from one recorded trial and
# the ordered candidate pool is the column list of another recorded trial.
param_idx, column_idx = 601, 581
db_, df_trial_ = db, df_trial
mytrial = []

columns = copy.deepcopy(df_trial_.loc[column_idx, 'param']['columns'])
param = copy.deepcopy(df_trial_.loc[param_idx, 'param'])
# param['algorithm']['init'] = {'logging_level': 'Silent', **param['algorithm']['init']}
# Reset the scaler constructor arguments and the fit arguments to empty dicts.
param['scaler']['init'] = {}
param['algorithm']['fit'] = {}

# Start from the first column of the pool and keep at most 20 features.
selected_columns = revert_rfe(
    df_train,
    param,
    columns,
    df_test,
    mytrial,
    start_columns=[columns[0]],
    limit=20,
    remark='start from top1 column',
)
print(len(selected_columns))

# Store every trial collected during the selection, then reload the trial table.
for trial_i in mytrial:
    db_.insert(trial_i)
df_trial_ = db_.select()
df_trial_['kfold'] = df_trial_['param'].map(lambda trial_param: trial_param['kfold'])

14


In [30]:
# Commit the trials inserted by the forward-selection cell
# (`db_` is the store opened with auto_commit=False).
db_.commit()

In [29]:
# Forward-selection trials with index label 2251 or later, best validation MAE first.
(
    df_trial_
    .loc[
        df_trial_['remark'] == 'start from top1 column',
        ['datetime', 'remark', 'nfeatures', 'train_mae', 'train_mae_var', 'val_mae', 'val_mae_var', 'mae_diff'],
    ]
    .loc[2251:]
    .sort_values(by=['val_mae'])
)

Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
2367,2019-05-23 21:45:02.439202,start from top1 column,14,1.891329,0.010296,2.061111,0.478845,0.169782
2364,2019-05-23 21:44:02.554098,start from top1 column,13,1.89471,0.010298,2.06115,0.47728,0.16644
2366,2019-05-23 21:44:42.492511,start from top1 column,14,1.893094,0.010457,2.06143,0.477524,0.168336
2370,2019-05-23 21:46:02.765549,start from top1 column,15,1.887783,0.009995,2.062143,0.48061,0.174359
2371,2019-05-23 21:46:23.100894,start from top1 column,15,1.88704,0.010278,2.063905,0.483393,0.176865
2355,2019-05-23 21:41:04.294214,start from top1 column,12,1.924735,0.010826,2.064491,0.493286,0.139756
2347,2019-05-23 21:38:28.495673,start from top1 column,11,1.928975,0.010977,2.064884,0.490122,0.135908
2358,2019-05-23 21:42:03.272748,start from top1 column,13,1.922165,0.010885,2.065722,0.489924,0.143557
2357,2019-05-23 21:41:43.465034,start from top1 column,13,1.921112,0.010941,2.066165,0.492241,0.145054
2363,2019-05-23 21:43:42.196320,start from top1 column,13,1.924659,0.010851,2.06673,0.490025,0.142071


In [28]:
# Print how many features the forward selection kept, then display the list itself
# (the bare last expression is rendered as the cell output).
print(len(selected_columns))
selected_columns

14


['q25_roll_std_100',
 'abs_q25_5',
 'spkt_welch_density__coeff_3',
 'abs_q75_6',
 'abs_q75_7',
 'spkt_welch_densitycoeff_2',
 'median__roll_std',
 'abs_q01_5',
 '5000smoothness_quantile05',
 '5000smoothness_std_',
 'abs_q95_3',
 'FFT_Mag_75q0',
 '5000median_std_',
 'spkt_welch_density__coeff_17']

In [31]:
# Fresh buffer for this tuning run; `objective` hands it to EP.process via `trial=`.
mytrial = []
# Tune hyperparameters with Optuna.
def objective(trial):
    """Optuna objective: mean cross-validated validation MAE of one LightGBM setup.

    Samples the LightGBM constructor arguments from ``trial``, evaluates them via
    ``EP.process`` on the notebook-level ``df_train`` / ``df_test`` restricted to
    ``selected_columns`` (8 splits, ``'group'``-type k-fold, ``StandardScaler``),
    and stores summary statistics of the run on the trial as user attributes.
    The notebook-level ``mytrial`` is passed to ``EP.process`` as ``trial=``.

    Parameters
    ----------
    trial
        The Optuna trial object providing ``suggest_*`` and ``set_user_attr``.

    Returns
    -------
    float
        Mean of ``df_his.valid``, the value the study minimises.
    """
    # Search space. The suggest calls are evaluated top to bottom, in the same
    # order as before.
    # NOTE(review): the LightGBM seed is part of the search space, so some of the
    # measured "improvement" may be seed noise rather than better hyper-parameters.
    lgbm_init = {
        'learning_rate': trial.suggest_uniform('learning_rate', .01, .5),
        'feature_fraction': trial.suggest_uniform('feature_fraction', .6, 1),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.6, 1),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 200, 800),
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-6, 1e2),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-6, 1e2),
        'max_bin': trial.suggest_int('max_bin', 10, 100),
        'num_leaves': trial.suggest_int('num_leaves', 4, 128),
        'random_state': trial.suggest_int('random_state', 1, 9999),
        'n_jobs': 32,
    }

    args = {
        'columns': selected_columns,
        'kfold': {
            'n_splits': 8,
            'random_state': 1985,
            'shuffle': True,
            'type': 'group',
        },
        'scaler': {
            'cls': 'StandardScaler',
            'init': {},
        },
        'algorithm': {
            'cls': 'lgb.LGBMRegressor',
            'init': lgbm_init,
            'fit': {},
        },
    }

    df_his, _, _, _ = EP.process(df_train, args, df_test=df_test, trial=mytrial, remark='tune 2367')

    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)

    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', val_mae_mean - train_mae_mean)
    trial.set_user_attr('val_mae_var', val_mae_var)
    # Previously computed but dropped; record it alongside the other statistics.
    trial.set_user_attr('train_mae_var', train_mae_var)

    return val_mae_mean

# Minimise the value returned by `objective` over 200 trials.
# The TPE sampler (Optuna's default) is seeded explicitly so that re-running the
# notebook proposes the same sequence of hyper-parameters instead of a new random one.
study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=1985))
study.optimize(objective, n_trials=200)

[I 2019-05-23 22:01:19,358] Finished a trial resulted in value: 2.0838188880477237. Current best value is 2.0838188880477237 with parameters: {'learning_rate': 0.22847582393737892, 'feature_fraction': 0.9616895731289026, 'bagging_fraction': 0.8793241815741455, 'min_data_in_leaf': 701, 'lambda_l1': 1.932840208660401, 'lambda_l2': 2.3054027836862958e-06, 'max_bin': 79, 'num_leaves': 56, 'random_state': 5464}.
[I 2019-05-23 22:01:56,717] Finished a trial resulted in value: 2.0766060524028123. Current best value is 2.0766060524028123 with parameters: {'learning_rate': 0.19550204981477504, 'feature_fraction': 0.829324683685019, 'bagging_fraction': 0.7485187330556753, 'min_data_in_leaf': 658, 'lambda_l1': 0.0002597540735437899, 'lambda_l2': 9.872104948629654e-05, 'max_bin': 36, 'num_leaves': 108, 'random_state': 8379}.
[I 2019-05-23 22:02:52,231] Finished a trial resulted in value: 2.1332477548799105. Current best value is 2.0766060524028123 with parameters: {'learning_rate': 0.1955020498147

[I 2019-05-23 22:15:43,925] Finished a trial resulted in value: 2.0649139809721104. Current best value is 2.0637099097635407 with parameters: {'learning_rate': 0.06945693708392793, 'feature_fraction': 0.701264939545745, 'bagging_fraction': 0.6452793956482179, 'min_data_in_leaf': 557, 'lambda_l1': 0.511216675429136, 'lambda_l2': 0.0008640855732412854, 'max_bin': 86, 'num_leaves': 120, 'random_state': 8484}.
[I 2019-05-23 22:16:19,588] Finished a trial resulted in value: 2.0743942208455373. Current best value is 2.0637099097635407 with parameters: {'learning_rate': 0.06945693708392793, 'feature_fraction': 0.701264939545745, 'bagging_fraction': 0.6452793956482179, 'min_data_in_leaf': 557, 'lambda_l1': 0.511216675429136, 'lambda_l2': 0.0008640855732412854, 'max_bin': 86, 'num_leaves': 120, 'random_state': 8484}.
[I 2019-05-23 22:17:01,293] Finished a trial resulted in value: 2.0917580110351737. Current best value is 2.0637099097635407 with parameters: {'learning_rate': 0.06945693708392793,

[I 2019-05-23 22:28:33,897] Finished a trial resulted in value: 2.218451776976593. Current best value is 2.0637099097635407 with parameters: {'learning_rate': 0.06945693708392793, 'feature_fraction': 0.701264939545745, 'bagging_fraction': 0.6452793956482179, 'min_data_in_leaf': 557, 'lambda_l1': 0.511216675429136, 'lambda_l2': 0.0008640855732412854, 'max_bin': 86, 'num_leaves': 120, 'random_state': 8484}.
[I 2019-05-23 22:29:26,814] Finished a trial resulted in value: 2.0707440799374193. Current best value is 2.0637099097635407 with parameters: {'learning_rate': 0.06945693708392793, 'feature_fraction': 0.701264939545745, 'bagging_fraction': 0.6452793956482179, 'min_data_in_leaf': 557, 'lambda_l1': 0.511216675429136, 'lambda_l2': 0.0008640855732412854, 'max_bin': 86, 'num_leaves': 120, 'random_state': 8484}.
[I 2019-05-23 22:30:10,299] Finished a trial resulted in value: 2.0719932424965135. Current best value is 2.0637099097635407 with parameters: {'learning_rate': 0.06945693708392793, 

[I 2019-05-23 22:43:29,224] Finished a trial resulted in value: 2.217045660517852. Current best value is 2.0637099097635407 with parameters: {'learning_rate': 0.06945693708392793, 'feature_fraction': 0.701264939545745, 'bagging_fraction': 0.6452793956482179, 'min_data_in_leaf': 557, 'lambda_l1': 0.511216675429136, 'lambda_l2': 0.0008640855732412854, 'max_bin': 86, 'num_leaves': 120, 'random_state': 8484}.
[I 2019-05-23 22:44:11,510] Finished a trial resulted in value: 2.0659885035798657. Current best value is 2.0637099097635407 with parameters: {'learning_rate': 0.06945693708392793, 'feature_fraction': 0.701264939545745, 'bagging_fraction': 0.6452793956482179, 'min_data_in_leaf': 557, 'lambda_l1': 0.511216675429136, 'lambda_l2': 0.0008640855732412854, 'max_bin': 86, 'num_leaves': 120, 'random_state': 8484}.
[I 2019-05-23 22:44:52,887] Finished a trial resulted in value: 2.0681402781806972. Current best value is 2.0637099097635407 with parameters: {'learning_rate': 0.06945693708392793, 

[I 2019-05-23 22:57:18,659] Finished a trial resulted in value: 2.0643770578286946. Current best value is 2.0628346411024547 with parameters: {'learning_rate': 0.06996433095942133, 'feature_fraction': 0.7896471586486594, 'bagging_fraction': 0.7439815985215854, 'min_data_in_leaf': 456, 'lambda_l1': 0.015012800511685544, 'lambda_l2': 3.222321068953333, 'max_bin': 87, 'num_leaves': 83, 'random_state': 6199}.
[I 2019-05-23 22:58:20,411] Finished a trial resulted in value: 2.0828141771003796. Current best value is 2.0628346411024547 with parameters: {'learning_rate': 0.06996433095942133, 'feature_fraction': 0.7896471586486594, 'bagging_fraction': 0.7439815985215854, 'min_data_in_leaf': 456, 'lambda_l1': 0.015012800511685544, 'lambda_l2': 3.222321068953333, 'max_bin': 87, 'num_leaves': 83, 'random_state': 6199}.
[I 2019-05-23 22:59:19,903] Finished a trial resulted in value: 2.1825759737813457. Current best value is 2.0628346411024547 with parameters: {'learning_rate': 0.06996433095942133, '

[I 2019-05-23 23:15:28,924] Finished a trial resulted in value: 2.0692846155192397. Current best value is 2.0628346411024547 with parameters: {'learning_rate': 0.06996433095942133, 'feature_fraction': 0.7896471586486594, 'bagging_fraction': 0.7439815985215854, 'min_data_in_leaf': 456, 'lambda_l1': 0.015012800511685544, 'lambda_l2': 3.222321068953333, 'max_bin': 87, 'num_leaves': 83, 'random_state': 6199}.
[I 2019-05-23 23:16:02,288] Finished a trial resulted in value: 2.072759297268762. Current best value is 2.0628346411024547 with parameters: {'learning_rate': 0.06996433095942133, 'feature_fraction': 0.7896471586486594, 'bagging_fraction': 0.7439815985215854, 'min_data_in_leaf': 456, 'lambda_l1': 0.015012800511685544, 'lambda_l2': 3.222321068953333, 'max_bin': 87, 'num_leaves': 83, 'random_state': 6199}.
[I 2019-05-23 23:16:18,148] Finished a trial resulted in value: 2.060747893694659. Current best value is 2.060747893694659 with parameters: {'learning_rate': 0.10552827589333989, 'fea

[I 2019-05-23 23:24:23,106] Finished a trial resulted in value: 2.063073747466655. Current best value is 2.060747893694659 with parameters: {'learning_rate': 0.10552827589333989, 'feature_fraction': 0.8225468330406465, 'bagging_fraction': 0.6922327051313745, 'min_data_in_leaf': 479, 'lambda_l1': 4.482113130607666e-06, 'lambda_l2': 0.002275923305169795, 'max_bin': 82, 'num_leaves': 12, 'random_state': 8347}.
[I 2019-05-23 23:25:22,036] Finished a trial resulted in value: 2.0609363709237147. Current best value is 2.060747893694659 with parameters: {'learning_rate': 0.10552827589333989, 'feature_fraction': 0.8225468330406465, 'bagging_fraction': 0.6922327051313745, 'min_data_in_leaf': 479, 'lambda_l1': 4.482113130607666e-06, 'lambda_l2': 0.002275923305169795, 'max_bin': 82, 'num_leaves': 12, 'random_state': 8347}.
[I 2019-05-23 23:26:35,339] Finished a trial resulted in value: 2.062829816130937. Current best value is 2.060747893694659 with parameters: {'learning_rate': 0.10552827589333989

[I 2019-05-23 23:37:44,316] Finished a trial resulted in value: 2.2263055250453387. Current best value is 2.0585234036269946 with parameters: {'learning_rate': 0.05919501665446719, 'feature_fraction': 0.888521381879426, 'bagging_fraction': 0.6300793923277712, 'min_data_in_leaf': 246, 'lambda_l1': 1.4237098115498778e-06, 'lambda_l2': 0.0016757688725194976, 'max_bin': 70, 'num_leaves': 19, 'random_state': 8922}.
[I 2019-05-23 23:38:30,802] Finished a trial resulted in value: 2.0877337015561634. Current best value is 2.0585234036269946 with parameters: {'learning_rate': 0.05919501665446719, 'feature_fraction': 0.888521381879426, 'bagging_fraction': 0.6300793923277712, 'min_data_in_leaf': 246, 'lambda_l1': 1.4237098115498778e-06, 'lambda_l2': 0.0016757688725194976, 'max_bin': 70, 'num_leaves': 19, 'random_state': 8922}.
[I 2019-05-23 23:38:53,000] Finished a trial resulted in value: 2.091926915204644. Current best value is 2.0585234036269946 with parameters: {'learning_rate': 0.05919501665

[I 2019-05-23 23:50:54,728] Finished a trial resulted in value: 2.0680323947143475. Current best value is 2.0585234036269946 with parameters: {'learning_rate': 0.05919501665446719, 'feature_fraction': 0.888521381879426, 'bagging_fraction': 0.6300793923277712, 'min_data_in_leaf': 246, 'lambda_l1': 1.4237098115498778e-06, 'lambda_l2': 0.0016757688725194976, 'max_bin': 70, 'num_leaves': 19, 'random_state': 8922}.
[I 2019-05-23 23:51:19,769] Finished a trial resulted in value: 2.065583438269637. Current best value is 2.0585234036269946 with parameters: {'learning_rate': 0.05919501665446719, 'feature_fraction': 0.888521381879426, 'bagging_fraction': 0.6300793923277712, 'min_data_in_leaf': 246, 'lambda_l1': 1.4237098115498778e-06, 'lambda_l2': 0.0016757688725194976, 'max_bin': 70, 'num_leaves': 19, 'random_state': 8922}.
[I 2019-05-23 23:52:00,529] Finished a trial resulted in value: 2.0657236140248223. Current best value is 2.0585234036269946 with parameters: {'learning_rate': 0.05919501665

[I 2019-05-24 00:06:00,221] Finished a trial resulted in value: 2.0711641100370057. Current best value is 2.0585234036269946 with parameters: {'learning_rate': 0.05919501665446719, 'feature_fraction': 0.888521381879426, 'bagging_fraction': 0.6300793923277712, 'min_data_in_leaf': 246, 'lambda_l1': 1.4237098115498778e-06, 'lambda_l2': 0.0016757688725194976, 'max_bin': 70, 'num_leaves': 19, 'random_state': 8922}.
[I 2019-05-24 00:07:04,022] Finished a trial resulted in value: 2.1909551105325686. Current best value is 2.0585234036269946 with parameters: {'learning_rate': 0.05919501665446719, 'feature_fraction': 0.888521381879426, 'bagging_fraction': 0.6300793923277712, 'min_data_in_leaf': 246, 'lambda_l1': 1.4237098115498778e-06, 'lambda_l2': 0.0016757688725194976, 'max_bin': 70, 'num_leaves': 19, 'random_state': 8922}.
[I 2019-05-24 00:08:17,742] Finished a trial resulted in value: 2.0883064506157942. Current best value is 2.0585234036269946 with parameters: {'learning_rate': 0.0591950166

[I 2019-05-24 00:21:38,733] Finished a trial resulted in value: 2.086897217243198. Current best value is 2.0585234036269946 with parameters: {'learning_rate': 0.05919501665446719, 'feature_fraction': 0.888521381879426, 'bagging_fraction': 0.6300793923277712, 'min_data_in_leaf': 246, 'lambda_l1': 1.4237098115498778e-06, 'lambda_l2': 0.0016757688725194976, 'max_bin': 70, 'num_leaves': 19, 'random_state': 8922}.
[I 2019-05-24 00:22:29,663] Finished a trial resulted in value: 2.152938238836611. Current best value is 2.0585234036269946 with parameters: {'learning_rate': 0.05919501665446719, 'feature_fraction': 0.888521381879426, 'bagging_fraction': 0.6300793923277712, 'min_data_in_leaf': 246, 'lambda_l1': 1.4237098115498778e-06, 'lambda_l2': 0.0016757688725194976, 'max_bin': 70, 'num_leaves': 19, 'random_state': 8922}.
[I 2019-05-24 00:22:47,181] Finished a trial resulted in value: 2.064902530908474. Current best value is 2.0585234036269946 with parameters: {'learning_rate': 0.0591950166544

In [33]:
# Store every trial collected in `mytrial`, then reload the trial table and
# re-derive the convenience 'kfold' column from each trial's `param` dict.
for trial_i in mytrial:
    db.insert(trial_i)
df_trial = db.select()
df_trial['kfold'] = df_trial['param'].map(lambda trial_param: trial_param['kfold'])

In [41]:
# 'tune 2367' trials whose mae_diff is below 0.1, best validation MAE first.
(
    df_trial
    .loc[(df_trial['remark'] == 'tune 2367') & (df_trial['mae_diff'] < .1)]
    .sort_values(by=['val_mae'])
    .loc[:, ['datetime', 'remark', 'kfold', 'nfeatures', 'train_mae', 'train_mae_var', 'val_mae', 'val_mae_var', 'mae_diff']]
)

Unnamed: 0,datetime,remark,kfold,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
2404,2019-05-23 22:23:36.444552,tune 2367,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",14,2.008041,0.01231,2.106609,0.548666,0.098568
2388,2019-05-23 22:13:27.663116,tune 2367,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",14,2.043446,0.012748,2.142511,0.568646,0.099065
2431,2019-05-23 22:43:29.213444,tune 2367,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",14,2.165175,0.014123,2.217046,0.586314,0.05187
2411,2019-05-23 22:28:33.890862,tune 2367,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",14,2.127492,0.012567,2.218452,0.572226,0.09096
2509,2019-05-23 23:37:44.282301,tune 2367,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",14,2.144451,0.012607,2.226306,0.563629,0.081854
2553,2019-05-24 00:11:36.540712,tune 2367,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",14,2.154023,0.01225,2.24252,0.563661,0.088497
2461,2019-05-23 23:06:57.730649,tune 2367,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",14,2.165736,0.012522,2.245955,0.570678,0.08022


In [42]:
# Commit the trials inserted above (the store was opened with auto_commit=False).
db.commit()

In [None]:
# Fresh buffer for this tuning run; `objective` hands it to EP.process via `trial=`.
mytrial = []
# Tune hyperparameters with Optuna.
def objective(trial):
    """Optuna objective: mean cross-validated validation MAE, stratified 3-fold variant.

    Re-defines (and therefore shadows) the ``objective`` of the earlier tuning cell.
    The search space is the same; the evaluation uses 3 splits with
    ``'stratified'``-type k-fold, ``n_jobs=16`` and the remark
    ``'tune 2367 by stratified'``.

    Samples the LightGBM constructor arguments from ``trial``, evaluates them via
    ``EP.process`` on the notebook-level ``df_train`` / ``df_test`` restricted to
    ``selected_columns``, and stores summary statistics of the run on the trial as
    user attributes. The notebook-level ``mytrial`` is passed to ``EP.process`` as
    ``trial=``.

    Parameters
    ----------
    trial
        The Optuna trial object providing ``suggest_*`` and ``set_user_attr``.

    Returns
    -------
    float
        Mean of ``df_his.valid``, the value the study minimises.
    """
    # Search space. The suggest calls are evaluated top to bottom, in the same
    # order as before.
    # NOTE(review): the LightGBM seed is part of the search space, so some of the
    # measured "improvement" may be seed noise rather than better hyper-parameters.
    lgbm_init = {
        'learning_rate': trial.suggest_uniform('learning_rate', .01, .5),
        'feature_fraction': trial.suggest_uniform('feature_fraction', .6, 1),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.6, 1),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 200, 800),
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-6, 1e2),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-6, 1e2),
        'max_bin': trial.suggest_int('max_bin', 10, 100),
        'num_leaves': trial.suggest_int('num_leaves', 4, 128),
        'random_state': trial.suggest_int('random_state', 1, 9999),
        'n_jobs': 16,
    }

    args = {
        'columns': selected_columns,
        'kfold': {
            'n_splits': 3,
            'random_state': 1985,
            'shuffle': True,
            'type': 'stratified',
        },
        'scaler': {
            'cls': 'StandardScaler',
            'init': {},
        },
        'algorithm': {
            'cls': 'lgb.LGBMRegressor',
            'init': lgbm_init,
            'fit': {},
        },
    }

    df_his, _, _, _ = EP.process(df_train, args, df_test=df_test, trial=mytrial, remark='tune 2367 by stratified')

    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)

    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', val_mae_mean - train_mae_mean)
    trial.set_user_attr('val_mae_var', val_mae_var)
    # Previously computed but dropped; record it alongside the other statistics.
    trial.set_user_attr('train_mae_var', train_mae_var)

    return val_mae_mean

# Minimise the value returned by `objective` over 200 trials.
# The TPE sampler (Optuna's default) is seeded explicitly so that re-running the
# notebook proposes the same sequence of hyper-parameters instead of a new random one.
study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=1985))
study.optimize(objective, n_trials=200)

[I 2019-05-24 02:21:29,638] Finished a trial resulted in value: 1.9070712611607885. Current best value is 1.9070712611607885 with parameters: {'learning_rate': 0.47974439322688767, 'feature_fraction': 0.8344750630086852, 'bagging_fraction': 0.6537402084729821, 'min_data_in_leaf': 435, 'lambda_l1': 0.001996785641796469, 'lambda_l2': 0.00107938464105204, 'max_bin': 19, 'num_leaves': 86, 'random_state': 44}.
[I 2019-05-24 02:28:56,121] Finished a trial resulted in value: 2.150530612530169. Current best value is 1.9070712611607885 with parameters: {'learning_rate': 0.47974439322688767, 'feature_fraction': 0.8344750630086852, 'bagging_fraction': 0.6537402084729821, 'min_data_in_leaf': 435, 'lambda_l1': 0.001996785641796469, 'lambda_l2': 0.00107938464105204, 'max_bin': 19, 'num_leaves': 86, 'random_state': 44}.
[I 2019-05-24 02:40:32,083] Finished a trial resulted in value: 1.8787911489289948. Current best value is 1.8787911489289948 with parameters: {'learning_rate': 0.3750361685264374, 'fe

In [33]:
# Store every trial collected in `mytrial`, then reload the trial table and
# re-derive the convenience 'kfold' column from each trial's `param` dict.
for trial_i in mytrial:
    db.insert(trial_i)
df_trial = db.select()
df_trial['kfold'] = df_trial['param'].map(lambda trial_param: trial_param['kfold'])

In [41]:
# 'tune 2367 by stratified' trials whose mae_diff is below 0.1, best validation MAE first.
(
    df_trial
    .loc[(df_trial['remark'] == 'tune 2367 by stratified') & (df_trial['mae_diff'] < .1)]
    .sort_values(by=['val_mae'])
    .loc[:, ['datetime', 'remark', 'kfold', 'nfeatures', 'train_mae', 'train_mae_var', 'val_mae', 'val_mae_var', 'mae_diff']]
)

Unnamed: 0,datetime,remark,kfold,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
2404,2019-05-23 22:23:36.444552,tune 2367,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",14,2.008041,0.01231,2.106609,0.548666,0.098568
2388,2019-05-23 22:13:27.663116,tune 2367,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",14,2.043446,0.012748,2.142511,0.568646,0.099065
2431,2019-05-23 22:43:29.213444,tune 2367,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",14,2.165175,0.014123,2.217046,0.586314,0.05187
2411,2019-05-23 22:28:33.890862,tune 2367,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",14,2.127492,0.012567,2.218452,0.572226,0.09096
2509,2019-05-23 23:37:44.282301,tune 2367,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",14,2.144451,0.012607,2.226306,0.563629,0.081854
2553,2019-05-24 00:11:36.540712,tune 2367,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",14,2.154023,0.01225,2.24252,0.563661,0.088497
2461,2019-05-23 23:06:57.730649,tune 2367,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",14,2.165736,0.012522,2.245955,0.570678,0.08022


In [42]:
# Commit the trials inserted above (the store was opened with auto_commit=False).
db.commit()