In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler

import lightgbm as lgb
import catboost as cb
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

from tsfresh.examples import load_robot_execution_failures
from tsfresh import extract_features, select_features
import optuna

from common import EP
from dfdb import DFDB

import types
import copy

Using TensorFlow backend.


In [2]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [3]:
# pd.set_option('display.max_columns', 2000)
# pd.set_option('display.width', 2000)
# pd.set_option('display.expand_frame_repr', True)
pd.set_option('display.max_colwidth', -1)

In [4]:
df_train = pd.read_pickle('../feats/df_train.pkl')
df_test = pd.read_pickle('../feats/df_test.pkl')

In [5]:
df_train['label'] = df_train['y'].apply(lambda x:  int(x) if x<15 else 15)
group = df_train['season'].values
group[np.where(group==17)[0]] = 1
df_train['group'] = group
df_train = df_train.drop(columns=['season'])

In [6]:
test_X = df_train.drop(columns=['y','index','group','label']).copy()
test_X.index = df_train['index']
test_y = df_train['y'].copy()
test_y.index = df_train['index']
tsfresh_columns = select_features(test_X, test_y).columns.tolist()

In [7]:
original_columns = df_train.columns.drop(['index','y','label','group']).tolist()

In [8]:
np.unique(group)

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16])

In [9]:
# mytrial = []
db = DFDB('../trial2/mystacknet2.pkl', auto_commit=False)

In [10]:
def revert_rfe(df_train, param, sorted_columns, df_test, trial, start_columns, remark=None):
    
    # init cv_score and try only base feature
    selected_columns = copy.deepcopy(start_columns)
    args = copy.deepcopy(param)
    args['columns'] = selected_columns
    df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train, args, df_test = df_test, trial=trial, remark=remark)
    val_mae_mean = np.mean(df_his.valid)
    cv_score = val_mae_mean
    
    # add feature one by one and check cv score change
    for idx,col in enumerate(sorted_columns):
#         if idx in start_column_index:
#             continue
        args = copy.deepcopy(param)
        args['columns'] = list(set(selected_columns + [col]))
        df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train, args, df_test = df_test, trial=trial, remark=remark)
        val_mae_mean = np.mean(df_his.valid)
        if val_mae_mean < cv_score:
            selected_columns.append(col)
            cv_score = val_mae_mean
            
    return selected_columns


In [11]:
db_lgbm = DFDB('../trial2/lgbm.pkl', auto_commit=False)
df_trial_lgbm = db_lgbm.select()
df_trial_lgbm['kfold'] = df_trial_lgbm['param'].apply(lambda x: x['kfold'])
df_trial_lgbm.loc[[269, 391]][['datetime','remark','kfold','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,remark,kfold,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
269,2019-05-17 05:34:44.593625,group3 RFE from 51feats,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",21,2.008663,0.003328,2.061966,0.014057,0.053304
391,2019-05-17 06:10:32.416213,tune 269,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",21,1.979505,0.002285,2.067314,0.010692,0.087809


In [12]:
df_trial_lgbm.loc[[254]][['datetime','remark','kfold','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,remark,kfold,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
254,2019-05-17 05:33:52.140704,group3 RFE from 51feats,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",51,2.000854,0.003298,2.065917,0.014304,0.065063


In [18]:
param_idx = 391
column_idx = 254
db_ = db_lgbm
df_trial_ = df_trial_lgbm

mytrial = []
columns = copy.deepcopy(df_trial_.loc[column_idx]['param']['columns'])
param = copy.deepcopy(df_trial_.loc[param_idx]['param'])
param['kfold']['n_splits']=8
# param['algorithm']['init'] = {'logging_level': 'Silent', **param['algorithm']['init']}
param['scaler']['init'] = {}
param['algorithm']['fit'] = {}

selected_columns = revert_rfe(df_train, param, columns, df_test, mytrial, [columns[0]], remark='start from top1 column2')
print(len(selected_columns))
for trial_i in mytrial:
    db_.insert(trial_i)
df_trial_ = db_.select()
df_trial_['kfold'] = df_trial_['param'].apply(lambda x: x['kfold'])


14


In [19]:
df_trial_[df_trial_['remark']=='start from top1 column2'].sort_values(by=['val_mae'])[['datetime','remark','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
794,2019-05-22 23:21:51.857377,start from top1 column2,14,1.987193,0.012072,2.085777,0.542064,0.098584
786,2019-05-22 23:19:25.317229,start from top1 column2,13,1.986643,0.011982,2.085991,0.539995,0.099348
788,2019-05-22 23:20:01.930738,start from top1 column2,14,1.9863,0.012111,2.085993,0.54391,0.099693
797,2019-05-22 23:22:46.701378,start from top1 column2,15,1.986276,0.01203,2.086079,0.539744,0.099803
790,2019-05-22 23:20:38.512187,start from top1 column2,14,1.987046,0.012083,2.086096,0.542167,0.09905
783,2019-05-22 23:18:30.192508,start from top1 column2,12,1.989236,0.012151,2.0862,0.542562,0.096964
782,2019-05-22 23:18:11.808810,start from top1 column2,11,1.989255,0.012097,2.086206,0.542456,0.096952
789,2019-05-22 23:20:20.218343,start from top1 column2,14,1.987568,0.012137,2.08624,0.542523,0.098672
796,2019-05-22 23:22:28.419258,start from top1 column2,15,1.987054,0.012069,2.086256,0.539983,0.099202
791,2019-05-22 23:20:56.780245,start from top1 column2,14,1.986736,0.012028,2.086265,0.542435,0.099529


In [None]:
idx_ = 
#  tune hypterparameters
def objective(trial):
        
    learning_rate = trial.suggest_uniform('learning_rate', .01, .5)
    feature_fraction = trial.suggest_uniform('feature_fraction', .6, 1)
    bagging_fraction = trial.suggest_uniform('bagging_fraction', 0.6, 1)
    min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 200,800)
    lambda_l1 = trial.suggest_uniform('lambda_l1', 1e-6, 1e2)
    lambda_l2 = trial.suggest_uniform('lambda_l2', 1e-6, 1e2)
    max_bin = trial.suggest_int('max_bin', 10, 100)
    num_leaves = trial.suggest_int('num_leaves', 4, 128)
    random_state = trial.suggest_int('random_state', 1, 9999)
        
    args={
        'columns':copy.deepcopy(df_trial_.loc[idx_].param['columns']),
        'kfold':{
            'n_splits': 8,
            'random_state': 1985,
            'shuffle': True,
            'type': 'group'
        },
        'scaler':{
            'cls':'StandardScaler',
        },
        'algorithm':{
            'cls':'lgb.LGBMRegressor',
            'init':{
                'learning_rate':learning_rate,
                'feature_fraction':feature_fraction,
                'bagging_fraction':bagging_fraction,
                'min_data_in_leaf':min_data_in_leaf,
                'lambda_l1':lambda_l1,
                'lambda_l2':lambda_l2,
                'max_bin':max_bin,
                'num_leaves':num_leaves,
                'random_state':random_state,
                'n_jobs':16
            },
            'fit':{
            },
        },
    }
    
    df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train, args, df_test = df_test, trial=mytrial, remark='tune '.format(idx_))
    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)
    
    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', val_mae_mean-train_mae_mean)
    trial.set_user_attr('val_mae_var', val_mae_var)

    return val_mae_mean

study = optuna.create_study()
study.optimize(objective, n_trials=200)

In [None]:
for trial_i in mytrial:
    db_.insert(trial_i)
df_trial_ = db_.select()
df_trial_['kfold'] = df_trial_['param'].apply(lambda x: x['kfold'])
df_trial_[(df_trial_['remark']=='start from top1 column')&(df_trial_['mae_diff']<.06)].sort_values(by=['val_mae'])[['datetime','remark','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].head(5)

In [None]:
best_lgbm_idx =

In [66]:
db_.commit()

In [11]:
db_xgbm = DFDB('../trial2/xgbm.pkl', auto_commit=False)
df_trial_xgbm = db_xgbm.select()
df_trial_xgbm['kfold'] = df_trial_xgbm['param'].apply(lambda x: x['kfold'])
df_trial_xgbm.loc[[378,488,667,689, 286, 254]][['datetime','remark','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
378,2019-05-17 14:40:44.908569,tune 286,21,1.962614,0.003184,2.052332,0.015719,0.089717
488,2019-05-17 18:23:35.551786,remodel 378 use stratified,21,1.977546,1.8e-05,2.000048,6.6e-05,0.022502
667,2019-05-17 18:34:06.894757,tune 286 by stratified,21,1.895596,3.7e-05,1.945179,5.4e-05,0.049582
689,2019-05-17 18:38:44.960095,remodel 667 use group,21,1.866915,0.001973,2.045915,0.01453,0.178999
286,2019-05-17 14:25:16.036224,group3 RFE2,21,1.962638,0.002975,2.055362,0.015291,0.092724
254,2019-05-17 14:24:24.128242,group3 RFE2,51,1.949187,0.003206,2.056743,0.01394,0.107555


In [70]:
param_idx = 667
column_idx = 254
db_ = db_xgbm
df_trial_ = df_trial_xgbm

mytrial = []
columns = copy.deepcopy(df_trial_.loc[column_idx]['param']['columns'])
param = copy.deepcopy(df_trial_.loc[param_idx]['param'])
# param['algorithm']['init'] = {'logging_level': 'Silent', **param['algorithm']['init']}
param['scaler']['init'] = {}
param['algorithm']['fit'] = {}

selected_columns = revert_rfe(df_train, param, columns, df_test, mytrial, remark='start from top1 column')
print(len(selected_columns))
for trial_i in mytrial:
    db_.insert(trial_i)
df_trial_ = db_.select()
df_trial_['kfold'] = df_trial_['param'].apply(lambda x: x['kfold'])
df_trial_[df_trial_['remark']=='start from top1 column'].sort_values(by=['val_mae'])[['datetime','remark','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].head(1)

35


Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
758,2019-05-22 09:18:22.514990,start from top1 column,35,1.859676,4.9e-05,1.91714,7.5e-05,0.057465


In [71]:
db_.commit()

In [12]:
db_catboost = DFDB('../trial2/catboost.pkl', auto_commit=False)
df_trial_catboost = db_catboost.select()
df_trial_catboost['kfold'] = df_trial_catboost['param'].apply(lambda x: x['kfold'])
df_trial_catboost.loc[[399,448,625,649,239]][['datetime','remark', 'kfold', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,remark,kfold,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
399,2019-05-19 16:16:28.165523,tune 239,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",18,2.011196,0.000486,2.094451,0.002037,0.083255
448,2019-05-19 21:26:25.175250,remodel 399 use stratified,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'stratified'}",18,2.030253,1e-06,2.035186,4e-06,0.004932
625,2019-05-19 18:43:45.614456,tune 239 by stratified,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'stratified'}",18,1.870594,4e-06,1.917066,8.9e-05,0.046472
649,2019-05-19 21:29:50.830576,remodel 625 use group,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",18,1.812501,0.000577,2.080786,0.00455,0.268285
239,2019-05-19 14:26:03.463079,group3 RFE3,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",18,2.022445,0.000414,2.098963,0.003333,0.076519


In [76]:
# df_trial_catboost[df_trial_catboost['remark']=='group3 RFE3'][['datetime','remark', 'kfold', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

In [82]:
param_idx = 625
column_idx = 211
db_ = db_catboost
df_trial_ = df_trial_catboost

mytrial = []
columns = copy.deepcopy(df_trial_.loc[column_idx]['param']['columns'])
param = copy.deepcopy(df_trial_.loc[param_idx]['param'])
param['algorithm']['init'] = {'logging_level': 'Silent', **param['algorithm']['init']}
param['scaler']['init'] = {}
param['algorithm']['fit'] = {}

selected_columns = revert_rfe(df_train, param, columns, df_test, mytrial, remark='start from top1 column')
print(len(selected_columns))
for trial_i in mytrial:
    db_.insert(trial_i)
df_trial_ = db_.select()
df_trial_['kfold'] = df_trial_['param'].apply(lambda x: x['kfold'])
df_trial_[(df_trial_['remark']=='start from top1 column')&(df_trial_['mae_diff']<.06)].sort_values(by=['val_mae'])[['datetime','remark','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].head(5)

Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
688,2019-05-22 09:27:08.122953,start from top1 column,15,1.813774,8e-06,1.871522,0.000134,0.057747
687,2019-05-22 09:26:53.370106,start from top1 column,14,1.814666,2e-06,1.872204,9.4e-05,0.057538
689,2019-05-22 09:27:24.407777,start from top1 column,16,1.8137,1.8e-05,1.872386,0.000117,0.058686
686,2019-05-22 09:26:39.128497,start from top1 column,13,1.817598,4e-06,1.874092,7.8e-05,0.056494
684,2019-05-22 09:26:10.987154,start from top1 column,12,1.869628,5e-06,1.917836,9.6e-05,0.048208


In [83]:
db_.commit()

In [13]:
db_randomforest = DFDB('../trial2/randomforest.pkl', auto_commit=False)
df_trial_randomforest = db_randomforest.select()
df_trial_randomforest['kfold'] = df_trial_randomforest['param'].apply(lambda x: x['kfold'])
df_trial_randomforest.loc[[40, 78,242,253,443]][['datetime','remark', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
40,2019-05-18 08:08:56.382129,group3 RFE2,21,2.067817,0.002064,2.12311,0.006078,0.055293
78,2019-05-18 08:31:09.037848,tune 40,21,2.11185,0.002311,2.145305,0.007731,0.033456
242,2019-05-18 09:09:25.478405,remodel 78 use stratified,21,2.122507,5.5e-05,2.127397,0.000151,0.00489
253,2019-05-18 09:16:12.602745,tune 40 by stratified,21,2.084296,7e-06,2.090182,8.3e-05,0.005887
443,2019-05-18 11:06:48.347881,remodel 253 use group,21,2.070761,0.002008,2.133419,0.005291,0.062658


In [86]:
# df_trial_randomforest[df_trial_randomforest['remark']=='group3 RFE2'][['datetime','remark', 'kfold', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]
# 21

In [88]:
param_idx = 253
column_idx = 21
db_ = db_randomforest
df_trial_ = df_trial_randomforest

mytrial = []
columns = copy.deepcopy(df_trial_.loc[column_idx]['param']['columns'])
param = copy.deepcopy(df_trial_.loc[param_idx]['param'])
# param['algorithm']['init'] = {'logging_level': 'Silent', **param['algorithm']['init']}
param['scaler']['init'] = {}
param['algorithm']['fit'] = {}

selected_columns = revert_rfe(df_train, param, columns, df_test, mytrial, remark='start from top1 column')
print(len(selected_columns))
for trial_i in mytrial:
    db_.insert(trial_i)
df_trial_ = db_.select()
df_trial_['kfold'] = df_trial_['param'].apply(lambda x: x['kfold'])
df_trial_[(df_trial_['remark']=='start from top1 column')&(df_trial_['mae_diff']<.06)].sort_values(by=['val_mae'])[['datetime','remark','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].head(5)

7


Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
574,2019-05-22 09:43:36.871284,start from top1 column,7,2.084647,5e-06,2.089882,7e-05,0.005235
585,2019-05-22 09:48:57.616511,start from top1 column,8,2.084874,5e-06,2.09001,7.4e-05,0.005136
577,2019-05-22 09:45:01.446371,start from top1 column,8,2.084912,5e-06,2.090103,7.4e-05,0.005191
591,2019-05-22 09:51:53.999963,start from top1 column,8,2.084942,5e-06,2.090111,7.4e-05,0.005169
575,2019-05-22 09:44:05.063535,start from top1 column,8,2.084423,7e-06,2.090126,7.6e-05,0.005703


In [89]:
db_.commit()

In [14]:
db_extratrees = DFDB('../trial2/extratrees.pkl', auto_commit=False)
df_trial_extratrees = db_extratrees.select()
df_trial_extratrees['kfold'] = df_trial_extratrees['param'].apply(lambda x: x['kfold'])
df_trial_extratrees.loc[[38,236,348,437,31]][['datetime','remark','kfold', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,remark,kfold,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
38,2019-05-18 02:19:58.774681,tune 31,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",26,2.087228,0.002689,2.10074,0.011794,0.013511
236,2019-05-18 06:42:38.618614,remodel 38 use stratified,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'stratified'}",26,2.090315,3e-06,2.091928,2.6e-05,0.001613
348,2019-05-18 07:10:17.473837,tune 31 by stratified,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'stratified'}",26,2.094741,3e-06,2.0963,2.2e-05,0.001559
437,2019-05-18 07:30:23.386685,remodel 348 use group,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",26,2.09126,0.002742,2.103574,0.011848,0.012313
31,2019-05-18 01:55:38.757217,group3 RFE2,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",26,2.088347,0.002761,2.101585,0.011729,0.013238


In [108]:
# df_trial_extratrees[df_trial_extratrees['remark']=='group3 RFE2'][['datetime','remark', 'kfold', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].loc[[22]]

In [99]:
param_idx = 348
column_idx = 22
db_ = db_extratrees
df_trial_ = df_trial_extratrees

mytrial = []
columns = copy.deepcopy(df_trial_.loc[column_idx]['param']['columns'])
param = copy.deepcopy(df_trial_.loc[param_idx]['param'])
# param['algorithm']['init'] = {'logging_level': 'Silent', **param['algorithm']['init']}
param['scaler']['init'] = {}
param['algorithm']['fit'] = {}

selected_columns = revert_rfe(df_train, param, columns, df_test, mytrial, remark='start from top1 column')
print(len(selected_columns))
for trial_i in mytrial:
    db_.insert(trial_i)
df_trial_ = db_.select()
df_trial_['kfold'] = df_trial_['param'].apply(lambda x: x['kfold'])
df_trial_[(df_trial_['remark']=='start from top1 column')&(df_trial_['mae_diff']<.06)].sort_values(by=['val_mae'])[['datetime','remark','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].head(5)


Data with input dtype int64 was converted to float64 by StandardScaler.


Data with input dtype int64 was converted to float64 by StandardScaler.


Data with input dtype int64 was converted to float64 by StandardScaler.


Data with input dtype int64 was converted to float64 by StandardScaler.


Data with input dtype int64 was converted to float64 by StandardScaler.


Data with input dtype int64 was converted to float64 by StandardScaler.


Data with input dtype int64 was converted to float64 by StandardScaler.


Data with input dtype int64 was converted to float64 by StandardScaler.


Data with input dtype int64 was converted to float64 by StandardScaler.


Data with input dtype int64 was converted to float64 by StandardScaler.



17


Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
472,2019-05-22 10:54:52.672764,start from top1 column,17,2.122143,1e-06,2.123263,1.8e-05,0.00112
485,2019-05-22 10:58:25.482361,start from top1 column,18,2.122269,1e-06,2.123542,2.1e-05,0.001273
479,2019-05-22 10:56:47.167993,start from top1 column,18,2.122413,4e-06,2.123775,1.3e-05,0.001361
482,2019-05-22 10:57:36.306347,start from top1 column,18,2.122697,1e-06,2.124005,2.1e-05,0.001308
494,2019-05-22 11:00:53.077845,start from top1 column,18,2.12307,3e-06,2.124434,1.7e-05,0.001365


In [100]:
db_.commit()

In [15]:
db_gradientboosting = DFDB('../trial2/gradientboosting.pkl', auto_commit=False)
df_trial_gradientboosting = db_gradientboosting.select()
df_trial_gradientboosting['kfold'] = df_trial_gradientboosting['param'].apply(lambda x: x['kfold'])
df_trial_gradientboosting.loc[[181,232,273,433,23]][['datetime','remark','kfold', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,remark,kfold,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
181,2019-05-18 08:03:47.057550,tune 23,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",18,2.032553,0.002867,2.080601,0.011945,0.048049
232,2019-05-18 08:29:06.495866,remodel 181 use stratified,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'stratified'}",18,2.04161,2e-06,2.053282,8.4e-05,0.011672
273,2019-05-18 08:57:10.043037,tune 23 by stratified,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'stratified'}",18,1.902923,3.3e-05,1.950592,0.000158,0.047668
433,2019-05-18 11:08:44.349147,remodel 273 use group,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",18,1.875567,0.001795,2.035,0.011964,0.159432
23,2019-05-18 02:26:43.940775,group3 RFE2,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",18,1.865293,0.001703,2.031389,0.01167,0.166095


In [104]:
# df_trial_gradientboosting[df_trial_gradientboosting['remark']=='group3 RFE2'][['datetime','remark', 'kfold', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

In [105]:
param_idx = 273
column_idx = 21
db_ = db_gradientboosting
df_trial_ = df_trial_gradientboosting

mytrial = []
columns = copy.deepcopy(df_trial_.loc[column_idx]['param']['columns'])
param = copy.deepcopy(df_trial_.loc[param_idx]['param'])
# param['algorithm']['init'] = {'logging_level': 'Silent', **param['algorithm']['init']}
param['scaler']['init'] = {}
param['algorithm']['fit'] = {}

selected_columns = revert_rfe(df_train, param, columns, df_test, mytrial, remark='start from top1 column')
print(len(selected_columns))
for trial_i in mytrial:
    db_.insert(trial_i)
df_trial_ = db_.select()
df_trial_['kfold'] = df_trial_['param'].apply(lambda x: x['kfold'])
df_trial_[(df_trial_['remark']=='start from top1 column')&(df_trial_['mae_diff']<.06)].sort_values(by=['val_mae'])[['datetime','remark','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].head(5)

17


Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
453,2019-05-22 11:57:27.928732,start from top1 column,17,1.90517,1.5e-05,1.950239,0.000149,0.045069
452,2019-05-22 11:57:02.568869,start from top1 column,16,1.906268,1.9e-05,1.950772,0.000143,0.044504
451,2019-05-22 11:56:38.732269,start from top1 column,15,1.907928,2.1e-05,1.95146,0.000123,0.043533
450,2019-05-22 11:56:16.067806,start from top1 column,14,1.916782,3.2e-05,1.960259,0.000147,0.043477
449,2019-05-22 11:55:54.821540,start from top1 column,13,1.919155,2.9e-05,1.961667,0.000149,0.042512


In [106]:
db_.commit()

In [21]:
catboost_columns = copy.deepcopy(df_trial_catboost.loc[688]['param']['columns'])#688
lgbm_columns = copy.deepcopy(df_trial_lgbm.loc[726]['param']['columns'])#726
xgbm_columns = copy.deepcopy(df_trial_xgbm.loc[758]['param']['columns'])#758
randomforest_randomforest = copy.deepcopy(df_trial_randomforest.loc[574]['param']['columns'])#574
extratrees_columns = copy.deepcopy(df_trial_extratrees.loc[348]['param']['columns'])#348
gradientboosting_columns = copy.deepcopy(df_trial_gradientboosting.loc[453]['param']['columns'])#453

In [22]:
all_columns = catboost_columns+lgbm_columns+xgbm_columns+randomforest_randomforest+extratrees_columns+gradientboosting_columns
unique_columns = list(set(all_columns))
common_columns = []
common_columns50 = []
common_columns75 = []
common_columns95 = []
N_columns = 6
count_values = [all_columns.count(col) for col in unique_columns]
for col in unique_columns:
    if all_columns.count(col)==N_columns:
        common_columns.append(col)
    if all_columns.count(col)>=np.quantile(count_values, .5):
        common_columns50.append(col)
    if all_columns.count(col)>=np.quantile(count_values, .75):
        common_columns75.append(col)
    if all_columns.count(col)>=np.quantile(count_values, .95):
        common_columns95.append(col)
print('unique_columns ',len(unique_columns))
print('common_columns50 ',len(common_columns50))
print('common_columns75 ',len(common_columns75))
print('common_columns95 ',len(common_columns95))
print('common_columns ',len(common_columns))

unique_columns  64
common_columns50  64
common_columns75  24
common_columns95  6
common_columns  3


In [17]:
db_knn = DFDB('../trial2/knn.pkl', auto_commit=False)
df_trial_knn = db_knn.select()
df_trial_knn['kfold'] = df_trial_knn['param'].apply(lambda x: x['kfold'])
df_trial_knn.loc[[253,403,521,604]][['datetime','remark','kfold', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,remark,kfold,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
253,2019-05-20 01:59:56.752054,tune 1,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",15,2.025539,0.002811,2.075086,0.013919,0.049548
403,2019-05-20 04:43:04.926060,remodel 253 use stratified,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'stratified'}",15,2.038075,4e-06,2.042308,3.7e-05,0.004233
521,2019-05-20 06:45:30.675143,tune 1 by stratified,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'stratified'}",15,1.991735,3e-06,2.010023,1.6e-05,0.018288
604,2019-05-20 09:19:19.905349,remodel 521 use group,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",15,1.965282,0.00242,2.082229,0.013162,0.116947


In [124]:
param_idx = 521
column_idx = 521
db_ = db_knn
df_trial_ = df_trial_knn

mytrial = []
columns = copy.deepcopy(common_columns50)
param = copy.deepcopy(df_trial_.loc[param_idx]['param'])
# param['algorithm']['init'] = {'logging_level': 'Silent', **param['algorithm']['init']}
param['scaler']['init'] = {}
param['algorithm']['fit'] = {}

selected_columns = revert_rfe(df_train, param, common_columns50, df_test, mytrial, start_columns=common_columns, remark='start from top1 column')
print(len(selected_columns))
for trial_i in mytrial:
    db_.insert(trial_i)
df_trial_ = db_.select()
df_trial_['kfold'] = df_trial_['param'].apply(lambda x: x['kfold'])
df_trial_[(df_trial_['remark']=='start from top1 column')&(df_trial_['mae_diff']<.06)].sort_values(by=['val_mae'])[['datetime','remark','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].head(5)

26


Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
666,2019-05-22 13:22:50.436810,start from top1 column,26,1.933333,4e-06,1.951963,0.000142,0.018631
668,2019-05-22 13:24:42.742224,start from top1 column,27,1.934381,6e-06,1.952578,0.000139,0.018197
667,2019-05-22 13:23:46.412951,start from top1 column,27,1.934183,1.3e-05,1.952803,0.000153,0.01862
663,2019-05-22 13:19:58.618529,start from top1 column,25,1.934648,5e-06,1.953006,0.000138,0.018359
661,2019-05-22 13:18:11.502480,start from top1 column,24,1.935154,7e-06,1.954024,0.000157,0.01887


In [125]:
db_.commit()

In [19]:
db_svr = DFDB('../trial2/svr.pkl', auto_commit=False)
df_trial_svr = db_svr.select()
df_trial_svr['kfold'] = df_trial_svr['param'].apply(lambda x: x['kfold'])
df_trial_svr.loc[[137,203, 238, 604]][['datetime','remark','kfold', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,remark,kfold,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
137,2019-05-20 09:01:46.433971,tune 1,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",15,2.043196,0.003123,2.09175,0.015234,0.048555
203,2019-05-20 12:49:47.634780,remodel 137 use stratified,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'stratified'}",15,2.054332,2e-06,2.055705,2.3e-05,0.001373
238,2019-05-21 01:02:20.908098,tune 1 by stratified,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'stratified'}",15,2.000519,2e-06,2.004971,2e-06,0.004453
604,2019-05-21 10:31:01.450355,remodel 238 use group,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",15,1.976886,0.002282,2.087706,0.015596,0.110819


In [None]:
param_idx = 238
column_idx = 238
db_ = db_knn
df_trial_ = df_trial_knn

mytrial = []
columns = copy.deepcopy(common_columns50)
param = copy.deepcopy(df_trial_.loc[param_idx]['param'])
# param['algorithm']['init'] = {'logging_level': 'Silent', **param['algorithm']['init']}
param['scaler']['init'] = {}
param['algorithm']['fit'] = {}

selected_columns = revert_rfe(df_train, param, common_columns50, df_test, mytrial, start_columns=common_columns, remark='start from top1 column')
print(len(selected_columns))
for trial_i in mytrial:
    db_.insert(trial_i)
df_trial_ = db_.select()
df_trial_['kfold'] = df_trial_['param'].apply(lambda x: x['kfold'])
df_trial_[(df_trial_['remark']=='start from top1 column')&(df_trial_['mae_diff']<.06)].sort_values(by=['val_mae'])[['datetime','remark','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].head(5)

In [None]:
db_.commit()

In [150]:
df_trial_fm = DFDB('../trial2/fm.pkl', auto_commit=False).select()
df_trial_fm.loc[[75,203,289,404]][['datetime','remark', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
75,2019-05-20 00:35:06.279843,tune 1,15,2.061092,0.003389,2.08785,0.014045,0.026758
203,2019-05-20 00:43:35.910470,remodel 75 use stratified,15,2.066791,6e-06,2.070952,5.2e-05,0.004161
289,2019-05-20 00:50:08.943796,tune 1 by stratified,15,2.059574,1e-05,2.063472,4.2e-05,0.003898
404,2019-05-20 01:07:49.204141,remodel 289 use group,15,2.052364,0.003006,2.091492,0.015884,0.039128


In [151]:
df_trial_lasso = DFDB('../trial2/lasso.pkl', auto_commit=False).select()
df_trial_lasso.loc[[103,203,234,404]][['datetime','remark', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
103,2019-05-20 01:52:46.174696,tune 0,52,2.10715,0.002435,2.155821,0.008016,0.048671
203,2019-05-20 02:40:49.310949,remodel 103 use stratified,52,2.121924,3e-06,2.123358,7e-06,0.001433
234,2019-05-20 02:45:43.776455,tune 0 by stratified,52,2.082728,4e-06,2.085442,3e-05,0.002714
404,2019-05-20 02:50:37.878055,remodel 234 use group,52,2.06857,0.002412,2.137323,0.006813,0.068753


In [152]:
df_trial_ridge = DFDB('../trial2/ridge.pkl', auto_commit=False).select()
df_trial_ridge.loc[[133,204,228,405]][['datetime','remark', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
133,2019-05-20 02:39:46.454189,tune 0,52,2.101847,0.002689,2.136976,0.011682,0.035129
204,2019-05-20 02:42:11.518203,remodel 133 use stratified,52,2.112391,4e-06,2.113513,3.6e-05,0.001122
228,2019-05-20 02:46:36.962602,tune 0 by stratified,52,2.114446,3e-06,2.115535,3.6e-05,0.001089
405,2019-05-20 02:51:40.118659,remodel 228 use group,52,2.104126,0.002708,2.137954,0.011853,0.033828


In [141]:
db_kerasmlp = DFDB('../trial2/kerasmlp.pkl', auto_commit=False)
df_trial_kerasmlp = db_kerasmlp.select()
df_trial_kerasmlp['kfold'] = df_trial_kerasmlp['param'].apply(lambda x: x['kfold'])
df_trial_kerasmlp['algorithm-init'] = df_trial_kerasmlp['param'].apply(lambda x: x['algorithm']['init'] )
df_trial_kerasmlp.loc[[2,4]][['datetime','remark','kfold', 'algorithm-init', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,remark,kfold,algorithm-init,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
2,2019-05-20 04:10:33.387380,,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'group'}","{'batch': 128, 'solver': 'adam', 'metric': 'mean_absolute_error', 'lr': 0.0001, 'sgd_momentum': 0.9, 'sgd_decay': 0.0001, 'base_save_dir': 'KerasMLPRegressor/input_dim=15,hidden_layer_sizes=[64, 16],activation='relu',l1l2regularizer=None,dropout=0.3', 'alias': 'kerasmlp', 'input_dim': 15, 'hidden_layer_sizes': [64, 16], 'activation': 'relu', 'l1l2regularizer': None, 'dropout': 0.3}",15,1.974893,0.002249,2.084937,0.01654,0.110044
4,2019-05-20 04:47:50.530474,,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'stratified'}","{'batch': 128, 'solver': 'adam', 'metric': 'mean_absolute_error', 'lr': 0.0001, 'sgd_momentum': 0.9, 'sgd_decay': 0.0001, 'base_save_dir': 'KerasMLPRegressor/input_dim=15,hidden_layer_sizes=[64, 16],activation='relu',l1l2regularizer=None,dropout=0.3', 'alias': 'kerasmlp', 'input_dim': 15, 'hidden_layer_sizes': [64, 16], 'activation': 'relu', 'l1l2regularizer': None, 'dropout': 0.3}",15,1.997296,7.7e-05,2.006472,1.5e-05,0.009176


In [153]:
#group - stratified
trial_lst = [
    df_trial_xgbm.loc[758], 
    df_trial_lgbm.loc[726],
    df_trial_catboost.loc[688],
    df_trial_gradientboosting.loc[453],
    df_trial_randomforest.loc[574],
    df_trial_extratrees.loc[348],
    df_trial_knn.loc[666], 
    df_trial_svr.loc[137],
    df_trial_fm.loc[75],
    df_trial_lasso.loc[103], 
    df_trial_ridge.loc[133],
    df_trial_kerasmlp.loc[4],
    ]
name_lst = [
    'xgbm758', 
    'lgbm726', 
    'cb688',
    'gbm453',
    'rf574',
    'et348',
    'knn666',
    'svr137', 
    'fm75',
    'lasso103',
    'ridge133',
    'kmlp4',
]

In [154]:
df_train_stacknet = pd.DataFrame()
df_test_stacknet = pd.DataFrame()
for df_, name_ in zip(trial_lst, name_lst):
    try:
        df_test_stacknet_i = df_['df_test_pred']
        df_test_stacknet[name_] = np.mean(df_test_stacknet_i.drop(columns=['index']).values, axis=1)
        
        df_train_stacknet_i = df_['df_valid_pred']
        df_train_stacknet[name_] = df_train_stacknet_i['predict']
        
    except Exception as e:
        print(name_, 'exception')

df_train_stacknet['index'] = df_train_stacknet_i['index']
df_train_stacknet = pd.merge(df_train_stacknet, df_train[['y','index', 'group','label']+catboost_columns], on='index')
df_test_stacknet['index'] = df_test_stacknet_i['index']
df_test_stacknet = pd.merge(df_test_stacknet, df_test[['index']+catboost_columns], on='index')

In [160]:
param = {'columns': name_lst+catboost_columns,
 'kfold': {'n_splits': 3,
  'random_state': 1985,
  'shuffle': True,
  'type': 'stratified'},
 'scaler': {'cls': 'StandardScaler','init':{}},
 'algorithm': 
         {'cls': 'cb.CatBoostRegressor',
  'init': {'num_trees': 334,
   'depth': 4,
   'learning_rate': 0.10006752091939333,
   'l2_leaf_reg': 72.87133089473954,
   'bagging_temperature': 0.7569916539111943,
   'random_strength': 0.3324200819773294,
   'random_state': 5702,
           'logging_level': 'Silent'},
  'fit': {}},
         
}

In [161]:
mytrial = []
df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train_stacknet, param, df_test = df_test_stacknet, trial=mytrial, remark='half revert-rfe')

In [162]:
for trial_i in mytrial:
    db.insert(trial_i)
df_trial = db.select()

In [165]:
df_trial[['datetime','remark', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].tail(1)

Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
402,2019-05-22 14:27:31.828373,half revert-rfe,27,1.705975,0.000135,1.754131,0.000176,0.048156


In [166]:
idx=402
df_test_pred = df_trial.loc[idx]['df_test_pred']
df_submit = pd.DataFrame()
df_submit['time_to_failure'] = np.mean(df_test_pred.drop(columns=['index']).values, axis=1)
df_submit['seg_id'] = df_test_pred['index']
df_submit.to_csv('submission_mystacknet2_{}.csv'.format(idx), index=False)

In [None]:
mytrial = []
#  tune hypterparameters
def objective(trial):
        
    num_trees = trial.suggest_int('num_trees', 200, 1000)
    depth = trial.suggest_int('depth', 2, 10)
    learning_rate = trial.suggest_uniform('learning_rate', 0.01, 0.4)
    l2_leaf_reg = trial.suggest_uniform('l2_leaf_reg', 0.001, 100)
    bagging_temperature = trial.suggest_uniform('bagging_temperature', .6, 1)
    random_strength = trial.suggest_uniform('random_strength', .001, 1)
    random_state = trial.suggest_int('random_state', 1, 9999)
        
    args={
        'columns':name_lst+catboost_columns,
        'kfold':{
            'n_splits': 3,
            'random_state': 1985,
            'shuffle': True,
            'type': 'stratified'
        },
        'scaler':{
            'cls':'StandardScaler',
            'init':{}
        },
        'algorithm':{
            'cls':'cb.CatBoostRegressor',
            'init':{
                "num_trees":num_trees,
                "depth":depth,
                "learning_rate":learning_rate,
                "l2_leaf_reg":l2_leaf_reg,
                "bagging_temperature":bagging_temperature,
                "random_strength":random_strength,
                "random_state":random_state,
            },
            'fit':{
            },
        },
    }
    
    df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train_stacknet, args, df_test = df_test_stacknet, trial=mytrial, remark='tune pure group')
    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)
    
    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', val_mae_mean-train_mae_mean)
    trial.set_user_attr('val_mae_var', val_mae_var)

    return np.abs(val_mae_mean - train_mae_mean)*val_mae_mean

study = optuna.create_study()
study.optimize(objective, n_trials=200)

0:	learn: 4.8872037	total: 81.4ms	remaining: 54.6s
1:	learn: 3.7838922	total: 101ms	remaining: 33.8s
2:	learn: 3.1845275	total: 120ms	remaining: 26.8s
3:	learn: 2.8761021	total: 141ms	remaining: 23.5s
4:	learn: 2.7208942	total: 160ms	remaining: 21.3s
5:	learn: 2.6508343	total: 183ms	remaining: 20.4s
6:	learn: 2.6177873	total: 203ms	remaining: 19.3s
7:	learn: 2.5849963	total: 224ms	remaining: 18.6s
8:	learn: 2.5709179	total: 244ms	remaining: 18s
9:	learn: 2.5504786	total: 265ms	remaining: 17.5s
10:	learn: 2.5393145	total: 286ms	remaining: 17.2s
11:	learn: 2.5345628	total: 306ms	remaining: 16.8s
12:	learn: 2.5323774	total: 328ms	remaining: 16.6s
13:	learn: 2.5228733	total: 350ms	remaining: 16.4s
14:	learn: 2.5200687	total: 370ms	remaining: 16.2s
15:	learn: 2.5163731	total: 391ms	remaining: 16s
16:	learn: 2.5107489	total: 412ms	remaining: 15.9s
17:	learn: 2.5057833	total: 434ms	remaining: 15.8s
18:	learn: 2.4915865	total: 455ms	remaining: 15.6s
19:	learn: 2.4894013	total: 476ms	remaining:

161:	learn: 2.2167279	total: 3.33s	remaining: 10.5s
162:	learn: 2.2157292	total: 3.35s	remaining: 10.5s
163:	learn: 2.2155127	total: 3.37s	remaining: 10.4s
164:	learn: 2.2144363	total: 3.39s	remaining: 10.4s
165:	learn: 2.2128443	total: 3.41s	remaining: 10.4s
166:	learn: 2.2117481	total: 3.43s	remaining: 10.4s
167:	learn: 2.2106376	total: 3.45s	remaining: 10.3s
168:	learn: 2.2095409	total: 3.47s	remaining: 10.3s
169:	learn: 2.2089882	total: 3.49s	remaining: 10.3s
170:	learn: 2.2081124	total: 3.51s	remaining: 10.3s
171:	learn: 2.2073739	total: 3.52s	remaining: 10.2s
172:	learn: 2.2071043	total: 3.55s	remaining: 10.2s
173:	learn: 2.2063003	total: 3.57s	remaining: 10.2s
174:	learn: 2.2050054	total: 3.59s	remaining: 10.2s
175:	learn: 2.2043764	total: 3.61s	remaining: 10.2s
176:	learn: 2.2040568	total: 3.63s	remaining: 10.1s
177:	learn: 2.2033902	total: 3.65s	remaining: 10.1s
178:	learn: 2.2024155	total: 3.67s	remaining: 10.1s
179:	learn: 2.2022470	total: 3.69s	remaining: 10.1s
180:	learn: 

320:	learn: 2.1096622	total: 6.48s	remaining: 7.09s
321:	learn: 2.1088223	total: 6.5s	remaining: 7.07s
322:	learn: 2.1079483	total: 6.52s	remaining: 7.05s
323:	learn: 2.1076593	total: 6.54s	remaining: 7.03s
324:	learn: 2.1075723	total: 6.56s	remaining: 7.01s
325:	learn: 2.1067890	total: 6.58s	remaining: 6.99s
326:	learn: 2.1067280	total: 6.6s	remaining: 6.96s
327:	learn: 2.1063276	total: 6.62s	remaining: 6.94s
328:	learn: 2.1060255	total: 6.64s	remaining: 6.92s
329:	learn: 2.1055841	total: 6.66s	remaining: 6.9s
330:	learn: 2.1053944	total: 6.68s	remaining: 6.88s
331:	learn: 2.1042819	total: 6.7s	remaining: 6.86s
332:	learn: 2.1033167	total: 6.72s	remaining: 6.84s
333:	learn: 2.1026853	total: 6.74s	remaining: 6.82s
334:	learn: 2.1023984	total: 6.76s	remaining: 6.8s
335:	learn: 2.1023269	total: 6.78s	remaining: 6.78s
336:	learn: 2.1022213	total: 6.8s	remaining: 6.76s
337:	learn: 2.1019363	total: 6.82s	remaining: 6.74s
338:	learn: 2.1011732	total: 6.84s	remaining: 6.72s
339:	learn: 2.1010

479:	learn: 2.0550553	total: 9.6s	remaining: 3.84s
480:	learn: 2.0544053	total: 9.62s	remaining: 3.82s
481:	learn: 2.0539918	total: 9.64s	remaining: 3.8s
482:	learn: 2.0534977	total: 9.66s	remaining: 3.78s
483:	learn: 2.0531388	total: 9.68s	remaining: 3.76s
484:	learn: 2.0527203	total: 9.7s	remaining: 3.74s
485:	learn: 2.0526139	total: 9.71s	remaining: 3.72s
486:	learn: 2.0524696	total: 9.73s	remaining: 3.7s
487:	learn: 2.0523828	total: 9.75s	remaining: 3.68s
488:	learn: 2.0521517	total: 9.78s	remaining: 3.66s
489:	learn: 2.0518573	total: 9.79s	remaining: 3.64s
490:	learn: 2.0508909	total: 9.81s	remaining: 3.62s
491:	learn: 2.0503818	total: 9.83s	remaining: 3.6s
492:	learn: 2.0493359	total: 9.85s	remaining: 3.58s
493:	learn: 2.0489016	total: 9.87s	remaining: 3.56s
494:	learn: 2.0486053	total: 9.89s	remaining: 3.54s
495:	learn: 2.0477388	total: 9.91s	remaining: 3.52s
496:	learn: 2.0473304	total: 9.93s	remaining: 3.5s
497:	learn: 2.0470903	total: 9.95s	remaining: 3.48s
498:	learn: 2.0470

638:	learn: 2.0111631	total: 12.7s	remaining: 658ms
639:	learn: 2.0108491	total: 12.8s	remaining: 638ms
640:	learn: 2.0096759	total: 12.8s	remaining: 618ms
641:	learn: 2.0096105	total: 12.8s	remaining: 598ms
642:	learn: 2.0088863	total: 12.8s	remaining: 578ms
643:	learn: 2.0087498	total: 12.8s	remaining: 558ms
644:	learn: 2.0086474	total: 12.9s	remaining: 538ms
645:	learn: 2.0086149	total: 12.9s	remaining: 518ms
646:	learn: 2.0085206	total: 12.9s	remaining: 498ms
647:	learn: 2.0084486	total: 12.9s	remaining: 478ms
648:	learn: 2.0073891	total: 12.9s	remaining: 458ms
649:	learn: 2.0059432	total: 12.9s	remaining: 438ms
650:	learn: 2.0056585	total: 13s	remaining: 418ms
651:	learn: 2.0046692	total: 13s	remaining: 398ms
652:	learn: 2.0045221	total: 13s	remaining: 378ms
653:	learn: 2.0044214	total: 13s	remaining: 359ms
654:	learn: 2.0040768	total: 13s	remaining: 339ms
655:	learn: 2.0038290	total: 13.1s	remaining: 319ms
656:	learn: 2.0038203	total: 13.1s	remaining: 299ms
657:	learn: 2.0030960	

127:	learn: 2.2509806	total: 2.62s	remaining: 11.1s
128:	learn: 2.2490798	total: 2.63s	remaining: 11.1s
129:	learn: 2.2473102	total: 2.65s	remaining: 11.1s
130:	learn: 2.2459422	total: 2.67s	remaining: 11s
131:	learn: 2.2452046	total: 2.69s	remaining: 11s
132:	learn: 2.2439506	total: 2.71s	remaining: 11s
133:	learn: 2.2415456	total: 2.73s	remaining: 11s
134:	learn: 2.2408111	total: 2.75s	remaining: 10.9s
135:	learn: 2.2394831	total: 2.77s	remaining: 10.9s
136:	learn: 2.2379106	total: 2.79s	remaining: 10.9s
137:	learn: 2.2373103	total: 2.81s	remaining: 10.9s
138:	learn: 2.2356511	total: 2.83s	remaining: 10.9s
139:	learn: 2.2343287	total: 2.85s	remaining: 10.8s
140:	learn: 2.2337604	total: 2.87s	remaining: 10.8s
141:	learn: 2.2332566	total: 2.89s	remaining: 10.8s
142:	learn: 2.2313587	total: 2.91s	remaining: 10.8s
143:	learn: 2.2299747	total: 2.94s	remaining: 10.8s
144:	learn: 2.2280099	total: 2.96s	remaining: 10.7s
145:	learn: 2.2273034	total: 2.98s	remaining: 10.7s
146:	learn: 2.227001

286:	learn: 2.1173623	total: 5.82s	remaining: 7.81s
287:	learn: 2.1163748	total: 5.84s	remaining: 7.79s
288:	learn: 2.1158983	total: 5.86s	remaining: 7.77s
289:	learn: 2.1154288	total: 5.88s	remaining: 7.75s
290:	learn: 2.1151790	total: 5.9s	remaining: 7.73s
291:	learn: 2.1151072	total: 5.92s	remaining: 7.71s
292:	learn: 2.1147483	total: 5.94s	remaining: 7.69s
293:	learn: 2.1132718	total: 5.96s	remaining: 7.67s
294:	learn: 2.1131567	total: 5.98s	remaining: 7.64s
295:	learn: 2.1129469	total: 6s	remaining: 7.63s
296:	learn: 2.1127965	total: 6.02s	remaining: 7.61s
297:	learn: 2.1122230	total: 6.04s	remaining: 7.58s
298:	learn: 2.1120731	total: 6.06s	remaining: 7.57s
299:	learn: 2.1112695	total: 6.08s	remaining: 7.54s
300:	learn: 2.1109646	total: 6.11s	remaining: 7.53s
301:	learn: 2.1103889	total: 6.13s	remaining: 7.5s
302:	learn: 2.1096313	total: 6.15s	remaining: 7.48s
303:	learn: 2.1092216	total: 6.17s	remaining: 7.46s
304:	learn: 2.1086434	total: 6.19s	remaining: 7.44s
305:	learn: 2.108

445:	learn: 2.0404492	total: 9.02s	remaining: 4.57s
446:	learn: 2.0392864	total: 9.04s	remaining: 4.55s
447:	learn: 2.0389128	total: 9.06s	remaining: 4.53s
448:	learn: 2.0385756	total: 9.08s	remaining: 4.51s
449:	learn: 2.0374323	total: 9.1s	remaining: 4.49s
450:	learn: 2.0372297	total: 9.12s	remaining: 4.47s
451:	learn: 2.0370406	total: 9.14s	remaining: 4.45s
452:	learn: 2.0368386	total: 9.16s	remaining: 4.43s
453:	learn: 2.0368059	total: 9.18s	remaining: 4.41s
454:	learn: 2.0356004	total: 9.2s	remaining: 4.39s
455:	learn: 2.0352876	total: 9.22s	remaining: 4.37s
456:	learn: 2.0351848	total: 9.24s	remaining: 4.35s
457:	learn: 2.0349783	total: 9.26s	remaining: 4.33s
458:	learn: 2.0345326	total: 9.28s	remaining: 4.31s
459:	learn: 2.0342969	total: 9.3s	remaining: 4.29s
460:	learn: 2.0340295	total: 9.32s	remaining: 4.27s
461:	learn: 2.0336693	total: 9.34s	remaining: 4.25s
462:	learn: 2.0334762	total: 9.36s	remaining: 4.23s
463:	learn: 2.0333296	total: 9.38s	remaining: 4.21s
464:	learn: 2.0

604:	learn: 1.9910793	total: 12.3s	remaining: 1.36s
605:	learn: 1.9910536	total: 12.3s	remaining: 1.34s
606:	learn: 1.9906446	total: 12.4s	remaining: 1.32s
607:	learn: 1.9905584	total: 12.4s	remaining: 1.3s
608:	learn: 1.9904926	total: 12.4s	remaining: 1.28s
609:	learn: 1.9902750	total: 12.4s	remaining: 1.26s
610:	learn: 1.9894948	total: 12.4s	remaining: 1.24s
611:	learn: 1.9889017	total: 12.5s	remaining: 1.22s
612:	learn: 1.9887892	total: 12.5s	remaining: 1.2s
613:	learn: 1.9878460	total: 12.5s	remaining: 1.18s
614:	learn: 1.9874678	total: 12.5s	remaining: 1.16s
615:	learn: 1.9873438	total: 12.5s	remaining: 1.14s
616:	learn: 1.9871558	total: 12.6s	remaining: 1.12s
617:	learn: 1.9860905	total: 12.6s	remaining: 1.1s
618:	learn: 1.9860249	total: 12.6s	remaining: 1.08s
619:	learn: 1.9858327	total: 12.6s	remaining: 1.06s
620:	learn: 1.9858041	total: 12.7s	remaining: 1.04s
621:	learn: 1.9850354	total: 12.7s	remaining: 1.02s
622:	learn: 1.9845027	total: 12.7s	remaining: 998ms
623:	learn: 1.9

93:	learn: 2.2914665	total: 1.94s	remaining: 12s
94:	learn: 2.2892304	total: 1.97s	remaining: 11.9s
95:	learn: 2.2885862	total: 1.99s	remaining: 11.9s
96:	learn: 2.2882185	total: 2s	remaining: 11.9s
97:	learn: 2.2864034	total: 2.02s	remaining: 11.9s
98:	learn: 2.2837878	total: 2.04s	remaining: 11.8s
99:	learn: 2.2821648	total: 2.06s	remaining: 11.8s
100:	learn: 2.2794630	total: 2.09s	remaining: 11.8s
101:	learn: 2.2770826	total: 2.11s	remaining: 11.8s
102:	learn: 2.2752288	total: 2.13s	remaining: 11.8s
103:	learn: 2.2733845	total: 2.15s	remaining: 11.7s
104:	learn: 2.2714288	total: 2.17s	remaining: 11.7s
105:	learn: 2.2698764	total: 2.19s	remaining: 11.7s
106:	learn: 2.2689145	total: 2.21s	remaining: 11.7s
107:	learn: 2.2671895	total: 2.23s	remaining: 11.6s
108:	learn: 2.2664717	total: 2.25s	remaining: 11.6s
109:	learn: 2.2647526	total: 2.27s	remaining: 11.6s
110:	learn: 2.2645746	total: 2.29s	remaining: 11.6s
111:	learn: 2.2640344	total: 2.31s	remaining: 11.5s
112:	learn: 2.2625935	to

252:	learn: 2.1368723	total: 5.15s	remaining: 8.52s
253:	learn: 2.1360993	total: 5.17s	remaining: 8.5s
254:	learn: 2.1356066	total: 5.19s	remaining: 8.48s
255:	learn: 2.1354893	total: 5.21s	remaining: 8.46s
256:	learn: 2.1349065	total: 5.23s	remaining: 8.44s
257:	learn: 2.1347783	total: 5.25s	remaining: 8.42s
258:	learn: 2.1346289	total: 5.27s	remaining: 8.4s
259:	learn: 2.1331757	total: 5.29s	remaining: 8.38s
260:	learn: 2.1324110	total: 5.31s	remaining: 8.36s
261:	learn: 2.1317368	total: 5.33s	remaining: 8.34s
262:	learn: 2.1315895	total: 5.35s	remaining: 8.31s
263:	learn: 2.1299696	total: 5.37s	remaining: 8.29s
264:	learn: 2.1294963	total: 5.39s	remaining: 8.28s
265:	learn: 2.1285664	total: 5.41s	remaining: 8.26s
266:	learn: 2.1281998	total: 5.43s	remaining: 8.23s
267:	learn: 2.1273708	total: 5.45s	remaining: 8.21s
268:	learn: 2.1271729	total: 5.47s	remaining: 8.19s
269:	learn: 2.1266049	total: 5.49s	remaining: 8.17s
270:	learn: 2.1260986	total: 5.51s	remaining: 8.15s
271:	learn: 2.

411:	learn: 2.0785432	total: 8.34s	remaining: 5.26s
412:	learn: 2.0785154	total: 8.36s	remaining: 5.24s
413:	learn: 2.0783106	total: 8.38s	remaining: 5.22s
414:	learn: 2.0779493	total: 8.39s	remaining: 5.2s
415:	learn: 2.0775116	total: 8.41s	remaining: 5.18s
416:	learn: 2.0770478	total: 8.43s	remaining: 5.16s
417:	learn: 2.0770376	total: 8.45s	remaining: 5.14s
418:	learn: 2.0768992	total: 8.47s	remaining: 5.12s
419:	learn: 2.0762511	total: 8.49s	remaining: 5.09s
420:	learn: 2.0761999	total: 8.51s	remaining: 5.07s
421:	learn: 2.0757186	total: 8.53s	remaining: 5.05s
422:	learn: 2.0756835	total: 8.55s	remaining: 5.03s
423:	learn: 2.0751615	total: 8.57s	remaining: 5.01s
424:	learn: 2.0746272	total: 8.59s	remaining: 4.99s
425:	learn: 2.0745854	total: 8.61s	remaining: 4.97s
426:	learn: 2.0744618	total: 8.63s	remaining: 4.95s
427:	learn: 2.0741476	total: 8.65s	remaining: 4.93s
428:	learn: 2.0739455	total: 8.67s	remaining: 4.91s
429:	learn: 2.0733837	total: 8.69s	remaining: 4.89s
430:	learn: 2

In [None]:
for trial_i in mytrial:
    db.insert(trial_i)
df_trial = db.select()

In [66]:
df_trial['kfold-type'] = df_trial['param'].apply(lambda x: x['kfold']['type'])
df_trial[(df_trial['remark']=='tune pure group')&(df_trial['mae_diff']<.05)].sort_values(by=['val_mae'])[['datetime','remark', 'kfold-type', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].head()

Unnamed: 0,datetime,remark,kfold-type,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
26,2019-05-21 09:54:21.968111,tune pure group,stratified,30,1.806266,3.8e-05,1.854197,0.000208,0.047932
45,2019-05-21 10:03:39.606307,tune pure group,stratified,30,1.821322,9e-06,1.863465,8.3e-05,0.042143
84,2019-05-21 10:15:55.607335,tune pure group,stratified,30,1.821273,2.3e-05,1.866967,0.00014,0.045693
37,2019-05-21 10:00:19.014242,tune pure group,stratified,30,1.820959,2.5e-05,1.867446,0.000102,0.046487
85,2019-05-21 10:16:05.271145,tune pure group,stratified,30,1.824184,9e-06,1.868355,9.5e-05,0.044172


In [69]:
idx=26
df_test_pred = df_trial.loc[idx]['df_test_pred']
df_submit = pd.DataFrame()
df_submit['time_to_failure'] = np.mean(df_test_pred.drop(columns=['index']).values, axis=1)
df_submit['seg_id'] = df_test_pred['index']
df_submit.to_csv('submission_mystacknet2_{}.csv'.format(idx), index=False)