In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler

import lightgbm as lgb
import catboost as cb
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

from tsfresh.examples import load_robot_execution_failures
from tsfresh import extract_features, select_features
import optuna

from common import EP
from dfdb import DFDB
from models import *

import types
import copy

Using TensorFlow backend.


In [2]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [3]:
pd.set_option('display.max_columns', 2000)
pd.set_option('display.max_rows', 2000)
# pd.set_option('display.width', 2000)
# pd.set_option('display.expand_frame_repr', True)
pd.set_option('display.max_colwidth', -1)

In [4]:
df_train = pd.read_pickle('../feats/df_train.pkl')
df_test = pd.read_pickle('../feats/df_test.pkl')

In [5]:
df_train['label'] = df_train['y'].apply(lambda x:  int(x) if x<15 else 15)
group = df_train['season'].values
group[np.where(group==17)[0]] = 1
df_train['group'] = group
df_train = df_train.drop(columns=['season'])

In [6]:
test_X = df_train.drop(columns=['y','index','group','label']).copy()
test_X.index = df_train['index']
test_y = df_train['y'].copy()
test_y.index = df_train['index']
tsfresh_columns = select_features(test_X, test_y).columns.tolist()

In [7]:
original_columns = df_train.columns.drop(['index','y','label','group']).tolist()

In [8]:
# mytrial = []
db = DFDB('../trial/mystack.pkl', auto_commit=False)
df_trial = db.select()

In [9]:
df_trial.shape

(841, 15)

In [10]:
def revert_rfe(df_train, param, sorted_columns, df_test, trial, start_columns, remark=None):
    
    # init cv_score and try only base feature
    selected_columns = copy.deepcopy(start_columns)
    args = copy.deepcopy(param)
    args['columns'] = selected_columns
    df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train, args, df_test = df_test, trial=trial, remark=remark)
    val_mae_mean = np.mean(df_his.valid)
    cv_score = val_mae_mean
    
    # add feature one by one and check cv score change
    for idx,col in enumerate(sorted_columns):
#         if idx in start_column_index:
#             continue
        args = copy.deepcopy(param)
        args['columns'] = list(set(selected_columns + [col]))
        df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train, args, df_test = df_test, trial=trial, remark=remark)
        val_mae_mean = np.mean(df_his.valid)
        if val_mae_mean < cv_score:
            selected_columns.append(col)
            cv_score = val_mae_mean
            
    return selected_columns


In [11]:
db_catboost = DFDB('../trial/catboost.pkl', auto_commit=False)
df_trial_catboost = db_catboost.select()
df_trial_catboost['kfold'] = df_trial_catboost['param'].apply(lambda x: x['kfold'])
df_trial_catboost.loc[[452]][['datetime','kfold', 'remark', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,kfold,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
452,2019-05-16 06:46:11.662876,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'stratified'}",tune 437,30,1.853478,5e-06,1.900535,7.2e-05,0.047056


In [101]:
param_idx = 452
column_idx = 452
db_ = db_catboost
df_trial_ = df_trial_catboost
mytrial = []
columns = copy.deepcopy(df_trial_.loc[column_idx]['param']['columns'])
param = copy.deepcopy(df_trial_.loc[param_idx]['param'])
param['algorithm']['init'] = {'logging_level': 'Silent', **param['algorithm']['init']}
param['scaler']['init'] = {}
param['algorithm']['fit'] = {}

selected_columns = revert_rfe(df_train, param, columns, df_test, mytrial, remark='start from top1 column')
print(len(selected_columns))
for trial_i in mytrial:
    db_.insert(trial_i)
df_trial_ = db_.select()
df_trial_['kfold'] = df_trial_['param'].apply(lambda x: x['kfold'])
df_trial_[df_trial_['remark']=='start from top1 column'].sort_values(by=['val_mae'])[['datetime','remark','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].head(1)

20


Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
666,2019-05-22 07:39:20.868903,start from top1 column,20,1.852061,8e-06,1.89749,7e-05,0.045429


In [102]:
db_.commit()

In [12]:
db_xgbm = DFDB('../trial/xgbm.pkl', auto_commit=False)
df_trial_xgbm = db_xgbm.select()
df_trial_xgbm['kfold'] = df_trial_xgbm['param'].apply(lambda x: x['kfold'])
df_trial_xgbm.loc[[1172]][['datetime','kfold', 'remark', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,kfold,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
1172,2019-05-16 09:18:54.750120,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'stratified'}",tune 1099,25,1.924779,1.6e-05,1.96556,6.6e-05,0.040782


In [105]:
param_idx = 1172
column_idx = 1172
db_ = db_xgbm
df_trial_ = df_trial_xgbm
mytrial = []
columns = copy.deepcopy(df_trial_.loc[column_idx]['param']['columns'])
param = copy.deepcopy(df_trial_.loc[param_idx]['param'])
# param['algorithm']['init'] = {'logging_level': 'Silent', **param['algorithm']['init']}
param['scaler']['init'] = {}
param['algorithm']['fit'] = {}

selected_columns = revert_rfe(df_train, param, columns, df_test, mytrial, remark='start from top1 column')
print(len(selected_columns))
for trial_i in mytrial:
    db_.insert(trial_i)
df_trial_ = db_.select()
df_trial_['kfold'] = df_trial_['param'].apply(lambda x: x['kfold'])
df_trial_[df_trial_['remark']=='start from top1 column'].sort_values(by=['val_mae'])[['datetime','remark','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].head(1)

14


Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
1316,2019-05-22 07:46:04.993884,start from top1 column,14,1.943401,6e-06,1.977736,8.3e-05,0.034334


In [109]:
db_.commit()

In [13]:
db_lgbm = DFDB('../trial/lgbm.pkl', auto_commit=False)
df_trial_lgbm = db_lgbm.select()
df_trial_lgbm['kfold'] = df_trial_lgbm['param'].apply(lambda x: x['kfold'])
df_trial_lgbm.loc[[2156]][['datetime','kfold', 'remark', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,kfold,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
2156,2019-05-16 22:30:28.865774,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'stratified'}",tune 2025,25,1.911171,2e-05,1.952897,0.000129,0.041726


In [111]:
param_idx = 2156
column_idx = 2156
db_ = db_lgbm
df_trial_ = df_trial_lgbm

mytrial = []
columns = copy.deepcopy(df_trial_.loc[column_idx]['param']['columns'])
param = copy.deepcopy(df_trial_.loc[param_idx]['param'])
# param['algorithm']['init'] = {'logging_level': 'Silent', **param['algorithm']['init']}
param['scaler']['init'] = {}
param['algorithm']['fit'] = {}

selected_columns = revert_rfe(df_train, param, columns, df_test, mytrial, remark='start from top1 column')
print(len(selected_columns))
for trial_i in mytrial:
    db_.insert(trial_i)
df_trial_ = db_.select()
df_trial_['kfold'] = df_trial_['param'].apply(lambda x: x['kfold'])
df_trial_[df_trial_['remark']=='start from top1 column'].sort_values(by=['val_mae'])[['datetime','remark','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].head(1)

19


Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
2249,2019-05-22 07:52:55.286514,start from top1 column,19,1.915651,2.9e-05,1.949103,6.3e-05,0.033452


In [112]:
db_.commit()

In [14]:
db_randomforest = DFDB('../trial/randomforest.pkl', auto_commit=False)
df_trial_randomforest = db_randomforest.select()
df_trial_randomforest['kfold'] = df_trial_lgbm['param'].apply(lambda x: x['kfold'])
df_trial_randomforest.loc[[297]][['datetime','kfold', 'remark', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,kfold,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
297,2019-05-16 12:50:31.611238,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'stratified'}",tune 239,80,2.080347,3e-06,2.086208,6.3e-05,0.00586


In [115]:
param_idx = 297
column_idx = 297
db_ = db_randomforest
df_trial_ = df_trial_randomforest

mytrial = []
columns = copy.deepcopy(df_trial_.loc[column_idx]['param']['columns'])
param = copy.deepcopy(df_trial_.loc[param_idx]['param'])
# param['algorithm']['init'] = {'logging_level': 'Silent', **param['algorithm']['init']}
param['scaler']['init'] = {}
param['algorithm']['fit'] = {}

selected_columns = revert_rfe(df_train, param, columns, df_test, mytrial, remark='start from top1 column')
print(len(selected_columns))
for trial_i in mytrial:
    db_.insert(trial_i)
df_trial_ = db_.select()
df_trial_['kfold'] = df_trial_['param'].apply(lambda x: x['kfold'])
df_trial_[df_trial_['remark']=='start from top1 column'].sort_values(by=['val_mae'])[['datetime','remark','kfold', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].head(1)

Unnamed: 0,datetime,remark,kfold,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
457,2019-05-22 08:52:35.818158,start from top1 column,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'stratified'}",9,2.079816,1e-06,2.084794,3.7e-05,0.004977


In [116]:
db_.commit()

In [15]:
db_extratrees = DFDB('../trial/extratrees.pkl', auto_commit=False)
df_trial_extratrees = db_extratrees.select()
df_trial_extratrees['kfold'] = df_trial_extratrees['param'].apply(lambda x: x['kfold'])
df_trial_extratrees.loc[459:459][['datetime','remark','kfold', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,remark,kfold,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
459,2019-05-16 11:30:08.432873,tune 427,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'stratified'}",70,2.109147,1.1e-05,2.110347,2.5e-05,0.0012


In [126]:
# df_trial_extratrees[df_trial_extratrees['remark']=='start from top1 column'][['datetime','remark','kfold', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

In [118]:
param_idx = 459
column_idx = 459
db_ = db_extratrees
df_trial_ = df_trial_extratrees

mytrial = []
columns = copy.deepcopy(df_trial_.loc[column_idx]['param']['columns'])
param = copy.deepcopy(df_trial_.loc[param_idx]['param'])
# param['algorithm']['init'] = {'logging_level': 'Silent', **param['algorithm']['init']}
param['scaler']['init'] = {}
param['algorithm']['fit'] = {}

selected_columns = revert_rfe(df_train, param, columns, df_test, mytrial, remark='start from top1 column')
print(len(selected_columns))
for trial_i in mytrial:
    db_.insert(trial_i)
df_trial_ = db_.select()
df_trial_['kfold'] = df_trial_['param'].apply(lambda x: x['kfold'])
df_trial_[(df_trial_['remark']=='start from top1 column')&(df_trial_['mae_diff']<.06)].sort_values(by=['val_mae'])[['datetime','remark','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].head(5)

7


Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
634,2019-05-22 09:44:28.519659,start from top1 column,7,2.280364,0.000129,2.280876,9.8e-05,0.000512
637,2019-05-22 09:44:44.271200,start from top1 column,8,2.283377,0.000189,2.284008,0.000167,0.000631
638,2019-05-22 09:44:49.648880,start from top1 column,8,2.306283,0.000264,2.306855,0.000224,0.000572
654,2019-05-22 09:46:12.088567,start from top1 column,8,2.308755,0.000241,2.309285,0.000187,0.00053
691,2019-05-22 09:49:24.982221,start from top1 column,8,2.310001,0.000174,2.310549,0.000155,0.000547


In [119]:
db_.commit()

In [16]:
db_gradientboosting = DFDB('../trial/gradientboosting.pkl', auto_commit=False)
df_trial_gradientboosting = db_gradientboosting.select()
df_trial_gradientboosting['kfold'] = df_trial_gradientboosting['param'].apply(lambda x: x['kfold'])
df_trial_gradientboosting.loc[306:306][['datetime','remark','kfold', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,remark,kfold,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
306,2019-05-16 14:16:23.815368,tune 220,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'stratified'}",70,1.914761,1.9e-05,1.951837,0.000144,0.037076


In [123]:
param_idx = 306
column_idx = 306
db_ = db_gradientboosting
df_trial_ = df_trial_gradientboosting

mytrial = []
columns = copy.deepcopy(df_trial_.loc[column_idx]['param']['columns'])
param = copy.deepcopy(df_trial_.loc[param_idx]['param'])
# param['algorithm']['init'] = {'logging_level': 'Silent', **param['algorithm']['init']}
param['scaler']['init'] = {}
param['algorithm']['fit'] = {}

selected_columns = revert_rfe(df_train, param, columns, df_test, mytrial, remark='start from top1 column')
print(len(selected_columns))
for trial_i in mytrial:
    db_.insert(trial_i)
df_trial_ = db_.select()
df_trial_['kfold'] = df_trial_['param'].apply(lambda x: x['kfold'])
df_trial_[(df_trial_['remark']=='start from top1 column')&(df_trial_['mae_diff']<.06)].sort_values(by=['val_mae'])[['datetime','remark','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].head(5)

34


Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
390,2019-05-22 10:54:15.304056,start from top1 column,34,1.914421,2.1e-05,1.947916,0.000136,0.033495
394,2019-05-22 10:55:34.700015,start from top1 column,35,1.914548,2e-05,1.948167,0.000144,0.033619
393,2019-05-22 10:55:14.918492,start from top1 column,35,1.914617,1.9e-05,1.948239,0.000141,0.033622
391,2019-05-22 10:54:35.173820,start from top1 column,35,1.914408,1.3e-05,1.948407,0.00015,0.033999
386,2019-05-22 10:52:58.560697,start from top1 column,33,1.915272,2.2e-05,1.948579,0.000144,0.033307


In [139]:
db_.commit()

In [17]:
catboost_columns = df_trial_catboost.loc[666]['param']['columns']
xgbm_columns = df_trial_xgbm.loc[1172]['param']['columns']
lgbm_columns = df_trial_lgbm.loc[2249]['param']['columns']
rf_columns = df_trial_randomforest.loc[457]['param']['columns']
extratrees_columns = df_trial_extratrees.loc[459]['param']['columns']
gradientboosting_columns = df_trial_gradientboosting.loc[390]['param']['columns']

In [18]:
all_columns = catboost_columns+xgbm_columns+lgbm_columns+rf_columns+extratrees_columns+gradientboosting_columns
unique_columns = list(set(all_columns))
common_columns = []
common_columns25 = []
common_columns50 = []
common_columns75 = []
common_columns95 = []
N_columns = 6
count_values = [all_columns.count(col) for col in unique_columns]
for col in unique_columns:
    if all_columns.count(col)==N_columns:
        common_columns.append(col)
    if all_columns.count(col)>=np.quantile(count_values, .75):
        common_columns75.append(col)
    if all_columns.count(col)>=np.quantile(count_values, .95):
        common_columns95.append(col)
len(unique_columns), len(common_columns),len(common_columns75),len(common_columns95)

(107, 3, 28, 9)

In [19]:
db_knn = DFDB('../trial/knn.pkl', auto_commit=False)
df_trial_knn = db_knn.select()
df_trial_knn['kfold'] = df_trial_knn['param'].apply(lambda x: x['kfold'])
df_trial_knn.loc[[17]][['datetime','remark','kfold', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,remark,kfold,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
17,2019-05-16 23:11:48.201772,,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",7,2.026411,0.002031,2.081674,0.013139,0.055264


In [147]:
param_idx = 17
column_idx = 17
db_ = db_knn
df_trial_ = df_trial_knn

mytrial = []
columns = copy.deepcopy(common_columns75)
param = copy.deepcopy(df_trial_.loc[param_idx]['param'])
# param['algorithm']['init'] = {'logging_level': 'Silent', **param['algorithm']['init']}
param['scaler']['init'] = {}
param['algorithm']['fit'] = {}

selected_columns = revert_rfe(df_train, param, common_columns75, df_test, mytrial, start_columns=common_columns, remark='start from top1 column')
print(len(selected_columns))
for trial_i in mytrial:
    db_.insert(trial_i)
df_trial_ = db_.select()
df_trial_['kfold'] = df_trial_['param'].apply(lambda x: x['kfold'])
df_trial_[(df_trial_['remark']=='start from top1 column')&(df_trial_['mae_diff']<.06)].sort_values(by=['val_mae'])[['datetime','remark','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].head(5)

11


Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
44,2019-05-22 12:01:13.064525,start from top1 column,11,2.030474,0.002579,2.067129,0.012971,0.036655
45,2019-05-22 12:02:08.901869,start from top1 column,12,2.029889,0.00261,2.067775,0.012532,0.037886
42,2019-05-22 11:59:36.369687,start from top1 column,10,2.014322,0.002294,2.069829,0.012439,0.055507
40,2019-05-22 11:58:22.750305,start from top1 column,9,2.014051,0.002266,2.070267,0.012733,0.056216
39,2019-05-22 11:57:47.577279,start from top1 column,9,2.014051,0.002266,2.070267,0.012733,0.056216


In [148]:
db_.commit()

In [20]:
db_svr = DFDB('../trial/svr.pkl', auto_commit=False)
df_trial_svr = db_svr.select()
df_trial_svr['kfold'] = df_trial_knn['param'].apply(lambda x: x['kfold'])
df_trial_svr.loc[[7]][['datetime','remark','kfold', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,remark,kfold,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
7,2019-05-16 23:57:06.120463,,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'timeseries'}",7,2.050366,0.003073,2.092532,0.01549,0.042166


In [151]:
param_idx = 7
column_idx = 7
db_ = db_svr
df_trial_ = df_trial_svr

mytrial = []
columns = copy.deepcopy(common_columns75)
param = copy.deepcopy(df_trial_.loc[param_idx]['param'])
# param['algorithm']['init'] = {'logging_level': 'Silent', **param['algorithm']['init']}
param['scaler']['init'] = {}
param['algorithm']['fit'] = {}

selected_columns = revert_rfe(df_train, param, common_columns75, df_test, mytrial, start_columns=common_columns, remark='start from top1 column')
print(len(selected_columns))
for trial_i in mytrial:
    db_.insert(trial_i)
df_trial_ = db_.select()
df_trial_['kfold'] = df_trial_['param'].apply(lambda x: x['kfold'])
df_trial_[(df_trial_['remark']=='start from top1 column')&(df_trial_['mae_diff']<.06)].sort_values(by=['val_mae'])[['datetime','remark','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].head(5)

15


Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
33,2019-05-22 13:27:47.976992,start from top1 column,15,2.049126,0.002872,2.084708,0.014058,0.035582
31,2019-05-22 13:21:22.869097,start from top1 column,14,2.046522,0.002914,2.084855,0.014546,0.038333
34,2019-05-22 13:31:03.931189,start from top1 column,16,2.049807,0.002894,2.08531,0.014089,0.035503
32,2019-05-22 13:24:34.951492,start from top1 column,15,2.048032,0.003116,2.085463,0.015453,0.037432
36,2019-05-22 13:37:35.865988,start from top1 column,16,2.051944,0.00274,2.085779,0.01403,0.033835


In [152]:
db_.commit()

In [21]:
db_fm = DFDB('../trial/fm.pkl', auto_commit=False)
df_trial_fm = db_fm.select()
df_trial_fm['kfold'] = df_trial_knn['param'].apply(lambda x: x['kfold'])
df_trial_fm.loc[[313]][['datetime','remark','kfold', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,remark,kfold,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
313,2019-05-17 04:14:15.187700,,,7,2.099356,0.00307,2.10857,0.01448,0.009214


In [154]:
param_idx = 313
column_idx = 313
db_ = df_fm
df_trial_ = df_trial_fm

mytrial = []
columns = copy.deepcopy(common_columns75)
param = copy.deepcopy(df_trial_.loc[param_idx]['param'])
# param['algorithm']['init'] = {'logging_level': 'Silent', **param['algorithm']['init']}
param['scaler']['init'] = {}
param['algorithm']['fit'] = {}

selected_columns = revert_rfe(df_train, param, common_columns75, df_test, mytrial, start_columns=common_columns, remark='start from top1 column')
print(len(selected_columns))
for trial_i in mytrial:
    db_.insert(trial_i)
df_trial_ = db_.select()
df_trial_['kfold'] = df_trial_['param'].apply(lambda x: x['kfold'])
df_trial_[(df_trial_['remark']=='start from top1 column')&(df_trial_['mae_diff']<.06)].sort_values(by=['val_mae'])[['datetime','remark','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].head(5)

15


Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
339,2019-05-22 13:46:01.840745,start from top1 column,15,2.07013,0.003474,2.083177,0.013582,0.013047
340,2019-05-22 13:46:02.411317,start from top1 column,16,2.070588,0.003454,2.084862,0.013989,0.014274
341,2019-05-22 13:46:02.980493,start from top1 column,16,2.070451,0.003462,2.085011,0.012999,0.01456
338,2019-05-22 13:46:01.313134,start from top1 column,14,2.073454,0.003636,2.085674,0.014152,0.012219
337,2019-05-22 13:46:00.822800,start from top1 column,13,2.074394,0.003505,2.085807,0.01423,0.011413


In [155]:
db_.commit()

In [22]:
db_lasso = DFDB('../trial/lasso.pkl', auto_commit=False)
df_trial_lasso = db_lasso.select()
df_trial_lasso['kfold'] = df_trial_lasso['param'].apply(lambda x: x['kfold'])
df_trial_lasso.loc[[8]][['datetime','remark','kfold', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,remark,kfold,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
8,2019-05-16 11:54:52.999400,,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",165,2.106014,0.002088,2.156098,0.008084,0.050085


In [160]:
param_idx = 8
column_idx = 8
db_ = db_lasso
df_trial_ = df_trial_lasso

mytrial = []
columns = copy.deepcopy(unique_columns)
param = copy.deepcopy(df_trial_.loc[param_idx]['param'])
# param['algorithm']['init'] = {'logging_level': 'Silent', **param['algorithm']['init']}
param['scaler']['init'] = {}
param['algorithm']['fit'] = {}

selected_columns = revert_rfe(df_train, param, unique_columns, df_test, mytrial, start_columns=common_columns, remark='start from top1 column')
print(len(selected_columns))
for trial_i in mytrial:
    db_.insert(trial_i)
df_trial_ = db_.select()
df_trial_['kfold'] = df_trial_['param'].apply(lambda x: x['kfold'])
df_trial_[(df_trial_['remark']=='start from top1 column')&(df_trial_['mae_diff']<.06)].sort_values(by=['val_mae'])[['datetime','remark','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].head(5)

41


Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
117,2019-05-22 13:54:00.119986,start from top1 column,42,2.123888,0.003288,2.13228,0.013294,0.008391
116,2019-05-22 13:53:59.563911,start from top1 column,41,2.123888,0.003288,2.13228,0.013294,0.008391
114,2019-05-22 13:53:58.538085,start from top1 column,40,2.124411,0.003284,2.13254,0.013255,0.008129
115,2019-05-22 13:53:59.022892,start from top1 column,41,2.124411,0.003284,2.13254,0.013255,0.008129
113,2019-05-22 13:53:58.061766,start from top1 column,39,2.124411,0.003284,2.13254,0.013255,0.008129


In [161]:
db_.commit()

In [23]:
db_ridge = DFDB('../trial/ridge.pkl', auto_commit=False)
df_trial_ridge = db_ridge.select()
df_trial_ridge['kfold'] = df_trial_ridge['param'].apply(lambda x: x['kfold'])
df_trial_ridge.loc[[15]][['datetime','remark','kfold', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,remark,kfold,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
15,2019-05-16 11:27:58.322760,,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",165,2.14516,0.002662,2.180493,0.012586,0.035332


In [164]:
param_idx = 15
column_idx = 15
db_ = db_ridge
df_trial_ = df_trial_ridge

mytrial = []
columns = copy.deepcopy(unique_columns)
param = copy.deepcopy(df_trial_.loc[param_idx]['param'])
# param['algorithm']['init'] = {'logging_level': 'Silent', **param['algorithm']['init']}
param['scaler']['init'] = {}
param['algorithm']['fit'] = {}

selected_columns = revert_rfe(df_train, param, unique_columns, df_test, mytrial, start_columns=common_columns, remark='start from top1 column')
print(len(selected_columns))
for trial_i in mytrial:
    db_.insert(trial_i)
df_trial_ = db_.select()
df_trial_['kfold'] = df_trial_['param'].apply(lambda x: x['kfold'])
df_trial_[(df_trial_['remark']=='start from top1 column')&(df_trial_['mae_diff']<.06)].sort_values(by=['val_mae'])[['datetime','remark','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].head(5)

77


Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
126,2019-05-22 13:55:01.832389,start from top1 column,77,2.176952,0.003906,2.185538,0.014603,0.008586
127,2019-05-22 13:55:02.131244,start from top1 column,78,2.176916,0.00391,2.185648,0.014618,0.008732
125,2019-05-22 13:55:01.528713,start from top1 column,76,2.180261,0.003919,2.18879,0.014647,0.00853
124,2019-05-22 13:55:01.246867,start from top1 column,75,2.180281,0.003921,2.188805,0.014647,0.008524
123,2019-05-22 13:55:00.964545,start from top1 column,74,2.180834,0.003915,2.189315,0.014585,0.008481


In [165]:
db_.commit()

In [24]:
db_mlp = DFDB('../trial/kerasmlp.pkl', auto_commit=False)
df_trial_mlp = db_mlp.select()
df_trial_mlp['kfold'] = df_trial_mlp['param'].apply(lambda x: x['kfold'])
df_trial_mlp.loc[[13,17]][['datetime','remark','kfold', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,remark,kfold,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
13,2019-05-17 18:50:09.056814,,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'stratified'}",30,1.87789,0.000123,1.91037,8e-06,0.032479
17,2019-05-18 07:00:13.786881,,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",7,2.022019,0.003167,2.107065,0.016922,0.085046


In [180]:
db_ = db_mlp
df_trial_ = df_trial_mlp

mytrial = []

path_param={
    'input_dim':len(common_columns75),
    'hidden_layer_sizes':[64, 16],
    'activation':'relu',
    'l1l2regularizer':None,
    'dropout':.3,
}
base_save_dir = create_path('KerasMLPRegressor', path_param)

param={
    'algorithm': {
        'cls': 'KerasMLPRegressor',
        'fit': {
            'versbose':0, 
            'epochs':50, 
            'early_stopping_rounds':20,
            'eval_set':{}
        },
        'init': {
            'batch':128, 
            'solver':'adam', 
            'metric':'mean_absolute_error', 
            'lr':.0001, 
            'sgd_momentum':.9, 
            'sgd_decay':0.0001,
            'base_save_dir':base_save_dir, 
            'alias':'kerasmlp',
            **path_param
        }
    },
    'columns': common_columns75,
    'kfold': {
        'n_splits': 3,
        'random_state': 1985,
        'shuffle': True,
        'type': 'stratified'#stratified
    },
    'scaler': {
        'cls': 'StandardScaler',
        'init':{}
    }
}

df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train, param, df_test = df_test, trial=mytrial, remark='start from top1 column')

for trial_i in mytrial:
    db_.insert(trial_i)
df_trial_ = db_.select()
df_trial_['kfold'] = df_trial_['param'].apply(lambda x: x['kfold'])
df_trial_[(df_trial_['remark']=='start from top1 column')&(df_trial_['mae_diff']<.06)].sort_values(by=['val_mae'])[['datetime','remark','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
18,2019-05-22 14:42:38.554712,start from top1 column,28,2.000214,3.6e-05,2.011596,2e-05,0.011382


In [181]:
db_.commit()

In [39]:
db_frgf = DFDB('../trial/frgf.pkl', auto_commit=False)
df_trial_frgf = db_frgf.select()
df_trial_frgf['kfold'] = df_trial_frgf['param'].apply(lambda x: x['kfold'])
df_trial_frgf.loc[[170,200]][['datetime','remark','kfold', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,remark,kfold,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
170,2019-05-18 17:20:13.549950,,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",14,2.023472,0.00295,2.073119,0.013364,0.049647
200,2019-05-18 23:29:15.867777,tune by stratified,"{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'stratified'}",14,1.935952,1.1e-05,1.977754,0.000119,0.041802


In [None]:
param_idx = 200
column_idx = 200
db_ = db_frgf
df_trial_ = df_trial_frgf

mytrial = []
columns = copy.deepcopy(common_columns75)
param = copy.deepcopy(df_trial_.loc[param_idx]['param'])
# param['algorithm']['init'] = {'logging_level': 'Silent', **param['algorithm']['init']}
param['scaler']['init'] = {}
param['algorithm']['fit'] = {}
param['kfold']['type'] = 'group'
param['kfold']['n_splits'] = 8

selected_columns = revert_rfe(df_train, param, columns, df_test, mytrial, start_columns=common_columns, remark='start from top1 column')
print(len(selected_columns))
for trial_i in mytrial:
    db_.insert(trial_i)
df_trial_ = db_.select()
df_trial_['kfold'] = df_trial_['param'].apply(lambda x: x['kfold'])
df_trial_[(df_trial_['remark']=='start from top1 column')&(df_trial_['mae_diff']<.05)].sort_values(by=['val_mae'])[['datetime','remark','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].head(5)

In [27]:
db_.commit()

In [32]:
#stratified
trial_lst = [
    df_trial_catboost.loc[666], df_trial_xgbm.loc[1316], df_trial_lgbm.loc[2249],df_trial_gradientboosting.loc[390],
             df_trial_randomforest.loc[457],df_trial_extratrees.loc[459],
            df_trial_knn.loc[44], df_trial_svr.loc[33],df_trial_fm.loc[339],
            df_trial_lasso.loc[117], df_trial_ridge.loc[126],
        df_trial_mlp.loc[13], df_trial_frgf.loc[327],
            ]
name_lst = [
    'cb666', 'xgbm1316', 'lgbm2249',
    'gbm390','rf457','et459',
    'knn44','svr33', 'fm339',
    'lasso117','ridge126',
    'mlp13','frgf327'
]

df_train_stacknet = pd.DataFrame()
df_test_stacknet = pd.DataFrame()
for df_, name_ in zip(trial_lst, name_lst):
    try:
        df_test_stacknet_i = df_['df_test_pred']
        df_test_stacknet[name_] = np.mean(df_test_stacknet_i.drop(columns=['index']).values, axis=1)
        
        df_train_stacknet_i = df_['df_valid_pred']
        df_train_stacknet[name_] = df_train_stacknet_i['predict']
        
    except Exception as e:
        print(name_, 'exception')

df_train_stacknet['index'] = df_train_stacknet_i['index']
df_train_stacknet = pd.merge(df_train_stacknet, df_train[['y','index', 'group','label']+catboost_columns], on='index')
df_test_stacknet['index'] = df_test_stacknet_i['index']
df_test_stacknet = pd.merge(df_test_stacknet, df_test[['index']+catboost_columns], on='index')

In [33]:
param = {'columns': name_lst+catboost_columns,
 'kfold': {'n_splits': 3,
  'random_state': 1985,
  'shuffle': True,
  'type': 'stratified'},
 'scaler': {'cls': 'StandardScaler', 'init':{}},
 'algorithm': {'cls': 'cb.CatBoostRegressor',
  'init': {'num_trees': 267,
   'depth': 8,
   'learning_rate': 0.04441106014865151,
   'l2_leaf_reg': 11.463989088797742,
   'bagging_temperature': 0.8825156807375603,
   'random_strength': 0.9042666757512351,
   'random_state': 473,
          'logging_level': 'Silent'},
  'fit': {}}}

In [41]:
mytrial=[]

#  tune hypterparameters
def objective(trial):
        
    num_trees = trial.suggest_int('num_trees', 200, 1000)
    depth = trial.suggest_int('depth', 2, 10)
    learning_rate = trial.suggest_uniform('learning_rate', 0.01, 0.4)
    l2_leaf_reg = trial.suggest_loguniform('l2_leaf_reg', 1e-3, 1e2)
    bagging_temperature = trial.suggest_uniform('bagging_temperature', .6, 1)
    random_strength = trial.suggest_loguniform('random_strength', 1e-3, 1)
    random_state = trial.suggest_int('random_state', 1, 9999)
        
    args={
        'columns':name_lst+catboost_columns,
        'kfold':{
            'n_splits': 8,
            'random_state': 1985,
            'shuffle': True,
            'type': 'group'
        },
        'scaler':{
            'cls':'StandardScaler',
            'init':{}
        },
        'algorithm':{
            'cls':'cb.CatBoostRegressor',
            'init':{
                "num_trees":num_trees,
                "depth":depth,
                "learning_rate":learning_rate,
                "l2_leaf_reg":l2_leaf_reg,
                "bagging_temperature":bagging_temperature,
                "random_strength":random_strength,
                "random_state":random_state,
                'logging_level': 'Silent'
            },
            'fit':{
            },
        },
    }
    
    df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train_stacknet, args, df_test = df_test_stacknet, trial=mytrial, remark='tune 841 by group')
    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)
    
    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', val_mae_mean-train_mae_mean)
    trial.set_user_attr('val_mae_var', val_mae_var)

    return np.abs(val_mae_mean - train_mae_mean)*val_mae_mean

study = optuna.create_study()
study.optimize(objective, n_trials=200)

[I 2019-05-23 13:50:50,930] Finished a trial resulted in value: 1.4297774488787542. Current best value is 1.4297774488787542 with parameters: {'num_trees': 947, 'depth': 6, 'learning_rate': 0.10459044411254348, 'l2_leaf_reg': 1.5426941056068, 'bagging_temperature': 0.698634109035391, 'random_strength': 0.004163547444524762, 'random_state': 657}.
[I 2019-05-23 13:55:11,349] Finished a trial resulted in value: 1.839191365518568. Current best value is 1.4297774488787542 with parameters: {'num_trees': 947, 'depth': 6, 'learning_rate': 0.10459044411254348, 'l2_leaf_reg': 1.5426941056068, 'bagging_temperature': 0.698634109035391, 'random_strength': 0.004163547444524762, 'random_state': 657}.
[I 2019-05-23 13:55:42,641] Finished a trial resulted in value: 0.6298733298889073. Current best value is 0.6298733298889073 with parameters: {'num_trees': 431, 'depth': 2, 'learning_rate': 0.20256256027563202, 'l2_leaf_reg': 0.14365110848156615, 'bagging_temperature': 0.7318994610168874, 'random_strengt

[I 2019-05-23 14:23:34,057] Finished a trial resulted in value: 0.47352280267914937. Current best value is 0.2559890769949264 with parameters: {'num_trees': 337, 'depth': 2, 'learning_rate': 0.020564388262900246, 'l2_leaf_reg': 0.001478348209230398, 'bagging_temperature': 0.9081958553204716, 'random_strength': 0.2723574194507172, 'random_state': 6955}.
[I 2019-05-23 14:23:53,276] Finished a trial resulted in value: 0.4549431302997294. Current best value is 0.2559890769949264 with parameters: {'num_trees': 337, 'depth': 2, 'learning_rate': 0.020564388262900246, 'l2_leaf_reg': 0.001478348209230398, 'bagging_temperature': 0.9081958553204716, 'random_strength': 0.2723574194507172, 'random_state': 6955}.
[I 2019-05-23 14:25:36,459] Finished a trial resulted in value: 1.4281933108555205. Current best value is 0.2559890769949264 with parameters: {'num_trees': 337, 'depth': 2, 'learning_rate': 0.020564388262900246, 'l2_leaf_reg': 0.001478348209230398, 'bagging_temperature': 0.9081958553204716,

KeyboardInterrupt: 

In [48]:
df_trial = pd.DataFrame(mytrial)

In [51]:
# for trial_i in mytrial:
#     db.insert(trial_i)
df_trial = db.select()

In [53]:
df_trial[(df_trial['remark']=='tune 841')&(df_trial['mae_diff']<.05)][['datetime','remark', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].sort_values(by=['val_mae'])

Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
908,2019-05-23 08:11:08.690207,tune 841,33,1.744007,5.815254e-06,1.79341,0.0004229638,0.049403
987,2019-05-23 08:37:15.310782,tune 841,33,1.745569,3.130226e-05,1.795199,0.0003238179,0.049629
892,2019-05-23 08:05:07.965156,tune 841,33,1.750078,1.875708e-05,1.795452,0.0004903574,0.045374
918,2019-05-23 08:13:34.482783,tune 841,33,1.74701,2.780616e-05,1.796329,0.0004127268,0.04932
1124,2019-05-23 09:07:04.782163,tune 841,33,1.748211,5.295848e-05,1.797322,0.0003293125,0.049111
1103,2019-05-23 09:01:09.741672,tune 841,33,1.750677,2.735865e-06,1.797822,0.0003639672,0.047145
1119,2019-05-23 09:05:43.963145,tune 841,33,1.75401,2.514303e-05,1.798119,0.0003717037,0.044109
1150,2019-05-23 09:10:28.097367,tune 841,33,1.754599,3.443594e-05,1.799918,0.0003909556,0.045319
1235,2019-05-23 09:20:32.843825,tune 841,33,1.757708,3.504452e-05,1.800294,0.0003756393,0.042586
1329,2019-05-23 09:30:52.305092,tune 841,33,1.753802,1.290605e-05,1.800317,0.0003681818,0.046515


In [200]:
mytrial = []
df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train_stacknet, param, df_test = df_test_stacknet, trial=mytrial, remark='half revert-rfe')

In [201]:
for trial_i in mytrial:
    db.insert(trial_i)
df_trial = db.select()

In [202]:
df_trial[['datetime','remark', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].tail(1)

Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
841,2019-05-22 14:50:04.764500,half revert-rfe,33,1.754045,2.1e-05,1.802209,0.000269,0.048164


In [54]:
idx=1235
df_test_pred = df_trial.loc[idx]['df_test_pred']
df_submit = pd.DataFrame()
df_submit['time_to_failure'] = np.mean(df_test_pred.drop(columns=['index']).values, axis=1)
df_submit['seg_id'] = df_test_pred['index']
df_submit.to_csv('submission_mystacknet_{}.csv'.format(idx), index=False)

In [57]:
# param = {'columns': name_lst+catboost_columns,
#  'kfold': {'n_splits': 3,
#   'random_state': 1985,
#   'shuffle': True,
#   'type': 'stratified'},#stratified
#  'scaler': {'cls': 'StandardScaler'},
#  'algorithm': 
         
# # {'cls': 'xgb.XGBRegressor',
# #  'init': {'max_depth': 3,
# #   'max_bin': 38,
# #   'eta': 0.27801915385245873,
# #   'colsample_bytree': 0.9416983653127328,
# #   'min_child_weight': 238,
# #   'n_estimators': 165,
# #   'subsample': 0.7471829960670435,
# #   'reg_lambda': 0.6813060508093833,
# #   'reg_alpha': 0.36085980027529035,
# #   'n_jobs': 16},
# #  'fit': {'eval_metric': 'mae', 'verbose': False, 'early_stopping_rounds': 200}},
         
#          {'cls': 'cb.CatBoostRegressor',
#   'init': {'num_trees': 589,
#    'depth': 6,
#    'learning_rate': 0.05293979792364842,
#    'l2_leaf_reg': 78.065140245968,
#    'bagging_temperature': 0.9302786271852079,
#    'random_strength': 0.4247048326178351,
#    'random_state': 651},
#   'fit': {}},
         
# #          {'cls': 'lgb.LGBMRegressor',
# #  'init': {'learning_rate': 0.17076106120259138,
# #   'feature_fraction': 0.6842101917408698,
# #   'bagging_fraction': 0.8986268312800509,
# #   'min_data_in_leaf': 243,
# #   'lambda_l1': 4.612300279009062,
# #   'lambda_l2': 97.21686371760525,
# #   'max_bin': 28,
# #   'num_leaves': 11,
# #   'random_state': 6805,
# #   'n_jobs': 32},
# #  'fit': {'eval_metric': 'mae', 'verbose': False, 'early_stopping_rounds': 200}}
         
# }

In [62]:
# #  tune hypterparameters
# def objective(trial):
        
#     num_trees = trial.suggest_int('num_trees', 200, 1000)
#     depth = trial.suggest_int('depth', 2, 10)
#     learning_rate = trial.suggest_uniform('learning_rate', 0.01, 0.4)
#     l2_leaf_reg = trial.suggest_uniform('l2_leaf_reg', 0.001, 100)
#     bagging_temperature = trial.suggest_uniform('bagging_temperature', .6, 1)
#     random_strength = trial.suggest_uniform('random_strength', .001, 1)
#     random_state = trial.suggest_int('random_state', 1, 9999)
        
#     args={
#         'columns':name_lst+catboost_columns,
#         'kfold':{
#             'n_splits': 3,
#             'random_state': 1985,
#             'shuffle': True,
#             'type': 'stratified'
#         },
#         'scaler':{
#             'cls':'StandardScaler',
#         },
#         'algorithm':{
#             'cls':'cb.CatBoostRegressor',
#             'init':{
#                 "num_trees":num_trees,
#                 "depth":depth,
#                 "learning_rate":learning_rate,
#                 "l2_leaf_reg":l2_leaf_reg,
#                 "bagging_temperature":bagging_temperature,
#                 "random_strength":random_strength,
#                 "random_state":random_state,
#             },
#             'fit':{
#             },
#         },
#     }
    
#     df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train_stacknet, args, df_test = df_test_stacknet, trial=mytrial, remark='add mlp,rgf tune stratified ')
#     val_mae_mean = np.mean(df_his.valid)
#     val_mae_var = np.var(df_his.valid)
#     train_mae_mean = np.mean(df_his.train)
#     train_mae_var = np.var(df_his.train)
    
#     trial.set_user_attr('val_mae', val_mae_mean)
#     trial.set_user_attr('train_mae', train_mae_mean)
#     trial.set_user_attr('mae_diff', val_mae_mean-train_mae_mean)
#     trial.set_user_attr('val_mae_var', val_mae_var)

#     return np.abs(val_mae_mean - train_mae_mean)*val_mae_mean

# study = optuna.create_study()
# study.optimize(objective, n_trials=200)

In [63]:
df_trial = pd.DataFrame(mytrial)
df_trial['kfold-type'] = df_trial['param'].apply(lambda x: x['kfold']['type'])
df_trial[(df_trial['remark']=='add mlp,rgf tune stratified ')&(df_trial['mae_diff']<.05)].sort_values(by=['val_mae'])[['datetime','remark', 'kfold-type', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].head()

Unnamed: 0,datetime,remark,kfold-type,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
562,2019-05-19 20:38:58.981647,"add mlp,rgf tune stratified",stratified,43,1.734277,1.430581e-05,1.784253,0.000267,0.049976
506,2019-05-19 19:32:11.848462,"add mlp,rgf tune stratified",stratified,43,1.739045,1.455078e-05,1.786425,0.000477,0.04738
499,2019-05-19 19:29:55.797143,"add mlp,rgf tune stratified",stratified,43,1.738286,8.44112e-06,1.787195,0.000318,0.048909
554,2019-05-19 20:33:16.144706,"add mlp,rgf tune stratified",stratified,43,1.752076,3.836694e-07,1.792045,0.00032,0.039969
488,2019-05-19 19:01:01.028597,"add mlp,rgf tune stratified",stratified,43,1.75198,4.276654e-06,1.792814,0.000426,0.040834


In [64]:
idx = 562
df_test_pred = df_trial.loc[idx]['df_test_pred']
df_submit = pd.DataFrame()
df_submit['time_to_failure'] = np.mean(df_test_pred.drop(columns=['index']).values, axis=1)
df_submit['seg_id'] = df_test_pred['index']
df_submit.to_csv('submission_mystacknet_{}.csv'.format(idx), index=False)

In [67]:
# group
trial_lst = [
    df_trial_catboost.loc[452], df_trial_xgbm.loc[1172], df_trial_lgbm.loc[2156],df_trial_gradientboosting.loc[306],
             df_trial_randomforest.loc[297],df_trial_extratrees.loc[459],
            df_trial_knn.loc[17], df_trial_svr.loc[7],df_trial_fm.loc[313],
            df_trial_lasso.loc[8], df_trial_ridge.loc[15],
        df_trial_mlp.loc[17], df_trial_frgf.loc[170],
            ]
name_lst = [
    'cb452', 'xgbm1172', 'lgbm2156',
    'gbm306','rf297','et459',
    'knn17','svr7', 'fm313',
    'lasso8','ridge15',
    'mlp17','frgf170'
]

df_train_stacknet = pd.DataFrame()
df_test_stacknet = pd.DataFrame()
for df_, name_ in zip(trial_lst, name_lst):
    try:
        df_test_stacknet_i = df_['df_test_pred']
        df_test_stacknet[name_] = np.mean(df_test_stacknet_i.drop(columns=['index']).values, axis=1)
        
        df_train_stacknet_i = df_['df_valid_pred']
        df_train_stacknet[name_] = df_train_stacknet_i['predict']
        
    except Exception as e:
        print(name_, 'exception')

df_train_stacknet['index'] = df_train_stacknet_i['index']
df_train_stacknet = pd.merge(df_train_stacknet, df_train[['y','index', 'group','label']+catboost_columns], on='index')
df_test_stacknet['index'] = df_test_stacknet_i['index']
df_test_stacknet = pd.merge(df_test_stacknet, df_test[['index']+catboost_columns], on='index')

In [61]:
# #  tune hypterparameters
# def objective(trial):
        
#     num_trees = trial.suggest_int('num_trees', 200, 1000)
#     depth = trial.suggest_int('depth', 2, 10)
#     learning_rate = trial.suggest_uniform('learning_rate', 0.01, 0.4)
#     l2_leaf_reg = trial.suggest_uniform('l2_leaf_reg', 0.001, 100)
#     bagging_temperature = trial.suggest_uniform('bagging_temperature', .6, 1)
#     random_strength = trial.suggest_uniform('random_strength', .001, 1)
#     random_state = trial.suggest_int('random_state', 1, 9999)
        
#     args={
#         'columns':name_lst+catboost_columns,
#         'kfold':{
#             'n_splits': 3,
#             'random_state': 1985,
#             'shuffle': True,
#             'type': 'stratified'
#         },
#         'scaler':{
#             'cls':'StandardScaler',
#         },
#         'algorithm':{
#             'cls':'cb.CatBoostRegressor',
#             'init':{
#                 "num_trees":num_trees,
#                 "depth":depth,
#                 "learning_rate":learning_rate,
#                 "l2_leaf_reg":l2_leaf_reg,
#                 "bagging_temperature":bagging_temperature,
#                 "random_strength":random_strength,
#                 "random_state":random_state,
#             },
#             'fit':{
#             },
#         },
#     }
    
#     df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train_stacknet, args, df_test = df_test_stacknet, trial=mytrial, remark='add mlp,rgf tune group ')
#     val_mae_mean = np.mean(df_his.valid)
#     val_mae_var = np.var(df_his.valid)
#     train_mae_mean = np.mean(df_his.train)
#     train_mae_var = np.var(df_his.train)
    
#     trial.set_user_attr('val_mae', val_mae_mean)
#     trial.set_user_attr('train_mae', train_mae_mean)
#     trial.set_user_attr('mae_diff', val_mae_mean-train_mae_mean)
#     trial.set_user_attr('val_mae_var', val_mae_var)

#     return np.abs(val_mae_mean - train_mae_mean)*val_mae_mean

# study = optuna.create_study()
# study.optimize(objective, n_trials=200)

In [75]:
df_trial = pd.DataFrame(mytrial)
df_trial['kfold-type'] = df_trial['param'].apply(lambda x: x['kfold']['type'])
df_trial[(df_trial['remark']=='add mlp,rgf tune group ')&(df_trial['mae_diff']<.05)].sort_values(by=['val_mae'])[['datetime','remark', 'kfold-type', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].head()

Unnamed: 0,datetime,remark,kfold-type,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
771,2019-05-20 02:37:44.638683,"add mlp,rgf tune group",stratified,43,1.740836,5.8e-05,1.789388,9.4e-05,0.048552
737,2019-05-20 01:55:26.866689,"add mlp,rgf tune group",stratified,43,1.745126,1.6e-05,1.79191,0.000227,0.046784
729,2019-05-20 01:34:58.625995,"add mlp,rgf tune group",stratified,43,1.744614,1.2e-05,1.793368,0.000248,0.048753
730,2019-05-20 01:52:21.969967,"add mlp,rgf tune group",stratified,43,1.749643,3e-06,1.795406,0.000238,0.045763
773,2019-05-20 02:38:42.869441,"add mlp,rgf tune group",stratified,43,1.750654,1.3e-05,1.796782,0.00016,0.046128


In [76]:
idx = 771
df_test_pred = df_trial.loc[idx]['df_test_pred']
df_submit = pd.DataFrame()
df_submit['time_to_failure'] = np.mean(df_test_pred.drop(columns=['index']).values, axis=1)
df_submit['seg_id'] = df_test_pred['index']
df_submit.to_csv('submission_mystacknet_{}.csv'.format(idx), index=False)

In [77]:
# df_trial.to_pickle('../trial/mystack.pkl')