In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler

import lightgbm as lgb
import catboost as cb
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

from tsfresh.examples import load_robot_execution_failures
from tsfresh import extract_features, select_features
import optuna

from common import EP
from dfdb import DFDB
from models import *

import types
import copy

Using TensorFlow backend.


In [2]:
%%HTML
<!-- Override the widths of the notebook, menubar and toolbar containers so wide tables fit on screen. -->
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [3]:
# Show (practically) every row/column when displaying the wide trial tables.
pd.set_option('display.max_columns', 2000)
pd.set_option('display.max_rows', 2000)
# pd.set_option('display.width', 2000)
# pd.set_option('display.expand_frame_repr', True)
# Disable column-width truncation. Newer pandas spells "unlimited" as None and
# rejects the legacy -1; older releases only accept an int, so fall back to -1.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

In [4]:
# Load the pre-computed train/test feature frames.
# NOTE: pickle files execute code on load; only read files produced by this project.
df_train, df_test = map(
    pd.read_pickle,
    ('../feats/df_train.pkl', '../feats/df_test.pkl'),
)

In [5]:
# Integer class label derived from the target: values below 15 are truncated
# to int, everything else is assigned to class 15.
df_train['label'] = df_train['y'].apply(lambda x: int(x) if x < 15 else 15)

# Grouping key: the season id with season 17 folded into group 1.
# np.where builds a fresh array, so nothing is written through `.values`
# (an in-place write mutates the frame via a view and fails on read-only
# arrays when pandas Copy-on-Write is active).
season = df_train['season'].values
group = np.where(season == 17, 1, season)
df_train['group'] = group
df_train = df_train.drop(columns=['season'])

In [6]:
# Feature matrix and target, both keyed by the 'index' column, handed to
# tsfresh's relevance-based feature selection.
# (Despite the `test_` prefix these are built from the training frame.)
test_X = df_train.set_index('index').drop(columns=['y', 'group', 'label'])
test_y = df_train.set_index('index')['y'].copy()
tsfresh_columns = list(select_features(test_X, test_y).columns)

In [7]:
# Every column of df_train except the identifier, target and derived label/group.
original_columns = df_train.drop(columns=['index', 'y', 'label', 'group']).columns.tolist()

In [8]:
# NOTE: the in-memory trial list `mytrial` is used further down but its
# initialisation is commented out here, so it is not created by this cell.
# mytrial = []
# Open the stacking trial store without auto-commit and read its contents.
db = DFDB('../trial/mystack.pkl', auto_commit=False)
df_trial = db.select()

In [11]:
# CatBoost trial store; expose each trial's CV settings (param['kfold']) as a column.
db_catboost = DFDB('../trial/catboost.pkl', auto_commit=False)
df_trial_catboost = db_catboost.select()
df_trial_catboost['kfold'] = df_trial_catboost['param'].map(
    lambda trial_param: trial_param['kfold']
)

In [12]:
# XGBoost trial store; expose each trial's CV settings (param['kfold']) as a column.
db_xgbm = DFDB('../trial/xgbm.pkl', auto_commit=False)
df_trial_xgbm = db_xgbm.select()
df_trial_xgbm['kfold'] = df_trial_xgbm['param'].map(
    lambda trial_param: trial_param['kfold']
)

In [13]:
# LightGBM trial store; expose each trial's CV settings (param['kfold']) as a column.
db_lgbm = DFDB('../trial/lgbm.pkl', auto_commit=False)
df_trial_lgbm = db_lgbm.select()
df_trial_lgbm['kfold'] = df_trial_lgbm['param'].map(
    lambda trial_param: trial_param['kfold']
)

In [14]:
# Random-forest trial store; expose each trial's CV settings (param['kfold']) as a column.
db_randomforest = DFDB('../trial/randomforest.pkl', auto_commit=False)
df_trial_randomforest = db_randomforest.select()
# Read 'kfold' from this store's own params. The previous version read it from
# df_trial_lgbm, which index-aligned LightGBM's CV settings onto these rows.
df_trial_randomforest['kfold'] = df_trial_randomforest['param'].apply(lambda p: p['kfold'])

In [15]:
# Extra-trees trial store; expose each trial's CV settings (param['kfold']) as a column.
db_extratrees = DFDB('../trial/extratrees.pkl', auto_commit=False)
df_trial_extratrees = db_extratrees.select()
df_trial_extratrees['kfold'] = df_trial_extratrees['param'].map(
    lambda trial_param: trial_param['kfold']
)

In [126]:
# df_trial_extratrees[df_trial_extratrees['remark']=='start from top1 column'][['datetime','remark','kfold', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

In [16]:
# Gradient-boosting trial store; expose each trial's CV settings (param['kfold']) as a column.
db_gradientboosting = DFDB('../trial/gradientboosting.pkl', auto_commit=False)
df_trial_gradientboosting = db_gradientboosting.select()
df_trial_gradientboosting['kfold'] = df_trial_gradientboosting['param'].map(
    lambda trial_param: trial_param['kfold']
)

In [17]:
# Feature lists used by one hand-picked trial per tree model.
# NOTE(review): the XGBoost trial here is 1172, while the stacking cell below
# uses trial 1316 — confirm this is intentional.
catboost_columns = df_trial_catboost.loc[666, 'param']['columns']
xgbm_columns = df_trial_xgbm.loc[1172, 'param']['columns']
lgbm_columns = df_trial_lgbm.loc[2249, 'param']['columns']
rf_columns = df_trial_randomforest.loc[457, 'param']['columns']
extratrees_columns = df_trial_extratrees.loc[459, 'param']['columns']
gradientboosting_columns = df_trial_gradientboosting.loc[390, 'param']['columns']

In [18]:
# Pool the feature lists of the six tree models and measure how often each
# feature was selected across them.
all_columns = (catboost_columns + xgbm_columns + lgbm_columns
               + rf_columns + extratrees_columns + gradientboosting_columns)
unique_columns = list(set(all_columns))
N_columns = 6  # number of model column lists concatenated above

# Count every feature once instead of re-scanning all_columns per feature.
occurrences = {}
for col in all_columns:
    occurrences[col] = occurrences.get(col, 0) + 1
count_values = [occurrences[col] for col in unique_columns]

# Quantile thresholds do not change inside the selection loop, so compute them
# once; with no columns at all nothing can qualify.
if count_values:
    q75, q95 = np.quantile(count_values, [.75, .95])
else:
    q75 = q95 = np.inf

# Features present in every model / at or above the 75th / 95th percentile of counts.
common_columns = [col for col in unique_columns if occurrences[col] == N_columns]
common_columns75 = [col for col in unique_columns if occurrences[col] >= q75]
common_columns95 = [col for col in unique_columns if occurrences[col] >= q95]
# Declared by the original cell but never populated; kept so the names still exist.
common_columns25 = []
common_columns50 = []
len(unique_columns), len(common_columns), len(common_columns75), len(common_columns95)

(107, 3, 28, 9)

In [19]:
# KNN trial store; expose each trial's CV settings (param['kfold']) as a column.
db_knn = DFDB('../trial/knn.pkl', auto_commit=False)
df_trial_knn = db_knn.select()
df_trial_knn['kfold'] = df_trial_knn['param'].map(
    lambda trial_param: trial_param['kfold']
)

In [20]:
# SVR trial store; expose each trial's CV settings (param['kfold']) as a column.
db_svr = DFDB('../trial/svr.pkl', auto_commit=False)
df_trial_svr = db_svr.select()
# Read 'kfold' from this store's own params. The previous version read it from
# df_trial_knn, which index-aligned KNN's CV settings onto these rows.
df_trial_svr['kfold'] = df_trial_svr['param'].apply(lambda p: p['kfold'])

In [21]:
# FM trial store; expose each trial's CV settings (param['kfold']) as a column.
db_fm = DFDB('../trial/fm.pkl', auto_commit=False)
df_trial_fm = db_fm.select()
# Read 'kfold' from this store's own params. The previous version read it from
# df_trial_knn, which index-aligned KNN's CV settings onto these rows.
df_trial_fm['kfold'] = df_trial_fm['param'].apply(lambda p: p['kfold'])

In [22]:
# Lasso trial store; expose each trial's CV settings (param['kfold']) as a column.
db_lasso = DFDB('../trial/lasso.pkl', auto_commit=False)
df_trial_lasso = db_lasso.select()
df_trial_lasso['kfold'] = df_trial_lasso['param'].map(
    lambda trial_param: trial_param['kfold']
)

In [23]:
# Ridge trial store; expose each trial's CV settings (param['kfold']) as a column.
db_ridge = DFDB('../trial/ridge.pkl', auto_commit=False)
df_trial_ridge = db_ridge.select()
df_trial_ridge['kfold'] = df_trial_ridge['param'].map(
    lambda trial_param: trial_param['kfold']
)

In [26]:
# Keras MLP trial store; expose each trial's CV settings (param['kfold']) as a column.
db_kerasmlp = DFDB('../trial/kerasmlp.pkl', auto_commit=False)
df_trial_kerasmlp = db_kerasmlp.select()
df_trial_kerasmlp['kfold'] = df_trial_kerasmlp['param'].map(
    lambda trial_param: trial_param['kfold']
)

In [25]:
# FRGF trial store; expose each trial's CV settings (param['kfold']) as a column.
db_frgf = DFDB('../trial/frgf.pkl', auto_commit=False)
df_trial_frgf = db_frgf.select()
df_trial_frgf['kfold'] = df_trial_frgf['param'].map(
    lambda trial_param: trial_param['kfold']
)

In [36]:
# MLP trial store (mlp.pkl); expose each trial's CV settings (param['kfold']) as a column.
db_skmlp = DFDB('../trial/mlp.pkl', auto_commit=False)
df_trial_skmlp = db_skmlp.select()
# Read 'kfold' from this store's own params. The previous version read it from
# df_trial_kerasmlp, which index-aligned the Keras MLP's CV settings onto these rows.
df_trial_skmlp['kfold'] = df_trial_skmlp['param'].apply(lambda p: p['kfold'])

In [37]:
# Summary metrics of MLP trial 19 (the one fed into the stack below).
df_trial_skmlp.loc[
    [19],
    ['datetime', 'remark', 'nfeatures', 'train_mae', 'train_mae_var', 'val_mae', 'val_mae_var', 'mae_diff'],
]

Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
19,2019-05-25 23:55:11.086624,,13,1.945347,9.3e-05,1.962482,0.000106,0.017135


In [38]:
# stratified
# One hand-picked trial per first-level model; the entries pair up
# positionally with name_lst below.
trial_lst = [
    df_trial_catboost.loc[666], df_trial_xgbm.loc[1316], df_trial_lgbm.loc[2249],
    df_trial_gradientboosting.loc[390],
    df_trial_randomforest.loc[457], df_trial_extratrees.loc[459],
    df_trial_knn.loc[44], df_trial_svr.loc[33], df_trial_fm.loc[339],
    df_trial_lasso.loc[117], df_trial_ridge.loc[126],
    # NOTE(review): this entry used `df_trial_mlp`, a name no cell in this
    # notebook defines (NameError on a fresh kernel). It is assumed to be the
    # Keras MLP store — confirm that trial 13 of kerasmlp.pkl is the intended model.
    df_trial_kerasmlp.loc[13], df_trial_frgf.loc[327], df_trial_skmlp.loc[19],
]
name_lst = [
    'cb666', 'xgbm1316', 'lgbm2249',
    'gbm390', 'rf457', 'et459',
    'knn44', 'svr33', 'fm339',
    'lasso117', 'ridge126',
    'mlp13', 'frgf327', 'skmlp19'
]

df_train_stacknet = pd.DataFrame()
df_test_stacknet = pd.DataFrame()
# Last successfully loaded prediction frames; they provide the 'index' column below.
# Reset here so a re-run can never pick up frames left over from an earlier run.
df_train_stacknet_i = None
df_test_stacknet_i = None
for df_, name_ in zip(trial_lst, name_lst):
    try:
        test_pred_i = df_['df_test_pred']
        valid_pred_i = df_['df_valid_pred']
        # Test feature: row-wise mean of every prediction column except 'index'.
        test_feature = np.mean(test_pred_i.drop(columns=['index']).values, axis=1)
        # Train feature: the stored validation predictions.
        train_feature = valid_pred_i['predict']
    except Exception as e:
        # Best effort: skip a model whose predictions cannot be read, but say why,
        # and never add a column to only one of the two frames.
        print(name_, 'exception', repr(e))
        continue
    df_test_stacknet[name_] = test_feature
    df_train_stacknet[name_] = train_feature
    df_test_stacknet_i = test_pred_i
    df_train_stacknet_i = valid_pred_i

if df_train_stacknet_i is None:
    raise RuntimeError('no first-level predictions could be loaded for the stack')

# Attach the row identifier and join target/grouping columns plus the raw
# CatBoost feature subset onto the stacked predictions.
df_train_stacknet['index'] = df_train_stacknet_i['index']
df_train_stacknet = pd.merge(df_train_stacknet, df_train[['y', 'index', 'group', 'label'] + catboost_columns], on='index')
df_test_stacknet['index'] = df_test_stacknet_i['index']
df_test_stacknet = pd.merge(df_test_stacknet, df_test[['index'] + catboost_columns], on='index')

In [39]:
# Candidate second-level inputs: model predictions followed by the CatBoost feature subset.
[*name_lst, *catboost_columns]

['cb666',
 'xgbm1316',
 'lgbm2249',
 'gbm390',
 'rf457',
 'et459',
 'knn44',
 'svr33',
 'fm339',
 'lasso117',
 'ridge126',
 'mlp13',
 'frgf327',
 'skmlp19',
 'abs_q01_4',
 'MA_1000MA_std_mean_7',
 'spkt_welch_densitycoeff_2',
 'abs_q25_5',
 'abs_q95_2',
 'abs_max_roll_mean_1000',
 'abs_q95_6',
 'q25_roll_std_100',
 "number_peaks{'n': 10}",
 'abs_q75_2',
 'q05_roll_std_1000',
 'abs_q75_6',
 "value_count{'value': 1}",
 'FFT_Mag_75q0',
 'abs_q75_7',
 'spkt_welch_density__coeff_3',
 'q01_roll_std_100',
 "number_crossing_m{'m': 1}",
 'q05_2',
 "autocorrelation{'lag': 5}"]

In [55]:
# Configuration of the second-level (stacking) CatBoost run passed to EP.process.
param = {
    'columns': [
        # First-level model predictions.
        'cb666', 'xgbm1316', 'lgbm2249', 'gbm390', 'rf457', 'et459',
        'knn44', 'svr33', 'fm339', 'lasso117', 'ridge126',
        'mlp13', 'frgf327', 'skmlp19',
        # Raw features ('abs_q01_4' is commented out of this experiment).
        # 'abs_q01_4',
        'MA_1000MA_std_mean_7',
        'spkt_welch_densitycoeff_2',
        'abs_q25_5',
        'abs_q95_2',
        'abs_max_roll_mean_1000',
        'abs_q95_6',
        'q25_roll_std_100',
        "number_peaks{'n': 10}",
        'abs_q75_2',
        'q05_roll_std_1000',
        'abs_q75_6',
        "value_count{'value': 1}",
        'FFT_Mag_75q0',
        'abs_q75_7',
        'spkt_welch_density__coeff_3',
        'q01_roll_std_100',
        "number_crossing_m{'m': 1}",
        'q05_2',
        "autocorrelation{'lag': 5}",
    ],
    # Cross-validation settings.
    'kfold': dict(n_splits=3, random_state=1985, shuffle=True, type='stratified'),
    # Scaler class name and its constructor kwargs.
    'scaler': dict(cls='StandardScaler', init={}),
    # Estimator class name, constructor kwargs and fit kwargs.
    'algorithm': dict(
        cls='cb.CatBoostRegressor',
        init=dict(
            num_trees=267,
            depth=8,
            learning_rate=0.04441106014865151,
            l2_leaf_reg=11.463989088797742,
            bagging_temperature=0.8825156807375603,
            random_strength=0.9042666757512351,
            random_state=473,
            logging_level='Silent',
        ),
        fit={},
    ),
}

In [56]:
# `mytrial` is the in-memory trial list handed to EP.process. Create it only if
# it does not exist yet, so a fresh kernel does not hit a NameError and re-running
# this cell keeps the trials already collected.
if 'mytrial' not in globals():
    mytrial = []
# Run the stacking experiment described by `param` on the stacked train/test frames.
df_his, df_feature_importances, df_valid_pred, df_test_pred = EP.process(
    df_train_stacknet, param, df_test=df_test_stacknet, trial=mytrial, remark='half revert-rfe 2'
)

In [61]:
# Tabulate the trials collected in memory (this rebinds df_trial, which earlier held the mystack store contents).
df_trial = pd.DataFrame(data=mytrial)

In [63]:
# Trials whose train/validation MAE gap is below 0.05, best validation MAE first.
(
    df_trial
    .loc[
        df_trial['mae_diff'] < .05,
        ['datetime', 'remark', 'nfeatures', 'train_mae', 'train_mae_var', 'val_mae', 'val_mae_var', 'mae_diff'],
    ]
    .sort_values(by='val_mae')
)

Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
2,2019-05-26 09:05:45.907034,half revert-rfe 2,33,1.74345,2.933508e-05,1.791686,0.0003339648,0.048236
6,2019-05-26 09:12:17.979029,half revert-rfe 2,33,1.74345,2.933508e-05,1.791686,0.0003339648,0.048236
3,2019-05-26 09:06:48.856867,half revert-rfe 2,32,1.745328,4.064454e-05,1.792924,0.0003496503,0.047595
5,2019-05-26 09:08:16.743988,half revert-rfe 2,32,1.745984,3.386971e-05,1.794123,0.0004632674,0.048139
88,2019-05-26 09:53:45.782206,tune 1.339,33,1.744231,1.476304e-05,1.794165,0.0003843838,0.049934
4,2019-05-26 09:07:35.770358,half revert-rfe 2,32,1.747121,2.916836e-05,1.794718,0.0004249691,0.047597
92,2019-05-26 09:54:14.017934,tune 1.339,33,1.747297,3.218898e-05,1.795136,0.0003878674,0.047839
1,2019-05-26 09:03:34.324008,half revert-rfe 2,34,1.745862,2.658668e-05,1.795263,0.000374848,0.049401
0,2019-05-26 08:59:23.160158,half revert-rfe 2,33,1.746862,2.652375e-05,1.795525,0.0003050716,0.048663
329,2019-05-26 10:50:55.573603,tune 1.339,33,1.74982,3.021384e-05,1.796098,0.0003580732,0.046278


In [60]:
# Tune hyperparameters
def objective(trial):
    """Optuna objective for the second-level CatBoost regressor.

    Samples CatBoost hyperparameters from ``trial``, runs ``EP.process`` on the
    stacked train/test frames with the fixed column set and CV settings below,
    and records the run in the notebook-level ``mytrial`` list under the remark
    ``'tune 1.339'``.

    Args:
        trial: optuna trial; its ``suggest_int``, ``suggest_uniform`` and
            ``set_user_attr`` methods are used.

    Returns:
        ``abs(val_mae - train_mae) * val_mae`` computed from the means of
        ``df_his.valid`` and ``df_his.train``, so both a large train/validation
        gap and a large validation error increase the value.
    """
    num_trees = trial.suggest_int('num_trees', 200, 1000)
    depth = trial.suggest_int('depth', 2, 10)
    learning_rate = trial.suggest_uniform('learning_rate', 0.01, 0.4)
    l2_leaf_reg = trial.suggest_uniform('l2_leaf_reg', 0.001, 100)
    bagging_temperature = trial.suggest_uniform('bagging_temperature', .6, 1)
    random_strength = trial.suggest_uniform('random_strength', .001, 1)
    random_state = trial.suggest_int('random_state', 1, 9999)

    args = {
        'columns': [
            # First-level model predictions.
            'cb666',
            'xgbm1316',
            'lgbm2249',
            'gbm390',
            'rf457',
            'et459',
            'knn44',
            'svr33',
            'fm339',
            'lasso117',
            'ridge126',
            'mlp13',
            'frgf327',
            'skmlp19',
            # Raw features ('abs_q01_4' is commented out of this experiment).
            # 'abs_q01_4',
            'MA_1000MA_std_mean_7',
            'spkt_welch_densitycoeff_2',
            'abs_q25_5',
            'abs_q95_2',
            'abs_max_roll_mean_1000',
            'abs_q95_6',
            'q25_roll_std_100',
            "number_peaks{'n': 10}",
            'abs_q75_2',
            'q05_roll_std_1000',
            'abs_q75_6',
            "value_count{'value': 1}",
            'FFT_Mag_75q0',
            'abs_q75_7',
            'spkt_welch_density__coeff_3',
            'q01_roll_std_100',
            "number_crossing_m{'m': 1}",
            'q05_2',
            "autocorrelation{'lag': 5}"
        ],
        'kfold': {
            'n_splits': 3,
            'random_state': 1985,
            'shuffle': True,
            'type': 'stratified'
        },
        'scaler': {
            'cls': 'StandardScaler',
            'init': {}
        },
        'algorithm': {
            'cls': 'cb.CatBoostRegressor',
            'init': {
                "num_trees": num_trees,
                "depth": depth,
                "learning_rate": learning_rate,
                "l2_leaf_reg": l2_leaf_reg,
                "bagging_temperature": bagging_temperature,
                "random_strength": random_strength,
                "random_state": random_state,
                'logging_level': 'Silent'
            },
            'fit': {
            },
        },
    }

    # The trial list argument was garbled (`trial= c ffh`, a syntax error). The
    # 'tune 1.339' rows appear in pd.DataFrame(mytrial), so `mytrial` is the
    # list these runs were recorded in.
    df_his, df_feature_importances, df_valid_pred, df_test_pred = EP.process(
        df_train_stacknet, args, df_test=df_test_stacknet, trial=mytrial, remark='tune 1.339'
    )
    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)

    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', val_mae_mean - train_mae_mean)
    trial.set_user_attr('val_mae_var', val_mae_var)
    # Previously computed but never recorded.
    trial.set_user_attr('train_mae_var', train_mae_var)

    return np.abs(val_mae_mean - train_mae_mean) * val_mae_mean

# Create a study with optuna's default settings and evaluate `objective` for 500 trials.
study = optuna.create_study()
study.optimize(objective, n_trials=500)

[I 2019-05-26 09:16:37,801] Finished a trial resulted in value: 0.4581808002495412. Current best value is 0.4581808002495412 with parameters: {'num_trees': 715, 'depth': 9, 'learning_rate': 0.21772174587371654, 'l2_leaf_reg': 99.11557940342765, 'bagging_temperature': 0.6474404088542715, 'random_strength': 0.31981859654808587, 'random_state': 3425}.
[I 2019-05-26 09:16:47,222] Finished a trial resulted in value: 0.20927681482797247. Current best value is 0.20927681482797247 with parameters: {'num_trees': 657, 'depth': 3, 'learning_rate': 0.3209415507903822, 'l2_leaf_reg': 64.34351444173235, 'bagging_temperature': 0.9162733867546031, 'random_strength': 0.27315521016199873, 'random_state': 6425}.
[I 2019-05-26 09:16:51,971] Finished a trial resulted in value: 0.022464302059534126. Current best value is 0.022464302059534126 with parameters: {'num_trees': 347, 'depth': 2, 'learning_rate': 0.01945818406927026, 'l2_leaf_reg': 5.9745838798267625, 'bagging_temperature': 0.9574324394138662, 'ran

[I 2019-05-26 09:34:51,625] Finished a trial resulted in value: 0.32180754593449273. Current best value is 0.00938765319380357 with parameters: {'num_trees': 300, 'depth': 3, 'learning_rate': 0.010956609120136586, 'l2_leaf_reg': 38.506823011138394, 'bagging_temperature': 0.6037570674044999, 'random_strength': 0.4705474685925809, 'random_state': 112}.
[I 2019-05-26 09:34:58,142] Finished a trial resulted in value: 0.0633832601272598. Current best value is 0.00938765319380357 with parameters: {'num_trees': 300, 'depth': 3, 'learning_rate': 0.010956609120136586, 'l2_leaf_reg': 38.506823011138394, 'bagging_temperature': 0.6037570674044999, 'random_strength': 0.4705474685925809, 'random_state': 112}.
[I 2019-05-26 09:35:09,746] Finished a trial resulted in value: 0.12040198618477764. Current best value is 0.00938765319380357 with parameters: {'num_trees': 300, 'depth': 3, 'learning_rate': 0.010956609120136586, 'l2_leaf_reg': 38.506823011138394, 'bagging_temperature': 0.6037570674044999, 'ra

[I 2019-05-26 09:46:03,837] Finished a trial resulted in value: 0.36740154071545716. Current best value is 0.00938765319380357 with parameters: {'num_trees': 300, 'depth': 3, 'learning_rate': 0.010956609120136586, 'l2_leaf_reg': 38.506823011138394, 'bagging_temperature': 0.6037570674044999, 'random_strength': 0.4705474685925809, 'random_state': 112}.
[I 2019-05-26 09:46:16,393] Finished a trial resulted in value: 0.19741441842260427. Current best value is 0.00938765319380357 with parameters: {'num_trees': 300, 'depth': 3, 'learning_rate': 0.010956609120136586, 'l2_leaf_reg': 38.506823011138394, 'bagging_temperature': 0.6037570674044999, 'random_strength': 0.4705474685925809, 'random_state': 112}.
[I 2019-05-26 09:46:30,487] Finished a trial resulted in value: 0.05846719455617197. Current best value is 0.00938765319380357 with parameters: {'num_trees': 300, 'depth': 3, 'learning_rate': 0.010956609120136586, 'l2_leaf_reg': 38.506823011138394, 'bagging_temperature': 0.6037570674044999, 'r

[I 2019-05-26 09:52:35,467] Finished a trial resulted in value: 0.025543113767920317. Current best value is 0.00938765319380357 with parameters: {'num_trees': 300, 'depth': 3, 'learning_rate': 0.010956609120136586, 'l2_leaf_reg': 38.506823011138394, 'bagging_temperature': 0.6037570674044999, 'random_strength': 0.4705474685925809, 'random_state': 112}.
[I 2019-05-26 09:52:40,512] Finished a trial resulted in value: 0.04182306488558209. Current best value is 0.00938765319380357 with parameters: {'num_trees': 300, 'depth': 3, 'learning_rate': 0.010956609120136586, 'l2_leaf_reg': 38.506823011138394, 'bagging_temperature': 0.6037570674044999, 'random_strength': 0.4705474685925809, 'random_state': 112}.
[I 2019-05-26 09:52:46,943] Finished a trial resulted in value: 0.054221092280932054. Current best value is 0.00938765319380357 with parameters: {'num_trees': 300, 'depth': 3, 'learning_rate': 0.010956609120136586, 'l2_leaf_reg': 38.506823011138394, 'bagging_temperature': 0.6037570674044999, 

[I 2019-05-26 09:56:49,920] Finished a trial resulted in value: 0.18811554111929274. Current best value is 0.00938765319380357 with parameters: {'num_trees': 300, 'depth': 3, 'learning_rate': 0.010956609120136586, 'l2_leaf_reg': 38.506823011138394, 'bagging_temperature': 0.6037570674044999, 'random_strength': 0.4705474685925809, 'random_state': 112}.
[I 2019-05-26 09:57:11,820] Finished a trial resulted in value: 0.044643327729764044. Current best value is 0.00938765319380357 with parameters: {'num_trees': 300, 'depth': 3, 'learning_rate': 0.010956609120136586, 'l2_leaf_reg': 38.506823011138394, 'bagging_temperature': 0.6037570674044999, 'random_strength': 0.4705474685925809, 'random_state': 112}.
[I 2019-05-26 09:57:18,103] Finished a trial resulted in value: 0.044730073423936575. Current best value is 0.00938765319380357 with parameters: {'num_trees': 300, 'depth': 3, 'learning_rate': 0.010956609120136586, 'l2_leaf_reg': 38.506823011138394, 'bagging_temperature': 0.6037570674044999, 

[I 2019-05-26 10:05:24,895] Finished a trial resulted in value: 0.13716831484610867. Current best value is 0.00938765319380357 with parameters: {'num_trees': 300, 'depth': 3, 'learning_rate': 0.010956609120136586, 'l2_leaf_reg': 38.506823011138394, 'bagging_temperature': 0.6037570674044999, 'random_strength': 0.4705474685925809, 'random_state': 112}.
[I 2019-05-26 10:05:30,169] Finished a trial resulted in value: 0.1266095485182739. Current best value is 0.00938765319380357 with parameters: {'num_trees': 300, 'depth': 3, 'learning_rate': 0.010956609120136586, 'l2_leaf_reg': 38.506823011138394, 'bagging_temperature': 0.6037570674044999, 'random_strength': 0.4705474685925809, 'random_state': 112}.
[I 2019-05-26 10:05:36,914] Finished a trial resulted in value: 0.09709319863642847. Current best value is 0.00938765319380357 with parameters: {'num_trees': 300, 'depth': 3, 'learning_rate': 0.010956609120136586, 'l2_leaf_reg': 38.506823011138394, 'bagging_temperature': 0.6037570674044999, 'ra

[I 2019-05-26 10:11:00,169] Finished a trial resulted in value: 0.13913448757374816. Current best value is 0.00938765319380357 with parameters: {'num_trees': 300, 'depth': 3, 'learning_rate': 0.010956609120136586, 'l2_leaf_reg': 38.506823011138394, 'bagging_temperature': 0.6037570674044999, 'random_strength': 0.4705474685925809, 'random_state': 112}.
[I 2019-05-26 10:11:04,585] Finished a trial resulted in value: 0.01711259229839405. Current best value is 0.00938765319380357 with parameters: {'num_trees': 300, 'depth': 3, 'learning_rate': 0.010956609120136586, 'l2_leaf_reg': 38.506823011138394, 'bagging_temperature': 0.6037570674044999, 'random_strength': 0.4705474685925809, 'random_state': 112}.
[I 2019-05-26 10:11:17,323] Finished a trial resulted in value: 0.20958127410516839. Current best value is 0.00938765319380357 with parameters: {'num_trees': 300, 'depth': 3, 'learning_rate': 0.010956609120136586, 'l2_leaf_reg': 38.506823011138394, 'bagging_temperature': 0.6037570674044999, 'r

[I 2019-05-26 10:15:49,252] Finished a trial resulted in value: 0.04529548961423392. Current best value is 0.00938765319380357 with parameters: {'num_trees': 300, 'depth': 3, 'learning_rate': 0.010956609120136586, 'l2_leaf_reg': 38.506823011138394, 'bagging_temperature': 0.6037570674044999, 'random_strength': 0.4705474685925809, 'random_state': 112}.
[I 2019-05-26 10:15:55,206] Finished a trial resulted in value: 0.01316784410883439. Current best value is 0.00938765319380357 with parameters: {'num_trees': 300, 'depth': 3, 'learning_rate': 0.010956609120136586, 'l2_leaf_reg': 38.506823011138394, 'bagging_temperature': 0.6037570674044999, 'random_strength': 0.4705474685925809, 'random_state': 112}.
[I 2019-05-26 10:15:59,542] Finished a trial resulted in value: 0.02957824885228644. Current best value is 0.00938765319380357 with parameters: {'num_trees': 300, 'depth': 3, 'learning_rate': 0.010956609120136586, 'l2_leaf_reg': 38.506823011138394, 'bagging_temperature': 0.6037570674044999, 'r

[I 2019-05-26 10:18:52,080] Finished a trial resulted in value: 0.01538783950536878. Current best value is 0.00938765319380357 with parameters: {'num_trees': 300, 'depth': 3, 'learning_rate': 0.010956609120136586, 'l2_leaf_reg': 38.506823011138394, 'bagging_temperature': 0.6037570674044999, 'random_strength': 0.4705474685925809, 'random_state': 112}.
[I 2019-05-26 10:18:56,484] Finished a trial resulted in value: 0.14583854751060374. Current best value is 0.00938765319380357 with parameters: {'num_trees': 300, 'depth': 3, 'learning_rate': 0.010956609120136586, 'l2_leaf_reg': 38.506823011138394, 'bagging_temperature': 0.6037570674044999, 'random_strength': 0.4705474685925809, 'random_state': 112}.
[I 2019-05-26 10:19:12,593] Finished a trial resulted in value: 0.14430465356050992. Current best value is 0.00938765319380357 with parameters: {'num_trees': 300, 'depth': 3, 'learning_rate': 0.010956609120136586, 'l2_leaf_reg': 38.506823011138394, 'bagging_temperature': 0.6037570674044999, 'r

[I 2019-05-26 10:23:23,642] Finished a trial resulted in value: 0.05032932175053727. Current best value is 0.00938765319380357 with parameters: {'num_trees': 300, 'depth': 3, 'learning_rate': 0.010956609120136586, 'l2_leaf_reg': 38.506823011138394, 'bagging_temperature': 0.6037570674044999, 'random_strength': 0.4705474685925809, 'random_state': 112}.
[I 2019-05-26 10:23:30,313] Finished a trial resulted in value: 0.08376019935512825. Current best value is 0.00938765319380357 with parameters: {'num_trees': 300, 'depth': 3, 'learning_rate': 0.010956609120136586, 'l2_leaf_reg': 38.506823011138394, 'bagging_temperature': 0.6037570674044999, 'random_strength': 0.4705474685925809, 'random_state': 112}.
[I 2019-05-26 10:23:36,780] Finished a trial resulted in value: 0.019709003775725556. Current best value is 0.00938765319380357 with parameters: {'num_trees': 300, 'depth': 3, 'learning_rate': 0.010956609120136586, 'l2_leaf_reg': 38.506823011138394, 'bagging_temperature': 0.6037570674044999, '

[I 2019-05-26 10:33:09,176] Finished a trial resulted in value: 0.025843118439129127. Current best value is 0.00938765319380357 with parameters: {'num_trees': 300, 'depth': 3, 'learning_rate': 0.010956609120136586, 'l2_leaf_reg': 38.506823011138394, 'bagging_temperature': 0.6037570674044999, 'random_strength': 0.4705474685925809, 'random_state': 112}.
[I 2019-05-26 10:33:14,604] Finished a trial resulted in value: 0.07802496237462245. Current best value is 0.00938765319380357 with parameters: {'num_trees': 300, 'depth': 3, 'learning_rate': 0.010956609120136586, 'l2_leaf_reg': 38.506823011138394, 'bagging_temperature': 0.6037570674044999, 'random_strength': 0.4705474685925809, 'random_state': 112}.
[I 2019-05-26 10:33:25,426] Finished a trial resulted in value: 0.06506843581301804. Current best value is 0.00938765319380357 with parameters: {'num_trees': 300, 'depth': 3, 'learning_rate': 0.010956609120136586, 'l2_leaf_reg': 38.506823011138394, 'bagging_temperature': 0.6037570674044999, '

[I 2019-05-26 10:38:39,631] Finished a trial resulted in value: 0.1858800488517438. Current best value is 0.00938765319380357 with parameters: {'num_trees': 300, 'depth': 3, 'learning_rate': 0.010956609120136586, 'l2_leaf_reg': 38.506823011138394, 'bagging_temperature': 0.6037570674044999, 'random_strength': 0.4705474685925809, 'random_state': 112}.
[I 2019-05-26 10:38:47,145] Finished a trial resulted in value: 0.16307197385892655. Current best value is 0.00938765319380357 with parameters: {'num_trees': 300, 'depth': 3, 'learning_rate': 0.010956609120136586, 'l2_leaf_reg': 38.506823011138394, 'bagging_temperature': 0.6037570674044999, 'random_strength': 0.4705474685925809, 'random_state': 112}.
[I 2019-05-26 10:38:55,085] Finished a trial resulted in value: 0.05170535847202846. Current best value is 0.00938765319380357 with parameters: {'num_trees': 300, 'depth': 3, 'learning_rate': 0.010956609120136586, 'l2_leaf_reg': 38.506823011138394, 'bagging_temperature': 0.6037570674044999, 'ra

[I 2019-05-26 10:40:55,617] Finished a trial resulted in value: 0.024858068816671793. Current best value is 0.007041661593464543 with parameters: {'num_trees': 277, 'depth': 2, 'learning_rate': 0.010700545312910049, 'l2_leaf_reg': 49.51554158350584, 'bagging_temperature': 0.8818463476313153, 'random_strength': 0.9371499353058853, 'random_state': 5917}.
[I 2019-05-26 10:41:03,401] Finished a trial resulted in value: 0.008410836644546013. Current best value is 0.007041661593464543 with parameters: {'num_trees': 277, 'depth': 2, 'learning_rate': 0.010700545312910049, 'l2_leaf_reg': 49.51554158350584, 'bagging_temperature': 0.8818463476313153, 'random_strength': 0.9371499353058853, 'random_state': 5917}.
[I 2019-05-26 10:41:14,616] Finished a trial resulted in value: 0.021902965071565233. Current best value is 0.007041661593464543 with parameters: {'num_trees': 277, 'depth': 2, 'learning_rate': 0.010700545312910049, 'l2_leaf_reg': 49.51554158350584, 'bagging_temperature': 0.881846347631315

[I 2019-05-26 10:44:32,795] Finished a trial resulted in value: 0.08520419037648985. Current best value is 0.007041661593464543 with parameters: {'num_trees': 277, 'depth': 2, 'learning_rate': 0.010700545312910049, 'l2_leaf_reg': 49.51554158350584, 'bagging_temperature': 0.8818463476313153, 'random_strength': 0.9371499353058853, 'random_state': 5917}.
[I 2019-05-26 10:44:44,964] Finished a trial resulted in value: 0.010751594117470037. Current best value is 0.007041661593464543 with parameters: {'num_trees': 277, 'depth': 2, 'learning_rate': 0.010700545312910049, 'l2_leaf_reg': 49.51554158350584, 'bagging_temperature': 0.8818463476313153, 'random_strength': 0.9371499353058853, 'random_state': 5917}.
[I 2019-05-26 10:44:50,845] Finished a trial resulted in value: 0.08604434808315213. Current best value is 0.007041661593464543 with parameters: {'num_trees': 277, 'depth': 2, 'learning_rate': 0.010700545312910049, 'l2_leaf_reg': 49.51554158350584, 'bagging_temperature': 0.8818463476313153,

[I 2019-05-26 10:50:55,614] Finished a trial resulted in value: 0.08311932174566974. Current best value is 0.007041661593464543 with parameters: {'num_trees': 277, 'depth': 2, 'learning_rate': 0.010700545312910049, 'l2_leaf_reg': 49.51554158350584, 'bagging_temperature': 0.8818463476313153, 'random_strength': 0.9371499353058853, 'random_state': 5917}.
[I 2019-05-26 10:52:07,407] Finished a trial resulted in value: 0.38978155332299647. Current best value is 0.007041661593464543 with parameters: {'num_trees': 277, 'depth': 2, 'learning_rate': 0.010700545312910049, 'l2_leaf_reg': 49.51554158350584, 'bagging_temperature': 0.8818463476313153, 'random_strength': 0.9371499353058853, 'random_state': 5917}.
[I 2019-05-26 10:53:28,456] Finished a trial resulted in value: 0.03947089583157397. Current best value is 0.007041661593464543 with parameters: {'num_trees': 277, 'depth': 2, 'learning_rate': 0.010700545312910049, 'l2_leaf_reg': 49.51554158350584, 'bagging_temperature': 0.8818463476313153, 

[I 2019-05-26 11:00:00,367] Finished a trial resulted in value: 0.018484245523539642. Current best value is 0.007041661593464543 with parameters: {'num_trees': 277, 'depth': 2, 'learning_rate': 0.010700545312910049, 'l2_leaf_reg': 49.51554158350584, 'bagging_temperature': 0.8818463476313153, 'random_strength': 0.9371499353058853, 'random_state': 5917}.
[I 2019-05-26 11:00:16,005] Finished a trial resulted in value: 0.1162573988531824. Current best value is 0.007041661593464543 with parameters: {'num_trees': 277, 'depth': 2, 'learning_rate': 0.010700545312910049, 'l2_leaf_reg': 49.51554158350584, 'bagging_temperature': 0.8818463476313153, 'random_strength': 0.9371499353058853, 'random_state': 5917}.
[I 2019-05-26 11:00:23,491] Finished a trial resulted in value: 0.14472542283587678. Current best value is 0.007041661593464543 with parameters: {'num_trees': 277, 'depth': 2, 'learning_rate': 0.010700545312910049, 'l2_leaf_reg': 49.51554158350584, 'bagging_temperature': 0.8818463476313153, 

[I 2019-05-26 11:08:22,350] Finished a trial resulted in value: 0.007351680007709647. Current best value is 0.007041661593464543 with parameters: {'num_trees': 277, 'depth': 2, 'learning_rate': 0.010700545312910049, 'l2_leaf_reg': 49.51554158350584, 'bagging_temperature': 0.8818463476313153, 'random_strength': 0.9371499353058853, 'random_state': 5917}.
[I 2019-05-26 11:08:26,343] Finished a trial resulted in value: 0.03474220506693873. Current best value is 0.007041661593464543 with parameters: {'num_trees': 277, 'depth': 2, 'learning_rate': 0.010700545312910049, 'l2_leaf_reg': 49.51554158350584, 'bagging_temperature': 0.8818463476313153, 'random_strength': 0.9371499353058853, 'random_state': 5917}.
[I 2019-05-26 11:08:29,939] Finished a trial resulted in value: 0.0403308903431752. Current best value is 0.007041661593464543 with parameters: {'num_trees': 277, 'depth': 2, 'learning_rate': 0.010700545312910049, 'l2_leaf_reg': 49.51554158350584, 'bagging_temperature': 0.8818463476313153, 

[I 2019-05-26 11:12:11,431] Finished a trial resulted in value: 0.022143335476996175. Current best value is 0.007041661593464543 with parameters: {'num_trees': 277, 'depth': 2, 'learning_rate': 0.010700545312910049, 'l2_leaf_reg': 49.51554158350584, 'bagging_temperature': 0.8818463476313153, 'random_strength': 0.9371499353058853, 'random_state': 5917}.
[I 2019-05-26 11:12:17,340] Finished a trial resulted in value: 0.08197311757121124. Current best value is 0.007041661593464543 with parameters: {'num_trees': 277, 'depth': 2, 'learning_rate': 0.010700545312910049, 'l2_leaf_reg': 49.51554158350584, 'bagging_temperature': 0.8818463476313153, 'random_strength': 0.9371499353058853, 'random_state': 5917}.
[I 2019-05-26 11:12:21,649] Finished a trial resulted in value: 0.011447909435647632. Current best value is 0.007041661593464543 with parameters: {'num_trees': 277, 'depth': 2, 'learning_rate': 0.010700545312910049, 'l2_leaf_reg': 49.51554158350584, 'bagging_temperature': 0.8818463476313153

[I 2019-05-26 11:14:45,150] Finished a trial resulted in value: 0.024249746310452198. Current best value is 0.006558352158962393 with parameters: {'num_trees': 253, 'depth': 3, 'learning_rate': 0.010139034886322859, 'l2_leaf_reg': 39.12741233733978, 'bagging_temperature': 0.8511291887341198, 'random_strength': 0.8827521387514722, 'random_state': 5647}.
[I 2019-05-26 11:14:49,990] Finished a trial resulted in value: 0.11242593247300377. Current best value is 0.006558352158962393 with parameters: {'num_trees': 253, 'depth': 3, 'learning_rate': 0.010139034886322859, 'l2_leaf_reg': 39.12741233733978, 'bagging_temperature': 0.8511291887341198, 'random_strength': 0.8827521387514722, 'random_state': 5647}.
[I 2019-05-26 11:14:55,617] Finished a trial resulted in value: 0.03734700281718171. Current best value is 0.006558352158962393 with parameters: {'num_trees': 253, 'depth': 3, 'learning_rate': 0.010139034886322859, 'l2_leaf_reg': 39.12741233733978, 'bagging_temperature': 0.8511291887341198,

[I 2019-05-26 11:16:30,362] Finished a trial resulted in value: 0.025472412253628465. Current best value is 0.005724265735481934 with parameters: {'num_trees': 209, 'depth': 2, 'learning_rate': 0.010138649276739345, 'l2_leaf_reg': 54.37279201711011, 'bagging_temperature': 0.7680211705992602, 'random_strength': 0.9520605974959968, 'random_state': 3504}.
[I 2019-05-26 11:16:35,458] Finished a trial resulted in value: 0.12755266653411784. Current best value is 0.005724265735481934 with parameters: {'num_trees': 209, 'depth': 2, 'learning_rate': 0.010138649276739345, 'l2_leaf_reg': 54.37279201711011, 'bagging_temperature': 0.7680211705992602, 'random_strength': 0.9520605974959968, 'random_state': 3504}.
[I 2019-05-26 11:16:39,164] Finished a trial resulted in value: 0.010830323709611754. Current best value is 0.005724265735481934 with parameters: {'num_trees': 209, 'depth': 2, 'learning_rate': 0.010138649276739345, 'l2_leaf_reg': 54.37279201711011, 'bagging_temperature': 0.7680211705992602

[I 2019-05-26 11:18:35,815] Finished a trial resulted in value: 0.08593662709446848. Current best value is 0.005724265735481934 with parameters: {'num_trees': 209, 'depth': 2, 'learning_rate': 0.010138649276739345, 'l2_leaf_reg': 54.37279201711011, 'bagging_temperature': 0.7680211705992602, 'random_strength': 0.9520605974959968, 'random_state': 3504}.
[I 2019-05-26 11:18:39,939] Finished a trial resulted in value: 0.09466028238407502. Current best value is 0.005724265735481934 with parameters: {'num_trees': 209, 'depth': 2, 'learning_rate': 0.010138649276739345, 'l2_leaf_reg': 54.37279201711011, 'bagging_temperature': 0.7680211705992602, 'random_strength': 0.9520605974959968, 'random_state': 3504}.
[I 2019-05-26 11:18:44,351] Finished a trial resulted in value: 0.03612605410589337. Current best value is 0.005724265735481934 with parameters: {'num_trees': 209, 'depth': 2, 'learning_rate': 0.010138649276739345, 'l2_leaf_reg': 54.37279201711011, 'bagging_temperature': 0.7680211705992602, 

[I 2019-05-26 11:20:40,873] Finished a trial resulted in value: 0.023160360189254717. Current best value is 0.005724265735481934 with parameters: {'num_trees': 209, 'depth': 2, 'learning_rate': 0.010138649276739345, 'l2_leaf_reg': 54.37279201711011, 'bagging_temperature': 0.7680211705992602, 'random_strength': 0.9520605974959968, 'random_state': 3504}.
[I 2019-05-26 11:20:45,704] Finished a trial resulted in value: 0.0537270510589914. Current best value is 0.005724265735481934 with parameters: {'num_trees': 209, 'depth': 2, 'learning_rate': 0.010138649276739345, 'l2_leaf_reg': 54.37279201711011, 'bagging_temperature': 0.7680211705992602, 'random_strength': 0.9520605974959968, 'random_state': 3504}.
[I 2019-05-26 11:20:55,250] Finished a trial resulted in value: 0.21931996998485215. Current best value is 0.005724265735481934 with parameters: {'num_trees': 209, 'depth': 2, 'learning_rate': 0.010138649276739345, 'l2_leaf_reg': 54.37279201711011, 'bagging_temperature': 0.7680211705992602, 

In [64]:
# Copy every trial record accumulated in the in-memory `mytrial` list into the
# DFDB store, then reload the full trial table from the store.
# NOTE(review): `db` was opened with auto_commit=False, so these inserts are
# presumably not written to disk until `db.commit()` is called — confirm
# against the DFDB implementation.
for trial_i in mytrial:
    db.insert(trial_i)
df_trial = db.select()

In [68]:
# Trials whose validation/train MAE gap is below 0.05, best validation MAE first.
(
    df_trial
    .loc[
        df_trial['mae_diff'] < .05,
        ['datetime', 'remark', 'nfeatures', 'train_mae', 'train_mae_var',
         'val_mae', 'val_mae_var', 'mae_diff'],
    ]
    .sort_values(by=['val_mae'])
)

Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
12,2019-05-13 07:55:51.671913,catboost342|xgbm865|lgbm1398|randomforest122|extratrees250|gradientboosting83->catboost,36,1.714399,1.467609e-05,1.755442,0.0001619534,0.041044
8,2019-05-13 07:16:06.946028,catboost342|xgboost865|lgbm1398->catboost,50,1.727243,2.806551e-05,1.772981,0.0001920496,0.045737
14,2019-05-13 13:28:10.131911,,6,1.852499,0.02352576,1.77646,0.483025,-0.076038
13,2019-05-13 13:25:41.305146,,6,1.852499,0.02352576,1.77646,0.483025,-0.076038
15,2019-05-13 13:32:23.433123,,6,1.852499,0.02352576,1.77646,0.483025,-0.076038
16,2019-05-13 13:32:48.530420,,5,1.853104,0.02298058,1.777866,0.4797556,-0.075238
1093,2019-05-25 11:08:52.335262,tune 562,33,1.736854,1.087594e-05,1.783753,0.0002964615,0.046898
562,2019-05-19 20:38:58.981647,"add mlp,rgf tune stratified",43,1.734277,1.430581e-05,1.784253,0.0002672915,0.049976
7,2019-05-13 06:52:23.907165,catboost342|xgboost865|lgbm1398->catboost,33,1.744401,1.587228e-05,1.784729,0.000219964,0.040328
506,2019-05-19 19:32:11.848462,"add mlp,rgf tune stratified",43,1.739045,1.455078e-05,1.786425,0.0004765935,0.04738


In [70]:
# Commit the pending changes on the DFDB handle (it was opened with
# auto_commit=False, so changes are not committed automatically).
db.commit()

In [69]:
# Write the submission file for one stacked trial.
idx = 1647  # row label of the chosen trial in df_trial
df_test_pred = df_trial.loc[idx]['df_test_pred']
# Row-wise mean over every prediction column (all columns except 'index';
# presumably one column per CV fold — confirm against EP.process).
# Both columns are handed over as plain arrays so they are paired by position.
# Assigning the 'index' Series to a frame that already owns a RangeIndex would
# align on labels instead and silently produce NaN / shuffled seg_ids whenever
# df_test_pred does not carry a default 0..n-1 index.
df_submit = pd.DataFrame(
    {
        'time_to_failure': np.mean(df_test_pred.drop(columns=['index']).values, axis=1),
        'seg_id': df_test_pred['index'].values,
    },
    columns=['time_to_failure', 'seg_id'],
)
df_submit.to_csv('submission_mystacknet_{}.csv'.format(idx), index=False)

In [57]:
# param = {'columns': name_lst+catboost_columns,
#  'kfold': {'n_splits': 3,
#   'random_state': 1985,
#   'shuffle': True,
#   'type': 'stratified'},#stratified
#  'scaler': {'cls': 'StandardScaler'},
#  'algorithm': 
         
# # {'cls': 'xgb.XGBRegressor',
# #  'init': {'max_depth': 3,
# #   'max_bin': 38,
# #   'eta': 0.27801915385245873,
# #   'colsample_bytree': 0.9416983653127328,
# #   'min_child_weight': 238,
# #   'n_estimators': 165,
# #   'subsample': 0.7471829960670435,
# #   'reg_lambda': 0.6813060508093833,
# #   'reg_alpha': 0.36085980027529035,
# #   'n_jobs': 16},
# #  'fit': {'eval_metric': 'mae', 'verbose': False, 'early_stopping_rounds': 200}},
         
#          {'cls': 'cb.CatBoostRegressor',
#   'init': {'num_trees': 589,
#    'depth': 6,
#    'learning_rate': 0.05293979792364842,
#    'l2_leaf_reg': 78.065140245968,
#    'bagging_temperature': 0.9302786271852079,
#    'random_strength': 0.4247048326178351,
#    'random_state': 651},
#   'fit': {}},
         
# #          {'cls': 'lgb.LGBMRegressor',
# #  'init': {'learning_rate': 0.17076106120259138,
# #   'feature_fraction': 0.6842101917408698,
# #   'bagging_fraction': 0.8986268312800509,
# #   'min_data_in_leaf': 243,
# #   'lambda_l1': 4.612300279009062,
# #   'lambda_l2': 97.21686371760525,
# #   'max_bin': 28,
# #   'num_leaves': 11,
# #   'random_state': 6805,
# #   'n_jobs': 32},
# #  'fit': {'eval_metric': 'mae', 'verbose': False, 'early_stopping_rounds': 200}}
         
# }

In [62]:
#  tune hyperparameters
def objective(trial):
    """Optuna objective: train one CatBoost model on the stacked features and score it.

    Reads the notebook globals ``name_lst``, ``catboost_columns``,
    ``df_train_stacknet``, ``df_test_stacknet`` and ``mytrial``; they must be
    defined before the study is run.

    Parameters
    ----------
    trial
        Optuna trial used to sample the CatBoost hyperparameters and to store
        the diagnostics below as user attributes.

    Returns
    -------
    float
        ``abs(val_mae - train_mae) * val_mae``: the train/validation gap
        weighted by the validation error itself.

    Side effects
    ------------
    Stores ``val_mae``, ``train_mae``, ``mae_diff``, ``val_mae_var`` and
    ``train_mae_var`` on the trial. ``mytrial`` is passed to ``EP.process`` as
    its ``trial`` argument (presumably appended to there — confirm in EP).
    """
    # Sampling order is kept fixed so that seeded samplers stay comparable.
    init_params = {
        "num_trees": trial.suggest_int('num_trees', 200, 1000),
        "depth": trial.suggest_int('depth', 2, 10),
        "learning_rate": trial.suggest_uniform('learning_rate', 0.01, 0.4),
        "l2_leaf_reg": trial.suggest_uniform('l2_leaf_reg', 0.001, 100),
        "bagging_temperature": trial.suggest_uniform('bagging_temperature', .6, 1),
        "random_strength": trial.suggest_uniform('random_strength', .001, 1),
        "random_state": trial.suggest_int('random_state', 1, 9999),
    }

    args = {
        'columns': name_lst + catboost_columns,
        'kfold': {
            'n_splits': 3,
            'random_state': 1985,
            'shuffle': True,
            'type': 'stratified'
        },
        'scaler': {
            'cls': 'StandardScaler',
        },
        'algorithm': {
            'cls': 'cb.CatBoostRegressor',
            'init': init_params,
            'fit': {
            },
        },
    }

    # Only the history frame is needed here; the other three results
    # (feature importances, validation and test predictions) are ignored.
    df_his, _, _, _ = EP.process(df_train_stacknet, args, df_test = df_test_stacknet, trial=mytrial, remark='add mlp,rgf tune stratified ')
    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)
    mae_diff = val_mae_mean - train_mae_mean

    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', mae_diff)
    trial.set_user_attr('val_mae_var', val_mae_var)
    # Previously computed but dropped; keep it next to its validation twin.
    trial.set_user_attr('train_mae_var', train_mae_var)

    return np.abs(mae_diff) * val_mae_mean

# Create a study with optuna's default settings and evaluate `objective`
# for 200 trials. Each trial triggers a full EP.process run, so this cell is
# long-running and emits one log line per finished trial.
study = optuna.create_study()
study.optimize(objective, n_trials=200)

In [63]:
# Rebuild the trial table from the in-memory trial list and expose the k-fold
# type stored inside each trial's 'param' dict as its own column.
df_trial = pd.DataFrame(mytrial)
df_trial['kfold-type'] = [p['kfold']['type'] for p in df_trial['param']]

# Five best (lowest val_mae) trials of the stratified tuning run whose
# validation/train MAE gap stays below 0.05.
(
    df_trial
    .loc[
        (df_trial['remark'] == 'add mlp,rgf tune stratified ')
        & (df_trial['mae_diff'] < .05)
    ]
    .sort_values(by=['val_mae'])
    .loc[:, ['datetime', 'remark', 'kfold-type', 'nfeatures', 'train_mae',
             'train_mae_var', 'val_mae', 'val_mae_var', 'mae_diff']]
    .head()
)

Unnamed: 0,datetime,remark,kfold-type,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
562,2019-05-19 20:38:58.981647,"add mlp,rgf tune stratified",stratified,43,1.734277,1.430581e-05,1.784253,0.000267,0.049976
506,2019-05-19 19:32:11.848462,"add mlp,rgf tune stratified",stratified,43,1.739045,1.455078e-05,1.786425,0.000477,0.04738
499,2019-05-19 19:29:55.797143,"add mlp,rgf tune stratified",stratified,43,1.738286,8.44112e-06,1.787195,0.000318,0.048909
554,2019-05-19 20:33:16.144706,"add mlp,rgf tune stratified",stratified,43,1.752076,3.836694e-07,1.792045,0.00032,0.039969
488,2019-05-19 19:01:01.028597,"add mlp,rgf tune stratified",stratified,43,1.75198,4.276654e-06,1.792814,0.000426,0.040834


In [64]:
# Write the submission file for one stacked trial.
idx = 562  # row label of the chosen trial in df_trial
df_test_pred = df_trial.loc[idx]['df_test_pred']
# Row-wise mean over every prediction column (all columns except 'index';
# presumably one column per CV fold — confirm against EP.process).
# Both columns are handed over as plain arrays so they are paired by position.
# Assigning the 'index' Series to a frame that already owns a RangeIndex would
# align on labels instead and silently produce NaN / shuffled seg_ids whenever
# df_test_pred does not carry a default 0..n-1 index.
df_submit = pd.DataFrame(
    {
        'time_to_failure': np.mean(df_test_pred.drop(columns=['index']).values, axis=1),
        'seg_id': df_test_pred['index'].values,
    },
    columns=['time_to_failure', 'seg_id'],
)
df_submit.to_csv('submission_mystacknet_{}.csv'.format(idx), index=False)

In [67]:
# group
# Base models feeding the stack. Each entry pairs the column name used in the
# stacked frame with the trial row selected from that model family's trial
# table, so a name and its trial cannot drift apart.
base_models = [
    ('cb452', df_trial_catboost.loc[452]),
    ('xgbm1172', df_trial_xgbm.loc[1172]),
    ('lgbm2156', df_trial_lgbm.loc[2156]),
    ('gbm306', df_trial_gradientboosting.loc[306]),
    ('rf297', df_trial_randomforest.loc[297]),
    ('et459', df_trial_extratrees.loc[459]),
    ('knn17', df_trial_knn.loc[17]),
    ('svr7', df_trial_svr.loc[7]),
    ('fm313', df_trial_fm.loc[313]),
    ('lasso8', df_trial_lasso.loc[8]),
    ('ridge15', df_trial_ridge.loc[15]),
    ('mlp17', df_trial_mlp.loc[17]),
    ('frgf170', df_trial_frgf.loc[170]),
]
trial_lst = [row for _, row in base_models]
name_lst = [name for name, _ in base_models]

# Assemble the stacked train/test frames: one column per base model, holding
# its out-of-fold ('df_valid_pred') and averaged test ('df_test_pred')
# predictions, then attach the targets and the extra catboost feature columns.
#
# Every prediction vector is keyed by its own 'index' column and re-ordered
# onto the row order of the first model that loads. This removes the hidden
# assumption that all base models stored their rows in the same order: before,
# train columns were aligned by pandas index, test columns purely by position,
# and the 'index' column came from whichever model happened to load last.
train_cols, test_cols = {}, {}
train_ids = test_ids = None
for df_, name_ in zip(trial_lst, name_lst):
    try:
        df_test_stacknet_i = df_['df_test_pred']
        test_pred_i = pd.Series(
            np.mean(df_test_stacknet_i.drop(columns=['index']).values, axis=1),
            index=df_test_stacknet_i['index'].values,
        )

        df_train_stacknet_i = df_['df_valid_pred']
        train_pred_i = df_train_stacknet_i.set_index('index')['predict']

        if train_ids is None:
            # The first successfully loaded model defines the row order.
            train_ids = train_pred_i.index.values
            test_ids = test_pred_i.index.values

        # Compute both vectors before storing either, so a failing model never
        # leaves a test column without its matching train column.
        train_values = train_pred_i.reindex(train_ids).values
        test_values = test_pred_i.reindex(test_ids).values
        train_cols[name_] = train_values
        test_cols[name_] = test_values

    except Exception as e:
        # Best effort: skip the model, but say why instead of hiding the error.
        print(name_, 'exception', repr(e))

if train_ids is None:
    raise ValueError('no base-model predictions could be loaded')

df_train_stacknet = pd.DataFrame(train_cols, columns=list(train_cols))
df_train_stacknet['index'] = train_ids
df_train_stacknet = pd.merge(df_train_stacknet, df_train[['y','index', 'group','label']+catboost_columns], on='index')
df_test_stacknet = pd.DataFrame(test_cols, columns=list(test_cols))
df_test_stacknet['index'] = test_ids
df_test_stacknet = pd.merge(df_test_stacknet, df_test[['index']+catboost_columns], on='index')

In [61]:
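# #  tune hyperparameters
# NOTE(review): disabled variant of the objective cell; it differs only in the
# remark ('add mlp,rgf tune group '), while the kfold type below is still
# 'stratified' — confirm whether a group k-fold was intended.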
# def objective(trial):
        
#     num_trees = trial.suggest_int('num_trees', 200, 1000)
#     depth = trial.suggest_int('depth', 2, 10)
#     learning_rate = trial.suggest_uniform('learning_rate', 0.01, 0.4)
#     l2_leaf_reg = trial.suggest_uniform('l2_leaf_reg', 0.001, 100)
#     bagging_temperature = trial.suggest_uniform('bagging_temperature', .6, 1)
#     random_strength = trial.suggest_uniform('random_strength', .001, 1)
#     random_state = trial.suggest_int('random_state', 1, 9999)
        
#     args={
#         'columns':name_lst+catboost_columns,
#         'kfold':{
#             'n_splits': 3,
#             'random_state': 1985,
#             'shuffle': True,
#             'type': 'stratified'
#         },
#         'scaler':{
#             'cls':'StandardScaler',
#         },
#         'algorithm':{
#             'cls':'cb.CatBoostRegressor',
#             'init':{
#                 "num_trees":num_trees,
#                 "depth":depth,
#                 "learning_rate":learning_rate,
#                 "l2_leaf_reg":l2_leaf_reg,
#                 "bagging_temperature":bagging_temperature,
#                 "random_strength":random_strength,
#                 "random_state":random_state,
#             },
#             'fit':{
#             },
#         },
#     }
    
#     df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train_stacknet, args, df_test = df_test_stacknet, trial=mytrial, remark='add mlp,rgf tune group ')
#     val_mae_mean = np.mean(df_his.valid)
#     val_mae_var = np.var(df_his.valid)
#     train_mae_mean = np.mean(df_his.train)
#     train_mae_var = np.var(df_his.train)
    
#     trial.set_user_attr('val_mae', val_mae_mean)
#     trial.set_user_attr('train_mae', train_mae_mean)
#     trial.set_user_attr('mae_diff', val_mae_mean-train_mae_mean)
#     trial.set_user_attr('val_mae_var', val_mae_var)

#     return np.abs(val_mae_mean - train_mae_mean)*val_mae_mean

# study = optuna.create_study()
# study.optimize(objective, n_trials=200)

In [75]:
# Rebuild the trial table from the in-memory trial list and expose the k-fold
# type stored inside each trial's 'param' dict as its own column.
df_trial = pd.DataFrame(mytrial)
df_trial['kfold-type'] = [p['kfold']['type'] for p in df_trial['param']]

# Five best (lowest val_mae) trials tagged with the "tune group" remark whose
# validation/train MAE gap stays below 0.05.
(
    df_trial
    .loc[
        (df_trial['remark'] == 'add mlp,rgf tune group ')
        & (df_trial['mae_diff'] < .05)
    ]
    .sort_values(by=['val_mae'])
    .loc[:, ['datetime', 'remark', 'kfold-type', 'nfeatures', 'train_mae',
             'train_mae_var', 'val_mae', 'val_mae_var', 'mae_diff']]
    .head()
)

Unnamed: 0,datetime,remark,kfold-type,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
771,2019-05-20 02:37:44.638683,"add mlp,rgf tune group",stratified,43,1.740836,5.8e-05,1.789388,9.4e-05,0.048552
737,2019-05-20 01:55:26.866689,"add mlp,rgf tune group",stratified,43,1.745126,1.6e-05,1.79191,0.000227,0.046784
729,2019-05-20 01:34:58.625995,"add mlp,rgf tune group",stratified,43,1.744614,1.2e-05,1.793368,0.000248,0.048753
730,2019-05-20 01:52:21.969967,"add mlp,rgf tune group",stratified,43,1.749643,3e-06,1.795406,0.000238,0.045763
773,2019-05-20 02:38:42.869441,"add mlp,rgf tune group",stratified,43,1.750654,1.3e-05,1.796782,0.00016,0.046128


In [76]:
# Write the submission file for one stacked trial.
idx = 771  # row label of the chosen trial in df_trial
df_test_pred = df_trial.loc[idx]['df_test_pred']
# Row-wise mean over every prediction column (all columns except 'index';
# presumably one column per CV fold — confirm against EP.process).
# Both columns are handed over as plain arrays so they are paired by position.
# Assigning the 'index' Series to a frame that already owns a RangeIndex would
# align on labels instead and silently produce NaN / shuffled seg_ids whenever
# df_test_pred does not carry a default 0..n-1 index.
df_submit = pd.DataFrame(
    {
        'time_to_failure': np.mean(df_test_pred.drop(columns=['index']).values, axis=1),
        'seg_id': df_test_pred['index'].values,
    },
    columns=['time_to_failure', 'seg_id'],
)
df_submit.to_csv('submission_mystacknet_{}.csv'.format(idx), index=False)

In [77]:
# df_trial.to_pickle('../trial/mystack.pkl')