In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler

import lightgbm as lgb
import catboost as cb

from tsfresh.examples import load_robot_execution_failures
from tsfresh import extract_features, select_features
import optuna

from common import EP
from dfdb import DFDB

import types
import copy

Using TensorFlow backend.


In [2]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [3]:
# Show long tables in full while browsing the trial log.
pd.set_option('display.max_rows', 2000)
# Disable column-width truncation.  `None` is the supported spelling since
# pandas 1.0 (the legacy `-1` is rejected by pandas 2.x); very old releases
# only accept an int, so fall back to `-1` there.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

In [4]:
# Load the train/test frames (presumably pre-computed feature tables, given the
# '../feats' location -- confirm).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df_train = pd.read_pickle('../feats/df_train.pkl')
df_test = pd.read_pickle('../feats/df_test.pkl')

In [5]:
# Integer class label derived from the regression target: values below 15 are
# truncated to int, everything else is collapsed into the single class 15.
df_train['label'] = df_train['y'].apply(lambda x: int(x) if x < 15 else 15)

# CV group id = season, with season 17 folded into group 1.
# np.where builds a fresh array instead of writing through the `.values` view
# of the 'season' column: the in-place write silently modified df_train and
# fails on read-only arrays when pandas Copy-on-Write is active.
season = df_train['season'].values
df_train['group'] = np.where(season == 17, 1, season)

# 'season' is now fully represented by 'group'.
df_train = df_train.drop(columns=['season'])

In [6]:
# Build the feature matrix / target pair for tsfresh's relevance filter.
# NOTE: despite the `test_` prefix, both objects are derived from df_train.
test_X = df_train.drop(columns=['y', 'index', 'group', 'label']).copy()
test_y = df_train['y'].copy()

# Both are re-indexed by the 'index' column so that X and y share one index.
test_X.index = test_y.index = df_train['index']

# Names of the columns kept by tsfresh's select_features.
tsfresh_columns = select_features(test_X, test_y).columns.tolist()

In [7]:
# Every column of df_train except the bookkeeping / target columns.
original_columns = (
    df_train.columns
    .drop(['index', 'y', 'label', 'group'])
    .tolist()
)

In [8]:
# Open the trial store backing this notebook.  With auto_commit=False, inserts
# are presumably only persisted by the explicit db.commit() cells -- confirm in DFDB.
db = DFDB('../trial/catboost.pkl', auto_commit=False)

In [9]:
# Snapshot of all recorded trials.
df_trial = db.select()
# Surface the k-fold configuration stored inside each trial's 'param' dict as
# its own column.
df_trial['kfold'] = df_trial['param'].map(lambda cfg: cfg['kfold'])

In [35]:
# df_trial[['datetime','nfeatures', 'kfold', 'remark', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff','remark']].loc[213:236]

In [17]:
# Re-run feature selection starting from a single column, reusing the settings
# of two previously recorded trials.
# param_idx: trial whose full 'param' dict is reused.
# column_idx: trial whose 'columns' list is used as the candidate feature list.
param_idx = 236
column_idx = 213
# Aliases only: db_ / df_trial_ refer to the same objects as db / df_trial.
db_ = db
df_trial_ = df_trial
# Accumulator handed to revert_rfe; its contents are inserted into the DB below.
mytrial = []
# Deep copies so the edits below do not touch the dicts stored in df_trial_.
columns = copy.deepcopy(df_trial_.loc[column_idx]['param']['columns'])
param = copy.deepcopy(df_trial_.loc[param_idx]['param'])
# Default to silent logging; a 'logging_level' already present in the stored
# init dict wins because it is unpacked last.
param['algorithm']['init'] = {'logging_level': 'Silent', **param['algorithm']['init']}
# Reset scaler constructor kwargs and model fit kwargs to empty.
param['scaler']['init'] = {}
param['algorithm']['fit'] = {}

# NOTE(review): revert_rfe is neither defined nor imported in the visible part
# of this notebook (only `EP` is imported from `common`) -- confirm where it
# comes from, otherwise this cell fails on a fresh kernel.
selected_columns = revert_rfe(df_train, param, columns, df_test, mytrial, start_columns=[columns[0]], limit=20, remark='start from top1 column')
print(len(selected_columns))
# Store every record collected during the run, then reload the trial table.
for trial_i in mytrial:
    db_.insert(trial_i)
df_trial_ = db_.select()
df_trial_['kfold'] = df_trial_['param'].apply(lambda x: x['kfold'])

19


In [33]:
# df_trial_[['datetime','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].tail(200)

In [10]:
# Hard-coded feature list (19 names) used by the tuning cells below; assigning
# it here overrides any `selected_columns` left in the kernel by another cell.
# NOTE(review): the spelling is inconsistent ('spkt_welch_density__coeff_3' vs
# 'spkt_welch_densitycoeff_2', '5000skewness_max_', ...), which looks like
# double underscores lost in an export -- verify every name against
# df_train.columns before relying on this list.
selected_columns= ['spkt_welch_density__coeff_3',
 'spkt_welch_densitycoeff_2',
 'abs_q25_5',
 'abs_q75_6',
 'q05_roll_std_1000',
 'abs_q75_7',
 'abs_q95_2',
 'q05_5',
 'abs_q75_2',
 '5000skewness_max_',
 'fft_coefficientcoeff_80__attr_"imag"',
 "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
 '5000kurtosis_mean_',
 "number_peaks{'n': 1}",
 '5000smoothness_entropy_',
 'ave10_7',
 'q75_roll_std_1000',
 'FFT_Mag_25q0',
 'fft_coefficientcoeff_20__attr_"abs"']

In [22]:
# Commit the pending inserts; db_ is an alias of db, which was opened with
# auto_commit=False.
db_.commit()

In [32]:
# Accumulator handed to EP.process on every call; the following cell inserts
# its contents into the trial DB.
mytrial = []


# Tune hyperparameters.
def objective(trial, n_splits=8, kfold_type='group', remark='tune 794'):
    """Cross-validate one sampled CatBoost configuration and score it.

    Args:
        trial: Optuna trial used to sample the search space.
        n_splits: Number of CV folds placed in the k-fold config.
        kfold_type: Value of the k-fold config's ``'type'`` key.
        remark: Tag forwarded to ``EP.process`` for this run.

    Returns:
        ``abs(val_mae - train_mae) * val_mae`` computed from the per-fold means
        of ``df_his.valid`` / ``df_his.train``.  The study below uses Optuna's
        default direction, so smaller is better.
    """
    # Search space; the sampling order is the same as the key order below.
    # NOTE(review): the model seed is sampled like a hyper-parameter.
    init_params = {
        "num_trees": trial.suggest_int('num_trees', 200, 1000),
        "depth": trial.suggest_int('depth', 2, 10),
        "learning_rate": trial.suggest_uniform('learning_rate', 0.01, 0.4),
        "l2_leaf_reg": trial.suggest_uniform('l2_leaf_reg', 0.0001, 1),
        "bagging_temperature": trial.suggest_uniform('bagging_temperature', .6, 1),
        "random_strength": trial.suggest_uniform('random_strength', .001, 1),
        "random_state": trial.suggest_int('random_state', 1, 9999),
        'logging_level': 'Silent',
    }

    args = {
        'columns': selected_columns,
        'kfold': {
            'n_splits': n_splits,
            'random_state': 1985,
            'shuffle': True,
            'type': kfold_type,
        },
        'scaler': {
            'cls': 'StandardScaler',
            'init': {},
        },
        'algorithm': {
            'cls': 'cb.CatBoostRegressor',
            'init': init_params,
            'fit': {},
        },
    }

    # Only the per-fold history is needed here; the other three return values
    # (feature importances, validation and test predictions) are ignored.
    df_his, _, _, _ = EP.process(df_train, args, df_test=df_test, trial=mytrial, remark=remark)
    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)

    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', val_mae_mean - train_mae_mean)
    trial.set_user_attr('val_mae_var', val_mae_var)
    # Previously computed but dropped; recorded for symmetry with val_mae_var.
    trial.set_user_attr('train_mae_var', train_mae_var)

    # Penalise the train/validation gap, weighted by the validation error.
    return np.abs(val_mae_mean - train_mae_mean) * val_mae_mean


study = optuna.create_study()
study.optimize(objective, n_trials=200)

In [26]:
# Insert every record collected in `mytrial` into the trial DB ...
for trial_i in mytrial:
    db.insert(trial_i)
# ... and reload the trial table so the new rows are visible.
df_trial = db.select()

In [34]:
# Best 'tune 794' trials by validation MAE, restricted to a train/validation
# gap below 0.1.
(
    df_trial
    .loc[(df_trial['remark'] == 'tune 794') & (df_trial['mae_diff'] < .1)]
    .sort_values(by=['val_mae'])
    .loc[:, ['datetime', 'nfeatures', 'train_mae', 'train_mae_var',
             'val_mae', 'val_mae_var', 'mae_diff']]
    .head()
)

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
1042,2019-05-24 13:36:19.086762,19,1.963028,0.011816,2.059582,0.551,0.096554
1058,2019-05-24 13:44:03.330581,19,1.985347,0.012792,2.061683,0.626603,0.076336
974,2019-05-24 12:56:06.356641,19,1.965945,0.011916,2.062453,0.526521,0.096508
1019,2019-05-24 13:26:11.128453,19,1.967497,0.011683,2.06354,0.524612,0.096043
984,2019-05-24 13:05:59.849888,19,1.995593,0.013012,2.065305,0.643432,0.069712


In [30]:
# Commit the pending inserts (db was opened with auto_commit=False).
db.commit()

In [19]:
# Accumulator handed to EP.process on every call; the following cell inserts
# its contents into the trial DB.
mytrial = []


# Tune hyperparameters.
# NOTE(review): this 3-fold run reuses the remark 'tune 794' of the 8-fold
# group run, so the two can only be told apart by their 'kfold' setting.
def objective(trial, n_splits=3, kfold_type='group', remark='tune 794'):
    """Cross-validate one sampled CatBoost configuration and score it.

    Args:
        trial: Optuna trial used to sample the search space.
        n_splits: Number of CV folds placed in the k-fold config.
        kfold_type: Value of the k-fold config's ``'type'`` key.
        remark: Tag forwarded to ``EP.process`` for this run.

    Returns:
        ``abs(val_mae - train_mae) * val_mae`` computed from the per-fold means
        of ``df_his.valid`` / ``df_his.train``.  The study below uses Optuna's
        default direction, so smaller is better.
    """
    # Search space; the sampling order is the same as the key order below.
    # NOTE(review): the model seed is sampled like a hyper-parameter.
    init_params = {
        "num_trees": trial.suggest_int('num_trees', 200, 1000),
        "depth": trial.suggest_int('depth', 2, 10),
        "learning_rate": trial.suggest_uniform('learning_rate', 0.01, 0.4),
        "l2_leaf_reg": trial.suggest_uniform('l2_leaf_reg', 0.0001, 1),
        "bagging_temperature": trial.suggest_uniform('bagging_temperature', .6, 1),
        "random_strength": trial.suggest_uniform('random_strength', .001, 1),
        "random_state": trial.suggest_int('random_state', 1, 9999),
        'logging_level': 'Silent',
    }

    args = {
        'columns': selected_columns,
        'kfold': {
            'n_splits': n_splits,
            'random_state': 1985,
            'shuffle': True,
            'type': kfold_type,
        },
        'scaler': {
            'cls': 'StandardScaler',
            'init': {},
        },
        'algorithm': {
            'cls': 'cb.CatBoostRegressor',
            'init': init_params,
            'fit': {},
        },
    }

    # Only the per-fold history is needed here; the other three return values
    # (feature importances, validation and test predictions) are ignored.
    df_his, _, _, _ = EP.process(df_train, args, df_test=df_test, trial=mytrial, remark=remark)
    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)

    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', val_mae_mean - train_mae_mean)
    trial.set_user_attr('val_mae_var', val_mae_var)
    # Previously computed but dropped; recorded for symmetry with val_mae_var.
    trial.set_user_attr('train_mae_var', train_mae_var)

    # Penalise the train/validation gap, weighted by the validation error.
    return np.abs(val_mae_mean - train_mae_mean) * val_mae_mean


study = optuna.create_study()
study.optimize(objective, n_trials=200)

[I 2019-05-25 05:49:05,201] Finished a trial resulted in value: 1.5236272055920717. Current best value is 1.5236272055920717 with parameters: {'num_trees': 601, 'depth': 8, 'learning_rate': 0.28759298254638427, 'l2_leaf_reg': 0.5508726718244653, 'bagging_temperature': 0.7625018509246404, 'random_strength': 0.4024746540061619, 'random_state': 3897}.
[I 2019-05-25 05:49:30,499] Finished a trial resulted in value: 1.0049562787963247. Current best value is 1.0049562787963247 with parameters: {'num_trees': 752, 'depth': 5, 'learning_rate': 0.21327493777443243, 'l2_leaf_reg': 0.8547807284160341, 'bagging_temperature': 0.6180404094631344, 'random_strength': 0.5258207637716908, 'random_state': 3203}.
[I 2019-05-25 05:53:48,204] Finished a trial resulted in value: 2.0359045823423876. Current best value is 1.0049562787963247 with parameters: {'num_trees': 752, 'depth': 5, 'learning_rate': 0.21327493777443243, 'l2_leaf_reg': 0.8547807284160341, 'bagging_temperature': 0.6180404094631344, 'random_s

[I 2019-05-25 06:16:47,558] Finished a trial resulted in value: 0.6814803796659183. Current best value is 0.11511826033371882 with parameters: {'num_trees': 460, 'depth': 2, 'learning_rate': 0.017188320535317946, 'l2_leaf_reg': 0.25572317875141737, 'bagging_temperature': 0.6510792658004783, 'random_strength': 0.3881646974421695, 'random_state': 2146}.
[I 2019-05-25 06:18:17,944] Finished a trial resulted in value: 0.3547804141285238. Current best value is 0.11511826033371882 with parameters: {'num_trees': 460, 'depth': 2, 'learning_rate': 0.017188320535317946, 'l2_leaf_reg': 0.25572317875141737, 'bagging_temperature': 0.6510792658004783, 'random_strength': 0.3881646974421695, 'random_state': 2146}.
[I 2019-05-25 06:18:41,504] Finished a trial resulted in value: 0.3331127326507687. Current best value is 0.11511826033371882 with parameters: {'num_trees': 460, 'depth': 2, 'learning_rate': 0.017188320535317946, 'l2_leaf_reg': 0.25572317875141737, 'bagging_temperature': 0.6510792658004783, 

[I 2019-05-25 06:29:32,294] Finished a trial resulted in value: 0.9966888301901279. Current best value is 0.048022668506714584 with parameters: {'num_trees': 235, 'depth': 2, 'learning_rate': 0.010438128691874538, 'l2_leaf_reg': 0.9547502282521007, 'bagging_temperature': 0.8466308831545852, 'random_strength': 0.7723299337438394, 'random_state': 664}.
[I 2019-05-25 06:29:51,625] Finished a trial resulted in value: 0.659384624791559. Current best value is 0.048022668506714584 with parameters: {'num_trees': 235, 'depth': 2, 'learning_rate': 0.010438128691874538, 'l2_leaf_reg': 0.9547502282521007, 'bagging_temperature': 0.8466308831545852, 'random_strength': 0.7723299337438394, 'random_state': 664}.
[I 2019-05-25 06:29:59,666] Finished a trial resulted in value: 0.2442833490063429. Current best value is 0.048022668506714584 with parameters: {'num_trees': 235, 'depth': 2, 'learning_rate': 0.010438128691874538, 'l2_leaf_reg': 0.9547502282521007, 'bagging_temperature': 0.8466308831545852, 'ra

[I 2019-05-25 06:45:23,955] Finished a trial resulted in value: 0.083606441583558. Current best value is 0.048022668506714584 with parameters: {'num_trees': 235, 'depth': 2, 'learning_rate': 0.010438128691874538, 'l2_leaf_reg': 0.9547502282521007, 'bagging_temperature': 0.8466308831545852, 'random_strength': 0.7723299337438394, 'random_state': 664}.
[I 2019-05-25 06:45:30,617] Finished a trial resulted in value: 0.11856978507887465. Current best value is 0.048022668506714584 with parameters: {'num_trees': 235, 'depth': 2, 'learning_rate': 0.010438128691874538, 'l2_leaf_reg': 0.9547502282521007, 'bagging_temperature': 0.8466308831545852, 'random_strength': 0.7723299337438394, 'random_state': 664}.
[I 2019-05-25 06:45:37,373] Finished a trial resulted in value: 0.188459851678985. Current best value is 0.048022668506714584 with parameters: {'num_trees': 235, 'depth': 2, 'learning_rate': 0.010438128691874538, 'l2_leaf_reg': 0.9547502282521007, 'bagging_temperature': 0.8466308831545852, 'ra

[I 2019-05-25 06:53:18,429] Finished a trial resulted in value: 1.26296002492654. Current best value is 0.048022668506714584 with parameters: {'num_trees': 235, 'depth': 2, 'learning_rate': 0.010438128691874538, 'l2_leaf_reg': 0.9547502282521007, 'bagging_temperature': 0.8466308831545852, 'random_strength': 0.7723299337438394, 'random_state': 664}.
[I 2019-05-25 06:53:41,105] Finished a trial resulted in value: 0.6258045518656584. Current best value is 0.048022668506714584 with parameters: {'num_trees': 235, 'depth': 2, 'learning_rate': 0.010438128691874538, 'l2_leaf_reg': 0.9547502282521007, 'bagging_temperature': 0.8466308831545852, 'random_strength': 0.7723299337438394, 'random_state': 664}.
[I 2019-05-25 06:53:51,587] Finished a trial resulted in value: 0.5747106923577736. Current best value is 0.048022668506714584 with parameters: {'num_trees': 235, 'depth': 2, 'learning_rate': 0.010438128691874538, 'l2_leaf_reg': 0.9547502282521007, 'bagging_temperature': 0.8466308831545852, 'ran

In [20]:
# Insert every record collected in `mytrial` into the trial DB.
for trial_i in mytrial:
    db.insert(trial_i)
# Reload first, then derive 'kfold' on the fresh table.  In the previous order
# the column was added to the stale frame and immediately thrown away by the
# reload.
df_trial = db.select()
df_trial['kfold'] = df_trial['param'].apply(lambda x: x['kfold'])

In [23]:
# Expose each trial's k-fold configuration as a column, then list every
# 'tune 794' trial together with it (the remark is shared by the 8-fold and
# 3-fold group runs).
df_trial['kfold'] = df_trial['param'].map(lambda cfg: cfg['kfold'])
df_trial.loc[
    df_trial['remark'] == 'tune 794',
    ['datetime', 'kfold', 'nfeatures', 'train_mae', 'train_mae_var',
     'val_mae', 'val_mae_var', 'mae_diff'],
]
# Alternative view (disabled): top 30 by val_mae among trials with mae_diff < .1
# df_trial[(df_trial['remark']=='tune 794')&(df_trial['mae_diff']<.1)].sort_values(by=['val_mae'])[['datetime','kfold', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].head(30)

Unnamed: 0,datetime,kfold,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
869,2019-05-24 10:54:59.874703,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",19,1.566479,0.008527,2.095898,0.410316,0.529419
870,2019-05-24 10:57:38.551388,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",19,1.848165,0.010382,2.041909,0.494717,0.193744
871,2019-05-24 10:58:15.418279,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",19,1.741767,0.009722,2.05686,0.433941,0.315093
872,2019-05-24 10:59:33.290337,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",19,1.530627,0.007866,2.110055,0.399711,0.579428
873,2019-05-24 11:08:16.716080,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",19,1.353236,0.005693,2.121482,0.392268,0.768247
874,2019-05-24 11:11:21.604552,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",19,1.598756,0.008286,2.054873,0.436348,0.456117
875,2019-05-24 11:23:04.076802,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",19,1.78915,0.009953,2.035288,0.475941,0.246138
876,2019-05-24 11:28:53.015056,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",19,1.530587,0.006468,2.061784,0.434305,0.531197
877,2019-05-24 11:36:15.251972,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",19,1.39986,0.005253,2.101711,0.402887,0.701851
878,2019-05-24 11:37:59.234703,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",19,1.471531,0.004035,2.122989,0.402138,0.651458


In [24]:
# Commit the pending inserts (db was opened with auto_commit=False).
db.commit()

In [36]:
# Build a submission from the test predictions stored with trial `idx`.
idx = 1042
df_test_pred = df_trial.loc[idx, 'df_test_pred']
df_submit = pd.DataFrame()
# Row-wise mean over every prediction column (all columns except 'index';
# presumably one column per CV fold -- confirm in EP.process).
df_submit['time_to_failure'] = np.mean(df_test_pred.drop(columns=['index']).values, axis=1)
# `.values` pairs ids with the means by position.  Assigning the Series itself
# would align on index labels and yield NaN / shuffled ids whenever
# df_test_pred does not carry a default 0..n-1 index.
df_submit['seg_id'] = df_test_pred['index'].values
df_submit.to_csv('submission_catboost3_{}.csv'.format(idx), index=False)

In [37]:
# Accumulator handed to EP.process on every call; the following cell inserts
# its contents into the trial DB.
mytrial = []


# Tune hyperparameters.
def objective(trial, n_splits=3, kfold_type='stratified', remark='tune 794 by stratified'):
    """Cross-validate one sampled CatBoost configuration and score it.

    Args:
        trial: Optuna trial used to sample the search space.
        n_splits: Number of CV folds placed in the k-fold config.
        kfold_type: Value of the k-fold config's ``'type'`` key.
        remark: Tag forwarded to ``EP.process`` for this run.

    Returns:
        ``abs(val_mae - train_mae) * val_mae`` computed from the per-fold means
        of ``df_his.valid`` / ``df_his.train``.  The study below uses Optuna's
        default direction, so smaller is better.
    """
    # Search space; the sampling order is the same as the key order below.
    # NOTE(review): the model seed is sampled like a hyper-parameter.
    init_params = {
        "num_trees": trial.suggest_int('num_trees', 200, 1000),
        "depth": trial.suggest_int('depth', 2, 10),
        "learning_rate": trial.suggest_uniform('learning_rate', 0.01, 0.4),
        "l2_leaf_reg": trial.suggest_uniform('l2_leaf_reg', 0.0001, 1),
        "bagging_temperature": trial.suggest_uniform('bagging_temperature', .6, 1),
        "random_strength": trial.suggest_uniform('random_strength', .001, 1),
        "random_state": trial.suggest_int('random_state', 1, 9999),
        'logging_level': 'Silent',
    }

    args = {
        'columns': selected_columns,
        'kfold': {
            'n_splits': n_splits,
            'random_state': 1985,
            'shuffle': True,
            'type': kfold_type,
        },
        'scaler': {
            'cls': 'StandardScaler',
            'init': {},
        },
        'algorithm': {
            'cls': 'cb.CatBoostRegressor',
            'init': init_params,
            'fit': {},
        },
    }

    # Only the per-fold history is needed here; the other three return values
    # (feature importances, validation and test predictions) are ignored.
    df_his, _, _, _ = EP.process(df_train, args, df_test=df_test, trial=mytrial, remark=remark)
    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)

    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', val_mae_mean - train_mae_mean)
    trial.set_user_attr('val_mae_var', val_mae_var)
    # Previously computed but dropped; recorded for symmetry with val_mae_var.
    trial.set_user_attr('train_mae_var', train_mae_var)

    # Penalise the train/validation gap, weighted by the validation error.
    return np.abs(val_mae_mean - train_mae_mean) * val_mae_mean


study = optuna.create_study()
study.optimize(objective, n_trials=200)

[I 2019-05-24 13:58:46,267] Finished a trial resulted in value: 0.8120904775366505. Current best value is 0.8120904775366505 with parameters: {'num_trees': 905, 'depth': 9, 'learning_rate': 0.37911045852407926, 'l2_leaf_reg': 0.98974698102789, 'bagging_temperature': 0.9319332722731551, 'random_strength': 0.09605185613722703, 'random_state': 6548}.
[I 2019-05-24 13:58:56,292] Finished a trial resulted in value: 0.0679819034053826. Current best value is 0.0679819034053826 with parameters: {'num_trees': 450, 'depth': 3, 'learning_rate': 0.05184167706798716, 'l2_leaf_reg': 0.2608832917493349, 'bagging_temperature': 0.6503861196842374, 'random_strength': 0.34096002259291996, 'random_state': 4299}.
[I 2019-05-24 14:00:36,057] Finished a trial resulted in value: 0.19317424071565903. Current best value is 0.0679819034053826 with parameters: {'num_trees': 450, 'depth': 3, 'learning_rate': 0.05184167706798716, 'l2_leaf_reg': 0.2608832917493349, 'bagging_temperature': 0.6503861196842374, 'random_

[I 2019-05-24 14:19:42,439] Finished a trial resulted in value: 0.606728015343851. Current best value is 0.005897354204651148 with parameters: {'num_trees': 325, 'depth': 2, 'learning_rate': 0.011513066404951033, 'l2_leaf_reg': 0.3467056310596813, 'bagging_temperature': 0.6629876519480657, 'random_strength': 0.9494545260899883, 'random_state': 8180}.
[I 2019-05-24 14:19:47,386] Finished a trial resulted in value: 0.0667018643253399. Current best value is 0.005897354204651148 with parameters: {'num_trees': 325, 'depth': 2, 'learning_rate': 0.011513066404951033, 'l2_leaf_reg': 0.3467056310596813, 'bagging_temperature': 0.6629876519480657, 'random_strength': 0.9494545260899883, 'random_state': 8180}.
[I 2019-05-24 14:19:57,030] Finished a trial resulted in value: 0.1673156737212162. Current best value is 0.005897354204651148 with parameters: {'num_trees': 325, 'depth': 2, 'learning_rate': 0.011513066404951033, 'l2_leaf_reg': 0.3467056310596813, 'bagging_temperature': 0.6629876519480657, '

[I 2019-05-24 14:29:34,699] Finished a trial resulted in value: 0.24943260400335956. Current best value is 0.004694939658350088 with parameters: {'num_trees': 203, 'depth': 3, 'learning_rate': 0.010081790617458867, 'l2_leaf_reg': 0.4838003764816028, 'bagging_temperature': 0.7876055664000519, 'random_strength': 0.8429249230780215, 'random_state': 5162}.
[I 2019-05-24 14:29:40,150] Finished a trial resulted in value: 0.06405717834328105. Current best value is 0.004694939658350088 with parameters: {'num_trees': 203, 'depth': 3, 'learning_rate': 0.010081790617458867, 'l2_leaf_reg': 0.4838003764816028, 'bagging_temperature': 0.7876055664000519, 'random_strength': 0.8429249230780215, 'random_state': 5162}.
[I 2019-05-24 14:30:15,561] Finished a trial resulted in value: 0.3123913284987006. Current best value is 0.004694939658350088 with parameters: {'num_trees': 203, 'depth': 3, 'learning_rate': 0.010081790617458867, 'l2_leaf_reg': 0.4838003764816028, 'bagging_temperature': 0.7876055664000519

[I 2019-05-24 14:42:16,809] Finished a trial resulted in value: 0.05670254439631074. Current best value is 0.004694939658350088 with parameters: {'num_trees': 203, 'depth': 3, 'learning_rate': 0.010081790617458867, 'l2_leaf_reg': 0.4838003764816028, 'bagging_temperature': 0.7876055664000519, 'random_strength': 0.8429249230780215, 'random_state': 5162}.
[I 2019-05-24 14:42:22,960] Finished a trial resulted in value: 0.02010623494222374. Current best value is 0.004694939658350088 with parameters: {'num_trees': 203, 'depth': 3, 'learning_rate': 0.010081790617458867, 'l2_leaf_reg': 0.4838003764816028, 'bagging_temperature': 0.7876055664000519, 'random_strength': 0.8429249230780215, 'random_state': 5162}.
[I 2019-05-24 14:42:42,297] Finished a trial resulted in value: 0.2867519420040493. Current best value is 0.004694939658350088 with parameters: {'num_trees': 203, 'depth': 3, 'learning_rate': 0.010081790617458867, 'l2_leaf_reg': 0.4838003764816028, 'bagging_temperature': 0.7876055664000519

[I 2019-05-24 14:49:59,250] Finished a trial resulted in value: 0.13072356672571256. Current best value is 0.004694939658350088 with parameters: {'num_trees': 203, 'depth': 3, 'learning_rate': 0.010081790617458867, 'l2_leaf_reg': 0.4838003764816028, 'bagging_temperature': 0.7876055664000519, 'random_strength': 0.8429249230780215, 'random_state': 5162}.
[I 2019-05-24 14:50:05,721] Finished a trial resulted in value: 0.18867171115558368. Current best value is 0.004694939658350088 with parameters: {'num_trees': 203, 'depth': 3, 'learning_rate': 0.010081790617458867, 'l2_leaf_reg': 0.4838003764816028, 'bagging_temperature': 0.7876055664000519, 'random_strength': 0.8429249230780215, 'random_state': 5162}.
[I 2019-05-24 14:50:13,690] Finished a trial resulted in value: 0.12382723292170623. Current best value is 0.004694939658350088 with parameters: {'num_trees': 203, 'depth': 3, 'learning_rate': 0.010081790617458867, 'l2_leaf_reg': 0.4838003764816028, 'bagging_temperature': 0.787605566400051

In [38]:
# Insert every record collected in `mytrial` into the trial DB ...
for trial_i in mytrial:
    db.insert(trial_i)
# ... and reload the trial table so the new rows are visible.
df_trial = db.select()

In [39]:
# Best 'tune 794 by stratified' trials by validation MAE, restricted to a
# train/validation gap below 0.05.
(
    df_trial
    .loc[(df_trial['remark'] == 'tune 794 by stratified') & (df_trial['mae_diff'] < .05)]
    .sort_values(by=['val_mae'])
    .loc[:, ['datetime', 'nfeatures', 'train_mae', 'train_mae_var',
             'val_mae', 'val_mae_var', 'mae_diff']]
    .head()
)

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
1233,2019-05-24 14:46:10.066478,19,1.882408,1e-06,1.928446,2.9e-05,0.046037
1188,2019-05-24 14:37:57.990725,19,1.885558,1.5e-05,1.929947,1.1e-05,0.04439
1145,2019-05-24 14:26:30.391643,19,1.886439,2.1e-05,1.93095,1.6e-05,0.044512
1177,2019-05-24 14:33:18.442286,19,1.882551,1.7e-05,1.931739,2e-06,0.049188
1130,2019-05-24 14:24:18.022454,19,1.884293,3.4e-05,1.932134,6e-06,0.047841


In [40]:
# Commit the pending inserts (db was opened with auto_commit=False).
db.commit()

In [41]:
# Build a submission from the test predictions stored with trial `idx`.
idx = 1233
df_test_pred = df_trial.loc[idx, 'df_test_pred']
df_submit = pd.DataFrame()
# Row-wise mean over every prediction column (all columns except 'index';
# presumably one column per CV fold -- confirm in EP.process).
df_submit['time_to_failure'] = np.mean(df_test_pred.drop(columns=['index']).values, axis=1)
# `.values` pairs ids with the means by position.  Assigning the Series itself
# would align on index labels and yield NaN / shuffled ids whenever
# df_test_pred does not carry a default 0..n-1 index.
df_submit['seg_id'] = df_test_pred['index'].values
df_submit.to_csv('submission_catboost3_{}.csv'.format(idx), index=False)

In [37]:
# NOTE(review): this cell is a verbatim copy of the earlier 3-fold stratified
# tuning cell (same execution count, same logged output).  Running both adds
# another 200 trials under the same remark -- consider deleting one of them.

# Accumulator handed to EP.process on every call; the following cell inserts
# its contents into the trial DB.
mytrial = []


# Tune hyperparameters.
def objective(trial, n_splits=3, kfold_type='stratified', remark='tune 794 by stratified'):
    """Cross-validate one sampled CatBoost configuration and score it.

    Args:
        trial: Optuna trial used to sample the search space.
        n_splits: Number of CV folds placed in the k-fold config.
        kfold_type: Value of the k-fold config's ``'type'`` key.
        remark: Tag forwarded to ``EP.process`` for this run.

    Returns:
        ``abs(val_mae - train_mae) * val_mae`` computed from the per-fold means
        of ``df_his.valid`` / ``df_his.train``.  The study below uses Optuna's
        default direction, so smaller is better.
    """
    # Search space; the sampling order is the same as the key order below.
    # NOTE(review): the model seed is sampled like a hyper-parameter.
    init_params = {
        "num_trees": trial.suggest_int('num_trees', 200, 1000),
        "depth": trial.suggest_int('depth', 2, 10),
        "learning_rate": trial.suggest_uniform('learning_rate', 0.01, 0.4),
        "l2_leaf_reg": trial.suggest_uniform('l2_leaf_reg', 0.0001, 1),
        "bagging_temperature": trial.suggest_uniform('bagging_temperature', .6, 1),
        "random_strength": trial.suggest_uniform('random_strength', .001, 1),
        "random_state": trial.suggest_int('random_state', 1, 9999),
        'logging_level': 'Silent',
    }

    args = {
        'columns': selected_columns,
        'kfold': {
            'n_splits': n_splits,
            'random_state': 1985,
            'shuffle': True,
            'type': kfold_type,
        },
        'scaler': {
            'cls': 'StandardScaler',
            'init': {},
        },
        'algorithm': {
            'cls': 'cb.CatBoostRegressor',
            'init': init_params,
            'fit': {},
        },
    }

    # Only the per-fold history is needed here; the other three return values
    # (feature importances, validation and test predictions) are ignored.
    df_his, _, _, _ = EP.process(df_train, args, df_test=df_test, trial=mytrial, remark=remark)
    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)

    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', val_mae_mean - train_mae_mean)
    trial.set_user_attr('val_mae_var', val_mae_var)
    # Previously computed but dropped; recorded for symmetry with val_mae_var.
    trial.set_user_attr('train_mae_var', train_mae_var)

    # Penalise the train/validation gap, weighted by the validation error.
    return np.abs(val_mae_mean - train_mae_mean) * val_mae_mean


study = optuna.create_study()
study.optimize(objective, n_trials=200)

[I 2019-05-24 13:58:46,267] Finished a trial resulted in value: 0.8120904775366505. Current best value is 0.8120904775366505 with parameters: {'num_trees': 905, 'depth': 9, 'learning_rate': 0.37911045852407926, 'l2_leaf_reg': 0.98974698102789, 'bagging_temperature': 0.9319332722731551, 'random_strength': 0.09605185613722703, 'random_state': 6548}.
[I 2019-05-24 13:58:56,292] Finished a trial resulted in value: 0.0679819034053826. Current best value is 0.0679819034053826 with parameters: {'num_trees': 450, 'depth': 3, 'learning_rate': 0.05184167706798716, 'l2_leaf_reg': 0.2608832917493349, 'bagging_temperature': 0.6503861196842374, 'random_strength': 0.34096002259291996, 'random_state': 4299}.
[I 2019-05-24 14:00:36,057] Finished a trial resulted in value: 0.19317424071565903. Current best value is 0.0679819034053826 with parameters: {'num_trees': 450, 'depth': 3, 'learning_rate': 0.05184167706798716, 'l2_leaf_reg': 0.2608832917493349, 'bagging_temperature': 0.6503861196842374, 'random_

[I 2019-05-24 14:19:42,439] Finished a trial resulted in value: 0.606728015343851. Current best value is 0.005897354204651148 with parameters: {'num_trees': 325, 'depth': 2, 'learning_rate': 0.011513066404951033, 'l2_leaf_reg': 0.3467056310596813, 'bagging_temperature': 0.6629876519480657, 'random_strength': 0.9494545260899883, 'random_state': 8180}.
[I 2019-05-24 14:19:47,386] Finished a trial resulted in value: 0.0667018643253399. Current best value is 0.005897354204651148 with parameters: {'num_trees': 325, 'depth': 2, 'learning_rate': 0.011513066404951033, 'l2_leaf_reg': 0.3467056310596813, 'bagging_temperature': 0.6629876519480657, 'random_strength': 0.9494545260899883, 'random_state': 8180}.
[I 2019-05-24 14:19:57,030] Finished a trial resulted in value: 0.1673156737212162. Current best value is 0.005897354204651148 with parameters: {'num_trees': 325, 'depth': 2, 'learning_rate': 0.011513066404951033, 'l2_leaf_reg': 0.3467056310596813, 'bagging_temperature': 0.6629876519480657, '

[I 2019-05-24 14:29:34,699] Finished a trial resulted in value: 0.24943260400335956. Current best value is 0.004694939658350088 with parameters: {'num_trees': 203, 'depth': 3, 'learning_rate': 0.010081790617458867, 'l2_leaf_reg': 0.4838003764816028, 'bagging_temperature': 0.7876055664000519, 'random_strength': 0.8429249230780215, 'random_state': 5162}.
[I 2019-05-24 14:29:40,150] Finished a trial resulted in value: 0.06405717834328105. Current best value is 0.004694939658350088 with parameters: {'num_trees': 203, 'depth': 3, 'learning_rate': 0.010081790617458867, 'l2_leaf_reg': 0.4838003764816028, 'bagging_temperature': 0.7876055664000519, 'random_strength': 0.8429249230780215, 'random_state': 5162}.
[I 2019-05-24 14:30:15,561] Finished a trial resulted in value: 0.3123913284987006. Current best value is 0.004694939658350088 with parameters: {'num_trees': 203, 'depth': 3, 'learning_rate': 0.010081790617458867, 'l2_leaf_reg': 0.4838003764816028, 'bagging_temperature': 0.7876055664000519

[I 2019-05-24 14:42:16,809] Finished a trial resulted in value: 0.05670254439631074. Current best value is 0.004694939658350088 with parameters: {'num_trees': 203, 'depth': 3, 'learning_rate': 0.010081790617458867, 'l2_leaf_reg': 0.4838003764816028, 'bagging_temperature': 0.7876055664000519, 'random_strength': 0.8429249230780215, 'random_state': 5162}.
[I 2019-05-24 14:42:22,960] Finished a trial resulted in value: 0.02010623494222374. Current best value is 0.004694939658350088 with parameters: {'num_trees': 203, 'depth': 3, 'learning_rate': 0.010081790617458867, 'l2_leaf_reg': 0.4838003764816028, 'bagging_temperature': 0.7876055664000519, 'random_strength': 0.8429249230780215, 'random_state': 5162}.
[I 2019-05-24 14:42:42,297] Finished a trial resulted in value: 0.2867519420040493. Current best value is 0.004694939658350088 with parameters: {'num_trees': 203, 'depth': 3, 'learning_rate': 0.010081790617458867, 'l2_leaf_reg': 0.4838003764816028, 'bagging_temperature': 0.7876055664000519

[I 2019-05-24 14:49:59,250] Finished a trial resulted in value: 0.13072356672571256. Current best value is 0.004694939658350088 with parameters: {'num_trees': 203, 'depth': 3, 'learning_rate': 0.010081790617458867, 'l2_leaf_reg': 0.4838003764816028, 'bagging_temperature': 0.7876055664000519, 'random_strength': 0.8429249230780215, 'random_state': 5162}.
[I 2019-05-24 14:50:05,721] Finished a trial resulted in value: 0.18867171115558368. Current best value is 0.004694939658350088 with parameters: {'num_trees': 203, 'depth': 3, 'learning_rate': 0.010081790617458867, 'l2_leaf_reg': 0.4838003764816028, 'bagging_temperature': 0.7876055664000519, 'random_strength': 0.8429249230780215, 'random_state': 5162}.
[I 2019-05-24 14:50:13,690] Finished a trial resulted in value: 0.12382723292170623. Current best value is 0.004694939658350088 with parameters: {'num_trees': 203, 'depth': 3, 'learning_rate': 0.010081790617458867, 'l2_leaf_reg': 0.4838003764816028, 'bagging_temperature': 0.787605566400051

In [38]:
# Insert every record collected in `mytrial` into the trial DB ...
for trial_i in mytrial:
    db.insert(trial_i)
# ... and reload the trial table so the new rows are visible.
df_trial = db.select()

In [39]:
# Best 'tune 794 by stratified' trials by validation MAE, restricted to a
# train/validation gap below 0.05.
(
    df_trial
    .loc[(df_trial['remark'] == 'tune 794 by stratified') & (df_trial['mae_diff'] < .05)]
    .sort_values(by=['val_mae'])
    .loc[:, ['datetime', 'nfeatures', 'train_mae', 'train_mae_var',
             'val_mae', 'val_mae_var', 'mae_diff']]
    .head()
)

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
1233,2019-05-24 14:46:10.066478,19,1.882408,1e-06,1.928446,2.9e-05,0.046037
1188,2019-05-24 14:37:57.990725,19,1.885558,1.5e-05,1.929947,1.1e-05,0.04439
1145,2019-05-24 14:26:30.391643,19,1.886439,2.1e-05,1.93095,1.6e-05,0.044512
1177,2019-05-24 14:33:18.442286,19,1.882551,1.7e-05,1.931739,2e-06,0.049188
1130,2019-05-24 14:24:18.022454,19,1.884293,3.4e-05,1.932134,6e-06,0.047841


In [40]:
# Commit the pending inserts (db was opened with auto_commit=False).
db.commit()

In [11]:
# Fresh sink for this sweep; it is handed to EP.process via `trial=` on every
# objective call (assumed: EP.process appends one record per call; confirm in common.EP).
mytrial = []


# Tune hyperparameters
def objective(trial):
    """Optuna objective: evaluate one CatBoost configuration and score it.

    Draws the CatBoost hyperparameters from ``trial``, builds the ``args``
    spec (columns from the global ``selected_columns``, an 8-split shuffled
    'stratified' kfold entry, a ``StandardScaler`` entry and a
    ``cb.CatBoostRegressor`` entry) and passes it to ``EP.process`` together
    with the global ``df_train``/``df_test`` and the ``mytrial`` list.

    The mean/variance of ``df_his.train`` and ``df_his.valid`` are stored on
    the trial as user attributes (``train_mae``, ``train_mae_var``,
    ``val_mae``, ``val_mae_var``, ``mae_diff``).

    Parameters
    ----------
    trial : optuna trial object
        Must provide ``suggest_int``, ``suggest_uniform`` and ``set_user_attr``.

    Returns
    -------
    float
        ``abs(val_mae_mean - train_mae_mean) * val_mae_mean``. Both a large
        train/validation gap and a large validation error increase the value.
    """
    # Keep the suggestion order stable: it defines the order of the parameters
    # reported by the study.
    num_trees = trial.suggest_int('num_trees', 200, 1000)
    depth = trial.suggest_int('depth', 2, 10)
    learning_rate = trial.suggest_uniform('learning_rate', 0.01, 0.4)
    l2_leaf_reg = trial.suggest_uniform('l2_leaf_reg', 0.0001, 1)
    bagging_temperature = trial.suggest_uniform('bagging_temperature', .6, 1)
    random_strength = trial.suggest_uniform('random_strength', .001, 1)
    random_state = trial.suggest_int('random_state', 1, 9999)

    args = {
        'columns': selected_columns,
        'kfold': {
            'n_splits': 8,
            'random_state': 1985,
            'shuffle': True,
            'type': 'stratified'
        },
        'scaler': {
            'cls': 'StandardScaler',
            'init': {}
        },
        'algorithm': {
            'cls': 'cb.CatBoostRegressor',
            'init': {
                "num_trees": num_trees,
                "depth": depth,
                "learning_rate": learning_rate,
                "l2_leaf_reg": l2_leaf_reg,
                "bagging_temperature": bagging_temperature,
                "random_strength": random_strength,
                "random_state": random_state,
                'logging_level': 'Silent'
            },
            'fit': {
            },
        }
    }

    # Only the per-fold history is needed here; the other three return values
    # (feature importances, validation and test predictions) are not used.
    df_his, _, _, _ = EP.process(df_train, args, df_test=df_test, trial=mytrial, remark='tune 794 by stratified k8')
    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)
    mae_diff = val_mae_mean - train_mae_mean

    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', mae_diff)
    trial.set_user_attr('val_mae_var', val_mae_var)
    # Previously computed but dropped; recorded for parity with val_mae_var.
    trial.set_user_attr('train_mae_var', train_mae_var)

    return np.abs(mae_diff) * val_mae_mean

# Run 200 trials of `objective`. No direction is passed to create_study; the
# logged "best value" only ever decreases, i.e. the study minimises the score.
study = optuna.create_study()
study.optimize(objective, n_trials=200)



[I 2019-05-25 01:28:53,612] Finished a trial resulted in value: 0.42508147342714997. Current best value is 0.42508147342714997 with parameters: {'num_trees': 717, 'depth': 9, 'learning_rate': 0.11517226963732578, 'l2_leaf_reg': 0.2597977623174756, 'bagging_temperature': 0.6084026621129538, 'random_strength': 0.0747887717884732, 'random_state': 9899}.
[I 2019-05-25 01:29:25,138] Finished a trial resulted in value: 0.12525662614124308. Current best value is 0.12525662614124308 with parameters: {'num_trees': 766, 'depth': 3, 'learning_rate': 0.12567103164370155, 'l2_leaf_reg': 0.4051528998710589, 'bagging_temperature': 0.7070512243833961, 'random_strength': 0.6116497311845802, 'random_state': 458}.
[I 2019-05-25 01:29:51,960] Finished a trial resulted in value: 0.13906164690823433. Current best value is 0.12525662614124308 with parameters: {'num_trees': 766, 'depth': 3, 'learning_rate': 0.12567103164370155, 'l2_leaf_reg': 0.4051528998710589, 'bagging_temperature': 0.7070512243833961, 'ran

[I 2019-05-25 02:40:48,964] Finished a trial resulted in value: 0.3905923819761319. Current best value is 0.004052954876576161 with parameters: {'num_trees': 241, 'depth': 2, 'learning_rate': 0.011177873578646164, 'l2_leaf_reg': 0.5357065827986499, 'bagging_temperature': 0.7854165117925749, 'random_strength': 0.720393440286919, 'random_state': 8296}.
[I 2019-05-25 02:41:30,210] Finished a trial resulted in value: 0.1916869375508789. Current best value is 0.004052954876576161 with parameters: {'num_trees': 241, 'depth': 2, 'learning_rate': 0.011177873578646164, 'l2_leaf_reg': 0.5357065827986499, 'bagging_temperature': 0.7854165117925749, 'random_strength': 0.720393440286919, 'random_state': 8296}.
[I 2019-05-25 02:48:11,931] Finished a trial resulted in value: 0.31528166318201367. Current best value is 0.004052954876576161 with parameters: {'num_trees': 241, 'depth': 2, 'learning_rate': 0.011177873578646164, 'l2_leaf_reg': 0.5357065827986499, 'bagging_temperature': 0.7854165117925749, '

[I 2019-05-25 03:21:24,995] Finished a trial resulted in value: 0.0643401465772385. Current best value is 0.0023189669324267742 with parameters: {'num_trees': 204, 'depth': 2, 'learning_rate': 0.01009113381365357, 'l2_leaf_reg': 0.8835022128249224, 'bagging_temperature': 0.8509353647715436, 'random_strength': 0.7563533925309727, 'random_state': 7779}.
[I 2019-05-25 03:21:51,429] Finished a trial resulted in value: 0.14608410536184246. Current best value is 0.0023189669324267742 with parameters: {'num_trees': 204, 'depth': 2, 'learning_rate': 0.01009113381365357, 'l2_leaf_reg': 0.8835022128249224, 'bagging_temperature': 0.8509353647715436, 'random_strength': 0.7563533925309727, 'random_state': 7779}.
[I 2019-05-25 03:22:09,492] Finished a trial resulted in value: 0.05022750083867338. Current best value is 0.0023189669324267742 with parameters: {'num_trees': 204, 'depth': 2, 'learning_rate': 0.01009113381365357, 'l2_leaf_reg': 0.8835022128249224, 'bagging_temperature': 0.8509353647715436

[I 2019-05-25 04:09:58,700] Finished a trial resulted in value: 0.07071604120665172. Current best value is 0.0023189669324267742 with parameters: {'num_trees': 204, 'depth': 2, 'learning_rate': 0.01009113381365357, 'l2_leaf_reg': 0.8835022128249224, 'bagging_temperature': 0.8509353647715436, 'random_strength': 0.7563533925309727, 'random_state': 7779}.
[I 2019-05-25 04:11:00,135] Finished a trial resulted in value: 0.09755946071851658. Current best value is 0.0023189669324267742 with parameters: {'num_trees': 204, 'depth': 2, 'learning_rate': 0.01009113381365357, 'l2_leaf_reg': 0.8835022128249224, 'bagging_temperature': 0.8509353647715436, 'random_strength': 0.7563533925309727, 'random_state': 7779}.
[I 2019-05-25 04:11:25,231] Finished a trial resulted in value: 0.0761198293177888. Current best value is 0.0023189669324267742 with parameters: {'num_trees': 204, 'depth': 2, 'learning_rate': 0.01009113381365357, 'l2_leaf_reg': 0.8835022128249224, 'bagging_temperature': 0.8509353647715436

[I 2019-05-25 04:31:40,076] Finished a trial resulted in value: 0.017449026235324808. Current best value is 0.0023189669324267742 with parameters: {'num_trees': 204, 'depth': 2, 'learning_rate': 0.01009113381365357, 'l2_leaf_reg': 0.8835022128249224, 'bagging_temperature': 0.8509353647715436, 'random_strength': 0.7563533925309727, 'random_state': 7779}.
[I 2019-05-25 04:32:02,295] Finished a trial resulted in value: 0.05328169538717535. Current best value is 0.0023189669324267742 with parameters: {'num_trees': 204, 'depth': 2, 'learning_rate': 0.01009113381365357, 'l2_leaf_reg': 0.8835022128249224, 'bagging_temperature': 0.8509353647715436, 'random_strength': 0.7563533925309727, 'random_state': 7779}.
[I 2019-05-25 04:32:53,397] Finished a trial resulted in value: 0.07091439506400225. Current best value is 0.0023189669324267742 with parameters: {'num_trees': 204, 'depth': 2, 'learning_rate': 0.01009113381365357, 'l2_leaf_reg': 0.8835022128249224, 'bagging_temperature': 0.85093536477154

In [12]:
# Hand every trial record collected during the k8 sweep to the trial store,
# then re-read the table so `df_trial` reflects what `db.select()` now returns.
for trial_i in iter(mytrial):
    db.insert(trial_i)

df_trial = db.select()

In [14]:
# Five best runs (lowest val_mae) of the 'tune 794 by stratified k8' sweep
# whose train/validation MAE gap stays below 0.05.
(
    df_trial
    .loc[
        (df_trial['remark'] == 'tune 794 by stratified k8') & (df_trial['mae_diff'] < .05),
        ['datetime', 'nfeatures', 'train_mae', 'train_mae_var', 'val_mae', 'val_mae_var', 'mae_diff'],
    ]
    .sort_values(by='val_mae')
    .head()
)

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
1297,2019-05-25 02:15:11.876861,19,1.859916,1.1e-05,1.908354,0.000196,0.048438
1450,2019-05-25 04:29:59.541389,19,1.866386,7e-06,1.914758,0.000208,0.048372
1405,2019-05-25 04:06:19.504002,19,1.867439,4.5e-05,1.915271,0.000193,0.047831
1391,2019-05-25 03:49:32.200648,19,1.869759,5e-06,1.917332,0.000235,0.047572
1300,2019-05-25 02:16:18.261020,19,1.871031,8e-06,1.918259,0.000162,0.047227


In [15]:
# `db` was opened with auto_commit=False, so the k8 trials inserted above are
# presumably only written to the backing pickle by this explicit commit (confirm in DFDB).
db.commit()

In [16]:
# Build a submission from the stored test predictions of trial 1297 (the top
# row of the k8 report above): average all prediction columns per segment.
idx = 1297
df_test_pred = df_trial.loc[idx, 'df_test_pred']

df_submit = pd.DataFrame()
# Assign the averaged predictions first so the CSV column order stays
# time_to_failure, seg_id.
df_submit['time_to_failure'] = df_test_pred.drop(columns=['index']).values.mean(axis=1)
df_submit['seg_id'] = df_test_pred['index']

df_submit.to_csv('submission_catboost3_{}.csv'.format(idx), index=False)

In [18]:
# Number of feature columns recorded in the parameters of trial 342.
len(df_trial.loc[342, 'param']['columns'])

30