In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler

import lightgbm as lgb
import catboost as cb
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor

from tsfresh.examples import load_robot_execution_failures
from tsfresh import extract_features, select_features
import optuna

from common import EP

import types

  from numpy.core.umath_tests import inner1d


In [2]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [3]:
df_train = pd.read_pickle('feats/df_train.pkl')
df_test = pd.read_pickle('feats/df_test.pkl')

In [4]:
df_train['label'] = df_train['y'].apply(lambda x:  int(x) if x<15 else 15)
group = df_train['season'].values
group[np.where(group==17)[0]] = 1
df_train['group'] = group
df_train = df_train.drop(columns=['season'])

In [5]:
test_X = df_train.drop(columns=['y','index','group','label']).copy()
test_X.index = df_train['index']
test_y = df_train['y'].copy()
test_y.index = df_train['index']
tsfresh_columns = select_features(test_X, test_y).columns.tolist()

In [6]:
original_columns = df_train.columns.drop(['index','y','label','group']).tolist()

In [13]:
mytrial = []
# mytrial = list(pd.read_pickle('trial/randomforest.pkl').T.to_dict().values())
# df_trial = pd.DataFrame(mytrial)
# len(mytrial)

In [14]:
param={
    'algorithm': {
        'cls': 'RandomForestRegressor',
        'fit': {
#             'early_stopping_rounds': 200,
#             'eval_metric': 'mae',
#             'verbose': False
        },
        'init': {
#             'n_jobs':16,
        }
    },
    'columns': tsfresh_columns,
    'feature_importance': {
        'is_output': True,
        'permutation_feature_importance': True,
        'permutation_random_state': 1
    },
    'kfold': {
        'n_splits': 8,
        'random_state': 1985,
        'shuffle': True,
        'type': 'group'
    },
    'scaler': {
        'cls': 'StandardScaler'
    }
}

In [15]:
# run one try
df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train, param, df_test = None, trial=mytrial)

In [19]:
df_trial = pd.DataFrame(mytrial)
df_trial[['datetime','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
0,2019-05-11 14:48:38.289806,1071,0.606209,0.001154,2.254551,0.465856,1.648342


In [23]:
#check feature_importances
df_feature_importances = df_trial.loc[0]['df_feature_importances']
if type(df_feature_importances)==pd.DataFrame:
    sorted_columns = EP.evaluate(df_feature_importances, key='average_model_weight')
else:
    sorted_columns = df_trial.loc[0]['param']['columns']
# df_feature_importances.sort_values(by=['average_permutation_weight'], ascending=False)

In [24]:
param['columns']=sorted_columns[:200]
#  select features by permutation_weight
EP.select_features_(df_train, param, mytrial, nfeats_best=25, nfeats_removed_per_try=10, key='average_model_weight')

In [33]:
df_trial = pd.DataFrame(mytrial)
#[(df_trial['mae_diff']<.05)].sort_values(by=['val_mae'], ascending=True)
df_trial[['datetime','remark','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].tail()

Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
234,2019-05-12 05:30:45.581934,tune feats selected by group,80,3.039669,1.035059e-07,3.039672,5e-06,2e-06
235,2019-05-12 05:39:57.864418,tune feats selected by group,80,2.237364,1.552515e-05,2.240208,0.000518,0.002844
236,2019-05-12 05:45:23.329933,tune feats selected by group,80,2.246592,2.857291e-05,2.249777,0.000462,0.003185
237,2019-05-12 05:49:21.260505,tune feats selected by group,80,2.242615,2.164779e-05,2.245845,0.000486,0.003231
238,2019-05-12 05:49:26.565928,tune feats selected by group,80,3.039575,1.127699e-07,3.039577,4e-06,2e-06


In [26]:
#  tune hypterparameters
def objective(trial):
        
    n_estimators = trial.suggest_int('n_estimators', 300, 1000)
    max_depth = trial.suggest_int('max_depth', 5, 16)
    max_features = trial.suggest_uniform('max_features', .6, 1)
    min_samples_leaf = trial.suggest_uniform('min_samples_leaf', 0.1, 0.5)
    random_state = trial.suggest_int('random_state', 1, 9999)
        
    args={
        'columns':df_trial.loc[32]['param']['columns'],
        'kfold':{
            'n_splits': 8,
            'random_state': 1985,
            'shuffle': True,
            'type': 'stratified'
        },
        'scaler':{
            'cls':'StandardScaler',
        },
        'algorithm':{
            'cls':'RandomForestRegressor',
            'init':{
                "n_estimators":n_estimators,
                "max_depth":max_depth,
                "max_features":max_features,
                "min_samples_leaf":min_samples_leaf,
                "random_state":random_state,
            },
            'fit':{
#                 'eval_metric':'mae',
#                 'verbose':False,
#                 'early_stopping_rounds':200,
            },
        },
        'feature_importance':{
            'is_output':False,
            'permutation_feature_importance':False,
            'permutation_random_state':1,
        },
    }
    
    df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train, args, df_test = df_test, trial=mytrial, remark='tune feats selected by group')
    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)
    
    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', val_mae_mean-train_mae_mean)
    trial.set_user_attr('val_mae_var', val_mae_var)

    return np.abs(val_mae_mean - train_mae_mean)*val_mae_mean

study = optuna.create_study()
study.optimize(objective, n_trials=200)

[32m[I 2019-05-11 21:35:49,506][0m Finished trial#0 resulted in value: 0.007304077864263028. Current best value is 0.007304077864263028 with parameters: {'n_estimators': 411, 'max_depth': 14, 'max_features': 0.8753638230834919, 'min_samples_leaf': 0.1996008553284878, 'random_state': 521}.[0m
[32m[I 2019-05-11 21:36:02,938][0m Finished trial#1 resulted in value: 4.885183997822251e-06. Current best value is 4.885183997822251e-06 with parameters: {'n_estimators': 627, 'max_depth': 13, 'max_features': 0.7745858629441442, 'min_samples_leaf': 0.3699101746861958, 'random_state': 7269}.[0m
[32m[I 2019-05-11 21:51:05,871][0m Finished trial#2 resulted in value: 0.007366644561250712. Current best value is 4.885183997822251e-06 with parameters: {'n_estimators': 627, 'max_depth': 13, 'max_features': 0.7745858629441442, 'min_samples_leaf': 0.3699101746861958, 'random_state': 7269}.[0m
[32m[I 2019-05-11 21:56:59,996][0m Finished trial#3 resulted in value: 0.007023724404012694. Current best

[32m[I 2019-05-11 23:29:13,047][0m Finished trial#28 resulted in value: 0.007194063850016293. Current best value is 5.918914265507803e-07 with parameters: {'n_estimators': 445, 'max_depth': 12, 'max_features': 0.6644333128599246, 'min_samples_leaf': 0.35572192395429864, 'random_state': 8242}.[0m
[32m[I 2019-05-11 23:41:32,822][0m Finished trial#29 resulted in value: 0.007336222074372818. Current best value is 5.918914265507803e-07 with parameters: {'n_estimators': 445, 'max_depth': 12, 'max_features': 0.6644333128599246, 'min_samples_leaf': 0.35572192395429864, 'random_state': 8242}.[0m
[32m[I 2019-05-11 23:41:41,925][0m Finished trial#30 resulted in value: 4.5024129584962276e-06. Current best value is 5.918914265507803e-07 with parameters: {'n_estimators': 445, 'max_depth': 12, 'max_features': 0.6644333128599246, 'min_samples_leaf': 0.35572192395429864, 'random_state': 8242}.[0m
[32m[I 2019-05-11 23:41:51,555][0m Finished trial#31 resulted in value: 3.2591466859188953e-06. 

[32m[I 2019-05-12 00:39:28,831][0m Finished trial#56 resulted in value: 0.007809686853551967. Current best value is 5.918914265507803e-07 with parameters: {'n_estimators': 445, 'max_depth': 12, 'max_features': 0.6644333128599246, 'min_samples_leaf': 0.35572192395429864, 'random_state': 8242}.[0m
[32m[I 2019-05-12 00:44:36,780][0m Finished trial#57 resulted in value: 0.006063999506110687. Current best value is 5.918914265507803e-07 with parameters: {'n_estimators': 445, 'max_depth': 12, 'max_features': 0.6644333128599246, 'min_samples_leaf': 0.35572192395429864, 'random_state': 8242}.[0m
[32m[I 2019-05-12 00:50:10,369][0m Finished trial#58 resulted in value: 0.0066141388298525125. Current best value is 5.918914265507803e-07 with parameters: {'n_estimators': 445, 'max_depth': 12, 'max_features': 0.6644333128599246, 'min_samples_leaf': 0.35572192395429864, 'random_state': 8242}.[0m
[32m[I 2019-05-12 01:01:15,396][0m Finished trial#59 resulted in value: 0.007095868597998149. Cur

[32m[I 2019-05-12 01:47:17,047][0m Finished trial#84 resulted in value: 4.364015098807079e-06. Current best value is 5.918914265507803e-07 with parameters: {'n_estimators': 445, 'max_depth': 12, 'max_features': 0.6644333128599246, 'min_samples_leaf': 0.35572192395429864, 'random_state': 8242}.[0m
[32m[I 2019-05-12 01:47:24,363][0m Finished trial#85 resulted in value: 5.5178496921246445e-06. Current best value is 5.918914265507803e-07 with parameters: {'n_estimators': 445, 'max_depth': 12, 'max_features': 0.6644333128599246, 'min_samples_leaf': 0.35572192395429864, 'random_state': 8242}.[0m
[32m[I 2019-05-12 01:47:30,627][0m Finished trial#86 resulted in value: 4.6821387733291405e-06. Current best value is 5.918914265507803e-07 with parameters: {'n_estimators': 445, 'max_depth': 12, 'max_features': 0.6644333128599246, 'min_samples_leaf': 0.35572192395429864, 'random_state': 8242}.[0m
[32m[I 2019-05-12 01:47:36,206][0m Finished trial#87 resulted in value: 5.446076923788423e-06

[32m[I 2019-05-12 02:44:09,467][0m Finished trial#112 resulted in value: 0.007558139241242982. Current best value is 5.918914265507803e-07 with parameters: {'n_estimators': 445, 'max_depth': 12, 'max_features': 0.6644333128599246, 'min_samples_leaf': 0.35572192395429864, 'random_state': 8242}.[0m
[32m[I 2019-05-12 02:44:17,509][0m Finished trial#113 resulted in value: 4.736946647948397e-06. Current best value is 5.918914265507803e-07 with parameters: {'n_estimators': 445, 'max_depth': 12, 'max_features': 0.6644333128599246, 'min_samples_leaf': 0.35572192395429864, 'random_state': 8242}.[0m
[32m[I 2019-05-12 02:44:25,100][0m Finished trial#114 resulted in value: 4.884271767024369e-06. Current best value is 5.918914265507803e-07 with parameters: {'n_estimators': 445, 'max_depth': 12, 'max_features': 0.6644333128599246, 'min_samples_leaf': 0.35572192395429864, 'random_state': 8242}.[0m
[32m[I 2019-05-12 02:44:31,856][0m Finished trial#115 resulted in value: 4.521660357995276e-0

[32m[I 2019-05-12 03:56:43,442][0m Finished trial#140 resulted in value: 4.627611817062077e-06. Current best value is 5.918914265507803e-07 with parameters: {'n_estimators': 445, 'max_depth': 12, 'max_features': 0.6644333128599246, 'min_samples_leaf': 0.35572192395429864, 'random_state': 8242}.[0m
[32m[I 2019-05-12 03:56:49,310][0m Finished trial#141 resulted in value: 5.047805222202671e-06. Current best value is 5.918914265507803e-07 with parameters: {'n_estimators': 445, 'max_depth': 12, 'max_features': 0.6644333128599246, 'min_samples_leaf': 0.35572192395429864, 'random_state': 8242}.[0m
[32m[I 2019-05-12 03:56:58,575][0m Finished trial#142 resulted in value: 4.199843564111785e-06. Current best value is 5.918914265507803e-07 with parameters: {'n_estimators': 445, 'max_depth': 12, 'max_features': 0.6644333128599246, 'min_samples_leaf': 0.35572192395429864, 'random_state': 8242}.[0m
[32m[I 2019-05-12 03:57:05,278][0m Finished trial#143 resulted in value: 5.527373717724562e-

[32m[I 2019-05-12 04:41:32,425][0m Finished trial#168 resulted in value: 5.698877132152248e-06. Current best value is 5.918914265507803e-07 with parameters: {'n_estimators': 445, 'max_depth': 12, 'max_features': 0.6644333128599246, 'min_samples_leaf': 0.35572192395429864, 'random_state': 8242}.[0m
[32m[I 2019-05-12 04:41:40,459][0m Finished trial#169 resulted in value: 4.66956227564352e-06. Current best value is 5.918914265507803e-07 with parameters: {'n_estimators': 445, 'max_depth': 12, 'max_features': 0.6644333128599246, 'min_samples_leaf': 0.35572192395429864, 'random_state': 8242}.[0m
[32m[I 2019-05-12 04:41:47,943][0m Finished trial#170 resulted in value: 7.76982503205175e-06. Current best value is 5.918914265507803e-07 with parameters: {'n_estimators': 445, 'max_depth': 12, 'max_features': 0.6644333128599246, 'min_samples_leaf': 0.35572192395429864, 'random_state': 8242}.[0m
[32m[I 2019-05-12 04:41:53,033][0m Finished trial#171 resulted in value: 6.52026472029551e-06.

[32m[I 2019-05-12 05:39:57,946][0m Finished trial#196 resulted in value: 0.006371624452948431. Current best value is 5.918914265507803e-07 with parameters: {'n_estimators': 445, 'max_depth': 12, 'max_features': 0.6644333128599246, 'min_samples_leaf': 0.35572192395429864, 'random_state': 8242}.[0m
[32m[I 2019-05-12 05:45:23,413][0m Finished trial#197 resulted in value: 0.00716559497838352. Current best value is 5.918914265507803e-07 with parameters: {'n_estimators': 445, 'max_depth': 12, 'max_features': 0.6644333128599246, 'min_samples_leaf': 0.35572192395429864, 'random_state': 8242}.[0m
[32m[I 2019-05-12 05:49:21,342][0m Finished trial#198 resulted in value: 0.007255446039761398. Current best value is 5.918914265507803e-07 with parameters: {'n_estimators': 445, 'max_depth': 12, 'max_features': 0.6644333128599246, 'min_samples_leaf': 0.35572192395429864, 'random_state': 8242}.[0m
[32m[I 2019-05-12 05:49:26,649][0m Finished trial#199 resulted in value: 4.640369230453086e-06. 

In [43]:
df_test_pred = df_trial.loc[342]['df_test_pred']

In [44]:
df_submit = pd.DataFrame()
df_submit['time_to_failure'] = np.mean(df_test_pred.drop(columns=['index']).values, axis=1)
df_submit['seg_id'] = df_test_pred['index']
df_submit.to_csv('submission.csv', index=False)

In [27]:
df_trial = pd.DataFrame(mytrial)

In [30]:
#[df_trial['mae_diff']<.05].sort_values(by=['val_mae']) | (df_trial['remark']=='tune feats selected by group ')
df_trial[(df_trial['remark']=='tune feats selected by group')&(df_trial['mae_diff']<.05)].sort_values(by=['val_mae'], ascending=True)[['datetime','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].head(10)

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
122,2019-05-12 01:47:09.098802,80,2.085693,1.3e-05,2.090886,0.000503,0.005193
49,2019-05-11 22:46:17.488931,80,2.086041,1e-05,2.09127,0.000532,0.005228
230,2019-05-12 05:30:11.008231,80,2.087717,1.6e-05,2.093452,0.000471,0.005734
150,2019-05-12 02:39:08.804657,80,2.089392,1e-05,2.094655,0.000502,0.005263
55,2019-05-11 22:58:03.264857,80,2.089978,1.2e-05,2.095168,0.000536,0.00519
81,2019-05-12 00:18:26.283519,80,2.098043,1.7e-05,2.102824,0.00059,0.004782
162,2019-05-12 03:06:51.276631,80,2.188041,0.000171,2.191513,0.000498,0.003472
95,2019-05-12 00:39:28.769875,80,2.230114,0.00011,2.233611,0.000473,0.003496
97,2019-05-12 00:50:10.306935,80,2.236375,1.3e-05,2.239329,0.000529,0.002954
72,2019-05-11 23:47:09.580241,80,2.236961,1.5e-05,2.239935,0.000523,0.002974


In [31]:
df_trial.loc[122:122][['datetime','remark','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
122,2019-05-12 01:47:09.098802,tune feats selected by group,80,2.085693,1.3e-05,2.090886,0.000503,0.005193


In [32]:
df_trial.to_pickle('trial/randomforest.pkl')

In [35]:
df_trial.loc[122]['param']

{'columns': ['q25_roll_std_100',
  'abs_q01_4',
  'q05_roll_std_1000',
  'abs_q25_5',
  'spkt_welch_density__coeff_3',
  'q05_roll_std_100',
  "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
  'median__roll_std',
  'min__roll_std',
  "number_peaks{'n': 10}",
  'abs_q75_6',
  'q01_roll_mean_1000',
  'abs_q75_7',
  'spkt_welch_densitycoeff_2',
  'min_roll_std_100',
  'max_to_min',
  'max_to_min_6',
  'iqr_6',
  'abs_max_8',
  'abs_max_roll_mean_100',
  'abs_max_roll_mean_1000',
  'min__roll_mean',
  'abs_max_7',
  'max_to_min_1',
  '5000rms_quantile25',
  'abs_max_6',
  'q99_roll_mean_1000',
  'max_to_min_5',
  'max_9',
  'abs_max_1',
  'skew_1',
  'mean_change_rate',
  "autocorrelation{'lag': 5}",
  'q95_roll_mean_1000',
  'abs_max_5',
  "number_crossing_m{'m': 1}",
  'max_to_min_diff_5',
  "value_count{'value': 1}",
  'abs_q99_7',
  'min_9',
  'abs_q75_2',
  'med_8',
  'abs_max_4',
  'abs_q05_2',
  "change_quantiles{'ql': 0.2, 'qh': 0.4, 'isabs': True, 'f_agg'