In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler

import lightgbm as lgb
import catboost as cb
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor

from tsfresh.examples import load_robot_execution_failures
from tsfresh import extract_features, select_features
import optuna

from common import EP

import types

Using TensorFlow backend.


In [2]:
%%HTML
<style>
   /* Presentation only: set the widths of the notebook container, menubar and toolbar. */
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [3]:
# Load the train and test frames from pickles under ../feats
# (presumably precomputed feature tables — confirm against the producing notebook).
# NOTE(review): unpickling can execute arbitrary code; only load files you produced or trust.
df_train = pd.read_pickle('../feats/df_train.pkl')
df_test = pd.read_pickle('../feats/df_test.pkl')

In [4]:
# Integer label derived from the target: y truncated to an int when y < 15,
# otherwise (including values that do not compare as < 15) capped at 15.
df_train['label'] = df_train['y'].apply(lambda x: int(x) if x < 15 else 15)

# Group id: the season, with season 17 folded into group 1.
# Work on an explicit copy: `.values` may be a view of df_train's data (so an
# in-place write would silently change df_train['season']) or a read-only array
# under pandas copy-on-write (so an in-place write would raise).
group = df_train['season'].values.copy()
group[group == 17] = 1
df_train['group'] = group

# NOTE: after this drop the cell cannot be re-run without reloading df_train,
# because 'season' no longer exists.
df_train = df_train.drop(columns=['season'])

In [5]:
# Key the training features and the target by the 'index' column, then keep the
# columns that tsfresh's select_features returns for predicting y.
# NOTE(review): despite the `test_` prefix, both objects are built from df_train.
test_X = df_train.set_index('index').drop(columns=['y', 'group', 'label'])
test_y = df_train.set_index('index')['y'].copy()
tsfresh_columns = list(select_features(test_X, test_y).columns)

In [6]:
# Every df_train column except the id, target and derived label/group columns.
original_columns = list(df_train.columns.drop(['index', 'y', 'label', 'group']))

In [7]:
# To start from an empty history instead of loading the saved one, use:
# mytrial = []
# Load the saved trial history as a list of dicts: transposing and calling
# to_dict() yields one {column: value} dict per row of the pickled frame.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
mytrial = list(pd.read_pickle('../trial/randomforest.pkl').T.to_dict().values())
df_trial = pd.DataFrame(mytrial)
# Number of history entries loaded (shown as the cell output).
len(mytrial)

239

In [9]:
# Configuration dict passed to EP.process / EP.select_features_:
#   columns   - the 80 feature column names to use
#   kfold     - cross-validation settings (3 shuffled splits, type 'stratified')
#   scaler    - name of the scaler class
#   algorithm - estimator class name plus 'init' and 'fit' keyword dicts
# NOTE(review): how EP interprets these keys is not visible here — confirm against common.EP.
# NOTE(review): min_samples_leaf is a float; scikit-learn documents a float as a
# fraction of the training samples, so ~0.11 forces very large leaves — confirm intended.
# NOTE(review): some names use different separators (e.g. 'spkt_welch_density__coeff_3'
# vs 'spkt_welch_densitycoeff_2') — confirm they all match real df_train columns.
param = {'columns': ['q25_roll_std_100',
  'abs_q01_4',
  'q05_roll_std_1000',
  'abs_q25_5',
  'spkt_welch_density__coeff_3',
  'q05_roll_std_100',
  "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
  'median__roll_std',
  'min__roll_std',
  "number_peaks{'n': 10}",
  'abs_q75_6',
  'q01_roll_mean_1000',
  'abs_q75_7',
  'spkt_welch_densitycoeff_2',
  'min_roll_std_100',
  'max_to_min',
  'max_to_min_6',
  'iqr_6',
  'abs_max_8',
  'abs_max_roll_mean_100',
  'abs_max_roll_mean_1000',
  'min__roll_mean',
  'abs_max_7',
  'max_to_min_1',
  '5000rms_quantile25',
  'abs_max_6',
  'q99_roll_mean_1000',
  'max_to_min_5',
  'max_9',
  'abs_max_1',
  'skew_1',
  'mean_change_rate',
  "autocorrelation{'lag': 5}",
  'q95_roll_mean_1000',
  'abs_max_5',
  "number_crossing_m{'m': 1}",
  'max_to_min_diff_5',
  "value_count{'value': 1}",
  'abs_q99_7',
  'min_9',
  'abs_q75_2',
  'med_8',
  'abs_max_4',
  'abs_q05_2',
  "change_quantiles{'ql': 0.2, 'qh': 0.4, 'isabs': True, 'f_agg': 'mean'}",
  '5000form_factor_quantile75',
  'abs_q25_9',
  'agg_autocorrelationf_agg_"mean"__maxlag_40',
  "binned_entropy{'max_bins': 10}",
  'abs_min_2',
  'spkt_welch_density__coeff_30',
  'abs_min_5',
  'q05_2',
  'abs_q95_7',
  'kurt_7',
  'q95_9',
  'fft_coefficientcoeff_80__attr_"imag"',
  'fft_coefficientcoeff_32__attr_"imag"',
  'fft_coefficientcoeff_24__attr_"real"',
  'ave10_6',
  'abs_q01_6',
  'iqr_8',
  "change_quantiles{'ql': 0.2, 'qh': 1.0, 'isabs': False, 'f_agg': 'mean'}",
  'med_5',
  'partial_autocorrelationlag_1',
  'ave10_2',
  'spkt_welch_density__coeff_42',
  'peak_to_average_power_ratio__roll_mean',
  'abs_min_7',
  'abs_q25_7',
  '5000skewness_max_',
  'spkt_welch_density__coeff_115',
  '5000skewness_mean_',
  'fft_coefficientcoeff_8__attr_"imag"',
  'abs_q95_2',
  'ave10_7',
  'abs_min_3',
  "change_quantiles{'ql': 0.6, 'qh': 0.8, 'isabs': True, 'f_agg': 'mean'}",
  'abs_min_6',
  'fft_coefficientcoeff_20__attr_"abs"'],
 'kfold': {'n_splits': 3,
  'random_state': 1985,
  'shuffle': True,
  'type': 'stratified'},
 'scaler': {'cls': 'StandardScaler'},
 'algorithm': {'cls': 'RandomForestRegressor',
  'init': {'n_estimators': 747,
   'max_depth': 9,
   'max_features': 0.6639543350506906,
   'min_samples_leaf': 0.10968001196429095,
   'random_state': 2519},
  'fit': {}},
}

In [11]:
# Run one pass of EP.process with the configuration in `param`; `mytrial` is
# passed along as the running history (presumably appended to by EP — confirm).
(df_his,
 df_feature_importances,
 df_valid_pred,
 df_test_pred) = EP.process(df_train, param, df_test=df_test, trial=mytrial)

In [12]:
# Rebuild the history frame from mytrial and show the metrics of its last entry.
df_trial = pd.DataFrame(mytrial)
df_trial.tail(1)[[
    'datetime', 'nfeatures',
    'train_mae', 'train_mae_var',
    'val_mae', 'val_mae_var',
    'mae_diff',
]]

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
239,2019-05-16 11:19:24.663372,80,2.082471,4e-06,2.088349,7.4e-05,0.005878


In [23]:
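# Check feature importances (disabled). NOTE: this commented-out block is the only visible place where `sorted_columns` is assigned.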
# df_feature_importances = df_trial.loc[0]['df_feature_importances']
# if type(df_feature_importances)==pd.DataFrame:
#     sorted_columns = EP.evaluate(df_feature_importances, key='average_model_weight')
# else:
#     sorted_columns = df_trial.loc[0]['param']['columns']
# df_feature_importances.sort_values(by=['average_permutation_weight'], ascending=False)

In [24]:
# Restrict the candidate columns to the first 200 entries of `sorted_columns`.
# NOTE(review): `sorted_columns` is not assigned by any active cell in view (only by
# commented-out code), so this raises NameError on a fresh kernel — confirm its source.
param['columns']=sorted_columns[:200]
# Run EP's feature selection with key='average_model_weight'.
# NOTE(review): the previous comment said "permutation_weight", which does not match
# the key actually passed — confirm which weighting is intended.
EP.select_features_(df_train, param, mytrial, nfeats_best=25, nfeats_removed_per_try=10, key='average_model_weight')

In [13]:
# Tune hyperparameters
def objective(trial):
    """Optuna objective: run one EP.process pass with sampled forest settings.

    Samples the RandomForestRegressor constructor arguments from ``trial``,
    runs ``EP.process`` on ``df_train`` using the current ``param['columns']``,
    and records the mean and variance of ``df_his.train`` / ``df_his.valid``
    (presumably per-fold MAE — confirm against common.EP) plus their gap as
    user attributes on the trial.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        ``abs(val_mean - train_mean) * val_mean``; the study minimises this.

    NOTE(review): this score is dominated by the train/validation gap, so a
    heavily under-fit model (tiny gap, worse validation error) can win —
    confirm this is the intended selection criterion.
    NOTE(review): ``random_state`` is sampled like a hyperparameter, i.e. the
    search also selects a seed — confirm this is intended.
    """
    # NOTE: suggest_uniform is deprecated in newer Optuna releases in favour of
    # suggest_float; kept so the cell still runs on the Optuna version in use.
    # Sampling order is the same as the key order below.
    init_kwargs = {
        'n_estimators': trial.suggest_int('n_estimators', 300, 1000),
        'max_depth': trial.suggest_int('max_depth', 5, 16),
        'max_features': trial.suggest_uniform('max_features', .6, 1),
        'min_samples_leaf': trial.suggest_uniform('min_samples_leaf', 0.1, 0.5),
        'random_state': trial.suggest_int('random_state', 1, 9999),
    }

    args = {
        'columns': param['columns'],
        'kfold': {
            'n_splits': 3,
            'random_state': 1985,
            'shuffle': True,
            'type': 'stratified',
        },
        'scaler': {
            'cls': 'StandardScaler',
        },
        'algorithm': {
            'cls': 'RandomForestRegressor',
            'init': init_kwargs,
            'fit': {},
        },
    }

    # Only the per-fold history is needed here; the other three results are ignored.
    df_his, _, _, _ = EP.process(df_train, args, df_test=df_test, trial=mytrial, remark='tune 239')
    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)

    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', val_mae_mean - train_mae_mean)
    trial.set_user_attr('val_mae_var', val_mae_var)
    # Previously computed but dropped; recorded alongside val_mae_var.
    trial.set_user_attr('train_mae_var', train_mae_var)

    return np.abs(val_mae_mean - train_mae_mean) * val_mae_mean

study = optuna.create_study()
study.optimize(objective, n_trials=200)

[I 2019-05-16 11:21:54,560] Finished a trial resulted in value: 0.009791532414118622. Current best value is 0.009791532414118622 with parameters: {'n_estimators': 313, 'max_depth': 15, 'max_features': 0.8665690788014577, 'min_samples_leaf': 0.17569654189945114, 'random_state': 1079}.
[I 2019-05-16 11:21:56,744] Finished a trial resulted in value: 3.214850544226126e-05. Current best value is 3.214850544226126e-05 with parameters: {'n_estimators': 415, 'max_depth': 5, 'max_features': 0.7258816716602419, 'min_samples_leaf': 0.41814773556999696, 'random_state': 8703}.
[I 2019-05-16 11:24:52,614] Finished a trial resulted in value: 0.003947792491582396. Current best value is 3.214850544226126e-05 with parameters: {'n_estimators': 415, 'max_depth': 5, 'max_features': 0.7258816716602419, 'min_samples_leaf': 0.41814773556999696, 'random_state': 8703}.
[I 2019-05-16 11:24:56,193] Finished a trial resulted in value: 2.4381182510774586e-05. Current best value is 2.4381182510774586e-05 with parame

[I 2019-05-16 11:56:08,355] Finished a trial resulted in value: 0.009553415370049239. Current best value is 2.0912574225032743e-05 with parameters: {'n_estimators': 610, 'max_depth': 11, 'max_features': 0.750742298901108, 'min_samples_leaf': 0.32782125720845473, 'random_state': 7711}.
[I 2019-05-16 11:56:11,581] Finished a trial resulted in value: 2.9448741722195382e-05. Current best value is 2.0912574225032743e-05 with parameters: {'n_estimators': 610, 'max_depth': 11, 'max_features': 0.750742298901108, 'min_samples_leaf': 0.32782125720845473, 'random_state': 7711}.
[I 2019-05-16 11:56:15,004] Finished a trial resulted in value: 2.0303237668410467e-05. Current best value is 2.0303237668410467e-05 with parameters: {'n_estimators': 711, 'max_depth': 13, 'max_features': 0.9603185922586197, 'min_samples_leaf': 0.3295477459606905, 'random_state': 682}.
[I 2019-05-16 12:00:32,445] Finished a trial resulted in value: 0.012424915720774076. Current best value is 2.0303237668410467e-05 with par

[I 2019-05-16 12:46:07,134] Finished a trial resulted in value: 3.563665290727624e-05. Current best value is 2.0303237668410467e-05 with parameters: {'n_estimators': 711, 'max_depth': 13, 'max_features': 0.9603185922586197, 'min_samples_leaf': 0.3295477459606905, 'random_state': 682}.
[I 2019-05-16 12:50:31,619] Finished a trial resulted in value: 0.012226140496411296. Current best value is 2.0303237668410467e-05 with parameters: {'n_estimators': 711, 'max_depth': 13, 'max_features': 0.9603185922586197, 'min_samples_leaf': 0.3295477459606905, 'random_state': 682}.
[I 2019-05-16 12:50:34,918] Finished a trial resulted in value: 3.375775010670393e-05. Current best value is 2.0303237668410467e-05 with parameters: {'n_estimators': 711, 'max_depth': 13, 'max_features': 0.9603185922586197, 'min_samples_leaf': 0.3295477459606905, 'random_state': 682}.
[I 2019-05-16 12:53:16,326] Finished a trial resulted in value: 0.009681019435607893. Current best value is 2.0303237668410467e-05 with paramet

[I 2019-05-16 13:09:22,507] Finished a trial resulted in value: 2.9262662017342613e-05. Current best value is 2.0303237668410467e-05 with parameters: {'n_estimators': 711, 'max_depth': 13, 'max_features': 0.9603185922586197, 'min_samples_leaf': 0.3295477459606905, 'random_state': 682}.
[I 2019-05-16 13:13:50,748] Finished a trial resulted in value: 0.009777737364160573. Current best value is 2.0303237668410467e-05 with parameters: {'n_estimators': 711, 'max_depth': 13, 'max_features': 0.9603185922586197, 'min_samples_leaf': 0.3295477459606905, 'random_state': 682}.
[I 2019-05-16 13:13:53,974] Finished a trial resulted in value: 2.4367266380041664e-05. Current best value is 2.0303237668410467e-05 with parameters: {'n_estimators': 711, 'max_depth': 13, 'max_features': 0.9603185922586197, 'min_samples_leaf': 0.3295477459606905, 'random_state': 682}.
[I 2019-05-16 13:18:06,855] Finished a trial resulted in value: 0.009058289181296822. Current best value is 2.0303237668410467e-05 with param

[I 2019-05-16 13:46:59,032] Finished a trial resulted in value: 2.9243926286452747e-05. Current best value is 2.0303237668410467e-05 with parameters: {'n_estimators': 711, 'max_depth': 13, 'max_features': 0.9603185922586197, 'min_samples_leaf': 0.3295477459606905, 'random_state': 682}.
[I 2019-05-16 13:47:02,115] Finished a trial resulted in value: 2.7661707493504013e-05. Current best value is 2.0303237668410467e-05 with parameters: {'n_estimators': 711, 'max_depth': 13, 'max_features': 0.9603185922586197, 'min_samples_leaf': 0.3295477459606905, 'random_state': 682}.
[I 2019-05-16 13:47:05,284] Finished a trial resulted in value: 2.8433850250541157e-05. Current best value is 2.0303237668410467e-05 with parameters: {'n_estimators': 711, 'max_depth': 13, 'max_features': 0.9603185922586197, 'min_samples_leaf': 0.3295477459606905, 'random_state': 682}.
[I 2019-05-16 13:51:05,450] Finished a trial resulted in value: 0.009300572103118987. Current best value is 2.0303237668410467e-05 with par

[I 2019-05-16 14:18:39,360] Finished a trial resulted in value: 2.6444444087157557e-05. Current best value is 1.9900041562898928e-05 with parameters: {'n_estimators': 652, 'max_depth': 16, 'max_features': 0.7500629797446907, 'min_samples_leaf': 0.4429268393520309, 'random_state': 6847}.
[I 2019-05-16 14:18:41,999] Finished a trial resulted in value: 2.6520450414066636e-05. Current best value is 1.9900041562898928e-05 with parameters: {'n_estimators': 652, 'max_depth': 16, 'max_features': 0.7500629797446907, 'min_samples_leaf': 0.4429268393520309, 'random_state': 6847}.
[I 2019-05-16 14:18:45,971] Finished a trial resulted in value: 2.742585299977418e-05. Current best value is 1.9900041562898928e-05 with parameters: {'n_estimators': 652, 'max_depth': 16, 'max_features': 0.7500629797446907, 'min_samples_leaf': 0.4429268393520309, 'random_state': 6847}.
[I 2019-05-16 14:18:49,424] Finished a trial resulted in value: 2.8017125278094542e-05. Current best value is 1.9900041562898928e-05 with

[I 2019-05-16 14:37:27,311] Finished a trial resulted in value: 0.009319104201636065. Current best value is 1.6534029788788096e-05 with parameters: {'n_estimators': 580, 'max_depth': 15, 'max_features': 0.710077468358555, 'min_samples_leaf': 0.3553661107864019, 'random_state': 1331}.
[I 2019-05-16 14:37:30,097] Finished a trial resulted in value: 2.9660782084349953e-05. Current best value is 1.6534029788788096e-05 with parameters: {'n_estimators': 580, 'max_depth': 15, 'max_features': 0.710077468358555, 'min_samples_leaf': 0.3553661107864019, 'random_state': 1331}.
[I 2019-05-16 14:39:49,661] Finished a trial resulted in value: 0.009326463085768589. Current best value is 1.6534029788788096e-05 with parameters: {'n_estimators': 580, 'max_depth': 15, 'max_features': 0.710077468358555, 'min_samples_leaf': 0.3553661107864019, 'random_state': 1331}.
[I 2019-05-16 14:39:51,461] Finished a trial resulted in value: 2.2182492378641733e-05. Current best value is 1.6534029788788096e-05 with param

[I 2019-05-16 14:40:44,767] Finished a trial resulted in value: 3.459673084583092e-05. Current best value is 1.6534029788788096e-05 with parameters: {'n_estimators': 580, 'max_depth': 15, 'max_features': 0.710077468358555, 'min_samples_leaf': 0.3553661107864019, 'random_state': 1331}.
[I 2019-05-16 14:42:21,592] Finished a trial resulted in value: 0.009028769174299752. Current best value is 1.6534029788788096e-05 with parameters: {'n_estimators': 580, 'max_depth': 15, 'max_features': 0.710077468358555, 'min_samples_leaf': 0.3553661107864019, 'random_state': 1331}.
[I 2019-05-16 14:42:23,357] Finished a trial resulted in value: 3.0172228895557495e-05. Current best value is 1.6534029788788096e-05 with parameters: {'n_estimators': 580, 'max_depth': 15, 'max_features': 0.710077468358555, 'min_samples_leaf': 0.3553661107864019, 'random_state': 1331}.
[I 2019-05-16 14:42:25,389] Finished a trial resulted in value: 2.6266003725257064e-05. Current best value is 1.6534029788788096e-05 with para

In [43]:
# Take the stored test predictions of history row 342.
# NOTE(review): 342 is a hard-coded row label — confirm it is the trial you intend to submit.
df_test_pred = df_trial.loc[342, 'df_test_pred']

In [44]:
# Build the submission: one row per test segment, with the prediction averaged
# across every column of df_test_pred except 'index'.
# Both columns are taken as plain arrays so they are paired by position; assigning
# df_test_pred['index'] as a Series would align on index labels and could yield
# NaN or misaligned seg_ids if df_test_pred does not carry a default 0..n-1 index.
df_submit = pd.DataFrame({
    'time_to_failure': np.mean(df_test_pred.drop(columns=['index']).values, axis=1),
    'seg_id': df_test_pred['index'].values,
})
df_submit.to_csv('submission.csv', index=False)

In [15]:
# Rebuild the history frame so it reflects the current contents of mytrial.
df_trial = pd.DataFrame(mytrial)

In [16]:
# Ten best 'tune 239' entries by validation MAE, restricted to mae_diff < 0.05.
(
    df_trial
    .loc[(df_trial['remark'] == 'tune 239') & (df_trial['mae_diff'] < .05)]
    .sort_values(by=['val_mae'], ascending=True)
    [['datetime', 'nfeatures', 'train_mae', 'train_mae_var', 'val_mae', 'val_mae_var', 'mae_diff']]
    .head(10)
)

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
297,2019-05-16 12:50:31.611238,80,2.080347,3e-06,2.086208,6.3e-05,0.00586
253,2019-05-16 11:44:19.202206,80,2.083453,5e-06,2.089122,7e-05,0.005669
364,2019-05-16 14:15:13.513453,80,2.084805,9e-06,2.090664,8.9e-05,0.005859
341,2019-05-16 13:40:14.663732,80,2.085468,6e-06,2.091099,7.4e-05,0.005631
293,2019-05-16 12:45:59.477880,80,2.090297,1.5e-05,2.095703,0.000101,0.005406
271,2019-05-16 12:00:32.439998,80,2.090928,2e-05,2.096853,0.000124,0.005926
357,2019-05-16 13:59:51.703358,80,2.128144,0.000204,2.133297,0.000332,0.005153
274,2019-05-16 12:07:51.055373,80,2.156412,4e-06,2.160924,6.5e-05,0.004512
282,2019-05-16 12:22:36.347932,80,2.164576,6.5e-05,2.169141,0.000146,0.004565
334,2019-05-16 13:25:59.906170,80,2.234614,5e-06,2.237242,3.8e-05,0.002628


In [31]:
# Show the summary metrics of history row 122 as a one-row frame.
df_trial.loc[
    122:122,
    ['datetime', 'remark', 'nfeatures', 'train_mae', 'train_mae_var', 'val_mae', 'val_mae_var', 'mae_diff'],
]

Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
122,2019-05-12 01:47:09.098802,tune feats selected by group,80,2.085693,1.3e-05,2.090886,0.000503,0.005193


In [18]:
# Persist the history frame; this overwrites the same pickle file that the
# notebook loads `mytrial` from.
df_trial.to_pickle('../trial/randomforest.pkl')

In [35]:
# Display the configuration dict stored with history row 122.
df_trial.loc[122, 'param']

{'columns': ['q25_roll_std_100',
  'abs_q01_4',
  'q05_roll_std_1000',
  'abs_q25_5',
  'spkt_welch_density__coeff_3',
  'q05_roll_std_100',
  "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
  'median__roll_std',
  'min__roll_std',
  "number_peaks{'n': 10}",
  'abs_q75_6',
  'q01_roll_mean_1000',
  'abs_q75_7',
  'spkt_welch_densitycoeff_2',
  'min_roll_std_100',
  'max_to_min',
  'max_to_min_6',
  'iqr_6',
  'abs_max_8',
  'abs_max_roll_mean_100',
  'abs_max_roll_mean_1000',
  'min__roll_mean',
  'abs_max_7',
  'max_to_min_1',
  '5000rms_quantile25',
  'abs_max_6',
  'q99_roll_mean_1000',
  'max_to_min_5',
  'max_9',
  'abs_max_1',
  'skew_1',
  'mean_change_rate',
  "autocorrelation{'lag': 5}",
  'q95_roll_mean_1000',
  'abs_max_5',
  "number_crossing_m{'m': 1}",
  'max_to_min_diff_5',
  "value_count{'value': 1}",
  'abs_q99_7',
  'min_9',
  'abs_q75_2',
  'med_8',
  'abs_max_4',
  'abs_q05_2',
  "change_quantiles{'ql': 0.2, 'qh': 0.4, 'isabs': True, 'f_agg'