In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler

import lightgbm as lgb
import catboost as cb
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor

from tsfresh.examples import load_robot_execution_failures
from tsfresh import extract_features, select_features
import optuna

from common import EP

import types

Using TensorFlow backend.


In [2]:
%%HTML
<style>
   /* Widen the notebook page, menu bar and toolbar containers so wide tables fit. */
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [3]:
# Load the pickled train and test frames from ../feats (train first, then test).
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
df_train, df_test = (
    pd.read_pickle(pkl_path)
    for pkl_path in ('../feats/df_train.pkl', '../feats/df_test.pkl')
)

In [4]:
# Bucket the target into integer classes: values below 15 are truncated to
# int, everything else (including NaN, which fails the `<` test) becomes 15.
df_train['label'] = df_train['y'].apply(lambda x: int(x) if x < 15 else 15)

# Derive the group id from 'season', folding season 17 into group 1.
# Series.where builds a new Series, so nothing is written through a NumPy view
# of the frame. The previous `.values[...] = 1` assignment mutated the
# 'season' column as a side effect and raises "assignment destination is
# read-only" when pandas copy-on-write is enabled.
group = df_train['season'].where(df_train['season'] != 17, 1).values
df_train['group'] = group  # ndarray -> assigned positionally, as before
df_train = df_train.drop(columns=['season'])

In [5]:
# Run tsfresh's select_features on the training features against target 'y',
# with both inputs indexed by the 'index' column, and keep the surviving names.
# NOTE(review): despite the `test_` prefix, both objects are built from df_train.
test_X = df_train.set_index('index').drop(columns=['y', 'group', 'label'])
test_y = df_train.set_index('index')['y'].copy()
tsfresh_columns = select_features(test_X, test_y).columns.tolist()

In [6]:
# Every df_train column except the identifier, target and CV helper columns.
original_columns = df_train.drop(columns=['index', 'y', 'label', 'group']).columns.tolist()

In [7]:
# Resume the saved trial history as a list of per-row dicts
# (use `mytrial = []` instead to start with an empty history).
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
mytrial = list(
    pd.read_pickle('../trial/extratrees.pkl')
    .T
    .to_dict()
    .values()
)
df_trial = pd.DataFrame(mytrial)
len(mytrial)

427

In [9]:
# Baseline experiment configuration. It is passed to EP.process and
# EP.select_features_ below, and its 'columns' list is reused by the Optuna
# objective.
#   'columns'   -- the 70 feature names used for this configuration.
#   'kfold'     -- cross-validation settings (interpreted by common.EP).
#   'scaler'    -- name of the scaler class to apply (interpreted by common.EP).
#   'algorithm' -- estimator class name plus 'init' and 'fit' keyword dicts;
#                  presumably forwarded to the estimator constructor / fit()
#                  by common.EP -- TODO confirm.
# NOTE(review): several feature names look irregular next to their siblings
# (e.g. 'spkt_welch_densitycoeff_2' vs 'spkt_welch_density__coeff_29',
# 'partial_autocorrelationlag_5', 'min__roll_std'). They are presumably the
# literal column names in df_train -- verify before "fixing" any of them.
param={'columns': ['q25_roll_std_100',
  'abs_q25_5',
  'abs_q01_4',
  'iqr_6',
  'abs_q75_6',
  'mean_change_rate',
  'abs_q25_7',
  'abs_q75_7',
  'q05_2',
  'q01_roll_std_10',
  "change_quantiles{'ql': 0.4, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
  'q25_1',
  'spkt_welch_density__coeff_29',
  '3th_peak_freq',
  'spkt_welch_densitycoeff_2',
  'abs_q95_2',
  'abs_q25_9',
  'abs_q25_1',
  'min__roll_std',
  'spkt_welch_density__coeff_30',
  'spkt_welch_density__coeff_28',
  'iqr_8',
  'abs_q75_1',
  'abs_q95_8',
  'abs_max_8',
  'kurt_7',
  "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
  'fft_coefficientcoeff_26__attr_"abs"',
  'abs_max_7',
  'fft_coefficientcoeff_21__attr_"abs"',
  'q95_9',
  'spkt_welch_density__coeff_3',
  'abs_q01_7',
  'abs_q99_8',
  'abs_q99_7',
  'fft_coefficientcoeff_32__attr_"imag"',
  'peak_to_average_power_ratio__roll_mean',
  'Hilbert_mean_1',
  'iqr_9',
  'fft_coefficientcoeff_20__attr_"abs"',
  'min_roll_std_100',
  'FFT_Mag_75q0',
  'fft_coefficientcoeff_16__attr_"imag"',
  "change_quantiles{'ql': 0.0, 'qh': 0.8, 'isabs': False, 'f_agg': 'mean'}",
  'partial_autocorrelationlag_5',
  'med_6',
  'abs_q25_8',
  'spkt_welch_density__coeff_63',
  'med_8',
  'med_4',
  'spkt_welch_density__coeff_65',
  "autocorrelation{'lag': 5}",
  'abs_q05_9',
  'fft_coefficientcoeff_24__attr_"imag"',
  'med_2',
  'spkt_welch_density__coeff_38',
  'fft_coefficientcoeff_50__attr_"abs"',
  'spkt_welch_density__coeff_50',
  '5000skewness_mean_',
  'ave10_7',
  'spkt_welch_density__coeff_115',
  'spkt_welch_density__coeff_117',
  'spkt_welch_density__coeff_79',
  "change_quantiles{'ql': 0.4, 'qh': 0.6, 'isabs': False, 'f_agg': 'mean'}",
  'spkt_welch_density__coeff_41',
  'partial_autocorrelationlag_1',
  'spkt_welch_density__coeff_113',
  'fft_coefficientcoeff_8__attr_"imag"',
  'spkt_welch_density__coeff_42',
  'spkt_welch_density__coeff_84'],
 # Same fold settings as the hard-coded 'kfold' dict inside objective().
 'kfold': {'n_splits': 3,
  'random_state': 1985,
  'shuffle': True,
  'type': 'stratified'},
 'scaler': {'cls': 'StandardScaler'},
 # NOTE(review): if 'init' reaches sklearn's ExtraTreesRegressor unchanged,
 # the float min_samples_leaf is a fraction of the samples -- confirm in EP.
 'algorithm': {'cls': 'ExtraTreesRegressor',
  'init': {'n_estimators': 449,
   'max_depth': 13,
   'max_features': 0.9606948036865893,
   'min_samples_leaf': 0.10365931931330866,
   'random_state': 1425},
  'fit': {}},
}

In [10]:
# Evaluate the baseline `param` once. EP.process returns the per-fold history,
# feature importances, validation predictions and test predictions.
# NOTE(review): the run appears to be appended to `mytrial` by EP -- confirm.
(df_his, df_feature_importances, df_valid_pred, df_test_pred) = EP.process(
    df_train,
    param,
    df_test=df_test,
    trial=mytrial,
)

In [11]:
# Check feature importances (disabled; the commented-out code below is kept for reference only)
# df_feature_importances = df_trial.loc[0]['df_feature_importances']
# if type(df_feature_importances)==pd.DataFrame:
#     sorted_columns = EP.evaluate(df_feature_importances, key='average_model_weight')
# else:
#     sorted_columns = df_trial.loc[0]['param']['columns']
# df_feature_importances.sort_values(by=['average_permutation_weight'], ascending=False)

In [21]:
# Feature selection keyed on 'average_permutation_weight'.
# NOTE(review): the exact meaning of nfeats_best / nfeats_removed_per_try is
# defined in common.EP -- confirm there.
EP.select_features_(
    df_train,
    param,
    mytrial,
    nfeats_best=25,
    nfeats_removed_per_try=5,
    key='average_permutation_weight',
)

In [11]:
# Rebuild the trial table from `mytrial` and show the summary of the last row.
# Optional filter kept for reference:
# [(df_trial['mae_diff']<.05)].sort_values(by=['val_mae'], ascending=True)
df_trial = pd.DataFrame(mytrial)
df_trial.loc[
    :,
    ['datetime', 'remark', 'nfeatures', 'train_mae', 'train_mae_var',
     'val_mae', 'val_mae_var', 'mae_diff'],
].tail(1)

Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
427,2019-05-16 11:17:34.168161,,70,2.107233,1.1e-05,2.108282,2.9e-05,0.00105


In [12]:
#  tune hyperparameters
def objective(trial):
    """Optuna objective: evaluate one ExtraTreesRegressor configuration.

    Samples the estimator hyperparameters from `trial`, runs `EP.process` on
    `df_train` with the feature list of the notebook-level `param`, and stores
    summary metrics as user attributes on the trial.

    Parameters
    ----------
    trial : optuna trial object
        Used for `suggest_int` / `suggest_uniform` and `set_user_attr`.

    Returns
    -------
    float
        ``abs(val_mae - train_mae) * val_mae`` (the study minimises this),
        where both MAEs are means over ``df_his.valid`` / ``df_his.train``.

    Notes
    -----
    Reads the notebook globals `param`, `df_train`, `df_test` and `mytrial`.
    Each run is tagged with remark ``'tune 427'``.

    NOTE(review): the objective rewards a small train/validation gap, so it
    can favour heavily regularised (underfit) models; `random_state` is also
    searched as if it were a hyperparameter. Both are kept as-is here.
    NOTE(review): `suggest_uniform` is deprecated in recent Optuna releases in
    favour of `suggest_float`; kept for compatibility with the installed
    version.
    """
    n_estimators = trial.suggest_int('n_estimators', 300, 1000)
    max_depth = trial.suggest_int('max_depth', 5, 16)
    max_features = trial.suggest_uniform('max_features', .6, 1)
    min_samples_leaf = trial.suggest_uniform('min_samples_leaf', 0.1, 0.5)
    random_state = trial.suggest_int('random_state', 1, 9999)

    args={
        'columns':param['columns'],
        'kfold':{
            'n_splits': 3,
            'random_state': 1985,
            'shuffle': True,
            'type': 'stratified'
        },
        'scaler':{
            'cls':'StandardScaler',
        },
        'algorithm':{
            'cls':'ExtraTreesRegressor',
            'init':{
                "n_estimators":n_estimators,
                "max_depth":max_depth,
                "max_features":max_features,
                "min_samples_leaf":min_samples_leaf,
                "random_state":random_state,
            },
            'fit':{
#                 'eval_metric':'mae',
#                 'verbose':False,
#                 'early_stopping_rounds':200,
            },
        },
        'feature_importance':{
            'is_output':False,
            'permutation_feature_importance':False,
            'permutation_random_state':1,
        },
    }

    df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train, args, df_test = df_test, trial=mytrial, remark='tune 427')

    # Cast to plain floats so the stored attributes are ordinary Python numbers.
    val_mae_mean = float(np.mean(df_his.valid))
    val_mae_var = float(np.var(df_his.valid))
    train_mae_mean = float(np.mean(df_his.train))
    train_mae_var = float(np.var(df_his.train))

    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', val_mae_mean-train_mae_mean)
    trial.set_user_attr('val_mae_var', val_mae_var)
    # Previously computed but never recorded; stored alongside val_mae_var.
    trial.set_user_attr('train_mae_var', train_mae_var)

    return abs(val_mae_mean - train_mae_mean)*val_mae_mean

# Create a study with Optuna's defaults and run 200 trials of `objective`.
# NOTE(review): no sampler seed is set, so the search is not reproducible.
study = optuna.create_study()
study.optimize(func=objective, n_trials=200)

[I 2019-05-16 11:19:24,055] Finished a trial resulted in value: 5.9455416189456365e-05. Current best value is 5.9455416189456365e-05 with parameters: {'n_estimators': 568, 'max_depth': 7, 'max_features': 0.658490832845435, 'min_samples_leaf': 0.48023468199422237, 'random_state': 8336}.
[I 2019-05-16 11:19:46,720] Finished a trial resulted in value: 0.0018226672439548488. Current best value is 5.9455416189456365e-05 with parameters: {'n_estimators': 568, 'max_depth': 7, 'max_features': 0.658490832845435, 'min_samples_leaf': 0.48023468199422237, 'random_state': 8336}.
[I 2019-05-16 11:19:53,734] Finished a trial resulted in value: 0.00039494338592002313. Current best value is 5.9455416189456365e-05 with parameters: {'n_estimators': 568, 'max_depth': 7, 'max_features': 0.658490832845435, 'min_samples_leaf': 0.48023468199422237, 'random_state': 8336}.
[I 2019-05-16 11:20:25,488] Finished a trial resulted in value: 0.0016347691654713177. Current best value is 5.9455416189456365e-05 with par

[I 2019-05-16 11:28:22,378] Finished a trial resulted in value: 2.539969968711043e-05. Current best value is 1.413075457631864e-06 with parameters: {'n_estimators': 569, 'max_depth': 7, 'max_features': 0.6568053816341548, 'min_samples_leaf': 0.49530453030419447, 'random_state': 8330}.
[I 2019-05-16 11:28:33,342] Finished a trial resulted in value: 8.094698828800649e-05. Current best value is 1.413075457631864e-06 with parameters: {'n_estimators': 569, 'max_depth': 7, 'max_features': 0.6568053816341548, 'min_samples_leaf': 0.49530453030419447, 'random_state': 8330}.
[I 2019-05-16 11:29:16,420] Finished a trial resulted in value: 0.0008683732359859134. Current best value is 1.413075457631864e-06 with parameters: {'n_estimators': 569, 'max_depth': 7, 'max_features': 0.6568053816341548, 'min_samples_leaf': 0.49530453030419447, 'random_state': 8330}.
[I 2019-05-16 11:30:08,437] Finished a trial resulted in value: 0.0025331172747831885. Current best value is 1.413075457631864e-06 with parame

[I 2019-05-16 11:37:07,722] Finished a trial resulted in value: 0.00038860014644449115. Current best value is 1.413075457631864e-06 with parameters: {'n_estimators': 569, 'max_depth': 7, 'max_features': 0.6568053816341548, 'min_samples_leaf': 0.49530453030419447, 'random_state': 8330}.
[I 2019-05-16 11:37:50,876] Finished a trial resulted in value: 0.0013504302200625716. Current best value is 1.413075457631864e-06 with parameters: {'n_estimators': 569, 'max_depth': 7, 'max_features': 0.6568053816341548, 'min_samples_leaf': 0.49530453030419447, 'random_state': 8330}.
[I 2019-05-16 11:38:00,658] Finished a trial resulted in value: 0.0006577608501063127. Current best value is 1.413075457631864e-06 with parameters: {'n_estimators': 569, 'max_depth': 7, 'max_features': 0.6568053816341548, 'min_samples_leaf': 0.49530453030419447, 'random_state': 8330}.
[I 2019-05-16 11:38:11,854] Finished a trial resulted in value: 2.28884425417377e-05. Current best value is 1.413075457631864e-06 with parame

[I 2019-05-16 11:44:54,133] Finished a trial resulted in value: 0.00033649160932251954. Current best value is 1.413075457631864e-06 with parameters: {'n_estimators': 569, 'max_depth': 7, 'max_features': 0.6568053816341548, 'min_samples_leaf': 0.49530453030419447, 'random_state': 8330}.
[I 2019-05-16 11:45:09,074] Finished a trial resulted in value: 0.0005549512943279037. Current best value is 1.413075457631864e-06 with parameters: {'n_estimators': 569, 'max_depth': 7, 'max_features': 0.6568053816341548, 'min_samples_leaf': 0.49530453030419447, 'random_state': 8330}.
[I 2019-05-16 11:45:20,092] Finished a trial resulted in value: 0.0005389969004426302. Current best value is 1.413075457631864e-06 with parameters: {'n_estimators': 569, 'max_depth': 7, 'max_features': 0.6568053816341548, 'min_samples_leaf': 0.49530453030419447, 'random_state': 8330}.
[I 2019-05-16 11:45:32,653] Finished a trial resulted in value: 0.00020741362021878634. Current best value is 1.413075457631864e-06 with para

[I 2019-05-16 11:51:41,499] Finished a trial resulted in value: 0.0007335099849813293. Current best value is 1.413075457631864e-06 with parameters: {'n_estimators': 569, 'max_depth': 7, 'max_features': 0.6568053816341548, 'min_samples_leaf': 0.49530453030419447, 'random_state': 8330}.
[I 2019-05-16 11:51:57,617] Finished a trial resulted in value: 0.0010958047722433497. Current best value is 1.413075457631864e-06 with parameters: {'n_estimators': 569, 'max_depth': 7, 'max_features': 0.6568053816341548, 'min_samples_leaf': 0.49530453030419447, 'random_state': 8330}.
[I 2019-05-16 11:52:24,451] Finished a trial resulted in value: 0.0007594323325203479. Current best value is 1.413075457631864e-06 with parameters: {'n_estimators': 569, 'max_depth': 7, 'max_features': 0.6568053816341548, 'min_samples_leaf': 0.49530453030419447, 'random_state': 8330}.
[I 2019-05-16 11:52:40,883] Finished a trial resulted in value: 0.0006072645782483069. Current best value is 1.413075457631864e-06 with parame

[I 2019-05-16 12:00:16,180] Finished a trial resulted in value: 0.00029864409225557745. Current best value is 1.413075457631864e-06 with parameters: {'n_estimators': 569, 'max_depth': 7, 'max_features': 0.6568053816341548, 'min_samples_leaf': 0.49530453030419447, 'random_state': 8330}.
[I 2019-05-16 12:00:33,575] Finished a trial resulted in value: 0.0012729468705312092. Current best value is 1.413075457631864e-06 with parameters: {'n_estimators': 569, 'max_depth': 7, 'max_features': 0.6568053816341548, 'min_samples_leaf': 0.49530453030419447, 'random_state': 8330}.
[I 2019-05-16 12:00:58,222] Finished a trial resulted in value: 0.0006910009979206161. Current best value is 1.413075457631864e-06 with parameters: {'n_estimators': 569, 'max_depth': 7, 'max_features': 0.6568053816341548, 'min_samples_leaf': 0.49530453030419447, 'random_state': 8330}.
[I 2019-05-16 12:01:10,128] Finished a trial resulted in value: 6.718797831918511e-05. Current best value is 1.413075457631864e-06 with param

[I 2019-05-16 12:06:35,379] Finished a trial resulted in value: 0.00018707668636231136. Current best value is 1.413075457631864e-06 with parameters: {'n_estimators': 569, 'max_depth': 7, 'max_features': 0.6568053816341548, 'min_samples_leaf': 0.49530453030419447, 'random_state': 8330}.
[I 2019-05-16 12:06:50,500] Finished a trial resulted in value: 0.0004871785391019453. Current best value is 1.413075457631864e-06 with parameters: {'n_estimators': 569, 'max_depth': 7, 'max_features': 0.6568053816341548, 'min_samples_leaf': 0.49530453030419447, 'random_state': 8330}.
[I 2019-05-16 12:07:05,622] Finished a trial resulted in value: 0.0002945429088690703. Current best value is 1.413075457631864e-06 with parameters: {'n_estimators': 569, 'max_depth': 7, 'max_features': 0.6568053816341548, 'min_samples_leaf': 0.49530453030419447, 'random_state': 8330}.
[I 2019-05-16 12:07:19,294] Finished a trial resulted in value: 0.0005725958099408899. Current best value is 1.413075457631864e-06 with param

[I 2019-05-16 12:15:57,739] Finished a trial resulted in value: 6.025859328634948e-05. Current best value is 1.413075457631864e-06 with parameters: {'n_estimators': 569, 'max_depth': 7, 'max_features': 0.6568053816341548, 'min_samples_leaf': 0.49530453030419447, 'random_state': 8330}.
[I 2019-05-16 12:16:33,375] Finished a trial resulted in value: 0.000977925188692752. Current best value is 1.413075457631864e-06 with parameters: {'n_estimators': 569, 'max_depth': 7, 'max_features': 0.6568053816341548, 'min_samples_leaf': 0.49530453030419447, 'random_state': 8330}.
[I 2019-05-16 12:16:47,679] Finished a trial resulted in value: 6.313780165550698e-05. Current best value is 1.413075457631864e-06 with parameters: {'n_estimators': 569, 'max_depth': 7, 'max_features': 0.6568053816341548, 'min_samples_leaf': 0.49530453030419447, 'random_state': 8330}.
[I 2019-05-16 12:17:24,529] Finished a trial resulted in value: 0.001588125799618006. Current best value is 1.413075457631864e-06 with paramete

In [43]:
# Take the stored test predictions of trial 342.
# NOTE(review): 342 is a hand-picked row label; this cell was executed out of
# order (In [43]), so confirm the row still refers to the intended trial.
df_test_pred = df_trial.loc[342, 'df_test_pred']

In [44]:
# Build the submission: average every prediction column of df_test_pred
# (all columns except 'index'; presumably one per CV fold -- confirm) row by
# row, pair it with the segment id and write submission.csv.
#
# Both columns are passed as plain arrays so they are matched by position.
# The previous version assigned `df_test_pred['index']` as a Series onto a
# frame with a fresh RangeIndex, which aligns on index labels and produces
# NaN or misordered seg_ids whenever df_test_pred has a non-default index.
df_submit = pd.DataFrame(
    {
        # np.mean on the raw values keeps NaN propagation, as before
        'time_to_failure': np.mean(df_test_pred.drop(columns=['index']).values, axis=1),
        'seg_id': df_test_pred['index'].values,
    },
    columns=['time_to_failure', 'seg_id'],
)
df_submit.to_csv('submission.csv', index=False)

In [14]:
# Rebuild the trial table so it includes the runs added to `mytrial` since the last rebuild.
df_trial = pd.DataFrame(data=mytrial)

In [15]:
# Ten best 'tune 427' runs by validation MAE, restricted to runs whose
# train/validation gap (mae_diff) is below 0.05.
# Alternative filters kept for reference:
#[df_trial['mae_diff']<.05].sort_values(by=['val_mae']) | (df_trial['remark']=='tune feats selected by group ')
(
    df_trial
    .loc[lambda d: (d['remark']=='tune 427') & (d['mae_diff']<.05)]
    .sort_values(by=['val_mae'], ascending=True)
    .loc[:, ['datetime', 'nfeatures', 'train_mae', 'train_mae_var',
             'val_mae', 'val_mae_var', 'mae_diff']]
    .head(10)
)

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
459,2019-05-16 11:30:08.432873,70,2.109147,1.1e-05,2.110347,2.5e-05,0.0012
616,2019-05-16 12:13:38.852798,70,2.115638,1.7e-05,2.116674,2.5e-05,0.001036
432,2019-05-16 11:21:27.170786,70,2.118178,8e-06,2.119219,2.6e-05,0.001041
470,2019-05-16 11:33:16.899580,70,2.124185,2e-05,2.12534,7e-06,0.001155
433,2019-05-16 11:22:16.142229,70,2.13769,1.5e-05,2.138632,3.3e-05,0.000941
431,2019-05-16 11:20:25.486902,70,2.142068,1.5e-05,2.142831,2.9e-05,0.000763
507,2019-05-16 11:43:36.810574,70,2.145746,8e-06,2.146541,3.1e-05,0.000795
548,2019-05-16 11:54:16.469147,70,2.149583,2e-06,2.150016,3.2e-05,0.000433
505,2019-05-16 11:42:44.985943,70,2.150204,3.2e-05,2.151213,1.1e-05,0.001009
565,2019-05-16 11:59:41.200415,70,2.152462,2e-05,2.15337,1.1e-05,0.000908


In [16]:
# Summary of trial 459 as a one-row frame (label slice 459:459 keeps it 2-D).
df_trial.loc[
    459:459,
    ['datetime', 'remark', 'nfeatures', 'train_mae', 'train_mae_var',
     'val_mae', 'val_mae_var', 'mae_diff'],
]

Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
459,2019-05-16 11:30:08.432873,tune 427,70,2.109147,1.1e-05,2.110347,2.5e-05,0.0012


In [18]:
# Persist the trial table; this overwrites the same file the history was loaded from.
df_trial.to_pickle(path='../trial/extratrees.pkl')

In [29]:
# Display the stored parameter dict of trial 250.
df_trial.loc[250, 'param']

{'columns': ['q25_roll_std_100',
  'abs_q25_5',
  'abs_q01_4',
  'iqr_6',
  'abs_q75_6',
  'mean_change_rate',
  'abs_q25_7',
  'abs_q75_7',
  'q05_2',
  'q01_roll_std_10',
  "change_quantiles{'ql': 0.4, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
  'q25_1',
  'spkt_welch_density__coeff_29',
  '3th_peak_freq',
  'spkt_welch_densitycoeff_2',
  'abs_q95_2',
  'abs_q25_9',
  'abs_q25_1',
  'min__roll_std',
  'spkt_welch_density__coeff_30',
  'spkt_welch_density__coeff_28',
  'iqr_8',
  'abs_q75_1',
  'abs_q95_8',
  'abs_max_8',
  'kurt_7',
  "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
  'fft_coefficientcoeff_26__attr_"abs"',
  'abs_max_7',
  'fft_coefficientcoeff_21__attr_"abs"',
  'q95_9',
  'spkt_welch_density__coeff_3',
  'abs_q01_7',
  'abs_q99_8',
  'abs_q99_7',
  'fft_coefficientcoeff_32__attr_"imag"',
  'peak_to_average_power_ratio__roll_mean',
  'Hilbert_mean_1',
  'iqr_9',
  'fft_coefficientcoeff_20__attr_"abs"',
  'min_roll_std_100',
  'FFT_Mag_75q