In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler

import lightgbm as lgb
import catboost as cb

from tsfresh.examples import load_robot_execution_failures
from tsfresh import extract_features, select_features
import optuna

from common import EP

import types


numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.



In [None]:
%%HTML
<!-- Set the width of the notebook, menu-bar and main-toolbar containers. -->
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [3]:
# Load the pre-computed train / test feature tables.
# NOTE: unpickling can execute arbitrary code -- only read files you trust.
df_train, df_test = (
    pd.read_pickle(path)
    for path in ('../feats/df_train.pkl', '../feats/df_test.pkl')
)

In [4]:
# Integer class label derived from the continuous target: truncate y to an int
# and put every value that is not below 15 into a single bucket labelled 15.
df_train['label'] = df_train['y'].apply(lambda x: int(x) if x < 15 else 15)

# Grouping key: the season id with season 17 folded into season 1.
# Series.replace builds a new Series, so 'season' is no longer modified by
# writing through the array returned by `.values` (that in-place write fails
# when the array is read-only, e.g. with pandas copy-on-write enabled).
group = df_train['season'].replace(17, 1).values
df_train['group'] = group

# 'season' is superseded by 'group' from here on.
# NOTE: this cell is not re-runnable as-is -- a second run finds no 'season' column.
df_train = df_train.drop(columns=['season'])

In [5]:
# Build the (features, target) pair passed to tsfresh's select_features.
# Both are copies of the *training* data (despite the test_ prefix) and are
# re-indexed by the 'index' column so that they share the same row labels.
test_y = df_train['y'].copy()
test_y.index = df_train['index']

test_X = df_train.drop(columns=['y', 'index', 'group', 'label']).copy()
test_X.index = df_train['index']

# Names of the columns kept by select_features.
tsfresh_columns = list(select_features(test_X, test_y).columns)

In [6]:
# All columns of df_train except the id / target / label / group bookkeeping columns.
original_columns = list(df_train.columns.drop(['index', 'y', 'label', 'group']))

In [3]:
# Load the persisted trial log: `mytrial` becomes a list with one dict per row
# of the pickled frame, and `df_trial` is a DataFrame rebuilt from that list.
# To start from an empty log instead, use: mytrial = []
# NOTE(review): this reads 'trial/catboost.pkl' while the save cell writes
# '../trial/catboost.pkl' -- confirm both point at the intended file.
# NOTE: unpickling can execute arbitrary code -- only read files you trust.
mytrial = list(pd.read_pickle('trial/catboost.pkl').T.to_dict().values())
df_trial = pd.DataFrame(mytrial)
len(mytrial)

437

In [16]:
# check feature_importances
# Use the feature-importance table stored with trial 235 when there is one;
# otherwise fall back to the column list recorded in that trial's parameters.
df_feature_importances = df_trial.loc[235]['df_feature_importances']
# isinstance (rather than an exact type comparison) also accepts DataFrame subclasses.
if isinstance(df_feature_importances, pd.DataFrame):
    sorted_columns = EP.evaluate(df_feature_importances, key='average_model_weight')
else:
    sorted_columns = df_trial.loc[235]['param']['columns']
# df_feature_importances.sort_values(by=['average_permutation_weight'], ascending=False)

In [17]:
# Number of feature columns that will be passed on as param['columns'].
len(sorted_columns)

30

In [12]:
# Experiment configuration passed to EP.select_features_ below.
param = {
    'algorithm': {
        'cls': 'cb.CatBoostRegressor',
        # fit options ('eval_metric': 'mae' is intentionally left disabled)
        'fit': {'early_stopping_rounds': 200, 'verbose': False},
        # constructor options ('task_type': "GPU" is intentionally left disabled)
        'init': {},
    },
    # feature columns chosen in the feature-importance cell
    'columns': sorted_columns,
    'feature_importance': {
        'is_output': True,
        'permutation_feature_importance': True,
        'permutation_random_state': 1,
    },
    'kfold': {'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'},
    'scaler': {'cls': 'StandardScaler'},
}

In [55]:
# run one try: re-run the configuration stored with trial 8.
# EP.process returns four frames (history, feature importances, validation
# predictions, test predictions); `mytrial` is handed over as the trial log.
(
    df_his,
    df_feature_importances,
    df_valid_pred,
    df_test_pred,
) = EP.process(
    df_train,
    df_trial.loc[8]['param'],
    df_test=df_test,
    trial=mytrial,
)

In [13]:
# select features, ranking them by 'average_model_weight'
EP.select_features_(
    df_train,
    param,
    mytrial,
    nfeats_best=25,
    nfeats_removed_per_try=10,
    key='average_model_weight',
)

In [19]:
#  tune hyperparameters
def objective(trial):
    """Optuna objective: evaluate one sampled CatBoost configuration via EP.process.

    Samples the CatBoost constructor arguments from ``trial``, runs
    ``EP.process`` on ``df_train`` / ``df_test`` with the feature columns
    recorded for trial 235, and stores summary statistics of the ``train`` and
    ``valid`` columns of the returned history frame as user attributes
    (these columns are treated as MAE scores here -- see the attribute names).

    Parameters
    ----------
    trial
        Optuna trial object; used for ``suggest_int`` / ``suggest_uniform``
        and ``set_user_attr``.

    Returns
    -------
    float
        ``abs(mean(valid) - mean(train)) * mean(valid)``: the train/validation
        gap weighted by the validation score, so both a large gap and a poor
        validation score are penalised.
    """
    # --- sampled search space -------------------------------------------
    num_trees = trial.suggest_int('num_trees', 200, 1000)
    depth = trial.suggest_int('depth', 2, 10)
    learning_rate = trial.suggest_uniform('learning_rate', 0.01, 0.4)
    l2_leaf_reg = trial.suggest_uniform('l2_leaf_reg', 0.001, 100)
    bagging_temperature = trial.suggest_uniform('bagging_temperature', .6, 1)
    random_strength = trial.suggest_uniform('random_strength', .001, 1)
    # NOTE: the model seed is part of the search space as well.
    random_state = trial.suggest_int('random_state', 1, 9999)

    # --- experiment configuration for EP.process ------------------------
    args = {
        'columns': df_trial.loc[235]['param']['columns'],
        'kfold': {
            'n_splits': 8,
            'random_state': 1985,
            'shuffle': True,
            'type': 'stratified'
        },
        'scaler': {
            'cls': 'StandardScaler',
        },
        'algorithm': {
            'cls': 'cb.CatBoostRegressor',
            'init': {
                "num_trees": num_trees,
                "depth": depth,
                "learning_rate": learning_rate,
                "l2_leaf_reg": l2_leaf_reg,
                "bagging_temperature": bagging_temperature,
                "random_strength": random_strength,
                "random_state": random_state,
            },
            'fit': {
                # 'eval_metric':'mae',
                'verbose': False,
                'early_stopping_rounds': 200,
            },
        },
        # feature-importance output is switched off during tuning
        'feature_importance': {
            'is_output': False,
            'permutation_feature_importance': False,
            'permutation_random_state': 1,
        },
    }

    # Only the history frame is needed here; the other three results are discarded.
    df_his, _, _, _ = EP.process(
        df_train, args, df_test=df_test, trial=mytrial,
        remark='tune feats selected by group ',
    )
    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)

    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', val_mae_mean - train_mae_mean)
    trial.set_user_attr('val_mae_var', val_mae_var)
    # Previously computed but never stored; recorded for parity with val_mae_var.
    trial.set_user_attr('train_mae_var', train_mae_var)

    return np.abs(val_mae_mean - train_mae_mean) * val_mae_mean


# create_study() is called with its defaults; the objective value is minimised.
study = optuna.create_study()
study.optimize(objective, n_trials=200)

[I 2019-05-09 23:52:15,496] Finished a trial resulted in value: 0.23782823874437153. Current best value is 0.23782823874437153 with parameters: {'num_trees': 427, 'depth': 5, 'learning_rate': 0.38144574114176105, 'l2_leaf_reg': 59.8749840518477, 'bagging_temperature': 0.946313442752404, 'random_strength': 0.13930353388997965, 'random_state': 3775}.
[I 2019-05-10 00:01:05,266] Finished a trial resulted in value: 0.2653814536119777. Current best value is 0.23782823874437153 with parameters: {'num_trees': 427, 'depth': 5, 'learning_rate': 0.38144574114176105, 'l2_leaf_reg': 59.8749840518477, 'bagging_temperature': 0.946313442752404, 'random_strength': 0.13930353388997965, 'random_state': 3775}.
[I 2019-05-10 00:02:08,705] Finished a trial resulted in value: 0.3005346704633378. Current best value is 0.23782823874437153 with parameters: {'num_trees': 427, 'depth': 5, 'learning_rate': 0.38144574114176105, 'l2_leaf_reg': 59.8749840518477, 'bagging_temperature': 0.946313442752404, 'random_stre

[I 2019-05-10 01:00:24,242] Finished a trial resulted in value: 0.22789557033303964. Current best value is 0.015452438321236802 with parameters: {'num_trees': 671, 'depth': 3, 'learning_rate': 0.011444162636734707, 'l2_leaf_reg': 76.95163767459611, 'bagging_temperature': 0.8227980652968351, 'random_strength': 0.9784055466056077, 'random_state': 2054}.
[I 2019-05-10 01:03:54,712] Finished a trial resulted in value: 0.15591423359918127. Current best value is 0.015452438321236802 with parameters: {'num_trees': 671, 'depth': 3, 'learning_rate': 0.011444162636734707, 'l2_leaf_reg': 76.95163767459611, 'bagging_temperature': 0.8227980652968351, 'random_strength': 0.9784055466056077, 'random_state': 2054}.
[I 2019-05-10 01:10:33,902] Finished a trial resulted in value: 0.45636090127055284. Current best value is 0.015452438321236802 with parameters: {'num_trees': 671, 'depth': 3, 'learning_rate': 0.011444162636734707, 'l2_leaf_reg': 76.95163767459611, 'bagging_temperature': 0.8227980652968351, 

[I 2019-05-10 01:54:36,953] Finished a trial resulted in value: 0.20283246271929173. Current best value is 0.015452438321236802 with parameters: {'num_trees': 671, 'depth': 3, 'learning_rate': 0.011444162636734707, 'l2_leaf_reg': 76.95163767459611, 'bagging_temperature': 0.8227980652968351, 'random_strength': 0.9784055466056077, 'random_state': 2054}.
[I 2019-05-10 01:55:02,546] Finished a trial resulted in value: 0.082879097616782. Current best value is 0.015452438321236802 with parameters: {'num_trees': 671, 'depth': 3, 'learning_rate': 0.011444162636734707, 'l2_leaf_reg': 76.95163767459611, 'bagging_temperature': 0.8227980652968351, 'random_strength': 0.9784055466056077, 'random_state': 2054}.
[I 2019-05-10 01:56:16,063] Finished a trial resulted in value: 0.251055848826. Current best value is 0.015452438321236802 with parameters: {'num_trees': 671, 'depth': 3, 'learning_rate': 0.011444162636734707, 'l2_leaf_reg': 76.95163767459611, 'bagging_temperature': 0.8227980652968351, 'random

[I 2019-05-10 03:03:05,428] Finished a trial resulted in value: 0.02595757144358451. Current best value is 0.012773930232106656 with parameters: {'num_trees': 794, 'depth': 2, 'learning_rate': 0.011034654224196944, 'l2_leaf_reg': 95.69985890951777, 'bagging_temperature': 0.8326475639973028, 'random_strength': 0.8024641397038226, 'random_state': 937}.
[I 2019-05-10 03:04:09,466] Finished a trial resulted in value: 0.31165833678588917. Current best value is 0.012773930232106656 with parameters: {'num_trees': 794, 'depth': 2, 'learning_rate': 0.011034654224196944, 'l2_leaf_reg': 95.69985890951777, 'bagging_temperature': 0.8326475639973028, 'random_strength': 0.8024641397038226, 'random_state': 937}.
[I 2019-05-10 03:04:50,775] Finished a trial resulted in value: 0.13953294140595787. Current best value is 0.012773930232106656 with parameters: {'num_trees': 794, 'depth': 2, 'learning_rate': 0.011034654224196944, 'l2_leaf_reg': 95.69985890951777, 'bagging_temperature': 0.8326475639973028, 'r

[I 2019-05-10 03:40:55,386] Finished a trial resulted in value: 0.11516964066058367. Current best value is 0.007850106884887185 with parameters: {'num_trees': 303, 'depth': 4, 'learning_rate': 0.010304563320182062, 'l2_leaf_reg': 87.06643425778495, 'bagging_temperature': 0.9109520418792436, 'random_strength': 0.6357872506626183, 'random_state': 4485}.
[I 2019-05-10 03:41:11,817] Finished a trial resulted in value: 0.036896238730050566. Current best value is 0.007850106884887185 with parameters: {'num_trees': 303, 'depth': 4, 'learning_rate': 0.010304563320182062, 'l2_leaf_reg': 87.06643425778495, 'bagging_temperature': 0.9109520418792436, 'random_strength': 0.6357872506626183, 'random_state': 4485}.
[I 2019-05-10 03:42:44,438] Finished a trial resulted in value: 0.0657915107590635. Current best value is 0.007850106884887185 with parameters: {'num_trees': 303, 'depth': 4, 'learning_rate': 0.010304563320182062, 'l2_leaf_reg': 87.06643425778495, 'bagging_temperature': 0.9109520418792436, 

In [43]:
# Test-set predictions stored with trial 342 (the run chosen for submission).
df_test_pred = df_trial.loc[342, 'df_test_pred']

In [44]:
# Build the submission: the prediction is the row-wise mean over every column
# of df_test_pred except 'index'; 'index' itself becomes the segment id.
df_submit = pd.DataFrame()
df_submit['time_to_failure'] = np.mean(df_test_pred.drop(columns=['index']).values, axis=1)
# Assign the ids positionally (.values), like the predictions above. Assigning
# the Series directly would align on df_test_pred's index and could misplace
# ids or produce NaN when that index is not 0..n-1.
df_submit['seg_id'] = df_test_pred['index'].values
df_submit.to_csv('submission.csv', index=False)

In [45]:
# Rebuild the DataFrame view from the in-memory trial list.
# NOTE(review): presumably EP.process appends to `mytrial` (it receives it as
# `trial=`), so this picks up the newly run trials -- confirm against EP.
df_trial = pd.DataFrame(mytrial)

In [41]:
#[df_trial['mae_diff']<.05].sort_values(by=['val_mae']) | (df_trial['remark']=='tune feats selected by group ')
# Ten tuning-run trials with the lowest val_mae among those whose
# train/validation gap (mae_diff) is below 0.05.
(
    df_trial[
        (df_trial['remark'] == 'tune feats selected by group ')
        & (df_trial['mae_diff'] < .05)
    ]
    .sort_values(by=['val_mae'], ascending=True)
    [['datetime', 'nfeatures', 'train_mae', 'train_mae_var', 'val_mae', 'val_mae_var', 'mae_diff']]
    .head(10)
)

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
342,2019-05-10 02:15:55.351409,30,1.822379,1.6e-05,1.870234,0.000263,0.047855
419,2019-05-10 03:39:47.282671,30,1.823879,2.3e-05,1.871669,0.000277,0.04779
325,2019-05-10 01:50:01.376788,30,1.82516,2.2e-05,1.875135,0.000253,0.049976
350,2019-05-10 02:23:27.277481,30,1.831164,2.1e-05,1.875346,0.000252,0.044182
260,2019-05-10 00:27:10.312556,30,1.830573,1.3e-05,1.876977,0.0002,0.046404
295,2019-05-10 01:18:30.771743,30,1.835525,4.1e-05,1.877723,0.000261,0.042197
299,2019-05-10 01:23:09.730396,30,1.831785,1.5e-05,1.878398,0.000167,0.046613
413,2019-05-10 03:36:48.905582,30,1.829911,2.9e-05,1.878617,0.000211,0.048706
307,2019-05-10 01:27:30.759789,30,1.831915,1e-05,1.87889,0.000221,0.046975
280,2019-05-10 00:56:29.669298,30,1.835339,3.1e-05,1.879742,0.000213,0.044403


In [42]:
# One-row view of trial 342 (label slice keeps the result a DataFrame).
df_trial.loc[
    342:342,
    ['datetime', 'remark', 'nfeatures', 'train_mae', 'train_mae_var', 'val_mae', 'val_mae_var', 'mae_diff'],
]

Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
342,2019-05-10 02:15:55.351409,tune feats selected by group,30,1.822379,1.6e-05,1.870234,0.000263,0.047855


In [46]:
# Persist the trial table.
# NOTE(review): the load cell reads 'trial/catboost.pkl' (no '../' prefix) --
# confirm that load and save are meant to use the same file.
df_trial.to_pickle('../trial/catboost.pkl')

In [47]:
# Feature columns recorded in the parameters of trial 342.
df_trial.loc[342]['param']['columns']

['spkt_welch_density__coeff_3',
 'abs_q25_5',
 'q25_roll_std_100',
 'abs_q75_6',
 'abs_q75_7',
 'spkt_welch_densitycoeff_2',
 'abs_q01_4',
 'iqr_6',
 'q05_roll_std_100',
 'q05_roll_std_1000',
 'median__roll_std',
 'abs_q01_5',
 "number_peaks{'n': 10}",
 'FFT_Mag_75q0',
 "value_count{'value': 1}",
 'q01_roll_std_100',
 'abs_q95_2',
 'abs_q95_6',
 'MA_1000MA_std_mean_7',
 'q05_roll_std_10',
 'q01_roll_std_1000',
 'abs_max_roll_mean_1000',
 'abs_q75_2',
 'abs_q05_6',
 '5000std_quantile25',
 "number_crossing_m{'m': 1}",
 "autocorrelation{'lag': 5}",
 'q75_roll_std_10',
 'q05_2',
 '5000smoothness_quantile05']

In [5]:
# Full parameter dict recorded for trial 342.
df_trial.loc[342]['param']

{'columns': ['spkt_welch_density__coeff_3',
  'abs_q25_5',
  'q25_roll_std_100',
  'abs_q75_6',
  'abs_q75_7',
  'spkt_welch_densitycoeff_2',
  'abs_q01_4',
  'iqr_6',
  'q05_roll_std_100',
  'q05_roll_std_1000',
  'median__roll_std',
  'abs_q01_5',
  "number_peaks{'n': 10}",
  'FFT_Mag_75q0',
  "value_count{'value': 1}",
  'q01_roll_std_100',
  'abs_q95_2',
  'abs_q95_6',
  'MA_1000MA_std_mean_7',
  'q05_roll_std_10',
  'q01_roll_std_1000',
  'abs_max_roll_mean_1000',
  'abs_q75_2',
  'abs_q05_6',
  '5000std_quantile25',
  "number_crossing_m{'m': 1}",
  "autocorrelation{'lag': 5}",
  'q75_roll_std_10',
  'q05_2',
  '5000smoothness_quantile05'],
 'kfold': {'n_splits': 8,
  'random_state': 1985,
  'shuffle': True,
  'type': 'stratified'},
 'scaler': {'cls': 'StandardScaler'},
 'algorithm': {'cls': 'cb.CatBoostRegressor',
  'init': {'num_trees': 589,
   'depth': 6,
   'learning_rate': 0.05293979792364842,
   'l2_leaf_reg': 78.065140245968,
   'bagging_temperature': 0.9302786271852079,
  