In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler

import lightgbm as lgb
import catboost as cb

from tsfresh.examples import load_robot_execution_failures
from tsfresh import extract_features, select_features
import optuna

from common import EP

import types

Using TensorFlow backend.


In [2]:
%%HTML
<style>
   /* Override the width of the notebook, menu-bar and toolbar containers (percent of the parent width). */
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [3]:
# Load the pickled train and test feature frames.
df_train, df_test = (
    pd.read_pickle('../feats/df_train.pkl'),
    pd.read_pickle('../feats/df_test.pkl'),
)

In [4]:
# Integer class label: the integer part of y, with every y that is not below 15 mapped to 15.
df_train['label'] = df_train['y'].apply(lambda x: int(x) if x < 15 else 15)

# Grouping key: the season, with season 17 merged into group 1.
# Work on a copy: the array returned by `.values` can alias the frame's own data
# (writing into it would silently edit df_train['season']) and is read-only when
# pandas Copy-on-Write is active, in which case the in-place write raises.
group = df_train['season'].values.copy()
group[group == 17] = 1
df_train['group'] = group

# 'season' is now represented by 'group'; drop it so it is not picked up as a feature.
df_train = df_train.drop(columns=['season'])

In [5]:
# Feature matrix and target, both re-indexed by the 'index' column so that
# select_features receives identically indexed X and y.
# NOTE(review): despite the `test_` prefix, both are built from df_train.
test_X = (
    df_train
    .drop(columns=['y', 'index', 'group', 'label'])
    .set_index(df_train['index'])
)
test_y = df_train.set_index('index')['y']

# Names of the columns kept by tsfresh's relevance filter.
tsfresh_columns = list(select_features(test_X, test_y).columns)

In [6]:
# All df_train columns except the id, target, label and group columns, in frame order.
original_columns = list(df_train.columns.drop(['index', 'y', 'label', 'group']))

In [7]:
# Alternative start with an empty history (presumably for the first run, before the pickle exists):
# mytrial = []
# Reload the saved trial table and turn it back into a list holding one dict per row:
# `.T.to_dict()` maps each row label to a {column: value} dict, and `.values()` drops the labels.
mytrial = list(pd.read_pickle('../trial/catboost.pkl').T.to_dict().values())
# Tabular view of the same trials, used by the inspection cells further down.
df_trial = pd.DataFrame(mytrial)
# Cell output: number of trials loaded.
len(mytrial)

437

In [8]:
#check feature_importances
# df_feature_importances = df_trial.loc[235]['df_feature_importances']
# if type(df_feature_importances)==pd.DataFrame:
#     sorted_columns = EP.evaluate(df_feature_importances, key='average_model_weight')
# else:
#     sorted_columns = df_trial.loc[235]['param']['columns']
# df_feature_importances.sort_values(by=['average_permutation_weight'], ascending=False)
# len(sorted_columns)

In [35]:
# Run configuration passed to EP.process / EP.select_features_: feature columns,
# cross-validation scheme, scaler and model settings.
# The literal below is a hard-coded snapshot; the commented line shows the alternative of
# copying the configuration stored with trial 342 (presumably where this snapshot came from — confirm).
# param = df_trial.loc[342]['param'].copy()
# 'columns': the 30 feature columns used by the model.
# NOTE(review): 'spkt_welch_densitycoeff_2' lacks the '__' separator used by
# 'spkt_welch_density__coeff_3' — confirm it matches an actual column name.
param = {'columns': ['spkt_welch_density__coeff_3',
  'abs_q25_5',
  'q25_roll_std_100',
  'abs_q75_6',
  'abs_q75_7',
  'spkt_welch_densitycoeff_2',
  'abs_q01_4',
  'iqr_6',
  'q05_roll_std_100',
  'q05_roll_std_1000',
  'median__roll_std',
  'abs_q01_5',
  "number_peaks{'n': 10}",
  'FFT_Mag_75q0',
  "value_count{'value': 1}",
  'q01_roll_std_100',
  'abs_q95_2',
  'abs_q95_6',
  'MA_1000MA_std_mean_7',
  'q05_roll_std_10',
  'q01_roll_std_1000',
  'abs_max_roll_mean_1000',
  'abs_q75_2',
  'abs_q05_6',
  '5000std_quantile25',
  "number_crossing_m{'m': 1}",
  "autocorrelation{'lag': 5}",
  'q75_roll_std_10',
  'q05_2',
  '5000smoothness_quantile05'],
 # Cross-validation settings. Note the type here is 'timeseries', whereas objective()
 # in the tuning cell uses 'stratified'.
 'kfold': {'n_splits': 3,
  'random_state': 1985,
  'shuffle': True,
  'type': 'timeseries'},
 # Scaler and model classes are given by name as strings (presumably resolved inside common.EP).
 'scaler': {'cls': 'StandardScaler'},
 # 'init' holds the model constructor arguments, 'fit' the arguments for fitting.
 # l2_leaf_reg (~78) lies outside the [0.0001, 1] range that objective() searches.
 'algorithm': {'cls': 'cb.CatBoostRegressor',
  'init': {'num_trees': 589,
   'depth': 6,
   'learning_rate': 0.05293979792364842,
   'l2_leaf_reg': 78.065140245968,
   'bagging_temperature': 0.9302786271852079,
   'random_strength': 0.4247048326178351,
   'random_state': 651},
  'fit': {'verbose': False, 'early_stopping_rounds': 200}},
}

In [12]:
# param={
#     'algorithm': {
#         'cls': 'cb.CatBoostRegressor',
#         'fit': {
#             'early_stopping_rounds': 200,
# #             'eval_metric': 'mae',
#             'verbose': False
#         },
#         'init': {
#         }
#     },
#     'columns': sorted_columns,
#     'feature_importance': {
#         'is_output': True,
#         'permutation_feature_importance': True,
#         'permutation_random_state': 1
#     },
#     'kfold': {
#         'n_splits': 8,
#         'random_state': 1985,
#         'shuffle': True,
#         'type': 'group'
#     },
#     'scaler': {
#         'cls': 'StandardScaler'
#     }
# }

In [36]:
# Single run with the `param` configuration above.
# `mytrial` is handed over as `trial` (presumably so EP.process can record the run — confirm in common.EP).
df_his, df_feature_importances, df_valid_pred, df_test_pred = EP.process(
    df_train,
    param,
    df_test=df_test,
    trial=mytrial,
)

In [13]:
# Feature-selection pass starting from `param`.
# The importance key passed here is 'average_model_weight', not a permutation weight.
EP.select_features_(
    df_train,
    param,
    mytrial,
    nfeats_best=25,
    nfeats_removed_per_try=10,
    key='average_model_weight',
)

In [15]:
# Tune hyperparameters
def objective(trial):
    """Optuna objective: evaluate one sampled CatBoost configuration.

    Draws the CatBoost constructor arguments from ``trial``, runs ``EP.process``
    on the notebook-level ``df_train`` / ``df_test`` with the feature list from
    ``param['columns']`` and a 3-split 'stratified' k-fold, and stores summary
    statistics of the returned history on the trial as user attributes
    (``val_mae``, ``train_mae``, ``mae_diff``, ``val_mae_var``, ``train_mae_var``).

    The scores in ``df_his.valid`` / ``df_his.train`` are named ``*_mae`` here;
    presumably they are per-fold MAE values — confirm against ``EP.process``.

    Parameters
    ----------
    trial
        Optuna trial used to sample the hyperparameters and to hold the
        user attributes.

    Returns
    -------
    float
        ``abs(val_mean - train_mean) * val_mean``: small when the validation
        score is low and close to the training score.
    """
    # Sampling order is kept identical to the search-space definition order.
    init_params = {
        'num_trees': trial.suggest_int('num_trees', 200, 1000),
        'depth': trial.suggest_int('depth', 2, 10),
        'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.4),
        'l2_leaf_reg': trial.suggest_uniform('l2_leaf_reg', 0.0001, 1),
        'bagging_temperature': trial.suggest_uniform('bagging_temperature', .6, 1),
        'random_strength': trial.suggest_uniform('random_strength', .001, 1),
        'random_state': trial.suggest_int('random_state', 1, 9999),
    }

    args = {
        # Copy so that nothing done downstream can alter the shared `param` list.
        'columns': param['columns'].copy(),
        'kfold': {
            'n_splits': 3,
            'random_state': 1985,
            'shuffle': True,
            'type': 'stratified',
        },
        'scaler': {
            'cls': 'StandardScaler',
        },
        'algorithm': {
            'cls': 'cb.CatBoostRegressor',
            'init': init_params,
            'fit': {
                # 'eval_metric': 'mae',
                'verbose': False,
                'early_stopping_rounds': 200,
            },
        },
    }

    # Only the per-fold history is needed here; the other three results are discarded.
    df_his, _, _, _ = EP.process(
        df_train, args, df_test=df_test, trial=mytrial, remark='tune 437'
    )

    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)
    mae_diff = val_mae_mean - train_mae_mean

    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', mae_diff)
    trial.set_user_attr('val_mae_var', val_mae_var)
    # Previously computed but never stored; recorded alongside val_mae_var.
    trial.set_user_attr('train_mae_var', train_mae_var)

    # Penalise both a high validation score and a large train/validation gap.
    return np.abs(mae_diff) * val_mae_mean

# Search 200 trials; create_study() without arguments minimises the objective value.
study = optuna.create_study()
study.optimize(func=objective, n_trials=200)

[I 2019-05-16 06:40:37,822] Finished a trial resulted in value: 0.39048786322361084. Current best value is 0.39048786322361084 with parameters: {'num_trees': 443, 'depth': 8, 'learning_rate': 0.14009548720416376, 'l2_leaf_reg': 0.19118246571936606, 'bagging_temperature': 0.7249268739404343, 'random_strength': 0.10770304834217433, 'random_state': 6976}.
[I 2019-05-16 06:41:52,138] Finished a trial resulted in value: 0.4248433723672612. Current best value is 0.39048786322361084 with parameters: {'num_trees': 443, 'depth': 8, 'learning_rate': 0.14009548720416376, 'l2_leaf_reg': 0.19118246571936606, 'bagging_temperature': 0.7249268739404343, 'random_strength': 0.10770304834217433, 'random_state': 6976}.
[I 2019-05-16 06:43:04,704] Finished a trial resulted in value: 0.2728656948376179. Current best value is 0.2728656948376179 with parameters: {'num_trees': 224, 'depth': 10, 'learning_rate': 0.08599833898747268, 'l2_leaf_reg': 0.12054084484030389, 'bagging_temperature': 0.7411893498597109, 

[I 2019-05-16 07:02:44,023] Finished a trial resulted in value: 0.01758654432029695. Current best value is 0.01758654432029695 with parameters: {'num_trees': 227, 'depth': 2, 'learning_rate': 0.02558645221599814, 'l2_leaf_reg': 0.5193796128237813, 'bagging_temperature': 0.7868376900793114, 'random_strength': 0.41116064346878806, 'random_state': 3886}.
[I 2019-05-16 07:02:50,727] Finished a trial resulted in value: 0.05653289011494533. Current best value is 0.01758654432029695 with parameters: {'num_trees': 227, 'depth': 2, 'learning_rate': 0.02558645221599814, 'l2_leaf_reg': 0.5193796128237813, 'bagging_temperature': 0.7868376900793114, 'random_strength': 0.41116064346878806, 'random_state': 3886}.
[I 2019-05-16 07:02:57,212] Finished a trial resulted in value: 0.09099540961849038. Current best value is 0.01758654432029695 with parameters: {'num_trees': 227, 'depth': 2, 'learning_rate': 0.02558645221599814, 'l2_leaf_reg': 0.5193796128237813, 'bagging_temperature': 0.7868376900793114, '

[I 2019-05-16 07:12:14,900] Finished a trial resulted in value: 0.2641992973605272. Current best value is 0.010976358974680982 with parameters: {'num_trees': 205, 'depth': 4, 'learning_rate': 0.012267487045081477, 'l2_leaf_reg': 0.3129657897970629, 'bagging_temperature': 0.8197608355801962, 'random_strength': 0.5288165795250432, 'random_state': 2236}.
[I 2019-05-16 07:12:22,023] Finished a trial resulted in value: 0.21279401752645347. Current best value is 0.010976358974680982 with parameters: {'num_trees': 205, 'depth': 4, 'learning_rate': 0.012267487045081477, 'l2_leaf_reg': 0.3129657897970629, 'bagging_temperature': 0.8197608355801962, 'random_strength': 0.5288165795250432, 'random_state': 2236}.
[I 2019-05-16 07:12:31,161] Finished a trial resulted in value: 0.14516031438026997. Current best value is 0.010976358974680982 with parameters: {'num_trees': 205, 'depth': 4, 'learning_rate': 0.012267487045081477, 'l2_leaf_reg': 0.3129657897970629, 'bagging_temperature': 0.8197608355801962

[I 2019-05-16 07:22:20,517] Finished a trial resulted in value: 0.06091398142682294. Current best value is 0.010976358974680982 with parameters: {'num_trees': 205, 'depth': 4, 'learning_rate': 0.012267487045081477, 'l2_leaf_reg': 0.3129657897970629, 'bagging_temperature': 0.8197608355801962, 'random_strength': 0.5288165795250432, 'random_state': 2236}.
[I 2019-05-16 07:22:49,964] Finished a trial resulted in value: 0.3672354669182138. Current best value is 0.010976358974680982 with parameters: {'num_trees': 205, 'depth': 4, 'learning_rate': 0.012267487045081477, 'l2_leaf_reg': 0.3129657897970629, 'bagging_temperature': 0.8197608355801962, 'random_strength': 0.5288165795250432, 'random_state': 2236}.
[I 2019-05-16 07:22:56,561] Finished a trial resulted in value: 0.11774755453126651. Current best value is 0.010976358974680982 with parameters: {'num_trees': 205, 'depth': 4, 'learning_rate': 0.012267487045081477, 'l2_leaf_reg': 0.3129657897970629, 'bagging_temperature': 0.8197608355801962

[I 2019-05-16 07:30:15,108] Finished a trial resulted in value: 0.2256973907534088. Current best value is 0.007076336210938805 with parameters: {'num_trees': 200, 'depth': 3, 'learning_rate': 0.010360908233833718, 'l2_leaf_reg': 0.21177712057553388, 'bagging_temperature': 0.7691199754765087, 'random_strength': 0.720840502590669, 'random_state': 8654}.
[I 2019-05-16 07:30:19,878] Finished a trial resulted in value: 0.009371142472729152. Current best value is 0.007076336210938805 with parameters: {'num_trees': 200, 'depth': 3, 'learning_rate': 0.010360908233833718, 'l2_leaf_reg': 0.21177712057553388, 'bagging_temperature': 0.7691199754765087, 'random_strength': 0.720840502590669, 'random_state': 8654}.
[I 2019-05-16 07:30:23,926] Finished a trial resulted in value: 0.045019401056371436. Current best value is 0.007076336210938805 with parameters: {'num_trees': 200, 'depth': 3, 'learning_rate': 0.010360908233833718, 'l2_leaf_reg': 0.21177712057553388, 'bagging_temperature': 0.7691199754765

In [43]:
# Use the test predictions stored with trial 342 for the submission built in the next cell.
df_test_pred = df_trial.loc[342, 'df_test_pred']

In [44]:
# Build the submission file: one row per test id with the row-wise mean of all
# prediction columns (every column of df_test_pred except 'index').
df_submit = pd.DataFrame()
df_submit['time_to_failure'] = np.mean(df_test_pred.drop(columns=['index']).values, axis=1)
# Assign the ids positionally as well. Assigning the Series itself would align on
# df_test_pred's index labels, which mismatches (or yields NaN) unless that index
# happens to be 0..n-1.
df_submit['seg_id'] = df_test_pred['index'].values
df_submit.to_csv('submission.csv', index=False)

In [20]:
# Rebuild the trial table from the in-memory trial list.
df_trial = pd.DataFrame(data=mytrial)

In [28]:
# Earlier filter variants, kept for reference:
#[df_trial['mae_diff']<.05].sort_values(by=['val_mae']) | (df_trial['remark']=='tune 437')&... | (df_trial['remark']=='tune feats selected by group ')
# Top rows of the 'tune 437' trials whose mae_diff is below 0.05, ordered by ascending val_mae.
(
    df_trial[(df_trial['remark'] == 'tune 437') & (df_trial['mae_diff'] < .05)]
    .sort_values(by=['val_mae'])
    .loc[:, ['datetime', 'nfeatures', 'train_mae', 'train_mae_var', 'val_mae', 'val_mae_var', 'mae_diff']]
    .head()
)

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
587,2019-05-16 07:24:47.809001,30,1.846284,1.8e-05,1.896202,0.000107,0.049918
452,2019-05-16 06:46:11.662876,30,1.853478,5e-06,1.900535,7.2e-05,0.047056
486,2019-05-16 07:02:57.205850,30,1.854923,1e-05,1.902746,0.000108,0.047823
453,2019-05-16 06:46:20.441903,30,1.8552,2e-06,1.903429,5.7e-05,0.048229
569,2019-05-16 07:20:58.104760,30,1.861948,5e-06,1.905226,9.4e-05,0.043278


In [30]:
# Show the summary columns of trial 452 as a one-row frame (label slice 452:452 keeps it a DataFrame).
df_trial.loc[
    452:452,
    ['datetime', 'remark', 'nfeatures', 'train_mae', 'train_mae_var', 'val_mae', 'val_mae_var', 'mae_diff'],
]

Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
452,2019-05-16 06:46:11.662876,tune 437,30,1.853478,5e-06,1.900535,7.2e-05,0.047056


In [29]:
# Persist the trial table to the same file the history is loaded from at the top of the notebook.
df_trial.to_pickle(path='../trial/catboost.pkl')