In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler

import lightgbm as lgb
import catboost as cb

from tsfresh.examples import load_robot_execution_failures
from tsfresh import extract_features, select_features
import optuna

from common import EP

import types

  from numpy.core.umath_tests import inner1d


In [2]:
%%HTML
<style>
   /* Widen the notebook page, menu bar and toolbar containers so wide tables are easier to read. */
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [3]:
# Load the pre-computed train/test feature tables.
# NOTE: these are pickle files; only load pickles that come from a trusted source.
df_train, df_test = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('../feats2/df_train.pkl', '../feats2/df_test.pkl')
)

In [4]:
# Integer label derived from the target: y truncated to an int, with every
# value that is not below 15 mapped to 15.
# (presumably used for the 'stratified' k-fold split — confirm in EP)
df_train['label'] = df_train['y'].apply(lambda x: int(x) if x < 15 else 15)

# Grouping key: the season id, with season 17 folded into season 1.
# `replace` returns a new Series, so the source column is never written
# through the array returned by `.values` (that array may be a view of the
# frame's data, and is read-only when pandas Copy-on-Write is enabled).
group = df_train['season'].replace(17, 1).values
df_train['group'] = group

# The raw season column is superseded by 'group'.
df_train = df_train.drop(columns=['season'])

In [5]:
# Inputs for tsfresh's relevance filter. Despite the `test_` prefix, both
# objects are built from the *training* frame: the feature matrix and the
# target, each keyed by the 'index' column so that they share one index.
test_X = df_train.set_index('index').drop(columns=['y', 'group', 'label'])
test_y = df_train.set_index('index')['y'].copy()

# Names of the columns that tsfresh keeps for this target.
tsfresh_columns = select_features(test_X, test_y).columns.tolist()

In [6]:
# Every column of df_train except the identifier, target and derived
# label/group columns counts as a candidate feature.
non_feature_columns = ['index', 'y', 'label', 'group']
original_columns = df_train.columns.drop(non_feature_columns).tolist()

In [31]:
# Check the feature importances recorded for trial 4 and derive the column
# list (`sorted_columns`) that the `param` configuration below uses.
# NOTE(review): this cell reads `df_trial`, which this notebook only builds
# from `mytrial` in a later cell — on a fresh kernel that cell must run first.
df_feature_importances = df_trial.loc[4]['df_feature_importances']
if isinstance(df_feature_importances, pd.DataFrame):
    # An importance table was stored for this trial: let EP evaluate it by
    # the averaged model weight (presumably returns the ranked column names
    # — confirm in EP.evaluate).
    sorted_columns = EP.evaluate(df_feature_importances, key='average_model_weight')
else:
    # No importance table stored: fall back to the columns the trial ran with.
    sorted_columns = df_trial.loc[4]['param']['columns']
# Alternative inspection by permutation weight:
# df_feature_importances.sort_values(by=['average_permutation_weight'], ascending=False)

In [32]:
# Display the importance table, largest averaged model weight first.
df_feature_importances.sort_values('average_model_weight', ascending=False)

Unnamed: 0,feature,model_weight,weight,std,model_weight1,weight1,std1,model_weight2,weight2,std2,...,weight5,std5,model_weight6,weight6,std6,model_weight7,weight7,std7,average_permutation_weight,average_model_weight
47,spkt_welch_density__coeff_3,4.988149,-0.194858,0.00858,4.481089,-0.186348,0.014337,4.757821,-0.188325,0.01686,...,-0.181299,0.017517,4.911619,-0.195289,0.014746,4.491564,-0.200561,0.015163,-0.196414,4.687102
22,abs_q75_7,4.446631,-0.205921,0.0115,4.560484,-0.227333,0.015076,4.012104,-0.19122,0.005846,...,-0.177236,0.007413,3.812221,-0.145478,0.005197,3.727975,-0.178639,0.008251,-0.192069,4.176562
48,spkt_welch_densitycoeff_2,3.021998,-0.127359,0.011181,3.085853,-0.142378,0.008947,3.49695,-0.136841,0.010533,...,-0.133676,0.012038,2.856263,-0.106592,0.00599,3.158382,-0.128325,0.009177,-0.128062,3.083131
14,abs_max_roll_mean_1000,2.87462,-0.048463,0.003412,2.670005,-0.043473,0.002582,2.556403,-0.038594,0.000606,...,-0.037698,0.002038,3.404281,-0.044812,0.002104,3.116254,-0.04955,0.002302,-0.045282,2.970577
16,abs_q01_4,3.775791,-0.134519,0.006484,2.520751,-0.122053,0.00327,3.043375,-0.151515,0.017259,...,-0.087893,0.004647,2.599832,-0.114956,0.007358,2.789807,-0.118194,0.009416,-0.116114,2.754902
25,iqr_6,2.929616,-0.087857,0.002876,2.257818,-0.124814,0.00708,2.445765,-0.088003,0.002005,...,-0.089838,0.006569,3.01324,-0.116163,0.011083,2.653859,-0.125561,0.005083,-0.112061,2.594902
30,median__roll_std,1.939263,-0.054365,0.006918,2.8687,-0.106618,0.004541,2.975224,-0.122433,0.006728,...,-0.055902,0.005187,2.288807,-0.043464,0.003837,2.362376,-0.055231,0.00489,-0.076427,2.507453
21,abs_q75_6,2.207087,-0.080482,0.006309,2.26131,-0.101694,0.006051,2.528315,-0.178353,0.005207,...,-0.173954,0.007953,2.458459,-0.105899,0.004739,2.017898,-0.062812,0.003394,-0.129198,2.473392
19,abs_q25_5,2.405157,-0.066776,0.007167,2.975524,-0.147181,0.004123,1.817034,-0.086789,0.003508,...,-0.073679,0.00524,2.173679,-0.087991,0.00636,3.347884,-0.114285,0.009059,-0.102408,2.461337
44,q25_roll_std_100,2.526114,-0.08918,0.004912,2.552139,-0.099923,0.007171,1.825851,-0.103438,0.01257,...,-0.079606,0.004638,3.104202,-0.156503,0.007654,3.103385,-0.126377,0.005914,-0.10284,2.423584


In [7]:
# Accumulator for trial records: it is passed as `trial=mytrial` /
# positionally to the EP calls below (presumably EP appends one record per
# run — confirm in EP) and is later converted into the `df_trial` DataFrame.
mytrial = []
# To resume from previously saved trials instead of starting empty, use:
# mytrial = list(pd.read_pickle('trial/catboost.pkl').T.to_dict().values())

In [34]:
# Baseline experiment configuration handed to EP: CatBoost with default
# constructor arguments, 8-fold stratified CV, standard scaling, and feature
# importances (model and permutation based) switched on.
param = {
    'algorithm': {
        'cls': 'cb.CatBoostRegressor',
        # Fit-time arguments ('eval_metric': 'mae' is currently disabled).
        'fit': {'early_stopping_rounds': 200, 'verbose': False},
        # No constructor overrides ('task_type': "GPU" is currently disabled).
        'init': {},
    },
    # Feature columns chosen in the feature-importance cell.
    'columns': sorted_columns,
    'feature_importance': {
        'is_output': True,
        'permutation_feature_importance': True,
        'permutation_random_state': 1,
    },
    'kfold': {
        'n_splits': 8,
        'random_state': 1985,
        'shuffle': True,
        'type': 'stratified',
    },
    'scaler': {'cls': 'StandardScaler'},
}

In [55]:
# Run a single experiment with the parameters stored for trial 8.
df_his, df_feature_importances, df_valid_pred, df_test_pred = EP.process(
    df_train,
    df_trial.loc[8, 'param'],
    df_test=df_test,
    trial=mytrial,
)

In [35]:
# Feature selection driven by the 'average_model_weight' ranking.
# NOTE(review): this cell was previously labelled "by permutation_weight",
# but the key passed is the model weight — confirm which ranking is intended.
EP.select_features_(
    df_train,
    param,
    mytrial,
    nfeats_best=20,
    nfeats_removed_per_try=5,
    key='average_model_weight',
)

In [47]:
# Development aid: IPython-only `?` syntax that shows the CatBoostRegressor
# docstring. Not valid plain Python; safe to delete once the parameters are settled.
cb.CatBoostRegressor?

In [46]:
# Tune hyperparameters
def objective(trial):
    """Optuna objective: run one cross-validated CatBoost fit and score it.

    Samples CatBoost constructor arguments from ``trial``, runs
    ``EP.process`` on the notebook-level ``df_train`` / ``df_test`` using the
    feature columns stored for trial 8 in ``df_trial``, and records summary
    statistics of the returned history as user attributes on ``trial``.

    Parameters
    ----------
    trial
        Optuna trial; must provide ``suggest_int``, ``suggest_uniform`` and
        ``set_user_attr``.

    Returns
    -------
    float
        ``abs(val_mean - train_mean) * val_mean`` where the means are taken
        over ``df_his.valid`` and ``df_his.train`` (presumably per-fold MAE —
        confirm in EP). A lower value means both a smaller train/validation
        gap and a smaller validation error.

    Notes
    -----
    Relies on the notebook globals ``EP``, ``df_train``, ``df_test``,
    ``df_trial`` and ``mytrial``.
    """
    num_trees = trial.suggest_int('num_trees', 200, 1000)
    depth = trial.suggest_int('depth', 2, 10)
    learning_rate = trial.suggest_uniform('learning_rate', 0.01, 0.4)
    l2_leaf_reg = trial.suggest_uniform('l2_leaf_reg', 0.001, 100)
    bagging_temperature = trial.suggest_uniform('bagging_temperature', .6, 1)
    random_strength = trial.suggest_uniform('random_strength', .001, 1)
    # NOTE(review): the seed is sampled like a hyperparameter, which lets the
    # study favour a lucky seed — confirm this is intended.
    random_state = trial.suggest_int('random_state', 1, 9999)

    args = {
        'columns': df_trial.loc[8]['param']['columns'],
        'kfold': {
            'n_splits': 8,
            'random_state': 1985,
            'shuffle': True,
            'type': 'stratified'
        },
        'scaler': {
            'cls': 'StandardScaler',
        },
        'algorithm': {
            'cls': 'cb.CatBoostRegressor',
            'init': {
                "num_trees": num_trees,
                "depth": depth,
                "learning_rate": learning_rate,
                "l2_leaf_reg": l2_leaf_reg,
                "bagging_temperature": bagging_temperature,
                "random_strength": random_strength,
                "random_state": random_state,
            },
            'fit': {
                # 'eval_metric': 'mae',
                'verbose': False,
                'early_stopping_rounds': 200,
            },
        },
        # Importances are not needed while tuning.
        'feature_importance': {
            'is_output': False,
            'permutation_feature_importance': False,
            'permutation_random_state': 1,
        },
    }

    # Only the history frame is used here; the other three outputs are discarded.
    df_his, _, _, _ = EP.process(
        df_train, args, df_test=df_test, trial=mytrial, remark='tune stratified feats'
    )

    # Cast to built-in floats so the stored attributes and the returned
    # objective are plain Python numbers rather than NumPy scalars.
    # NOTE(review): if `df_his.valid` / `df_his.train` are pandas Series,
    # np.var delegates to Series.var (sample variance) — confirm that is the
    # intended estimator.
    val_mae_mean = float(np.mean(df_his.valid))
    val_mae_var = float(np.var(df_his.valid))
    train_mae_mean = float(np.mean(df_his.train))
    train_mae_var = float(np.var(df_his.train))
    mae_diff = val_mae_mean - train_mae_mean

    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', mae_diff)
    trial.set_user_attr('val_mae_var', val_mae_var)
    # Previously computed but never stored.
    trial.set_user_attr('train_mae_var', train_mae_var)

    return abs(mae_diff) * val_mae_mean

# Create a study with Optuna's defaults and evaluate `objective` for up to
# 200 trials (the run can be interrupted manually).
study = optuna.create_study()
study.optimize(func=objective, n_trials=200)

KeyboardInterrupt: 

In [59]:
# Test-set predictions stored for trial 12.
df_test_pred = df_trial.loc[12, 'df_test_pred']

In [61]:
# Build the submission: one row per test segment, with the prediction taken
# as the row-wise mean over every column of `df_test_pred` except 'index'
# (presumably one prediction column per fold — confirm in EP.process).
#
# Both columns are passed as plain arrays so they are paired by position.
# Assigning the 'index' Series into an already-populated frame would align
# on index labels instead, and would yield NaN / shifted ids whenever
# `df_test_pred` does not carry a default 0..n-1 index.
df_submit = pd.DataFrame({
    'time_to_failure': np.mean(df_test_pred.drop(columns=['index']).values, axis=1),
    'seg_id': df_test_pred['index'].values,
})
df_submit.to_csv('submission.csv', index=False)

In [62]:
# Collect the trial records accumulated in `mytrial` into a DataFrame.
# NOTE(review): earlier cells of this notebook already read `df_trial`, so on
# a fresh kernel this cell has to be executed before them.
df_trial = pd.DataFrame(mytrial)

In [57]:
# Overview of the recorded trials (at most the first 100 rows).
df_trial.loc[:, [
    'datetime', 'nfeatures',
    'train_mae', 'train_mae_var',
    'val_mae', 'val_mae_var',
    'mae_diff',
]].head(100)

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
0,2019-05-08 09:13:51.441156,1071,1.594586,2.2e-05,1.740538,0.000213,0.145952
1,2019-05-08 12:28:21.328235,200,1.576306,1.2e-05,1.704236,0.000296,0.127929
2,2019-05-08 13:27:41.334635,150,1.58093,2.1e-05,1.706429,0.000226,0.125499
3,2019-05-08 13:39:54.456132,100,1.576934,1.9e-05,1.696824,0.000187,0.11989
4,2019-05-08 13:47:35.439605,50,1.60094,3.2e-05,1.700155,0.000159,0.099215
5,2019-05-08 14:03:19.640908,50,1.601548,5.7e-05,1.701784,0.000143,0.100236
6,2019-05-08 14:10:21.323747,45,1.599223,4.4e-05,1.697374,0.000184,0.098151
7,2019-05-08 14:16:57.268365,40,1.603685,1.8e-05,1.699155,0.000205,0.09547
8,2019-05-08 14:23:25.597068,35,1.600778,5.2e-05,1.696229,0.000154,0.095451
9,2019-05-08 14:29:08.694610,30,1.616618,9.3e-05,1.708761,0.000215,0.092142


In [63]:
# Persist the trial table; the commented-out `read_pickle` line next to the
# `mytrial` initialisation reads this same file back to resume a session.
df_trial.to_pickle('trial/catboost.pkl')

In [65]:
# Feature columns used by trial 12.
df_trial.loc[12, 'param']['columns']

['spkt_welch_density__coeff_3',
 'abs_q75_7',
 'abs_q01_4',
 'spkt_welch_densitycoeff_2',
 'abs_max_roll_mean_1000',
 'abs_max_7',
 'abs_q75_6',
 'q25_roll_std_100',
 'q05_2',
 'median__roll_std',
 'abs_q25_5',
 'abs_max_roll_mean_100',
 'q05_roll_std_1000',
 'abs_max_1',
 'iqr_6',
 'q05_roll_std_100',
 "number_crossing_m{'m': 1}",
 'abs_q75_2',
 'abs_max_2',
 'abs_max_8',
 '5000peak_peak_amp_max_',
 'abs_max_4',
 "value_count{'value': 1}",
 'abs_max_3',
 'max_to_min_diff_5',
 'min__roll_std',
 'max_to_min',
 'min_9',
 'abs_max_6',
 'min_roll_mean_100',
 'q75_roll_mean_10',
 'abs_max_5',
 'Hilbert_mean_6',
 "number_peaks{'n': 10}",
 'min_roll_std_100']