In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler

import lightgbm as lgb
import catboost as cb
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor

from tsfresh.examples import load_robot_execution_failures
from tsfresh import extract_features, select_features
import optuna

from common import EP

import types

  from numpy.core.umath_tests import inner1d


In [2]:
%%HTML
<!-- Override the width of the notebook, menubar and main-toolbar containers. -->
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [3]:
# Read the train and test frames from their pickle files under feats/.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
df_train, df_test = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('feats/df_train.pkl', 'feats/df_test.pkl')
)

In [4]:
# Integer class label derived from the target: values below 15 are truncated
# to an int, everything else is collapsed into the single class 15.
df_train['label'] = df_train['y'].apply(lambda x:  int(x) if x<15 else 15)

# Build the `group` column from `season`, folding season 17 into group 1.
# Work on an explicit copy: `.values` can alias the frame's own buffer (an
# in-place write then silently edits df_train['season']) and is read-only
# under pandas Copy-on-Write, where the in-place write raises.
group = df_train['season'].values.copy()
group[group == 17] = 1
df_train['group'] = group

# `season` is fully replaced by `group` from here on.
df_train = df_train.drop(columns=['season'])

In [5]:
# Feature matrix and target for tsfresh's relevance filter.
# NOTE: despite the `test_` prefix, both objects are built from df_train.
test_X = df_train.drop(columns=['y', 'index', 'group', 'label']).copy()
test_y = df_train['y'].copy()

# Give X and y the same index (the 'index' column) before selecting features.
test_X.index = test_y.index = df_train['index']

# Names of the columns that select_features keeps.
tsfresh_columns = list(select_features(test_X, test_y).columns)

In [6]:
# Every column of df_train except the id, target, label and group columns.
original_columns = list(df_train.columns.drop(['index', 'y', 'label', 'group']))

In [7]:
# Restore the list of earlier trials (one dict per row of the saved frame).
# On a fresh run the pickle does not exist yet, so start from an empty
# history instead of failing; this replaces hand-toggling `mytrial = []`.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
try:
    mytrial = list(pd.read_pickle('trial/gradientboosting.pkl').T.to_dict().values())
except FileNotFoundError:
    mytrial = []
df_trial = pd.DataFrame(mytrial)
len(mytrial)

15

In [8]:
# Baseline experiment configuration passed to EP.process.
param = dict(
    algorithm=dict(
        cls='GradientBoostingRegressor',
        # No fit-time options are passed (early_stopping_rounds, eval_metric
        # and verbose are intentionally left out).
        fit={},
        # No constructor options are passed (n_jobs is intentionally left out).
        init={},
    ),
    columns=tsfresh_columns,
    feature_importance=dict(
        is_output=True,
        permutation_feature_importance=True,
        permutation_random_state=1,
    ),
    kfold=dict(
        n_splits=8,
        random_state=1985,
        shuffle=True,
        type='group',
    ),
    scaler=dict(cls='StandardScaler'),
)

In [13]:
# Run a single experiment with the baseline configuration; no test frame is
# supplied, and the result is recorded against the shared `mytrial` history.
(
    df_his,
    df_feature_importances,
    df_valid_pred,
    df_test_pred,
) = EP.process(
    df_train,
    param,
    df_test=None,
    trial=mytrial,
)

In [16]:
# Check the feature importances stored for trial 14 and derive the column
# list to continue with.
df_feature_importances = df_trial.loc[14]['df_feature_importances']
# isinstance (rather than an exact type comparison) also accepts DataFrame
# subclasses.
if isinstance(df_feature_importances, pd.DataFrame):
    sorted_columns = EP.evaluate(df_feature_importances, key='average_model_weight')
else:
    # No importance table was stored for this trial: fall back to the
    # column list the trial was run with.
    sorted_columns = df_trial.loc[14]['param']['columns']
# df_feature_importances.sort_values(by=['average_permutation_weight'], ascending=False)

In [17]:
# Display how many entries sorted_columns holds.
len(sorted_columns)

70

In [18]:
# Start the selection from sorted_columns with its last ten entries removed.
param['columns'] = sorted_columns[:-10]

# Select features by permutation weight.
EP.select_features_(
    df_train,
    param,
    mytrial,
    nfeats_best=25,
    nfeats_removed_per_try=10,
    key='average_permutation_weight',
)

In [22]:
# Rebuild the trial table from the trial list and show the headline metrics
# of every run.
df_trial = pd.DataFrame(mytrial)
df_trial.loc[:, [
    'datetime',
    'nfeatures',
    'train_mae',
    'train_mae_var',
    'val_mae',
    'val_mae_var',
    'mae_diff',
]]

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
0,2019-05-10 10:08:45.942462,1071,1.818027,0.010086,2.195905,0.527265,0.377878
1,2019-05-10 12:36:44.746797,200,1.819021,0.009948,2.184827,0.523375,0.365805
2,2019-05-10 12:44:53.047447,190,1.820537,0.010106,2.179309,0.525266,0.358772
3,2019-05-10 12:52:49.016791,180,1.821202,0.010218,2.174044,0.530444,0.352842
4,2019-05-10 13:00:23.427527,170,1.822041,0.009875,2.171204,0.531807,0.349163
5,2019-05-10 13:07:32.021287,160,1.826642,0.010263,2.168563,0.541657,0.341921
6,2019-05-10 13:14:10.965752,150,1.830268,0.01018,2.162702,0.531846,0.332434
7,2019-05-10 13:20:35.009869,140,1.834635,0.010421,2.14593,0.539925,0.311295
8,2019-05-10 13:26:21.571473,130,1.835565,0.010528,2.14705,0.537258,0.311484
9,2019-05-10 13:31:45.137073,120,1.838652,0.011059,2.141269,0.548358,0.302617


In [21]:
# IPython help lookup: shows the docstring of GradientBoostingRegressor.fit.
# NOTE(review): looks like leftover interactive exploration — consider removing.
GradientBoostingRegressor.fit?

In [None]:
###  tune hyperparameters
def objective(trial):
    """Optuna objective: evaluate one GradientBoostingRegressor configuration.

    Samples the model hyperparameters from ``trial``, runs ``EP.process`` on
    the notebook-level ``df_train`` / ``df_test`` using the column list stored
    in row 14 of ``df_trial``, stores summary statistics of the run as user
    attributes on the trial, and returns the score for the study.

    Parameters
    ----------
    trial : optuna trial
        Used to sample hyperparameters and to record user attributes.

    Returns
    -------
    float
        ``abs(val_mae - train_mae) * val_mae``: small when the validation
        error is low and close to the training error.
    """
    # NOTE(review): suggest_uniform is deprecated in newer Optuna releases in
    # favour of suggest_float — confirm the installed version before switching.
    max_depth = trial.suggest_int('max_depth', 2, 6)
    learning_rate = trial.suggest_uniform('learning_rate', 0.01, 0.4)
    n_estimators = trial.suggest_int('n_estimators', 100, 500)
    subsample = trial.suggest_uniform('subsample', 0.6, 1.0)
    # NOTE(review): per the scikit-learn docs, `alpha` is only used by the
    # 'huber' and 'quantile' losses; no `loss` is set here, so this dimension
    # presumably has no effect — confirm.
    alpha = trial.suggest_uniform('alpha', 0.00001, 1.0)
    random_state = trial.suggest_int('random_state', 1, 9999)

    args = {
        # Column list stored with trial 14 in the notebook-level df_trial.
        'columns': df_trial.loc[14]['param']['columns'],
        'kfold': {
            'n_splits': 8,
            'random_state': 1985,
            'shuffle': True,
            'type': 'stratified'
        },
        'scaler': {
            'cls': 'StandardScaler',
        },
        'algorithm': {
            'cls': 'GradientBoostingRegressor',
            'init': {
                "max_depth": max_depth,
                "learning_rate": learning_rate,
                "n_estimators": n_estimators,
                "subsample": subsample,
                "alpha": alpha,
                "random_state": random_state,
            },
            # No fit-time options are passed (eval_metric, verbose and
            # early_stopping_rounds are intentionally left out).
            'fit': {},
        },
        'feature_importance': {
            'is_output': False,
            'permutation_feature_importance': False,
            'permutation_random_state': 1,
        },
    }

    df_his, df_feature_importances, df_valid_pred, df_test_pred = EP.process(
        df_train, args, df_test=df_test, trial=mytrial, remark='tune feats selected by group')

    # Mean and variance of the `valid` / `train` values reported by the run
    # (presumably one MAE per CV fold — confirm against EP.process).
    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)

    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', val_mae_mean-train_mae_mean)
    trial.set_user_attr('val_mae_var', val_mae_var)
    # Previously computed but dropped; recorded alongside val_mae_var.
    trial.set_user_attr('train_mae_var', train_mae_var)

    return np.abs(val_mae_mean - train_mae_mean)*val_mae_mean

# Create a study with Optuna's default settings and evaluate `objective`
# for 200 trials.
study = optuna.create_study()
study.optimize(objective, n_trials=200)

In [43]:
# Test predictions stored with trial 342.
df_test_pred = df_trial.loc[342, 'df_test_pred']

In [44]:
# Build the submission: one prediction per row, averaged over every
# prediction column of df_test_pred (all columns except 'index').
# Both columns are taken positionally (`.values`). Assigning the 'index'
# Series directly would align it on the frame's index labels and yield
# NaN / shuffled ids whenever df_test_pred does not carry a default 0..n-1
# index.
df_submit = pd.DataFrame(
    {
        'time_to_failure': np.mean(df_test_pred.drop(columns=['index']).values, axis=1),
        'seg_id': df_test_pred['index'].values,
    },
    columns=['time_to_failure', 'seg_id'],  # keep the original column order
)
df_submit.to_csv('submission.csv', index=False)

In [25]:
# Rebuild the trial table from the current contents of mytrial.
df_trial = pd.DataFrame(mytrial)

In [32]:
# Top tuning runs: trials remarked 'tune feats selected by group' whose
# mae_diff is below 0.05, ordered by ascending val_mae (first five shown).
(
    df_trial
    .loc[lambda t: (t['remark'] == 'tune feats selected by group') & (t['mae_diff'] < .05)]
    .sort_values(by=['val_mae'], ascending=True)
    .loc[:, [
        'datetime',
        'remark',
        'nfeatures',
        'train_mae',
        'train_mae_var',
        'val_mae',
        'val_mae_var',
        'mae_diff',
    ]]
    .head()
)

Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
83,2019-05-12 02:39:59.250350,tune feats selected by group,70,1.856828,6e-06,1.903058,0.000434,0.046231
60,2019-05-11 23:38:40.848657,tune feats selected by group,70,1.856952,9e-06,1.906536,0.000461,0.049584
183,2019-05-12 09:48:16.149257,tune feats selected by group,70,1.863059,3e-06,1.910631,0.000432,0.047571
160,2019-05-12 09:00:36.183418,tune feats selected by group,70,1.870492,7e-06,1.914961,0.000454,0.044469
79,2019-05-12 02:01:09.027461,tune feats selected by group,70,1.876371,8e-06,1.919986,0.000399,0.043615


In [33]:
# Persist the trial table to the same pickle file the notebook reads its
# trial history from.
df_trial.to_pickle('trial/gradientboosting.pkl')

In [34]:
# Show the parameter dict stored with trial 83.
df_trial.loc[83, 'param']

{'columns': ['q25_roll_std_100',
  'spkt_welch_density__coeff_3',
  'abs_q01_4',
  'abs_q25_5',
  'q05_roll_std_1000',
  'median__roll_std',
  'iqr_6',
  'spkt_welch_densitycoeff_2',
  'q05_roll_std_100',
  'q05_2',
  'abs_q75_7',
  "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
  "value_count{'value': -1}",
  'abs_q95_2',
  'MA_1000MA_std_mean_7',
  '5000smoothness_std_',
  '5000quantile25peak_to_average_power_ratio_',
  'FFT_Mag_25q0',
  '5000skewness_max_',
  'q05_5',
  'max_to_min_5',
  "autocorrelation{'lag': 5}",
  'abs_q05_2',
  'min__roll_std',
  '5000peak_peak_amp_max_',
  'max_to_min_diff_5',
  '5000form_factor_quantile75',
  'iqr',
  'kurt_7',
  'spkt_welch_density__coeff_4',
  "number_peaks{'n': 3}",
  'spkt_welch_density__coeff_42',
  'fft_coefficientcoeff_20__attr_"abs"',
  '5000quantile05median_',
  'abs_max_2',
  "number_peaks{'n': 1}",
  'abs_max_8',
  "number_peaks{'n': 5}",
  'spkt_welch_density__coeff_17',
  'abs_max_3',
  '4th_peak_psd',
