In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler

import lightgbm as lgb

from tsfresh.examples import load_robot_execution_failures
from tsfresh import extract_features, select_features
import optuna

from common import EP

import types

Using TensorFlow backend.


In [2]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [3]:
pd.set_option('display.max_rows', 2000)

In [6]:
df_train = pd.read_pickle('../feats/df_train.pkl')
df_test = pd.read_pickle('../feats/df_test.pkl')

In [7]:
df_train['label'] = df_train['y'].apply(lambda x:  int(x) if x<15 else 15)
group = df_train['season'].values
group[np.where(group==17)[0]] = 1
df_train['group'] = group
df_train = df_train.drop(columns=['season'])

In [8]:
test_X = df_train.drop(columns=['y','index','group','label']).copy()
test_X.index = df_train['index']
test_y = df_train['y'].copy()
test_y.index = df_train['index']
tsfresh_columns = select_features(test_X, test_y).columns.tolist()

In [9]:
original_columns = df_train.columns.drop(['index','y','label','group']).tolist()

In [10]:
#check feature_importances
# df_feature_importances = df_trial.loc[294]['df_feature_importances']
# sorted_columns = EP.evaluate(df_feature_importances, key='average_permutation_weight')
# df_feature_importances.sort_values(by=['average_permutation_weight'], ascending=False).head(100)
# len(sorted_columns)

In [11]:
# mytrial = []
mytrial = list(pd.read_pickle('../trial/xgbm.pkl').T.to_dict().values())
df_trial = pd.DataFrame(mytrial)

In [12]:
param = df_trial.loc[865]['param']

In [14]:
param = {'columns': ['q25_roll_std_100',
  'abs_q01_4',
  'spkt_welch_density__coeff_3',
  'q05_roll_std_100',
  'abs_q25_5',
  'spkt_welch_densitycoeff_2',
  'iqr_6',
  "number_peaks{'n': 10}",
  'q05_roll_std_1000',
  'abs_q75_6',
  'abs_q95_2',
  'median__roll_std',
  'q05_5',
  'abs_q75_7',
  "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
  'spkt_welch_density__coeff_25',
  'spkt_welch_density__coeff_4',
  'abs_max_1',
  '5000std_quantile05',
  'agg_autocorrelationf_agg_"mean"__maxlag_40',
  'Hilbert_mean_2',
  'FFT_Mag_75q0',
  '5000smoothness_entropy_',
  'MA_1000MA_std_mean_7',
  "number_peaks{'n': 5}"],
 'kfold': {'n_splits': 3,
  'random_state': 1985,
  'shuffle': True,
  'type': 'stratified'},
 'scaler': {'cls': 'StandardScaler'},
 'algorithm': {'cls': 'xgb.XGBRegressor',
  'init': {'max_depth': 3,
   'max_bin': 38,
   'eta': 0.27801915385245873,
   'colsample_bytree': 0.9416983653127328,
   'min_child_weight': 238,
   'n_estimators': 165,
   'subsample': 0.7471829960670435,
   'reg_lambda': 0.6813060508093833,
   'reg_alpha': 0.36085980027529035,
   'n_jobs': 16},
  'fit': {'eval_metric': 'mae',
   'verbose': False,
   'early_stopping_rounds': 200}},
}

In [15]:
# run one try
df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train, param, df_test = df_test, trial=mytrial)

In [82]:
#  select features by permutation_weight
EP.select_features_(df_train, param, mytrial, nfeats_best=20, nfeats_removed_per_try=5, key='average_permutation_weight')

In [103]:
len(columns294)

25

In [18]:
#  tune hypterparameters
def objective(trial):
        
    max_depth = trial.suggest_int('max_depth', 2, 6)
    max_bin = trial.suggest_int('max_bin', 10, 100)
    eta = trial.suggest_uniform('eta', 0.01, 0.4)
    colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.6, 1)
    min_child_weight = trial.suggest_int('min_child_weight', 200, 600)
    n_estimators = trial.suggest_int('n_estimators', 100, 500)
    subsample = trial.suggest_uniform('subsample', 0.6, 1.0)
    reg_lambda = trial.suggest_uniform('reg_lambda', 0.000001, 1.0)
    reg_alpha = trial.suggest_uniform('reg_alpha', 0.000001, 1.0)
    random_state = trial.suggest_int('random_state', 1, 9999)
        
    args={
        'columns':param['columns'].copy(),
        'kfold':{
            'n_splits': 3,
            'random_state': 1985,
            'shuffle': True,
            'type': 'stratified'
        },
        'scaler':{
            'cls':'StandardScaler',
        },
        'algorithm':{
            'cls':'xgb.XGBRegressor',
            'init':{
                "max_depth":max_depth,
                "max_bin":max_bin,
                "eta":eta,
                "colsample_bytree":colsample_bytree,
                "min_child_weight":min_child_weight,
                "n_estimators":n_estimators,
                "subsample":subsample,
                "reg_lambda":reg_lambda,
                "reg_alpha":reg_alpha,
                'n_jobs':16
            },
            'fit':{
                'eval_metric':'mae',
                'verbose':False,
                'early_stopping_rounds':200,
            },
        },
    }
    
    df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train, args, df_test = df_test, trial=mytrial, remark='tune 1099')
    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)
    
    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', val_mae_mean-train_mae_mean)
    trial.set_user_attr('val_mae_var', val_mae_var)

    return np.abs(val_mae_mean - train_mae_mean)*val_mae_mean

study = optuna.create_study()
study.optimize(objective, n_trials=200)

[I 2019-05-16 09:14:05,009] Finished a trial resulted in value: 0.22332616119117185. Current best value is 0.22332616119117185 with parameters: {'max_depth': 5, 'max_bin': 70, 'eta': 0.15413747048153745, 'colsample_bytree': 0.997771832730266, 'min_child_weight': 345, 'n_estimators': 261, 'subsample': 0.6125066380774034, 'reg_lambda': 0.2087631822784529, 'reg_alpha': 0.5501064541517754, 'random_state': 2240}.
[I 2019-05-16 09:14:17,812] Finished a trial resulted in value: 0.34135284820188955. Current best value is 0.22332616119117185 with parameters: {'max_depth': 5, 'max_bin': 70, 'eta': 0.15413747048153745, 'colsample_bytree': 0.997771832730266, 'min_child_weight': 345, 'n_estimators': 261, 'subsample': 0.6125066380774034, 'reg_lambda': 0.2087631822784529, 'reg_alpha': 0.5501064541517754, 'random_state': 2240}.
[I 2019-05-16 09:14:21,409] Finished a trial resulted in value: 0.11315994770983447. Current best value is 0.11315994770983447 with parameters: {'max_depth': 2, 'max_bin': 73, 

[I 2019-05-16 09:15:27,133] Finished a trial resulted in value: 0.07385803881693298. Current best value is 0.06739604299285634 with parameters: {'max_depth': 3, 'max_bin': 32, 'eta': 0.28712134044767545, 'colsample_bytree': 0.6281088800171014, 'min_child_weight': 503, 'n_estimators': 109, 'subsample': 0.6808896930660422, 'reg_lambda': 0.04045632786454262, 'reg_alpha': 0.3247036557971885, 'random_state': 7187}.
[I 2019-05-16 09:15:36,407] Finished a trial resulted in value: 0.18504255769761563. Current best value is 0.06739604299285634 with parameters: {'max_depth': 3, 'max_bin': 32, 'eta': 0.28712134044767545, 'colsample_bytree': 0.6281088800171014, 'min_child_weight': 503, 'n_estimators': 109, 'subsample': 0.6808896930660422, 'reg_lambda': 0.04045632786454262, 'reg_alpha': 0.3247036557971885, 'random_state': 7187}.
[I 2019-05-16 09:15:38,968] Finished a trial resulted in value: 0.0893943018722566. Current best value is 0.06739604299285634 with parameters: {'max_depth': 3, 'max_bin': 3

[I 2019-05-16 09:16:49,586] Finished a trial resulted in value: 0.08028552393954957. Current best value is 0.05683156488218611 with parameters: {'max_depth': 2, 'max_bin': 84, 'eta': 0.13798883187977934, 'colsample_bytree': 0.6425718028802578, 'min_child_weight': 531, 'n_estimators': 140, 'subsample': 0.8889148108583331, 'reg_lambda': 0.27145770735296665, 'reg_alpha': 0.8096204469996953, 'random_state': 7823}.
[I 2019-05-16 09:16:55,674] Finished a trial resulted in value: 0.21239377852704508. Current best value is 0.05683156488218611 with parameters: {'max_depth': 2, 'max_bin': 84, 'eta': 0.13798883187977934, 'colsample_bytree': 0.6425718028802578, 'min_child_weight': 531, 'n_estimators': 140, 'subsample': 0.8889148108583331, 'reg_lambda': 0.27145770735296665, 'reg_alpha': 0.8096204469996953, 'random_state': 7823}.
[I 2019-05-16 09:16:57,513] Finished a trial resulted in value: 0.06120943551994754. Current best value is 0.05683156488218611 with parameters: {'max_depth': 2, 'max_bin': 

[I 2019-05-16 09:18:06,873] Finished a trial resulted in value: 0.3097385740667385. Current best value is 0.04703886080491549 with parameters: {'max_depth': 2, 'max_bin': 60, 'eta': 0.1768410843742944, 'colsample_bytree': 0.7367186252762243, 'min_child_weight': 566, 'n_estimators': 100, 'subsample': 0.9077497450330404, 'reg_lambda': 0.45876214908583335, 'reg_alpha': 0.7519475870876794, 'random_state': 8150}.
[I 2019-05-16 09:18:14,770] Finished a trial resulted in value: 0.1804071999411052. Current best value is 0.04703886080491549 with parameters: {'max_depth': 2, 'max_bin': 60, 'eta': 0.1768410843742944, 'colsample_bytree': 0.7367186252762243, 'min_child_weight': 566, 'n_estimators': 100, 'subsample': 0.9077497450330404, 'reg_lambda': 0.45876214908583335, 'reg_alpha': 0.7519475870876794, 'random_state': 8150}.
[I 2019-05-16 09:18:19,361] Finished a trial resulted in value: 0.1506218540085801. Current best value is 0.04703886080491549 with parameters: {'max_depth': 2, 'max_bin': 60, '

[I 2019-05-16 09:19:03,833] Finished a trial resulted in value: 0.07829011691598281. Current best value is 0.045125228230492094 with parameters: {'max_depth': 2, 'max_bin': 77, 'eta': 0.19375694888369022, 'colsample_bytree': 0.9651791591525931, 'min_child_weight': 573, 'n_estimators': 113, 'subsample': 0.6931806237371879, 'reg_lambda': 0.7550648959719608, 'reg_alpha': 0.21356375395321225, 'random_state': 9015}.
[I 2019-05-16 09:19:06,058] Finished a trial resulted in value: 0.06058442734067909. Current best value is 0.045125228230492094 with parameters: {'max_depth': 2, 'max_bin': 77, 'eta': 0.19375694888369022, 'colsample_bytree': 0.9651791591525931, 'min_child_weight': 573, 'n_estimators': 113, 'subsample': 0.6931806237371879, 'reg_lambda': 0.7550648959719608, 'reg_alpha': 0.21356375395321225, 'random_state': 9015}.
[I 2019-05-16 09:19:08,148] Finished a trial resulted in value: 0.05169696229899783. Current best value is 0.045125228230492094 with parameters: {'max_depth': 2, 'max_bin

[I 2019-05-16 09:20:12,970] Finished a trial resulted in value: 0.06729347377094243. Current best value is 0.045125228230492094 with parameters: {'max_depth': 2, 'max_bin': 77, 'eta': 0.19375694888369022, 'colsample_bytree': 0.9651791591525931, 'min_child_weight': 573, 'n_estimators': 113, 'subsample': 0.6931806237371879, 'reg_lambda': 0.7550648959719608, 'reg_alpha': 0.21356375395321225, 'random_state': 9015}.
[I 2019-05-16 09:20:17,058] Finished a trial resulted in value: 0.13970878890619798. Current best value is 0.045125228230492094 with parameters: {'max_depth': 2, 'max_bin': 77, 'eta': 0.19375694888369022, 'colsample_bytree': 0.9651791591525931, 'min_child_weight': 573, 'n_estimators': 113, 'subsample': 0.6931806237371879, 'reg_lambda': 0.7550648959719608, 'reg_alpha': 0.21356375395321225, 'random_state': 9015}.
[I 2019-05-16 09:20:21,786] Finished a trial resulted in value: 0.16539842008105246. Current best value is 0.045125228230492094 with parameters: {'max_depth': 2, 'max_bin

[I 2019-05-16 09:21:19,304] Finished a trial resulted in value: 0.1179953317134778. Current best value is 0.045125228230492094 with parameters: {'max_depth': 2, 'max_bin': 77, 'eta': 0.19375694888369022, 'colsample_bytree': 0.9651791591525931, 'min_child_weight': 573, 'n_estimators': 113, 'subsample': 0.6931806237371879, 'reg_lambda': 0.7550648959719608, 'reg_alpha': 0.21356375395321225, 'random_state': 9015}.
[I 2019-05-16 09:21:21,658] Finished a trial resulted in value: 0.06491254989997902. Current best value is 0.045125228230492094 with parameters: {'max_depth': 2, 'max_bin': 77, 'eta': 0.19375694888369022, 'colsample_bytree': 0.9651791591525931, 'min_child_weight': 573, 'n_estimators': 113, 'subsample': 0.6931806237371879, 'reg_lambda': 0.7550648959719608, 'reg_alpha': 0.21356375395321225, 'random_state': 9015}.
[I 2019-05-16 09:21:23,168] Finished a trial resulted in value: 0.043682682185597045. Current best value is 0.043682682185597045 with parameters: {'max_depth': 2, 'max_bin

[I 2019-05-16 09:22:26,128] Finished a trial resulted in value: 0.053089756489457365. Current best value is 0.043682682185597045 with parameters: {'max_depth': 2, 'max_bin': 71, 'eta': 0.16971531294720194, 'colsample_bytree': 0.7986767339359048, 'min_child_weight': 570, 'n_estimators': 108, 'subsample': 0.6638606954842163, 'reg_lambda': 0.7483672066652254, 'reg_alpha': 0.057329806805137995, 'random_state': 6194}.
[I 2019-05-16 09:22:28,923] Finished a trial resulted in value: 0.10218545083108882. Current best value is 0.043682682185597045 with parameters: {'max_depth': 2, 'max_bin': 71, 'eta': 0.16971531294720194, 'colsample_bytree': 0.7986767339359048, 'min_child_weight': 570, 'n_estimators': 108, 'subsample': 0.6638606954842163, 'reg_lambda': 0.7483672066652254, 'reg_alpha': 0.057329806805137995, 'random_state': 6194}.
[I 2019-05-16 09:22:31,810] Finished a trial resulted in value: 0.07745222945652137. Current best value is 0.043682682185597045 with parameters: {'max_depth': 2, 'max_

[I 2019-05-16 09:23:27,051] Finished a trial resulted in value: 0.050498713246271636. Current best value is 0.043682682185597045 with parameters: {'max_depth': 2, 'max_bin': 71, 'eta': 0.16971531294720194, 'colsample_bytree': 0.7986767339359048, 'min_child_weight': 570, 'n_estimators': 108, 'subsample': 0.6638606954842163, 'reg_lambda': 0.7483672066652254, 'reg_alpha': 0.057329806805137995, 'random_state': 6194}.
[I 2019-05-16 09:23:29,145] Finished a trial resulted in value: 0.0784597377718161. Current best value is 0.043682682185597045 with parameters: {'max_depth': 2, 'max_bin': 71, 'eta': 0.16971531294720194, 'colsample_bytree': 0.7986767339359048, 'min_child_weight': 570, 'n_estimators': 108, 'subsample': 0.6638606954842163, 'reg_lambda': 0.7483672066652254, 'reg_alpha': 0.057329806805137995, 'random_state': 6194}.
[I 2019-05-16 09:23:30,416] Finished a trial resulted in value: 0.04680115854626666. Current best value is 0.043682682185597045 with parameters: {'max_depth': 2, 'max_b

[I 2019-05-16 09:24:07,822] Finished a trial resulted in value: 0.06319991911247884. Current best value is 0.04345219677362077 with parameters: {'max_depth': 2, 'max_bin': 71, 'eta': 0.3330136278457244, 'colsample_bytree': 0.9994467568793932, 'min_child_weight': 518, 'n_estimators': 101, 'subsample': 0.6573916871012608, 'reg_lambda': 0.55057174985273, 'reg_alpha': 0.2609187406625414, 'random_state': 6111}.
[I 2019-05-16 09:24:09,332] Finished a trial resulted in value: 0.06835523817638041. Current best value is 0.04345219677362077 with parameters: {'max_depth': 2, 'max_bin': 71, 'eta': 0.3330136278457244, 'colsample_bytree': 0.9994467568793932, 'min_child_weight': 518, 'n_estimators': 101, 'subsample': 0.6573916871012608, 'reg_lambda': 0.55057174985273, 'reg_alpha': 0.2609187406625414, 'random_state': 6111}.
[I 2019-05-16 09:24:11,411] Finished a trial resulted in value: 0.05569544921624145. Current best value is 0.04345219677362077 with parameters: {'max_depth': 2, 'max_bin': 71, 'eta

[I 2019-05-16 09:25:09,966] Finished a trial resulted in value: 0.07901004929525424. Current best value is 0.04345219677362077 with parameters: {'max_depth': 2, 'max_bin': 71, 'eta': 0.3330136278457244, 'colsample_bytree': 0.9994467568793932, 'min_child_weight': 518, 'n_estimators': 101, 'subsample': 0.6573916871012608, 'reg_lambda': 0.55057174985273, 'reg_alpha': 0.2609187406625414, 'random_state': 6111}.
[I 2019-05-16 09:25:11,760] Finished a trial resulted in value: 0.05065985705218221. Current best value is 0.04345219677362077 with parameters: {'max_depth': 2, 'max_bin': 71, 'eta': 0.3330136278457244, 'colsample_bytree': 0.9994467568793932, 'min_child_weight': 518, 'n_estimators': 101, 'subsample': 0.6573916871012608, 'reg_lambda': 0.55057174985273, 'reg_alpha': 0.2609187406625414, 'random_state': 6111}.
[I 2019-05-16 09:25:14,858] Finished a trial resulted in value: 0.1297017104004351. Current best value is 0.04345219677362077 with parameters: {'max_depth': 2, 'max_bin': 71, 'eta'

In [19]:
df_trial = pd.DataFrame(mytrial)
df_trial['kfold-type'] = df_trial['param'].apply(lambda x: x['kfold']['type'])

In [20]:
df_trial[(df_trial['remark']=='tune 1099')&(df_trial['mae_diff']<.05)].sort_values(by=['val_mae'])[['datetime','kfold-type', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,kfold-type,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
1229,2019-05-16 09:22:11.699960,stratified,25,1.909132,4.802897e-06,1.958156,0.000155,0.049024
1104,2019-05-16 09:14:30.883310,stratified,25,1.909005,1.147812e-05,1.958252,8.5e-05,0.049247
1231,2019-05-16 09:22:19.630253,stratified,25,1.910503,1.409861e-05,1.959671,8.5e-05,0.049167
1278,2019-05-16 09:24:33.492560,stratified,25,1.911468,1.553884e-05,1.959677,5.4e-05,0.048209
1112,2019-05-16 09:15:10.652280,stratified,25,1.917552,1.450193e-05,1.96286,6.2e-05,0.045309
1106,2019-05-16 09:14:41.810814,stratified,25,1.916741,1.048464e-05,1.964034,0.000111,0.047293
1172,2019-05-16 09:18:54.750120,stratified,25,1.924779,1.634338e-05,1.96556,6.6e-05,0.040782
1282,2019-05-16 09:24:45.958261,stratified,25,1.922498,1.111919e-05,1.965673,5.2e-05,0.043175
1236,2019-05-16 09:22:35.698510,stratified,25,1.922214,8.130392e-06,1.965697,0.000123,0.043483
1202,2019-05-16 09:20:42.911479,stratified,25,1.920873,1.811181e-05,1.96594,8.6e-05,0.045068


In [22]:
df_trial.to_pickle('../trial/xgbm.pkl')

In [130]:
columns146 = df_trial.loc[146]['param']['columns']
columns474 = df_trial.loc[474]['param']['columns']#865
columns294 = df_trial.loc[294]['param']['columns']#903

In [23]:
df_trial.loc[1172:1172][['datetime','nfeatures', 'remark', 'kfold-type','train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,nfeatures,remark,kfold-type,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
1099,2019-05-16 09:12:22.544278,25,,stratified,1.890071,9e-06,1.947984,0.000115,0.057913


In [24]:
df_trial = pd.DataFrame(mytrial)
df_trial.to_pickle('../trial/xgbm.pkl')