In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler

import lightgbm as lgb
import catboost as cb
from rgf.sklearn import RGFRegressor

from tsfresh.examples import load_robot_execution_failures
from tsfresh import extract_features, select_features
import optuna

from common import EP
from dfdb import DFDB

import types

  from numpy.core.umath_tests import inner1d
Using TensorFlow backend.


In [2]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [3]:
# Load the pickled train and test frames from the feats directory.
df_train, df_test = map(
    pd.read_pickle,
    ('../feats/df_train.pkl', '../feats/df_test.pkl'),
)

In [4]:
# Integer class label: y truncated to int, with every value that is not
# below 15 collapsed into class 15.
df_train['label'] = df_train['y'].apply(lambda x:  int(x) if x<15 else 15)

# Grouping key: the season id, with season 17 folded into group 1.
# np.where builds a fresh array, so nothing is written through the array
# returned by `.values` (the original in-place assignment mutated the frame's
# own 'season' data, and fails on a read-only array under copy-on-write).
season = df_train['season'].values
group = np.where(season == 17, 1, season)
df_train['group'] = group
df_train = df_train.drop(columns=['season'])

In [5]:
# Run tsfresh feature selection on the training frame.
# Note: test_X / test_y are built from df_train despite their names.
sample_ids = df_train['index']

test_X = df_train.drop(columns=['y','index','group','label']).copy()
test_y = df_train['y'].copy()

# Features and target must share the same index for select_features.
test_X.index = sample_ids
test_y.index = sample_ids

selected = select_features(test_X, test_y)
tsfresh_columns = selected.columns.tolist()

In [6]:
# All columns of df_train except the id, target, label and group columns.
original_columns = (
    df_train
    .drop(columns=['index','y','label','group'])
    .columns
    .tolist()
)

In [7]:
# Trial store opened on '../trial/frgf.pkl' with auto_commit disabled; later
# cells call db.insert(), db.select() and an explicit db.commit().
# NOTE(review): presumably nothing is persisted to the pickle until commit()
# is called — confirm against the dfdb module.
db = DFDB('../trial/frgf.pkl', auto_commit=False)

In [8]:
#check feature_importances
# df_feature_importances = df_trial.loc[235]['df_feature_importances']
# if type(df_feature_importances)==pd.DataFrame:
#     sorted_columns = EP.evaluate(df_feature_importances, key='average_model_weight')
# else:
#     sorted_columns = df_trial.loc[235]['param']['columns']
# df_feature_importances.sort_values(by=['average_permutation_weight'], ascending=False)
# len(sorted_columns)

In [9]:
# Hand-written list of 30 feature column names kept as `catboost_columns`.
# It is not referenced by any other cell visible in this notebook.
# NOTE(review): 'spkt_welch_densitycoeff_2' lacks the '__' separator used by
# 'spkt_welch_density__coeff_3' — confirm it matches a real column name.
catboost_columns = ['spkt_welch_density__coeff_3',
 'abs_q25_5',
 'q25_roll_std_100',
 'abs_q75_6',
 'abs_q75_7',
 'spkt_welch_densitycoeff_2',
 'abs_q01_4',
 'iqr_6',
 'q05_roll_std_100',
 'q05_roll_std_1000',
 'median__roll_std',
 'abs_q01_5',
 "number_peaks{'n': 10}",
 'FFT_Mag_75q0',
 "value_count{'value': 1}",
 'q01_roll_std_100',
 'abs_q95_2',
 'abs_q95_6',
 'MA_1000MA_std_mean_7',
 'q05_roll_std_10',
 'q01_roll_std_1000',
 'abs_max_roll_mean_1000',
 'abs_q75_2',
 'abs_q05_6',
 '5000std_quantile25',
 "number_crossing_m{'m': 1}",
 "autocorrelation{'lag': 5}",
 'q75_roll_std_10',
 'q05_2',
 '5000smoothness_quantile05']

In [29]:
# Seven feature names; each one also appears in both catboost_columns and
# main_columns. Not referenced by any other cell visible in this notebook.
common_columns = ['q25_roll_std_100',
 'abs_q25_5',
 'iqr_6',
 'abs_q01_4',
 'abs_q75_7',
 'spkt_welch_density__coeff_3',
 'spkt_welch_densitycoeff_2']

In [38]:
# The 14 feature names passed as 'columns' in `param` and in the args built
# by both objective() definitions below.
main_columns = ['q25_roll_std_100',
 'abs_q25_5',
 'q05_roll_std_1000',
 'abs_q95_2',
 'abs_q75_6',
 'iqr_6',
 "autocorrelation{'lag': 5}",
 'median__roll_std',
 'abs_q01_4',
 'q05_roll_std_100',
 'abs_q75_7',
 "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
 'spkt_welch_density__coeff_3',
 'spkt_welch_densitycoeff_2']

In [39]:
# param = df_trial.loc[342]['param'].copy()
# Fixed (non-tuned) configuration in the same layout as the args that the
# objective() functions hand to EP.process.
kfold_cfg = {
    'n_splits': 3,
    'random_state': 1985,
    'shuffle': True,
    'type': 'group',
}

# Values stored under 'init' (presumably RGFRegressor constructor kwargs —
# confirm in EP.process).
rgf_init = {
    'max_leaf': 77,
    'reg_depth': 39.6652,
    'l2': 0.0765817,
    'min_samples_leaf': 51,
    'learning_rate': 0.421769,
}

param = {
    'columns': main_columns,
    'kfold': kfold_cfg,
    'scaler': {'cls': 'StandardScaler'},
    'algorithm': {
        'cls': 'RGFRegressor',
        'init': rgf_init,
        'fit': {},
    },
}

In [43]:
# Module-level list passed to EP.process(trial=...) by objective(); a later
# cell iterates over it and inserts every item into `db`.
# NOTE(review): presumably EP.process appends one record per call — confirm.
mytrial = []

# tune hyperparameters
def objective(trial):
    """Optuna objective: evaluate one RGFRegressor configuration via EP.process.

    Draws max_leaf, reg_depth, l2, min_samples_leaf and learning_rate from the
    trial, runs EP.process on df_train / df_test with kfold type 'group' and a
    StandardScaler, and stores summary statistics of the returned history
    frame as user attributes on the trial.

    Args:
        trial: object providing suggest_int, suggest_uniform and
            set_user_attr (an optuna trial).

    Returns:
        abs(val_mae - train_mae) * val_mae, where val_mae and train_mae are
        the means of df_his.valid and df_his.train. The score is small only
        when both the validation error and the train/validation gap are small.
    """
    max_leaf = trial.suggest_int('max_leaf', 50, 5000)
    reg_depth = trial.suggest_uniform('reg_depth', 1.0, 100.0)
    l2 = trial.suggest_uniform('l2', 0.001, .1)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 100)
    learning_rate = trial.suggest_uniform('learning_rate', .05, .5)

    args = {
        'columns': main_columns,
        'kfold': {
            'n_splits': 3,
            'random_state': 1985,
            'shuffle': True,
            'type': 'group'
        },
        'scaler': {
            'cls': 'StandardScaler',
        },
        'algorithm': {
            'cls': 'RGFRegressor',
            'init': {
                "max_leaf": max_leaf,
                "reg_depth": reg_depth,
                "l2": l2,
                "min_samples_leaf": min_samples_leaf,
                "learning_rate": learning_rate,
            },
            'fit': {
            },
        }
    }

    # Only the history frame is used here; the feature importances and the
    # validation/test predictions returned alongside it are discarded.
    df_his, _, _, _ = EP.process(df_train, args, df_test = df_test, trial=mytrial, remark='')
    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)

    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    # Stored signed, unlike the returned score which uses the absolute gap.
    trial.set_user_attr('mae_diff', val_mae_mean-train_mae_mean)
    trial.set_user_attr('val_mae_var', val_mae_var)
    # Previously computed but dropped; recorded next to its validation twin.
    trial.set_user_attr('train_mae_var', train_mae_var)

    return np.abs(val_mae_mean - train_mae_mean)*val_mae_mean

# New study created with default arguments. In the saved log the reported
# "best value" only ever decreases, i.e. the objective is being minimized.
study = optuna.create_study()
# Run 200 evaluations of objective(); each one calls EP.process.
study.optimize(objective, n_trials=200)

[I 2019-05-18 13:59:20,508] Finished trial#0 resulted in value: 0.18346490096806758. Current best value is 0.18346490096806758 with parameters: {'max_leaf': 3191, 'reg_depth': 27.931100446126052, 'l2': 0.06794390256291125, 'min_samples_leaf': 100, 'learning_rate': 0.19466099051641772}.
[I 2019-05-18 14:01:45,374] Finished trial#1 resulted in value: 0.1546740567966697. Current best value is 0.1546740567966697 with parameters: {'max_leaf': 3021, 'reg_depth': 68.32488482527373, 'l2': 0.04416040601618562, 'min_samples_leaf': 67, 'learning_rate': 0.2570302232925987}.
[I 2019-05-18 14:04:17,075] Finished trial#2 resulted in value: 0.1410766624317462. Current best value is 0.1410766624317462 with parameters: {'max_leaf': 3109, 'reg_depth': 51.06823909643579, 'l2': 0.07830449380536389, 'min_samples_leaf': 45, 'learning_rate': 0.2792543522723773}.
[I 2019-05-18 14:04:53,571] Finished trial#3 resulted in value: 0.09311141789975302. Current best value is 0.09311141789975302 with parameters: {'max

[I 2019-05-18 14:44:58,132] Finished trial#28 resulted in value: 0.18389335863391387. Current best value is 0.07586773902651446 with parameters: {'max_leaf': 203, 'reg_depth': 96.89824210850634, 'l2': 0.031991663358375944, 'min_samples_leaf': 80, 'learning_rate': 0.3757755420254981}.
[I 2019-05-18 14:46:20,622] Finished trial#29 resulted in value: 0.11698604906589401. Current best value is 0.07586773902651446 with parameters: {'max_leaf': 203, 'reg_depth': 96.89824210850634, 'l2': 0.031991663358375944, 'min_samples_leaf': 80, 'learning_rate': 0.3757755420254981}.
[I 2019-05-18 14:48:29,981] Finished trial#30 resulted in value: 0.22923571543413654. Current best value is 0.07586773902651446 with parameters: {'max_leaf': 203, 'reg_depth': 96.89824210850634, 'l2': 0.031991663358375944, 'min_samples_leaf': 80, 'learning_rate': 0.3757755420254981}.
[I 2019-05-18 14:48:53,806] Finished trial#31 resulted in value: 0.11173834635366389. Current best value is 0.07586773902651446 with parameters: 

[I 2019-05-18 15:26:55,196] Finished trial#56 resulted in value: 0.14681812320217627. Current best value is 0.0650817502745598 with parameters: {'max_leaf': 351, 'reg_depth': 95.00754911455914, 'l2': 0.08333771062983356, 'min_samples_leaf': 14, 'learning_rate': 0.21915752324522883}.
[I 2019-05-18 15:27:03,309] Finished trial#57 resulted in value: 0.032428187712032416. Current best value is 0.032428187712032416 with parameters: {'max_leaf': 87, 'reg_depth': 88.45130091028811, 'l2': 0.08507606410483247, 'min_samples_leaf': 20, 'learning_rate': 0.0772022715608974}.
[I 2019-05-18 15:28:58,519] Finished trial#58 resulted in value: 0.10592808767822463. Current best value is 0.032428187712032416 with parameters: {'max_leaf': 87, 'reg_depth': 88.45130091028811, 'l2': 0.08507606410483247, 'min_samples_leaf': 20, 'learning_rate': 0.0772022715608974}.
[I 2019-05-18 15:29:28,897] Finished trial#59 resulted in value: 0.09263470015104955. Current best value is 0.032428187712032416 with parameters: {

[I 2019-05-18 15:48:49,321] Finished trial#84 resulted in value: 0.08916300267767784. Current best value is 0.023923312827178578 with parameters: {'max_leaf': 83, 'reg_depth': 83.82931138712091, 'l2': 0.07807088494293479, 'min_samples_leaf': 21, 'learning_rate': 0.05048217323078135}.
[I 2019-05-18 15:49:42,594] Finished trial#85 resulted in value: 0.09866566852687056. Current best value is 0.023923312827178578 with parameters: {'max_leaf': 83, 'reg_depth': 83.82931138712091, 'l2': 0.07807088494293479, 'min_samples_leaf': 21, 'learning_rate': 0.05048217323078135}.
[I 2019-05-18 15:53:24,545] Finished trial#86 resulted in value: 0.12028611062996353. Current best value is 0.023923312827178578 with parameters: {'max_leaf': 83, 'reg_depth': 83.82931138712091, 'l2': 0.07807088494293479, 'min_samples_leaf': 21, 'learning_rate': 0.05048217323078135}.
[I 2019-05-18 15:54:24,491] Finished trial#87 resulted in value: 0.09665278223009495. Current best value is 0.023923312827178578 with parameters:

[I 2019-05-18 16:27:07,008] Finished trial#112 resulted in value: 0.11739310131213071. Current best value is 0.023923312827178578 with parameters: {'max_leaf': 83, 'reg_depth': 83.82931138712091, 'l2': 0.07807088494293479, 'min_samples_leaf': 21, 'learning_rate': 0.05048217323078135}.
[I 2019-05-18 16:31:08,112] Finished trial#113 resulted in value: 0.13988570299281242. Current best value is 0.023923312827178578 with parameters: {'max_leaf': 83, 'reg_depth': 83.82931138712091, 'l2': 0.07807088494293479, 'min_samples_leaf': 21, 'learning_rate': 0.05048217323078135}.
[I 2019-05-18 16:31:24,350] Finished trial#114 resulted in value: 0.11823370801239164. Current best value is 0.023923312827178578 with parameters: {'max_leaf': 83, 'reg_depth': 83.82931138712091, 'l2': 0.07807088494293479, 'min_samples_leaf': 21, 'learning_rate': 0.05048217323078135}.
[I 2019-05-18 16:31:36,463] Finished trial#115 resulted in value: 0.06726415717033886. Current best value is 0.023923312827178578 with paramet

[I 2019-05-18 16:58:05,664] Finished trial#140 resulted in value: 0.16840772084937888. Current best value is 0.023923312827178578 with parameters: {'max_leaf': 83, 'reg_depth': 83.82931138712091, 'l2': 0.07807088494293479, 'min_samples_leaf': 21, 'learning_rate': 0.05048217323078135}.
[I 2019-05-18 17:01:25,364] Finished trial#141 resulted in value: 0.1303687739455445. Current best value is 0.023923312827178578 with parameters: {'max_leaf': 83, 'reg_depth': 83.82931138712091, 'l2': 0.07807088494293479, 'min_samples_leaf': 21, 'learning_rate': 0.05048217323078135}.
[I 2019-05-18 17:02:09,397] Finished trial#142 resulted in value: 0.0981675030251313. Current best value is 0.023923312827178578 with parameters: {'max_leaf': 83, 'reg_depth': 83.82931138712091, 'l2': 0.07807088494293479, 'min_samples_leaf': 21, 'learning_rate': 0.05048217323078135}.
[I 2019-05-18 17:02:21,251] Finished trial#143 resulted in value: 0.09591354047026299. Current best value is 0.023923312827178578 with parameter

[I 2019-05-18 17:18:04,509] Finished trial#168 resulted in value: 0.09490507440698522. Current best value is 0.023923312827178578 with parameters: {'max_leaf': 83, 'reg_depth': 83.82931138712091, 'l2': 0.07807088494293479, 'min_samples_leaf': 21, 'learning_rate': 0.05048217323078135}.
[I 2019-05-18 17:18:55,918] Finished trial#169 resulted in value: 0.09941273867743562. Current best value is 0.023923312827178578 with parameters: {'max_leaf': 83, 'reg_depth': 83.82931138712091, 'l2': 0.07807088494293479, 'min_samples_leaf': 21, 'learning_rate': 0.05048217323078135}.
[I 2019-05-18 17:20:13,586] Finished trial#170 resulted in value: 0.10292420953700418. Current best value is 0.023923312827178578 with parameters: {'max_leaf': 83, 'reg_depth': 83.82931138712091, 'l2': 0.07807088494293479, 'min_samples_leaf': 21, 'learning_rate': 0.05048217323078135}.
[I 2019-05-18 17:20:40,867] Finished trial#171 resulted in value: 0.08561692057497325. Current best value is 0.023923312827178578 with paramet

[I 2019-05-18 17:49:31,452] Finished trial#196 resulted in value: 0.09269972617881145. Current best value is 0.023923312827178578 with parameters: {'max_leaf': 83, 'reg_depth': 83.82931138712091, 'l2': 0.07807088494293479, 'min_samples_leaf': 21, 'learning_rate': 0.05048217323078135}.
[I 2019-05-18 17:49:43,706] Finished trial#197 resulted in value: 0.05806455521861496. Current best value is 0.023923312827178578 with parameters: {'max_leaf': 83, 'reg_depth': 83.82931138712091, 'l2': 0.07807088494293479, 'min_samples_leaf': 21, 'learning_rate': 0.05048217323078135}.
[I 2019-05-18 17:50:37,898] Finished trial#198 resulted in value: 0.10747498053981268. Current best value is 0.023923312827178578 with parameters: {'max_leaf': 83, 'reg_depth': 83.82931138712091, 'l2': 0.07807088494293479, 'min_samples_leaf': 21, 'learning_rate': 0.05048217323078135}.
[I 2019-05-18 17:50:54,191] Finished trial#199 resulted in value: 0.07814495169666791. Current best value is 0.023923312827178578 with paramet

In [44]:
# Push every item collected in mytrial into the trial store, then read the
# whole table back for inspection.
for record in mytrial:
    db.insert(record)

df_trial = db.select()

In [49]:
# Ten lowest-val_mae trials among those whose train/validation gap is below 0.05.
small_gap = df_trial['mae_diff']<.05
summary_columns = ['datetime','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']
(
    df_trial[small_gap][summary_columns]
    .sort_values(by=['val_mae'])
    .head(10)
)

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
170,2019-05-18 17:20:13.549950,14,2.023472,0.00295,2.073119,0.013364,0.049647
80,2019-05-18 15:46:41.828134,14,2.023507,0.002961,2.073329,0.013351,0.049822
125,2019-05-18 16:45:09.986183,14,2.025042,0.002941,2.074763,0.013242,0.049721
129,2019-05-18 16:47:48.576534,14,2.026822,0.002941,2.075045,0.013109,0.048223
167,2019-05-18 17:17:06.429221,14,2.026301,0.002944,2.075081,0.01316,0.04878
178,2019-05-18 17:29:31.720605,14,2.026615,0.002934,2.075469,0.013162,0.048854
102,2019-05-18 16:11:40.156996,14,2.027142,0.002942,2.075606,0.01312,0.048464
64,2019-05-18 15:38:57.267591,14,2.029016,0.002932,2.07679,0.013013,0.047774
169,2019-05-18 17:18:55.883610,14,2.029002,0.002936,2.076869,0.013055,0.047867
85,2019-05-18 15:49:42.582485,14,2.02998,0.002928,2.077473,0.012952,0.047493


In [50]:
# Explicit commit, required because db was opened with auto_commit=False.
# NOTE(review): presumably this writes the inserted trials to the backing
# pickle — confirm against the dfdb module.
db.commit()

In [None]:
# Rebind mytrial to a new empty list before the second tuning run, so the
# items already inserted into `db` are not carried over.
mytrial = []

# tune hyperparameters
def objective(trial):
    """Optuna objective: evaluate one RGFRegressor configuration via EP.process.

    Same search space as the earlier objective in this notebook, but the
    kfold type is 'stratified' and the run is tagged with the remark
    'tune by stratified'. Summary statistics of the returned history frame
    are stored as user attributes on the trial.

    Args:
        trial: object providing suggest_int, suggest_uniform and
            set_user_attr (an optuna trial).

    Returns:
        abs(val_mae - train_mae) * val_mae, where val_mae and train_mae are
        the means of df_his.valid and df_his.train. The score is small only
        when both the validation error and the train/validation gap are small.
    """
    max_leaf = trial.suggest_int('max_leaf', 50, 5000)
    reg_depth = trial.suggest_uniform('reg_depth', 1.0, 100.0)
    l2 = trial.suggest_uniform('l2', 0.001, .1)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 100)
    learning_rate = trial.suggest_uniform('learning_rate', .05, .5)

    args = {
        'columns': main_columns,
        'kfold': {
            'n_splits': 3,
            'random_state': 1985,
            'shuffle': True,
            'type': 'stratified'
        },
        'scaler': {
            'cls': 'StandardScaler',
        },
        'algorithm': {
            'cls': 'RGFRegressor',
            'init': {
                "max_leaf": max_leaf,
                "reg_depth": reg_depth,
                "l2": l2,
                "min_samples_leaf": min_samples_leaf,
                "learning_rate": learning_rate,
            },
            'fit': {
            },
        }
    }

    # Only the history frame is used here; the feature importances and the
    # validation/test predictions returned alongside it are discarded.
    df_his, _, _, _ = EP.process(df_train, args, df_test = df_test, trial=mytrial, remark='tune by stratified')
    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)

    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    # Stored signed, unlike the returned score which uses the absolute gap.
    trial.set_user_attr('mae_diff', val_mae_mean-train_mae_mean)
    trial.set_user_attr('val_mae_var', val_mae_var)
    # Previously computed but dropped; recorded next to its validation twin.
    trial.set_user_attr('train_mae_var', train_mae_var)

    return np.abs(val_mae_mean - train_mae_mean)*val_mae_mean

# Second study, again with default arguments, for the 'stratified' objective.
study = optuna.create_study()
# 200 trials requested. The saved output for this cell ends in a
# KeyboardInterrupt after the trial#93 log line, so this run was cut short.
study.optimize(objective, n_trials=200)

[I 2019-05-18 23:29:15,870] Finished trial#0 resulted in value: 0.08267396054501926. Current best value is 0.08267396054501926 with parameters: {'max_leaf': 3424, 'reg_depth': 17.29510357020571, 'l2': 0.014605736255217133, 'min_samples_leaf': 93, 'learning_rate': 0.3374237229665843}.
[I 2019-05-18 23:31:47,389] Finished trial#1 resulted in value: 0.02523979388254469. Current best value is 0.02523979388254469 with parameters: {'max_leaf': 2062, 'reg_depth': 68.98297025848068, 'l2': 0.09728122040164115, 'min_samples_leaf': 34, 'learning_rate': 0.3656202113087001}.
[I 2019-05-18 23:32:37,147] Finished trial#2 resulted in value: 0.038385125785398384. Current best value is 0.02523979388254469 with parameters: {'max_leaf': 2062, 'reg_depth': 68.98297025848068, 'l2': 0.09728122040164115, 'min_samples_leaf': 34, 'learning_rate': 0.3656202113087001}.
[I 2019-05-18 23:34:09,476] Finished trial#3 resulted in value: 0.02610444628234666. Current best value is 0.02523979388254469 with parameters: {'

[I 2019-05-19 00:31:34,717] Finished trial#28 resulted in value: 0.06601944994646712. Current best value is 0.012022015396694792 with parameters: {'max_leaf': 67, 'reg_depth': 98.76822608778025, 'l2': 0.037311523297428664, 'min_samples_leaf': 64, 'learning_rate': 0.18457753781736452}.
[I 2019-05-19 00:35:56,435] Finished trial#29 resulted in value: 0.0363675795252305. Current best value is 0.012022015396694792 with parameters: {'max_leaf': 67, 'reg_depth': 98.76822608778025, 'l2': 0.037311523297428664, 'min_samples_leaf': 64, 'learning_rate': 0.18457753781736452}.
[I 2019-05-19 00:38:21,933] Finished trial#30 resulted in value: 0.03237996389012643. Current best value is 0.012022015396694792 with parameters: {'max_leaf': 67, 'reg_depth': 98.76822608778025, 'l2': 0.037311523297428664, 'min_samples_leaf': 64, 'learning_rate': 0.18457753781736452}.
[I 2019-05-19 00:40:01,854] Finished trial#31 resulted in value: 0.047221962674402505. Current best value is 0.012022015396694792 with paramete

[I 2019-05-19 01:45:32,054] Finished trial#56 resulted in value: 0.030048896472722728. Current best value is 0.010684431020685108 with parameters: {'max_leaf': 66, 'reg_depth': 77.42297370314765, 'l2': 0.06832121701921648, 'min_samples_leaf': 55, 'learning_rate': 0.22036996135168074}.
[I 2019-05-19 01:52:56,413] Finished trial#57 resulted in value: 0.03341976663513361. Current best value is 0.010684431020685108 with parameters: {'max_leaf': 66, 'reg_depth': 77.42297370314765, 'l2': 0.06832121701921648, 'min_samples_leaf': 55, 'learning_rate': 0.22036996135168074}.
[I 2019-05-19 01:53:15,359] Finished trial#58 resulted in value: 0.01659447122826272. Current best value is 0.010684431020685108 with parameters: {'max_leaf': 66, 'reg_depth': 77.42297370314765, 'l2': 0.06832121701921648, 'min_samples_leaf': 55, 'learning_rate': 0.22036996135168074}.
[I 2019-05-19 01:58:37,328] Finished trial#59 resulted in value: 0.032040007009437425. Current best value is 0.010684431020685108 with parameter

[I 2019-05-19 02:23:25,596] Finished trial#90 resulted in value: 0.017841100270884744. Current best value is 0.010684431020685108 with parameters: {'max_leaf': 66, 'reg_depth': 77.42297370314765, 'l2': 0.06832121701921648, 'min_samples_leaf': 55, 'learning_rate': 0.22036996135168074}.
[I 2019-05-19 02:25:20,865] Finished trial#91 resulted in value: 0.029849953549687957. Current best value is 0.010684431020685108 with parameters: {'max_leaf': 66, 'reg_depth': 77.42297370314765, 'l2': 0.06832121701921648, 'min_samples_leaf': 55, 'learning_rate': 0.22036996135168074}.
[I 2019-05-19 02:25:34,078] Finished trial#92 resulted in value: 0.015056202349856888. Current best value is 0.010684431020685108 with parameters: {'max_leaf': 66, 'reg_depth': 77.42297370314765, 'l2': 0.06832121701921648, 'min_samples_leaf': 55, 'learning_rate': 0.22036996135168074}.
[I 2019-05-19 02:29:20,529] Finished trial#93 resulted in value: 0.033517636280968544. Current best value is 0.010684431020685108 with paramet

KeyboardInterrupt: 