In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler

import lightgbm as lgb
import catboost as cb

from tsfresh.examples import load_robot_execution_failures
from tsfresh import extract_features, select_features
import optuna

from common import EP
from dfdb import DFDB

import types
import copy

Using TensorFlow backend.


In [2]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [10]:
pd.set_option('display.max_rows', 2000)
pd.set_option('display.max_colwidth', -1)

In [11]:
def revert_rfe(df_train, param, sorted_columns, df_test, trial, start_columns, limit=None, remark=None):
    
    # init cv_score and try only base feature
    selected_columns = copy.deepcopy(start_columns)
    if type(limit) == type(None):
        limit = len(sorted_columns)
    args = copy.deepcopy(param)
    args['columns'] = selected_columns
    df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train, args, df_test = df_test, trial=trial, remark=remark)
    val_mae_mean = np.mean(df_his.valid)
    cv_score = val_mae_mean
    
    # add feature one by one and check cv score change
    for idx,col in enumerate(sorted_columns):
#         if idx in start_column_index:
#             continue
        args = copy.deepcopy(param)
        args['columns'] = list(set(selected_columns + [col]))
        df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train, args, df_test = df_test, trial=trial, remark=remark)
        val_mae_mean = np.mean(df_his.valid)
        if val_mae_mean < cv_score:
            selected_columns.append(col)
            cv_score = val_mae_mean
        if len(selected_columns) >= limit:
            break
        
    return selected_columns


In [3]:
df_train = pd.read_pickle('../feats/df_train.pkl')
df_test = pd.read_pickle('../feats/df_test.pkl')

In [4]:
df_train['label'] = df_train['y'].apply(lambda x:  int(x) if x<15 else 15)
group = df_train['season'].values
group[np.where(group==17)[0]] = 1
df_train['group'] = group
df_train = df_train.drop(columns=['season'])

In [5]:
test_X = df_train.drop(columns=['y','index','group','label']).copy()
test_X.index = df_train['index']
test_y = df_train['y'].copy()
test_y.index = df_train['index']
tsfresh_columns = select_features(test_X, test_y).columns.tolist()

In [6]:
original_columns = df_train.columns.drop(['index','y','label','group']).tolist()

In [7]:
db = DFDB('../trial/catboost.pkl', auto_commit=False)

In [8]:
df_trial = db.select()
df_trial['kfold'] = df_trial['param'].apply(lambda x: x['kfold'])

In [13]:
df_trial[['datetime','nfeatures', 'kfold', 'remark', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff','remark']].loc[213:236]

Unnamed: 0,datetime,nfeatures,kfold,remark,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff,remark.1
213,2019-05-09 12:18:40.305298,200,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",,1.862054,0.108043,2.021767,0.675172,0.159713,
214,2019-05-09 12:38:14.106394,195,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",,1.877067,0.102867,2.019344,0.668432,0.142277,
215,2019-05-09 12:58:07.295867,190,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",,1.838417,0.100596,2.020392,0.677335,0.181975,
216,2019-05-09 13:16:11.460278,185,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",,1.864663,0.111162,2.018575,0.673013,0.153912,
217,2019-05-09 13:33:48.805698,180,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",,1.879699,0.103621,2.019272,0.673049,0.139573,
218,2019-05-09 13:50:25.898565,175,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",,1.886582,0.097584,2.021365,0.675371,0.134782,
219,2019-05-09 14:05:58.643515,170,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",,1.893144,0.096304,2.021816,0.681556,0.128672,
220,2019-05-09 14:20:57.380109,165,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",,1.88219,0.099578,2.025069,0.680746,0.142879,
221,2019-05-09 14:35:26.714485,160,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",,1.869021,0.081725,2.025326,0.680568,0.156305,
222,2019-05-09 22:15:06.585802,160,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}",,1.894731,0.094456,2.02153,0.681675,0.126799,


In [None]:
param_idx = 236
column_idx = 213
db_ = db
df_trial_ = df_trial
mytrial = []
columns = copy.deepcopy(df_trial_.loc[column_idx]['param']['columns'])
param = copy.deepcopy(df_trial_.loc[param_idx]['param'])
param['algorithm']['init'] = {'logging_level': 'Silent', **param['algorithm']['init']}
param['scaler']['init'] = {}
param['algorithm']['fit'] = {}

selected_columns = revert_rfe(df_train, param, columns, df_test, mytrial, start_columns=[columns[0]], limit=20, remark='start from top1 column')
print(len(selected_columns))
for trial_i in mytrial:
    db_.insert(trial_i)
df_trial_ = db_.select()
df_trial_['kfold'] = df_trial_['param'].apply(lambda x: x['kfold'])

In [None]:
db_.commit()

In [3]:
mytrial =[]

#  tune hypterparameters
def objective(trial):
        
    num_trees = trial.suggest_int('num_trees', 200, 1000)
    depth = trial.suggest_int('depth', 2, 10)
    learning_rate = trial.suggest_uniform('learning_rate', 0.01, 0.4)
    l2_leaf_reg = trial.suggest_uniform('l2_leaf_reg', 0.0001, 1)
    bagging_temperature = trial.suggest_uniform('bagging_temperature', .6, 1)
    random_strength = trial.suggest_uniform('random_strength', .001, 1)
    random_state = trial.suggest_int('random_state', 1, 9999)
        
    args={
        'columns':selected_columns,
        'kfold':{
            'n_splits': 8,
            'random_state': 1985,
            'shuffle': True,
            'type': 'group'
        },
        'scaler':{
            'cls':'StandardScaler',
        },
        'algorithm':{
            'cls':'cb.CatBoostRegressor',
            'init':{
                "num_trees":num_trees,
                "depth":depth,
                "learning_rate":learning_rate,
                "l2_leaf_reg":l2_leaf_reg,
                "bagging_temperature":bagging_temperature,
                "random_strength":random_strength,
                "random_state":random_state,
                'logging_level': 'Silent'
            },
            'fit':{
            },
        }
    }
    
    df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train, args, df_test = df_test, trial=mytrial, remark='tune 9')
    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)
    
    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', val_mae_mean-train_mae_mean)
    trial.set_user_attr('val_mae_var', val_mae_var)

    return np.abs(val_mae_mean - train_mae_mean)*val_mae_mean

study = optuna.create_study()
study.optimize(objective, n_trials=200)

In [80]:
for trial_i in mytrial:
    db.insert(trial_i)
df_trial = db.select()

In [85]:
df_trial[(df_trial['remark']=='tune 9')&(df_trial['mae_diff']<.1)].sort_values(by=['val_mae'])[['datetime','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
175,2019-05-19 13:16:11.593624,34,2.001394,0.000772,2.098855,0.002856,0.097461
133,2019-05-19 12:48:16.282690,34,2.012604,0.000851,2.10332,0.004457,0.090716
184,2019-05-19 13:19:23.692973,34,2.025755,0.001135,2.115385,0.003147,0.08963
205,2019-05-19 13:27:37.605488,34,2.031798,0.00124,2.120268,0.003316,0.08847


In [86]:
db.commit()

In [4]:
mytrial =[]
EP.select_features_(df_train, df_trial.loc[225]['param'], mytrial, nfeats_best=10, nfeats_removed_per_try=1, key='average_model_weight', remark='group3 RFE3')
for trial_i in mytrial:
    db.insert(trial_i)

In [107]:
for trial_i in mytrial:
    db.insert(trial_i)
df_trial = db.select()

In [108]:
df_trial[(df_trial['remark']=='group3 RFE3')][['datetime','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
211,2019-05-19 14:01:13.362562,34,2.012592,0.000867,2.104591,0.004387,0.091999
212,2019-05-19 14:02:10.587266,33,2.011891,0.00096,2.103509,0.004173,0.091618
213,2019-05-19 14:03:04.502816,32,2.011686,0.000879,2.105717,0.004033,0.094031
214,2019-05-19 14:03:56.238684,31,2.011478,0.000974,2.103843,0.004536,0.092365
215,2019-05-19 14:04:46.032615,30,2.011883,0.000935,2.10356,0.004511,0.091676
216,2019-05-19 14:05:35.946079,29,2.011029,0.000916,2.102193,0.004018,0.091165
217,2019-05-19 14:06:22.262427,28,2.012179,0.000963,2.103864,0.003936,0.091685
218,2019-05-19 14:07:06.267310,27,2.010659,0.000878,2.104995,0.00426,0.094336
219,2019-05-19 14:07:48.441406,26,2.010527,0.000957,2.104252,0.00429,0.093725
220,2019-05-19 14:08:29.891292,25,2.011318,0.000937,2.103217,0.004011,0.091899


In [110]:
cb_columns = ['iqr_6',
 'abs_q01_4',
 'q25_roll_std_100',
 'median__roll_std',
 'q05_roll_std_100',
 'q05_roll_std_10',
 'abs_q75_6',
 'abs_q25_5',
 'q05_roll_std_1000',
 'abs_q01_5',
 "number_peaks{'n': 10}",
 '5000std_quantile05',
 "number_peaks{'n': 3}",
 'abs_q75_7',
 'spkt_welch_density__coeff_3',
 'q75_roll_mean_10',
 '5000quantile75mean_',
 'Hilbert_mean_6']

In [5]:
mytrial =[]

#  tune hypterparameters
def objective(trial):
        
    num_trees = trial.suggest_int('num_trees', 200, 1000)
    depth = trial.suggest_int('depth', 2, 10)
    learning_rate = trial.suggest_uniform('learning_rate', 0.01, 0.4)
    l2_leaf_reg = trial.suggest_uniform('l2_leaf_reg', 0.0001, 1)
    bagging_temperature = trial.suggest_uniform('bagging_temperature', .6, 1)
    random_strength = trial.suggest_uniform('random_strength', .001, 1)
    random_state = trial.suggest_int('random_state', 1, 9999)
        
    args={
        'columns':df_trial.loc[239]['param']['columns'],
        'kfold':{
            'n_splits': 3,
            'random_state': 1985,
            'shuffle': True,
            'type': 'group'
        },
        'scaler':{
            'cls':'StandardScaler',
        },
        'algorithm':{
            'cls':'cb.CatBoostRegressor',
            'init':{
                "num_trees":num_trees,
                "depth":depth,
                "learning_rate":learning_rate,
                "l2_leaf_reg":l2_leaf_reg,
                "bagging_temperature":bagging_temperature,
                "random_strength":random_strength,
                "random_state":random_state,
            },
            'fit':{
            },
        }
    }
    
    df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train, args, df_test = df_test, trial=mytrial, remark='tune 239')
    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)
    
    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', val_mae_mean-train_mae_mean)
    trial.set_user_attr('val_mae_var', val_mae_var)

    return np.abs(val_mae_mean - train_mae_mean)*val_mae_mean

study = optuna.create_study()
study.optimize(objective, n_trials=200)

In [113]:
len(mytrial)

200

In [115]:
for trial_i in mytrial:
    db.insert(trial_i)
df_trial = db.select()

In [120]:
df_trial[(df_trial['remark']=='tune 239')&(df_trial['mae_diff']<.09)].sort_values(by=['val_mae'])[['datetime','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
399,2019-05-19 16:16:28.165523,18,2.011196,0.000486,2.094451,0.002037,0.083255
418,2019-05-19 16:24:59.264481,18,2.010556,0.000663,2.097445,0.004091,0.086889
442,2019-05-19 16:36:55.296284,18,2.013315,0.000613,2.10096,0.00196,0.087645
352,2019-05-19 15:51:04.411734,18,2.028762,0.000384,2.103298,0.002808,0.074536
396,2019-05-19 16:15:31.916870,18,2.03092,0.000522,2.107848,0.002218,0.076928
405,2019-05-19 16:18:24.352245,18,2.025673,0.000641,2.11132,0.002253,0.085647


In [121]:
db.commit()

In [6]:
param = copy.deepcopy(df_trial.loc[399]['param'])
param['kfold']['type'] = 'stratified'
# run one try
mytrial=[]
df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train, param, df_test = df_test, trial=mytrial, remark='remodel 399 use stratified')

In [125]:
for trial_i in mytrial:
    db.insert(trial_i)
df_trial = db.select()

In [126]:
df_trial.tail(1)[['datetime','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
448,2019-05-19 21:26:25.175250,18,2.030253,1e-06,2.035186,4e-06,0.004932


In [39]:
db.commit()

In [None]:
mytrial =[]

#  tune hypterparameters
def objective(trial):
        
    num_trees = trial.suggest_int('num_trees', 200, 1000)
    depth = trial.suggest_int('depth', 2, 10)
    learning_rate = trial.suggest_uniform('learning_rate', 0.01, 0.4)
    l2_leaf_reg = trial.suggest_uniform('l2_leaf_reg', 0.0001, 1)
    bagging_temperature = trial.suggest_uniform('bagging_temperature', .6, 1)
    random_strength = trial.suggest_uniform('random_strength', .001, 1)
    random_state = trial.suggest_int('random_state', 1, 9999)
        
    args={
        'columns':df_trial.loc[239]['param']['columns'],
        'kfold':{
            'n_splits': 3,
            'random_state': 1985,
            'shuffle': True,
            'type': 'stratified'
        },
        'scaler':{
            'cls':'StandardScaler',
        },
        'algorithm':{
            'cls':'cb.CatBoostRegressor',
            'init':{
                "num_trees":num_trees,
                "depth":depth,
                "learning_rate":learning_rate,
                "l2_leaf_reg":l2_leaf_reg,
                "bagging_temperature":bagging_temperature,
                "random_strength":random_strength,
                "random_state":random_state,
            },
            'fit':{
            },
        }
    }
    
    df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train, args, df_test = df_test, trial=mytrial, remark='tune 239 by stratified')
    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)
    
    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', val_mae_mean-train_mae_mean)
    trial.set_user_attr('val_mae_var', val_mae_var)

    return np.abs(val_mae_mean - train_mae_mean)*val_mae_mean

study = optuna.create_study()
study.optimize(objective, n_trials=200)

In [127]:
for trial_i in mytrial:
    db.insert(trial_i)

In [128]:
df_trial = db.select()
df_trial[(df_trial['remark']=='tune 239 by stratified')&(df_trial['mae_diff']<.05)].sort_values(by=['val_mae'])[['datetime','remark', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].head(10)

Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
625,2019-05-19 18:43:45.614456,tune 239 by stratified,18,1.870594,3.552597e-06,1.917066,8.9e-05,0.046472
501,2019-05-19 17:23:10.885796,tune 239 by stratified,18,1.869067,8.096949e-06,1.917206,0.000139,0.048138
552,2019-05-19 17:59:33.597379,tune 239 by stratified,18,1.869971,3.90674e-05,1.919193,0.000145,0.049222
621,2019-05-19 18:42:00.925931,tune 239 by stratified,18,1.871447,2.573961e-07,1.919232,0.000104,0.047786
531,2019-05-19 17:43:50.287298,tune 239 by stratified,18,1.873281,1.539356e-05,1.919493,0.000131,0.046213
583,2019-05-19 18:18:25.954161,tune 239 by stratified,18,1.874073,1.223199e-06,1.920116,0.000119,0.046043
642,2019-05-19 18:51:48.166331,tune 239 by stratified,18,1.88336,4.913177e-06,1.923299,0.000138,0.039939
478,2019-05-19 17:03:02.267204,tune 239 by stratified,18,1.882801,1.295311e-06,1.925255,9.2e-05,0.042454
479,2019-05-19 17:04:29.586846,tune 239 by stratified,18,1.883649,7.780033e-06,1.927169,0.000145,0.04352
502,2019-05-19 17:23:48.136932,tune 239 by stratified,18,1.892556,3.577277e-05,1.927895,8e-05,0.035339


In [35]:
db.commit()

In [7]:
param = copy.deepcopy(df_trial.loc[625]['param'])
param['kfold']['type'] = 'group'
# run one try
mytrial=[]
df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train, param, df_test = df_test, trial=mytrial, remark='remodel 625 use group')

In [130]:
for trial_i in mytrial:
    db.insert(trial_i)
df_trial = db.select()

In [131]:
df_trial.tail(1)[['datetime','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
649,2019-05-19 21:29:50.830576,18,1.812501,0.000577,2.080786,0.00455,0.268285


In [132]:
db.commit()