In [11]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler
import lightgbm as lgb
import catboost as cb

from tsfresh.examples import load_robot_execution_failures
from tsfresh import extract_features, select_features
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

import optuna

from common import EP
from dfdb import DFDB

import types
import copy

In [2]:
%%HTML
<!-- Presentation only: overrides the widths of the elements with ids
     notebook-container, menubar-container and maintoolbar-container so the
     page uses more of the browser window. No effect on the analysis. -->
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [3]:
# Load the pre-computed train and test feature tables from ../feats.
# NOTE: unpickling executes code embedded in the file, so these pickles must
# come from a trusted source.
train_pickle, test_pickle = '../feats/df_train.pkl', '../feats/df_test.pkl'
df_train = pd.read_pickle(train_pickle)
df_test = pd.read_pickle(test_pickle)

In [4]:
# Discretise the regression target: truncate `y` to an integer and cap it at
# 15, so every y >= 15 (and any NaN, which fails the `< 15` test) lands in
# class 15. Presumably consumed by the 'stratified' k-fold option — confirm in EP.
df_train['label'] = df_train['y'].apply(lambda x: int(x) if x < 15 else 15)

# Build the grouping key from the season code on a private copy.
# `.values` may hand back the frame's own buffer: editing it in place silently
# rewrote df_train['season'] as a side effect, and it raises under pandas
# Copy-on-Write, where that array is read-only. Copying first avoids both.
group = df_train['season'].values.copy()
group[group == 17] = 1  # fold season 17 into group 1
df_train['group'] = group

# 'season' is now represented by 'group'; drop it so it is not used as a feature.
df_train = df_train.drop(columns=['season'])

In [5]:
# Rank features with tsfresh's relevance filter.
# Despite the `test_` prefix, both objects are built from df_train: the feature
# matrix (bookkeeping columns removed) and the target `y`. Both are re-indexed
# by the 'index' column so X and y share the same index.
row_ids = df_train['index']
test_X = df_train.drop(columns=['y','index','group','label']).copy()
test_y = df_train['y'].copy()
test_X.index = row_ids
test_y.index = row_ids
tsfresh_columns = list(select_features(test_X, test_y).columns)

In [6]:
# Every column of df_train except the bookkeeping/target ones, i.e. the full
# candidate feature list. Index.drop raises KeyError if any of them is missing.
non_feature_columns = ['index','y','label','group']
original_columns = list(df_train.columns.drop(non_feature_columns))

In [7]:
# Hand-maintained feature subset used as the 'columns' entry of an experiment
# configuration. The list holds 52 names (not 50); the saved trial table reports
# nfeatures=52 for the 'try common_columns50' run.
# NOTE(review): the meaning of the 50/75/95 suffix is not visible in this
# notebook — presumably a selection threshold; confirm where the lists were derived.
common_columns50 = ["number_peaks{'n': 5}",
 'abs_q75_6',
 'q01_roll_std_100',
 '5000crest_factor_quantile75',
 'abs_q01_4',
 'q25_roll_std_100',
 'q05_roll_std_10',
 'median__roll_std',
 'abs_q05_6',
 '5000form_factor_quantile75',
 '5000smoothness_quantile05',
 "quantile{'q': 0.9}",
 'abs_q75_2',
 'q01_roll_std_1000',
 '5000quantile75mean_',
 'spkt_welch_density__coeff_3',
 "number_peaks{'n': 10}",
 "number_peaks{'n': 1}",
 '5000smoothness_mean_',
 'abs_q25_5',
 '5000std_quantile05',
 '5000smoothness_std_',
 '5000smoothness_median_',
 '5000median_variance_',
 'spkt_welch_density__coeff_4',
 '5000variance_quantile25',
 'abs_q95_2',
 'abs_q75_7',
 'q05_5',
 '5000smoothness_entropy_',
 '5000smoothness_quantile25',
 'q01_2',
 'q05_roll_std_100',
 '3th_peak_freq',
 "autocorrelation{'lag': 5}",
 "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
 'q01_roll_std_10',
 'abs_q01_3',
 '5000variance_median_',
 'q75_roll_std_10',
 '5000skewness_max_',
 'iqr_3',
 'abs_q01_5',
 'q75_roll_mean_10',
 '5000quantile99median_',
 'iqr_6',
 'Hilbert_mean_6',
 'q05_roll_std_1000',
 "number_peaks{'n': 3}",
 'spkt_welch_densitycoeff_2',  # NOTE(review): siblings are spelled 'spkt_welch_density__coeff_N' — verify this name exists in df_train
 '5000std_median_',
 '5000std_quantile25']

In [8]:
# Hand-maintained 15-feature subset; every name here also appears in
# common_columns50. Used as the 'columns' entry of the tuning configurations.
# NOTE(review): the meaning of the '75' suffix is not visible in this notebook
# — presumably a stricter selection threshold; confirm.
common_columns75 = ['abs_q75_6',
 'abs_q01_4',
 'q25_roll_std_100',
 'q05_roll_std_10',
 'median__roll_std',
 '5000smoothness_quantile05',
 'spkt_welch_density__coeff_3',
 "number_peaks{'n': 10}",
 'abs_q25_5',
 'abs_q75_7',
 'q05_5',
 'q05_roll_std_100',
 "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
 'iqr_6',
 'q05_roll_std_1000']

In [9]:
# Hand-maintained 8-feature subset; every name here also appears in
# common_columns75. Used as the 'columns' entry of the baseline configuration.
# NOTE(review): the meaning of the '95' suffix is not visible in this notebook
# — presumably the strictest selection threshold; confirm.
common_columns95 = ['abs_q75_6',
 'abs_q01_4',
 'q25_roll_std_100',
 "number_peaks{'n': 10}",
 'abs_q25_5',
 'q05_roll_std_100',
 'iqr_6',
 'q05_roll_std_1000']

In [12]:
# Open the trial store backed by '../trial2/svr.pkl'. It is opened with
# auto_commit=False and the notebook calls db.commit() explicitly in later
# cells — presumably inserts are only persisted at that point (confirm in dfdb.DFDB).
db = DFDB('../trial2/svr.pkl', auto_commit=False)

In [20]:
# Baseline experiment configuration handed to EP.process in the next cell.
#   algorithm: estimator class name plus its 'init' and 'fit' keyword dicts
#              — presumably resolved and instantiated by EP; confirm in common.EP.
#   columns:   feature subset to train on (the 8-entry common_columns95 list).
#   kfold:     3 splits, shuffle on, seed 1985, split type 'group'.
#   scaler:    feature scaler class name.
# NOTE(review): scikit-learn's SVR documents 'degree' as used only by the
# 'poly' kernel and 'coef0' only by 'poly'/'sigmoid', so with kernel='rbf'
# those two entries should have no effect.
param = {'algorithm': {'cls': 'SVR',
  'fit': {},
  'init': {'kernel': 'rbf',
   'degree': 2,
   'gamma': 'auto',
   'coef0': 0.0,
   'tol': 0.001,
   'C': 0.01,
   'epsilon': 0.01,
   'shrinking': True}},
 'columns': common_columns95,

 'kfold': {'n_splits': 3,
  'random_state': 1985,
  'shuffle': True,
  'type': 'group'},  # the other split type used in this notebook is 'stratified'
 'scaler': {'cls': 'StandardScaler'}}

In [21]:
# Single baseline run with the current `param` configuration.
# `mytrial` is handed to EP.process as a collector; its first element is the
# record of this run, which is then stored in the trial database.
mytrial = []
(df_his,
 df_feature_importances,
 df_valid_pred,
 df_test_pred) = EP.process(
    df_train,
    param,
    df_test=df_test,
    trial=mytrial,
    remark='try common_columns95',
)
db.insert(mytrial[0])

# Refresh the in-memory view of all stored trials.
df_trial = db.select()

In [22]:
# Show the headline metrics of every stored trial.
summary_columns = ['datetime','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff','remark']
df_trial[summary_columns]

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff,remark
0,2019-05-20 01:31:12.243686,52,2.032887,0.001813,2.13426,0.005904,0.101373,try common_columns50
1,2019-05-20 01:36:01.459672,15,2.047455,0.003158,2.092718,0.014978,0.045264,try common_columns75
2,2019-05-20 01:39:53.913244,8,2.053058,0.003195,2.095292,0.015183,0.042234,try common_columns95


In [23]:
# Explicit commit: `db` was opened with auto_commit=False, so the trials
# inserted above are presumably not persisted until this call (confirm in dfdb.DFDB).
db.commit()

In [None]:
# Collector handed to EP.process on every trial; a later cell iterates over it
# and inserts each record into the trial database.
mytrial = []


# Tune hyperparameters (group k-fold).
def objective(trial):
    """Optuna objective: cross-validate an RBF SVR for one sampled (tol, C) pair.

    Samples `tol` uniformly from [0.0001, 0.01] and `C` uniformly from
    [0.0001, 0.1], runs EP.process on df_train with the 15-feature
    common_columns75 subset and a 3-split 'group' k-fold, and stores the
    aggregated metrics on the trial as user attributes.

    Parameters
    ----------
    trial : optuna trial object
        Supplies the sampled hyperparameters and receives the user attributes.

    Returns
    -------
    float
        abs(val_mae - train_mae) * val_mae, minimised by the study.

    Notes
    -----
    Relies on the notebook globals df_train, df_test, common_columns75 and
    mytrial.
    NOTE(review): because the score is a product, it tends to zero whenever the
    train/validation gap vanishes, regardless of how large the validation MAE
    is. The log's best value (about 1.6e-4 at C of about 1.6e-4) is consistent
    with the search being drawn to heavily regularised models — confirm this is
    the intended trade-off.
    """
    tol = trial.suggest_uniform('tol', 0.0001, 0.01)
    C = trial.suggest_uniform('C', 0.0001, 0.1)

    args = {
        'algorithm': {
            'cls': 'SVR',
            'fit': {},
            'init': {
                'kernel': 'rbf',
                'degree': 2,
                'gamma': 'auto',
                'coef0': 0.0,
                'tol': tol,
                'C': C,
                'epsilon': 0.01,
                'shrinking': True,
            },
        },
        'columns': common_columns75,
        'kfold': {
            'n_splits': 3,
            'random_state': 1985,
            'shuffle': True,
            'type': 'group',  # the other split type used in this notebook is 'stratified'
        },
        'scaler': {'cls': 'StandardScaler'},
    }

    df_his, df_feature_importances, df_valid_pred, df_test_pred = EP.process(
        df_train, args, df_test=df_test, trial=mytrial, remark='tune 1')

    # df_his.valid / df_his.train: presumably one MAE per fold — confirm in EP.
    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)

    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', val_mae_mean - train_mae_mean)
    trial.set_user_attr('val_mae_var', val_mae_var)
    # Previously computed but silently dropped; record it like val_mae_var.
    trial.set_user_attr('train_mae_var', train_mae_var)

    return np.abs(val_mae_mean - train_mae_mean) * val_mae_mean


study = optuna.create_study()
study.optimize(objective, n_trials=200)

[I 2019-05-20 01:47:52,017] Finished a trial resulted in value: 0.2176318459171829. Current best value is 0.2176318459171829 with parameters: {'tol': 0.0022743584316052435, 'C': 0.07772127735463577}.
[I 2019-05-20 01:51:27,906] Finished a trial resulted in value: 0.23006625110572052. Current best value is 0.2176318459171829 with parameters: {'tol': 0.0022743584316052435, 'C': 0.07772127735463577}.
[I 2019-05-20 01:55:18,791] Finished a trial resulted in value: 0.21302681117529873. Current best value is 0.21302681117529873 with parameters: {'tol': 0.009936216362271008, 'C': 0.07109160433854167}.
[I 2019-05-20 01:58:30,613] Finished a trial resulted in value: 0.124556669153277. Current best value is 0.124556669153277 with parameters: {'tol': 0.00029965763316254653, 'C': 0.016552321684991808}.
[I 2019-05-20 02:01:42,979] Finished a trial resulted in value: 0.2220427959211385. Current best value is 0.124556669153277 with parameters: {'tol': 0.00029965763316254653, 'C': 0.016552321684991808

[I 2019-05-20 06:04:45,844] Finished a trial resulted in value: 0.17070909697717307. Current best value is 0.0001580228836165994 with parameters: {'tol': 0.006867385429730456, 'C': 0.00015719886906617308}.
[I 2019-05-20 06:07:58,299] Finished a trial resulted in value: 0.16238894193057757. Current best value is 0.0001580228836165994 with parameters: {'tol': 0.006867385429730456, 'C': 0.00015719886906617308}.


In [28]:
# Store every record collected in `mytrial` during the tuning run.
for record in mytrial:
    db.insert(record)

In [31]:
# Reload all stored trials, then list the ten 'tune 1' runs with the lowest
# validation MAE among those whose signed train/validation gap is below 0.05.
df_trial = db.select()
is_tune_1 = df_trial['remark']=='tune 1'
has_small_gap = df_trial['mae_diff']<.05
report_columns = ['datetime','remark', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']
df_trial[is_tune_1 & has_small_gap].sort_values(by=['val_mae'])[report_columns].head(10)

Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
137,2019-05-20 09:01:46.433971,tune 1,15,2.043196,0.003123,2.09175,0.015234,0.048555
170,2019-05-20 10:48:16.389100,tune 1,15,2.044071,0.003133,2.091971,0.015201,0.0479
81,2019-05-20 06:01:33.199607,tune 1,15,2.044651,0.003136,2.092031,0.015176,0.04738
118,2019-05-20 08:00:40.156557,tune 1,15,2.043713,0.003133,2.092043,0.015241,0.048331
31,2019-05-20 03:20:26.879165,tune 1,15,2.044751,0.003129,2.092091,0.015182,0.04734
39,2019-05-20 03:46:04.667985,tune 1,15,2.045696,0.003142,2.092262,0.015109,0.046565
104,2019-05-20 07:15:30.785332,tune 1,15,2.046396,0.003144,2.092399,0.015011,0.046004
157,2019-05-20 10:06:25.296764,tune 1,15,2.046736,0.003153,2.092413,0.015026,0.045678
184,2019-05-20 11:33:21.369361,tune 1,15,2.047225,0.003154,2.092593,0.01499,0.045368
145,2019-05-20 09:27:35.568088,tune 1,15,2.048043,0.003155,2.092812,0.014963,0.04477


In [None]:
# Re-fit stored trial 137 (the first row in the saved output of the preceding
# query cell) with the k-fold type switched from its stored value to
# 'stratified'. The stored param dict is deep-copied so the row held in
# df_trial is not modified.
mytrial = []
param = copy.deepcopy(df_trial.loc[137, 'param'])
param['kfold']['type'] = 'stratified'
(df_his,
 df_feature_importances,
 df_valid_pred,
 df_test_pred) = EP.process(
    df_train,
    param,
    df_test=df_test,
    trial=mytrial,
    remark='remodel 137 use stratified',
)

In [36]:
# Store the record(s) of the re-fit, then refresh the in-memory trial table.
for record in mytrial:
    db.insert(record)
df_trial = db.select()

In [36]:
# Show the metrics of the most recently stored trial (last row of the table).
latest_columns = ['datetime','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff','remark']
df_trial[latest_columns].tail(1)

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff,remark
203,2019-05-20 12:49:47.634780,15,2.054332,2e-06,2.055705,2.3e-05,0.001373,remodel 137 use stratified


In [37]:
# Explicit commit: `db` was opened with auto_commit=False, so the trials
# inserted since the previous commit are presumably not persisted until this
# call (confirm in dfdb.DFDB).
db.commit()

In [None]:
# Collector handed to EP.process on every trial; a later cell iterates over it
# and inserts each record into the trial database.
mytrial = []


# Tune hyperparameters (stratified k-fold).
def objective(trial):
    """Optuna objective: cross-validate an RBF SVR for one sampled (tol, C) pair.

    Samples `tol` uniformly from [0.0001, 0.01] and `C` uniformly from
    [0.0001, 0.1], runs EP.process on df_train with the 15-feature
    common_columns75 subset and a 3-split 'stratified' k-fold, and stores the
    aggregated metrics on the trial as user attributes.

    Parameters
    ----------
    trial : optuna trial object
        Supplies the sampled hyperparameters and receives the user attributes.

    Returns
    -------
    float
        abs(val_mae - train_mae) * val_mae, minimised by the study.

    Notes
    -----
    Relies on the notebook globals df_train, df_test, common_columns75 and
    mytrial.
    NOTE(review): because the score is a product, it tends to zero whenever the
    train/validation gap vanishes, regardless of how large the validation MAE
    is — confirm this is the intended trade-off.
    """
    tol = trial.suggest_uniform('tol', 0.0001, 0.01)
    C = trial.suggest_uniform('C', 0.0001, 0.1)

    args = {
        'algorithm': {
            'cls': 'SVR',
            'fit': {},
            'init': {
                'kernel': 'rbf',
                'degree': 2,
                'gamma': 'auto',
                'coef0': 0.0,
                'tol': tol,
                'C': C,
                'epsilon': 0.01,
                'shrinking': True,
            },
        },
        'columns': common_columns75,
        'kfold': {
            'n_splits': 3,
            'random_state': 1985,
            'shuffle': True,
            'type': 'stratified',  # the other split type used in this notebook is 'group'
        },
        'scaler': {'cls': 'StandardScaler'},
    }

    df_his, df_feature_importances, df_valid_pred, df_test_pred = EP.process(
        df_train, args, df_test=df_test, trial=mytrial,
        remark='tune 1 by stratified')

    # df_his.valid / df_his.train: presumably one MAE per fold — confirm in EP.
    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)

    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', val_mae_mean - train_mae_mean)
    trial.set_user_attr('val_mae_var', val_mae_var)
    # Previously computed but silently dropped; record it like val_mae_var.
    trial.set_user_attr('train_mae_var', train_mae_var)

    return np.abs(val_mae_mean - train_mae_mean) * val_mae_mean


study = optuna.create_study()
study.optimize(objective, n_trials=200)

In [41]:
# Store every record collected in `mytrial` during the stratified tuning run.
for record in mytrial:
    db.insert(record)

In [42]:
# Reload all stored trials, then list the ten 'tune 1 by stratified' runs with
# the lowest validation MAE among those whose signed train/validation gap is
# below 0.05.
# NOTE(review): the output saved beneath this cell shows rows remarked
# 'tune 0 by stratified' with 52 features, which this filter cannot produce —
# the saved output appears to come from an earlier version of the cell.
df_trial = db.select()
is_stratified_tune = df_trial['remark']=='tune 1 by stratified'
has_small_gap = df_trial['mae_diff']<.05
report_columns = ['datetime','remark', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']
df_trial[is_stratified_tune & has_small_gap].sort_values(by=['val_mae'])[report_columns].head(10)

Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
228,2019-05-20 02:46:36.962602,tune 0 by stratified,52,2.114446,3e-06,2.115535,3.6e-05,0.001089
329,2019-05-20 02:47:12.231780,tune 0 by stratified,52,2.118512,3e-06,2.11952,3.5e-05,0.001007
249,2019-05-20 02:46:45.458087,tune 0 by stratified,52,2.124563,3e-06,2.125458,3.5e-05,0.000894
291,2019-05-20 02:47:00.866893,tune 0 by stratified,52,2.126036,3e-06,2.126911,3.6e-05,0.000875
388,2019-05-20 02:47:32.508708,tune 0 by stratified,52,2.127294,2e-06,2.128154,3.6e-05,0.00086
207,2019-05-20 02:46:29.050081,tune 0 by stratified,52,2.131934,2e-06,2.132731,3.6e-05,0.000797
260,2019-05-20 02:46:50.338657,tune 0 by stratified,52,2.133011,2e-06,2.133807,3.6e-05,0.000796
347,2019-05-20 02:47:18.378053,tune 0 by stratified,52,2.134873,2e-06,2.135663,3.7e-05,0.000789
210,2019-05-20 02:46:30.102680,tune 0 by stratified,52,2.136211,2e-06,2.136992,3.7e-05,0.00078
283,2019-05-20 02:46:58.613954,tune 0 by stratified,52,2.140295,2e-06,2.141042,3.8e-05,0.000747


In [43]:
# Re-fit stored trial 228 with the k-fold type switched to 'group'. The stored
# param dict is deep-copied so the row held in df_trial is not modified.
# NOTE(review): in the saved output of the preceding query cell, row 228 is
# remarked 'tune 0 by stratified' and has 52 features, not 'tune 1 by
# stratified' — confirm this is the trial that was meant to be re-fit.
mytrial = []
param = copy.deepcopy(df_trial.loc[228, 'param'])
param['kfold']['type'] = 'group'
(df_his,
 df_feature_importances,
 df_valid_pred,
 df_test_pred) = EP.process(
    df_train,
    param,
    df_test=df_test,
    trial=mytrial,
    remark='remodel 228 use group',
)

In [44]:
# Store the record(s) of the re-fit, then refresh the in-memory trial table.
for record in mytrial:
    db.insert(record)
df_trial = db.select()

In [45]:
# Show the metrics of the most recently stored trial (last row of the table).
latest_columns = ['datetime','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']
df_trial[latest_columns].tail(1)

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
405,2019-05-20 02:51:40.118659,52,2.104126,0.002708,2.137954,0.011853,0.033828


In [41]:
# Explicit commit: `db` was opened with auto_commit=False, so the trials
# inserted since the previous commit are presumably not persisted until this
# call (confirm in dfdb.DFDB).
# NOTE(review): this cell's execution count (41) is lower than that of the
# cells just above it (43-45), so it was not last run in top-to-bottom order.
db.commit()

(204, 14)