In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler
import lightgbm as lgb
import catboost as cb

from tsfresh.examples import load_robot_execution_failures
from tsfresh import extract_features, select_features
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

import optuna

from common import EP
from dfdb import DFDB

import types
import copy


numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.

Using TensorFlow backend.


In [2]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [3]:
# Load the train and test tables from the project's `feats` directory.
# NOTE(review): unpickling can execute arbitrary code, so only load files
# that were produced by this project's own pipeline.
df_train = pd.read_pickle('../feats/df_train.pkl')
df_test = pd.read_pickle('../feats/df_test.pkl')

In [4]:
# Integer class label derived from the continuous target `y`: values below 15
# are truncated to an int, everything else is collapsed into class 15.
df_train['label'] = df_train['y'].apply(lambda x: int(x) if x < 15 else 15)

# Grouping key: the season id with season 17 folded into group 1
# (presumably consumed by the 'group' k-fold used later -- confirm in EP).
# Work on an explicit copy: `.values` may be a view of the frame (which would
# silently rewrite the 'season' column) or a read-only array under pandas
# Copy-on-Write (which would make the in-place assignment raise).
group = df_train['season'].values.copy()
group[group == 17] = 1
df_train['group'] = group

# 'season' is fully represented by 'group' from here on.
# NOTE: this makes the cell non-idempotent -- re-running it without reloading
# df_train fails because 'season' no longer exists.
df_train = df_train.drop(columns=['season'])

In [5]:
# Inputs for tsfresh's relevance filter. Both objects are labelled by the
# 'index' column so that features and target share identical row labels.
# NOTE: despite the `test_` prefix, both are built from the *training* frame.
test_X = df_train.set_index('index').drop(columns=['y', 'group', 'label'])
test_y = df_train.set_index('index')['y'].copy()

# Names of the columns kept by select_features.
tsfresh_columns = select_features(test_X, test_y).columns.tolist()

In [6]:
# Every column of df_train except the bookkeeping / target columns.
original_columns = df_train.drop(columns=['index', 'y', 'label', 'group']).columns.tolist()

In [7]:
# Hard-coded feature-name lists, one per model family. They are pooled in the
# next cell to rank features by how many lists contain them.
# NOTE(review): how these lists were obtained is not shown in this notebook --
# presumably they are the features selected for each model in earlier
# experiments; record the provenance.
# NOTE(review): some names lack the '__' separator that others have (e.g.
# 'spkt_welch_densitycoeff_2' vs 'spkt_welch_density__coeff_3'); verify that
# every name matches an actual df_train column.

# Features associated with the CatBoost model (19 names).
catboost_columns = ['spkt_welch_density__coeff_3',
 'spkt_welch_densitycoeff_2',
 'abs_q25_5',
 'abs_q75_6',
 'q05_roll_std_1000',
 'abs_q75_7',
 'abs_q95_2',
 'q05_5',
 'abs_q75_2',
 '5000skewness_max_',
 'fft_coefficientcoeff_80__attr_"imag"',
 "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
 '5000kurtosis_mean_',
 "number_peaks{'n': 1}",
 '5000smoothness_entropy_',
 'ave10_7',
 'q75_roll_std_1000',
 'FFT_Mag_25q0',
 'fft_coefficientcoeff_20__attr_"abs"']
# Features associated with the LightGBM model (14 names).
lgbm_columns = ['q25_roll_std_100',
 'abs_q25_5',
 'spkt_welch_density__coeff_3',
 'abs_q75_6',
 'abs_q75_7',
 'spkt_welch_densitycoeff_2',
 'median__roll_std',
 'abs_q01_5',
 '5000smoothness_quantile05',
 '5000smoothness_std_',
 'abs_q95_3',
 'FFT_Mag_75q0',
 '5000median_std_',
 'spkt_welch_density__coeff_17']
# Features associated with the XGBoost model (12 names).
xgbm_columns = ['q25_roll_std_100',
 'abs_q25_5',
 'spkt_welch_density__coeff_3',
 'abs_q75_7',
 'spkt_welch_densitycoeff_2',
 "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
 'agg_autocorrelationf_agg_"mean"__maxlag_40',
 "number_peaks{'n': 1}",
 '5000peak_peak_amp_max_',
 'abs_q95_3',
 'spkt_welch_density__coeff_89',
 'abs_q05_2']
# Features associated with the random-forest model (3 names).
# NOTE(review): the variable name breaks the `<model>_columns` convention used
# by its siblings; it is kept because the next cell references it by this name.
randomforest_randomforest = ['abs_q25_5', 'abs_q01_4', 'q25_roll_std_100']
# Features associated with the extra-trees model (20 names).
extratrees_columns = ['q05_2',
 "number_peaks{'n': 1}",
 'abs_q01_6',
 'abs_q95_2',
 '5000smoothness_quantile25',
 '5000std_median_',
 '5000smoothness_median_',
 "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
 'abs_q75_6',
 "number_peaks{'n': 3}",
 'q75_roll_std_10',
 "number_peaks{'n': 10}",
 '5000min_quantile75',
 '5000smoothness_quantile05',
 "number_peaks{'n': 5}",
 'abs_q01_2',
 '5000smoothness_mean_',
 'min_roll_std_100',
 'abs_q05_2',
 'q01_roll_std_1000']
# Features associated with the gradient-boosting model (20 names).
gradientboosting_columns = ['q05_5',
 'kurt_1',
 'abs_q75_6',
 'abs_q75_7',
 'spkt_welch_density__coeff_28',
 'spkt_welch_density__coeff_99',
 'fft_coefficientcoeff_6__attr_"abs"',
 '5000smoothness_quantile05',
 'q25_roll_std_100',
 'spkt_welch_densitycoeff_2',
 'abs_max_1',
 "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
 'abs_q25_5',
 'abs_q01_7',
 'q05_8',
 'Hilbert_mean_6',
 'abs_q95_2',
 '5000skewness_max_',
 '5000kurtosis_mean_',
 'spkt_welch_density__coeff_3']

In [8]:
# Pool the six per-model feature lists and rank every feature by the number of
# lists that contain it.
all_columns = (catboost_columns + lgbm_columns + xgbm_columns
               + randomforest_randomforest + extratrees_columns
               + gradientboosting_columns)
# Number of pooled lists: a feature counted N_columns times appears in every
# list (given that no single list repeats a name).
N_columns = 6

# Count occurrences in one pass and keep the features in first-seen order.
# Building `unique_columns` from a set would give an order that changes from
# one kernel to the next (string hashes are randomised per process), which in
# turn would change the order of every list handed to feature selection.
column_counts = {}
unique_columns = []
for col in all_columns:
    if col not in column_counts:
        column_counts[col] = 0
        unique_columns.append(col)
    column_counts[col] += 1
count_values = [column_counts[col] for col in unique_columns]

# The thresholds do not depend on the feature being tested, so compute them once.
count_q50, count_q75, count_q95 = np.quantile(count_values, [.5, .75, .95])

# Features present in all lists, and features whose count reaches the
# 50th / 75th / 95th percentile of the per-feature counts.
common_columns = [col for col in unique_columns if column_counts[col] == N_columns]
common_columns50 = [col for col in unique_columns if column_counts[col] >= count_q50]
common_columns75 = [col for col in unique_columns if column_counts[col] >= count_q75]
common_columns95 = [col for col in unique_columns if column_counts[col] >= count_q95]

print('unique_columns ',len(unique_columns))
print('common_columns50 ',len(common_columns50))
print('common_columns75 ',len(common_columns75))
print('common_columns95 ',len(common_columns95))
print('common_columns ',len(common_columns))

unique_columns  55
common_columns50  55
common_columns75  15
common_columns95  7
common_columns  0


In [9]:
# Open the trial store used for the SVR experiments. auto_commit=False is
# passed here; later cells call db.commit() explicitly after inserting trials.
db = DFDB('../trial/svr.pkl', auto_commit=False)

In [10]:
# Fetch the trials currently held by the store (one row per trial, as shown
# by the table printed further down).
df_trial = db.select()

In [13]:
# Collector handed to EP.revert_rfe; every element it holds afterwards is
# inserted into the trial store below.
mytrial = []
# Candidate pool passed to EP.revert_rfe.
columns = common_columns50
# Experiment configuration consumed by EP.revert_rfe.
# NOTE(review): if 'SVR' resolves to sklearn.svm.SVR, 'degree' and 'coef0'
# have no effect with the 'rbf' kernel -- confirm and drop if so.
param = {'algorithm': {'cls': 'SVR',
  'fit': {},
  'init': {'kernel': 'rbf',
   'degree': 2,
   'gamma': 'auto',
   'coef0': 0.0,
   'tol': 0.001,
   'C': 0.01,
   'epsilon': 0.01,
   'shrinking': True}},
 # NOTE(review): the same list object is also passed as start_columns below;
 # if revert_rfe mutates either in place, common_columns95 changes too -- confirm.
 'columns': common_columns95,

 'kfold': {'n_splits': 8,
  'random_state': 1985,
  'shuffle': True,
  'type': 'group'},  # the last tuning cell of this notebook uses 'stratified' instead
 'scaler': {'cls': 'StandardScaler', 'init':{}}}

# Run the feature search. Presumably it starts from start_columns and tries to
# add features from `columns` (the logged nfeatures grows from 7 upwards) --
# TODO confirm against EP.revert_rfe.
selected_columns = EP.revert_rfe(df_train, param, columns, df_test, mytrial, start_columns=common_columns95, remark='start from top1 column 2th')
print(len(selected_columns))
# Stage every collected trial in the store; nothing is committed in this cell.
for trial_i in mytrial:
    db.insert(trial_i)
# Re-read the store so df_trial includes the trials inserted above.
df_trial = db.select()
# Expose the k-fold settings of each trial as their own column.
df_trial['kfold'] = df_trial['param'].apply(lambda x: x['kfold'])

23


In [14]:
# Display the feature list returned by EP.revert_rfe.
selected_columns

['abs_q75_7',
 'abs_q75_6',
 'q25_roll_std_100',
 "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
 'spkt_welch_densitycoeff_2',
 'spkt_welch_density__coeff_3',
 'abs_q25_5',
 "number_peaks{'n': 1}",
 'abs_max_1',
 'q75_roll_std_1000',
 'abs_q01_4',
 'spkt_welch_density__coeff_28',
 'fft_coefficientcoeff_6__attr_"abs"',
 "number_peaks{'n': 10}",
 'q05_roll_std_1000',
 'FFT_Mag_75q0',
 'Hilbert_mean_6',
 'FFT_Mag_25q0',
 'spkt_welch_density__coeff_17',
 'abs_q95_3',
 '5000std_median_',
 '5000kurtosis_mean_',
 'ave10_7']

In [16]:
# Metrics of the trials logged under the feature-search remark.
df_trial.loc[
    df_trial['remark'] == 'start from top1 column 2th',
    ['datetime', 'nfeatures', 'train_mae', 'train_mae_var', 'val_mae', 'val_mae_var', 'mae_diff', 'remark'],
]

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff,remark
37,2019-05-25 09:03:10.706005,7,2.0486,0.013863,2.107101,0.747082,0.058501,start from top1 column 2th
38,2019-05-25 09:09:11.679913,8,2.050101,0.013929,2.106993,0.748712,0.056892,start from top1 column 2th
39,2019-05-25 09:15:30.301626,9,2.051601,0.013977,2.106727,0.750697,0.055126,start from top1 column 2th
40,2019-05-25 09:21:54.226557,10,2.054471,0.014024,2.108176,0.752683,0.053705,start from top1 column 2th
41,2019-05-25 09:28:18.957383,10,2.053538,0.013955,2.110898,0.74856,0.05736,start from top1 column 2th
42,2019-05-25 09:34:48.218966,10,2.054115,0.013981,2.110393,0.75212,0.056278,start from top1 column 2th
43,2019-05-25 09:41:22.905512,10,2.052796,0.013991,2.106546,0.752843,0.05375,start from top1 column 2th
44,2019-05-25 09:48:18.546358,11,2.049634,0.013927,2.106217,0.749555,0.056583,start from top1 column 2th
45,2019-05-25 09:55:21.813597,12,2.050437,0.013949,2.105917,0.751867,0.05548,start from top1 column 2th
46,2019-05-25 10:02:32.321257,12,2.050437,0.013949,2.105917,0.751867,0.05548,start from top1 column 2th


In [17]:
# Commit the inserted trials explicitly (the store was opened with
# auto_commit=False; presumably nothing is persisted before this call).
db.commit()

In [19]:
# Collector handed to EP.process on every trial; the next cell inserts its
# entries into the trial store.
mytrial = []


# Tune hyperparameters.
def objective(trial):
    """Optuna objective: evaluate an RBF SVR for one sampled (tol, C) pair.

    The model is evaluated by EP.process on `selected_columns` with a 3-fold
    'group' split and standard scaling.

    Args:
        trial: the Optuna trial used to sample `tol` and `C` and to store the
            summary metrics as user attributes.

    Returns:
        abs(val_mean - train_mean) * val_mean, where the means are taken over
        `df_his.valid` and `df_his.train` (presumably per-fold MAE values --
        confirm in EP.process). The study minimises this value.
    """
    tol = trial.suggest_uniform('tol', 0.0001, 0.01)
    C = trial.suggest_uniform('C', 0.0001, 0.1)

    args = {'algorithm': {'cls': 'SVR',
                          'fit': {},
                          'init': {'kernel': 'rbf',
                                   'degree': 2,
                                   'gamma': 'auto',
                                   'coef0': 0.0,
                                   'tol': tol,
                                   'C': C,
                                   'epsilon': 0.01,
                                   'shrinking': True}},
            'columns': selected_columns,
            'kfold': {'n_splits': 3,
                      'random_state': 1985,
                      'shuffle': True,
                      'type': 'group'},
            'scaler': {'cls': 'StandardScaler', 'init': {}}}

    # Only the per-fold history is needed here; the other three results
    # (feature importances, validation and test predictions) are discarded.
    df_his, _, _, _ = EP.process(df_train, args, df_test=df_test, trial=mytrial, remark='tune 87')

    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)
    mae_diff = val_mae_mean - train_mae_mean

    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', mae_diff)
    trial.set_user_attr('val_mae_var', val_mae_var)
    # Previously computed but never stored; record it next to val_mae_var.
    trial.set_user_attr('train_mae_var', train_mae_var)

    # NOTE(review): the score shrinks whenever the train/validation gap closes,
    # whether or not the fit itself is good. In the recorded run the best C
    # values sit close to the lower bound 0.0001 -- check that this is intended.
    return np.abs(mae_diff) * val_mae_mean


# Seed the sampler so that re-running the cell proposes the same sequence of
# (tol, C) pairs; the seed reuses the k-fold random_state.
study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=1985))
study.optimize(objective, n_trials=200)

[32m[I 2019-05-25 17:54:06,072][0m Finished trial#0 resulted in value: 0.17508020184742037. Current best value is 0.17508020184742037 with parameters: {'tol': 0.008749191395273746, 'C': 0.07448845572282828}.[0m
[32m[I 2019-05-25 17:57:25,568][0m Finished trial#1 resulted in value: 0.1199310578741198. Current best value is 0.1199310578741198 with parameters: {'tol': 0.008152892383936775, 'C': 0.02823464453730016}.[0m
[32m[I 2019-05-25 18:00:37,142][0m Finished trial#2 resulted in value: 0.17082239300443106. Current best value is 0.1199310578741198 with parameters: {'tol': 0.008152892383936775, 'C': 0.02823464453730016}.[0m
[32m[I 2019-05-25 18:04:14,675][0m Finished trial#3 resulted in value: 0.010105284118146055. Current best value is 0.010105284118146055 with parameters: {'tol': 0.0066856168838831396, 'C': 0.001554955406969218}.[0m
[32m[I 2019-05-25 18:07:35,719][0m Finished trial#4 resulted in value: 0.10127187288219874. Current best value is 0.010105284118146055 with p

[32m[I 2019-05-25 19:53:12,207][0m Finished trial#38 resulted in value: 0.14676255851989095. Current best value is 0.0029565943838838903 with parameters: {'tol': 0.005216574995275543, 'C': 0.00041665420155923005}.[0m
[32m[I 2019-05-25 19:56:09,047][0m Finished trial#39 resulted in value: 0.13458169423365113. Current best value is 0.0029565943838838903 with parameters: {'tol': 0.005216574995275543, 'C': 0.00041665420155923005}.[0m
[32m[I 2019-05-25 19:59:09,634][0m Finished trial#40 resulted in value: 0.0028440428390800764. Current best value is 0.0028440428390800764 with parameters: {'tol': 0.006853007470164037, 'C': 0.00039821147505778357}.[0m
[32m[I 2019-05-25 20:02:04,816][0m Finished trial#41 resulted in value: 0.09536918427978823. Current best value is 0.0028440428390800764 with parameters: {'tol': 0.006853007470164037, 'C': 0.00039821147505778357}.[0m
[32m[I 2019-05-25 20:05:00,713][0m Finished trial#42 resulted in value: 0.15945519712503475. Current best value is 0

[32m[I 2019-05-25 21:44:25,383][0m Finished trial#76 resulted in value: 0.04394810786099414. Current best value is 0.00020763037288037967 with parameters: {'tol': 0.008569832461238543, 'C': 0.00020376193209543426}.[0m
[32m[I 2019-05-25 21:47:13,334][0m Finished trial#77 resulted in value: 0.13045355494101255. Current best value is 0.00020763037288037967 with parameters: {'tol': 0.008569832461238543, 'C': 0.00020376193209543426}.[0m
[32m[I 2019-05-25 21:50:01,059][0m Finished trial#78 resulted in value: 0.15561734301804872. Current best value is 0.00020763037288037967 with parameters: {'tol': 0.008569832461238543, 'C': 0.00020376193209543426}.[0m
[32m[I 2019-05-25 21:52:48,628][0m Finished trial#79 resulted in value: 0.15998944139478333. Current best value is 0.00020763037288037967 with parameters: {'tol': 0.008569832461238543, 'C': 0.00020376193209543426}.[0m
[32m[I 2019-05-25 21:55:41,812][0m Finished trial#80 resulted in value: 0.17314644608526158. Current best value is

[32m[I 2019-05-25 23:26:23,338][0m Finished trial#114 resulted in value: 0.10854152971416349. Current best value is 0.00020763037288037967 with parameters: {'tol': 0.008569832461238543, 'C': 0.00020376193209543426}.[0m
[32m[I 2019-05-25 23:29:01,100][0m Finished trial#115 resulted in value: 0.16788313846775382. Current best value is 0.00020763037288037967 with parameters: {'tol': 0.008569832461238543, 'C': 0.00020376193209543426}.[0m
[32m[I 2019-05-25 23:31:37,093][0m Finished trial#116 resulted in value: 0.07198741266016298. Current best value is 0.00020763037288037967 with parameters: {'tol': 0.008569832461238543, 'C': 0.00020376193209543426}.[0m
[32m[I 2019-05-25 23:34:14,018][0m Finished trial#117 resulted in value: 0.12334603564024643. Current best value is 0.00020763037288037967 with parameters: {'tol': 0.008569832461238543, 'C': 0.00020376193209543426}.[0m
[32m[I 2019-05-25 23:36:50,049][0m Finished trial#118 resulted in value: 0.15519716997145835. Current best val

[32m[I 2019-05-26 01:05:01,226][0m Finished trial#151 resulted in value: 0.08333705078867902. Current best value is 0.00020763037288037967 with parameters: {'tol': 0.008569832461238543, 'C': 0.00020376193209543426}.[0m
[32m[I 2019-05-26 01:07:40,528][0m Finished trial#152 resulted in value: 0.034149793793028564. Current best value is 0.00020763037288037967 with parameters: {'tol': 0.008569832461238543, 'C': 0.00020376193209543426}.[0m
[32m[I 2019-05-26 01:10:19,920][0m Finished trial#153 resulted in value: 0.061731953719264335. Current best value is 0.00020763037288037967 with parameters: {'tol': 0.008569832461238543, 'C': 0.00020376193209543426}.[0m
[32m[I 2019-05-26 01:13:03,357][0m Finished trial#154 resulted in value: 0.0024575089066338793. Current best value is 0.00020763037288037967 with parameters: {'tol': 0.008569832461238543, 'C': 0.00020376193209543426}.[0m
[32m[I 2019-05-26 01:15:42,262][0m Finished trial#155 resulted in value: 0.06980870127800023. Current best

[32m[I 2019-05-26 02:43:36,988][0m Finished trial#188 resulted in value: 0.08875113999050374. Current best value is 0.0001018889090753803 with parameters: {'tol': 0.009472177302741386, 'C': 0.00019931903638828116}.[0m
[32m[I 2019-05-26 02:46:15,231][0m Finished trial#189 resulted in value: 0.037222142610248. Current best value is 0.0001018889090753803 with parameters: {'tol': 0.009472177302741386, 'C': 0.00019931903638828116}.[0m
[32m[I 2019-05-26 02:48:53,719][0m Finished trial#190 resulted in value: 0.11354186075357336. Current best value is 0.0001018889090753803 with parameters: {'tol': 0.009472177302741386, 'C': 0.00019931903638828116}.[0m
[32m[I 2019-05-26 02:51:31,953][0m Finished trial#191 resulted in value: 0.08016187579386365. Current best value is 0.0001018889090753803 with parameters: {'tol': 0.009472177302741386, 'C': 0.00019931903638828116}.[0m
[32m[I 2019-05-26 02:54:09,797][0m Finished trial#192 resulted in value: 0.09845703184471824. Current best value is 

In [20]:
# Stage every trial collected in `mytrial` in the trial store; the commit
# happens in a later cell.
for trial_i in mytrial:
    db.insert(trial_i)

In [21]:
# df_trial = db.select()
# df_trial[(df_trial['remark']=='tune 1')&(df_trial['mae_diff']<.05)].sort_values(by=['val_mae'])[['datetime','remark', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].head(10)

In [22]:
# Commit the inserted tuning trials explicitly (the store was opened with
# auto_commit=False; presumably nothing is persisted before this call).
db.commit()

In [None]:
# Collector handed to EP.process on every trial; the next cell inserts its
# entries into the trial store and commits them.
mytrial = []


# Tune hyperparameters (stratified k-fold variant).
def objective(trial):
    """Optuna objective: evaluate an RBF SVR for one sampled (tol, C) pair.

    The model is evaluated by EP.process on `selected_columns` with a 3-fold
    'stratified' split and standard scaling.

    Args:
        trial: the Optuna trial used to sample `tol` and `C` and to store the
            summary metrics as user attributes.

    Returns:
        abs(val_mean - train_mean) * val_mean, where the means are taken over
        `df_his.valid` and `df_his.train` (presumably per-fold MAE values --
        confirm in EP.process). The study minimises this value.
    """
    tol = trial.suggest_uniform('tol', 0.0001, 0.01)
    C = trial.suggest_uniform('C', 0.0001, 0.1)

    args = {'algorithm': {'cls': 'SVR',
                          'fit': {},
                          'init': {'kernel': 'rbf',
                                   'degree': 2,
                                   'gamma': 'auto',
                                   'coef0': 0.0,
                                   'tol': tol,
                                   'C': C,
                                   'epsilon': 0.01,
                                   'shrinking': True}},
            'columns': selected_columns,
            'kfold': {'n_splits': 3,
                      'random_state': 1985,
                      'shuffle': True,
                      'type': 'stratified'},
            'scaler': {'cls': 'StandardScaler', 'init': {}}}

    # Only the per-fold history is needed here; the other three results
    # (feature importances, validation and test predictions) are discarded.
    df_his, _, _, _ = EP.process(df_train, args, df_test=df_test, trial=mytrial, remark='tune 87 by stratified')

    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)
    mae_diff = val_mae_mean - train_mae_mean

    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', mae_diff)
    trial.set_user_attr('val_mae_var', val_mae_var)
    # Previously computed but never stored; record it next to val_mae_var.
    trial.set_user_attr('train_mae_var', train_mae_var)

    # NOTE(review): the score shrinks whenever the train/validation gap closes,
    # whether or not the fit itself is good. In the recorded run the best C
    # values sit close to the lower bound 0.0001 -- check that this is intended.
    return np.abs(mae_diff) * val_mae_mean


# Seed the sampler so that re-running the cell proposes the same sequence of
# (tol, C) pairs; the seed reuses the k-fold random_state.
study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=1985))
study.optimize(objective, n_trials=200)

[32m[I 2019-05-26 03:15:18,435][0m Finished trial#0 resulted in value: 0.0115960663189059. Current best value is 0.0115960663189059 with parameters: {'tol': 0.00898505377010385, 'C': 0.0648527632065916}.[0m
[32m[I 2019-05-26 03:17:56,202][0m Finished trial#1 resulted in value: 0.012248161735003054. Current best value is 0.0115960663189059 with parameters: {'tol': 0.00898505377010385, 'C': 0.0648527632065916}.[0m
[32m[I 2019-05-26 03:20:34,126][0m Finished trial#2 resulted in value: 0.004905193351458348. Current best value is 0.004905193351458348 with parameters: {'tol': 0.004688633727892607, 'C': 0.010188618898813789}.[0m
[32m[I 2019-05-26 03:23:11,992][0m Finished trial#3 resulted in value: 0.012024828597155328. Current best value is 0.004905193351458348 with parameters: {'tol': 0.004688633727892607, 'C': 0.010188618898813789}.[0m
[32m[I 2019-05-26 03:25:49,307][0m Finished trial#4 resulted in value: 0.007959715725871551. Current best value is 0.004905193351458348 with p

[32m[I 2019-05-26 04:55:19,053][0m Finished trial#38 resulted in value: 0.011904163893081071. Current best value is 0.00021009373096850543 with parameters: {'tol': 0.00846769704300727, 'C': 0.00014837204427602384}.[0m
[32m[I 2019-05-26 04:57:56,115][0m Finished trial#39 resulted in value: 0.012790462844690365. Current best value is 0.00021009373096850543 with parameters: {'tol': 0.00846769704300727, 'C': 0.00014837204427602384}.[0m
[32m[I 2019-05-26 05:00:32,353][0m Finished trial#40 resulted in value: 0.005300585874358395. Current best value is 0.00021009373096850543 with parameters: {'tol': 0.00846769704300727, 'C': 0.00014837204427602384}.[0m
[32m[I 2019-05-26 05:03:09,110][0m Finished trial#41 resulted in value: 0.0102681069169765. Current best value is 0.00021009373096850543 with parameters: {'tol': 0.00846769704300727, 'C': 0.00014837204427602384}.[0m
[32m[I 2019-05-26 05:05:45,278][0m Finished trial#42 resulted in value: 0.007220436148119708. Current best value is 

[32m[I 2019-05-26 06:37:44,917][0m Finished trial#76 resulted in value: 0.0004761238635052482. Current best value is 0.00021009373096850543 with parameters: {'tol': 0.00846769704300727, 'C': 0.00014837204427602384}.[0m
[32m[I 2019-05-26 06:40:25,812][0m Finished trial#77 resulted in value: 0.007119669325098985. Current best value is 0.00021009373096850543 with parameters: {'tol': 0.00846769704300727, 'C': 0.00014837204427602384}.[0m
[32m[I 2019-05-26 06:43:06,712][0m Finished trial#78 resulted in value: 0.006468335921183069. Current best value is 0.00021009373096850543 with parameters: {'tol': 0.00846769704300727, 'C': 0.00014837204427602384}.[0m
[32m[I 2019-05-26 06:45:46,682][0m Finished trial#79 resulted in value: 0.005900831341894533. Current best value is 0.00021009373096850543 with parameters: {'tol': 0.00846769704300727, 'C': 0.00014837204427602384}.[0m
[32m[I 2019-05-26 06:48:27,195][0m Finished trial#80 resulted in value: 0.004430221003042463. Current best value 

[32m[I 2019-05-26 08:23:46,672][0m Finished trial#113 resulted in value: 0.008259375634249232. Current best value is 0.00021009373096850543 with parameters: {'tol': 0.00846769704300727, 'C': 0.00014837204427602384}.[0m
[32m[I 2019-05-26 08:26:30,382][0m Finished trial#114 resulted in value: 0.009340707111097819. Current best value is 0.00021009373096850543 with parameters: {'tol': 0.00846769704300727, 'C': 0.00014837204427602384}.[0m
[32m[I 2019-05-26 08:29:23,936][0m Finished trial#115 resulted in value: 0.0025799204183889635. Current best value is 0.00021009373096850543 with parameters: {'tol': 0.00846769704300727, 'C': 0.00014837204427602384}.[0m
[32m[I 2019-05-26 08:32:10,535][0m Finished trial#116 resulted in value: 0.00866049579942948. Current best value is 0.00021009373096850543 with parameters: {'tol': 0.00846769704300727, 'C': 0.00014837204427602384}.[0m
[32m[I 2019-05-26 08:35:05,190][0m Finished trial#117 resulted in value: 0.012484284914501022. Current best va

In [None]:
# Stage every trial collected by the stratified tuning run, then commit.
# NOTE(review): trials only reach the store here, after study.optimize has
# returned; an interrupted study leaves them in `mytrial` only.
for trial_i in mytrial:
    db.insert(trial_i)
db.commit()