In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler

import lightgbm as lgb

from tsfresh.examples import load_robot_execution_failures
from tsfresh import extract_features, select_features
import optuna

from common import EP
from dfdb import DFDB

import types
import copy

Using TensorFlow backend.


In [2]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [3]:
pd.set_option('display.max_rows', 2000)
pd.set_option('display.max_colwidth', -1)

In [4]:
df_train = pd.read_pickle('../feats/df_train.pkl')
df_test = pd.read_pickle('../feats/df_test.pkl')

df_train['label'] = df_train['y'].apply(lambda x:  int(x) if x<15 else 15)
group = df_train['season'].values
group[np.where(group==17)[0]] = 1
df_train['group'] = group
df_train = df_train.drop(columns=['season'])

In [5]:
df_spec_train = pd.read_pickle('../feats/spec_features.pkl')
df_spec_test = pd.read_pickle('../feats/spec_features_test.pkl')

In [6]:
df_train = pd.merge(df_train, df_spec_train, on='index')
df_test = pd.merge(df_test, df_spec_test, on='index')

In [7]:
test_X = df_train.drop(columns=['y','index','group','label']).copy()
test_X.index = df_train['index']
test_y = df_train['y'].copy()
test_y.index = df_train['index']
tsfresh_columns = select_features(test_X, test_y).columns.tolist()

In [8]:
db = DFDB('../trial/svr.pkl', auto_commit=False)

In [9]:
df_trial = db.select()
df_trial['kfold-type'] = df_trial['param'].apply(lambda x: x['kfold']['type'])
df_trial['algorithm-init'] = df_trial['param'].apply(lambda x: x['algorithm']['init'])

In [10]:
df_trial[(df_trial['kfold-type']=='group')&(df_trial['mae_diff']<.1)][['datetime','nfeatures', 'remark', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff','remark','algorithm-init']].sort_values(by=['val_mae'], ascending=True).head()

Unnamed: 0,datetime,nfeatures,remark,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff,remark.1,algorithm-init
122,2019-05-25 19:26:41.288587,23,tune 87,1.984271,0.002619,2.070003,0.015321,0.085732,tune 87,"{'kernel': 'rbf', 'degree': 2, 'gamma': 'auto', 'coef0': 0.0, 'tol': 0.004467335360842515, 'C': 0.07777223472867957, 'epsilon': 0.01, 'shrinking': True}"
183,2019-05-25 22:23:04.802706,23,tune 87,1.985069,0.002625,2.070099,0.015359,0.08503,tune 87,"{'kernel': 'rbf', 'degree': 2, 'gamma': 'auto', 'coef0': 0.0, 'tol': 0.00818001683713508, 'C': 0.07583092890979252, 'epsilon': 0.01, 'shrinking': True}"
271,2019-05-26 02:16:58.678651,23,tune 87,1.983767,0.002614,2.070112,0.015315,0.086345,tune 87,"{'kernel': 'rbf', 'degree': 2, 'gamma': 'auto', 'coef0': 0.0, 'tol': 0.009459294862178752, 'C': 0.07899133713603163, 'epsilon': 0.01, 'shrinking': True}"
173,2019-05-25 21:55:41.730612,23,tune 87,1.986476,0.002644,2.070117,0.015497,0.083641,tune 87,"{'kernel': 'rbf', 'degree': 2, 'gamma': 'auto', 'coef0': 0.0, 'tol': 0.009935811727902139, 'C': 0.0723079192876168, 'epsilon': 0.01, 'shrinking': True}"
217,2019-05-25 23:52:54.381507,23,tune 87,1.983275,0.00261,2.07013,0.01529,0.086855,tune 87,"{'kernel': 'rbf', 'degree': 2, 'gamma': 'auto', 'coef0': 0.0, 'tol': 0.007551034472617852, 'C': 0.08038546508971715, 'epsilon': 0.01, 'shrinking': True}"


In [11]:
catboost_columns = ['mfcc_3_rolling_std_mean',
  'max_9',
  'q25_roll_std_100',
  'max_to_min',
  'max_to_min_5',
  'iqr_6',
  'q05_roll_std_1000',
  'q05_roll_std_100',
  'abs_max_4',
  'abs_max_1',
  'spkt_welch_densitycoeff_2',
  'spkt_welch_density__coeff_3',
  'abs_q75_7',
  'q01_2',
  'abs_q01_4',
  'max_to_min_diff_5',
  'q05_5',
  'abs_q25_5',
  'abs_max_2',
  'min__roll_std',
  'median__roll_std',
  'abs_max_7',
  '5000peak_peak_amp_max_',
  'abs_q75_6']
lgbm_columns = ['mfcc_10_abs_q75',
  'q25_roll_std_100',
  'iqr_6',
  'mfcc_9_mean',
  'abs_q75_7',
  'mfcc_delta_5_min',
  'mfcc_delta_3_quantile01',
  'abs_q75_6',
  'q05_roll_std_100',
  'mfcc_accelerate_1_kurtosis',
  'mfcc_5_mean',
  'spkt_welch_density__coeff_42',
  'mfcc_3_rolling_std_mean',
  'mfcc_12_mean',
  '5000smoothness_entropy_',
  'mfcc_13_mean',
  "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
  'mfcc_5_abs_max',
  'abs_q25_5',
  'mfcc_5_quantile95',
  'spkt_welch_densitycoeff_5',
  '5000crest_factor_quantile75',
  'spkt_welch_densitycoeff_2',
  'mfcc_accelerate_8_variance',
  "number_peaks{'n': 10}",
  'spkt_welch_density__coeff_3']
xgbm_columns = ['q05_roll_std_1000',
  'q05_roll_std_100',
  'mfcc_9_mean',
  'abs_q01_4',
  'iqr_6',
  'spkt_welch_density__coeff_4',
  'abs_q25_5',
  'abs_q75_6',
  'q01_2',
  "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
  'abs_max_4',
  'median__roll_std',
  'abs_max_7',
  'spkt_welch_density__coeff_28',
  '5000crest_factor_quantile75',
  'mfcc_1_kurtosis',
  'mfcc_4_median',
  'q05_5',
  'abs_max_8',
  'abs_q75_7',
  'q25_roll_std_100',
  'mfcc_delta_3_quantile01',
  'spkt_welch_densitycoeff_2',
  'max_to_min_diff_5',
  'mfcc_12_mean',
  'spkt_welch_densitycoeff_5',
  'spkt_welch_density__coeff_3',
  "number_peaks{'n': 10}",
  'spkt_welch_density__coeff_27']

In [12]:
all_columns = catboost_columns+lgbm_columns+xgbm_columns
unique_columns = list(set(all_columns))
common_columns = []
common_columns50 = []
common_columns75 = []
common_columns95 = []
N_columns = 6
count_values = [all_columns.count(col) for col in unique_columns]
for col in unique_columns:
    if all_columns.count(col)==N_columns:
        common_columns.append(col)
    if all_columns.count(col)>=np.quantile(count_values, .5):
        common_columns50.append(col)
    if all_columns.count(col)>=np.quantile(count_values, .75):
        common_columns75.append(col)
    if all_columns.count(col)>=np.quantile(count_values, .95):
        common_columns95.append(col)
print('unique_columns ',len(unique_columns))
print('common_columns50 ',len(common_columns50))
print('common_columns75 ',len(common_columns75))
print('common_columns95 ',len(common_columns95))
print('common_columns ',len(common_columns))

unique_columns  47
common_columns50  24
common_columns75  24
common_columns95  8
common_columns  0


In [13]:
param = {'algorithm': {'cls': 'SVR',
  'fit': {},
  'init': {'kernel': 'rbf',
   'degree': 2,
   'gamma': 'auto',
   'coef0': 0.0,
   'tol': 0.004467335360842515,
   'C': 0.07777223472867957,
   'epsilon': 0.01,
   'shrinking': True}},
 'columns': common_columns50,
 'kfold': {'n_splits': 3,
  'random_state': 1985,
  'shuffle': True,
  'type': 'group'},
 'scaler': {'cls': 'StandardScaler', 'init': {}}}

In [14]:
mytrial = []
EP.width_frist_rfe(df_train, param, mytrial, 999, df_test=df_test, remark='wf new 122')

In [20]:
len(mytrial)

238

In [18]:
for trial_i in mytrial:
    db.insert(trial_i)
db.commit()

In [15]:
df_trial = db.select()
df_trial[(df_trial['remark']=='wf new 122')][['datetime','remark', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].sort_values(by=['val_mae'])

Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
918,2019-06-02 11:56:25.239555,wf new 122,41,1.952661,0.002006,2.059689,0.012549,0.107028
928,2019-06-02 12:48:18.076337,wf new 122,41,1.952153,0.00212,2.059695,0.012911,0.107541
924,2019-06-02 12:27:41.646213,wf new 122,41,1.952636,0.00215,2.059922,0.012873,0.107286
898,2019-06-02 10:08:46.969255,wf new 122,42,1.951723,0.002064,2.060157,0.012566,0.108434
909,2019-06-02 11:08:00.397972,wf new 122,42,1.953449,0.002093,2.060292,0.012853,0.106843
919,2019-06-02 12:01:46.188509,wf new 122,41,1.951806,0.002043,2.060359,0.012453,0.108553
930,2019-06-02 12:58:35.738628,wf new 122,41,1.952908,0.002094,2.060377,0.01274,0.107469
927,2019-06-02 12:43:09.963892,wf new 122,41,1.950898,0.002059,2.06044,0.012513,0.109543
920,2019-06-02 12:07:04.320368,wf new 122,41,1.951265,0.002056,2.060442,0.01254,0.109176
925,2019-06-02 12:32:50.893015,wf new 122,41,1.950914,0.002057,2.060483,0.012513,0.109569


In [22]:
db.commit()