In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler

import lightgbm as lgb

from tsfresh.examples import load_robot_execution_failures
from tsfresh import extract_features, select_features
import optuna

from common import EP
from dfdb import DFDB

import types
import copy

Using TensorFlow backend.


In [2]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [3]:
pd.set_option('display.max_rows', 2000)
pd.set_option('display.max_colwidth', -1)

In [4]:
df_train = pd.read_pickle('../feats/df_train.pkl')
df_test = pd.read_pickle('../feats/df_test.pkl')

df_train['label'] = df_train['y'].apply(lambda x:  int(x) if x<15 else 15)
group = df_train['season'].values
group[np.where(group==17)[0]] = 1
df_train['group'] = group
df_train = df_train.drop(columns=['season'])

In [5]:
df_spec_train = pd.read_pickle('../feats/spec_features.pkl')
df_spec_test = pd.read_pickle('../feats/spec_features_test.pkl')

In [6]:
df_train = pd.merge(df_train, df_spec_train, on='index')
df_test = pd.merge(df_test, df_spec_test, on='index')

In [7]:
db = DFDB('../trial/ridge.pkl', auto_commit=False)

In [8]:
df_trial = db.select()
df_trial['kfold-type'] = df_trial['param'].apply(lambda x: x['kfold']['type'])
df_trial['algorithm-init'] = df_trial['param'].apply(lambda x: x['algorithm']['init'])

In [9]:
df_trial[(df_trial['kfold-type']=='group')&(df_trial['mae_diff']<.1)][['datetime','nfeatures', 'remark', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff','remark','algorithm-init']].sort_values(by=['val_mae'], ascending=True).head()

Unnamed: 0,datetime,nfeatures,remark,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff,remark.1,algorithm-init
14,2019-05-16 11:27:47.800787,165,,2.074874,0.001981,2.156191,0.008585,0.081317,,"{'alpha': 10000, 'fit_intercept': True, 'normalize': False, 'copy_X': True, 'max_iter': None, 'tol': 0.001, 'solver': 'auto', 'random_state': 42}"
251,2019-05-25 06:12:57.901092,44,tune 199,2.116238,0.013467,2.167211,0.533201,0.050973,tune 199,"{'alpha': 10642, 'fit_intercept': True, 'normalize': False, 'copy_X': True, 'max_iter': None, 'tol': 0.0005679380685224855, 'solver': 'auto', 'random_state': 5294}"
391,2019-05-25 06:15:58.381449,44,tune 199,2.119027,0.013523,2.169186,0.534659,0.050158,tune 199,"{'alpha': 12390, 'fit_intercept': True, 'normalize': False, 'copy_X': True, 'max_iter': None, 'tol': 0.008109716938994626, 'solver': 'auto', 'random_state': 5849}"
202,2019-05-25 06:11:48.235489,44,tune 199,2.120168,0.013546,2.170008,0.535256,0.04984,tune 199,"{'alpha': 13158, 'fit_intercept': True, 'normalize': False, 'copy_X': True, 'max_iter': None, 'tol': 0.00945683764956391, 'solver': 'auto', 'random_state': 3713}"
291,2019-05-25 06:13:56.203938,44,tune 199,2.120498,0.013553,2.170247,0.535429,0.049749,tune 199,"{'alpha': 13386, 'fit_intercept': True, 'normalize': False, 'copy_X': True, 'max_iter': None, 'tol': 0.0044772542525338115, 'solver': 'auto', 'random_state': 5441}"


In [10]:
catboost_columns = ['mfcc_3_rolling_std_mean',
  'max_9',
  'q25_roll_std_100',
  'max_to_min',
  'max_to_min_5',
  'iqr_6',
  'q05_roll_std_1000',
  'q05_roll_std_100',
  'abs_max_4',
  'abs_max_1',
  'spkt_welch_densitycoeff_2',
  'spkt_welch_density__coeff_3',
  'abs_q75_7',
  'q01_2',
  'abs_q01_4',
  'max_to_min_diff_5',
  'q05_5',
  'abs_q25_5',
  'abs_max_2',
  'min__roll_std',
  'median__roll_std',
  'abs_max_7',
  '5000peak_peak_amp_max_',
  'abs_q75_6']
lgbm_columns = ['mfcc_10_abs_q75',
  'q25_roll_std_100',
  'iqr_6',
  'mfcc_9_mean',
  'abs_q75_7',
  'mfcc_delta_5_min',
  'mfcc_delta_3_quantile01',
  'abs_q75_6',
  'q05_roll_std_100',
  'mfcc_accelerate_1_kurtosis',
  'mfcc_5_mean',
  'spkt_welch_density__coeff_42',
  'mfcc_3_rolling_std_mean',
  'mfcc_12_mean',
  '5000smoothness_entropy_',
  'mfcc_13_mean',
  "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
  'mfcc_5_abs_max',
  'abs_q25_5',
  'mfcc_5_quantile95',
  'spkt_welch_densitycoeff_5',
  '5000crest_factor_quantile75',
  'spkt_welch_densitycoeff_2',
  'mfcc_accelerate_8_variance',
  "number_peaks{'n': 10}",
  'spkt_welch_density__coeff_3']
xgbm_columns = ['q05_roll_std_1000',
  'q05_roll_std_100',
  'mfcc_9_mean',
  'abs_q01_4',
  'iqr_6',
  'spkt_welch_density__coeff_4',
  'abs_q25_5',
  'abs_q75_6',
  'q01_2',
  "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
  'abs_max_4',
  'median__roll_std',
  'abs_max_7',
  'spkt_welch_density__coeff_28',
  '5000crest_factor_quantile75',
  'mfcc_1_kurtosis',
  'mfcc_4_median',
  'q05_5',
  'abs_max_8',
  'abs_q75_7',
  'q25_roll_std_100',
  'mfcc_delta_3_quantile01',
  'spkt_welch_densitycoeff_2',
  'max_to_min_diff_5',
  'mfcc_12_mean',
  'spkt_welch_densitycoeff_5',
  'spkt_welch_density__coeff_3',
  "number_peaks{'n': 10}",
  'spkt_welch_density__coeff_27']

In [11]:
all_columns = catboost_columns+lgbm_columns+xgbm_columns
unique_columns = list(set(all_columns))
common_columns = []
common_columns50 = []
common_columns75 = []
common_columns95 = []
N_columns = 6
count_values = [all_columns.count(col) for col in unique_columns]
for col in unique_columns:
    if all_columns.count(col)==N_columns:
        common_columns.append(col)
    if all_columns.count(col)>=np.quantile(count_values, .5):
        common_columns50.append(col)
    if all_columns.count(col)>=np.quantile(count_values, .75):
        common_columns75.append(col)
    if all_columns.count(col)>=np.quantile(count_values, .95):
        common_columns95.append(col)
print('unique_columns ',len(unique_columns))
print('common_columns50 ',len(common_columns50))
print('common_columns75 ',len(common_columns75))
print('common_columns95 ',len(common_columns95))
print('common_columns ',len(common_columns))

unique_columns  47
common_columns50  24
common_columns75  24
common_columns95  8
common_columns  0


In [13]:
param = {'algorithm': {'cls': 'Ridge',
  'fit': {},
  'init': {'alpha': 10000,
   'fit_intercept': True,
   'normalize': False,
   'copy_X': True,
   'max_iter': None,
   'tol': 0.001,
   'solver': 'auto',
   'random_state': 42}},
 'columns': unique_columns,
 'kfold': {'n_splits': 3,
  'random_state': 1985,
  'shuffle': True,
  'type': 'group'},
 'scaler': {'cls': 'StandardScaler', 'init':{}}}

In [17]:
mytrial = []
EP.width_frist_rfe(df_train, param, mytrial, 999, df_test=df_test, remark='wf new 14')

In [20]:
for trial_i in mytrial:
    db.insert(trial_i)
db.commit()

In [21]:
df_trial = db.select()
df_trial['kfold-type'] = df_trial['param'].apply(lambda x: x['kfold']['type'])
df_trial['algorithm-init'] = df_trial['param'].apply(lambda x: x['algorithm']['init'])

In [22]:
df_trial[(df_trial['remark']=='wf new 14')][['datetime','nfeatures', 'remark', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff','remark','algorithm-init']].sort_values(by=['val_mae'], ascending=True).head()

Unnamed: 0,datetime,nfeatures,remark,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff,remark.1,algorithm-init
1386,2019-06-02 01:48:19.687465,25,wf new 14,2.124926,0.003387,2.132191,0.014497,0.007266,wf new 14,"{'alpha': 10000, 'fit_intercept': True, 'normalize': False, 'copy_X': True, 'max_iter': None, 'tol': 0.001, 'solver': 'auto', 'random_state': 42}"
1397,2019-06-02 01:48:21.573419,25,wf new 14,2.124849,0.003377,2.132195,0.014484,0.007346,wf new 14,"{'alpha': 10000, 'fit_intercept': True, 'normalize': False, 'copy_X': True, 'max_iter': None, 'tol': 0.001, 'solver': 'auto', 'random_state': 42}"
1362,2019-06-02 01:48:15.404241,26,wf new 14,2.124826,0.003375,2.132196,0.014481,0.007371,wf new 14,"{'alpha': 10000, 'fit_intercept': True, 'normalize': False, 'copy_X': True, 'max_iter': None, 'tol': 0.001, 'solver': 'auto', 'random_state': 42}"
1419,2019-06-02 01:48:25.854501,24,wf new 14,2.124935,0.003387,2.1322,0.014493,0.007265,wf new 14,"{'alpha': 10000, 'fit_intercept': True, 'normalize': False, 'copy_X': True, 'max_iter': None, 'tol': 0.001, 'solver': 'auto', 'random_state': 42}"
1401,2019-06-02 01:48:22.464275,25,wf new 14,2.124864,0.00338,2.132203,0.014491,0.007338,wf new 14,"{'alpha': 10000, 'fit_intercept': True, 'normalize': False, 'copy_X': True, 'max_iter': None, 'tol': 0.001, 'solver': 'auto', 'random_state': 42}"


In [27]:
mytrial = []
#  tune hypterparameters
def objective(trial):

    alpha = trial.suggest_int('alpha', 10000, 100000)
    tol = trial.suggest_uniform('tol', 0.0001, 0.01)
    random_state = trial.suggest_int('random_state', 1, 9999)
        
    args={'algorithm': {'cls': 'Ridge',
      'fit': {},
      'init': {'alpha': alpha,
       'fit_intercept': True,
       'normalize': False,
       'copy_X': True,
       'max_iter': None,
       'tol': tol,
       'solver': 'auto',
       'random_state': random_state}},
     'columns': df_trial.loc[1386]['param']['columns'],
     'kfold': {'n_splits': 3,
      'random_state': 1985,
      'shuffle': True,
      'type': 'stratified'},#stratified
     'scaler': {'cls': 'StandardScaler', 'init':{}}}
    
    df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train, args, df_test = df_test, trial=mytrial, remark='tune 1386 s3')
    val_mae_mean = np.mean(df_his.valid)

    return  val_mae_mean

study = optuna.create_study()
study.optimize(objective, n_trials=200)

[I 2019-06-02 02:07:41,123] Finished a trial resulted in value: 2.2106918608693555. Current best value is 2.2106918608693555 with parameters: {'alpha': 61146, 'tol': 0.008115437013893046, 'random_state': 4353}.
[I 2019-06-02 02:07:41,276] Finished a trial resulted in value: 2.181510927639543. Current best value is 2.181510927639543 with parameters: {'alpha': 42581, 'tol': 0.009897071269653997, 'random_state': 7699}.
[I 2019-06-02 02:07:41,450] Finished a trial resulted in value: 2.203130317968757. Current best value is 2.181510927639543 with parameters: {'alpha': 42581, 'tol': 0.009897071269653997, 'random_state': 7699}.
[I 2019-06-02 02:07:41,660] Finished a trial resulted in value: 2.249329136533584. Current best value is 2.181510927639543 with parameters: {'alpha': 42581, 'tol': 0.009897071269653997, 'random_state': 7699}.
[I 2019-06-02 02:07:41,896] Finished a trial resulted in value: 2.196460673140056. Current best value is 2.181510927639543 with parameters: {'alpha': 42581, 'tol'

[I 2019-06-02 02:07:49,831] Finished a trial resulted in value: 2.2083765303922345. Current best value is 2.12885210833838 with parameters: {'alpha': 10550, 'tol': 0.0007345400477711591, 'random_state': 9922}.
[I 2019-06-02 02:07:50,054] Finished a trial resulted in value: 2.1531287072233747. Current best value is 2.12885210833838 with parameters: {'alpha': 10550, 'tol': 0.0007345400477711591, 'random_state': 9922}.
[I 2019-06-02 02:07:50,286] Finished a trial resulted in value: 2.1613505127707846. Current best value is 2.12885210833838 with parameters: {'alpha': 10550, 'tol': 0.0007345400477711591, 'random_state': 9922}.
[I 2019-06-02 02:07:50,620] Finished a trial resulted in value: 2.127889144060644. Current best value is 2.127889144060644 with parameters: {'alpha': 10024, 'tol': 0.0041087078468304405, 'random_state': 1994}.
[I 2019-06-02 02:07:50,844] Finished a trial resulted in value: 2.1983920036768896. Current best value is 2.127889144060644 with parameters: {'alpha': 10024, 't

[I 2019-06-02 02:07:59,786] Finished a trial resulted in value: 2.129238719530388. Current best value is 2.127889144060644 with parameters: {'alpha': 10024, 'tol': 0.0041087078468304405, 'random_state': 1994}.
[I 2019-06-02 02:08:00,013] Finished a trial resulted in value: 2.1717588398393715. Current best value is 2.127889144060644 with parameters: {'alpha': 10024, 'tol': 0.0041087078468304405, 'random_state': 1994}.
[I 2019-06-02 02:08:00,280] Finished a trial resulted in value: 2.1337840606581557. Current best value is 2.127889144060644 with parameters: {'alpha': 10024, 'tol': 0.0041087078468304405, 'random_state': 1994}.
[I 2019-06-02 02:08:00,493] Finished a trial resulted in value: 2.1818099093710646. Current best value is 2.127889144060644 with parameters: {'alpha': 10024, 'tol': 0.0041087078468304405, 'random_state': 1994}.
[I 2019-06-02 02:08:00,759] Finished a trial resulted in value: 2.1448398184961914. Current best value is 2.127889144060644 with parameters: {'alpha': 10024,

[I 2019-06-02 02:08:09,467] Finished a trial resulted in value: 2.1327106130115263. Current best value is 2.127859546404698 with parameters: {'alpha': 10008, 'tol': 0.009755267454034489, 'random_state': 5654}.
[I 2019-06-02 02:08:09,734] Finished a trial resulted in value: 2.150915227453146. Current best value is 2.127859546404698 with parameters: {'alpha': 10008, 'tol': 0.009755267454034489, 'random_state': 5654}.
[I 2019-06-02 02:08:09,963] Finished a trial resulted in value: 2.157991987621806. Current best value is 2.127859546404698 with parameters: {'alpha': 10008, 'tol': 0.009755267454034489, 'random_state': 5654}.
[I 2019-06-02 02:08:10,199] Finished a trial resulted in value: 2.1421232664213132. Current best value is 2.127859546404698 with parameters: {'alpha': 10008, 'tol': 0.009755267454034489, 'random_state': 5654}.
[I 2019-06-02 02:08:10,472] Finished a trial resulted in value: 2.186926636579034. Current best value is 2.127859546404698 with parameters: {'alpha': 10008, 'tol'

[I 2019-06-02 02:08:19,567] Finished a trial resulted in value: 2.1493945129087515. Current best value is 2.127859546404698 with parameters: {'alpha': 10008, 'tol': 0.009755267454034489, 'random_state': 5654}.
[I 2019-06-02 02:08:19,823] Finished a trial resulted in value: 2.152850250218407. Current best value is 2.127859546404698 with parameters: {'alpha': 10008, 'tol': 0.009755267454034489, 'random_state': 5654}.
[I 2019-06-02 02:08:20,051] Finished a trial resulted in value: 2.128761481074502. Current best value is 2.127859546404698 with parameters: {'alpha': 10008, 'tol': 0.009755267454034489, 'random_state': 5654}.
[I 2019-06-02 02:08:20,310] Finished a trial resulted in value: 2.1402197780135768. Current best value is 2.127859546404698 with parameters: {'alpha': 10008, 'tol': 0.009755267454034489, 'random_state': 5654}.
[I 2019-06-02 02:08:20,544] Finished a trial resulted in value: 2.128210039127179. Current best value is 2.127859546404698 with parameters: {'alpha': 10008, 'tol'

[I 2019-06-02 02:08:30,196] Finished a trial resulted in value: 2.146318769741541. Current best value is 2.127859546404698 with parameters: {'alpha': 10008, 'tol': 0.009755267454034489, 'random_state': 5654}.
[I 2019-06-02 02:08:30,491] Finished a trial resulted in value: 2.2215784464117214. Current best value is 2.127859546404698 with parameters: {'alpha': 10008, 'tol': 0.009755267454034489, 'random_state': 5654}.
[I 2019-06-02 02:08:30,742] Finished a trial resulted in value: 2.1875576974555035. Current best value is 2.127859546404698 with parameters: {'alpha': 10008, 'tol': 0.009755267454034489, 'random_state': 5654}.
[I 2019-06-02 02:08:31,032] Finished a trial resulted in value: 2.1523436136190557. Current best value is 2.127859546404698 with parameters: {'alpha': 10008, 'tol': 0.009755267454034489, 'random_state': 5654}.
[I 2019-06-02 02:08:31,295] Finished a trial resulted in value: 2.135912318585707. Current best value is 2.127859546404698 with parameters: {'alpha': 10008, 'tol

In [28]:
for trial_i in mytrial:
    db.insert(trial_i)
db.commit()

In [29]:
df_trial = db.select()
df_trial['kfold-type'] = df_trial['param'].apply(lambda x: x['kfold']['type'])
df_trial['algorithm-init'] = df_trial['param'].apply(lambda x: x['algorithm']['init'])

In [30]:
df_trial[(df_trial['remark']=='tune 1386 s3')][['datetime','nfeatures', 'remark', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff','remark','algorithm-init']].sort_values(by=['val_mae'], ascending=True).head()

Unnamed: 0,datetime,nfeatures,remark,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff,remark.1,algorithm-init
1826,2019-06-02 02:08:31.514839,25,tune 1386 s3,2.126937,5.596982e-07,2.127856,3.9e-05,0.000919,tune 1386 s3,"{'alpha': 10006, 'fit_intercept': True, 'normalize': False, 'copy_X': True, 'max_iter': None, 'tol': 0.0030168836856056275, 'solver': 'auto', 'random_state': 7435}"
1733,2019-06-02 02:08:06.895670,25,tune 1386 s3,2.126941,5.598218e-07,2.12786,3.9e-05,0.000919,tune 1386 s3,"{'alpha': 10008, 'fit_intercept': True, 'normalize': False, 'copy_X': True, 'max_iter': None, 'tol': 0.009755267454034489, 'solver': 'auto', 'random_state': 5654}"
1670,2019-06-02 02:07:50.589517,25,tune 1386 s3,2.126971,5.608185e-07,2.127889,3.9e-05,0.000918,tune 1386 s3,"{'alpha': 10024, 'fit_intercept': True, 'normalize': False, 'copy_X': True, 'max_iter': None, 'tol': 0.0041087078468304405, 'solver': 'auto', 'random_state': 1994}"
1767,2019-06-02 02:08:15.547998,25,tune 1386 s3,2.126974,5.609441e-07,2.127893,3.9e-05,0.000918,tune 1386 s3,"{'alpha': 10026, 'fit_intercept': True, 'normalize': False, 'copy_X': True, 'max_iter': None, 'tol': 0.004962806180361272, 'solver': 'auto', 'random_state': 8262}"
1773,2019-06-02 02:08:17.002703,25,tune 1386 s3,2.126982,5.611971e-07,2.1279,3.9e-05,0.000918,tune 1386 s3,"{'alpha': 10030, 'fit_intercept': True, 'normalize': False, 'copy_X': True, 'max_iter': None, 'tol': 0.003697877954480417, 'solver': 'auto', 'random_state': 6477}"
