In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler

import lightgbm as lgb

from tsfresh.examples import load_robot_execution_failures
from tsfresh import extract_features, select_features
import optuna

from common import EP
from dfdb import DFDB

import types
import copy

Using TensorFlow backend.


In [2]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [3]:
pd.set_option('display.max_rows', 2000)
pd.set_option('display.max_colwidth', -1)

In [4]:
df_train = pd.read_pickle('../feats/df_train.pkl')
df_test = pd.read_pickle('../feats/df_test.pkl')

df_train['label'] = df_train['y'].apply(lambda x:  int(x) if x<15 else 15)
group = df_train['season'].values
group[np.where(group==17)[0]] = 1
df_train['group'] = group
df_train = df_train.drop(columns=['season'])

In [5]:
df_spec_train = pd.read_pickle('../feats/spec_features.pkl')
df_spec_test = pd.read_pickle('../feats/spec_features_test.pkl')

In [6]:
df_train = pd.merge(df_train, df_spec_train, on='index')
df_test = pd.merge(df_test, df_spec_test, on='index')

In [7]:
test_X = df_train.drop(columns=['y','index','group','label']).copy()
test_X.index = df_train['index']
test_y = df_train['y'].copy()
test_y.index = df_train['index']
tsfresh_columns = select_features(test_X, test_y).columns.tolist()

In [12]:
db = DFDB('../trial/randomforest.pkl', auto_commit=False)

In [13]:
lgbm_top200 = ['spkt_welch_densitycoeff_2',
 'spkt_welch_density__coeff_3',
 'q25_roll_std_100',
 '3th_peak_freq',
 'min_roll_std_100',
 'q05_roll_std_100',
 'iqr_6',
 'abs_max_8',
 'mfcc_5_abs_mean',
 "number_peaks{'n': 10}",
 'ave10_7',
 'mfcc_13_mean',
 "autocorrelation{'lag': 5}",
 'mfcc_accelerate_8_variance',
 'abs_max_7',
 'mfcc_accelerate_1_kurtosis',
 'q05_roll_std_1000',
 'spkt_welch_density__coeff_42',
 'max_to_min_diff_5',
 'mfcc_13_quantile25',
 'fft_coefficientcoeff_80__attr_"imag"',
 'abs_q25_5',
 'mfcc_5_mean',
 'median__roll_std',
 '5000skewness_max_',
 'fft_coefficientcoeff_6__attr_"abs"',
 'partial_autocorrelationlag_5',
 'abs_min_8',
 'spkt_welch_density__coeff_28',
 'ar_coefficientk_10__coeff_3',
 'abs_q75_7',
 'mfcc_accelerate_15_min',
 'abs_max_4',
 'mfcc_10_quantile25',
 "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
 'spkt_welch_density__coeff_79',
 "value_count{'value': 1}",
 '5000peak_peak_amp_max_',
 'min__roll_std',
 'mfcc_4_rolling_std_mean',
 'q01_roll_std_1000',
 'mfcc_3_abs_q75',
 'mfcc_5_quantile95',
 'fft_coefficientcoeff_16__attr_"imag"',
 'abs_q01_4',
 'mfcc_6_quantile05',
 'agg_autocorrelationf_agg_"mean"__maxlag_40',
 '5000quantile25skewness_',
 'mfcc_0_abs_q25',
 'mfcc_delta_3_quantile01',
 'spkt_welch_density__coeff_99',
 'mfcc_9_mean',
 'mfcc_12_mean',
 'q75_roll_mean_10',
 'max_to_min',
 'mfcc_3_abs_mean',
 'kurt_7',
 'mfcc_14_quantile01',
 'mfcc_10_mean',
 'mfcc_delta_13_min',
 'mfcc_delta_13_abs_max',
 '5000no_zero_crossing_mean_',
 'abs_q75_6',
 'mfcc_12_quantile99',
 'mfcc_accelerate_6_skewness',
 'mfcc_5_max',
 '5000smoothness_entropy_',
 '5000median_skewness_',
 'mfcc_accelerate_8_min',
 'abs_min_3',
 '5000quantile75mean_',
 "number_crossing_m{'m': 1}",
 '5000smoothness_std_',
 'mfcc_9_abs_q25',
 'flac3_1_quantile99',
 'mfcc_5_skewness',
 'fft_coefficientcoeff_56__attr_"angle"',
 'fft_coefficientcoeff_70__attr_"abs"',
 'mfcc_10_abs_q75',
 'fft_coefficientcoeff_24__attr_"angle"',
 'med_7',
 'spkt_welch_density__coeff_73',
 'abs_q99_8',
 'ave10_6',
 'spkt_welch_density__coeff_38',
 'skew_1',
 'mfcc_delta_3_abs_q95',
 "change_quantiles{'ql': 0.6, 'qh': 1.0, 'isabs': False, 'f_agg': 'mean'}",
 'max_to_min_5',
 'mfcc_delta_4_quantile75',
 'abs_max_1',
 '5000crest_factor_quantile75',
 'partial_autocorrelationlag_1',
 'mfcc_delta_11_quantile01',
 'mfcc_accelerate_7_quantile01',
 'mfcc_8_rolling_std_mean',
 'q95_roll_mean_100',
 'mfcc_delta_6_rolling_std_mean',
 'flac3_0_min',
 'fft_coefficientcoeff_36__attr_"abs"',
 'iqr_8',
 'max_9',
 '5000smoothness_quantile05',
 'mfcc_1_kurtosis',
 'mfcc_7_abs_q95',
 'spkt_welch_density__coeff_66',
 'spkt_welch_density__coeff_64',
 'mfcc_3_abs_q95',
 '5000skewness_mean_',
 'mfcc_10_abs_mean',
 '5000quantile99quantile01',
 'mean_change_rate',
 'mfcc_accelerate_5_rolling_std_mean',
 'mfcc_accelerate_5_quantile05',
 '5000rms_median_',
 'flac3_1_abs_mean',
 '5000rms_quantile05',
 '5000quantile75quantile25',
 'mfcc_accelerate_12_min',
 "value_count{'value': -1}",
 'mfcc_12_abs_max',
 'abs_min_5',
 'mfcc_5_abs_q99',
 'mfcc_delta_5_quantile01',
 'mfcc_12_quantile75',
 'fft_coefficientcoeff_56__attr_"imag"',
 'spkt_welch_densitycoeff_5',
 'mfcc_0_abs_q05',
 'mfcc_13_quantile75',
 'mfcc_delta_5_min',
 'mfcc_5_quantile99',
 'fft_coefficientcoeff_8__attr_"angle"',
 'spkt_welch_density__coeff_30',
 'mfcc_accelerate_4_max',
 'mfcc_14_median',
 "change_quantiles{'ql': 0.2, 'qh': 0.4, 'isabs': False, 'f_agg': 'var'}",
 'mfcc_5_median',
 'mfcc_accelerate_5_abs_q75',
 'spkt_welch_density__coeff_59',
 'mfcc_accelerate_5_kurtosis',
 'mfcc_delta_9_quantile99',
 'mfcc_5_quantile25',
 'spkt_welch_density__coeff_113',
 'mfcc_2_quantile05',
 'spkt_welch_density__coeff_58',
 'mfcc_5_abs_q75',
 'spkt_welch_density__coeff_22',
 'spkt_welch_density__coeff_115',
 'spkt_welch_density__coeff_4',
 'mfcc_5_abs_q05',
 'spkt_welch_density__coeff_25',
 'mfcc_5_abs_max',
 'spkt_welch_density__coeff_27',
 'mfcc_4_quantile75',
 'mfcc_3_rolling_std_mean',
 'mfcc_4_median',
 'mfcc_0_quantile01',
 '5000quantile75rssq_',
 'fft_coefficientcoeff_24__attr_"imag"',
 'mfcc_delta_12_kurtosis',
 '5000rms_quantile25',
 'mfcc_13_quantile05',
 'fft_coefficientcoeff_8__attr_"imag"',
 'mfcc_delta_11_rolling_std_mean',
 'mfcc_delta_12_quantile99',
 'mfcc_accelerate_9_abs_q25',
 'mfcc_11_abs_mean',
 'mfcc_delta_15_abs_max',
 'abs_max_2',
 'mfcc_11_abs_std',
 'abs_max_roll_mean_1000',
 'peak_to_average_power_ratio__roll_mean',
 'mfcc_13_median',
 'fft_coefficientcoeff_62__attr_"abs"',
 'mfcc_11_quantile75',
 "quantile{'q': 0.8}",
 'mfcc_delta_3_rolling_std_mean',
 'abs_q99_7',
 'mfcc_delta_12_max',
 'mfcc_12_min',
 'mfcc_10_quantile99',
 'mfcc_14_abs_q99',
 'mfcc_14_quantile05',
 'mfcc_14_quantile25',
 'kurt_1',
 'mfcc_2_median',
 'q01_2',
 'mfcc_2_abs_q25',
 'kurt_8',
 'mfcc_accelerate_6_abs_max',
 'mfcc_1_quantile75',
 'q05_5',
 'abs_q95_9',
 'q05_roll_mean_100',
 'mfcc_10_abs_q25',
 'q75_9',
 'mfcc_15_mean',
 "change_quantiles{'ql': 0.6, 'qh': 0.8, 'isabs': True, 'f_agg': 'mean'}",
 'flac3_0_abs_std',
 "change_quantiles{'ql': 0.4, 'qh': 1.0, 'isabs': False, 'f_agg': 'mean'}"]

In [15]:
df_trial = db.select()
df_trial['kfold-type'] = df_trial['param'].apply(lambda x: x['kfold']['type'])
df_trial['algorithm-init'] = df_trial['param'].apply(lambda x: x['algorithm']['init'])

In [16]:
df_trial[(df_trial['kfold-type']=='group')&(df_trial['mae_diff']<.1)][['datetime','nfeatures', 'remark', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff','remark','algorithm-init']].sort_values(by=['val_mae'], ascending=True).head()

Unnamed: 0,datetime,nfeatures,remark,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff,remark.1,algorithm-init
724,2019-05-24 02:19:46.484385,3,start from top1 column 2th,2.085508,0.014216,2.134237,0.563771,0.048729,start from top1 column 2th,"{'n_estimators': 617, 'max_depth': 9, 'max_features': 0.6118189949437737, 'min_samples_leaf': 0.10163450193406814, 'random_state': 6370}"
725,2019-05-24 02:20:56.632514,4,start from top1 column 2th,2.0834,0.014043,2.13446,0.566987,0.05106,start from top1 column 2th,"{'n_estimators': 617, 'max_depth': 9, 'max_features': 0.6118189949437737, 'min_samples_leaf': 0.10163450193406814, 'random_state': 6370}"
731,2019-05-24 02:27:57.645343,4,start from top1 column 2th,2.082232,0.014117,2.134881,0.567611,0.052648,start from top1 column 2th,"{'n_estimators': 617, 'max_depth': 9, 'max_features': 0.6118189949437737, 'min_samples_leaf': 0.10163450193406814, 'random_state': 6370}"
932,2019-05-24 15:30:07.425180,3,tune 724,2.085913,0.014133,2.135108,0.563899,0.049195,tune 724,"{'n_estimators': 578, 'max_depth': 13, 'max_features': 0.6306849933710565, 'min_samples_leaf': 0.10332910932373635, 'random_state': 475}"
727,2019-05-24 02:23:18.005532,4,start from top1 column 2th,2.080193,0.013923,2.136026,0.569078,0.055833,start from top1 column 2th,"{'n_estimators': 617, 'max_depth': 9, 'max_features': 0.6118189949437737, 'min_samples_leaf': 0.10163450193406814, 'random_state': 6370}"


In [18]:
param = {'columns':lgbm_top200 ,
 'kfold': {'n_splits': 8,
  'random_state': 1985,
  'shuffle': True,
  'type': 'group'},
 'scaler': {'cls': 'StandardScaler', 'init': {}},
 'algorithm': {'cls': 'RandomForestRegressor',
  'init': {'n_estimators': 617,
   'max_depth': 9,
   'max_features': 0.6118189949437737,
   'min_samples_leaf': 0.10163450193406814,
   'random_state': 6370},
  'fit': {}}}

In [19]:
mytrial =[]
EP.select_features_(df_train, param, mytrial, nfeats_best=30, nfeats_removed_per_try=10, key='average_permutation_weight', remark='rfe to 30 group3')

In [20]:
for trial_i in mytrial:
    db.insert(trial_i)
db.commit()

In [21]:
df_trial = db.select()
df_trial['kfold'] = df_trial['param'].apply(lambda x: x['kfold'])
df_trial['algorithm-init'] = df_trial['param'].apply(lambda x: x['algorithm']['init'])
df_trial['algorithm-cls'] = df_trial['param'].apply(lambda x : x['algorithm']['cls'])

In [24]:
df_trial[df_trial['remark']=='rfe to 30 group3'].sort_values(by=['val_mae'])[['datetime','remark', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff','kfold','algorithm-init']].head()

Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff,kfold,algorithm-init
1338,2019-06-03 06:47:16.883830,rfe to 30 group3,40,2.075935,0.013814,2.138075,0.572014,0.06214,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}","{'n_estimators': 617, 'max_depth': 9, 'max_features': 0.6118189949437737, 'min_samples_leaf': 0.10163450193406814, 'random_state': 6370}"
1337,2019-06-03 06:38:00.563701,rfe to 30 group3,50,2.075866,0.013755,2.138507,0.572647,0.062641,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}","{'n_estimators': 617, 'max_depth': 9, 'max_features': 0.6118189949437737, 'min_samples_leaf': 0.10163450193406814, 'random_state': 6370}"
1331,2019-06-03 04:50:44.266939,rfe to 30 group3,110,2.075985,0.013771,2.138637,0.572111,0.062652,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}","{'n_estimators': 617, 'max_depth': 9, 'max_features': 0.6118189949437737, 'min_samples_leaf': 0.10163450193406814, 'random_state': 6370}"
1324,2019-06-03 00:22:02.570645,rfe to 30 group3,180,2.075833,0.013816,2.138739,0.572413,0.062907,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}","{'n_estimators': 617, 'max_depth': 9, 'max_features': 0.6118189949437737, 'min_samples_leaf': 0.10163450193406814, 'random_state': 6370}"
1333,2019-06-03 05:36:51.417976,rfe to 30 group3,90,2.076372,0.013831,2.138939,0.572164,0.062567,"{'n_splits': 8, 'random_state': 1985, 'shuffle': True, 'type': 'group'}","{'n_estimators': 617, 'max_depth': 9, 'max_features': 0.6118189949437737, 'min_samples_leaf': 0.10163450193406814, 'random_state': 6370}"


In [26]:
score = df_trial.loc[1338]['val_mae']
param = df_trial.loc[1338]['param']
score

2.138075148948355

In [28]:
mytrial = []
# EP.width_frist_rfe(df_train, param, mytrial, score, df_test=df_test, remark='wf 1338')