In [16]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler
import lightgbm as lgb
import catboost as cb

from tsfresh.examples import load_robot_execution_failures
from tsfresh import extract_features, select_features
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression, Ridge, Lasso

import optuna

from common import EP
from dfdb import DFDB

import types
import copy

In [3]:
%%HTML
<style>
   /* Override the widths of the notebook container, menubar and main toolbar elements. */
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [4]:
# Load the pre-computed train and test feature tables.
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
df_train, df_test = map(
    pd.read_pickle,
    ('../feats/df_train.pkl', '../feats/df_test.pkl'),
)

In [5]:
# Bucket the continuous target `y` into integer classes: values below 15 are
# truncated to their integer part, everything else (including NaN, for which
# the `< 15` comparison is False) is lumped into class 15.
df_train['label'] = np.where(df_train['y'] < 15, df_train['y'], 15).astype('int64')

# Derive the `group` column from `season`, folding season 17 into group 1.
# Work on an explicit copy: `.values` may be a view of the frame's own data, so
# writing through it would silently edit `season` in place, and it raises a
# read-only error when pandas Copy-on-Write is enabled.
group = df_train['season'].values.copy()
group[group == 17] = 1
df_train['group'] = group

# `season` is now fully represented by `group`, so drop the raw column.
df_train = df_train.drop(columns=['season'])

In [6]:
# Feature matrix and target for tsfresh's relevance filter, both keyed by the
# `index` column. Despite the `test_` prefix these are built from df_train.
test_X = df_train.set_index('index').drop(columns=['y', 'group', 'label'])
test_y = df_train.set_index('index')['y'].copy()

# Keep only the names of the columns that select_features returns.
tsfresh_columns = list(select_features(test_X, test_y).columns)

In [7]:
# Every column of df_train except the id, target, label and group columns.
original_columns = list(
    df_train.columns.drop(['index', 'y', 'label', 'group'])
)

In [8]:
# Hand-maintained feature subset (30 column names).
# NOTE(review): presumably selected via a CatBoost run, judging by the name - confirm.
catboost_columns = [
    'spkt_welch_density__coeff_3',
    'abs_q25_5',
    'q25_roll_std_100',
    'abs_q75_6',
    'abs_q75_7',
    'spkt_welch_densitycoeff_2',
    'abs_q01_4',
    'iqr_6',
    'q05_roll_std_100',
    'q05_roll_std_1000',
    'median__roll_std',
    'abs_q01_5',
    "number_peaks{'n': 10}",
    'FFT_Mag_75q0',
    "value_count{'value': 1}",
    'q01_roll_std_100',
    'abs_q95_2',
    'abs_q95_6',
    'MA_1000MA_std_mean_7',
    'q05_roll_std_10',
    'q01_roll_std_1000',
    'abs_max_roll_mean_1000',
    'abs_q75_2',
    'abs_q05_6',
    '5000std_quantile25',
    "number_crossing_m{'m': 1}",
    "autocorrelation{'lag': 5}",
    'q75_roll_std_10',
    'q05_2',
    '5000smoothness_quantile05',
]

In [9]:
# Smallest hand-maintained feature subset (7 column names).
common_columns = [
    'q25_roll_std_100',
    'abs_q25_5',
    'iqr_6',
    'abs_q01_4',
    'abs_q75_7',
    'spkt_welch_density__coeff_3',
    'spkt_welch_densitycoeff_2',
]

In [10]:
# Mid-sized hand-maintained feature subset (14 column names).
main_columns = [
    'q25_roll_std_100',
    'abs_q25_5',
    'q05_roll_std_1000',
    'abs_q95_2',
    'abs_q75_6',
    'iqr_6',
    "autocorrelation{'lag': 5}",
    'median__roll_std',
    'abs_q01_4',
    'q05_roll_std_100',
    'abs_q75_7',
    "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
    'spkt_welch_density__coeff_3',
    'spkt_welch_densitycoeff_2',
]

In [11]:
# Largest hand-maintained feature subset (165 column names).
unique_columns = [
    'spkt_welch_density__coeff_31',
    'spkt_welch_density__coeff_3',
    '3th_peak_freq',
    'fft_coefficientcoeff_24__attr_"imag"',
    "autocorrelation{'lag': 5}",
    'iqr_9',
    'min__roll_std',
    'spkt_welch_densitycoeff_2',
    'abs_q01_7',
    'spkt_welch_density__coeff_63',
    "number_crossing_m{'m': 1}",
    'Hilbert_mean_1',
    'q95_9',
    'abs_min_3',
    'spkt_welch_density__coeff_57',
    'spkt_welch_densitycoeff_8',
    'abs_q05_9',
    '5000skewness_max_',
    'FFT_Mag_75q0',
    '5000smoothness_std_',
    'spkt_welch_density__coeff_28',
    '5000smoothness_entropy_',
    'spkt_welch_density__coeff_113',
    'abs_min_6',
    'fft_coefficientcoeff_20__attr_"abs"',
    "change_quantiles{'ql': 0.4, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
    'abs_q75_7',
    '5000smoothness_quantile05',
    'Hilbert_mean_2',
    '5000peak_peak_amp_max_',
    'abs_q25_7',
    "change_quantiles{'ql': 0.2, 'qh': 0.4, 'isabs': True, 'f_agg': 'mean'}",
    'fft_coefficientcoeff_32__attr_"imag"',
    'abs_min_2',
    '5000quantile25peak_to_average_power_ratio_',
    'partial_autocorrelationlag_1',
    'kurt_7',
    'abs_q25_5',
    'abs_max_1',
    'abs_q95_6',
    "number_peaks{'n': 1}",
    'max_to_min_5',
    'abs_q75_1',
    'fft_coefficientcoeff_62__attr_"abs"',
    'abs_max_5',
    'abs_max_roll_mean_1000',
    '4th_peak_psd',
    'q01_roll_std_100',
    'fft_coefficientcoeff_50__attr_"abs"',
    'med_6',
    'fft_coefficientcoeff_26__attr_"abs"',
    'abs_q95_8',
    'abs_q05_2',
    'fft_coefficientcoeff_56__attr_"angle"',
    'spkt_welch_density__coeff_117',
    'spkt_welch_density__coeff_30',
    'fft_coefficientcoeff_6__attr_"abs"',
    'peak_to_average_power_ratio__roll_mean',
    'fft_coefficientcoeff_21__attr_"abs"',
    "value_count{'value': -1}",
    "value_count{'value': 1}",
    'q05_roll_std_100',
    '5000clearance_factor_quantile25',
    'ave10_7',
    'abs_q75_6',
    'spkt_welch_density__coeff_25',
    'agg_autocorrelationf_agg_"mean"__maxlag_40',
    'abs_q95_2',
    'q95_roll_mean_1000',
    'fft_coefficientcoeff_24__attr_"real"',
    'q25_1',
    'abs_q05_6',
    "change_quantiles{'ql': 0.0, 'qh': 0.8, 'isabs': False, 'f_agg': 'mean'}",
    'abs_max_6',
    "binned_entropy{'max_bins': 10}",
    'med_4',
    'abs_max_2',
    'abs_max_3',
    'spkt_welch_density__coeff_84',
    'ave10_6',
    "autocorrelation{'lag': 4}",
    'med_2',
    'spkt_welch_density__coeff_42',
    'iqr',
    '5000kurtosis_quantile75',
    "change_quantiles{'ql': 0.2, 'qh': 1.0, 'isabs': False, 'f_agg': 'mean'}",
    'min__roll_mean',
    'q05_roll_std_10',
    "number_peaks{'n': 5}",
    'abs_q25_1',
    'max_to_min',
    'max_to_min_1',
    'FFT_Mag_25q0',
    'fft_coefficientcoeff_19__attr_"abs"',
    'iqr_8',
    'abs_q99_8',
    '5000std_quantile05',
    '5000quantile05median_',
    'mean_change_rate',
    'spkt_welch_density__coeff_4',
    'min_9',
    'abs_max_4',
    '5000rms_quantile25',
    "change_quantiles{'ql': 0.6, 'qh': 0.8, 'isabs': True, 'f_agg': 'mean'}",
    'spkt_welch_density__coeff_99',
    'abs_q75_2',
    'abs_q05_7',
    'q01_roll_std_1000',
    'abs_q01_6',
    'median__roll_std',
    'abs_max_7',
    'max_to_min_6',
    'spkt_welch_density__coeff_64',
    "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
    'abs_q05_1',
    'min_roll_std_100',
    'q75_roll_std_10',
    'abs_max_8',
    "change_quantiles{'ql': 0.4, 'qh': 0.6, 'isabs': False, 'f_agg': 'mean'}",
    'fft_coefficientcoeff_8__attr_"imag"',
    'fft_coefficientcoeff_80__attr_"imag"',
    "number_peaks{'n': 3}",
    'q99_roll_mean_1000',
    'spkt_welch_density__coeff_29',
    'kurt_3',
    '5000std_quantile25',
    'max_to_min_diff_5',
    '5000kurtosis_mean_',
    'abs_q25_8',
    'spkt_welch_density__coeff_38',
    'q25_roll_std_100',
    'spkt_welch_density__coeff_65',
    'ave10_2',
    'abs_q25_9',
    '5000min_quantile05',
    'fft_coefficientcoeff_36__attr_"abs"',
    "number_peaks{'n': 10}",
    'FFT_Mag_95q0',
    'q01_roll_std_10',
    'abs_q99_7',
    'abs_min_5',
    'spkt_welch_density__coeff_115',
    'abs_min_7',
    'MA_1000MA_std_mean_7',
    'spkt_welch_density__coeff_50',
    'q01_roll_mean_1000',
    'spkt_welch_density__coeff_41',
    'fft_coefficientcoeff_16__attr_"imag"',
    'q05_2',
    'med_8',
    'abs_max_roll_mean_100',
    'iqr_6',
    'partial_autocorrelationlag_5',
    'med_5',
    'spkt_welch_density__coeff_17',
    '5000skewness_mean_',
    'skew_1',
    'abs_q01_4',
    'abs_q95_7',
    'spkt_welch_density__coeff_79',
    'abs_q01_5',
    'q05_roll_std_1000',
    'max_9',
    '5000form_factor_quantile75',
    'q05_5',
]

In [17]:
# Open the trial store for the Ridge experiments.
# NOTE(review): with auto_commit=False, inserts presumably stay in memory until
# an explicit commit - confirm against the DFDB implementation.
db = DFDB(
    '../trial/ridge.pkl',
    auto_commit=False,
)

In [31]:
# Experiment configuration handed to EP.process: estimator, feature columns,
# cross-validation scheme and scaler.
param = {
    'algorithm': {
        'cls': 'Ridge',
        'fit': {},
        # NOTE(review): presumably forwarded to the Ridge constructor; newer
        # scikit-learn releases no longer accept `normalize` - confirm before upgrading.
        'init': {
            'alpha': 50000,
            'fit_intercept': True,
            'normalize': False,
            'copy_X': True,
            'max_iter': None,
            'tol': 0.001,
            'solver': 'auto',
            'random_state': 42,
        },
    },
    'columns': tsfresh_columns,
    'kfold': {
        'n_splits': 3,
        'random_state': 1985,
        'shuffle': True,
        'type': 'group',  # alternative value used in earlier trials: 'stratified'
    },
    'scaler': {'cls': 'StandardScaler'},
}

In [32]:
# Run a single experiment with the current `param`; EP.process receives
# `mytrial` as its `trial` argument and the loop below stores its entries.
mytrial = []
df_his, df_feature_importances, df_valid_pred, df_test_pred = EP.process(
    df_train,
    param,
    df_test=df_test,
    trial=mytrial,
    remark='remodel 667 use group',
)

# Push each collected trial into the trial store, then read back the full table.
# NOTE(review): no explicit commit is issued here although the store is opened
# with auto_commit=False - confirm whether that is intended.
for trial_i in mytrial:
    db.insert(trial_i)
df_trial = db.select()

In [33]:
# Show the key bookkeeping and MAE columns of every stored trial.
df_trial.loc[:, [
    'datetime', 'nfeatures', 'remark',
    'train_mae', 'train_mae_var',
    'val_mae', 'val_mae_var',
    'mae_diff',
]]

Unnamed: 0,datetime,nfeatures,remark,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
0,2019-05-18 13:50:08.547754,7,remodel 667 use group,2.580433,0.025526,2.449928,0.137262,-0.130505
1,2019-05-18 13:50:49.721884,7,remodel 667 use group,2.397321,0.003223,2.396167,0.013775,-0.001154
2,2019-05-18 13:52:51.908421,165,remodel 667 use group,2.14516,0.002662,2.180493,0.012586,0.035332
3,2019-05-18 13:53:08.943219,14,remodel 667 use group,2.301714,0.00307,2.30239,0.013065,0.000676
4,2019-05-18 13:53:54.830672,1071,remodel 667 use group,2.057224,0.002123,2.163357,0.008752,0.106134


In [59]:
# Run one more experiment with the current `param`, passing the existing
# `mytrial` list (no remark this time).
df_his, df_feature_importances, df_valid_pred, df_test_pred = EP.process(
    df_train,
    param,
    df_test=df_test,
    trial=mytrial,
)

In [60]:
# Tabulate the in-memory trials and surface two nested hyper-parameters
# (CV type and Ridge alpha) as flat columns for easier comparison.
df_trial = pd.DataFrame(mytrial)
df_trial['param-kfold'] = df_trial['param'].map(
    lambda trial_param: trial_param['kfold']['type']
)
df_trial['param-algorithm-init-alpha'] = df_trial['param'].map(
    lambda trial_param: trial_param['algorithm']['init']['alpha']
)
df_trial.loc[:, [
    'datetime', 'param-kfold', 'param-algorithm-init-alpha', 'nfeatures',
    'train_mae', 'train_mae_var',
    'val_mae', 'val_mae_var',
    'mae_diff',
]]

Unnamed: 0,datetime,param-kfold,param-algorithm-init-alpha,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
0,2019-05-14 12:49:36.998495,stratified,1.0,165,2.020883,1.6e-05,2.031414,0.00069,0.010531
1,2019-05-14 12:50:04.008149,stratified,1.0,165,2.020883,1.6e-05,2.031414,0.00069,0.010531
2,2019-05-14 12:50:17.591358,stratified,0.1,165,2.020934,1.6e-05,2.031516,0.000689,0.010582
3,2019-05-14 12:53:00.997937,stratified,1.0,165,2.020883,1.6e-05,2.031414,0.00069,0.010531
4,2019-05-16 11:23:59.922861,stratified,1.0,165,2.019544,1.3e-05,2.031924,0.000113,0.01238
5,2019-05-16 11:24:23.663528,group,1.0,165,1.987654,0.001902,2.211701,0.002852,0.224047
6,2019-05-16 11:25:46.874634,group,0.1,165,1.987732,0.0019,2.212782,0.002833,0.22505
7,2019-05-16 11:25:59.972194,group,10.0,165,1.987642,0.001909,2.203818,0.003079,0.216176
8,2019-05-16 11:26:11.550518,group,10.0,165,2.002236,0.01492,2.210635,0.216426,0.208399
9,2019-05-16 11:26:34.317097,group,10.0,165,1.987642,0.001909,2.203818,0.003079,0.216176


In [44]:
# Build the submission: the row-wise mean over every prediction column of
# df_test_pred (all columns except `index`; presumably one per fold - confirm),
# paired with the segment id stored in `index`.
# Both columns are passed as plain arrays so they are matched by row position.
# Assigning the `index` Series directly would align on df_test_pred's index
# labels and could produce NaN or mismatched seg_ids if that index is not 0..n-1.
df_submit = pd.DataFrame({
    'time_to_failure': np.mean(df_test_pred.drop(columns=['index']).values, axis=1),
    'seg_id': df_test_pred['index'].values,
})
df_submit.to_csv('submission.csv', index=False)

In [63]:
# Snapshot the in-memory trials as a pickled DataFrame.
# NOTE(review): this is the same path that is opened through DFDB earlier in the
# notebook - confirm that overwriting that file is intended.
df_trial = pd.DataFrame(data=mytrial)
df_trial.to_pickle(path='../trial/ridge.pkl')