In [15]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler
import lightgbm as lgb
import catboost as cb

from tsfresh.examples import load_robot_execution_failures
from tsfresh import extract_features, select_features
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression, Ridge, Lasso

import optuna

from common import EP

import types

In [2]:
%%HTML
<!-- Cosmetic only: sets the width of three page containers by element id.
     NOTE(review): these ids look like the classic Jupyter Notebook UI;
     confirm they still match the front end in use. -->
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [3]:
# Load the pre-computed train and test feature tables (train first, then test).
# NOTE: unpickling executes arbitrary code, so only load files produced by a
# trusted step of this project.
df_train, df_test = map(
    pd.read_pickle,
    ('../feats/df_train.pkl', '../feats/df_test.pkl'),
)

In [4]:
# Integer class label derived from the target: values below 15 are truncated
# to their integer part, everything else (including NaN, which compares False
# against 15) is assigned class 15.
df_train['label'] = np.where(
    df_train['y'] < 15, np.trunc(df_train['y']), 15
).astype('int64')

# Grouping key derived from 'season', with season 17 folded into season 1
# (the reason for that merge is not visible in this notebook).
# Built with a non-mutating replace instead of writing into `.values`: the
# array returned by `.values` can be a view of df_train's own data, and it is
# read-only when pandas Copy-on-Write is enabled.
df_train['group'] = df_train['season'].replace(17, 1).to_numpy()

# 'season' is now represented by 'group'; drop it from the frame.
df_train = df_train.drop(columns=['season'])

In [5]:
# Feature matrix and target for tsfresh's relevance filter, both re-keyed by
# the 'index' column so that X and y share the same row labels.
# NOTE(review): despite the `test_` prefix these are built from df_train.
test_X = df_train.set_index('index').drop(columns=['y', 'group', 'label'])
test_y = df_train.set_index('index')['y']

# Names of the columns that select_features keeps.
tsfresh_columns = select_features(test_X, test_y).columns.tolist()

In [6]:
# All columns of df_train except the id, the target and the derived
# label/group columns.
original_columns = list(
    df_train.columns.drop(['index', 'y', 'label', 'group'])
)

In [7]:
# Trial records for this session; starts empty and is passed to EP.process.
mytrial = list()
# Alternative start: reload previously saved trials instead of an empty list.
# NOTE(review): the path below points at the catboost trials, while this
# notebook saves to '../trial/ridge.pkl' -- adjust before enabling.
# mytrial = list(pd.read_pickle('trial/catboost.pkl').T.to_dict().values())
# df_trial = pd.DataFrame(mytrial)
# len(mytrial)

In [8]:
# Fixed subset of 165 feature column names used for this experiment; it is
# passed to the run through param['columns'].
# NOTE(review): how this subset was chosen is not visible in this notebook.
columns = ["change_quantiles{'ql': 0.4, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
 'spkt_welch_density__coeff_65',
 'q05_roll_std_10',
 'Hilbert_mean_2',
 'min__roll_std',
 'spkt_welch_density__coeff_4',
 'abs_max_6',
 'spkt_welch_density__coeff_99',
 'abs_q01_7',
 'abs_max_8',
 'kurt_7',
 'partial_autocorrelationlag_5',
 'q05_2',
 'abs_max_roll_mean_100',
 'spkt_welch_density__coeff_50',
 'abs_q01_6',
 'ave10_6',
 "binned_entropy{'max_bins': 10}",
 'min_roll_std_100',
 'spkt_welch_density__coeff_113',
 'med_6',
 'spkt_welch_density__coeff_42',
 '4th_peak_psd',
 '5000min_quantile05',
 'fft_coefficientcoeff_36__attr_"abs"',
 'spkt_welch_density__coeff_41',
 'q01_roll_std_100',
 'max_to_min',
 'spkt_welch_density__coeff_17',
 'abs_min_5',
 'q25_roll_std_100',
 '5000skewness_mean_',
 "change_quantiles{'ql': 0.2, 'qh': 0.4, 'isabs': True, 'f_agg': 'mean'}",
 'peak_to_average_power_ratio__roll_mean',
 'fft_coefficientcoeff_20__attr_"abs"',
 'fft_coefficientcoeff_50__attr_"abs"',
 "change_quantiles{'ql': 0.2, 'qh': 1.0, 'isabs': False, 'f_agg': 'mean'}",
 'q01_roll_std_1000',
 'abs_max_5',
 'FFT_Mag_25q0',
 'abs_q95_7',
 '5000skewness_max_',
 '5000std_quantile05',
 'abs_q25_5',
 "number_peaks{'n': 3}",
 'q05_roll_std_1000',
 'abs_q99_7',
 'abs_q95_2',
 'q75_roll_std_10',
 'spkt_welch_density__coeff_84',
 'abs_q75_6',
 'abs_max_3',
 'kurt_3',
 "change_quantiles{'ql': 0.4, 'qh': 0.6, 'isabs': False, 'f_agg': 'mean'}",
 'iqr',
 'spkt_welch_density__coeff_64',
 'fft_coefficientcoeff_8__attr_"imag"',
 'spkt_welch_density__coeff_29',
 'iqr_6',
 "autocorrelation{'lag': 5}",
 'spkt_welch_density__coeff_57',
 'spkt_welch_density__coeff_117',
 'q25_1',
 "number_crossing_m{'m': 1}",
 'fft_coefficientcoeff_32__attr_"imag"',
 'fft_coefficientcoeff_21__attr_"abs"',
 'abs_q95_6',
 'Hilbert_mean_1',
 'skew_1',
 'iqr_9',
 'ave10_2',
 'median__roll_std',
 '5000clearance_factor_quantile25',
 'abs_q95_8',
 '5000smoothness_quantile05',
 'med_4',
 'spkt_welch_density__coeff_28',
 'abs_max_roll_mean_1000',
 '5000kurtosis_mean_',
 'iqr_8',
 '5000kurtosis_quantile75',
 'spkt_welch_density__coeff_31',
 'med_5',
 'min__roll_mean',
 'abs_q01_4',
 'fft_coefficientcoeff_24__attr_"imag"',
 'max_to_min_6',
 'abs_q75_2',
 'q05_5',
 'spkt_welch_density__coeff_25',
 'q05_roll_std_100',
 'abs_q75_7',
 'abs_q01_5',
 'FFT_Mag_75q0',
 'mean_change_rate',
 'med_8',
 'q01_roll_std_10',
 'fft_coefficientcoeff_56__attr_"angle"',
 '3th_peak_freq',
 'abs_q05_9',
 'abs_min_3',
 '5000smoothness_std_',
 'fft_coefficientcoeff_24__attr_"real"',
 'abs_q05_7',
 'q99_roll_mean_1000',
 'abs_q99_8',
 '5000rms_quantile25',
 'abs_max_7',
 "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
 'q95_roll_mean_1000',
 '5000quantile25peak_to_average_power_ratio_',
 "number_peaks{'n': 10}",
 'abs_q05_6',
 'spkt_welch_density__coeff_3',
 'spkt_welch_density__coeff_38',
 'spkt_welch_density__coeff_115',
 'abs_q25_9',
 'fft_coefficientcoeff_62__attr_"abs"',
 'max_to_min_diff_5',
 'abs_q05_2',
 'fft_coefficientcoeff_80__attr_"imag"',
 "number_peaks{'n': 5}",
 "autocorrelation{'lag': 4}",
 'abs_min_2',
 'FFT_Mag_95q0',
 'spkt_welch_densitycoeff_8',
 'abs_q05_1',
 'min_9',
 "change_quantiles{'ql': 0.0, 'qh': 0.8, 'isabs': False, 'f_agg': 'mean'}",
 "number_peaks{'n': 1}",
 'partial_autocorrelationlag_1',
 'q01_roll_mean_1000',
 'abs_max_1',
 'max_to_min_1',
 'abs_min_6',
 'abs_min_7',
 'fft_coefficientcoeff_19__attr_"abs"',
 "value_count{'value': 1}",
 'abs_q25_7',
 "change_quantiles{'ql': 0.6, 'qh': 0.8, 'isabs': True, 'f_agg': 'mean'}",
 'MA_1000MA_std_mean_7',
 "value_count{'value': -1}",
 'abs_q25_1',
 'spkt_welch_density__coeff_79',
 'agg_autocorrelationf_agg_"mean"__maxlag_40',
 'max_to_min_5',
 'q95_9',
 'spkt_welch_densitycoeff_2',
 '5000peak_peak_amp_max_',
 'fft_coefficientcoeff_16__attr_"imag"',
 '5000form_factor_quantile75',
 'fft_coefficientcoeff_26__attr_"abs"',
 'spkt_welch_density__coeff_63',
 'med_2',
 '5000std_quantile25',
 'max_9',
 'fft_coefficientcoeff_6__attr_"abs"',
 'spkt_welch_density__coeff_30',
 '5000smoothness_entropy_',
 'abs_max_2',
 'abs_max_4',
 '5000quantile05median_',
 'ave10_7',
 'abs_q25_8',
 'abs_q75_1']

In [32]:
# Experiment configuration passed to EP.process: estimator, feature subset,
# feature-importance options, cross-validation scheme and scaler.
param = {
    'algorithm': {
        'cls': 'Ridge',
        'fit': {},
        'init': {
            'alpha': 1.0,
            'fit_intercept': True,
            # NOTE(review): scikit-learn deprecated Ridge's `normalize` in 1.0
            # and removed it in 1.2; if EP forwards `init` to the estimator
            # constructor this key will fail on newer versions -- confirm.
            'normalize': False,
            'copy_X': True,
            'max_iter': None,
            'tol': 0.001,
            # Ridge solvers: auto, svd, cholesky, lsqr, sparse_cg, sag, saga
            'solver': 'auto',
            'random_state': 42,
        },
    },
    'columns': columns,
    'feature_importance': {
        'is_output': False,
        'permutation_feature_importance': False,
        'permutation_random_state': 1,
    },
    'kfold': {
        'n_splits': 8,
        'random_state': 1985,
        'shuffle': True,
        'type': 'stratified',
    },
    'scaler': {
        'cls': 'StandardScaler',
    },
}

In [33]:
# Execute one experiment with the configuration in `param`.
# EP.process returns four frames: history, feature importances, validation
# predictions and test predictions.
# NOTE(review): `mytrial` is read back by the summary cell, so EP.process
# presumably appends the trial record to it in place -- confirm in common.EP.
(
    df_his,
    df_feature_importances,
    df_valid_pred,
    df_test_pred,
) = EP.process(df_train, param, df_test=df_test, trial=mytrial)

In [34]:
# Tabulate every trial recorded so far and show the headline metrics.
# The bare expression on the last line is what the cell displays.
df_trial = pd.DataFrame(mytrial)
df_trial.loc[:, [
    'datetime',
    'nfeatures',
    'train_mae',
    'train_mae_var',
    'val_mae',
    'val_mae_var',
    'mae_diff',
]]

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
0,2019-05-14 12:49:36.998495,165,2.020883,1.6e-05,2.031414,0.00069,0.010531
1,2019-05-14 12:50:04.008149,165,2.020883,1.6e-05,2.031414,0.00069,0.010531
2,2019-05-14 12:50:17.591358,165,2.020934,1.6e-05,2.031516,0.000689,0.010582
3,2019-05-14 12:53:00.997937,165,2.020883,1.6e-05,2.031414,0.00069,0.010531


In [44]:
# Build the submission file: one row per test segment, whose prediction is the
# row-wise mean over every prediction column of df_test_pred (all columns
# except 'index').
# Both columns are taken as plain arrays so they are paired by position.
# Assigning df_test_pred['index'] as a Series would instead align on
# df_test_pred's row labels against a fresh 0..n-1 index, yielding NaN or
# mispaired seg_ids whenever df_test_pred is not range-indexed.
df_submit = pd.DataFrame({
    'time_to_failure': df_test_pred.drop(columns=['index']).to_numpy().mean(axis=1),
    'seg_id': df_test_pred['index'].to_numpy(),
})
df_submit.to_csv('submission.csv', index=False)

In [35]:
# Persist the accumulated trial table to disk as a pickle.
df_trial.to_pickle('../trial/ridge.pkl')