In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler

import lightgbm as lgb
import catboost as cb
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor

from tsfresh.examples import load_robot_execution_failures
from tsfresh import extract_features, select_features
import optuna

from common import EP

import types

Using TensorFlow backend.


In [2]:
%%HTML
<style>
   /* Display-only tweak: widen the notebook page containers so wide tables fit.
      NOTE(review): these element ids look like the classic Notebook UI ones - confirm they still match the front end in use. */
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [4]:
# Load the pre-computed train / test feature frames (train first, then test).
# NOTE: pickle files execute code on load - only read files produced by this project.
df_train, df_test = map(
    pd.read_pickle,
    ('../feats/df_train.pkl', '../feats/df_test.pkl'),
)

In [5]:
# Integer label derived from the target: int(y) for y < 15, everything else is capped at 15.
df_train['label'] = df_train['y'].apply(lambda x:  int(x) if x<15 else 15)

# Derive 'group' from 'season' (season 17 is folded into group 1), then drop 'season'.
# The guard makes the cell safe to re-run: once 'season' has been dropped there is nothing
# left to do, instead of failing with a KeyError on the second execution.
if 'season' in df_train.columns:
    # Work on a copy: the array returned by `.values` can be a view of the frame (writing to
    # it would silently edit df_train['season']) or read-only under pandas Copy-on-Write.
    group = df_train['season'].values.copy()
    group[group == 17] = 1
    df_train['group'] = group
    df_train = df_train.drop(columns=['season'])
elif 'group' not in df_train.columns:
    # Neither the input column nor the derived one exists: fail loudly rather than continue.
    raise KeyError("df_train has neither a 'season' nor a 'group' column")

In [6]:
# Feature relevance filtering with tsfresh on the TRAINING data
# (despite the `test_` prefix, both objects below are built from df_train).
# Target and feature matrix are both re-indexed by the 'index' column so that they share the same index.
test_y = df_train['y'].copy()
test_y.index = df_train['index']

test_X = df_train.drop(columns=['y', 'index', 'group', 'label']).copy()
test_X.index = df_train['index']

# Names of the columns that select_features kept.
tsfresh_columns = select_features(X=test_X, y=test_y).columns.tolist()

In [7]:
# Every column of df_train except the bookkeeping / target columns.
original_columns = (
    df_train
    .drop(columns=['index', 'y', 'label', 'group'])
    .columns
    .tolist()
)

In [2]:
# mytrial = []
# Reload the stored trial history as a list with one dict per stored row.
# NOTE: pickle files execute code on load - only read files produced by this project.
mytrial = [
    record
    for record in pd.read_pickle('../trial/gradientboosting.pkl').T.to_dict().values()
]
df_trial = pd.DataFrame(data=mytrial)
len(mytrial)

327

In [11]:
# Experiment configuration handed to EP.process below. Top-level keys:
#   'columns'   - names of the 70 feature columns used for this run
#   'kfold'     - cross-validation settings (3 shuffled splits, fixed seed, type 'stratified')
#   'scaler'    - name of the scaler class
#   'algorithm' - name of the estimator class plus its constructor ('init') and fit ('fit') kwargs
# How each key is interpreted is defined in common.EP, which is not visible here.
# NOTE(review): some feature names follow different patterns (e.g. 'spkt_welch_densitycoeff_2'
# vs 'spkt_welch_density__coeff_3'); presumably they mirror the column names produced
# upstream - confirm each one exists in df_train.
param={'columns': ['q25_roll_std_100',
  'spkt_welch_density__coeff_3',
  'abs_q01_4',
  'abs_q25_5',
  'q05_roll_std_1000',
  'median__roll_std',
  'iqr_6',
  'spkt_welch_densitycoeff_2',
  'q05_roll_std_100',
  'q05_2',
  'abs_q75_7',
  "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
  "value_count{'value': -1}",
  'abs_q95_2',
  'MA_1000MA_std_mean_7',
  '5000smoothness_std_',
  '5000quantile25peak_to_average_power_ratio_',
  'FFT_Mag_25q0',
  '5000skewness_max_',
  'q05_5',
  'max_to_min_5',
  "autocorrelation{'lag': 5}",
  'abs_q05_2',
  'min__roll_std',
  '5000peak_peak_amp_max_',
  'max_to_min_diff_5',
  '5000form_factor_quantile75',
  'iqr',
  'kurt_7',
  'spkt_welch_density__coeff_4',
  "number_peaks{'n': 3}",
  'spkt_welch_density__coeff_42',
  'fft_coefficientcoeff_20__attr_"abs"',
  '5000quantile05median_',
  'abs_max_2',
  "number_peaks{'n': 1}",
  'abs_max_8',
  "number_peaks{'n': 5}",
  'spkt_welch_density__coeff_17',
  'abs_max_3',
  '4th_peak_psd',
  "autocorrelation{'lag': 4}",
  '5000min_quantile05',
  '5000kurtosis_mean_',
  'min_9',
  'ave10_6',
  'spkt_welch_density__coeff_57',
  'abs_max_7',
  'agg_autocorrelationf_agg_"mean"__maxlag_40',
  'spkt_welch_density__coeff_31',
  'kurt_3',
  '5000skewness_mean_',
  '5000kurtosis_quantile75',
  'abs_q95_6',
  'spkt_welch_densitycoeff_8',
  'abs_min_5',
  'med_4',
  'abs_q01_7',
  'spkt_welch_density__coeff_64',
  'abs_min_3',
  'fft_coefficientcoeff_19__attr_"abs"',
  'fft_coefficientcoeff_62__attr_"abs"',
  'spkt_welch_density__coeff_99',
  'partial_autocorrelationlag_1',
  'fft_coefficientcoeff_56__attr_"angle"',
  'fft_coefficientcoeff_6__attr_"abs"',
  'iqr_8',
  'abs_q05_1',
  'abs_max_6',
  'fft_coefficientcoeff_36__attr_"abs"'],
 'kfold': {'n_splits': 3,
  'random_state': 1985,
  'shuffle': True,
  'type': 'stratified'},
 'scaler': {'cls': 'StandardScaler'},
 'algorithm': {'cls': 'GradientBoostingRegressor',
  'init': {'max_depth': 3,
   'learning_rate': 0.04018024140881379,
   'n_estimators': 253,
   'subsample': 0.6846361552509973,
   # NOTE(review): scikit-learn documents `alpha` as used only when loss is 'huber' or
   # 'quantile'; no 'loss' is set here, so unless EP.process adds one this value is
   # probably ignored - confirm.
   'alpha': 0.9990262087522855,
   'random_state': 2784},
  'fit': {}},
}

In [12]:
# run one try
# Evaluate the `param` configuration once; EP.process returns four frames:
# history, feature importances, validation predictions and test predictions.
# `mytrial` is passed in as `trial` (presumably EP.process appends the run's record to it - confirm).
df_his, df_feature_importances, df_valid_pred, df_test_pred = EP.process(
    df_train,
    param,
    df_test=df_test,
    trial=mytrial,
)

In [16]:
#check feature_importances
# df_feature_importances = df_trial.loc[14]['df_feature_importances']
# if type(df_feature_importances)==pd.DataFrame:
#     sorted_columns = EP.evaluate(df_feature_importances, key='average_model_weight')
# else:
#     sorted_columns = df_trial.loc[14]['param']['columns']
# df_feature_importances.sort_values(by=['average_permutation_weight'], ascending=False)
# len(sorted_columns)

In [13]:
# Rebuild the trial table from the in-memory list and show the summary metrics of the last row.
df_trial = pd.DataFrame(data=mytrial)
#[(df_trial['mae_diff']<.05)].sort_values(by=['val_mae'], ascending=True)
df_trial.tail(1)[['datetime','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
220,2019-05-16 11:03:43.728017,70,1.84528,2e-05,1.906736,0.000234,0.061456


In [14]:
### tune hyperparameters
def objective(trial):
    """Optuna objective for tuning GradientBoostingRegressor on the `param` feature set.

    Samples one set of constructor arguments, runs a full EP.process evaluation with the
    same columns / k-fold / scaler settings as `param`, and stores the summary metrics on
    the trial as user attributes ('val_mae', 'train_mae', 'mae_diff', 'val_mae_var',
    'train_mae_var').

    Parameters
    ----------
    trial : optuna trial object
        Used to draw the hyperparameters and to attach the user attributes.

    Returns
    -------
    float
        ``abs(val_mae - train_mae) * val_mae`` - small when the validation score is low
        AND close to the training score. The study minimises this value.

    Notes
    -----
    Relies on the notebook globals `param`, `df_train`, `df_test`, `mytrial` and on
    `EP.process`; every call is tagged with remark 'tune 220' and receives `mytrial`
    (presumably EP.process appends its record there - confirm in common.EP).
    """
    # Search space. The order of the suggest_* calls is kept as in the original cell.
    max_depth = trial.suggest_int('max_depth', 2, 6)
    learning_rate = trial.suggest_uniform('learning_rate', 0.01, 0.4)
    n_estimators = trial.suggest_int('n_estimators', 100,500)
    subsample = trial.suggest_uniform('subsample', 0.6, 1.0)
    # NOTE(review): scikit-learn documents `alpha` as used only for loss='huber'/'quantile';
    # no loss is set below, so this dimension may have no effect - confirm.
    alpha = trial.suggest_uniform('alpha', 0.00001, 1.0)
    # NOTE(review): the model seed is tuned like a hyperparameter, so part of the measured
    # "improvement" may be seed luck rather than a better configuration.
    random_state = trial.suggest_int('random_state', 1, 9999)

    args={
        'columns':param['columns'].copy(),
        'kfold':{
            'n_splits': 3,
            'random_state': 1985,
            'shuffle': True,
            'type': 'stratified'
        },
        'scaler':{
            'cls':'StandardScaler',
        },
        'algorithm':{
            'cls':'GradientBoostingRegressor',
            'init':{
                "max_depth":max_depth,
                "learning_rate":learning_rate,
                "n_estimators":n_estimators,
                "subsample":subsample,
                "alpha":alpha,
                "random_state":random_state,
            },
            # No extra fit-time keyword arguments are passed for this estimator.
            'fit':{},
        },
    }

    df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train, args, df_test = df_test, trial=mytrial, remark='tune 220')

    # Mean / variance of the 'valid' and 'train' columns of the returned history frame.
    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)
    mae_diff = val_mae_mean - train_mae_mean

    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', mae_diff)
    trial.set_user_attr('val_mae_var', val_mae_var)
    # Previously computed but never stored; recorded so all four summary statistics are kept.
    trial.set_user_attr('train_mae_var', train_mae_var)

    return np.abs(mae_diff)*val_mae_mean

# Create a study with Optuna's defaults and evaluate `objective` for up to 200 trials.
# Each trial runs a full EP.process evaluation, so this cell is long-running.
study = optuna.create_study()
study.optimize(
    objective,
    n_trials=200,
)

[I 2019-05-16 11:11:04,714] Finished a trial resulted in value: 1.1003866575630066. Current best value is 1.1003866575630066 with parameters: {'max_depth': 5, 'learning_rate': 0.2673759810262276, 'n_estimators': 455, 'subsample': 0.815126163788147, 'alpha': 0.5579249669410118, 'random_state': 3603}.
[I 2019-05-16 11:12:10,862] Finished a trial resulted in value: 0.3637291200466947. Current best value is 0.3637291200466947 with parameters: {'max_depth': 4, 'learning_rate': 0.17193290826962068, 'n_estimators': 134, 'subsample': 0.7292935105100071, 'alpha': 0.6361955441244439, 'random_state': 9478}.
[I 2019-05-16 11:14:32,771] Finished a trial resulted in value: 0.637563537069587. Current best value is 0.3637291200466947 with parameters: {'max_depth': 4, 'learning_rate': 0.17193290826962068, 'n_estimators': 134, 'subsample': 0.7292935105100071, 'alpha': 0.6361955441244439, 'random_state': 9478}.
[I 2019-05-16 11:17:35,291] Finished a trial resulted in value: 0.5143052245062809. Current be

[I 2019-05-16 12:10:26,171] Finished a trial resulted in value: 0.2401239001531168. Current best value is 0.03933910561150104 with parameters: {'max_depth': 2, 'learning_rate': 0.01277049228577442, 'n_estimators': 500, 'subsample': 0.9397787290669495, 'alpha': 0.8199914338254779, 'random_state': 2571}.
[I 2019-05-16 12:13:23,611] Finished a trial resulted in value: 0.2960989339604943. Current best value is 0.03933910561150104 with parameters: {'max_depth': 2, 'learning_rate': 0.01277049228577442, 'n_estimators': 500, 'subsample': 0.9397787290669495, 'alpha': 0.8199914338254779, 'random_state': 2571}.
[I 2019-05-16 12:19:36,926] Finished a trial resulted in value: 1.4674707941630605. Current best value is 0.03933910561150104 with parameters: {'max_depth': 2, 'learning_rate': 0.01277049228577442, 'n_estimators': 500, 'subsample': 0.9397787290669495, 'alpha': 0.8199914338254779, 'random_state': 2571}.
[I 2019-05-16 12:24:29,986] Finished a trial resulted in value: 0.7811414846532656. Curr

[I 2019-05-16 13:20:48,282] Finished a trial resulted in value: 0.11189077221138567. Current best value is 0.03933910561150104 with parameters: {'max_depth': 2, 'learning_rate': 0.01277049228577442, 'n_estimators': 500, 'subsample': 0.9397787290669495, 'alpha': 0.8199914338254779, 'random_state': 2571}.
[I 2019-05-16 13:23:20,190] Finished a trial resulted in value: 0.7247767989869182. Current best value is 0.03933910561150104 with parameters: {'max_depth': 2, 'learning_rate': 0.01277049228577442, 'n_estimators': 500, 'subsample': 0.9397787290669495, 'alpha': 0.8199914338254779, 'random_state': 2571}.
[I 2019-05-16 13:24:20,486] Finished a trial resulted in value: 0.1259890815636178. Current best value is 0.03933910561150104 with parameters: {'max_depth': 2, 'learning_rate': 0.01277049228577442, 'n_estimators': 500, 'subsample': 0.9397787290669495, 'alpha': 0.8199914338254779, 'random_state': 2571}.
[I 2019-05-16 13:26:30,744] Finished a trial resulted in value: 0.4284329804177467. Cur

[I 2019-05-16 14:07:04,955] Finished a trial resulted in value: 0.190059554753343. Current best value is 0.022932735860323395 with parameters: {'max_depth': 2, 'learning_rate': 0.01032226388250946, 'n_estimators': 293, 'subsample': 0.7967808427773099, 'alpha': 0.5385476572038151, 'random_state': 8553}.
[I 2019-05-16 14:08:01,309] Finished a trial resulted in value: 0.1126128170177638. Current best value is 0.022932735860323395 with parameters: {'max_depth': 2, 'learning_rate': 0.01032226388250946, 'n_estimators': 293, 'subsample': 0.7967808427773099, 'alpha': 0.5385476572038151, 'random_state': 8553}.
[I 2019-05-16 14:10:05,246] Finished a trial resulted in value: 0.05120814159760063. Current best value is 0.022932735860323395 with parameters: {'max_depth': 2, 'learning_rate': 0.01032226388250946, 'n_estimators': 293, 'subsample': 0.7967808427773099, 'alpha': 0.5385476572038151, 'random_state': 8553}.
[I 2019-05-16 14:11:01,894] Finished a trial resulted in value: 0.052598439211273794.

[I 2019-05-16 14:52:30,086] Finished a trial resulted in value: 0.5107889122628924. Current best value is 0.022932735860323395 with parameters: {'max_depth': 2, 'learning_rate': 0.01032226388250946, 'n_estimators': 293, 'subsample': 0.7967808427773099, 'alpha': 0.5385476572038151, 'random_state': 8553}.
[I 2019-05-16 14:53:50,022] Finished a trial resulted in value: 0.2677602112941815. Current best value is 0.022932735860323395 with parameters: {'max_depth': 2, 'learning_rate': 0.01032226388250946, 'n_estimators': 293, 'subsample': 0.7967808427773099, 'alpha': 0.5385476572038151, 'random_state': 8553}.


KeyboardInterrupt: 

In [43]:
# Take the stored test predictions of the trial whose row label is 342.
# NOTE(review): 342 is a hand-picked row label; it must exist in the current df_trial.
df_test_pred = df_trial.loc[342, 'df_test_pred']

In [44]:
# Build the submission: the row-wise mean of every column of df_test_pred except 'index'
# becomes 'time_to_failure'; the 'index' column becomes 'seg_id'.
df_submit = pd.DataFrame()
df_submit['time_to_failure'] = np.mean(df_test_pred.drop(columns=['index']).values, axis=1)
# Assign by position (`.values`), not as a Series: df_submit has a fresh 0..n-1 index, so a
# Series assignment would be index-aligned and yield NaN / shuffled ids whenever
# df_test_pred carries any other index.
df_submit['seg_id'] = df_test_pred['index'].values
df_submit.to_csv('submission.csv', index=False)

In [None]:
# Refresh the tabular view of the trial list (one row per entry of `mytrial`).
df_trial = pd.DataFrame(data=mytrial)

In [3]:
#[df_trial['mae_diff']<.05].sort_values(by=['val_mae']) | (df_trial['remark']=='tune 220')
# Trials tagged 'tune 220' whose train/validation gap is below 0.05, best validation MAE first.
(
    df_trial[(df_trial['remark']=='tune 220')&(df_trial['mae_diff']<.05)]
    .sort_values(by=['val_mae'], ascending=True)
    [['datetime','remark', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]
)

Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
304,2019-05-16 14:12:57.544171,tune 220,70,1.891917,1.2e-05,1.93832,0.00018,0.046403
261,2019-05-16 12:50:04.270091,tune 220,70,1.895138,1.8e-05,1.942202,0.000159,0.047064
251,2019-05-16 12:25:03.693296,tune 220,70,1.905621,9e-06,1.950264,0.000188,0.044643
306,2019-05-16 14:16:23.815368,tune 220,70,1.914761,1.9e-05,1.951837,0.000144,0.037076
296,2019-05-16 14:04:17.076904,tune 220,70,1.913728,1.9e-05,1.95266,0.000209,0.038932
252,2019-05-16 12:27:48.729137,tune 220,70,1.929379,1.6e-05,1.964114,0.000176,0.034736
298,2019-05-16 14:06:05.931616,tune 220,70,1.934922,1.5e-05,1.968955,0.000155,0.034033
287,2019-05-16 13:53:05.005114,tune 220,70,1.936148,1.8e-05,1.970082,0.000166,0.033934
303,2019-05-16 14:12:09.034276,tune 220,70,1.942505,1.6e-05,1.972552,0.000159,0.030047
267,2019-05-16 13:10:04.468930,tune 220,70,1.942479,2.2e-05,1.974155,0.000162,0.031677


In [15]:
# Persist the accumulated trial records; this overwrites the file that is read back
# when the trial history is reloaded.
df_trial = pd.DataFrame(data=mytrial)
df_trial.to_pickle(
    '../trial/gradientboosting.pkl'
)