In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler
import lightgbm as lgb
import catboost as cb

from tsfresh.examples import load_robot_execution_failures
from tsfresh import extract_features, select_features
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from fastFM import als, mcmc, sgd
# from pyfm import pylibfm

import optuna

from common import EP
from dfdb import DFDB

import types
import copy

Using TensorFlow backend.


In [2]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [3]:
pd.set_option('display.max_colwidth', -1)

In [4]:
df_train = pd.read_pickle('../feats/df_train.pkl')
df_test = pd.read_pickle('../feats/df_test.pkl')

In [5]:
df_train['label'] = df_train['y'].apply(lambda x:  int(x) if x<15 else 15)
group = df_train['season'].values
group[np.where(group==17)[0]] = 1
df_train['group'] = group
df_train = df_train.drop(columns=['season'])

In [6]:
test_X = df_train.drop(columns=['y','index','group','label']).copy()
test_X.index = df_train['index']
test_y = df_train['y'].copy()
test_y.index = df_train['index']
tsfresh_columns = select_features(test_X, test_y).columns.tolist()

In [7]:
original_columns = df_train.columns.drop(['index','y','label','group']).tolist()

In [8]:
common_columns50 = ["number_peaks{'n': 5}",
 'abs_q75_6',
 'q01_roll_std_100',
 '5000crest_factor_quantile75',
 'abs_q01_4',
 'q25_roll_std_100',
 'q05_roll_std_10',
 'median__roll_std',
 'abs_q05_6',
 '5000form_factor_quantile75',
 '5000smoothness_quantile05',
 "quantile{'q': 0.9}",
 'abs_q75_2',
 'q01_roll_std_1000',
 '5000quantile75mean_',
 'spkt_welch_density__coeff_3',
 "number_peaks{'n': 10}",
 "number_peaks{'n': 1}",
 '5000smoothness_mean_',
 'abs_q25_5',
 '5000std_quantile05',
 '5000smoothness_std_',
 '5000smoothness_median_',
 '5000median_variance_',
 'spkt_welch_density__coeff_4',
 '5000variance_quantile25',
 'abs_q95_2',
 'abs_q75_7',
 'q05_5',
 '5000smoothness_entropy_',
 '5000smoothness_quantile25',
 'q01_2',
 'q05_roll_std_100',
 '3th_peak_freq',
 "autocorrelation{'lag': 5}",
 "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
 'q01_roll_std_10',
 'abs_q01_3',
 '5000variance_median_',
 'q75_roll_std_10',
 '5000skewness_max_',
 'iqr_3',
 'abs_q01_5',
 'q75_roll_mean_10',
 '5000quantile99median_',
 'iqr_6',
 'Hilbert_mean_6',
 'q05_roll_std_1000',
 "number_peaks{'n': 3}",
 'spkt_welch_densitycoeff_2',
 '5000std_median_',
 '5000std_quantile25']

In [9]:
common_columns75 = ['abs_q75_6',
 'abs_q01_4',
 'q25_roll_std_100',
 'q05_roll_std_10',
 'median__roll_std',
 '5000smoothness_quantile05',
 'spkt_welch_density__coeff_3',
 "number_peaks{'n': 10}",
 'abs_q25_5',
 'abs_q75_7',
 'q05_5',
 'q05_roll_std_100',
 "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
 'iqr_6',
 'q05_roll_std_1000']

In [10]:
common_columns95 = ['abs_q75_6',
 'abs_q01_4',
 'q25_roll_std_100',
 "number_peaks{'n': 10}",
 'abs_q25_5',
 'q05_roll_std_100',
 'iqr_6',
 'q05_roll_std_1000']

In [17]:
# mytrial = []
db = DFDB('../trial2/knn.pkl', auto_commit=False)

In [24]:
param = {'algorithm': {'cls': 'KNeighborsRegressor',
  'fit': {},
  'init': {'n_neighbors': 477,
   'weights': 'uniform',
   'algorithm': 'ball_tree',
   'leaf_size': 30,
   'p': 2,
   'metric': 'minkowski'}},
 'columns':common_columns95 ,
 'kfold': {'n_splits': 3,
  'random_state': 1985,
  'shuffle': True,
  'type': 'group'},
 'scaler': {'cls': 'StandardScaler'}}

In [25]:
# run one try
mytrial = []
df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train, param, df_test = df_test, trial=mytrial, remark='try common_columns95')
db.insert(mytrial[0])
df_trial = db.select()

In [26]:
df_trial[['datetime','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff','remark']]

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff,remark
0,2019-05-20 00:42:09.757131,52,2.009122,0.002402,2.110341,0.007128,0.101219,try common_columns50
1,2019-05-20 00:47:07.028265,15,2.023477,0.002784,2.075365,0.013826,0.051888,try common_columns75
2,2019-05-20 00:48:39.095936,8,2.042961,0.002507,2.081918,0.012869,0.038957,try common_columns95


In [27]:
db.commit()

In [33]:
mytrial = []
#  tune hypterparameters
def objective(trial):

    n_neighbors = trial.suggest_int('n_neighbors', 100, 1000)
        
    args={'algorithm': {'cls': 'KNeighborsRegressor',
  'fit': {},
  'init': {'n_neighbors': n_neighbors,
   'weights': 'uniform',
   'algorithm': 'ball_tree',
   'leaf_size': 30,
   'p': 2,
   'metric': 'minkowski'}},
 'columns':common_columns75 ,
 'kfold': {'n_splits': 3,
  'random_state': 1985,
  'shuffle': True,
  'type': 'group'},
 'scaler': {'cls': 'StandardScaler'}}
    
    df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train, args, df_test = df_test, trial=mytrial, remark='tune 1')
    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)
    
    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', val_mae_mean-train_mae_mean)
    trial.set_user_attr('val_mae_var', val_mae_var)

    return np.abs(val_mae_mean - train_mae_mean)*val_mae_mean

study = optuna.create_study()
study.optimize(objective, n_trials=200)

[I 2019-05-20 01:10:35,889] Finished a trial resulted in value: 0.07615869884263476. Current best value is 0.07615869884263476 with parameters: {'n_neighbors': 887}.
[I 2019-05-20 01:11:21,405] Finished a trial resulted in value: 0.11396327769966753. Current best value is 0.07615869884263476 with parameters: {'n_neighbors': 887}.
[I 2019-05-20 01:12:18,191] Finished a trial resulted in value: 0.08870388375995865. Current best value is 0.07615869884263476 with parameters: {'n_neighbors': 887}.
[I 2019-05-20 01:13:16,912] Finished a trial resulted in value: 0.0850852265488679. Current best value is 0.07615869884263476 with parameters: {'n_neighbors': 887}.
[I 2019-05-20 01:14:17,831] Finished a trial resulted in value: 0.08158442391583545. Current best value is 0.07615869884263476 with parameters: {'n_neighbors': 887}.
[I 2019-05-20 01:15:26,105] Finished a trial resulted in value: 0.07219083312210565. Current best value is 0.07219083312210565 with parameters: {'n_neighbors': 959}.
[I 20

[I 2019-05-20 02:49:37,761] Finished a trial resulted in value: 0.07331220152556156. Current best value is 0.07055699440678302 with parameters: {'n_neighbors': 1000}.
[I 2019-05-20 02:50:16,744] Finished a trial resulted in value: 0.13528627848052596. Current best value is 0.07055699440678302 with parameters: {'n_neighbors': 1000}.
[I 2019-05-20 02:51:25,201] Finished a trial resulted in value: 0.0747158082344836. Current best value is 0.07055699440678302 with parameters: {'n_neighbors': 1000}.
[I 2019-05-20 02:52:36,967] Finished a trial resulted in value: 0.0706310072936702. Current best value is 0.07055699440678302 with parameters: {'n_neighbors': 1000}.
[I 2019-05-20 02:53:44,449] Finished a trial resulted in value: 0.07616463157492045. Current best value is 0.07055699440678302 with parameters: {'n_neighbors': 1000}.
[I 2019-05-20 02:54:14,972] Finished a trial resulted in value: 0.22636095104483034. Current best value is 0.07055699440678302 with parameters: {'n_neighbors': 1000}.


[I 2019-05-20 04:32:33,355] Finished a trial resulted in value: 0.07057080889738962. Current best value is 0.07055699440678302 with parameters: {'n_neighbors': 1000}.
[I 2019-05-20 04:33:41,377] Finished a trial resulted in value: 0.07497984270279182. Current best value is 0.07055699440678302 with parameters: {'n_neighbors': 1000}.
[I 2019-05-20 04:34:46,007] Finished a trial resulted in value: 0.07947618160094456. Current best value is 0.07055699440678302 with parameters: {'n_neighbors': 1000}.
[I 2019-05-20 04:35:37,692] Finished a trial resulted in value: 0.10172111105894625. Current best value is 0.07055699440678302 with parameters: {'n_neighbors': 1000}.


In [35]:
for trial_i in mytrial:
    db.insert(trial_i)
df_trial = db.select()

In [43]:
df_trial[(df_trial['remark']=='tune 1')&(df_trial['mae_diff']<.05)].sort_values(by=['val_mae'])[['datetime','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff','remark']].head()

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff,remark
253,2019-05-20 01:59:56.752054,15,2.025539,0.002811,2.075086,0.013919,0.049548,tune 1
53,2019-05-20 01:59:56.752054,15,2.025539,0.002811,2.075086,0.013919,0.049548,tune 1
202,2019-05-20 04:35:37.676850,15,2.026069,0.002818,2.075089,0.013917,0.04902,tune 1
402,2019-05-20 04:35:37.676850,15,2.026069,0.002818,2.075089,0.013917,0.04902,tune 1
89,2019-05-20 02:36:29.707501,15,2.028377,0.002859,2.075174,0.013891,0.046797,tune 1


In [44]:
param = copy.deepcopy(df_trial.loc[253]['param'])
param['kfold']['type'] = 'stratified'
# run one try
mytrial=[]
df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train, param, df_test = df_test, trial=mytrial, remark='remodel 253 use stratified')

In [45]:
for trial_i in mytrial:
    db.insert(trial_i)
df_trial = db.select()

In [46]:
df_trial.tail(1)[['datetime','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']]

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
403,2019-05-20 04:43:04.926060,15,2.038075,4e-06,2.042308,3.7e-05,0.004233


In [47]:
db.commit()

In [None]:
mytrial = []
#  tune hypterparameters
def objective(trial):

    n_neighbors = trial.suggest_int('n_neighbors', 100, 1000)
        
    args={'algorithm': {'cls': 'KNeighborsRegressor',
  'fit': {},
  'init': {'n_neighbors': n_neighbors,
   'weights': 'uniform',
   'algorithm': 'ball_tree',
   'leaf_size': 30,
   'p': 2,
   'metric': 'minkowski'}},
 'columns':common_columns75 ,
 'kfold': {'n_splits': 3,
  'random_state': 1985,
  'shuffle': True,
  'type': 'stratified'},
 'scaler': {'cls': 'StandardScaler'}}
    
    df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train, args, df_test = df_test, trial=mytrial, remark='tune 1 by stratified')
    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)
    
    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', val_mae_mean-train_mae_mean)
    trial.set_user_attr('val_mae_var', val_mae_var)

    return np.abs(val_mae_mean - train_mae_mean)*val_mae_mean

study = optuna.create_study()
study.optimize(objective, n_trials=200)

[I 2019-05-20 04:47:10,225] Finished a trial resulted in value: 0.007353461358213766. Current best value is 0.007353461358213766 with parameters: {'n_neighbors': 613}.
[I 2019-05-20 04:47:49,822] Finished a trial resulted in value: 0.014806073018054283. Current best value is 0.007353461358213766 with parameters: {'n_neighbors': 613}.
[I 2019-05-20 04:48:59,174] Finished a trial resulted in value: 0.0037748296280766167. Current best value is 0.0037748296280766167 with parameters: {'n_neighbors': 918}.
[I 2019-05-20 04:49:47,497] Finished a trial resulted in value: 0.010526797622112331. Current best value is 0.0037748296280766167 with parameters: {'n_neighbors': 918}.
[I 2019-05-20 04:50:26,890] Finished a trial resulted in value: 0.015435773635910175. Current best value is 0.0037748296280766167 with parameters: {'n_neighbors': 918}.
[I 2019-05-20 04:51:07,573] Finished a trial resulted in value: 0.014806073018054283. Current best value is 0.0037748296280766167 with parameters: {'n_neigh

In [50]:
for trial_i in mytrial:
    db.insert(trial_i)

In [51]:
df_trial = db.select()
df_trial[(df_trial['remark']=='tune 1 by stratified')&(df_trial['mae_diff']<.05)].sort_values(by=['val_mae'])[['datetime','remark', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].head(10)

Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
521,2019-05-20 06:45:30.675143,tune 1 by stratified,15,1.991735,3e-06,2.010023,1.6e-05,0.018288
422,2019-05-20 05:03:59.501560,tune 1 by stratified,15,1.996266,3e-06,2.012802,2.1e-05,0.016536
459,2019-05-20 05:40:50.508814,tune 1 by stratified,15,2.006866,4e-06,2.019675,2.2e-05,0.012809
515,2019-05-20 06:39:28.357982,tune 1 by stratified,15,2.007511,5e-06,2.020119,2.5e-05,0.012608
591,2019-05-20 07:59:36.109166,tune 1 by stratified,15,2.009073,6e-06,2.021087,2.7e-05,0.012013
445,2019-05-20 05:27:16.886509,tune 1 by stratified,15,2.013984,1e-05,2.02473,2.5e-05,0.010746
540,2019-05-20 07:03:55.455644,tune 1 by stratified,15,2.016349,1.1e-05,2.026548,2.8e-05,0.0102
442,2019-05-20 05:24:18.172510,tune 1 by stratified,15,2.01815,1e-05,2.02782,2.6e-05,0.00967
412,2019-05-20 04:53:35.269502,tune 1 by stratified,15,2.018754,1e-05,2.028194,2.5e-05,0.00944
543,2019-05-20 07:06:37.199683,tune 1 by stratified,15,2.020841,1e-05,2.029682,3e-05,0.008841


In [52]:
param = copy.deepcopy(df_trial.loc[521]['param'])
param['kfold']['type'] = 'group'
# run one try
mytrial=[]
df_his,  df_feature_importances, df_valid_pred, df_test_pred =  EP.process(df_train, param, df_test = df_test, trial=mytrial, remark='remodel 521 use group')

In [53]:
for trial_i in mytrial:
    db.insert(trial_i)
df_trial = db.select()

In [58]:
df_trial.tail(1)[['datetime','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff', 'remark']]

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff,remark
604,2019-05-20 09:19:19.905349,15,1.965282,0.00242,2.082229,0.013162,0.116947,remodel 521 use group


In [59]:
db.commit()