In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler
import lightgbm as lgb
import catboost as cb

from tsfresh.examples import load_robot_execution_failures
from tsfresh import extract_features, select_features
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from fastFM import als, mcmc, sgd
# from pyfm import pylibfm

import optuna

from common import EP
from dfdb import DFDB

import types
import copy

  from numpy.core.umath_tests import inner1d
Using TensorFlow backend.


In [3]:
%%HTML
<style>
   /* Override the widths of the notebook, menubar and main-toolbar containers. */
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [4]:
# Disable column-width truncation in DataFrame displays.
# Older pandas releases spell "no limit" as -1, newer ones reject negative
# values and expect None. Try the current spelling first and fall back to the
# legacy one so the cell runs on either.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

In [5]:
# Load the pre-computed train/test feature frames.
# NOTE: read_pickle can execute arbitrary code embedded in the file; only load
# pickles produced by a trusted pipeline.
df_train, df_test = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('../feats/df_train.pkl', '../feats/df_test.pkl')
)

In [6]:
# Coarse label: y truncated to an int, with everything from 15 upwards capped at 15.
df_train['label'] = df_train['y'].apply(lambda x:  int(x) if x<15 else 15)

# Group id: the season value, with season 17 folded into group 1.
# np.where builds a fresh array instead of assigning through
# df_train['season'].values, which would write into the frame's own buffer
# (and fails on the read-only arrays pandas hands out under copy-on-write).
season = df_train['season'].values
group = np.where(season == 17, 1, season)
df_train['group'] = group
df_train = df_train.drop(columns=['season'])

In [7]:
# Target and feature matrix, both re-indexed by the 'index' column, are passed
# to tsfresh's select_features; the names of the columns it keeps are stored.
test_y = df_train['y'].copy()
test_y.index = df_train['index']
test_X = df_train.drop(columns=['y', 'index', 'group', 'label']).copy()
test_X.index = df_train['index']
tsfresh_columns = list(select_features(test_X, test_y).columns)

In [8]:
# Every column of df_train except the bookkeeping/target ones.
original_columns = list(
    df_train.columns.drop(['index', 'y', 'label', 'group'])
)

In [14]:
# Candidate feature subset (53 column names).
# NOTE(review): 'spkt_welch_densitycoeff_2' lacks the '__' separator used by the
# sibling '..._density__coeff_3' / '..._density__coeff_4' names — confirm it
# matches a real column of df_train.
unique_columns = [
    '5000variance_median_',
    "number_peaks{'n': 5}",
    'spkt_welch_density__coeff_3',
    '5000quantile99median_',
    'q05_roll_std_1000',
    '5000skewness_max_',
    '5000form_factor_quantile75',
    'abs_q05_6',
    '5000smoothness_std_',
    'abs_q75_6',
    'q05_5',
    'abs_q75_2',
    'abs_q95_6',
    '5000smoothness_median_',
    'median__roll_std',
    'abs_q75_7',
    '5000smoothness_quantile25',
    'q75_roll_std_10',
    '5000median_variance_',
    'abs_q25_5',
    "number_peaks{'n': 1}",
    '5000smoothness_quantile05',
    'MA_1000MA_std_mean_7',
    'iqr_6',
    '5000std_median_',
    '3th_peak_freq',
    "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
    'abs_q01_5',
    'spkt_welch_density__coeff_4',
    '5000variance_quantile25',
    'q01_2',
    '5000crest_factor_quantile75',
    '5000std_quantile05',
    'q01_roll_std_10',
    'iqr_3',
    'abs_q01_3',
    '5000std_quantile25',
    'spkt_welch_densitycoeff_2',
    'q01_roll_std_1000',
    "autocorrelation{'lag': 5}",
    "value_count{'value': 1}",
    "number_peaks{'n': 3}",
    'abs_q95_2',
    "number_peaks{'n': 10}",
    '5000smoothness_mean_',
    '5000smoothness_entropy_',
    'q05_roll_std_100',
    'q01_roll_std_100',
    'abs_q01_4',
    "quantile{'q': 0.9}",
    'q05_roll_std_10',
    'FFT_Mag_75q0',
    'q25_roll_std_100',
]

In [12]:
# Smallest feature subset (8 column names); used as 'columns' in the tuning objective.
common_columns = [
    'q05_roll_std_1000',
    'abs_q75_6',
    'abs_q25_5',
    'iqr_6',
    "number_peaks{'n': 10}",
    'q05_roll_std_100',
    'abs_q01_4',
    'q25_roll_std_100',
]

In [13]:
# Intermediate feature subset (16 column names); used as 'columns' in `param`.
# NOTE(review): 'spkt_welch_densitycoeff_2' lacks the '__' separator that
# 'spkt_welch_density__coeff_3' has — confirm it matches a real column.
common_columns75 = [
    'spkt_welch_density__coeff_3',
    'q05_roll_std_1000',
    'abs_q75_6',
    'q05_5',
    'median__roll_std',
    'abs_q75_7',
    'abs_q25_5',
    '5000smoothness_quantile05',
    'iqr_6',
    "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
    'spkt_welch_densitycoeff_2',
    "number_peaks{'n': 10}",
    'q05_roll_std_100',
    'abs_q01_4',
    'q05_roll_std_10',
    'q25_roll_std_100',
]

In [15]:
# Trial store opened on '../trial/fm.pkl'. auto_commit is off, so the notebook
# calls db.commit() explicitly after inserting trials.
db = DFDB('../trial/fm.pkl', auto_commit=False)

In [36]:
# Settings dict handed to EP.process for a single run: algorithm class name
# with its constructor/fit kwargs, the feature columns, the k-fold setup and
# the scaler class name.
param = {
    'algorithm': {
        'cls': 'als.FMRegression',
        'fit': {},
        'init': {
            'n_iter': 10,
            'init_stdev': 0.00030963137584220923,
            'rank': 2,
            'random_state': 42,
            'l2_reg_w': 0.1,
            'l2_reg_V': 0.1,
            'l2_reg': 0,
        },
    },
    'columns': common_columns75,
    'kfold': {
        'n_splits': 3,
        'random_state': 1985,
        'shuffle': True,
        'type': 'group',
    },
    'scaler': {'cls': 'StandardScaler'},
}

In [40]:
# Single run with the current `param`. `mytrial` is passed in as the `trial`
# list and is read back afterwards (presumably EP.process appends to it).
mytrial = []
(df_his,
 df_feature_importances,
 df_valid_pred,
 df_test_pred) = EP.process(
    df_train,
    param,
    df_test=df_test,
    trial=mytrial,
    remark='try common_columns75',
)

# Insert the first recorded trial into the store and reload the trial table.
db.insert(mytrial[0])
df_trial = db.select()

In [43]:
# Summary columns of every recorded trial.
df_trial.loc[:, [
    'datetime',
    'nfeatures',
    'train_mae',
    'train_mae_var',
    'val_mae',
    'val_mae_var',
    'mae_diff',
    'remark',
]]

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff,remark
0,2019-05-19 02:27:37.231419,53,2.046364,0.00276,2.133099,0.007646,0.086735,try unique_columns
1,2019-05-19 02:30:35.286431,8,2.103744,0.002911,2.114094,0.01188,0.010349,try common_columns
2,2019-05-19 02:30:51.965487,16,2.162543,0.003232,2.172957,0.01146,0.010414,try common_columns75


In [45]:
# `db` was opened with auto_commit=False, so commit explicitly
# (presumably this is what persists the inserted trial — confirm against DFDB).
db.commit()

In [None]:
# Collects the trial records produced by EP.process during the search below.
mytrial = []


# tune hyperparameters
def objective(trial):
    """Optuna objective for the als.FMRegression configuration on `common_columns`.

    Samples n_iter, init_stdev, rank and random_state from `trial`, runs
    EP.process with a 3-split group k-fold and StandardScaler, and records the
    aggregated train/validation statistics as user attributes on the trial.

    Returns:
        abs(val_mae_mean - train_mae_mean) * val_mae_mean, where the means are
        taken over df_his.valid and df_his.train.

    NOTE(review): random_state is searched like a hyperparameter, so the study
    can pick a seed that merely suits these folds — consider fixing it.
    """
    n_iter = trial.suggest_int('n_iter', 10, 100)
    init_stdev = trial.suggest_uniform('init_stdev', 0.00001, .01)
    rank = trial.suggest_int('rank', 2, 16)
    random_state = trial.suggest_int('random_state', 0, 9999)

    args = {
        'algorithm': {
            'cls': 'als.FMRegression',
            'fit': {},
            'init': {
                'n_iter': n_iter,
                'init_stdev': init_stdev,
                'rank': rank,
                'random_state': random_state,
            },
        },
        'columns': common_columns,
        'kfold': {
            'n_splits': 3,
            'random_state': 1985,
            'shuffle': True,
            'type': 'group',
        },
        'scaler': {'cls': 'StandardScaler'},
    }

    # `mytrial` (module level) is passed as the `trial` list so every run's
    # record can be inserted into the trial store after the search.
    df_his, df_feature_importances, df_valid_pred, df_test_pred = EP.process(
        df_train, args, df_test=df_test, trial=mytrial, remark='tune 1')

    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)

    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', val_mae_mean - train_mae_mean)
    trial.set_user_attr('val_mae_var', val_mae_var)
    # Previously computed but dropped; record it alongside its siblings.
    trial.set_user_attr('train_mae_var', train_mae_var)

    # Penalise both the train/validation gap and the validation error itself.
    return np.abs(val_mae_mean - train_mae_mean) * val_mae_mean

# Seed the sampler so a fresh Restart & Run All proposes the same sequence of
# trials; every other create_study() setting keeps its default.
study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=1985))
study.optimize(objective, n_trials=100)

[I 2019-05-19 02:34:43,332] Finished trial#0 resulted in value: 0.03402100634977167. Current best value is 0.03402100634977167 with parameters: {'n_iter': 89, 'init_stdev': 0.003222444750043738, 'rank': 16, 'random_state': 2447}.
[I 2019-05-19 02:34:44,578] Finished trial#1 resulted in value: 0.022427260037415204. Current best value is 0.022427260037415204 with parameters: {'n_iter': 30, 'init_stdev': 0.008279886934058144, 'rank': 2, 'random_state': 1633}.
[I 2019-05-19 02:34:47,743] Finished trial#2 resulted in value: 0.02863462341975847. Current best value is 0.022427260037415204 with parameters: {'n_iter': 30, 'init_stdev': 0.008279886934058144, 'rank': 2, 'random_state': 1633}.
[I 2019-05-19 02:34:49,181] Finished trial#3 resulted in value: 0.024687066057663406. Current best value is 0.022427260037415204 with parameters: {'n_iter': 30, 'init_stdev': 0.008279886934058144, 'rank': 2, 'random_state': 1633}.
[I 2019-05-19 02:34:53,920] Finished trial#4 resulted in value: 0.032311205374

In [47]:
# Insert every trial record collected in `mytrial`, then reload the trial table.
for finished_trial in mytrial:
    db.insert(finished_trial)

df_trial = db.select()

In [52]:
# First five 'tune 1' trials by ascending val_mae, restricted to mae_diff < .05.
(
    df_trial[(df_trial['remark'] == 'tune 1') & (df_trial['mae_diff'] < .05)]
    .sort_values(by=['val_mae'])
    .loc[:, [
        'datetime',
        'nfeatures',
        'train_mae',
        'train_mae_var',
        'val_mae',
        'val_mae_var',
        'mae_diff',
        'remark',
    ]]
    .head()
)

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff,remark
87,2019-05-19 02:39:02.429082,8,2.081971,0.003128,2.098198,0.013561,0.016227,tune 1
31,2019-05-19 02:36:22.957864,8,2.082857,0.003129,2.098202,0.013558,0.015345,tune 1
17,2019-05-19 02:35:35.107136,8,2.084507,0.0032,2.098301,0.013485,0.013795,tune 1
74,2019-05-19 02:38:25.206793,8,2.08527,0.002988,2.098578,0.013128,0.013308,tune 1
100,2019-05-19 02:39:40.090049,8,2.09027,0.002844,2.098669,0.012769,0.008399,tune 1


In [53]:
# `db` was opened with auto_commit=False, so commit explicitly
# (presumably this is what persists the inserted 'tune 1' trials — confirm against DFDB).
db.commit()

In [54]:
# Choose the trial to re-fit by rule rather than by a hard-coded row label:
# among the 'tune 1' trials with mae_diff < .05, take the lowest val_mae.
# A literal label is only valid for the one search run that produced it.
tuned = df_trial[(df_trial['remark'] == 'tune 1') & (df_trial['mae_diff'] < .05)]
if tuned.empty:
    raise ValueError("no 'tune 1' trial with mae_diff < .05 to remodel")
best_idx = tuned['val_mae'].idxmin()

# Deep-copy so editing the k-fold type does not alter the dict stored in df_trial.
param = copy.deepcopy(df_trial.loc[best_idx, 'param'])
param['kfold']['type'] = 'stratified'
# run one try
mytrial = []
df_his, df_feature_importances, df_valid_pred, df_test_pred = EP.process(
    df_train,
    param,
    df_test=df_test,
    trial=mytrial,
    remark='remodel {} use stratified'.format(best_idx),
)

In [55]:
# Insert every trial record collected in `mytrial`, then reload the trial table.
for finished_trial in mytrial:
    db.insert(finished_trial)

df_trial = db.select()

In [56]:
# Summary columns of the last row of the trial table.
df_trial[[
    'datetime',
    'nfeatures',
    'train_mae',
    'train_mae_var',
    'val_mae',
    'val_mae_var',
    'mae_diff',
]].tail(1)

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
103,2019-05-19 02:46:58.134980,8,2.084925,4e-06,2.087292,4.2e-05,0.002367


In [57]:
# `db` was opened with auto_commit=False, so commit explicitly
# (presumably this is what persists the inserted remodel trial — confirm against DFDB).
db.commit()