In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler

import lightgbm as lgb
import catboost as cb
from rgf.sklearn import RGFRegressor

from tsfresh.examples import load_robot_execution_failures
from tsfresh import extract_features, select_features
import optuna

from common import EP
from dfdb import DFDB

import types
import copy


numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.

Using TensorFlow backend.


In [2]:
%%HTML
<style>
   /* Set the widths of the notebook container, menu bar and toolbar elements. */
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [3]:
# Do not truncate wide text columns when frames are rendered.
# Recent pandas spells "no limit" as None and rejects negative widths, while
# older releases only accept the legacy -1 sentinel, so try the modern form
# first and fall back to the legacy one.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

In [4]:
# Load the train and test frames pickled under ../feats.
# NOTE(review): read_pickle unpickles arbitrary Python objects; only load
# files produced by a trusted pipeline.
df_train = pd.read_pickle('../feats/df_train.pkl')
df_test = pd.read_pickle('../feats/df_test.pkl')

In [5]:
# Bucketed target: truncate y to an integer and lump everything that is not
# below 15 into bucket 15 (NaN also lands there because `x < 15` is False).
df_train['label'] = df_train['y'].apply(lambda x:  int(x) if x<15 else 15)

# Grouping key: the season id with season 17 folded into season 1
# (presumably consumed by the 'group' k-fold inside EP -- confirm).
# np.where builds a fresh array; the previous code assigned through the
# `.values` view, which silently rewrote df_train['season'] in place and
# raises on a read-only array when pandas copy-on-write is active.
season_ids = df_train['season'].values
group = np.where(season_ids == 17, 1, season_ids)
df_train['group'] = group
df_train = df_train.drop(columns=['season'])

In [6]:
# Run tsfresh's select_features on the training features against the raw
# target y, with both objects re-indexed by the 'index' column.
non_feature_cols = ['y','index','group','label']
test_X = df_train.drop(columns=non_feature_cols).copy()
test_y = df_train['y'].copy()
test_X.index = df_train['index']
test_y.index = df_train['index']
relevant_features = select_features(test_X, test_y)
tsfresh_columns = relevant_features.columns.tolist()

In [7]:
# Every column of df_train except the bookkeeping/target columns.
bookkeeping_columns = ['index','y','label','group']
feature_index = df_train.columns.drop(bookkeeping_columns)
original_columns = feature_index.tolist()

In [8]:
# Hard-coded feature lists, one per model family; they are pooled and counted
# in the next cell.
# NOTE(review): several names lack the `__` separator that their siblings
# have (e.g. 'spkt_welch_densitycoeff_2' vs 'spkt_welch_density__coeff_3',
# 'fft_coefficientcoeff_80__attr_"imag"') -- confirm they match the actual
# column names in df_train.
# NOTE(review): `randomforest_randomforest` breaks the `<model>_columns`
# naming used by the other lists; it is referenced under that name below.
catboost_columns = ['spkt_welch_density__coeff_3',
 'spkt_welch_densitycoeff_2',
 'abs_q25_5',
 'abs_q75_6',
 'q05_roll_std_1000',
 'abs_q75_7',
 'abs_q95_2',
 'q05_5',
 'abs_q75_2',
 '5000skewness_max_',
 'fft_coefficientcoeff_80__attr_"imag"',
 "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
 '5000kurtosis_mean_',
 "number_peaks{'n': 1}",
 '5000smoothness_entropy_',
 'ave10_7',
 'q75_roll_std_1000',
 'FFT_Mag_25q0',
 'fft_coefficientcoeff_20__attr_"abs"']
lgbm_columns = ['q25_roll_std_100',
 'abs_q25_5',
 'spkt_welch_density__coeff_3',
 'abs_q75_6',
 'abs_q75_7',
 'spkt_welch_densitycoeff_2',
 'median__roll_std',
 'abs_q01_5',
 '5000smoothness_quantile05',
 '5000smoothness_std_',
 'abs_q95_3',
 'FFT_Mag_75q0',
 '5000median_std_',
 'spkt_welch_density__coeff_17']
xgbm_columns = ['q25_roll_std_100',
 'abs_q25_5',
 'spkt_welch_density__coeff_3',
 'abs_q75_7',
 'spkt_welch_densitycoeff_2',
 "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
 'agg_autocorrelationf_agg_"mean"__maxlag_40',
 "number_peaks{'n': 1}",
 '5000peak_peak_amp_max_',
 'abs_q95_3',
 'spkt_welch_density__coeff_89',
 'abs_q05_2']
randomforest_randomforest = ['abs_q25_5', 'abs_q01_4', 'q25_roll_std_100']
extratrees_columns = ['q05_2',
 "number_peaks{'n': 1}",
 'abs_q01_6',
 'abs_q95_2',
 '5000smoothness_quantile25',
 '5000std_median_',
 '5000smoothness_median_',
 "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
 'abs_q75_6',
 "number_peaks{'n': 3}",
 'q75_roll_std_10',
 "number_peaks{'n': 10}",
 '5000min_quantile75',
 '5000smoothness_quantile05',
 "number_peaks{'n': 5}",
 'abs_q01_2',
 '5000smoothness_mean_',
 'min_roll_std_100',
 'abs_q05_2',
 'q01_roll_std_1000']
gradientboosting_columns = ['q05_5',
 'kurt_1',
 'abs_q75_6',
 'abs_q75_7',
 'spkt_welch_density__coeff_28',
 'spkt_welch_density__coeff_99',
 'fft_coefficientcoeff_6__attr_"abs"',
 '5000smoothness_quantile05',
 'q25_roll_std_100',
 'spkt_welch_densitycoeff_2',
 'abs_max_1',
 "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
 'abs_q25_5',
 'abs_q01_7',
 'q05_8',
 'Hilbert_mean_6',
 'abs_q95_2',
 '5000skewness_max_',
 '5000kurtosis_mean_',
 'spkt_welch_density__coeff_3']

In [9]:
# Pool the per-model feature lists. Duplicates are kept on purpose: how often
# a name occurs in the pool is how many times it was selected across the lists.
model_column_lists = [catboost_columns, lgbm_columns, xgbm_columns,
                      randomforest_randomforest, extratrees_columns,
                      gradientboosting_columns]
all_columns = [col for cols in model_column_lists for col in cols]

# Count every feature in a single pass instead of calling all_columns.count()
# several times per feature. Taking the unique names from the dict keeps them
# in first-seen order, so the derived lists are identical on every kernel
# start (list(set(...)) ordering changes with string hash randomisation).
column_counts = {}
for col in all_columns:
    column_counts[col] = column_counts.get(col, 0) + 1
unique_columns = list(column_counts)
count_values = [column_counts[col] for col in unique_columns]

# A feature is "common" when its count equals the number of pooled lists.
N_columns = len(model_column_lists)

# Loop-invariant thresholds: quantiles of the per-feature counts.
count_q50, count_q75, count_q95 = np.quantile(count_values, [.5, .75, .95])

common_columns = [col for col in unique_columns if column_counts[col]==N_columns]
common_columns50 = [col for col in unique_columns if column_counts[col]>=count_q50]
common_columns75 = [col for col in unique_columns if column_counts[col]>=count_q75]
common_columns95 = [col for col in unique_columns if column_counts[col]>=count_q95]

print('unique_columns ',len(unique_columns))
print('common_columns50 ',len(common_columns50))
print('common_columns75 ',len(common_columns75))
print('common_columns95 ',len(common_columns95))
print('common_columns ',len(common_columns))

unique_columns  55
common_columns50  55
common_columns75  15
common_columns95  7
common_columns  0


In [14]:
# Open the trial store at ../trial/frgf.pkl.
# NOTE(review): with auto_commit=False, inserts are presumably not persisted
# until db.commit() is called -- confirm against DFDB.
db = DFDB('../trial/frgf.pkl', auto_commit=False)

In [15]:
# Fetch the stored trials; the result is indexed and filtered like a pandas
# DataFrame in the cells below.
df_trial = db.select()

In [22]:
# Feature selection for an RGF regressor via EP.revert_rfe.
# NOTE(review): judging by the call, candidates come from `columns` and the
# search starts from `start_columns` -- confirm the exact procedure in common.EP.

# Collector handed to EP.revert_rfe; its entries are inserted into `db` below.
mytrial = []
# Candidate pool passed positionally to EP.revert_rfe.
columns = common_columns50
# Fixed: this used to read `param = param = {...}`, a redundant chained
# assignment.
param = {'columns': common_columns95,
 'kfold': {'n_splits': 8,
  'random_state': 1985,
  'shuffle': True,
  'type': 'group'},
 'scaler': {'cls': 'StandardScaler', 'init':{}},
 'algorithm': {'cls': 'RGFRegressor',
  'init': {'max_leaf': 3424,
   'reg_depth': 17.29510357020571,
   'l2': 0.014605736255217133,
   'min_samples_leaf': 93,
   'learning_rate': 0.3374237229665843},
  'fit': {}},
}

selected_columns = EP.revert_rfe(df_train, param, columns, df_test, mytrial, start_columns=common_columns95, remark='start from top1 column 2th')
print(len(selected_columns))

# Store every collected trial record, then reload the table.
for trial_i in mytrial:
    db.insert(trial_i)
df_trial = db.select()
# Expose each trial's k-fold settings as a column of their own.
df_trial['kfold'] = df_trial['param'].apply(lambda x: x['kfold'])

20


NameError: name 'df_trial_' is not defined

In [27]:
# Commit the trials inserted above.
# NOTE(review): db was opened with auto_commit=False, so presumably nothing is
# persisted until this call -- confirm against DFDB.
db.commit()

In [26]:
# Five RFE trials with the lowest validation MAE among those whose
# train/validation gap is below 0.2.
rfe_mask = (df_trial['remark']=='start from top1 column 2th') & (df_trial['mae_diff']<.2)
rfe_report_cols = ['datetime','remark','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']
df_trial.loc[rfe_mask].sort_values(by=['val_mae']).loc[:, rfe_report_cols].head(5)

Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
394,2019-05-25 04:03:39.956778,start from top1 column 2th,20,1.899065,0.010376,2.055058,0.48899,0.155993
393,2019-05-25 03:53:53.739119,start from top1 column 2th,20,1.899065,0.010376,2.055058,0.48899,0.155993
387,2019-05-25 02:55:35.957430,start from top1 column 2th,19,1.90056,0.010414,2.055221,0.493023,0.154661
378,2019-05-25 01:29:12.797918,start from top1 column 2th,19,1.90056,0.010414,2.055221,0.493023,0.154661
385,2019-05-25 02:36:44.776411,start from top1 column 2th,19,1.90056,0.010414,2.055221,0.493023,0.154661


In [28]:
# Fresh collector for this tuning run; it is passed to EP.process as `trial=`.
mytrial = []
# Tune hyperparameters
def objective(trial):
    """Optuna objective: evaluate one sampled RGFRegressor configuration.

    Samples five RGFRegressor constructor arguments from ``trial``, runs
    ``EP.process`` on ``df_train``/``df_test`` restricted to
    ``selected_columns`` with a 3-split 'group' k-fold and a StandardScaler,
    and records the aggregated MAE statistics on the trial as user attributes
    (``val_mae``, ``train_mae``, ``mae_diff``, ``val_mae_var``,
    ``train_mae_var``).

    Args:
        trial: Optuna trial used to sample parameters and store attributes.

    Returns:
        ``abs(val_mae - train_mae) * val_mae``. The value grows with both the
        train/validation gap and the validation MAE.
    """
    # Search space for the RGFRegressor constructor arguments.
    max_leaf = trial.suggest_int('max_leaf', 50, 5000)
    reg_depth = trial.suggest_uniform('reg_depth', 1.0, 100.0)
    l2 = trial.suggest_uniform('l2', 0.001, .1)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 100)
    learning_rate = trial.suggest_uniform('learning_rate', .05, .5)

    args={
        'columns':selected_columns,
        'kfold':{
            'n_splits': 3,
            'random_state': 1985,
            'shuffle': True,
            'type': 'group'
        },
        'scaler':{
            'cls':'StandardScaler',
            'init':{}
        },
        'algorithm':{
            'cls':'RGFRegressor',
            'init':{
                "max_leaf":max_leaf,
                "reg_depth":reg_depth,
                "l2":l2,
                "min_samples_leaf":min_samples_leaf,
                "learning_rate":learning_rate,
            },
            'fit':{
            },
        }
    }

    # Only the history frame is used here; the feature importances and the
    # validation/test predictions returned by EP.process are ignored.
    # NOTE(review): df_his.valid / df_his.train presumably hold one MAE per
    # fold -- confirm in EP.process.
    df_his, _, _, _ = EP.process(df_train, args, df_test = df_test, trial=mytrial, remark='tune x group')
    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)

    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', val_mae_mean-train_mae_mean)
    trial.set_user_attr('val_mae_var', val_mae_var)
    # Previously computed and then discarded; stored alongside val_mae_var.
    trial.set_user_attr('train_mae_var', train_mae_var)

    return np.abs(val_mae_mean - train_mae_mean)*val_mae_mean

# Minimise `objective` over 200 trials (create_study defaults to minimisation).
# The sampler is seeded so that re-running the cell proposes the same sequence
# of hyperparameters; the seed mirrors the k-fold random_state.
study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=1985))
study.optimize(objective, n_trials=200)

[32m[I 2019-05-25 07:28:12,805][0m Finished trial#0 resulted in value: 0.2815771961221072. Current best value is 0.2815771961221072 with parameters: {'max_leaf': 1075, 'reg_depth': 8.510567747457767, 'l2': 0.014404781293938077, 'min_samples_leaf': 31, 'learning_rate': 0.42941984528927757}.[0m
[32m[I 2019-05-25 07:29:45,211][0m Finished trial#1 resulted in value: 0.20007595992363772. Current best value is 0.20007595992363772 with parameters: {'max_leaf': 2153, 'reg_depth': 68.23138662272015, 'l2': 0.014089681492332584, 'min_samples_leaf': 11, 'learning_rate': 0.1526764218952854}.[0m
[32m[I 2019-05-25 07:30:22,779][0m Finished trial#2 resulted in value: 0.08948112092156181. Current best value is 0.08948112092156181 with parameters: {'max_leaf': 893, 'reg_depth': 94.13034468947681, 'l2': 0.05955688675735973, 'min_samples_leaf': 19, 'learning_rate': 0.3232487472022709}.[0m
[32m[I 2019-05-25 07:32:17,352][0m Finished trial#3 resulted in value: 0.26260027445704004. Current best va

[32m[I 2019-05-25 08:08:27,447][0m Finished trial#28 resulted in value: 0.24089500827847346. Current best value is 0.02747818613934046 with parameters: {'max_leaf': 51, 'reg_depth': 99.44061087904409, 'l2': 0.08581275265226812, 'min_samples_leaf': 55, 'learning_rate': 0.12604636016214937}.[0m
[32m[I 2019-05-25 08:10:50,739][0m Finished trial#29 resulted in value: 0.5646545564966633. Current best value is 0.02747818613934046 with parameters: {'max_leaf': 51, 'reg_depth': 99.44061087904409, 'l2': 0.08581275265226812, 'min_samples_leaf': 55, 'learning_rate': 0.12604636016214937}.[0m
[32m[I 2019-05-25 08:10:58,380][0m Finished trial#30 resulted in value: 0.032350828090305296. Current best value is 0.02747818613934046 with parameters: {'max_leaf': 51, 'reg_depth': 99.44061087904409, 'l2': 0.08581275265226812, 'min_samples_leaf': 55, 'learning_rate': 0.12604636016214937}.[0m
[32m[I 2019-05-25 08:11:41,662][0m Finished trial#31 resulted in value: 0.10424816754708405. Current best v

[32m[I 2019-05-25 08:45:51,451][0m Finished trial#56 resulted in value: 0.10767726717143367. Current best value is 0.02747818613934046 with parameters: {'max_leaf': 51, 'reg_depth': 99.44061087904409, 'l2': 0.08581275265226812, 'min_samples_leaf': 55, 'learning_rate': 0.12604636016214937}.[0m
[32m[I 2019-05-25 08:46:31,476][0m Finished trial#57 resulted in value: 0.17881942758845032. Current best value is 0.02747818613934046 with parameters: {'max_leaf': 51, 'reg_depth': 99.44061087904409, 'l2': 0.08581275265226812, 'min_samples_leaf': 55, 'learning_rate': 0.12604636016214937}.[0m
[32m[I 2019-05-25 08:46:41,936][0m Finished trial#58 resulted in value: 0.07521540773316267. Current best value is 0.02747818613934046 with parameters: {'max_leaf': 51, 'reg_depth': 99.44061087904409, 'l2': 0.08581275265226812, 'min_samples_leaf': 55, 'learning_rate': 0.12604636016214937}.[0m
[32m[I 2019-05-25 08:47:46,365][0m Finished trial#59 resulted in value: 0.09437051541551597. Current best v

[32m[I 2019-05-25 09:15:38,440][0m Finished trial#84 resulted in value: 0.28937665663161466. Current best value is 0.02747818613934046 with parameters: {'max_leaf': 51, 'reg_depth': 99.44061087904409, 'l2': 0.08581275265226812, 'min_samples_leaf': 55, 'learning_rate': 0.12604636016214937}.[0m
[32m[I 2019-05-25 09:16:29,375][0m Finished trial#85 resulted in value: 0.09495182375082273. Current best value is 0.02747818613934046 with parameters: {'max_leaf': 51, 'reg_depth': 99.44061087904409, 'l2': 0.08581275265226812, 'min_samples_leaf': 55, 'learning_rate': 0.12604636016214937}.[0m
[32m[I 2019-05-25 09:20:12,739][0m Finished trial#86 resulted in value: 0.14904385999781747. Current best value is 0.02747818613934046 with parameters: {'max_leaf': 51, 'reg_depth': 99.44061087904409, 'l2': 0.08581275265226812, 'min_samples_leaf': 55, 'learning_rate': 0.12604636016214937}.[0m
[32m[I 2019-05-25 09:22:56,168][0m Finished trial#87 resulted in value: 0.14929368085646474. Current best v

[32m[I 2019-05-25 09:55:02,869][0m Finished trial#112 resulted in value: 0.03934478916521266. Current best value is 0.02747818613934046 with parameters: {'max_leaf': 51, 'reg_depth': 99.44061087904409, 'l2': 0.08581275265226812, 'min_samples_leaf': 55, 'learning_rate': 0.12604636016214937}.[0m
[32m[I 2019-05-25 09:55:40,618][0m Finished trial#113 resulted in value: 0.08601418444759891. Current best value is 0.02747818613934046 with parameters: {'max_leaf': 51, 'reg_depth': 99.44061087904409, 'l2': 0.08581275265226812, 'min_samples_leaf': 55, 'learning_rate': 0.12604636016214937}.[0m
[32m[I 2019-05-25 09:57:01,858][0m Finished trial#114 resulted in value: 0.20059510746347486. Current best value is 0.02747818613934046 with parameters: {'max_leaf': 51, 'reg_depth': 99.44061087904409, 'l2': 0.08581275265226812, 'min_samples_leaf': 55, 'learning_rate': 0.12604636016214937}.[0m
[32m[I 2019-05-25 09:57:28,428][0m Finished trial#115 resulted in value: 0.07315891474466674. Current be

[32m[I 2019-05-25 10:37:12,363][0m Finished trial#140 resulted in value: 0.08881374760570715. Current best value is 0.02747818613934046 with parameters: {'max_leaf': 51, 'reg_depth': 99.44061087904409, 'l2': 0.08581275265226812, 'min_samples_leaf': 55, 'learning_rate': 0.12604636016214937}.[0m
[32m[I 2019-05-25 10:37:48,171][0m Finished trial#141 resulted in value: 0.11030768403722593. Current best value is 0.02747818613934046 with parameters: {'max_leaf': 51, 'reg_depth': 99.44061087904409, 'l2': 0.08581275265226812, 'min_samples_leaf': 55, 'learning_rate': 0.12604636016214937}.[0m
[32m[I 2019-05-25 10:38:04,602][0m Finished trial#142 resulted in value: 0.07892927675137505. Current best value is 0.02747818613934046 with parameters: {'max_leaf': 51, 'reg_depth': 99.44061087904409, 'l2': 0.08581275265226812, 'min_samples_leaf': 55, 'learning_rate': 0.12604636016214937}.[0m
[32m[I 2019-05-25 10:40:07,414][0m Finished trial#143 resulted in value: 0.13220566911804096. Current be

[32m[I 2019-05-25 10:56:37,839][0m Finished trial#168 resulted in value: 0.12650873832643786. Current best value is 0.021836314427236547 with parameters: {'max_leaf': 57, 'reg_depth': 99.82354179074754, 'l2': 0.09009669763713389, 'min_samples_leaf': 85, 'learning_rate': 0.057769272717762624}.[0m
[32m[I 2019-05-25 10:57:01,720][0m Finished trial#169 resulted in value: 0.058330489183041345. Current best value is 0.021836314427236547 with parameters: {'max_leaf': 57, 'reg_depth': 99.82354179074754, 'l2': 0.09009669763713389, 'min_samples_leaf': 85, 'learning_rate': 0.057769272717762624}.[0m
[32m[I 2019-05-25 10:57:36,043][0m Finished trial#170 resulted in value: 0.07446925919956836. Current best value is 0.021836314427236547 with parameters: {'max_leaf': 57, 'reg_depth': 99.82354179074754, 'l2': 0.09009669763713389, 'min_samples_leaf': 85, 'learning_rate': 0.057769272717762624}.[0m
[32m[I 2019-05-25 10:58:40,135][0m Finished trial#171 resulted in value: 0.10977990126083921. Cur

[32m[I 2019-05-25 11:22:24,168][0m Finished trial#196 resulted in value: 0.06916532832497542. Current best value is 0.02160470317274568 with parameters: {'max_leaf': 56, 'reg_depth': 86.93935962704745, 'l2': 0.08291662162715731, 'min_samples_leaf': 92, 'learning_rate': 0.05028793078327748}.[0m
[32m[I 2019-05-25 11:23:19,060][0m Finished trial#197 resulted in value: 0.22481582248295076. Current best value is 0.02160470317274568 with parameters: {'max_leaf': 56, 'reg_depth': 86.93935962704745, 'l2': 0.08291662162715731, 'min_samples_leaf': 92, 'learning_rate': 0.05028793078327748}.[0m
[32m[I 2019-05-25 11:24:30,536][0m Finished trial#198 resulted in value: 0.10115180874471724. Current best value is 0.02160470317274568 with parameters: {'max_leaf': 56, 'reg_depth': 86.93935962704745, 'l2': 0.08291662162715731, 'min_samples_leaf': 92, 'learning_rate': 0.05028793078327748}.[0m
[32m[I 2019-05-25 11:27:25,086][0m Finished trial#199 resulted in value: 0.13157989314098195. Current be

In [29]:
# Move the collected trial records into the store. Each record is removed from
# `mytrial` once inserted, so re-running this cell (or resuming after a failed
# insert) cannot insert the same record twice.
while mytrial:
    db.insert(mytrial[0])
    del mytrial[0]
df_trial = db.select()

In [30]:
# Ten 'tune x group' trials with the lowest validation MAE among those whose
# train/validation gap is below 0.1.
group3_mask = (df_trial['remark']=='tune x group') & (df_trial['mae_diff']<.1)
group3_report_cols = ['datetime','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']
df_trial.loc[group3_mask, group3_report_cols].sort_values(by=['val_mae']).head(10)

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
522,2019-05-25 10:09:22.575503,20,1.94686,0.00228,2.046279,0.015523,0.099419
511,2019-05-25 09:57:01.735969,20,1.948493,0.0023,2.046511,0.01572,0.098018
437,2019-05-25 08:24:15.402438,20,1.948143,0.002302,2.046703,0.015722,0.09856
504,2019-05-25 09:48:51.312109,20,1.947221,0.00229,2.046977,0.015598,0.099756
398,2019-05-25 07:29:45.113883,20,1.949497,0.002319,2.047227,0.015616,0.09773
403,2019-05-25 07:40:56.026731,20,1.951443,0.00233,2.047642,0.015602,0.096199
454,2019-05-25 08:46:31.370475,20,1.96199,0.002452,2.049251,0.015645,0.087261
434,2019-05-25 08:19:13.746008,20,1.9596,0.002418,2.04948,0.015726,0.08988
535,2019-05-25 10:33:46.682895,20,1.964931,0.002447,2.050637,0.015711,0.085705
515,2019-05-25 10:02:26.107042,20,1.966497,0.002469,2.051159,0.015871,0.084663


In [31]:
# Commit the 'tune x group' trials inserted above.
# NOTE(review): db was opened with auto_commit=False, so presumably nothing is
# persisted until this call -- confirm against DFDB.
db.commit()

In [32]:
# Fresh collector for this tuning run; it is passed to EP.process as `trial=`.
mytrial = []
# Tune hyperparameters
def objective(trial):
    """Optuna objective: evaluate one sampled RGFRegressor configuration.

    Samples five RGFRegressor constructor arguments from ``trial``, runs
    ``EP.process`` on ``df_train``/``df_test`` restricted to
    ``selected_columns`` with an 8-split 'group' k-fold and a StandardScaler,
    and records the aggregated MAE statistics on the trial as user attributes
    (``val_mae``, ``train_mae``, ``mae_diff``, ``val_mae_var``,
    ``train_mae_var``).

    Args:
        trial: Optuna trial used to sample parameters and store attributes.

    Returns:
        ``abs(val_mae - train_mae) * val_mae``. The value grows with both the
        train/validation gap and the validation MAE.
    """
    # Search space for the RGFRegressor constructor arguments.
    max_leaf = trial.suggest_int('max_leaf', 50, 5000)
    reg_depth = trial.suggest_uniform('reg_depth', 1.0, 100.0)
    l2 = trial.suggest_uniform('l2', 0.001, .1)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 100)
    learning_rate = trial.suggest_uniform('learning_rate', .05, .5)

    args={
        'columns':selected_columns,
        'kfold':{
            'n_splits': 8,
            'random_state': 1985,
            'shuffle': True,
            'type': 'group'
        },
        'scaler':{
            'cls':'StandardScaler',
            'init':{}
        },
        'algorithm':{
            'cls':'RGFRegressor',
            'init':{
                "max_leaf":max_leaf,
                "reg_depth":reg_depth,
                "l2":l2,
                "min_samples_leaf":min_samples_leaf,
                "learning_rate":learning_rate,
            },
            'fit':{
            },
        }
    }

    # Only the history frame is used here; the feature importances and the
    # validation/test predictions returned by EP.process are ignored.
    # NOTE(review): df_his.valid / df_his.train presumably hold one MAE per
    # fold -- confirm in EP.process.
    df_his, _, _, _ = EP.process(df_train, args, df_test = df_test, trial=mytrial, remark='tune x group 8')
    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)

    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', val_mae_mean-train_mae_mean)
    trial.set_user_attr('val_mae_var', val_mae_var)
    # Previously computed and then discarded; stored alongside val_mae_var.
    trial.set_user_attr('train_mae_var', train_mae_var)

    return np.abs(val_mae_mean - train_mae_mean)*val_mae_mean

# Minimise `objective` over 200 trials (create_study defaults to minimisation).
# The sampler is seeded so that re-running the cell proposes the same sequence
# of hyperparameters; the seed mirrors the k-fold random_state.
study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=1985))
study.optimize(objective, n_trials=200)

[32m[I 2019-05-25 12:41:51,220][0m Finished trial#0 resulted in value: 0.1851734077939788. Current best value is 0.1851734077939788 with parameters: {'max_leaf': 4587, 'reg_depth': 87.15971648223338, 'l2': 0.09484795811344579, 'min_samples_leaf': 21, 'learning_rate': 0.2941347682469026}.[0m
[32m[I 2019-05-25 12:44:29,975][0m Finished trial#1 resulted in value: 0.24242143722016157. Current best value is 0.1851734077939788 with parameters: {'max_leaf': 4587, 'reg_depth': 87.15971648223338, 'l2': 0.09484795811344579, 'min_samples_leaf': 21, 'learning_rate': 0.2941347682469026}.[0m
[32m[I 2019-05-25 12:55:10,187][0m Finished trial#2 resulted in value: 0.2094159378365811. Current best value is 0.1851734077939788 with parameters: {'max_leaf': 4587, 'reg_depth': 87.15971648223338, 'l2': 0.09484795811344579, 'min_samples_leaf': 21, 'learning_rate': 0.2941347682469026}.[0m
[32m[I 2019-05-25 13:10:43,996][0m Finished trial#3 resulted in value: 0.20604325531780177. Current best value i

[32m[I 2019-05-25 15:40:11,357][0m Finished trial#28 resulted in value: 0.1852368918790903. Current best value is 0.13277254208236638 with parameters: {'max_leaf': 126, 'reg_depth': 46.584516424227324, 'l2': 0.05124065873399325, 'min_samples_leaf': 77, 'learning_rate': 0.23226033026525605}.[0m
[32m[I 2019-05-25 15:40:50,085][0m Finished trial#29 resulted in value: 0.08787119352801064. Current best value is 0.08787119352801064 with parameters: {'max_leaf': 98, 'reg_depth': 82.9966302332237, 'l2': 0.0995820888793864, 'min_samples_leaf': 72, 'learning_rate': 0.10296080117538534}.[0m
[32m[I 2019-05-25 15:53:47,374][0m Finished trial#30 resulted in value: 0.1759121203052553. Current best value is 0.08787119352801064 with parameters: {'max_leaf': 98, 'reg_depth': 82.9966302332237, 'l2': 0.0995820888793864, 'min_samples_leaf': 72, 'learning_rate': 0.10296080117538534}.[0m
[32m[I 2019-05-25 15:56:57,056][0m Finished trial#31 resulted in value: 0.14295181243719499. Current best value

[32m[I 2019-05-25 19:14:23,703][0m Finished trial#56 resulted in value: 0.18472707357981893. Current best value is 0.08787119352801064 with parameters: {'max_leaf': 98, 'reg_depth': 82.9966302332237, 'l2': 0.0995820888793864, 'min_samples_leaf': 72, 'learning_rate': 0.10296080117538534}.[0m
[32m[I 2019-05-25 19:20:47,391][0m Finished trial#57 resulted in value: 0.18183825283509555. Current best value is 0.08787119352801064 with parameters: {'max_leaf': 98, 'reg_depth': 82.9966302332237, 'l2': 0.0995820888793864, 'min_samples_leaf': 72, 'learning_rate': 0.10296080117538534}.[0m
[32m[I 2019-05-25 19:24:25,832][0m Finished trial#58 resulted in value: 0.18272490534678756. Current best value is 0.08787119352801064 with parameters: {'max_leaf': 98, 'reg_depth': 82.9966302332237, 'l2': 0.0995820888793864, 'min_samples_leaf': 72, 'learning_rate': 0.10296080117538534}.[0m
[32m[I 2019-05-25 19:24:58,211][0m Finished trial#59 resulted in value: 0.08953715179820905. Current best value i

[32m[I 2019-05-25 20:56:42,239][0m Finished trial#84 resulted in value: 0.1414679749431687. Current best value is 0.08727753534369743 with parameters: {'max_leaf': 54, 'reg_depth': 98.0235048587394, 'l2': 0.05665322133393795, 'min_samples_leaf': 16, 'learning_rate': 0.27001480079317974}.[0m
[32m[I 2019-05-25 21:14:07,233][0m Finished trial#85 resulted in value: 0.18437713641285555. Current best value is 0.08727753534369743 with parameters: {'max_leaf': 54, 'reg_depth': 98.0235048587394, 'l2': 0.05665322133393795, 'min_samples_leaf': 16, 'learning_rate': 0.27001480079317974}.[0m
[32m[I 2019-05-25 21:15:43,233][0m Finished trial#86 resulted in value: 0.14158764093227882. Current best value is 0.08727753534369743 with parameters: {'max_leaf': 54, 'reg_depth': 98.0235048587394, 'l2': 0.05665322133393795, 'min_samples_leaf': 16, 'learning_rate': 0.27001480079317974}.[0m
[32m[I 2019-05-25 21:19:13,745][0m Finished trial#87 resulted in value: 0.18801836628341018. Current best value

[32m[I 2019-05-25 23:33:18,788][0m Finished trial#112 resulted in value: 0.13716586897956148. Current best value is 0.08727753534369743 with parameters: {'max_leaf': 54, 'reg_depth': 98.0235048587394, 'l2': 0.05665322133393795, 'min_samples_leaf': 16, 'learning_rate': 0.27001480079317974}.[0m
[32m[I 2019-05-25 23:39:34,716][0m Finished trial#113 resulted in value: 0.1748750637462749. Current best value is 0.08727753534369743 with parameters: {'max_leaf': 54, 'reg_depth': 98.0235048587394, 'l2': 0.05665322133393795, 'min_samples_leaf': 16, 'learning_rate': 0.27001480079317974}.[0m
[32m[I 2019-05-25 23:41:11,726][0m Finished trial#114 resulted in value: 0.13762913229178075. Current best value is 0.08727753534369743 with parameters: {'max_leaf': 54, 'reg_depth': 98.0235048587394, 'l2': 0.05665322133393795, 'min_samples_leaf': 16, 'learning_rate': 0.27001480079317974}.[0m
[32m[I 2019-05-25 23:43:34,851][0m Finished trial#115 resulted in value: 0.15901712556630304. Current best v

[32m[I 2019-05-26 01:54:48,238][0m Finished trial#140 resulted in value: 0.10616877399336672. Current best value is 0.08727753534369743 with parameters: {'max_leaf': 54, 'reg_depth': 98.0235048587394, 'l2': 0.05665322133393795, 'min_samples_leaf': 16, 'learning_rate': 0.27001480079317974}.[0m
[32m[I 2019-05-26 01:57:51,496][0m Finished trial#141 resulted in value: 0.1695254718734508. Current best value is 0.08727753534369743 with parameters: {'max_leaf': 54, 'reg_depth': 98.0235048587394, 'l2': 0.05665322133393795, 'min_samples_leaf': 16, 'learning_rate': 0.27001480079317974}.[0m
[32m[I 2019-05-26 02:03:52,194][0m Finished trial#142 resulted in value: 0.16759926831814206. Current best value is 0.08727753534369743 with parameters: {'max_leaf': 54, 'reg_depth': 98.0235048587394, 'l2': 0.05665322133393795, 'min_samples_leaf': 16, 'learning_rate': 0.27001480079317974}.[0m
[32m[I 2019-05-26 02:13:38,017][0m Finished trial#143 resulted in value: 0.1764374627771856. Current best va

[32m[I 2019-05-26 03:23:18,427][0m Finished trial#168 resulted in value: 0.16653772972761471. Current best value is 0.08395992280984708 with parameters: {'max_leaf': 55, 'reg_depth': 99.75232661985737, 'l2': 0.07313838325261164, 'min_samples_leaf': 21, 'learning_rate': 0.3051999377223197}.[0m
[32m[I 2019-05-26 03:24:25,297][0m Finished trial#169 resulted in value: 0.13422183059564174. Current best value is 0.08395992280984708 with parameters: {'max_leaf': 55, 'reg_depth': 99.75232661985737, 'l2': 0.07313838325261164, 'min_samples_leaf': 21, 'learning_rate': 0.3051999377223197}.[0m
[32m[I 2019-05-26 03:28:18,235][0m Finished trial#170 resulted in value: 0.18562161166410213. Current best value is 0.08395992280984708 with parameters: {'max_leaf': 55, 'reg_depth': 99.75232661985737, 'l2': 0.07313838325261164, 'min_samples_leaf': 21, 'learning_rate': 0.3051999377223197}.[0m
[32m[I 2019-05-26 03:29:10,500][0m Finished trial#171 resulted in value: 0.1906329146680973. Current best v

[32m[I 2019-05-26 05:17:37,906][0m Finished trial#196 resulted in value: 0.16991920889383333. Current best value is 0.07870106212273874 with parameters: {'max_leaf': 51, 'reg_depth': 67.0752610870786, 'l2': 0.08440407881131598, 'min_samples_leaf': 5, 'learning_rate': 0.086383975439242}.[0m
[32m[I 2019-05-26 05:18:06,164][0m Finished trial#197 resulted in value: 0.08665002214815636. Current best value is 0.07870106212273874 with parameters: {'max_leaf': 51, 'reg_depth': 67.0752610870786, 'l2': 0.08440407881131598, 'min_samples_leaf': 5, 'learning_rate': 0.086383975439242}.[0m
[32m[I 2019-05-26 05:18:59,071][0m Finished trial#198 resulted in value: 0.1452447513100594. Current best value is 0.07870106212273874 with parameters: {'max_leaf': 51, 'reg_depth': 67.0752610870786, 'l2': 0.08440407881131598, 'min_samples_leaf': 5, 'learning_rate': 0.086383975439242}.[0m
[32m[I 2019-05-26 05:19:30,684][0m Finished trial#199 resulted in value: 0.10213933833569203. Current best value is 0

In [33]:
# Move the collected trial records into the store. Each record is removed from
# `mytrial` once inserted, so re-running this cell (or resuming after a failed
# insert) cannot insert the same record twice.
while mytrial:
    db.insert(mytrial[0])
    del mytrial[0]
df_trial = db.select()

In [34]:
# Ten 'tune x group 8' trials with the lowest validation MAE among those whose
# train/validation gap is below 0.1.
group8_mask = (df_trial['remark']=='tune x group 8') & (df_trial['mae_diff']<.1)
group8_report_cols = ['datetime','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']
df_trial.loc[group8_mask, group8_report_cols].sort_values(by=['val_mae']).head(10)

Unnamed: 0,datetime,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
600,2019-05-25 13:10:43.881754,20,1.980358,0.011888,2.079444,0.545064,0.099086
631,2019-05-25 16:23:40.172648,20,1.981275,0.011908,2.079694,0.545644,0.098419
761,2019-05-26 03:05:50.270818,20,1.986954,0.012026,2.082624,0.550542,0.095669
689,2019-05-25 21:46:53.527192,20,1.988692,0.012063,2.083342,0.551842,0.09465
622,2019-05-25 15:05:51.380281,20,1.988294,0.012042,2.083365,0.551325,0.095071
695,2019-05-25 22:13:22.958503,20,1.989081,0.012068,2.083723,0.55198,0.094642
705,2019-05-25 23:10:45.638999,20,1.989584,0.012087,2.083756,0.552436,0.094172
779,2019-05-26 04:26:47.022646,20,1.98986,0.012092,2.084006,0.552624,0.094145
772,2019-05-26 03:55:38.917078,20,1.98958,0.012079,2.084136,0.552517,0.094556
652,2019-05-25 19:12:27.037420,20,1.990275,0.012095,2.084336,0.553158,0.094061


In [35]:
# Commit the 'tune x group 8' trials inserted above.
# NOTE(review): db was opened with auto_commit=False, so presumably nothing is
# persisted until this call -- confirm against DFDB.
db.commit()

In [36]:
# Fresh collector for this tuning run; it is passed to EP.process as `trial=`.
mytrial = []
# Tune hyperparameters
def objective(trial):
    """Optuna objective: score one RGFRegressor configuration.

    Samples ``max_leaf``, ``reg_depth``, ``l2``, ``min_samples_leaf`` and
    ``learning_rate`` from ``trial``, then calls ``EP.process`` on the
    notebook-level ``df_train`` / ``df_test`` using ``selected_columns``, a
    ``StandardScaler`` and k-fold settings of 3 shuffled splits with type
    ``'stratified'``. Every call also passes the notebook-level ``mytrial``
    list as ``trial=`` with the remark ``'tune x stratified'``.

    Args:
        trial: Optuna trial used to sample the hyperparameters and to carry
            the summary metrics as user attributes (``val_mae``,
            ``train_mae``, ``mae_diff``, ``val_mae_var``, ``train_mae_var``).

    Returns:
        ``abs(val_mae - train_mae) * val_mae``, where ``val_mae`` and
        ``train_mae`` are the means of the ``valid`` and ``train`` columns of
        the history frame returned by ``EP.process``.
        NOTE(review): those columns are presumably per-fold MAE values —
        confirm against EP.process.
    """
    # Sampling order is kept as max_leaf, reg_depth, l2, min_samples_leaf,
    # learning_rate (dict literals evaluate their values top to bottom).
    rgf_params = {
        'max_leaf': trial.suggest_int('max_leaf', 50, 5000),
        'reg_depth': trial.suggest_uniform('reg_depth', 1.0, 100.0),
        'l2': trial.suggest_uniform('l2', 0.001, .1),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 5, 100),
        'learning_rate': trial.suggest_uniform('learning_rate', .05, .5),
    }

    args = {
        'columns': selected_columns,
        'kfold': {
            'n_splits': 3,
            'random_state': 1985,
            'shuffle': True,
            'type': 'stratified'
        },
        'scaler': {
            'cls': 'StandardScaler',
            'init': {}
        },
        'algorithm': {
            'cls': 'RGFRegressor',
            'init': rgf_params,
            'fit': {
            },
        }
    }

    # Only the history frame is needed here; the other three results
    # (feature importances, validation and test predictions) are discarded.
    df_his, _, _, _ = EP.process(
        df_train, args, df_test=df_test, trial=mytrial, remark='tune x stratified'
    )

    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)
    train_mae_var = np.var(df_his.train)
    mae_diff = val_mae_mean - train_mae_mean

    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', mae_diff)
    trial.set_user_attr('val_mae_var', val_mae_var)
    # Previously computed but dropped; stored for parity with val_mae_var.
    trial.set_user_attr('train_mae_var', train_mae_var)

    # Penalise both a large train/validation gap and a high validation error.
    return np.abs(mae_diff) * val_mae_mean

# Create a study with Optuna's default settings and evaluate `objective`
# for 200 trials; the logged "Current best value" only decreases, i.e. the
# objective is being minimised.
# NOTE(review): no sampler seed is passed, so reruns presumably explore
# different hyperparameters — confirm if reproducibility matters.
study = optuna.create_study()
study.optimize(objective, n_trials=200)



[32m[I 2019-05-26 05:24:50,436][0m Finished trial#0 resulted in value: 0.03531655493735561. Current best value is 0.03531655493735561 with parameters: {'max_leaf': 4925, 'reg_depth': 92.70040170519867, 'l2': 0.07840269641596194, 'min_samples_leaf': 15, 'learning_rate': 0.4758015640010475}.[0m
[32m[I 2019-05-26 05:25:16,337][0m Finished trial#1 resulted in value: 0.02068243654033094. Current best value is 0.02068243654033094 with parameters: {'max_leaf': 464, 'reg_depth': 51.57234912635375, 'l2': 0.06027749522029458, 'min_samples_leaf': 25, 'learning_rate': 0.31591289123211796}.[0m
[32m[I 2019-05-26 05:29:51,521][0m Finished trial#2 resulted in value: 0.4941023886744186. Current best value is 0.02068243654033094 with parameters: {'max_leaf': 464, 'reg_depth': 51.57234912635375, 'l2': 0.06027749522029458, 'min_samples_leaf': 25, 'learning_rate': 0.31591289123211796}.[0m
[32m[I 2019-05-26 05:33:23,415][0m Finished trial#3 resulted in value: 0.07187668451487444. Current best val

[32m[I 2019-05-26 06:10:13,138][0m Finished trial#28 resulted in value: 0.025030003936681883. Current best value is 0.012340008930624357 with parameters: {'max_leaf': 70, 'reg_depth': 42.15539395741013, 'l2': 0.03522166248398323, 'min_samples_leaf': 35, 'learning_rate': 0.25767054788556354}.[0m
[32m[I 2019-05-26 06:10:50,846][0m Finished trial#29 resulted in value: 0.029291477666367294. Current best value is 0.012340008930624357 with parameters: {'max_leaf': 70, 'reg_depth': 42.15539395741013, 'l2': 0.03522166248398323, 'min_samples_leaf': 35, 'learning_rate': 0.25767054788556354}.[0m
[32m[I 2019-05-26 06:11:00,708][0m Finished trial#30 resulted in value: 0.014373749253031448. Current best value is 0.012340008930624357 with parameters: {'max_leaf': 70, 'reg_depth': 42.15539395741013, 'l2': 0.03522166248398323, 'min_samples_leaf': 35, 'learning_rate': 0.25767054788556354}.[0m
[32m[I 2019-05-26 06:11:27,719][0m Finished trial#31 resulted in value: 0.0365641013761361. Current b

[32m[I 2019-05-26 07:00:48,338][0m Finished trial#56 resulted in value: 0.04221565022475783. Current best value is 0.012340008930624357 with parameters: {'max_leaf': 70, 'reg_depth': 42.15539395741013, 'l2': 0.03522166248398323, 'min_samples_leaf': 35, 'learning_rate': 0.25767054788556354}.[0m
[32m[I 2019-05-26 07:01:14,317][0m Finished trial#57 resulted in value: 0.037750872811011395. Current best value is 0.012340008930624357 with parameters: {'max_leaf': 70, 'reg_depth': 42.15539395741013, 'l2': 0.03522166248398323, 'min_samples_leaf': 35, 'learning_rate': 0.25767054788556354}.[0m
[32m[I 2019-05-26 07:03:17,206][0m Finished trial#58 resulted in value: 0.15233392728269168. Current best value is 0.012340008930624357 with parameters: {'max_leaf': 70, 'reg_depth': 42.15539395741013, 'l2': 0.03522166248398323, 'min_samples_leaf': 35, 'learning_rate': 0.25767054788556354}.[0m
[32m[I 2019-05-26 07:04:17,831][0m Finished trial#59 resulted in value: 0.029789857660779562. Current b

[32m[I 2019-05-26 07:34:59,214][0m Finished trial#84 resulted in value: 0.032225455285137. Current best value is 0.011454996004582475 with parameters: {'max_leaf': 51, 'reg_depth': 61.313257667420196, 'l2': 0.0499920411058562, 'min_samples_leaf': 25, 'learning_rate': 0.24004025059859513}.[0m
[32m[I 2019-05-26 07:35:24,327][0m Finished trial#85 resulted in value: 0.019674409802431084. Current best value is 0.011454996004582475 with parameters: {'max_leaf': 51, 'reg_depth': 61.313257667420196, 'l2': 0.0499920411058562, 'min_samples_leaf': 25, 'learning_rate': 0.24004025059859513}.[0m
[32m[I 2019-05-26 07:36:04,789][0m Finished trial#86 resulted in value: 0.022693557765964118. Current best value is 0.011454996004582475 with parameters: {'max_leaf': 51, 'reg_depth': 61.313257667420196, 'l2': 0.0499920411058562, 'min_samples_leaf': 25, 'learning_rate': 0.24004025059859513}.[0m
[32m[I 2019-05-26 07:39:23,841][0m Finished trial#87 resulted in value: 0.03883329378757721. Current bes

[32m[I 2019-05-26 08:25:34,261][0m Finished trial#112 resulted in value: 0.09898336831784756. Current best value is 0.011176798230264713 with parameters: {'max_leaf': 55, 'reg_depth': 28.131118558600043, 'l2': 0.02376268620397663, 'min_samples_leaf': 38, 'learning_rate': 0.19738364931597813}.[0m
[32m[I 2019-05-26 08:26:13,639][0m Finished trial#113 resulted in value: 0.018117045532787198. Current best value is 0.011176798230264713 with parameters: {'max_leaf': 55, 'reg_depth': 28.131118558600043, 'l2': 0.02376268620397663, 'min_samples_leaf': 38, 'learning_rate': 0.19738364931597813}.[0m
[32m[I 2019-05-26 08:31:31,308][0m Finished trial#114 resulted in value: 0.0541430223461262. Current best value is 0.011176798230264713 with parameters: {'max_leaf': 55, 'reg_depth': 28.131118558600043, 'l2': 0.02376268620397663, 'min_samples_leaf': 38, 'learning_rate': 0.19738364931597813}.[0m
[32m[I 2019-05-26 08:33:34,286][0m Finished trial#115 resulted in value: 0.03843469060269698. Curr

[32m[I 2019-05-26 09:04:38,205][0m Finished trial#140 resulted in value: 0.04339539063867393. Current best value is 0.011176798230264713 with parameters: {'max_leaf': 55, 'reg_depth': 28.131118558600043, 'l2': 0.02376268620397663, 'min_samples_leaf': 38, 'learning_rate': 0.19738364931597813}.[0m
[32m[I 2019-05-26 09:07:08,151][0m Finished trial#141 resulted in value: 0.041766246002764275. Current best value is 0.011176798230264713 with parameters: {'max_leaf': 55, 'reg_depth': 28.131118558600043, 'l2': 0.02376268620397663, 'min_samples_leaf': 38, 'learning_rate': 0.19738364931597813}.[0m
[32m[I 2019-05-26 09:08:47,562][0m Finished trial#142 resulted in value: 0.05917700469628657. Current best value is 0.011176798230264713 with parameters: {'max_leaf': 55, 'reg_depth': 28.131118558600043, 'l2': 0.02376268620397663, 'min_samples_leaf': 38, 'learning_rate': 0.19738364931597813}.[0m
[32m[I 2019-05-26 09:09:18,331][0m Finished trial#143 resulted in value: 0.033593126214541115. Cu

[32m[I 2019-05-26 09:26:59,816][0m Finished trial#168 resulted in value: 0.026043048634343247. Current best value is 0.00960230608069982 with parameters: {'max_leaf': 59, 'reg_depth': 76.68182186942094, 'l2': 0.06826989662751225, 'min_samples_leaf': 24, 'learning_rate': 0.2800422948008965}.[0m
[32m[I 2019-05-26 09:27:30,431][0m Finished trial#169 resulted in value: 0.01775767654389924. Current best value is 0.00960230608069982 with parameters: {'max_leaf': 59, 'reg_depth': 76.68182186942094, 'l2': 0.06826989662751225, 'min_samples_leaf': 24, 'learning_rate': 0.2800422948008965}.[0m
[32m[I 2019-05-26 09:28:12,665][0m Finished trial#170 resulted in value: 0.01957040607008328. Current best value is 0.00960230608069982 with parameters: {'max_leaf': 59, 'reg_depth': 76.68182186942094, 'l2': 0.06826989662751225, 'min_samples_leaf': 24, 'learning_rate': 0.2800422948008965}.[0m
[32m[I 2019-05-26 09:29:56,894][0m Finished trial#171 resulted in value: 0.027412583775904678. Current bes

[32m[I 2019-05-26 10:08:26,022][0m Finished trial#196 resulted in value: 0.06986474619462274. Current best value is 0.00960230608069982 with parameters: {'max_leaf': 59, 'reg_depth': 76.68182186942094, 'l2': 0.06826989662751225, 'min_samples_leaf': 24, 'learning_rate': 0.2800422948008965}.[0m
[32m[I 2019-05-26 10:08:37,140][0m Finished trial#197 resulted in value: 0.010641177848159942. Current best value is 0.00960230608069982 with parameters: {'max_leaf': 59, 'reg_depth': 76.68182186942094, 'l2': 0.06826989662751225, 'min_samples_leaf': 24, 'learning_rate': 0.2800422948008965}.[0m
[32m[I 2019-05-26 10:09:08,416][0m Finished trial#198 resulted in value: 0.018700848817753645. Current best value is 0.00960230608069982 with parameters: {'max_leaf': 59, 'reg_depth': 76.68182186942094, 'l2': 0.06826989662751225, 'min_samples_leaf': 24, 'learning_rate': 0.2800422948008965}.[0m
[32m[I 2019-05-26 10:09:31,193][0m Finished trial#199 resulted in value: 0.014630405856026395. Current be

In [37]:
# Hand every record accumulated in `mytrial` during the stratified tuning
# run to `db.insert`, then refresh `df_trial` from `db.select()`.
# NOTE(review): `db` is created earlier in the notebook (presumably a DFDB
# instance) — confirm what insert/select do against that class.
for trial_i in mytrial:
    db.insert(trial_i)
df_trial = db.select()

In [38]:
# df_trial[(df_trial['remark']=='tune x stratified')&(df_trial['mae_diff']<.05)][['datetime','nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].sort_values(by=['val_mae']).head(10)

In [39]:
# Commit through the `db` handle after the inserts above.
# NOTE(review): presumably this flushes the inserted trial records to
# persistent storage — confirm against DFDB.commit.
db.commit()