In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler

import lightgbm as lgb
import catboost as cb
from keras.wrappers.scikit_learn import KerasRegressor
from keras import Sequential
from keras.layers import Dense

from tsfresh.examples import load_robot_execution_failures
from tsfresh import extract_features, select_features
import optuna

from common import EP
from models import *
from dfdb import DFDB

import types
import os
import copy

import seaborn as sns
import matplotlib.pyplot as plt

# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID";
# os.environ["CUDA_VISIBLE_DEVICES"]="3";  

Using TensorFlow backend.

numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.



In [2]:
pd.set_option('display.max_colwidth', -1)

In [3]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [4]:
pd.set_option('display.max_columns', 2000)
pd.set_option('display.max_rows', 2000)
# pd.set_option('display.width', 2000)
# pd.set_option('display.expand_frame_repr', True)
pd.set_option('display.max_colwidth', -1)

In [5]:
def m_lineplot(dflist, plot_features=None, n_col=3):

    
    n_chart = len(dflist)
    n_row = int(n_chart/n_col) if n_chart % n_col == 0 else n_row+1
        
    fig = plt.figure(figsize=(5*n_col, 3*n_row))
    for i, df in enumerate(dflist):
        ax = fig.add_subplot(n_row, n_col, i+1)
        if type(plot_features) == type(None):
            plot_features = df.columns.tolist()
        for feat in plot_features:
            sns.lineplot(x=df.index, y=df[feat], ax=ax)
    return 

In [6]:
df_train = pd.read_pickle('../feats/df_train.pkl')
df_test = pd.read_pickle('../feats/df_test.pkl')

In [7]:
df_train['label'] = df_train['y'].apply(lambda x:  int(x) if x<15 else 15)
group = df_train['season'].values
group[np.where(group==17)[0]] = 1
df_train['group'] = group
df_train = df_train.drop(columns=['season'])

In [24]:
import pickle
class MacOSFile(object):

    def __init__(self, f):
        self.f = f

    def __getattr__(self, item):
        return getattr(self.f, item)

    def read(self, n):
        # print("reading total_bytes=%s" % n, flush=True)
        if n >= (1 << 31):
            buffer = bytearray(n)
            idx = 0
            while idx < n:
                batch_size = min(n - idx, 1 << 31 - 1)
                # print("reading bytes [%s,%s)..." % (idx, idx + batch_size), end="", flush=True)
                buffer[idx:idx + batch_size] = self.f.read(batch_size)
                # print("done.", flush=True)
                idx += batch_size
            return buffer
        return self.f.read(n)

    def write(self, buffer):
        n = len(buffer)
        print("writing total_bytes=%s..." % n, flush=True)
        idx = 0
        while idx < n:
            batch_size = min(n - idx, 1 << 31 - 1)
            print("writing bytes [%s, %s)... " % (idx, idx + batch_size), end="", flush=True)
            self.f.write(buffer[idx:idx + batch_size])
            print("done.", flush=True)
            idx += batch_size


def pickle_dump(obj, file_path):
    with open(file_path, "wb") as f:
        return pickle.dump(obj, MacOSFile(f), protocol=pickle.HIGHEST_PROTOCOL)


def pickle_load(file_path):
    with open(file_path, "rb") as f:
        return pickle.load(MacOSFile(f))

In [25]:
df_spec_train = pickle_load('../feats/spec_features.pkl')
df_spec_test = pickle_load('../feats/spec_features_test.pkl')
# df_train = pd.merge(df_train, df_spec_train, on='index')
# df_test = pd.merge(df_test, df_spec_test, on='index')

In [26]:
df_train = pd.merge(df_train, df_spec_train, on='index')
df_test = pd.merge(df_test, df_spec_test, on='index')

In [27]:
test_X = df_train.drop(columns=['y','index','group','label']).copy()
test_X.index = df_train['index']
test_y = df_train['y'].copy()
test_y.index = df_train['index']
tsfresh_columns = select_features(test_X, test_y).columns.tolist()

In [28]:
len(tsfresh_columns)

9731

In [29]:
catboost_columns = ['mfcc_3_rolling_std_mean',
  'max_9',
  'q25_roll_std_100',
  'max_to_min',
  'max_to_min_5',
  'iqr_6',
  'q05_roll_std_1000',
  'q05_roll_std_100',
  'abs_max_4',
  'abs_max_1',
  'spkt_welch_densitycoeff_2',
  'spkt_welch_density__coeff_3',
  'abs_q75_7',
  'q01_2',
  'abs_q01_4',
  'max_to_min_diff_5',
  'q05_5',
  'abs_q25_5',
  'abs_max_2',
  'min__roll_std',
  'median__roll_std',
  'abs_max_7',
  '5000peak_peak_amp_max_',
  'abs_q75_6']
lgbm_columns = ['mfcc_10_abs_q75',
  'q25_roll_std_100',
  'iqr_6',
  'mfcc_9_mean',
  'abs_q75_7',
  'mfcc_delta_5_min',
  'mfcc_delta_3_quantile01',
  'abs_q75_6',
  'q05_roll_std_100',
  'mfcc_accelerate_1_kurtosis',
  'mfcc_5_mean',
  'spkt_welch_density__coeff_42',
  'mfcc_3_rolling_std_mean',
  'mfcc_12_mean',
  '5000smoothness_entropy_',
  'mfcc_13_mean',
  "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
  'mfcc_5_abs_max',
  'abs_q25_5',
  'mfcc_5_quantile95',
  'spkt_welch_densitycoeff_5',
  '5000crest_factor_quantile75',
  'spkt_welch_densitycoeff_2',
  'mfcc_accelerate_8_variance',
  "number_peaks{'n': 10}",
  'spkt_welch_density__coeff_3']
xgbm_columns = ['q05_roll_std_1000',
  'q05_roll_std_100',
  'mfcc_9_mean',
  'abs_q01_4',
  'iqr_6',
  'spkt_welch_density__coeff_4',
  'abs_q25_5',
  'abs_q75_6',
  'q01_2',
  "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
  'abs_max_4',
  'median__roll_std',
  'abs_max_7',
  'spkt_welch_density__coeff_28',
  '5000crest_factor_quantile75',
  'mfcc_1_kurtosis',
  'mfcc_4_median',
  'q05_5',
  'abs_max_8',
  'abs_q75_7',
  'q25_roll_std_100',
  'mfcc_delta_3_quantile01',
  'spkt_welch_densitycoeff_2',
  'max_to_min_diff_5',
  'mfcc_12_mean',
  'spkt_welch_densitycoeff_5',
  'spkt_welch_density__coeff_3',
  "number_peaks{'n': 10}",
  'spkt_welch_density__coeff_27']

In [30]:
all_columns = catboost_columns+lgbm_columns+xgbm_columns
unique_columns = list(set(all_columns))
common_columns = []
common_columns50 = []
common_columns75 = []
common_columns95 = []
N_columns = 6
count_values = [all_columns.count(col) for col in unique_columns]
for col in unique_columns:
    if all_columns.count(col)==N_columns:
        common_columns.append(col)
    if all_columns.count(col)>=np.quantile(count_values, .5):
        common_columns50.append(col)
    if all_columns.count(col)>=np.quantile(count_values, .75):
        common_columns75.append(col)
    if all_columns.count(col)>=np.quantile(count_values, .95):
        common_columns95.append(col)
print('unique_columns ',len(unique_columns))
print('common_columns50 ',len(common_columns50))
print('common_columns75 ',len(common_columns75))
print('common_columns95 ',len(common_columns95))
print('common_columns ',len(common_columns))

unique_columns  47
common_columns50  24
common_columns75  24
common_columns95  8
common_columns  0


In [9]:
# mytrial = []
db = DFDB('../trial/knn.pkl', auto_commit=False)

In [10]:
df_trial = db.select()
df_trial['kfold'] = df_trial['param'].apply(lambda x: x['kfold'])
df_trial['algorithm-init'] = df_trial['param'].apply(lambda x: x['algorithm']['init'])

In [12]:
df_trial[['datetime','remark', 'nfeatures', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff','algorithm-init','kfold']].loc[[17]]

Unnamed: 0,datetime,remark,nfeatures,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff,algorithm-init,kfold
17,2019-05-16 23:11:48.201772,,7,2.026411,0.002031,2.081674,0.013139,0.055264,"{'n_neighbors': 500, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2, 'metric': 'minkowski'}","{'n_splits': 3, 'random_state': 1985, 'shuffle': True, 'type': 'group'}"


In [32]:
param = {'algorithm': {'cls': 'KNeighborsRegressor',
  'fit': {},
  'init': {'n_neighbors': 500,
   'weights': 'uniform',
   'algorithm': 'ball_tree',
   'leaf_size': 30,
   'p': 2,
   'metric': 'minkowski'}},
 'columns':unique_columns ,
 'kfold': {'n_splits': 3,
  'random_state': 1985,
  'shuffle': True,
  'type': 'group'},
 'scaler': {'cls': 'StandardScaler', 'init':{}}}

In [33]:
mytrial = []
EP.width_frist_rfe(df_train, param, mytrial, 999, df_test=df_test, remark='wf new 17')

In [34]:
for trial_i in mytrial:
    db.insert(trial_i)
df_trial = db.select()
df_trial['kfold-type'] = df_trial['param'].apply(lambda x: x['kfold']['type'])
df_trial['algorithm-init'] = df_trial['param'].apply(lambda x: x['algorithm']['init'])

In [35]:
df_trial[df_trial['remark']=='wf new 17'][['datetime','nfeatures', 'kfold-type', 'algorithm-init', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff']].sort_values(by=['val_mae'])

Unnamed: 0,datetime,nfeatures,kfold-type,algorithm-init,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
1383,2019-06-03 05:03:33.781698,21,group,"{'n_neighbors': 500, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2, 'metric': 'minkowski'}",2.027898,0.002466,2.065665,0.011227,0.037766
1364,2019-06-03 04:44:28.899038,22,group,"{'n_neighbors': 500, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2, 'metric': 'minkowski'}",2.027902,0.002465,2.065673,0.011227,0.03777
1388,2019-06-03 05:08:20.798135,21,group,"{'n_neighbors': 500, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2, 'metric': 'minkowski'}",2.027901,0.002465,2.065674,0.011225,0.037773
1405,2019-06-03 05:24:38.477604,20,group,"{'n_neighbors': 500, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2, 'metric': 'minkowski'}",2.027911,0.002465,2.065678,0.011228,0.037767
1409,2019-06-03 05:28:26.335042,20,group,"{'n_neighbors': 500, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2, 'metric': 'minkowski'}",2.027906,0.002465,2.065679,0.011226,0.037773
1384,2019-06-03 05:04:31.313464,21,group,"{'n_neighbors': 500, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2, 'metric': 'minkowski'}",2.027909,0.002465,2.065682,0.011227,0.037773
1372,2019-06-03 04:52:35.612012,22,group,"{'n_neighbors': 500, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2, 'metric': 'minkowski'}",2.027901,0.002466,2.065686,0.011232,0.037786
1415,2019-06-03 05:34:05.997036,20,group,"{'n_neighbors': 500, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2, 'metric': 'minkowski'}",2.027919,0.002468,2.065688,0.011231,0.037769
1394,2019-06-03 05:14:03.104544,21,group,"{'n_neighbors': 500, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2, 'metric': 'minkowski'}",2.027919,0.002467,2.06569,0.011232,0.037771
1341,2019-06-03 04:19:33.706720,23,group,"{'n_neighbors': 500, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2, 'metric': 'minkowski'}",2.027881,0.002462,2.065691,0.011222,0.037809


In [36]:
db.commit()