In [41]:
# basic
import pandas as pd
import numpy as np
import pprint
import matplotlib.pyplot as plt
%matplotlib inline

# preprocess
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

# models
import lightgbm as lgb
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
# model tensorflow
import tensorflow as tf
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import ReLU, PReLU
from keras.optimizers import SGD, Adam

# optimizer
from functools import partial
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
from sklearn.model_selection import KFold, cross_validate
from sklearn.metrics import mean_squared_error as MSE, make_scorer

# importance
import eli5
from eli5.sklearn import PermutationImportance

# other
import pickle
import time
import datetime
#import sys
#import gc

In [29]:
# The sample submission is read with header=None, so its two columns are named here.
sample_sub = pd.read_csv(
    '../data/input/sample_submit.csv',
    header=None,
    names=['id', 'mpg'],
)

In [30]:
# Load the two pre-computed feature sets. The file suffixes (_svr / _lgb) name the
# model each set was prepared for; "ohe" / "le" presumably mean one-hot and label
# encoding -- TODO confirm against the feature-building notebook.
# NOTE(review): pickle.load can execute arbitrary code; only load files this project wrote.
# Each file is opened in a `with` block so the handle is closed right after loading.
with open('../features/feature_train_2020-11-14-10-53-25_svr.pkl', 'rb') as f:
    train_ohe = pickle.load(f)
with open('../features/feature_test_2020-11-14-10-53-25_svr.pkl', 'rb') as f:
    # The test frame also carries 'id' and 'mpg' columns; drop them so only features remain.
    test_ohe = pickle.load(f).drop(['id', 'mpg'], axis=1)

with open('../features/feature_train_2020-11-14-10-39-58_lgb.pkl', 'rb') as f:
    train_le = pickle.load(f)
with open('../features/feature_test_2020-11-14-10-39-58_lgb.pkl', 'rb') as f:
    test_le = pickle.load(f).drop(['id', 'mpg'], axis=1)

In [31]:
# Regression target, taken from the label-encoded training frame.
train_y = train_le['mpg']

# Model inputs: every column except the row identifier and the target.
train_x_le = train_le.drop(columns=['id', 'mpg'])
train_x_ohe = train_ohe.drop(columns=['id', 'mpg'])

# Define Functions

In [72]:
# Inspect the hyperparameters saved for the neural network.
# The file is opened in a `with` block so the handle is closed; the bare name on the
# last line keeps the rich cell output.
with open('../logs/params_2020-11-16-22-02-29_nn.pickle', 'rb') as f:
    nn_params = pickle.load(f)
nn_params

{'batch_norm': 'no',
 'batch_size': 32.0,
 'hidden_activation': 'prelu',
 'hidden_dropout': 0.15000000000000002,
 'hidden_layers': 3,
 'hidden_units': 272.0,
 'input_dropout': 0.1,
 'optimizer': {'lr': 0.002485112914796712, 'type': 'adam'}}

In [82]:
class MLP:
    """Keras multi-layer perceptron regressor that standardises its own inputs.

    `params` is a dict that `fit` reads with the keys 'input_dropout',
    'hidden_layers', 'hidden_units', 'hidden_activation', 'hidden_dropout',
    'batch_norm', 'batch_size' and 'optimizer' (a nested dict holding
    'type' and 'lr').
    """

    def __init__(self, params):
        # Nothing is built here; the scaler and network are created by fit().
        self.params = params
        self.scaler = None
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        """Build, compile and train the network.

        The validation fold (va_x, va_y) is passed to Keras as validation data
        and drives early stopping.

        Raises:
            NotImplementedError: for a hidden activation other than
                'prelu'/'relu' or an optimizer type other than 'sgd'/'adam'.
        """
        cfg = self.params

        # Hyperparameters (numeric counts are cast because they may be stored as floats).
        input_dropout = cfg['input_dropout']
        n_layers = int(cfg['hidden_layers'])
        n_units = int(cfg['hidden_units'])
        act_name = cfg['hidden_activation']
        hidden_dropout = cfg['hidden_dropout']
        batch_norm = cfg['batch_norm']
        opt_name = cfg['optimizer']['type']
        opt_lr = cfg['optimizer']['lr']
        batch_size = int(cfg['batch_size'])

        # Standardisation: statistics come from the training fold only.
        self.scaler = StandardScaler()
        tr_x = self.scaler.fit_transform(tr_x)
        va_x = self.scaler.transform(va_x)

        # Resolve the activation layer once; an unknown name is only rejected
        # when a hidden layer is actually being built.
        if act_name == 'prelu':
            activation_cls = PReLU
        elif act_name == 'relu':
            activation_cls = ReLU
        else:
            activation_cls = None

        # Network layout: input dropout -> hidden blocks -> single linear output.
        net = self.model = Sequential()
        net.add(Dropout(input_dropout, input_shape=(tr_x.shape[1],)))  # input layer
        for _ in range(n_layers):  # hidden layers
            net.add(Dense(n_units))
            if batch_norm == 'before_act':
                net.add(BatchNormalization())
            if activation_cls is None:
                raise NotImplementedError
            net.add(activation_cls())
            net.add(Dropout(hidden_dropout))
        net.add(Dense(1))  # output layer

        # Optimizer
        if opt_name == 'adam':
            optimizer = Adam(lr=opt_lr, beta_1=0.9, beta_2=0.999, decay=0.)
        elif opt_name == 'sgd':
            optimizer = SGD(lr=opt_lr, decay=1e-6, momentum=0.9, nesterov=True)
        else:
            raise NotImplementedError

        # Objective and evaluation metric.
        net.compile(loss='mean_squared_error',
                    optimizer=optimizer, metrics=['mse'])

        # Epoch budget and early stopping.
        # Note: with a very large epoch count, training may fail to finish
        # when the learning rate is small.
        stopper = EarlyStopping(patience=20, restore_best_weights=True)

        # Run training.
        net.fit(tr_x, tr_y,
                epochs=200,
                batch_size=batch_size, verbose=0,
                validation_data=(va_x, va_y),
                callbacks=[stopper])

    def predict(self, x):
        """Scale `x` with the fitted scaler and return flattened network predictions."""
        scaled = self.scaler.transform(x)
        return self.model.predict(scaled).flatten()


In [77]:
class Model1LGBM:
    """Level-1 LightGBM regressor configured from a saved hyperparameter file."""

    def __init__(self):
        # Set by fit(); holds the most recently trained LGBMRegressor.
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        """Train a fresh LGBMRegressor on (tr_x, tr_y).

        va_x / va_y are accepted so the signature matches the other models
        passed to predict_cv; they are not used here.
        """
        # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
        # The `with` block closes the handle, which the previous
        # pickle.load(open(...)) form left open.
        with open('../logs/params_2020-11-14-09-20-22.pickle', 'rb') as f:
            params = pickle.load(f)

        # These are cast to int before being handed to LightGBM; presumably the
        # saved search results store them as floats -- TODO confirm.
        for key in ('max_depth', 'num_leaves', 'min_data_in_leaf'):
            params[key] = int(params[key])

        model = lgb.LGBMRegressor(**params)
        model.fit(tr_x, tr_y)
        self.model = model

    def predict(self, x):
        """Return predictions of the fitted model for `x`."""
        return self.model.predict(x)

In [78]:
class Model1SVR:
    """Level-1 support-vector regressor configured from a saved hyperparameter file."""

    def __init__(self):
        # Set by fit(); holds the most recently trained SVR.
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        """Train a fresh SVR on (tr_x, tr_y).

        va_x / va_y are accepted so the signature matches the other models
        passed to predict_cv; they are not used here.
        """
        # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
        # The `with` block closes the handle, which the previous
        # pickle.load(open(...)) form left open.
        with open('../logs/params_2020-11-14-10-53-25.pickle', 'rb') as f:
            params = pickle.load(f)

        model = SVR(**params)
        model.fit(tr_x, tr_y)
        self.model = model

    def predict(self, x):
        """Return predictions of the fitted model for `x`."""
        return self.model.predict(x)

In [50]:
# Linear model
class Model2Linear:
    """Standardise the inputs, then fit an ordinary least-squares regression."""

    def __init__(self):
        # Both are created by fit().
        self.model = None
        self.scaler = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        """Fit the scaler and the regression on the training fold.

        va_x / va_y are accepted for signature compatibility and are not used.
        """
        self.scaler = StandardScaler()
        scaled = self.scaler.fit_transform(tr_x)
        self.model = LinearRegression()
        self.model.fit(scaled, tr_y)

    def predict(self, x):
        """Apply the fitted scaler to `x` and return the regression's predictions."""
        return self.model.predict(self.scaler.transform(x))

In [35]:
def RMSE(y_true, y_pred):
    """Root mean squared error: the square root of MSE(y_true, y_pred)."""
    mse = MSE(y_true, y_pred)
    return np.sqrt(mse)

# Stacking

In [36]:
# Cross-validation settings read by predict_cv.
# SEED is passed as KFold's random_state; NFOLDS is the number of splits.
SEED = 0
NFOLDS = 5

In [37]:
def predict_cv(model, train_x, train_y, test_x):
    """Produce out-of-fold train predictions and fold-averaged test predictions.

    The same `model` object is re-fitted on every fold of a shuffled KFold
    (NFOLDS splits, random_state=SEED). After each fit it predicts the
    held-out rows and the full test set.

    Returns:
        (pred_train, preds_test): held-out predictions re-ordered to match
        the row order of `train_x`, and the per-fold test predictions
        averaged over folds.
    """
    splitter = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

    oof_chunks = []      # held-out predictions, in fold order
    oof_positions = []   # row positions matching oof_chunks
    test_chunks = []     # one full test prediction per fold

    for fit_rows, holdout_rows in splitter.split(train_x):
        fit_x = train_x.iloc[fit_rows]
        fit_y = train_y.iloc[fit_rows]
        holdout_x = train_x.iloc[holdout_rows]
        holdout_y = train_y.iloc[holdout_rows]

        model.fit(fit_x, fit_y, holdout_x, holdout_y)

        oof_chunks.append(model.predict(holdout_x))
        test_chunks.append(model.predict(test_x))
        oof_positions.append(holdout_rows)

    # Folds visit rows in shuffled order; argsort over the concatenated
    # positions puts the stacked predictions back in original row order.
    restore = np.argsort(np.concatenate(oof_positions))
    pred_train = np.concatenate(oof_chunks, axis=0)[restore]

    return pred_train, np.mean(test_chunks, axis=0)

In [None]:
# Level-1 models: 1a = LightGBM, 1b = SVR, 1c = neural network.
model_lgb = Model1LGBM()
model_svr = Model1SVR()
# The NN wrapper is restored from a previous run's pickle.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# The `with` block closes the handle after loading.
with open('../models/model_2020-11-16-22-02-29_nn.pickle', 'rb') as f:
    model_nn = pickle.load(f)

# Out-of-fold train predictions and fold-averaged test predictions per model.
# LightGBM and the NN use the "le" features; SVR uses the "ohe" features.
pred_train_1a, pred_test_1a = predict_cv(model_lgb, train_x_le, train_y, test_le)
pred_train_1b, pred_test_1b = predict_cv(model_svr, train_x_ohe, train_y, test_ohe)
pred_train_1c, pred_test_1c = predict_cv(model_nn, train_x_le, train_y, test_le)

In [43]:
# Out-of-fold RMSE of each level-1 model against the training target.
score_1a = RMSE(train_y, pred_train_1a)
score_1b = RMSE(train_y, pred_train_1b)
score_1c = RMSE(train_y, pred_train_1c)

# NOTE(review): the NN label reads 'model_1x' while the variables use '1c' -- confirm intent.
print(
    f'model_1a(lgb) score: {score_1a:.4f}',
    f'model_1b(svr) score: {score_1b:.4f}',
    f'model_1x(nn) score: {score_1c:.4f}',
    sep='\n',
)

model_1a(lgb) score: 2.9136
model_1b(svr) score: 3.9382
model_1x(nn) score: 3.3292


In [48]:
# Build the level-2 data frames, using the level-1 predictions as features.
train_x_2 = pd.DataFrame(
    {
        'pred_1a(lgb)': pred_train_1a,
        'pred_1b(svr)': pred_train_1b,
        'pred_1c(nn)': pred_train_1c,
    }
)
test_x_2 = pd.DataFrame(
    {
        'pred_1a(lgb)': pred_test_1a,
        'pred_1b(svr)': pred_test_1b,
        'pred_1c(nn)': pred_test_1c,
    }
)
display(train_x_2.head(), test_x_2.head())

Unnamed: 0,pred_1a(lgb),pred_1b(svr),pred_1c(nn)
0,22.299399,23.309511,23.427874
1,17.139813,17.297018,16.993597
2,17.799807,17.682712,18.570534
3,23.225657,24.450517,21.043411
4,17.539948,17.488732,17.429684


Unnamed: 0,pred_1a(lgb),pred_1b(svr),pred_1c(nn)
0,36.003875,35.822213,34.953388
1,27.402928,25.384798,28.00909
2,27.625162,27.28737,27.487497
3,32.040457,27.251684,29.33285
4,24.359053,23.253089,25.953278


In [51]:
# Level-2 (meta) model: a linear regression over the level-1 predictions,
# evaluated with the same cross-validation routine.
model_2 = Model2Linear()
pred_train_2, pred_test_2 = predict_cv(
    model=model_2,
    train_x=train_x_2,
    train_y=train_y,
    test_x=test_x_2,
)
score_2 = RMSE(y_true=train_y, y_pred=pred_train_2)
print(f'score: {score_2:.4f}')

score: 2.8652


# Create Submission

In [52]:
# Timestamp that tags every artefact written for this run.
dt = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')

# Two-column submission: ids from the sample file next to the level-2 test predictions.
# The two Series are aligned on their index by concat.
submission = pd.concat(
    [sample_sub['id'], pd.Series(pred_test_2, name='pred')],
    axis=1,
)

display(submission)

Unnamed: 0,id,pred
0,1,36.389143
1,2,27.531528
2,5,27.771422
3,6,31.155853
4,8,24.706675
...,...,...
495,992,27.616877
496,993,30.863691
497,996,17.734784
498,998,27.559746


In [68]:
# Persist the artefacts of this stacking run, all tagged with the run timestamp `dt`.
# Level-1 naming used in this notebook: 1a = LightGBM, 1b = SVR, 1c = neural network.


def _dump_pickle(obj, path):
    """Pickle `obj` to `path`, closing the file handle even if pickling fails."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


# Feature frames used by each level (1a and 1c both use the "le" features).
train_le.to_pickle('../features/feature_train_' + dt + '_stack_1a.pickle')
train_ohe.to_pickle('../features/feature_train_' + dt + '_stack_1b.pickle')
train_le.to_pickle('../features/feature_train_' + dt + '_stack_1c.pickle')
train_x_2.to_pickle('../features/feature_train_' + dt + '_stack_2.pickle')
test_le.to_pickle('../features/feature_test_' + dt + '_stack_1a.pickle')
test_ohe.to_pickle('../features/feature_test_' + dt + '_stack_1b.pickle')
test_le.to_pickle('../features/feature_test_' + dt + '_stack_1c.pickle')
test_x_2.to_pickle('../features/feature_test_' + dt + '_stack_2.pickle')

submission.to_csv('../data/output/sub_' + dt + '_stack_lgb-svr-nn_lr.csv', header=False, index=False)

# Model wrappers. predict_cv re-fits the same object per fold, so each wrapper
# holds the fit from the last fold.
# Fix: LightGBM is 1a and SVR is 1b; the two were previously written under each
# other's file names.
_dump_pickle(model_lgb, '../models/model_' + dt + '_stack_1a.pickle')
_dump_pickle(model_svr, '../models/model_' + dt + '_stack_1b.pickle')
_dump_pickle(model_nn, '../models/model_' + dt + '_stack_1c.pickle')
_dump_pickle(model_2, '../models/model_' + dt + '_stack_2.pickle')

# Hyperparameters.
# Fix: the NN params now go to '_stack1c'; they were previously written to
# '_stack1b', overwriting the SVR params saved on the line before.
_dump_pickle(model_lgb.model.get_params(), '../logs/params_' + dt + '_stack1a.pickle')
_dump_pickle(model_svr.model.get_params(), '../logs/params_' + dt + '_stack1b.pickle')
_dump_pickle(model_nn.params, '../logs/params_' + dt + '_stack1c.pickle')

# Out-of-fold RMSE scores.
_dump_pickle(score_1a, '../logs/train_score_' + dt + '_stack1a.pickle')
_dump_pickle(score_1b, '../logs/train_score_' + dt + '_stack1b.pickle')
_dump_pickle(score_1c, '../logs/train_score_' + dt + '_stack1c.pickle')
_dump_pickle(score_2, '../logs/train_score_' + dt + '_stack2.pickle')