In [169]:
import pandas as pd
import numpy as np
import pprint
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import KFold

import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

from functools import partial
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
from sklearn.model_selection import KFold, cross_validate
from sklearn.metrics import mean_squared_error as MSE, make_scorer

# importance
import eli5
from eli5.sklearn import PermutationImportance

# other
import pickle
import time
import datetime
#import sys
#import gc

In [179]:
# header=None makes pandas treat the first line as data, so the two column
# names are supplied explicitly.
sample_sub = pd.read_csv(
    '../data/input/sample_submit.csv',
    names=['id', 'mpg'],
    header=None,
)

In [138]:
# Load the tuned level-1 estimators saved by earlier runs.
# NOTE(security): unpickling executes code stored in the file -- only load
# model files that were produced by this project.
# `with` guarantees the file handles are closed (the bare open() calls leaked them).
with open('../models/model_2020-11-14-10-53-25_svr.pickle', 'rb') as fh:
    model_svr = pickle.load(fh)
pprint.pprint(model_svr)

with open('../models/model_2020-11-14-09-20-22_lightgbm.pickle', 'rb') as fh:
    model_lgb = pickle.load(fh)
pprint.pprint(model_lgb)

SVR(C=9.961505666633057, epsilon=0.07917802484774836, gamma=0.07579803763426289)
LGBMRegressor(alpha=2.696306116587096e-08, bagging_fraction=0.9500000000000001,
              feature_fraction=0.6000000000000001, gamma=0.1,
              lambda=0.47886902858405855, max_depth=7, min_child_weight=2.0,
              min_data_in_leaf=6, num_leaves=30)


In [139]:
def _load_pickle(path):
    """Unpickle the object stored at `path`, closing the file handle afterwards.

    NOTE(security): unpickling executes code stored in the file -- only use
    this on feature files produced by this project.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Feature frames prepared for the SVR model.
train_svr = _load_pickle('../features/feature_train_2020-11-14-10-53-25_svr.pkl')
test_svr = _load_pickle('../features/feature_test_2020-11-14-10-53-25_svr.pkl')
# The test frame keeps only model inputs: the 'id' and 'mpg' columns are removed.
test_svr = test_svr.drop(['id', 'mpg'], axis=1)

# Feature frames prepared for the LightGBM model.
train_lgb = _load_pickle('../features/feature_train_2020-11-14-10-39-58_lgb.pkl')
test_lgb = _load_pickle('../features/feature_test_2020-11-14-10-39-58_lgb.pkl')
test_lgb = test_lgb.drop(['id', 'mpg'], axis=1)

In [140]:
# Model inputs: every column except the row id and the target.
NON_FEATURE_COLUMNS = ['id', 'mpg']
train_x_svr = train_svr.drop(columns=NON_FEATURE_COLUMNS)
train_x_lgb = train_lgb.drop(columns=NON_FEATURE_COLUMNS)

# The target is read from the LightGBM frame and reused for both models.
# NOTE(review): this assumes train_svr and train_lgb share the same row order -- confirm.
train_y = train_lgb['mpg']

In [141]:
def RMSE(y_true, y_pred):
    """Return the root-mean-squared error, i.e. the square root of ``MSE(y_true, y_pred)``."""
    mean_sq_err = MSE(y_true, y_pred)
    return np.sqrt(mean_sq_err)

# Stacking

In [142]:
# Row counts of the stacking inputs.
# The original cell read `train_x` / `test`, which are never defined in this
# notebook (they only existed as leftover kernel state); the LightGBM frames
# loaded above are used instead.
ntrain = train_x_lgb.shape[0]
ntest = test_lgb.shape[0]

# Both level-1 models must see the same rows, otherwise their predictions
# cannot be placed side by side as level-2 features.
assert train_x_svr.shape[0] == ntrain, 'SVR and LightGBM train frames differ in length'
assert test_svr.shape[0] == ntest, 'SVR and LightGBM test frames differ in length'

SEED = 0    # random_state of the shuffled KFold in predict_cv
NFOLDS = 5  # number of folds used by predict_cv

In [155]:
def predict_cv(model, train_x, train_y, test_x):
    """Produce out-of-fold train predictions and fold-averaged test predictions.

    The training rows are split with a shuffled ``KFold`` (``NFOLDS`` splits,
    ``random_state=SEED``). For every fold the model is fit on the training
    part, then used to predict both the held-out part and the full test set.

    Parameters
    ----------
    model : object exposing ``fit(x, y)`` and ``predict(x)``.
        It is refit in place on every fold, so on return it holds the fit
        from the last fold only.
    train_x, train_y : objects indexed positionally via ``.iloc``.
    test_x : inputs passed unchanged to ``model.predict`` in every fold.

    Returns
    -------
    pred_train : held-out predictions, reordered to the row order of ``train_x``.
    preds_test : element-wise mean of the per-fold test predictions.
    """
    splitter = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

    oof_chunks = []         # held-out predictions, one array per fold
    test_chunks = []        # full test-set predictions, one array per fold
    holdout_positions = []  # row positions each held-out chunk belongs to

    for fit_rows, holdout_rows in splitter.split(train_x):
        model.fit(train_x.iloc[fit_rows], train_y.iloc[fit_rows])

        oof_chunks.append(model.predict(train_x.iloc[holdout_rows]))
        test_chunks.append(model.predict(test_x))
        holdout_positions.append(holdout_rows)

    # Folds arrive in shuffled order; sorting by row position puts every
    # held-out prediction back next to the training row it was made for.
    restore_order = np.argsort(np.concatenate(holdout_positions))
    pred_train = np.concatenate(oof_chunks, axis=0)[restore_order]

    preds_test = np.mean(test_chunks, axis=0)

    return pred_train, preds_test

In [156]:
# Level-1 predictions: "1a" is the LightGBM model, "1b" is the SVR model.
# Each model is evaluated on its own feature frame against the shared target.
pred_train_1a, pred_test_1a = predict_cv(
    model=model_lgb, train_x=train_x_lgb, train_y=train_y, test_x=test_lgb
)
pred_train_1b, pred_test_1b = predict_cv(
    model=model_svr, train_x=train_x_svr, train_y=train_y, test_x=test_svr
)

In [191]:
# RMSE of each level-1 model on its out-of-fold predictions.
score_1a, score_1b = (
    RMSE(train_y, oof_pred) for oof_pred in (pred_train_1a, pred_train_1b)
)
print(f'model_1a(lgb) score: {score_1a:.4f}')
print(f'model_1b(svr) score: {score_1b:.4f}')

model_1a(lgb) score: 2.9136
model_1b(svr) score: 3.9382


In [173]:
# Linear model
class Model2Linear:
    """Standardise the inputs, then fit a ``LinearRegression`` on them.

    Exposes the ``fit`` / ``predict`` pair that ``predict_cv`` calls.
    """

    def __init__(self):
        # Both are created in fit(); until then the instance cannot predict.
        self.model = None
        self.scaler = None

    def fit(self, tr_x, tr_y):
        """Fit a fresh scaler on ``tr_x`` and a fresh regression on the scaled data."""
        self.scaler = StandardScaler()
        scaled_x = self.scaler.fit(tr_x).transform(tr_x)
        self.model = LinearRegression()
        self.model.fit(scaled_x, tr_y)

    def predict(self, x):
        """Scale ``x`` with the scaler fitted in ``fit`` and return the predictions."""
        return self.model.predict(self.scaler.transform(x))

In [171]:
# Build data frames that use the level-1 predictions as features.
train_x_2 = pd.DataFrame(dict(pred_1a=pred_train_1a, pred_1b=pred_train_1b))
test_x_2 = pd.DataFrame(dict(pred_1a=pred_test_1a, pred_1b=pred_test_1b))

In [164]:
# Inspect the level-2 training features (one column per level-1 model).
train_x_2

Unnamed: 0,pred_1a,pred_1b
0,22.299399,23.309511
1,17.139813,17.297018
2,17.799807,17.682712
3,23.225657,24.450517
4,17.539948,17.488732
...,...,...
495,28.217359,27.648859
496,36.557854,30.978024
497,17.585631,17.774491
498,28.768540,28.481582


In [166]:
# Inspect the level-2 test features (one column per level-1 model).
test_x_2

Unnamed: 0,pred_1a,pred_1b
0,36.003875,35.822213
1,27.402928,25.384798
2,27.625162,27.287370
3,32.040457,27.251684
4,24.359053,23.253089
...,...,...
495,27.760863,24.634393
496,31.604547,24.234138
497,17.877298,18.413337
498,27.294498,27.287143


In [194]:
# Level 2: a linear model trained on the two level-1 prediction columns,
# evaluated with the same cross-validation routine as the level-1 models.
model_2 = Model2Linear()
pred_train_2, pred_test_2 = predict_cv(
    model=model_2, train_x=train_x_2, train_y=train_y, test_x=test_x_2
)

score_2 = RMSE(train_y, pred_train_2)
print(f'score: {score_2:.4f}')

score: 2.9187


# Create Submission

In [186]:
# Timestamp tag reused in the name of every artifact written for this run.
run_started = datetime.datetime.now()
dt = run_started.strftime('%Y-%m-%d-%H-%M-%S')

# Put the ids from the sample submission next to the stacked test predictions.
# Both pieces carry a default positional index, so concat pairs them row by row.
submission = pd.concat(
    [sample_sub[['id']], pd.Series(pred_test_2, name='pred')],
    axis=1,
)

display(submission)

Unnamed: 0,id,pred
0,1,36.277695
1,2,27.140216
2,5,27.601084
3,6,31.549348
4,8,24.121488
...,...,...
495,992,27.357394
496,993,30.743689
497,996,17.651931
498,998,27.304997


In [200]:
def _dump_pickle(obj, path):
    """Pickle `obj` to `path`; the `with` block flushes and closes the file."""
    with open(path, 'wb') as fh:
        pickle.dump(obj, fh)


# Labelling used throughout this notebook: 1a = LightGBM, 1b = SVR
# (see pred_*_1a / pred_*_1b and the printed scores). The original cell saved
# the SVR artifacts under "1a" and the LightGBM ones under "1b", which
# contradicted the score files; all artifacts now follow 1a = LightGBM.
train_lgb.to_pickle('../features/feature_train_' + dt + '_stack_1a.pickle')
train_svr.to_pickle('../features/feature_train_' + dt + '_stack_1b.pickle')
train_x_2.to_pickle('../features/feature_train_' + dt + '_stack_2.pickle')
test_lgb.to_pickle('../features/feature_test_' + dt + '_stack_1a.pickle')
test_svr.to_pickle('../features/feature_test_' + dt + '_stack_1b.pickle')
test_x_2.to_pickle('../features/feature_test_' + dt + '_stack_2.pickle')

# Written without header or index, matching how sample_submit.csv is read.
submission.to_csv('../data/output/sub_' + dt + '_stack_svr-lgb_lr.csv', header=False, index=False)

# NOTE: predict_cv refits the model object it is given on every fold, so the
# objects saved here carry the fit from the last fold, not a fit on all rows.
_dump_pickle(model_lgb, '../models/model_' + dt + '_stack_1a.pickle')
_dump_pickle(model_svr, '../models/model_' + dt + '_stack_1b.pickle')
_dump_pickle(model_2, '../models/model_' + dt + '_stack_2.pickle')

_dump_pickle(model_lgb.get_params(), '../logs/params_' + dt + '_stack1a.pickle')
_dump_pickle(model_svr.get_params(), '../logs/params_' + dt + '_stack1b.pickle')

_dump_pickle(score_1a, '../logs/train_score_' + dt + '_stack1a.pickle')
_dump_pickle(score_1b, '../logs/train_score_' + dt + '_stack1b.pickle')
_dump_pickle(score_2, '../logs/train_score_' + dt + '_stack2.pickle')