In [258]:
import pandas as pd
import numpy as np
import re
import sklearn
import xgboost as xgb # 勾配ブースティング
import seaborn as sns # オシャレなグラフ
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder

# Going to use these 5 base models for the stacking
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
from sklearn.svm import SVR
from sklearn.model_selection import KFold

import datetime

In [186]:
train = pd.read_csv('../data/input/train.csv')
test = pd.read_csv('../data/input/test.csv')
sample_sub = pd.read_csv('../data/input/sample_submit.csv', header=None, names=['id', 'mpg'])

train_test = pd.concat([train, test], ignore_index=True, sort=False)

# Preprocessing

## car name, car brand

In [187]:
train_test['car name'] = train_test['car name'].str.replace('vw', 'volkswagen')
train_test['car name'] = train_test['car name'].str.replace('vokswagen', 'volkswagen')
train_test['car name'] = train_test['car name'].str.replace('toyouta', 'toyota')
train_test['car name'] = train_test['car name'].str.replace('chevy', 'chevrolet')
train_test['car name'] = train_test['car name'].str.replace('datsun 200-sx', 'datsun 200sx')

train_test['car name'] = train_test['car name'].str.replace('datsun 210 mpg', 'datsun 210')
train_test['car name'] = train_test['car name'].str.replace('ford gran torino (sw)', 'ford gran torino')

In [188]:
train_test['car_brand'] = train_test['car name'].apply(lambda x: x.strip().split(' ')[0])

## horse power

In [189]:
train_test['horsepower'].replace('?', np.nan, inplace=True)
train_test['horsepower'] = train_test['horsepower'].astype(float)

In [190]:
# car name毎のhorse powerの平均値を計算(nanは計算外)
name_hp_mean = train_test.groupby('car name').horsepower.mean()
df_hp_isnull = train_test[train_test['horsepower'].isnull()]

# nan位置のtrainデータのhorse powerを置換する
for i in df_hp_isnull.index:
    train_test.loc[i, 'horsepower'] = name_hp_mean[df_hp_isnull.loc[i, 'car name']]
    
del df_hp_isnull, name_hp_mean

# Encoding

In [191]:
cat_cols = ['car name', 'car_brand']
for cols in cat_cols:
    train_test[cols] = LabelEncoder().fit_transform(train_test[cols])

del cat_cols

In [192]:
test = train_test[train_test['mpg'].isnull()].copy()
train = train_test.dropna(subset=['mpg'], axis=0).copy()

# Ensembling & Stacking

In [193]:
# 入力データの作成
y_train = train['mpg']
train = train.drop(['id', 'mpg'], axis=1)
test = test.drop(['id', 'mpg'], axis=1)

x_train = train.values # 学習データ
x_test = test.values # テストデータ

# パラメータ
ntrain = train.shape[0]
ntest = test.shape[0]
SEED = 0
NFOLDS = 5 # 5分割
kf = KFold(n_splits= NFOLDS, random_state=SEED)

In [136]:
# 各モデルのパラメータ
# Gradient Boosting
gb_params = {
    'n_estimators': 500,
     #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

# Support Vector Classifier 
svr_params = {
    'kernel' : 'rbf',
    'C' : 0.025
    }

In [228]:
svr_oof_train = np.zeros((ntrain,))
svr_oof_test = np.zeros((ntest,))
svr_oof_test_skf = np.empty((NFOLDS, ntest))

clf = SVR(**svr_params)

for i, (train_index, valid_index) in enumerate(kf.split(train)): # NFOLDS回まわる
    x_tr = train.loc[train_index]
    y_tr = y_train[train_index]
    x_va = train.loc[valid_index]

    clf.fit(x_tr, y_tr)

    svr_oof_train[valid_index] = clf.predict(x_va)# 各validationの予測結果
    svr_oof_test_skf[i, :] = clf.predict(test)

svr_oof_test[:] = svr_oof_test_skf.mean(axis=0)# 各Foldでテストデータを予測した結果

svr_oof_train.reshape(-1,1)
svr_oof_test.reshape(-1,1)
print("svr_oof_train", svr_oof_train.shape)
print("svr_oof_test", svr_oof_test.shape)

svr_oof_train (500,)
svr_oof_test (500,)


In [229]:
gb_oof_train = np.zeros((ntrain,))
gb_oof_test = np.zeros((ntest,))
gb_oof_test_skf = np.empty((NFOLDS, ntest))

clf = GradientBoostingRegressor(**gb_params)

for i, (train_index, valid_index) in enumerate(kf.split(train)): # NFOLDS回まわる
    x_tr = train.loc[train_index]
    y_tr = y_train[train_index]
    x_va = train.loc[valid_index]

    clf.fit(x_tr, y_tr)

    gb_oof_train[valid_index] = clf.predict(x_va)# 各validationの予測結果
    gb_oof_test_skf[i, :] = clf.predict(test)

gb_oof_test[:] = gb_oof_test_skf.mean(axis=0)# 各Foldでテストデータを予測した結果

gb_oof_train.reshape(-1,1)
gb_oof_test.reshape(-1,1)
print("gb_oof_train", gb_oof_train.shape)
print("gb_oof_test", gb_oof_test.shape)

gb_oof_train (500,)
gb_oof_test (500,)


# Second-Level Predictions

In [242]:
x_train = pd.DataFrame({
    "GradientBoost": gb_oof_train.ravel(),
    "SupportVector": svr_oof_train.ravel()
})
x_train

Unnamed: 0,GradientBoost,SupportVector
0,21.344776,24.775204
1,16.608866,24.332946
2,17.766229,24.761386
3,25.616068,24.981840
4,17.570066,24.950765
...,...,...
495,28.523433,26.384334
496,36.693581,26.641894
497,17.659874,26.379574
498,28.538128,26.679394


In [249]:
x_test = pd.DataFrame({"GradientBoost": gb_oof_test.tolist(), "SupportVector": svr_oof_test.tolist()})
x_test

Unnamed: 0,GradientBoost,SupportVector
0,35.870562,26.090603
1,28.385977,25.780799
2,27.844321,25.743488
3,34.674462,25.733766
4,22.879623,25.623459
...,...,...
495,28.420694,25.344253
496,29.124737,25.772707
497,18.211618,25.717196
498,27.263115,25.743488


In [250]:
gbm = xgb.XGBRegressor(
    n_estimators=2000,
    max_depth=4,
    learning_rate=0.1,
    min_child_weight=2,
    gamma=0.9
)
gbm.fit(x_train, y_train)
pred = gbm.predict(x_test)

In [256]:
submission = pd.DataFrame({'id':sample_sub.id})
submission = pd.concat(
    [submission, pd.Series(pred, name='pred')],
    axis=1
)
display(submission)

Unnamed: 0,id,pred
0,1,35.320442
1,2,32.382111
2,5,26.847887
3,6,39.124859
4,8,23.472961
...,...,...
495,992,31.460735
496,993,32.610085
497,996,18.743475
498,998,29.618065


In [259]:
dt = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')

submission[['id', 'pred']].to_csv('../data/output/sub_' + dt + '_ensambl_gb_svr_xgb.csv', header=False, index=False)