## Stacking Ensemble

Author: Abolfazl Ravanshad

This notebook covers my final model, built using a stacking ensemble approach. I replaced my own implementation with a more efficient and elegant implementation by [Eliot Barril](https://www.kaggle.com/eliotbarr/house-prices-advanced-regression-techniques/stacking-starter/code). I have employed the hyperparameters optimized in the previous step.

In [1]:
import numpy as np
import pandas as pd
from six.moves import cPickle as pickle
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
from math import sqrt
import seaborn as sns

from sklearn import datasets, linear_model
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, Lasso
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import cross_validation, metrics
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_squared_error
import xgboost as xgb

%matplotlib inline



In [2]:
# Load the cleaned train/test frames and the training labels from the local pickle.
# NOTE: pickle.load can execute arbitrary code -- only load files you produced yourself.
# NOTE(review): `datasets` rebinds the name imported by `from sklearn import datasets`.
with open('./house_price_clean.pickle', 'rb') as f:
    datasets = pickle.load(f)

# Remove the "Id" column from the feature frames. `axis` is passed by keyword
# (positional `axis` is rejected by recent pandas) and a new frame is returned
# instead of mutating the object stored in `datasets`.
x_train = datasets['train_dataset'].drop("Id", axis=1)
x_test = datasets['test_dataset'].drop("Id", axis=1)
y = datasets['train_labels']

# Column labels of the training features, kept for later reference.
names = list(x_train)

In [3]:
# Cross-validation configuration shared by every base model.
NFOLDS = 4
SEED = 0
NROWS = None  # not referenced by any of the cells shown in this notebook

# Switch to plain NumPy arrays so rows can be selected with fold index arrays.
x_train = np.array(x_train)
x_test = np.array(x_test)
y_train = np.array(y)
ntrain, ntest = x_train.shape[0], x_test.shape[0]

# Shuffled K-fold splitter over the training rows (legacy sklearn.cross_validation API).
kf = KFold(ntrain, n_folds=NFOLDS, shuffle=True, random_state=SEED)

class SklearnWrapper(object):
    """Adapter exposing the ``train``/``predict`` interface that ``get_oof`` calls.

    Parameters
    ----------
    clf : callable
        Estimator class (or factory); it is called as ``clf(**params)``.
    seed : int, optional
        Value passed to the estimator as ``random_state``.
    params : dict or None, optional
        Keyword arguments for the estimator. The dict is copied, so the
        caller's object is left untouched; ``None`` means "no extra arguments".
    """

    def __init__(self, clf, seed=0, params=None):
        # Copy before injecting the seed: the original wrote into the caller's
        # dict and raised TypeError when params was left at its default of None.
        estimator_kwargs = dict(params) if params is not None else {}
        estimator_kwargs['random_state'] = seed
        self.clf = clf(**estimator_kwargs)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator on ``x_train`` / ``y_train``."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)


class XgbWrapper(object):
    """Adapter exposing ``train``/``predict`` on top of the native ``xgb.train`` API.

    Parameters
    ----------
    seed : int, optional
        Stored in the booster parameters under the ``'seed'`` key.
    params : dict or None, optional
        Booster parameters. An optional ``'nrounds'`` entry is removed from the
        booster parameters and used as the number of boosting rounds
        (default 250). The dict is copied, so the caller's object is not modified.
    """

    def __init__(self, seed=0, params=None):
        # Work on a private copy. The original aliased the caller's dict, so
        # 'nrounds' was popped out of it and a second construction from the
        # same dict silently fell back to 250 rounds. It also failed on None.
        self.param = dict(params) if params is not None else {}
        self.param['seed'] = seed
        self.nrounds = self.param.pop('nrounds', 250)

    def train(self, x_train, y_train):
        """Train a booster for ``self.nrounds`` rounds on ``x_train`` / ``y_train``."""
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        """Return predictions of the trained booster for ``x``."""
        return self.gbdt.predict(xgb.DMatrix(x))
    
    
def get_oof(clf):
    """Compute out-of-fold predictions for one wrapped model.

    For every fold in the module-level ``kf`` the model is trained on the
    fold's training rows, then used to predict the held-out rows of
    ``x_train`` and all rows of ``x_test``.

    Parameters
    ----------
    clf : object
        Wrapper providing ``train(x, y)`` and ``predict(x)``.

    Returns
    -------
    tuple of np.ndarray
        ``(train_preds, test_preds)`` as column vectors: the held-out
        prediction for each training row, and the per-fold test predictions
        averaged over the ``NFOLDS`` folds.
    """
    holdout_preds = np.zeros((ntrain,))
    per_fold_test_preds = np.empty((NFOLDS, ntest))

    for fold_no, (fit_rows, holdout_rows) in enumerate(kf):
        clf.train(x_train[fit_rows], y_train[fit_rows])

        # Each training row is predicted by the model that did not see it.
        holdout_preds[holdout_rows] = clf.predict(x_train[holdout_rows])
        per_fold_test_preds[fold_no, :] = clf.predict(x_test)

    averaged_test_preds = per_fold_test_preds.mean(axis=0)
    return holdout_preds.reshape(-1, 1), averaged_test_preds.reshape(-1, 1)


In [4]:
# Keyword arguments for ExtraTreesRegressor.
et_params = dict(
    n_jobs=16,
    n_estimators=20,
    max_features=0.5,
    max_depth=12,
    min_samples_leaf=2,
)

# Keyword arguments for RandomForestRegressor.
rf_params = dict(
    n_jobs=16,
    n_estimators=20,
    max_features=0.2,
    max_depth=12,
    min_samples_leaf=2,
)

# Booster parameters for the XGBoost base model. 'nrounds' is consumed by
# XgbWrapper as the number of boosting rounds.
# NOTE(review): 'n_estimators' is presumably not used by the native xgb.train
# API that XgbWrapper calls -- confirm against the installed xgboost version.
xgb_params = dict(
    seed=0,
    colsample_bytree=0.9,
    silent=1,
    subsample=0.9,
    learning_rate=0.15,
    objective='reg:linear',
    max_depth=3,
    num_parallel_tree=1,
    min_child_weight=9,
    gamma=0.16,
    eval_metric='rmse',
    n_estimators=65,
    nrounds=500,
)

# Alternative XGBoost parameter set, currently disabled and kept for reference.
# xgb_params = {
#     'seed': 0,
#     'colsample_bytree': 0.94149279254005003,
#     'silent': 1,
#     'subsample': 0.88835472863514275,
#     'learning_rate': 0.14430373199625962,
#     'objective': 'reg:linear',
#     'max_depth': 3,
#     'num_parallel_tree': 1,
#     'min_child_weight': 6,
#     'gamma': 0.3597058218675111,
#     'eval_metric': 'rmse',
#     'n_estimators': 108,
#     'nrounds': 500
# }

# Keyword arguments for Ridge.
rd_params = dict(alpha=10)


# Keyword arguments for Lasso.
ls_params = dict(alpha=0.0005)


In [5]:
# Build the five base models, each seeded with SEED.
xg = XgbWrapper(seed=SEED, params=xgb_params)
et = SklearnWrapper(clf=ExtraTreesRegressor, seed=SEED, params=et_params)
rf = SklearnWrapper(clf=RandomForestRegressor, seed=SEED, params=rf_params)
rd = SklearnWrapper(clf=Ridge, seed=SEED, params=rd_params)
ls = SklearnWrapper(clf=Lasso, seed=SEED, params=ls_params)

# Out-of-fold train predictions and fold-averaged test predictions per model.
xg_oof_train, xg_oof_test = get_oof(xg)
et_oof_train, et_oof_test = get_oof(et)
rf_oof_train, rf_oof_test = get_oof(rf)
rd_oof_train, rd_oof_test = get_oof(rd)
ls_oof_train, ls_oof_test = get_oof(ls)

# RMSE of each model's out-of-fold predictions against the training target.
for tag, oof_pred in (
    ("XG", xg_oof_train),
    ("ET", et_oof_train),
    ("RF", rf_oof_train),
    ("RD", rd_oof_train),
    ("LS", ls_oof_train),
):
    print("{}-CV: {}".format(tag, sqrt(mean_squared_error(y_train, oof_pred))))

XG-CV: 0.129697124263
ET-CV: 0.148053748059
RF-CV: 0.143164318513
RD-CV: 0.111684899513
LS-CV: 0.111843738501


In [6]:
# Second-level feature matrices: one column per base model, built from the
# out-of-fold (train) and fold-averaged (test) predictions.
# They get their own names instead of overwriting x_train / x_test, because
# get_oof reads those globals: rebinding them made any later re-run of the
# base-model cell train on the stacked features instead of the raw ones.
stack_train = np.concatenate((xg_oof_train, et_oof_train, rf_oof_train, rd_oof_train, ls_oof_train), axis=1)
stack_test = np.concatenate((xg_oof_test, et_oof_test, rf_oof_test, rd_oof_test, ls_oof_test), axis=1)
print("{},{}".format(stack_train.shape, stack_test.shape))

dtrain = xgb.DMatrix(stack_train, label=y_train)
dtest = xgb.DMatrix(stack_test)

# Parameters of the second-level (stacker) booster.
# NOTE: this rebinds xgb_params, which the base-model cell also reads; re-run the
# hyperparameter cell before re-running the base models.
xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.8,
    'silent': 1,
    'subsample': 0.6,
    'learning_rate': 0.01,
    'objective': 'reg:linear',
    'max_depth': 1,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'eval_metric': 'rmse',
}


# 5-fold CV with early stopping to choose the number of boosting rounds.
res = xgb.cv(xgb_params, dtrain, num_boost_round=1000, nfold=5, seed=SEED, stratified=False,
             early_stopping_rounds=25, verbose_eval=10, show_stdv=True)

# The history holds one row per boosting round kept by xgb.cv, so the row count
# itself is the number of rounds to train (the original subtracted one).
best_nrounds = res.shape[0]
# Select the held-out metric by column name: the positional order of the
# train/test columns is not the same across xgboost versions.
cv_mean = res['test-rmse-mean'].iloc[-1]
cv_std = res['test-rmse-std'].iloc[-1]

print('Ensemble-CV: {0}+{1}'.format(cv_mean, cv_std))

(1457, 5),(1459, 5)
[0]	train-rmse:11.4162+0.0011278	test-rmse:11.4162+0.00466634
[10]	train-rmse:10.3275+0.000969269	test-rmse:10.3275+0.00481385
[20]	train-rmse:9.34289+0.000914576	test-rmse:9.34288+0.00484213
[30]	train-rmse:8.45248+0.00103699	test-rmse:8.45248+0.0047108
[40]	train-rmse:7.6472+0.000874302	test-rmse:7.6471+0.00477228
[50]	train-rmse:6.91887+0.000706384	test-rmse:6.91877+0.00469426
[60]	train-rmse:6.26005+0.000551085	test-rmse:6.25994+0.00464644
[70]	train-rmse:5.66407+0.000694996	test-rmse:5.66395+0.00458299
[80]	train-rmse:5.12506+0.000678689	test-rmse:5.12496+0.00463651
[90]	train-rmse:4.63733+0.000688459	test-rmse:4.63718+0.00466843
[100]	train-rmse:4.19631+0.00071005	test-rmse:4.19639+0.00455051
[110]	train-rmse:3.79734+0.000730347	test-rmse:3.79723+0.00500777
[120]	train-rmse:3.43651+0.000526384	test-rmse:3.4363+0.00537647
[130]	train-rmse:3.11025+0.000597768	test-rmse:3.1101+0.00554792
[140]	train-rmse:2.81495+0.000622854	test-rmse:2.81478+0.00568746
[150]	trai

In [7]:
# Train the final stacker on the full stacked training matrix, using the
# parameters and round count set up in the cross-validation cell.
gbdt = xgb.train(xgb_params, dtrain, best_nrounds)

# Use the sample submission as a template and overwrite its second column
# (read back as 'SalePrice' below) with the stacker's predictions.
submission = pd.read_csv('./sample_submission.csv')
submission.iloc[:, 1] = gbdt.predict(dtest)
# expm1 is the inverse of log1p -- presumably the target was log1p-transformed
# during data cleaning; confirm against the preprocessing notebook.
saleprice = np.expm1(submission['SalePrice'])
submission['SalePrice'] = saleprice
submission.to_csv('xgstacker_starter.sub.csv', index=None)

In [30]:
import csv
import sys

# Predictions mapped back with expm1 (inverse of log1p -- presumably the target
# was log1p-transformed upstream; confirm against the preprocessing step).
pred = np.expm1(gbdt.predict(dtest))

# Ids are generated as consecutive integers starting at `counter`.
# NOTE(review): assumes the test rows are ordered by Id starting at 1461 -- verify
# against sample_submission.csv.
# NOTE(review): '{:3f}' is "minimum width 3, default precision"; '{:.3f}' may have
# been intended. Left unchanged to keep the written values identical.
counter = 1461
list2 = [[counter + offset, '{:3f}'.format(value)] for offset, value in enumerate(pred)]

head = ['Id', 'SalePrice']

# The csv module needs a binary-mode file on Python 2 and a text-mode file
# opened with newline='' on Python 3.
if sys.version_info[0] < 3:
    submission_file = open('my_submission.csv', 'wb')
else:
    submission_file = open('my_submission.csv', 'w', newline='')

# The context manager guarantees the file is flushed and closed; the original
# never closed the handle, so the file on disk could be left incomplete.
with submission_file:
    wr = csv.writer(submission_file, quoting=csv.QUOTE_ALL)
    wr.writerow(head)
    wr.writerows(list2)