In [1]:
# https://www.kaggle.com/c/house-prices-advanced-regression-techniques

# Stacking Starter based on Allstate Faron's Script
# https://www.kaggle.com/mmueller/allstate-claims-severity/stacking-starter/run/390867
# Preprocessing from Alexandru Papiu
# https://www.kaggle.com/apapiu/house-prices-advanced-regression-techniques/regularized-linear-models

import functools
import inspect
import math
import os.path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import skew

import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, Lasso
from sklearn.svm import SVR, NuSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_selection import f_regression, mutual_info_regression



In [2]:
# Notebook-wide settings.
TARGET = 'SalePrice'   # name of the label column in train.csv
NSPLITS = 5            # number of cross-validation folds
SEED = 0               # shared random seed
NROWS = None           # NOTE(review): not referenced in the visible cells
data_dir = functools.partial(os.path.join, 'data/')   # data_dir('x.csv') -> 'data/x.csv'
SUBMISSION_FILE = data_dir('sample_submission.csv')


## Load the data ##
train, test = (pd.read_csv(data_dir(name)) for name in ('train.csv', 'test.csv'))

# Pull the label column out of the training frame; models are fit on log(price + 1).
labels = train[TARGET]
y_train = np.log(labels + 1)
train = train.drop([TARGET], axis=1)

ntrain, ntest = train.shape[0], test.shape[0]

train.shape, test.shape, y_train.shape

((1460, 80), (1459, 80), (1460,))

In [3]:
# Quick look at the distinct LotArea values across train and test, ascending.
all_data = pd.concat([train, test])
values = np.sort(all_data['LotArea'].unique())
values

array([  1300,   1470,   1476, ..., 159000, 164660, 215245])

In [4]:
def digitize(all_data, column):
    """Map each value of ``all_data[column]`` to the bin index of its rank.

    The bin edges are the sorted distinct values of the column itself, so
    the result is what ``np.digitize`` returns for those edges (the smallest
    value maps to 1, the next distinct value to 2, and so on).
    """
    series = all_data[column]
    edges = np.sort(series.unique())
    return np.digitize(series, bins=edges)

In [5]:
## Preprocessing ##
#all_data = pd.concat([train, test])
#all_data.drop(['Id'], axis=1, inplace=True)

# Stack the feature columns of train and test so that the log transform,
# imputation and one-hot encoding below yield identical columns for both.
all_data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'],
                      test.loc[:,'MSSubClass':'SaleCondition']))

#for col in ['MSSubClass']: #, 'YearBuilt', 'YearRemodAdd', 'MoSold', 'YrSold']:
#    all_data[col] = digitize(all_data, col)

# Split columns by dtype family instead of comparing against the literal
# 'object' dtype, so string columns stored under another dtype are still
# treated as categorical.
numeric_feats = all_data.select_dtypes(include=[np.number]).columns
object_feats = all_data.select_dtypes(exclude=[np.number]).columns
#object_feats = np.hstack([object_feats, ['MSSubClass', 'YearBuilt', 'YearRemodAdd', 'MoSold', 'YrSold']])

# log transform skewed numeric features (skewness is measured on the training rows only):
skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.7]
skewed_feats = skewed_feats.index

all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
# numeric_only=True: the median is only defined for numeric columns; without
# it, recent pandas raises on the string columns instead of skipping them.
all_data = all_data.fillna(all_data.median(numeric_only=True))
all_data = pd.get_dummies(all_data, columns=object_feats)

#creating matrices for sklearn:
# Explicit float cast: newer pandas emits boolean dummy columns, and a frame
# mixing bool with numeric columns would otherwise come out as an object array.
data = all_data.values.astype(np.float64)
x_train = data[:ntrain]
x_test = data[ntrain:]


x_train.shape, x_test.shape

((1460, 288), (1459, 288))

In [6]:
# mi = mutual_info_regression(x_train, labels, random_state=SEED)
# mi /= np.max(mi)

# data = all_data.loc[:, mi > 0].values
# x_train = data[:ntrain]
# x_test = data[ntrain:]
# x_train.shape, x_test.shape

In [7]:
# Shuffled K-fold splitter shared by every get_oof() call, so all base models
# are evaluated on the same folds.
kf = KFold(n_splits=NSPLITS, random_state=SEED, shuffle=True)

class SklearnWrapper(object):
    """Uniform ``train``/``predict`` facade over a scikit-learn style estimator.

    Parameters
    ----------
    clf : callable
        Estimator class (or factory); it is invoked as ``clf(**params)``.
    seed : int, default 0
        Forwarded to the estimator as ``random_state`` when its constructor
        declares that parameter and ``params`` does not already set it.
        An explicit ``params['random_state']`` always wins.
    params : dict or None
        Constructor keyword arguments. The dict is copied, never mutated.
    """

    def __init__(self, clf, seed=0, params=None):
        # Work on a copy so the caller's dict can be reused for other wrappers.
        params = dict(params) if params else {}
        # Estimators without a random_state parameter must not receive one,
        # otherwise their constructor would raise.
        if self._accepts_random_state(clf):
            params.setdefault('random_state', seed)
        self.clf = clf(**params)

    @staticmethod
    def _accepts_random_state(clf):
        """Return True if ``clf``'s constructor declares ``random_state``."""
        try:
            return 'random_state' in inspect.signature(clf).parameters
        except (TypeError, ValueError):
            # Signature not introspectable: leave the parameters untouched.
            return False

    def train(self, x_train, y_train):
        """Fit the wrapped estimator on ``x_train`` / ``y_train``."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)


class XgbWrapper(object):
    """``train``/``predict`` facade over the native ``xgb.train`` API.

    Parameters
    ----------
    seed : int, default 0
        Stored under the ``'seed'`` key of the booster parameters,
        overriding any value supplied in ``params``.
    params : dict or None
        Booster parameters. The optional ``'nrounds'`` entry (default 250)
        is taken out and used as the number of boosting rounds. The dict is
        copied, so the caller's object is left untouched.
    """

    def __init__(self, seed=0, params=None):
        # Copy first: the seed override and the 'nrounds' pop below must not
        # leak into the caller's dict, and params=None must remain usable.
        self.param = dict(params) if params else {}
        self.param['seed'] = seed
        self.nrounds = self.param.pop('nrounds', 250)

    def train(self, x_train, y_train):
        """Train a booster for ``self.nrounds`` rounds on the given data."""
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        """Return the trained booster's predictions for ``x``."""
        return self.gbdt.predict(xgb.DMatrix(x))


def get_oof(clf):
    """Compute out-of-fold predictions for one base model.

    Reads the module-level ``x_train``, ``y_train``, ``x_test``, ``kf``,
    ``ntrain``, ``ntest`` and ``NSPLITS``. For every fold the model is
    retrained on the training part, predicts the held-out part of
    ``x_train`` and also predicts the whole of ``x_test``.

    Parameters
    ----------
    clf : object
        Wrapper exposing ``train(x, y)`` and ``predict(x)``.

    Returns
    -------
    (oof_train, oof_test) : tuple of ndarray
        Column vectors of shape ``(ntrain, 1)`` and ``(ntest, 1)``;
        ``oof_test`` is the mean of the per-fold test predictions.
    """
    # KFold yields positions. Indexing a pandas Series with an integer array
    # is a label lookup, which is only equivalent while the index is exactly
    # 0..n-1, so index a plain array view of the targets instead.
    targets = np.asarray(y_train)

    oof_train = np.zeros((ntrain,))
    oof_test_skf = np.empty((NSPLITS, ntest))

    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        clf.train(x_train[train_index], targets[train_index])

        oof_train[test_index] = clf.predict(x_train[test_index])
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)


# Hyper-parameters for the base models, one dict per estimator.

# ExtraTreesRegressor
et_params = dict(
    n_jobs=16,
    n_estimators=100,
    max_features=0.5,
    max_depth=12,
    min_samples_leaf=2,
    random_state=SEED,
)

# RandomForestRegressor
rf_params = dict(
    n_jobs=16,
    n_estimators=100,
    max_features=0.2,
    max_depth=12,
    min_samples_leaf=2,
    random_state=SEED,
)

# Native xgboost booster; 'nrounds' is consumed by XgbWrapper.
xgb_params = dict(
    seed=0,
    colsample_bytree=0.7,
    silent=1,
    subsample=0.7,
    learning_rate=0.075,
    objective='reg:linear',
    max_depth=4,
    num_parallel_tree=1,
    min_child_weight=1,
    eval_metric='rmse',
    nrounds=1000,
    random_state=SEED,
)

# Ridge
rd_params = dict(alpha=10, random_state=SEED)


# Lasso
ls_params = dict(alpha=0.005, random_state=SEED)

# KNeighborsRegressor
knr_params = dict(n_neighbors=10)

# Level-0 learners, each behind the common train/predict wrapper interface.
xg = XgbWrapper(params=xgb_params, seed=SEED)
et = SklearnWrapper(ExtraTreesRegressor, seed=SEED, params=et_params)
rf = SklearnWrapper(RandomForestRegressor, seed=SEED, params=rf_params)
rd = SklearnWrapper(Ridge, seed=SEED, params=rd_params)
ls = SklearnWrapper(Lasso, seed=SEED, params=ls_params)
svr = SklearnWrapper(NuSVR, seed=SEED)
knr = SklearnWrapper(KNeighborsRegressor, seed=SEED, params=knr_params)

# Out-of-fold predictions for the training rows plus fold-averaged test predictions.
xg_oof_train, xg_oof_test = get_oof(xg)
et_oof_train, et_oof_test = get_oof(et)
rf_oof_train, rf_oof_test = get_oof(rf)
rd_oof_train, rd_oof_test = get_oof(rd)
ls_oof_train, ls_oof_test = get_oof(ls)
svr_oof_train, svr_oof_test = get_oof(svr)
knr_oof_train, knr_oof_test = get_oof(knr)

# Cross-validated RMSE (on the log target) of every base model.
for template, oof in (
    ("XG-CV: {}", xg_oof_train),
    ("ET-CV: {}", et_oof_train),
    ("RF-CV: {}", rf_oof_train),
    ("RD-CV: {}", rd_oof_train),
    ("LS-CV: {}", ls_oof_train),
    ("SVR-CV: {}", svr_oof_train),
    ("KNR-CV: {}", knr_oof_train),
):
    print(template.format(np.sqrt(mean_squared_error(y_train, oof))))

# Level-1 features: one column per stacked model. SVR and KNN are scored
# above but not included in the stack.
# NOTE(review): this rebinds x_train / x_test, which get_oof() reads as
# globals; re-running this cell without re-running preprocessing would
# fit the base models on the stacked columns.
x_train = np.hstack([xg_oof_train, et_oof_train, rf_oof_train, rd_oof_train, ls_oof_train])
x_test = np.hstack([xg_oof_test, et_oof_test, rf_oof_test, rd_oof_test, ls_oof_test])

print("{},{}".format(x_train.shape, x_test.shape))

# Second-level learner: a shallow xgboost model fit on the stacked
# base-model predictions currently held in x_train / x_test.
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test)


xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.8,
    'silent': 1,
    'subsample': 0.6,
    'learning_rate': 0.01,
    'objective': 'reg:linear',
    'max_depth': 1,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'eval_metric': 'rmse',
}

res = xgb.cv(xgb_params, dtrain, num_boost_round=1500, nfold=NSPLITS, seed=SEED, stratified=False,
             early_stopping_rounds=25, verbose_eval=50, show_stdv=True)

# The history holds one row per retained boosting round and, with early
# stopping, ends at the best iteration, so its length is the number of
# rounds to train (subtracting one would drop the best round).
best_nrounds = res.shape[0]
# Read the held-out metric by column name: positional access depends on the
# column order of the returned frame and can pick up the train metric instead.
cv_mean = res['test-rmse-mean'].iloc[-1]
cv_std = res['test-rmse-std'].iloc[-1]

print('Ensemble-CV: {0}+{1}'.format(cv_mean, cv_std))

gbdt = xgb.train(xgb_params, dtrain, best_nrounds)

# Predictions are on the log(price + 1) scale; invert before writing.
submission = pd.read_csv(SUBMISSION_FILE)
submission[TARGET] = gbdt.predict(dtest)
saleprice = np.exp(submission[TARGET]) - 1
submission[TARGET] = saleprice
submission.to_csv(data_dir('xgstacker_starter_submission2.csv'), index=False)

XG-CV: 0.12257812904190073
ET-CV: 0.14555761485208646
RF-CV: 0.14241118123674068
RD-CV: 0.13182895036747352
LS-CV: 0.1432176643227663
SVR-CV: 0.2511859720627804
KNR-CV: 0.2520716462695527
(1460, 7),(1459, 7)
[0]	train-rmse:11.4159+0.00809367	test-rmse:11.4158+0.0328669
[50]	train-rmse:6.91854+0.00502096	test-rmse:6.91878+0.0340042
[100]	train-rmse:4.19619+0.00323592	test-rmse:4.19648+0.027922
[150]	train-rmse:2.54772+0.00181997	test-rmse:2.54795+0.0238884
[200]	train-rmse:1.55071+0.00150019	test-rmse:1.55098+0.0206877
[250]	train-rmse:0.948899+0.00119081	test-rmse:0.94926+0.0187307
[300]	train-rmse:0.587603+0.00147743	test-rmse:0.588662+0.0178137
[350]	train-rmse:0.373694+0.00186282	test-rmse:0.375705+0.0177032
[400]	train-rmse:0.25022+0.0024844	test-rmse:0.253451+0.0180608
[450]	train-rmse:0.182257+0.00333903	test-rmse:0.186978+0.0187799
[500]	train-rmse:0.147075+0.00397375	test-rmse:0.153134+0.0195568
[550]	train-rmse:0.129673+0.00438892	test-rmse:0.136977+0.0198599
[600]	train-rmse:

In [8]:
# Ensemble-CV: 0.12850675
# Ensemble-CV: 0.1237075 (0.12236)
# Ensemble-CV: 0.12549675 (0.12399)
# Ensemble-CV: 0.125022 (0.12381)
# Ensemble-CV: 0.12068925 (0.12133)