In [23]:
# future functions
from __future__ import print_function 

# core scipy and numpy
import numpy as np
import scipy as sp

# pandas 
import pandas as pd
from IPython.display import display
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 135

# encoder
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# manifold for embedding analysis
from sklearn import manifold

# Cross validation 
from sklearn.model_selection import KFold

# Linear models 
from sklearn import linear_model

# Forests
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

# SVR
from sklearn.svm import SVR

# KNN
from sklearn.neighbors import KNeighborsRegressor

# PCA
from sklearn.decomposition import PCA

# vowpal wabbit
from vowpalwabbit.sklearn_vw import VWRegressor

# combinations with categorical features
from itertools import combinations

# matplotlib 
import matplotlib.pyplot as plt
%matplotlib inline

# python helpers 
from collections import namedtuple
from copy import copy

# neural network
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Merge, Reshape
from keras.layers.embeddings import Embedding

# XGboost for gradient-boosted decision trees
import xgboost as xgb

# logging
import logging
logging.basicConfig()

In [2]:
loadfile = lambda x: pd.read_csv(x, index_col=0)

In [3]:
Files = namedtuple('Files',['train','test'])
RawData = namedtuple('RawData',['train','test'])
ProcessedData = namedtuple('ProcessedData',['train','test'])

#### Load data 

In [4]:
rawfiles = Files(train='Data/train.csv.gz',
            test='Data/test.csv.gz')
raw = RawData(train=loadfile(rawfiles.train),
              test=loadfile(rawfiles.test))

In [5]:
raw.train.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


#### Outliers 

In [6]:
outliers = [  30,   88,  197,  462,  495,  523,  557,  632,  691,  825,  874,
             898,  968,  970, 1169, 1170, 1182, 1423, 1432, 1453 ] # from outlier analysis
outlier_ids = raw.train.index.isin(outliers)
# processed = ProcessedData(train=raw.train.loc[~outlier_ids ,:],
processed = ProcessedData(train=raw.train,
                    test=raw.test)

#### Categorical combinations

In [7]:
# create lookup for numeric categorical columns
numeric_categorical = [
    'MSSubClass',
    'YearBuilt',
    'YearRemodAdd',
    'MoSold',
    'YrSold',
    'GarageYrBlt',
    'OverallQual',
    'OverallCond',
    'MiscVal',
]
categorical_cols = list(processed.train.dtypes[processed.train.dtypes == "object"].index)
categorical_cols += numeric_categorical
continuous_cols = list(processed.train.dtypes[processed.train.dtypes != "object"].index)
continuous_cols = [ c for c in continuous_cols if c not in numeric_categorical and c!='SalePrice' ]

for c1,c2 in combinations(categorical_cols, 2):
    newc = c1+'_'+c2
    processed.train.loc[:, newc] = processed.train[c1].astype(str) + '_' + processed.train[c2].astype(str)
    processed.test.loc[:, newc] = processed.test[c1].astype(str) + '_' + processed.test[c2].astype(str)
    categorical_cols.append(newc)

#### Preprocess categorical data

In [8]:
# concatenate all data 
cols = [ c for c in processed.train.columns if c != 'SalePrice' ]
full = pd.concat((processed.train.loc[:, cols],
                     processed.test))

# store important objects
label_encoders = {}
categorical_col_nums = []
medians = {}

# encode data and store objects
i = 0

for col, col_data in full.iteritems():
    
    # handle categorical columns 
    if col_data.dtype == object or col in numeric_categorical:
        le = LabelEncoder()
        col_data = le.fit_transform(col_data.fillna('Unk'))
        categorical_col_nums.append(full.columns.get_loc(col))
        label_encoders[col] = le
    
    # handle continuous columns
    else:
        # handle nulls with median
        m = np.median(col_data.dropna())
        medians[col] = m
        
        col_data.loc[np.isnan(col_data)] = m
    
    # store objects
    full[col] = col_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


#### Log transfoms

In [9]:
get_skew = lambda x: sp.stats.skewtest(x.dropna())[1]<0.05
for c in continuous_cols:
    if get_skew(processed.train[c]):
        processed.train.loc[:, c] = np.log1p(processed.train[c])
        processed.test.loc[:, c] = np.log1p(processed.test[c])

#### Setup standard scaler and one hot encoder

In [10]:
# standard scaler
continous_scaler = StandardScaler()
c = continous_scaler.fit_transform(full.loc[:, continuous_cols].as_matrix())

# minmax 
mm_scaler = MinMaxScaler()
_ = mm_scaler.fit(c)

# create one hot encoder
ohe = OneHotEncoder(categorical_features=categorical_col_nums)
o = ohe.fit_transform(full.as_matrix())

# pca for continuous columns
pca_cont_n = 1
pca_cont = PCA(n_components=pca_cont_n)
_ = pca_cont.fit(c)

# pca for ohe columns
pca_ohe_n = 1
pca_ohe = PCA(n_components=pca_ohe_n)
_ = pca_ohe.fit(o.A)

#### Categorical field analysis

In [11]:
categorical_field_analysis = []
for c in categorical_cols:
    # get pct freq coverage of top value 
    v = full[c].fillna('Unk').value_counts()
    first_pct = v.iloc[0]*1.0/sum(v)

    # get count of distinct values
    distinct_vals = set(full[c].values)
    d_cnt = len(distinct_vals)

    # calculate logical freq  
    logical_pct = 1.0/d_cnt
        
    # append
    categorical_field_analysis.append((c, first_pct, logical_pct, d_cnt))

categorical_field_analysis = pd.DataFrame(categorical_field_analysis,
                                 columns=['Cat_Col',
                                          'First_Freq',
                                          'Logical_Freq',
                                          'Distinct_Val_Cnt',])

In [12]:
@np.vectorize
def choose_m(freq, n):
    for i in range(1,n+1):
        if freq>=(1-float(i)/n):
            x = i
            break
    return x

In [13]:
categorical_field_analysis['m'] =\
choose_m(categorical_field_analysis.First_Freq,\
categorical_field_analysis.Distinct_Val_Cnt)

In [14]:
display(categorical_field_analysis)

Unnamed: 0,Cat_Col,First_Freq,Logical_Freq,Distinct_Val_Cnt,m
0,MSZoning,0.775951,0.166667,6,2
1,Street,0.995889,0.5,2,1
2,Alley,0.932169,0.333333,3,1
3,LotShape,0.636862,0.25,4,2
4,LandContour,0.898253,0.25,4,1
5,Utilities,0.998972,0.333333,3,1
6,LotConfig,0.73073,0.2,5,2
7,LandSlope,0.951696,0.333333,3,1
8,Neighborhood,0.151764,0.04,25,22
9,Condition1,0.860226,0.111111,9,2


### NN model with entity embedding

Choose reshape size for each categorical column. Push continuous columns through, as-is.

#### model topology

In [15]:
def gen_emb_nn_model(seed=2):
    np.random.seed(seed)
    models = []

    for i, vals in categorical_field_analysis.T.iteritems():

        # gather reshaping components
        m = vals.Distinct_Val_Cnt
        new_m = vals.m

        # special cases 
        if vals.Cat_Col == 'YearBuilt':
            new_m = 20
        elif vals.Cat_Col == 'GarageYrBlt':
            new_m = 10
        elif vals.Cat_Col == 'YearRemodAdd':
            new_m = 10
        elif vals.Cat_Col == 'Neighborhood':
            new_m = 15
        elif vals.Cat_Col == 'HouseStyle':
            new_m = 6

        # create embedding for each feature
        entity_model = Sequential()
        entity_model.add(Embedding(m, new_m, input_length=1))
        entity_model.add(Reshape(target_shape=(new_m,)))
        models.append(entity_model)

    n,m = full[continuous_cols].shape
    m += pca_cont_n
    continuous_model = Sequential()
    continuous_model.add(Dense(m, input_dim=m))
    models.append(continuous_model)

    emb_model = Sequential()
    emb_model.add(Merge(models, mode='concat'))
    emb_model.add(Dropout(0.1))
    emb_model.add(Dense(64, init='uniform'))
    emb_model.add(Activation('relu'))
    emb_model.add(Dropout(0.3))
    emb_model.add(Dense(64, init='uniform'))
    emb_model.add(Activation('relu'))
    emb_model.add(Dropout(0.3))
    emb_model.add(Dense(32, init='uniform'))
    emb_model.add(Activation('relu'))
    emb_model.add(Dropout(0.1))
    emb_model.add(Dense(1))
    emb_model.add(Activation('sigmoid'))
    emb_model.compile(loss='mean_squared_error', optimizer='adam')
    return copy(emb_model)

#### Vanilla NN

In [16]:
# vanilla NN 
def gen_vanilla_nn(seed=2):
    np.random.seed(seed)
    n, m = preprocessing_X(X_train).shape
    vanilla_nn_model = Sequential()
    vanilla_nn_model.add(Dense(1500, init='uniform', input_shape=(m,)))
    vanilla_nn_model.add(Activation('relu'))
    vanilla_nn_model.add(Dropout(0.2))
    vanilla_nn_model.add(Dense(750, init='uniform'))
    vanilla_nn_model.add(Activation('relu'))
    vanilla_nn_model.add(Dropout(0.1))
    vanilla_nn_model.add(Dense(1))
    vanilla_nn_model.add(Activation('sigmoid'))
    vanilla_nn_model.add(Dropout(0.01))
    vanilla_nn_model.compile(loss='mean_squared_error', optimizer='adam')
    return copy(vanilla_nn_model)

#### Preprocessing functions

In [17]:
_y = np.log1p(processed.train.SalePrice)
max_y = np.max(_y)


def nn_preprocessing_X(X_dat):
    X_out = []
    
    # categorical columns
    for c in categorical_cols:
        d = X_dat.loc[:, c].fillna('Unk').as_matrix()
        dt = label_encoders[c].transform(d)
        X_out.append(dt)
    
    # continuous columns
    for c in continuous_cols:
        X_dat.loc[np.isnan(X_dat[c]), c] = medians[c]
    continuous_ss = continous_scaler.transform(X_dat.loc[:, continuous_cols].as_matrix())
    continuous_mm = mm_scaler.transform(continuous_ss)
    
    # pca continuous 
    pca_Xdat_cont = pca_cont.transform(continuous_mm)
    
    X_out.append(np.concatenate((continuous_mm, pca_Xdat_cont), axis=1))
    return X_out


def preprocessing_X(X_dat):
    X_dat = X_dat.copy()
    
    # categorical columns
    for c in categorical_cols:
        d = X_dat.loc[:, c].fillna('Unk').as_matrix()
        dt = label_encoders[c].transform(d)
        X_dat.loc[:, c] = dt
    
    # continuous columns
    for c in continuous_cols:
        X_dat.loc[np.isnan(X_dat[c]), c] = medians[c]
    continuous_ss = continous_scaler.transform(X_dat.loc[:, continuous_cols].as_matrix())
    continuous_mm = mm_scaler.transform(continuous_ss)
    X_dat.loc[:, continuous_cols] = continuous_mm
    
    # pca continuous 
    pca_Xdat_cont = pca_cont.transform(continuous_mm)
                             
    # one hot encode 
    X_dat = ohe.transform(X_dat.as_matrix()).A
    
    # pca ohe 
    pca_Xdat_ohe = pca_ohe.transform(X_dat)
    
    return np.concatenate((X_dat, pca_Xdat_cont, pca_Xdat_ohe), axis=1)


def preprocessing_Y(y_dat):
    return np.log(y_dat.values+1)/max_y

### Modeling

In [18]:
processed.train.shape 

(1460, 80)

In [19]:
ntrain = processed.train.shape[0]
ntest = processed.test.shape[0]

#### Wrapper functions

In [20]:
class SklearnWrapper(object):
    def __init__(self, clf, seed=2, params={}):
        try:
            params['random_state'] = seed
            self.clf = clf(**params)
        except:
            del params['random_state'] 
            self.clf = clf(**params)

    def train(self, x_train, y_train):
        xtr = preprocessing_X(x_train)
        ytr = preprocessing_Y(y_train).ravel()
        self.clf.fit(xtr, ytr)

    def predict(self, x):
        xte = preprocessing_X(x)
        return self.clf.predict(xte)


class XgbWrapper:
    def __init__(self, seed=2, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 250)

    def train(self, x_train, y_train):
        xtr = preprocessing_X(x_train)
        ytr = preprocessing_Y(y_train).ravel()
        dtrain = xgb.DMatrix(xtr, label=ytr)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        xte = preprocessing_X(x)
        return self.gbdt.predict(xgb.DMatrix(xte))


class NnWrapper:
    def __init__(self, model, emb=True, nb_epoch=16, batch_size=8):
        self.model = copy(model)
        self.nb_epoch = nb_epoch
        self.batch_size = batch_size
        self.emb = emb
    
    def train(self, x_train, y_train):
        if self.emb:
            xtr = nn_preprocessing_X(x_train)
        else:
            xtr = preprocessing_X(x_train)
        ytr = preprocessing_Y(y_train).ravel()
        self.model.fit(xtr, ytr, 
                 nb_epoch = self.nb_epoch,
                 batch_size = self.batch_size,
                 verbose = 0)
    
    def predict(self, x):
        if self.emb:
            xte = nn_preprocessing_X(x)
        else:
            xte = preprocessing_X(x)
        return self.model.predict(xte).ravel()


def get_oof(clf):
    '''
        via:
        https://www.kaggle.com/eliotbarr/house-prices-
        advanced-regression-techniques/stacking-starter/code
    '''
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf.split(X_train)):
        x_tr = X_train.iloc[train_index]
        y_tr = y_train.iloc[train_index]
        x_te = X_train.iloc[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(X_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)


get_rmse = lambda x,y: np.sqrt(np.mean(((x.ravel()*max_y) - np.log(y.ravel()+1))**2))

#### Train models

In [21]:
# set up training and test data
cols = [c for c in processed.train.columns if c != 'SalePrice' ]
X_train = processed.train.loc[:, cols ]
y_train = processed.train['SalePrice']
X_test = processed.test.loc[:, cols]

# set up model arrays 
train_models = []
test_models = []

# seeds 
seeds = np.random.choice(range(100), 5, replace=False)

# loop
for s in seeds:
    ## set up K Folds ##
    NFOLDS = 4
    kf = KFold(n_splits=NFOLDS, 
               shuffle=True, 
               random_state=s)

    ## Categorical embedding NN models ##
    emb_model = gen_emb_nn_model(seed=s)
    NN = NnWrapper(emb_model, emb=True)
    nn_oof_train, nn_oof_test = get_oof(NN)
    train_models.append(nn_oof_train)
    test_models.append(nn_oof_test)
    logging.warn('NNE score, {}: {}'.format(s, get_rmse(nn_oof_train, y_train)))

    ## Vanilla NN models ##
    vanilla_nn_model = gen_vanilla_nn(seed=s)
    NN2 = NnWrapper(vanilla_nn_model, emb=False)
    nn2_oof_train, nn2_oof_test = get_oof(NN2)
    train_models.append(nn2_oof_train)
    test_models.append(nn2_oof_test)
    logging.warn('NNV score, {}: {}'.format(s, get_rmse(nn2_oof_train, y_train)))

    ## XGBoost ##
    base_xgb_params ={
        'colsample_bytree': 0.75 ,
        'silent': 1 ,
        'subsample': 0.5 ,
        'learning_rate': 0.05 ,
        'objective': 'reg:linear' ,
        'max_depth': 4 ,
        'num_parallel_tree': 1 ,
        'min_child_weight': 1 ,
        'eval_metric': 'rmse' ,
        'nrounds': 700 ,
    }
    base_xgb_params['seed'] = 2
    XG = XgbWrapper(params=base_xgb_params)
    xgb_oof_train, xgb_oof_test = get_oof(XG)
    train_models.append(xgb_oof_train)
    test_models.append(xgb_oof_test)
    logging.warn('XGB score, {}: {}'.format(s, get_rmse(xgb_oof_train, y_train)))

    ## other models ##
    # build models
    LS = SklearnWrapper(clf=linear_model.Lasso, params={'alpha':0.0001}, seed=s)
    RG = SklearnWrapper(clf=linear_model.Ridge, params={'alpha':10.0}, seed=s)
    RF = SklearnWrapper(clf=RandomForestRegressor, params={
                                                    'n_jobs': 4,
                                                    'n_estimators': 400,
                                                    'max_features': 0.5,
                                                    'max_depth': 12,
                                                    'min_samples_leaf': 10,
                                                }, seed=s)
    ET = SklearnWrapper(clf=ExtraTreesRegressor, params={
                                                    'n_jobs': 4,
                                                    'n_estimators': 400,
                                                    'max_features': 0.5,
                                                    'max_depth': 12,
                                                    'min_samples_leaf': 10,
                                                }, seed=s)
    SVRL = SklearnWrapper(clf=SVR, params={'kernel':'linear','C':0.0001,'epsilon':0.001},seed=s)
    SVRB = SklearnWrapper(clf=SVR, params={'kernel':'rbf','C':1.0,'epsilon':0.001},seed=s)
    VPW = SklearnWrapper(clf=VWRegressor, params={'l':10.0,'power_t':0.1},seed=s)

    # run models
    ls_oof_train, ls_oof_test = get_oof(LS)
    rg_oof_train, rg_oof_test = get_oof(RG)
    rf_oof_train, rf_oof_test = get_oof(RF)
    et_oof_train, et_oof_test = get_oof(ET)
    svrl_oof_train, svrl_oof_test = get_oof(SVRL)
    svrb_oof_train, svrb_oof_test = get_oof(SVRB)
    vpw_oof_train, vpw_oof_test = get_oof(VPW)

    # append models
    train_models += [
        ls_oof_train,
        rg_oof_train,
        rf_oof_train,
        et_oof_train,
        svrl_oof_train,
        svrb_oof_train,
        vpw_oof_train,
    ]
    test_models += [
        ls_oof_test,
        rg_oof_test,
        rf_oof_test,
        et_oof_test,
        svrl_oof_test,
        svrb_oof_test,
        vpw_oof_test,
    ]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


KeyboardInterrupt: 

#### Combine models

In [22]:
X_train = np.concatenate(train_models, axis=1)
X_test = np.concatenate(test_models, axis=1)

dtrain = xgb.DMatrix(X_train, label=preprocessing_Y(y_train))
dtest = xgb.DMatrix(X_test)

xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.8,
    'silent': 1,
    'subsample': 0.6,
    'learning_rate': 0.01,
    'objective': 'reg:linear',
    'max_depth': 1,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'eval_metric': 'rmse',
}

res = xgb.cv(xgb_params, 
             dtrain, 
             num_boost_round=1000, 
             nfold=4, 
             stratified=False,
             early_stopping_rounds=25, 
             verbose_eval=200, 
             show_stdv=True)
best_nrounds = res.shape[0] - 1
xgb_final_model = xgb.train(xgb_params, dtrain, best_nrounds)

XGBoostError: [20:55:10] src/tree/updater_colmaker.cc:161: Check failed: (n) > (0) colsample_bytree=0.8 is too small that no feature can be included

In [None]:
calc_exp = lambda x: np.exp(x*max_y)-1
print('Ensemble:\t{:,.4f}+{:,.4f}'.format(calc_exp(res.iloc[-1,0]),calc_exp(res.iloc[-1,1])))

In [None]:
predictions = np.exp(xgb_final_model.predict(dtest)*max_y)-1

In [None]:
submission = pd.DataFrame(np.vstack((raw.test.index.astype(str).ravel(), 
                                     predictions.ravel())).T,
                          columns=['Id','SalePrice'])
submission.to_csv('submission.csv', index=False)