In [1]:
# future functions
from __future__ import print_function 

# core scipy and numpy
import numpy as np
import scipy as sp

# pandas 
import pandas as pd
from IPython.display import display
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 135

# encoder
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# manifold for embedding analysis
from sklearn import manifold

# Cross validation 
from sklearn.model_selection import KFold

# Linear models 
from sklearn import linear_model

# Forests
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

# SVR
from sklearn.svm import SVR

# KNN
from sklearn.neighbors import KNeighborsRegressor

# PCA
from sklearn.decomposition import PCA

# vowpal wabbit
from vowpalwabbit.sklearn_vw import VWRegressor

# combinations with categorical features
from itertools import combinations

# matplotlib 
import matplotlib.pyplot as plt
%matplotlib inline

# python helpers 
from collections import namedtuple
from copy import copy

# neural network
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Merge, Reshape
from keras.layers.embeddings import Embedding

# XGboost for gradient-boosted decision trees
import xgboost as xgb

# logging
import logging
logging.basicConfig()

Using TensorFlow backend.


In [2]:
def loadfile(x):
    """Read the CSV at ``x`` into a DataFrame, using its first column as the index."""
    return pd.read_csv(x, index_col=0)

In [3]:
# (train, test) pairs, one container type per pipeline stage:
# file paths -> frames as read from disk -> frames after cleaning
Files = namedtuple('Files', 'train test')
RawData = namedtuple('RawData', 'train test')
ProcessedData = namedtuple('ProcessedData', 'train test')

#### Load data 

In [4]:
# locate the two gzipped CSVs, then parse them in (train, test) order
rawfiles = Files(train='Data/train.csv.gz', test='Data/test.csv.gz')
raw = RawData(*map(loadfile, rawfiles))

In [5]:
# concatenate all data (feature columns only: 'SalePrice' is excluded from the train part)
cols = [ c for c in raw.train.columns if c != 'SalePrice' ]
full = pd.concat((raw.train.loc[:, cols],
                     raw.test))


def fillfunc(x):
    """Impute the missing entries of column ``x`` with its most frequent value.

    ``value_counts()`` is indexed by the distinct values and sorted by
    descending frequency, so the mode is ``index[0]``. (``iloc[0]`` would be
    the *count* of the mode, which is what the previous version filled with.)

    Parameters
    ----------
    x : pandas.Series
        One column, as handed over by ``DataFrame.apply``.

    Returns
    -------
    pandas.Series
        ``x`` with NaNs replaced by the mode; ``x`` unchanged when it holds no
        non-missing value (there is no mode to impute with).
    """
    counts = x.value_counts()
    if counts.empty:
        return x
    return x.fillna(counts.index[0])


# impute missing values
full = full.apply(fillfunc)

NameError: name 'processed' is not defined

In [None]:
# create processed dataset: each split is imputed on its own, column by column
# NOTE(review): the fill values here come from train / test separately, whereas
# `full` was imputed on the concatenation -- confirm that difference is intended.
processed = ProcessedData(*(split.apply(fillfunc) for split in raw))
processed.train.head()

#### Outliers 

outliers = [  30,   88,  197,  462,  495,  523,  557,  632,  691,  825,  874,
             898,  968,  970, 1169, 1170, 1182, 1423, 1432, 1453 ] # from outlier analysis

# Build the mask from the frame that is actually filtered. processed.train
# carries the same index as raw.train on the first run, and unlike a mask
# built from raw.train it still has the right length if this cell is re-run
# after the rows have already been dropped.
outlier_ids = processed.train.index.isin(outliers)

# .copy() gives later cells a frame they can safely add / overwrite columns on,
# rather than a slice of the unfiltered frame.
processed = ProcessedData(train=processed.train.loc[~outlier_ids, :].copy(),
                            test=processed.test)

#### Categorical combinations

In [None]:
# create lookup for numeric categorical columns
numeric_categorical = [
    'MSSubClass',
    'YearBuilt',
    'YearRemodAdd',
    'MoSold',
    'YrSold',
    'GarageYrBlt',
    'OverallQual',
    'OverallCond',
    'MiscVal',
]
categorical_cols = list(processed.train.dtypes[processed.train.dtypes == "object"].index)
categorical_cols += numeric_categorical
continuous_cols = list(processed.train.dtypes[processed.train.dtypes != "object"].index)
continuous_cols = [ c for c in continuous_cols if c not in numeric_categorical and c!='SalePrice' ]


def add_pair_features(frame, column_pairs):
    """Return a copy of ``frame`` with one string column per pair of columns.

    For every ``(c1, c2)`` in ``column_pairs`` a column named ``c1 + '_' + c2``
    is added whose values are ``str(c1 value) + '_' + str(c2 value)``.
    The input frame is left untouched.
    """
    out = frame.copy()
    for c1, c2 in column_pairs:
        out[c1 + '_' + c2] = out[c1].astype(str) + '_' + out[c2].astype(str)
    return out


# Fix the list of pairs up front: categorical_cols itself is extended below.
column_pairs = list(combinations(categorical_cols, 2))

processed = ProcessedData(train=add_pair_features(processed.train, column_pairs),
                          test=add_pair_features(processed.test, column_pairs))

# The cells below index `full` and `label_encoders` (which is built from the
# columns of `full`) with every name in categorical_cols, so `full` needs the
# pair columns as well.
# NOTE(review): assumes each source column has the same dtype and imputed values
# in `full` as in `processed`; otherwise the string forms can differ and the
# encoders will meet unseen labels -- TODO confirm.
full = add_pair_features(full, column_pairs)

categorical_cols += [ c1 + '_' + c2 for c1, c2 in column_pairs ]

#### Preprocess categorical data

In [None]:
# store important objects
label_encoders = {}          # column name -> LabelEncoder fitted on that column of `full`
categorical_col_nums = []    # positions of the encoded columns within `full`


# encode data and store objects
# Walk a snapshot of the column names: columns of `full` are overwritten inside the loop.
for col in list(full.columns):
    col_data = full[col]

    # only categorical columns are encoded; everything else is left as it is
    if not (col_data.dtype == object or col in numeric_categorical):
        continue

    le = LabelEncoder()
    full[col] = le.fit_transform(col_data)
    categorical_col_nums.append(full.columns.get_loc(col))
    label_encoders[col] = le

#### Log transforms

In [None]:
def get_skew(x):
    """Return True when scipy's skewtest on the non-missing values of ``x`` gives a p-value below 0.05."""
    return sp.stats.skewtest(x.dropna())[1]<0.05


for c in continuous_cols:
    # the decision is made on the training split only ...
    if get_skew(processed.train[c]):
        # ... and then applied to every frame that holds this column. `full` is
        # included because the StandardScaler / MinMaxScaler / PCA are fit on
        # `full` and later applied to `processed`; all three must be on the same scale.
        for frame in (processed.train, processed.test, full):
            frame[c] = np.log1p(frame[c])

#### Setup standard scaler and one hot encoder

In [None]:
# standard scaler: fit on the continuous columns of `full` (train + test features)
# (the `continous_scaler` spelling is what the preprocessing functions reference, so it stays)
continous_scaler = StandardScaler()
c = continous_scaler.fit_transform(full.loc[:, continuous_cols].as_matrix())

# minmax: rescale the standardised values into the range (-1, 1)
mm_scaler = MinMaxScaler(feature_range=(-1, 1))
m = mm_scaler.fit_transform(c)

# transform data in full dataset (continuous columns are overwritten in place with the scaled values)
full.loc[:, continuous_cols] = m 

# create one hot encoder; categorical_col_nums holds the positions of the
# label-encoded columns within `full`, so it must be fit on the whole of `full`
ohe = OneHotEncoder(categorical_features=categorical_col_nums)
o = ohe.fit_transform(full.as_matrix())

# pca for continuous columns: a single component of the scaled continuous block
pca_cont_n = 1
pca_cont = PCA(n_components=pca_cont_n)
_ = pca_cont.fit(m)

# pca for ohe columns: a single component of the one-hot matrix
# (`.A` presumably densifies a scipy sparse matrix -- TODO confirm for the installed sklearn)
pca_ohe_n = 1
pca_ohe = PCA(n_components=pca_ohe_n)
_ = pca_ohe.fit(o.A)

#### Categorical field analysis

In [None]:
def _summarise_categorical(col_name):
    """Build one summary row for a categorical column of ``full``.

    Returns ``(name, share of the most frequent value, 1 / distinct count,
    distinct count)``; missing values are counted under the label 'Unk' for
    the frequency share.
    """
    column = full[col_name]

    # pct freq coverage of the top value
    counts = column.fillna('Unk').value_counts()
    top_share = counts.iloc[0]*1.0/sum(counts)

    # number of distinct values, and the share each would have if evenly spread
    n_distinct = len(set(column.values))
    even_share = 1.0/n_distinct

    return (col_name, top_share, even_share, n_distinct)


categorical_field_analysis = pd.DataFrame(
    [_summarise_categorical(name) for name in categorical_cols],
    columns=['Cat_Col', 'First_Freq', 'Logical_Freq', 'Distinct_Val_Cnt'],
)

In [None]:
def _choose_m_scalar(freq, n):
    """Return the smallest ``i`` in ``1..n`` with ``freq >= 1 - i/n``.

    Parameters
    ----------
    freq : float
        Share of the most frequent value of a column.
    n : int
        Number of distinct values of that column.

    Raises
    ------
    ValueError
        If no ``i`` qualifies (``n < 1``, or ``freq`` is NaN / negative).
        Previously this surfaced as an UnboundLocalError.
    """
    for i in range(1, n+1):
        if freq>=(1-float(i)/n):
            return i
    raise ValueError(
        'choose_m: no size in 1..{} satisfies freq={!r}'.format(n, freq))


# Element-wise version used on the analysis table. Declaring the output type
# also lets it accept empty inputs, which np.vectorize cannot infer a dtype for.
choose_m = np.vectorize(_choose_m_scalar, otypes=[int])

In [None]:
# embedding size per column, derived from its top-value share and distinct count
categorical_field_analysis['m'] = choose_m(
    categorical_field_analysis['First_Freq'],
    categorical_field_analysis['Distinct_Val_Cnt'],
)

In [None]:
# show the categorical field analysis table
display(categorical_field_analysis)

### NN model with entity embedding

Choose reshape size for each categorical column. Push continuous columns through, as-is.

#### model topology

In [None]:
def gen_emb_nn_model(seed=2):
    """Build and compile the entity-embedding network.

    One Embedding + Reshape branch is created per row of
    ``categorical_field_analysis`` (in row order), followed by one Dense
    branch for the continuous block. The branches are concatenated and fed
    through three relu Dense layers with dropout to a single sigmoid output.
    The model is compiled with mean squared error and the adam optimizer.

    Parameters
    ----------
    seed : int
        Passed to ``np.random.seed`` before any layer is created.

    Returns
    -------
    A shallow ``copy`` of the compiled Sequential model.
    """
    np.random.seed(seed)

    # hand-picked embedding widths that take precedence over the computed `m` column
    width_overrides = {
        'YearBuilt': 20,
        'GarageYrBlt': 10,
        'YearRemodAdd': 10,
        'Neighborhood': 15,
        'HouseStyle': 6,
    }

    branches = []
    for _, row in categorical_field_analysis.T.iteritems():
        vocab_size = row.Distinct_Val_Cnt
        emb_width = width_overrides.get(row.Cat_Col, row.m)

        # one embedding branch per categorical feature
        branch = Sequential()
        branch.add(Embedding(vocab_size, emb_width, input_length=1))
        branch.add(Reshape(target_shape=(emb_width,)))
        branches.append(branch)

    # continuous branch: the continuous columns plus their PCA component(s)
    n_inputs = full[continuous_cols].shape[1] + pca_cont_n
    dense_branch = Sequential()
    dense_branch.add(Dense(n_inputs, input_dim=n_inputs))
    branches.append(dense_branch)

    emb_model = Sequential()
    emb_model.add(Merge(branches, mode='concat'))
    emb_model.add(Dropout(0.1))
    # hidden stack: (units, dropout applied after the activation)
    for units, drop_rate in ((64, 0.3), (64, 0.3), (32, 0.1)):
        emb_model.add(Dense(units, init='uniform'))
        emb_model.add(Activation('relu'))
        emb_model.add(Dropout(drop_rate))
    emb_model.add(Dense(1))
    emb_model.add(Activation('sigmoid'))
    emb_model.compile(loss='mean_squared_error', optimizer='adam')
    return copy(emb_model)

#### Vanilla NN

In [None]:
# vanilla NN
def gen_vanilla_nn(seed=2):
    """Build and compile the plain fully-connected network.

    The input width is the number of columns ``preprocessing_X`` produces for
    the global ``X_train``. Two tanh layers of 1500 units (each followed by
    dropout 0.3) lead to a single sigmoid output. The model is compiled with
    mean squared error and the adam optimizer, and a shallow ``copy`` of it is
    returned.
    """
    np.random.seed(seed)
    n_features = preprocessing_X(X_train).shape[1]

    layers = [
        Dense(1500, init='uniform', input_shape=(n_features,)),
        Activation('tanh'),
        Dropout(0.3),
        Dense(1500, init='uniform'),
        Activation('tanh'),
        Dropout(0.3),
        Dense(1),
        Activation('sigmoid'),
        # NOTE(review): dropout applied to the network output -- confirm this is intended
        Dropout(0.01),
    ]

    vanilla_nn_model = Sequential()
    for layer in layers:
        vanilla_nn_model.add(layer)
    vanilla_nn_model.compile(loss='mean_squared_error', optimizer='adam')
    return copy(vanilla_nn_model)

#### Preprocessing functions

In [None]:
# log-scale target of the training split; its maximum is the constant that
# preprocessing_Y divides by and that predictions are multiplied by again
_y = np.log1p(processed.train['SalePrice'])
max_y = _y.max()


def nn_preprocessing_X(X_dat):
    """Turn a feature frame into the list of inputs for the embedding network.

    Returns a list with one label-encoded array per entry of
    ``categorical_cols`` (same order), followed by a single 2-D array holding
    the scaled continuous columns with their PCA projection appended.
    """
    # categorical columns: integer codes from the fitted label encoders
    inputs = [
        label_encoders[name].transform(X_dat.loc[:, name].as_matrix())
        for name in categorical_cols
    ]

    # continuous columns: standard scaler first, then min-max
    raw_cont = X_dat.loc[:, continuous_cols].as_matrix()
    scaled = mm_scaler.transform(continous_scaler.transform(raw_cont))

    # PCA projection of the scaled block, appended as extra column(s)
    projected = pca_cont.transform(scaled)
    inputs.append(np.concatenate((scaled, projected), axis=1))
    return inputs


def preprocessing_X(X_dat):
    """Turn a feature frame into one dense design matrix.

    Works on a copy of ``X_dat``. Categorical columns are label-encoded and
    continuous columns scaled, then the whole frame is one-hot encoded. The
    PCA projection of the continuous block and the PCA projection of the
    one-hot matrix are appended as trailing columns, in that order.
    """
    frame = X_dat.copy()

    # categorical columns -> integer codes
    for name in categorical_cols:
        codes = label_encoders[name].transform(frame.loc[:, name].as_matrix())
        frame.loc[:, name] = codes

    # continuous columns -> standard scaler, then min-max
    raw_cont = frame.loc[:, continuous_cols].as_matrix()
    scaled = mm_scaler.transform(continous_scaler.transform(raw_cont))
    frame.loc[:, continuous_cols] = scaled

    # pca of the continuous block
    cont_pc = pca_cont.transform(scaled)

    # one hot encode the whole frame and densify
    dense_ohe = ohe.transform(frame.as_matrix()).A

    # pca of the one-hot matrix
    ohe_pc = pca_ohe.transform(dense_ohe)

    return np.concatenate((dense_ohe, cont_pc, ohe_pc), axis=1)


def preprocessing_Y(y_dat):
    """Return ``log(y + 1) / max_y`` for the values of ``y_dat`` as a numpy array."""
    shifted = y_dat.values+1
    return np.log(shifted)/max_y

### Modeling

In [None]:
# (rows, columns) of the processed training frame
processed.train.shape 

In [None]:
# row counts of both splits; get_oof sizes its prediction buffers with them
ntrain, ntest = processed.train.shape[0], processed.test.shape[0]

#### Wrapper functions

In [None]:
class SklearnWrapper(object):
    """Adapter giving an sklearn-style estimator the train/predict interface used by get_oof.

    Parameters
    ----------
    clf : callable
        Estimator class (or factory); called with the keyword arguments in ``params``.
    seed : int
        Passed as ``random_state`` when the estimator accepts that argument.
    params : dict or None
        Keyword arguments for ``clf``. The dict is copied, never modified.
    """

    def __init__(self, clf, seed=2, params=None):
        # work on a private copy: neither the caller's dict nor a shared default is touched
        kwargs = dict(params) if params else {}
        kwargs.pop('random_state', None)  # the seed argument always decides the random state

        try:
            self.clf = clf(random_state=seed, **kwargs)
        except TypeError:
            # estimator without a random_state argument: build it unseeded
            self.clf = clf(**kwargs)

    def train(self, x_train, y_train):
        """Fit the estimator on the preprocessed features and scaled log target."""
        xtr = preprocessing_X(x_train)
        ytr = preprocessing_Y(y_train).ravel()
        self.clf.fit(xtr, ytr)

    def predict(self, x):
        """Return the estimator's predictions for the preprocessed features of ``x``."""
        xte = preprocessing_X(x)
        return self.clf.predict(xte)


class XgbWrapper:
    """Adapter giving ``xgb.train`` the train/predict interface used by get_oof.

    Parameters
    ----------
    seed : int
        Stored under the 'seed' key of the booster parameters.
    params : dict or None
        Booster parameters. An optional 'nrounds' entry (default 250) is taken
        out and used as the number of boosting rounds. The dict is copied, so
        the caller's object is left unchanged.
    """

    def __init__(self, seed=2, params=None):
        self.param = dict(params) if params is not None else {}
        self.param['seed'] = seed
        # 'nrounds' is not a booster parameter; keep it separately for xgb.train
        self.nrounds = self.param.pop('nrounds', 250)

    def train(self, x_train, y_train):
        """Train a new booster on the preprocessed features and scaled log target."""
        xtr = preprocessing_X(x_train)
        ytr = preprocessing_Y(y_train).ravel()
        dtrain = xgb.DMatrix(xtr, label=ytr)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        """Return the booster's predictions for the preprocessed features of ``x``."""
        xte = preprocessing_X(x)
        return self.gbdt.predict(xgb.DMatrix(xte))


class NnWrapper:
    """Adapter giving a compiled Keras model the train/predict interface used by get_oof.

    Parameters
    ----------
    model :
        Compiled model; a shallow ``copy`` of it is stored.
    emb : bool
        True to feed the list-of-arrays input built by ``nn_preprocessing_X``,
        False to feed the single matrix built by ``preprocessing_X``.
    nb_epoch, batch_size : int
        Forwarded to ``model.fit``.
    """

    def __init__(self, model, emb=True, nb_epoch=16, batch_size=8):
        self.model = copy(model)
        self.nb_epoch = nb_epoch
        self.batch_size = batch_size
        self.emb = emb
        # weights as they were before the first fit; captured lazily in train()
        self._initial_weights = None

    def _prepare_X(self, x):
        """Preprocess ``x`` with the function matching this wrapper's input layout."""
        if self.emb:
            return nn_preprocessing_X(x)
        return preprocessing_X(x)

    def train(self, x_train, y_train):
        """Fit the model from its initial weights on the given fold.

        get_oof calls train() once per fold on the same wrapper. Without a
        reset, every fold after the first would continue from weights that
        were already fitted on rows of its own hold-out part.
        """
        if self._initial_weights is None:
            # assumes the model is already built at this point -- TODO confirm for other Keras versions
            self._initial_weights = self.model.get_weights()
        else:
            # NOTE(review): restores layer weights only; optimizer state is not reset
            self.model.set_weights(self._initial_weights)

        xtr = self._prepare_X(x_train)
        ytr = preprocessing_Y(y_train).ravel()
        self.model.fit(xtr, ytr, 
                 nb_epoch = self.nb_epoch,
                 batch_size = self.batch_size,
                 verbose = 0)
    
    def predict(self, x):
        """Return the model's predictions for ``x`` as a flat array."""
        xte = self._prepare_X(x)
        return self.model.predict(xte).ravel()


def get_oof(clf):
    '''
        Out-of-fold predictions for one wrapped model.

        For every split of the global ``kf`` over ``X_train`` the model is
        trained on the fitting part, predicts the held-out part (written into
        the matching rows of the train output) and predicts ``X_test``. The
        test output is the mean of the per-fold test predictions.

        Returns two column vectors: (ntrain, 1) and (ntest, 1).

        via:
        https://www.kaggle.com/eliotbarr/house-prices-
        advanced-regression-techniques/stacking-starter/code
    '''
    oof_train = np.zeros((ntrain,))
    fold_test_preds = np.empty((NFOLDS, ntest))

    for fold, (fit_idx, holdout_idx) in enumerate(kf.split(X_train)):
        clf.train(X_train.iloc[fit_idx], y_train.iloc[fit_idx])

        oof_train[holdout_idx] = clf.predict(X_train.iloc[holdout_idx])
        fold_test_preds[fold, :] = clf.predict(X_test)

    oof_test = fold_test_preds.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)


def get_rmse(x, y):
    """RMSE in log space between scaled predictions ``x`` and raw targets ``y``.

    ``x`` is multiplied by ``max_y`` to undo the target scaling and compared
    against ``log(y + 1)``.
    """
    pred_log = x.ravel()*max_y
    true_log = np.log(y.ravel()+1)
    return np.sqrt(np.mean((pred_log - true_log)**2))

#### Train models

In [None]:
# set up training and test data
cols = [c for c in processed.train.columns if c != 'SalePrice' ]
X_train = processed.train.loc[:, cols ]
y_train = processed.train['SalePrice']
X_test = processed.test.loc[:, cols]

# set up model arrays: one (n_rows, 1) out-of-fold prediction column per model run
train_models = []
test_models = []

# seeds
# Drawn from a dedicated generator with a fixed seed, so a fresh kernel run
# produces the same three seeds (the global numpy state is unseeded here and
# differs between runs).
seeds = np.random.RandomState(0).choice(range(100), 3, replace=False)

# loop: for every seed, run each base model through get_oof and collect its
# out-of-fold train predictions and fold-averaged test predictions
for s in seeds:
    ## set up K Folds ##
    # get_oof reads NFOLDS and kf as globals, so both are (re)bound before any call to it
    NFOLDS = 3
    kf = KFold(n_splits=NFOLDS, 
               shuffle=True, 
               random_state=s)

    ## Categorical embedding NN models ##
    emb_model = gen_emb_nn_model(seed=s)
    NN = NnWrapper(emb_model, emb=True)
    nn_oof_train, nn_oof_test = get_oof(NN)
    train_models.append(nn_oof_train)
    test_models.append(nn_oof_test)
    logging.warning('NNE score, {}: {:,.4f}'.format(s, get_rmse(nn_oof_train, y_train)))

    ## Vanilla NN models ##
    vanilla_nn_model = gen_vanilla_nn(seed=s)
    NN2 = NnWrapper(vanilla_nn_model, emb=False)
    nn2_oof_train, nn2_oof_test = get_oof(NN2)
    train_models.append(nn2_oof_train)
    test_models.append(nn2_oof_test)
    logging.warning('NNV score, {}: {:,.4f}'.format(s, get_rmse(nn2_oof_train, y_train)))

    ## XGBoost ##
    base_xgb_params ={
        'colsample_bytree': 0.75 ,
        'silent': 1 ,
        'subsample': 0.5 ,
        'learning_rate': 0.05 ,
        'objective': 'reg:linear' ,
        'max_depth': 4 ,
        'num_parallel_tree': 1 ,
        'min_child_weight': 1 ,
        'eval_metric': 'rmse' ,
        'nrounds': 700 ,
    }
    # NOTE(review): unlike every other model in this loop, XGBoost is seeded with
    # the constant 2 rather than `s` -- confirm whether that is intended.
    base_xgb_params['seed'] = 2
    XG = XgbWrapper(params=base_xgb_params)
    xgb_oof_train, xgb_oof_test = get_oof(XG)
    train_models.append(xgb_oof_train)
    test_models.append(xgb_oof_test)
    logging.warning('XGB score, {}: {:,.4f}'.format(s, get_rmse(xgb_oof_train, y_train)))

    ## other models ##
    # settings shared by the two tree ensembles; each wrapper gets its own copy
    forest_params = {
        'n_jobs': 4,
        'n_estimators': 400,
        'max_features': 0.5,
        'max_depth': 12,
        'min_samples_leaf': 10,
    }

    # build models: (log label, wrapper), in the order their predictions are stacked
    other_models = [
        ('LS', SklearnWrapper(clf=linear_model.Lasso, params={'alpha':0.0001}, seed=s)),
        ('RG', SklearnWrapper(clf=linear_model.Ridge, params={'alpha':10.0}, seed=s)),
        ('RF', SklearnWrapper(clf=RandomForestRegressor, params=dict(forest_params), seed=s)),
        ('ET', SklearnWrapper(clf=ExtraTreesRegressor, params=dict(forest_params), seed=s)),
        ('SVRL', SklearnWrapper(clf=SVR, params={'kernel':'linear','C':0.0001,'epsilon':0.001}, seed=s)),
        ('SVRB', SklearnWrapper(clf=SVR, params={'kernel':'rbf','C':1.0,'epsilon':0.001}, seed=s)),
        ('VPW', SklearnWrapper(clf=VWRegressor, params={'l':10.0,'power_t':0.1}, seed=s)),
    ]

    # run models, append their predictions and log scores
    for label, wrapper in other_models:
        model_oof_train, model_oof_test = get_oof(wrapper)
        train_models.append(model_oof_train)
        test_models.append(model_oof_test)
        logging.warning('{} score, {}: {:,.4f}'.format(label, s, get_rmse(model_oof_train, y_train)))

#### Combine models

In [None]:
# Stack the base-model predictions side by side: one column per entry of
# train_models / test_models.
# NOTE: X_train / X_test are rebound from the feature DataFrames to these
# stacked arrays; re-run the training setup cell before calling get_oof again.
X_train = np.concatenate(train_models, axis=1)
X_test = np.concatenate(test_models, axis=1)

dtrain = xgb.DMatrix(X_train, label=preprocessing_Y(y_train))
dtest = xgb.DMatrix(X_test)

xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.8,
    'silent': 1,
    'subsample': 0.6,
    'learning_rate': 0.01,
    'objective': 'reg:linear',
    'max_depth': 1,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'eval_metric': 'rmse',
}

# cross-validate to find the number of boosting rounds for the second-level model
res = xgb.cv(xgb_params, 
             dtrain, 
             num_boost_round=1000, 
             nfold=4, 
             stratified=False,
             early_stopping_rounds=25, 
             verbose_eval=200, 
             show_stdv=True)

# The cv history holds one row per boosting round that was kept, so the row
# count is the number of rounds to train; subtracting one would drop the last
# (best) round.
best_nrounds = res.shape[0]
xgb_final_model = xgb.train(xgb_params, dtrain, best_nrounds)

In [None]:
def calc_exp(x):
    """Map ``x`` back from the scaled log space: ``exp(x * max_y) - 1``."""
    return np.exp(x*max_y)-1


# NOTE(review): columns are picked by position; presumably the first two cv
# columns are a mean / std pair -- verify for the installed xgboost version.
last_round = res.iloc[-1]
print('Ensemble:\t{:,.4f}+{:,.4f}'.format(calc_exp(last_round.iloc[0]),calc_exp(last_round.iloc[1])))

In [None]:
# second-level predictions live in the scaled log space; undo the scaling and the log
scaled_log_pred = xgb_final_model.predict(dtest)
predictions = np.exp(scaled_log_pred*max_y)-1

In [None]:
# Build the submission from two typed columns rather than stacking ids and
# predictions into a single array, which forces both into one common dtype.
# The test ids come from the index of the raw test frame; predictions are
# widened to float64 so the written digits do not depend on the booster's
# output dtype.
submission = pd.DataFrame({'Id': raw.test.index.values,
                           'SalePrice': np.ravel(predictions).astype(float)},
                          columns=['Id','SalePrice'])
submission.to_csv('submission.csv', index=False)