## Regression of home prices

#### Libraries

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import xgboost as xgb

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.cross_validation import train_test_split

from sklearn.decomposition import PCA

#### Load data

In [2]:
DATADIR = './Data/'
TRAIN_FILE = 'train.csv'
TEST_FILE = 'test.csv'

In [3]:
train_raw = pd.read_csv(DATADIR+TRAIN_FILE)
test_raw = pd.read_csv(DATADIR+TEST_FILE)

In [4]:
IDS = test_raw['Id']
del test_raw['Id']
del train_raw['Id']

In [5]:
Y = np.log(train_raw['SalePrice'].as_matrix())
del train_raw['SalePrice']

#### Create one-hot representation 

In [6]:
dtypes = train_raw.dtypes
categoricals = []
for i, f in enumerate(train_raw.columns):
    if dtypes[f] == np.dtype('O'):
        le = LabelEncoder()
        concat = (train_raw[f],test_raw[f])
        le.fit(np.hstack(concat))
        train_raw[f] = le.transform(train_raw[f])
        test_raw[f] = le.transform(test_raw[f])
        categoricals.append(i)

train_raw = train_raw.fillna(0)
test_raw = test_raw.fillna(0)
ohe = OneHotEncoder(categorical_features=categoricals)
ohe.fit(np.vstack((train_raw,test_raw)))
X = ohe.transform(train_raw)
X_test = ohe.transform(test_raw)

  flag = np.concatenate(([True], aux[1:] != aux[:-1]))
  return aux[:-1][aux[1:] == aux[:-1]]


#### Scale data

In [7]:
ss = StandardScaler()
X = ss.fit_transform(X.A)
X_test = ss.transform(X_test.A)

#### PCA

In [8]:
pca = PCA()
pca.fit(X)
f = 5
X_pca = pca.transform(X)
X_test_pca = pca.transform(X_test)
X = np.hstack((X,X_pca[:,:f]))
X_test = np.hstack((X_test,X_test_pca[:,:f]))

#### Train-test split

In [9]:
# X_train, X_val, y_train, y_val = train_test_split( X, Y, test_size=0.25)
X_train, X_val, y_train, y_val = X[:1000], X[1000:], Y[:1000], Y[1000:]

In [10]:
dtrain = xgb.DMatrix( X_train, label=y_train )
dval = xgb.DMatrix( X_val, label=y_val )
dtest = xgb.DMatrix( X_test )

#### Boosted trees model 

In [11]:
d = 5
e = 0.01
t = 2000
param = {'max_depth':d, 
         'eta':e, 
         'subsample':0.5, 
         'colsample_bytree':0.5,
         'colsample_bylevel':0.5,
         'silent':1, 
         'lambda':1.0,
         'objective':'reg:linear' }
param['eval_metric'] = 'rmse'
param['nthread'] = 2
evallist  = [(dval,'eval'), (dtrain,'train')]
xgb_model = xgb.train(param.items(), dtrain, t+1, evallist, verbose_eval=t//10)

[0]	eval-rmse:11.403349	train-rmse:11.421618
[200]	eval-rmse:1.545437	train-rmse:1.561855
[400]	eval-rmse:0.246164	train-rmse:0.244733
[600]	eval-rmse:0.124228	train-rmse:0.089846
[800]	eval-rmse:0.117155	train-rmse:0.070159
[1000]	eval-rmse:0.114912	train-rmse:0.060818
[1200]	eval-rmse:0.113982	train-rmse:0.053011
[1400]	eval-rmse:0.113235	train-rmse:0.046531
[1600]	eval-rmse:0.112862	train-rmse:0.040902
[1800]	eval-rmse:0.112666	train-rmse:0.036015
[2000]	eval-rmse:0.112426	train-rmse:0.031687


In [12]:
predictions = xgb_model.predict(dval)
score = np.sqrt(np.mean((predictions-y_val)**2))
print 'Predicted RMSE (of log-values): {}'.format(round(score,4))

Predicted RMSE (of log-values): 0.1124


#### Re-predict with full dataset

In [13]:
dtrain = xgb.DMatrix( X, label=Y )
xgb_model = xgb.train(param.items(), dtrain, t+1)
predictions = xgb_model.predict(dtest)

In [14]:
submission = pd.DataFrame(np.vstack((IDS.astype(str), 
                                     np.exp(predictions))).T,
                          columns=['Id','SalePrice'])
submission.to_csv('submission.csv', index=False)