In [36]:
import pandas as pd
import numpy as np
import scipy as sp
import xgboost as xgb

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.cross_validation import train_test_split

In [37]:
DATADIR = './Data/'
TRAIN_FILE = 'train.csv'
TEST_FILE = 'test.csv'

In [38]:
train_raw = pd.read_csv(DATADIR+TRAIN_FILE)
test_raw = pd.read_csv(DATADIR+TEST_FILE)

In [39]:
IDS = test_raw['Id']
del test_raw['Id']
del train_raw['Id']

In [40]:
Y = np.log(train_raw['SalePrice'].as_matrix())
del train_raw['SalePrice']

In [41]:
for i, f in enumerate(train_raw.columns):
    dtypes = train_raw.dtypes
    categoricals = []
    for i, f in enumerate(train_raw.columns):
        if dtypes[f] == np.dtype('O'):
            le = LabelEncoder()
            concat = (train_raw[f],test_raw[f])
            le.fit(np.hstack(concat))
            train_raw[f] = le.transform(train_raw[f])
            test_raw[f] = le.transform(test_raw[f])
            categoricals.append(i)
    train_raw = train_raw.fillna(0)
    test_raw = test_raw.fillna(0)
    ohe = OneHotEncoder(categorical_features=categoricals)
    ohe.fit(np.vstack((train_raw,test_raw)))
    X = ohe.transform(train_raw)
    X_test = ohe.transform(test_raw)

In [42]:
X_train, X_val, y_train, y_val = train_test_split( X, Y, test_size=0.25)

In [43]:
X_train, X_val, y_train, y_val = X[:1000], X[1000:], Y[:1000], Y[1000:]

In [44]:
dtrain = xgb.DMatrix( X_train, label=y_train )
dval = xgb.DMatrix( X_val, label=y_val )
dtest = xgb.DMatrix( X_test )

In [45]:
d = 5
e = 0.01
t = 2000
param = {'max_depth':d, 
         'eta':e, 
         'subsample':0.5, 
         'colsample_bytree':0.5,
         'colsample_bylevel':0.5,
         'silent':1, 
         'lambda':1.0,
         'objective':'reg:linear' }
param['eval_metric'] = 'rmse'
param['nthread'] = 2
evallist  = [(dval,'eval'), (dtrain,'train')]
xgb_model = xgb.train(param.items(), dtrain, t+1, evallist, verbose_eval=t//10)

[0]	eval-rmse:11.403349	train-rmse:11.421618
[200]	eval-rmse:1.545991	train-rmse:1.562856
[400]	eval-rmse:0.252748	train-rmse:0.248438
[600]	eval-rmse:0.129108	train-rmse:0.092792
[800]	eval-rmse:0.120994	train-rmse:0.071813
[1000]	eval-rmse:0.119035	train-rmse:0.061867
[1200]	eval-rmse:0.118284	train-rmse:0.053685
[1400]	eval-rmse:0.117657	train-rmse:0.047001
[1600]	eval-rmse:0.117051	train-rmse:0.041133
[1800]	eval-rmse:0.116891	train-rmse:0.036311
[2000]	eval-rmse:0.116813	train-rmse:0.032131


In [46]:
predictions = xgb_model.predict(dval)
score = np.sqrt(np.mean((predictions-y_val)**2))
print 'Predicted RMSE (of log-values): {}'.format(round(score,4))

Predicted RMSE (of log-values): 0.1168


In [47]:
dtrain = xgb.DMatrix( X, label=Y )
xgb_model = xgb.train(param.items(), dtrain, t+1)
predictions = xgb_model.predict(dtest)

In [48]:
submission = pd.DataFrame(np.vstack((IDS.astype(str), 
                                     np.exp(predictions))).T,
                          columns=['Id','SalePrice'])
submission.to_csv('submission.csv', index=False)