In [55]:
import pandas as pd
import numpy as np
import scipy as sp
import xgboost as xgb

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.cross_validation import train_test_split

In [56]:
DATADIR = './Data/'
TRAIN_FILE = 'train.csv'
TEST_FILE = 'test.csv'

In [80]:
train_raw = pd.read_csv(DATADIR+TRAIN_FILE)
test_raw = pd.read_csv(DATADIR+TEST_FILE)

In [58]:
IDS = test_raw['Id']
del test_raw['Id']
del train_raw['Id']

In [59]:
Y = train_raw['SalePrice'].as_matrix()
del train_raw['SalePrice']

In [60]:
def data_transformation(df):
    dtypes = df.dtypes
    categoricals = []
    for i, f in enumerate(df.columns):
        if dtypes[f] == np.dtype('O'):
            le = LabelEncoder()
            df[f] = le.fit_transform(df[f])
            categoricals.append(i)
    df = df.fillna(0)
    ohe = OneHotEncoder(categorical_features=categoricals)
    df = ohe.fit_transform(df)
    return df

In [61]:
X = data_transformation(train_raw).A
X_test = data_transformation(test_raw).A

In [62]:
X_train, X_val, y_train, y_val = train_test_split( X, Y, test_size=0.25)

In [135]:
X_train, X_val, y_train, y_val = X[:1000], X[1000:], Y[:1000], Y[1000:]

In [136]:
dtrain = xgb.DMatrix( X_train, label=y_train )
dval = xgb.DMatrix( X_val, label=y_val )
dtest = xgb.DMatrix( X_test )

In [141]:
d = 4
e = 0.01
t = 4500
param = {'max_depth':d, 
         'eta':e, 
         'subsample':0.5, 
         'colsample_bytree':0.5,
         'colsample_bylevel':0.5,
         'silent':1, 
         'objective':'reg:linear' }
param['eval_metric'] = 'rmse'
param['nthread'] = 2
evallist  = [(dval,'eval'), (dtrain,'train')]
xgb_model = xgb.train(param.items(), dtrain, t+1, evallist, verbose_eval=t//10)

[0]	eval-rmse:192232.093750	train-rmse:197281.609375
[450]	eval-rmse:28533.083984	train-rmse:17766.572266
[900]	eval-rmse:25990.046875	train-rmse:12877.082031
[1350]	eval-rmse:25271.902344	train-rmse:10402.408203
[1800]	eval-rmse:24983.246094	train-rmse:8548.245117
[2250]	eval-rmse:24902.556641	train-rmse:7192.436523
[2700]	eval-rmse:24907.824219	train-rmse:6045.517578
[3150]	eval-rmse:24846.640625	train-rmse:5139.490723
[3600]	eval-rmse:24811.265625	train-rmse:4426.179688
[4050]	eval-rmse:24783.376953	train-rmse:3831.485840
[4500]	eval-rmse:24772.322266	train-rmse:3327.049561


In [142]:
predictions = xgb_model.predict(dval)
score = np.sqrt(np.mean((np.log(predictions)-np.log(y_val))**2))
print 'Predicted RMSE (of log-values): {}'.format(round(score,4))

Predicted RMSE (of log-values): 0.1186


In [143]:
dtrain = xgb.DMatrix( X, label=Y )
xgb_model = xgb.train(param.items(), dtrain, t+1)
predictions = xgb_model.predict(dtest)

In [146]:
submission = pd.DataFrame(np.vstack((IDS, 
                                     predictions)).T,
                          columns=['Id','SalePrice'])
submission.to_csv('submission.csv', index=False)