In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
import gc
from sklearn.model_selection import train_test_split

In [None]:
# Read the three input CSVs, in this order: train_2016.csv -> train,
# properties_2016.csv -> prop, sample_submission.csv -> sample.
print('Loading data ...')
train, prop, sample = map(pd.read_csv, (
    '../input/train_2016.csv',
    '../input/properties_2016.csv',
    '../input/sample_submission.csv',
))

In [None]:
# Downcast every float64 column of `prop` to float32, one column at a time.
# The list of columns to convert is computed once, before any column is
# reassigned.
print('Binding to float32')
for col in prop.columns[(prop.dtypes == np.float64).values]:
    prop[col] = prop[col].astype(np.float32)

In [None]:
# Left-join the property table onto the training rows by parcelid, then
# separate the model inputs (x_train) from the regression target (y_train).
print('Creating training set ...')
df_train = pd.merge(train, prop, on='parcelid', how='left')
x_train = df_train.drop(
    [
        'parcelid',
        'logerror',
        'transactiondate',
        'propertyzoningdesc',
        'propertycountylandusecode',
    ],
    axis=1,
)
# The target is the raw `logerror` column as a NumPy array.
y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)

In [None]:
# Remember the feature column order; the test-set cell selects the same
# columns from the merged test frame.
train_columns = x_train.columns

# Replace every object-dtype column with a boolean "value == True" column.
# NOTE(review): anything that is not equal to True (strings, NaN, ...) ends up
# False here -- confirm that is the intended encoding for these columns.
for col in x_train.columns[(x_train.dtypes == object).values]:
    x_train[col] = x_train[col].eq(True)

# The merged frame is no longer needed; drop it and reclaim memory.
del df_train
gc.collect()

In [None]:
# Hold out part of the training data as a validation set for early stopping.
# random_state is pinned so that the split -- and therefore the validation
# score, the early-stopping round and the final submission -- is reproducible
# under Restart Kernel -> Run All.
# Note: this cell rebinds x_train / y_train to the reduced training portion,
# so re-running it on its own would split the already-split data again.
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train, y_train, random_state=42
)

In [None]:
# Wrap the training and validation splits in XGBoost DMatrix objects, each
# carrying its own label vector.
print('Building DMatrix...')
d_train = xgb.DMatrix(data=x_train, label=y_train)
d_valid = xgb.DMatrix(data=x_valid, label=y_valid)

# The DataFrames are not used again after this point; release them.
del x_train, x_valid
gc.collect()

In [None]:
# Train a boosted-tree regressor, reporting MAE on both the training and the
# validation DMatrix every 10 rounds, for at most 10000 rounds with an
# early-stopping patience of 100 rounds.
# NOTE(review): newer XGBoost releases appear to deprecate 'reg:linear' (in
# favour of 'reg:squarederror') and 'silent' (in favour of 'verbosity') --
# confirm against the installed version before changing them.
print('Training ...')
params = dict(
    eta=0.02,
    objective='reg:linear',
    eval_metric='mae',
    max_depth=4,
    silent=1,
)
# NOTE(review): presumably the last watchlist entry ('valid') is the one that
# drives early stopping -- verify against the XGBoost documentation.
watchlist = [(d_train, 'train'), (d_valid, 'valid')]
clf = xgb.train(
    params,
    d_train,
    num_boost_round=10000,
    evals=watchlist,
    early_stopping_rounds=100,
    verbose_eval=10,
)
del d_train, d_valid

In [None]:
# Build the test frame: one row per sample-submission row, with the property
# columns attached through a left join on parcelid.
print('Building test set ...')
# The submission file names its key 'ParcelId'; duplicate it under the
# lower-case name used as the join key.
sample['parcelid'] = sample['ParcelId']
df_test = pd.merge(sample, prop, how='left', on='parcelid')

# The property table is not needed after the join; free it.
del prop
gc.collect()

In [None]:
# Apply the same object -> boolean ("value == True") encoding that was applied
# to the training features, restricted to the training feature columns.
# The conversion is done on df_test itself *before* selecting the columns:
# assigning into `df_test[train_columns]` (a frame derived by selection) is
# the chained-assignment pattern that raises SettingWithCopyWarning, and
# avoiding it this way needs no extra full copy of the test features.
for c in train_columns:
    if df_test[c].dtype == object:
        df_test[c] = (df_test[c] == True)

# Select the features in exactly the column order used for training.
x_test = df_test[train_columns]
del df_test, sample; gc.collect()

d_test = xgb.DMatrix(x_test)
del x_test; gc.collect()

In [None]:
# Score the test DMatrix with the trained booster, then release the DMatrix.
# NOTE(review): the model was trained with early stopping; whether predict()
# uses the best iteration or all trained rounds depends on the XGBoost
# version -- confirm.
print('Predicting on test ...')
p_test = clf.predict(d_test)
del d_test
gc.collect()

# Reload the submission template and write the same prediction vector into
# every column except the ParcelId key.
sub = pd.read_csv('../input/sample_submission.csv')
for col in sub.columns:
    if col == 'ParcelId':
        continue
    sub[col] = p_test

In [None]:
# Persist the submission without the index, with floats formatted to four
# decimal places.
print('Writing csv ...')
sub.to_csv(
    path_or_buf='../output/fred-baseline.csv',
    float_format='%.4f',
    index=False,
)