# XGBoost

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from xgboost import plot_importance
import matplotlib.pyplot as plt
%matplotlib inline
import gc

In [2]:
# data 
properties = pd.read_csv('../input/properties_2016.csv')
prop = pd.read_csv('../input/properties_2016.csv')
train = pd.read_csv("../input/train_2016_v2.csv")


# data pre-processing and train/test
for column in properties.columns:
    properties[column] = properties[column].fillna(-1)
    if properties[column].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(properties[column].values))
        properties[column] = lbl.transform(list(properties[column].values))

train_df = train.merge(properties, how='left', on='parcelid')
x_train = train_df.drop(['parcelid', 'logerror', 'transactiondate'], axis=1)
x_test = properties.drop(['parcelid','decktypeid', 'yardbuildingsqft26', 'basementsqft',
                        'buildingclasstypeid'], axis=1)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# outliers removal
train_df = train_df[train_df.logerror > -0.4]
train_df = train_df[train_df.logerror < 0.419]

x_train = train_df.drop(['parcelid', 'logerror', 'transactiondate',
                        'decktypeid', 'yardbuildingsqft26', 'basementsqft',
                        'buildingclasstypeid'], axis=1)

y_train = train_df['logerror'].values.astype(np.float32)
y_mean = np.mean(y_train)

x_train.shape, x_test.shape

((90275, 53), (2985217, 53))

In [4]:
# model
xgb_params = {
    'eta': 0.037,
    'max_depth': int(5),
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'lambda': 0.8,   
    'alpha': 0.4, 
    'base_score': y_mean,
    'silent': 1
}
dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test)

num_boost_rounds = 250

In [None]:
clf = xgb.XGBClassifier(xgb_params)

In [None]:
# training
model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)

xgb_pred1 = model.predict(dtest)

num_boost_rounds = 240

model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)

# second run
xgb_pred2 = model.predict(dtest)

xgb_pred = (xgb_pred1 + xgb_pred2) / 2

print( pd.DataFrame(xgb_pred).head() )

del train_df; gc.collect()
del x_train; gc.collect()
del x_test; gc.collect()
del properties; gc.collect()
del dtest; gc.collect()
del dtrain; gc.collect()
del xgb_pred1, xgb_pred2; gc.collect

In [None]:
'''
- decktypeid
- yardbuildingsqft
- basementsqft
- buildingclasspeid
- pooltypeid
'''

ax = xgb.plot_importance(model)
fig = ax.figure
fig.set_size_inches(20, 20)

In [None]:
print(model)

## Predictions

In [None]:
test_dates = ['2016-10-01','2016-11-01','2016-12-01','2017-10-01','2017-11-01','2017-12-01']
test_columns = ['201610','201611','201612','201710','201711','201712']

sub = pd.read_csv('../submissions/sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = xgb_pred

print('Writing csv ...')
sub.to_csv('xgb_rm5features.csv', index=False, float_format='%.4f')

# Cross Validation

In [5]:
# cross validation
# at 289 iterations we have a MAE 5.26%
cv_results = xgb.cv(xgb_params, dtrain, num_boost_round=5000, nfold=10, metrics=['mae'],
                   early_stopping_rounds=100, stratified=True)
cv_results.tail(5)



Unnamed: 0,test-mae-mean,test-mae-std,train-mae-mean,train-mae-std
27,0.068023,0.001607,0.067748,0.00019
28,0.068023,0.001613,0.067736,0.000188
29,0.068023,0.001612,0.067726,0.000189
30,0.068022,0.001612,0.067715,0.00019
31,0.068019,0.001614,0.067705,0.000189


In [6]:
# metrics
print('Best number of trees = {}'.format(cv_results.shape[0]))

Best number of trees = 32


In [7]:
final_gb = xgb.train(xgb_params, dtrain, num_boost_round=31)

In [8]:
f = final_gb.get_fscore()
sorted(f.items(), key=lambda x:x[1])

[('fips', 1),
 ('fireplacecnt', 1),
 ('garagetotalsqft', 2),
 ('unitcnt', 2),
 ('finishedfloor1squarefeet', 2),
 ('propertylandusetypeid', 3),
 ('taxdelinquencyflag', 5),
 ('regionidneighborhood', 5),
 ('finishedsquarefeet15', 5),
 ('taxdelinquencyyear', 6),
 ('airconditioningtypeid', 7),
 ('regionidcity', 7),
 ('censustractandblock', 8),
 ('bedroomcnt', 11),
 ('poolcnt', 11),
 ('buildingqualitytypeid', 11),
 ('heatingorsystemtypeid', 13),
 ('bathroomcnt', 15),
 ('propertycountylandusecode', 15),
 ('landtaxvaluedollarcnt', 19),
 ('propertyzoningdesc', 28),
 ('rawcensustractandblock', 28),
 ('regionidzip', 38),
 ('longitude', 39),
 ('finishedsquarefeet12', 40),
 ('yearbuilt', 46),
 ('latitude', 58),
 ('taxvaluedollarcnt', 61),
 ('lotsizesquarefeet', 68),
 ('structuretaxvaluedollarcnt', 70),
 ('taxamount', 77),
 ('calculatedfinishedsquarefeet', 81)]

In [9]:
y_pred = final_gb.predict(dtest)

In [10]:
cv_results.mean()

test-mae-mean     0.068113
test-mae-std      0.001612
train-mae-mean    0.067954
train-mae-std     0.000188
dtype: float64

In [11]:
def MAE(y, y_pred):
    return np.sum([abs(y[i] - y_pred[i]) for i in range(len(y))]) / len(y)
# x_train, y_train
print(MAE(y_train, y_pred))

0.0709602062275


In [12]:
test_dates = ['2016-10-01','2016-11-01','2016-12-01','2017-10-01','2017-11-01','2017-12-01']
test_columns = ['201610','201611','201612','201710','201711','201712']

sub = pd.read_csv('../submissions/sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = y_pred

print('Writing csv ...')
sub.to_csv('xgb_cv_v2.csv', index=False, float_format='%.4f')

Writing csv ...


In [None]:

#xgb_params['n_estimators'] = cv_results.shape[0]
# fit on training
clf.set_params(n_estimators=cv_results.shape[0])
clf.set_params(max_depth=int(5))
print(clf.get_params())
clf
clf.fit(x_train, y_train, eval_metric='mae')