# XGBoost

<strong>Disclaimer</strong> -- this notebook was used for quick tests: the code may not run as-is, and cells may be out of order or use different parameters from one another. Please see the combined notebook for the final process used with XGBoost. This file is included as a reference.

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import train_test_split
from xgboost import plot_importance
import matplotlib.pyplot as plt

%matplotlib inline
import gc

### Data Prep

In [None]:
# Load the property attributes and the 2016 training transactions.
properties = pd.read_csv('../input/properties_2016.csv')
train = pd.read_csv("../input/train_2016_v2.csv")


# Pre-processing: fill missing values with the sentinel -1, then integer-encode
# any column that is still object-typed after the fill.
for col_name in properties.columns:
    filled = properties[col_name].fillna(-1)
    properties[col_name] = filled
    if properties[col_name].dtype == 'object':
        raw_values = list(properties[col_name].values)
        properties[col_name] = LabelEncoder().fit_transform(raw_values)

# Attach the property features to each transaction (left join on parcelid).
train_df = train.merge(properties, how='left', on='parcelid')

# Training features: everything except the id, the target and the date.
train_non_features = ['parcelid', 'logerror', 'transactiondate']
x_train = train_df.drop(train_non_features, axis=1)

# Prediction features: every property row, minus the id and four dropped columns.
test_non_features = ['parcelid', 'decktypeid', 'yardbuildingsqft26', 'basementsqft',
                     'buildingclasstypeid']
x_test = properties.drop(test_non_features, axis=1)

In [None]:
# Outlier removal: keep only rows whose logerror lies strictly between
# -0.4 and 0.419 (both bounds exclusive).
in_range = (train_df.logerror > -0.4) & (train_df.logerror < 0.419)
train_df = train_df[in_range]

# Rebuild the training features from the filtered rows, dropping the id, the
# target, the date and the same four columns that x_test leaves out.
excluded_cols = [
    'parcelid', 'logerror', 'transactiondate',
    'decktypeid', 'yardbuildingsqft26', 'basementsqft',
    'buildingclasstypeid',
]
x_train = train_df.drop(excluded_cols, axis=1)

# Target vector as float32, plus its mean (used later as xgboost's base_score).
y_train = train_df['logerror'].values.astype(np.float32)
y_mean = y_train.mean()

print('x_train: ', x_train.shape)
print('x_test: ', x_test.shape)

In [None]:
# Train test split: hold out 33% of the training rows for local validation.
#
# The training part of the target is stored under its own name
# (`y_train_split`) rather than overwriting `y_train`. Later cells pair the
# full-size `x_train` with `y_train`, so clobbering `y_train` with the 67%
# subset would make their lengths disagree. Keeping `y_train` intact also makes
# this cell safe to re-run.
X_train, X_test, y_train_split, y_test = train_test_split(
    x_train, y_train, test_size=0.33, random_state=42)
print('X_train: ', X_train.shape)
print('X_test: ', X_test.shape)
print('y_train: ', y_train_split.shape)
print('y_test: ', y_test.shape)

## Grid Search + Cross Validation

Note: the parameter grid and metrics below are test values, used only to get the model working.

In [None]:
# Hyperparameters explored by the grid search (3 x 3 = 9 combinations).
cv_params = {'max_depth': [3, 5, 7],
             'min_child_weight': [1, 3, 5]}

# Hyperparameters held fixed for every candidate model.
# The target (logerror) is continuous, so a regression objective is required;
# 'reg:linear' matches the objective used by the other xgboost cells in this
# notebook. A logistic objective would reject labels outside [0, 1].
ind_params = {'learning_rate': 0.1, 'n_estimators': 1000, 'seed': 0, 'subsample': 0.8,
              'colsample_bytree': 0.8,
              'objective': 'reg:linear'}

# Score candidates by (negated) mean absolute error, the same metric the rest of
# the notebook evaluates with. 'accuracy' is a classification metric and is not
# defined for a continuous target. The score is negated because GridSearchCV
# always maximises its scoring function.
optimized_GBM = GridSearchCV(xgb.XGBRegressor(**ind_params),
                             cv_params,
                             scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)

In [None]:
# Run the grid search through the scikit-learn estimator interface, using the
# search object configured in the previous cell.
# NOTE(review): x_train and y_train must have the same number of rows here --
# confirm y_train has not been replaced by a train/test split subset.
optimized_GBM.fit(x_train, y_train)
# Disabled alternative using the native xgboost API, kept for reference only:
#xgb.train(xgb_params, dtrain, num_boost_round=5000, nfold=5, metrics=['mae'],
                   #early_stopping_rounds=100, stratified=True)

In [None]:
# Manual grid search over eta (learning rate) and lambda (L2 regularisation),
# scoring every combination by 5-fold cross-validated MAE.
# MAE is an error, so lower is better: start from +inf and keep the minimum.
best_score = float('inf')
best_params = None

dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test)

num_boost_rounds = 250
for eta in [0.030, 0.032, 0.034, 0.036, 0.038]:
    print(eta)
    for lamb_da in [0.6, 0.7, 0.8, 0.9]:
        print(lamb_da)
        xgb_params = {
            'eta': eta,
            'max_depth': 5,
            'subsample': 0.80,
            'objective': 'reg:linear',
            'eval_metric': 'mae',
            'lambda': lamb_da,
            'alpha': 0.4,
            'base_score': y_mean,
            'silent': 1
            }
        # each combination -- run single model
        print('Fitting model...')
        print('ETA = ', eta, '\nLambda = ', lamb_da)
        # Perform cross validation. Stratified folds only make sense for class
        # labels; the target here is continuous, so plain K-fold is used.
        scores = xgb.cv(xgb_params, dtrain, num_boost_round=5000, nfold=5, metrics=['mae'],
                        early_stopping_rounds=100, stratified=False)
        # xgb.cv returns one row per boosting round with train/test mean and std
        # columns. Reduce it to a single scalar: the best (lowest) held-out MAE.
        score = scores['test-mae-mean'].min()
        if score < best_score:
            best_score = score
            best_params = {'eta': eta, 'lambda': lamb_da}

print('Best CV MAE = ', best_score, '\nBest params = ', best_params)

# build final model w/data

In [None]:
# Final model configuration: fixed hyperparameters for the native xgboost API.
# base_score starts every prediction from the mean training target.
xgb_params = {
    'eta': 0.037,
    'max_depth': 5,
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'lambda': 0.8,
    'alpha': 0.4,
    'base_score': y_mean,
    'silent': 1,
}

# Number of boosting rounds used by the first training run.
num_boost_rounds = 250

# Wrap the training features/targets and the prediction features in DMatrix form.
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test)

In [None]:
# Training: fit two boosters that differ only in the number of boosting rounds
# and average their predictions.

# First run, using the round count set in the configuration cell.
model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)
xgb_pred1 = model.predict(dtest)

# Second run, with slightly fewer rounds.
num_boost_rounds = 240
model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)
xgb_pred2 = model.predict(dtest)

# Simple average of the two runs.
xgb_pred = (xgb_pred1 + xgb_pred2) / 2

# Release the large intermediates, then collect once. The original final
# statement referenced `gc.collect` without calling it, so that last collection
# never ran.
del train_df, x_train, x_test, properties, dtest, dtrain, xgb_pred1, xgb_pred2
gc.collect()

## Predictions

In [None]:
# Prediction periods and their column labels (defined for reference; the loop
# below takes its columns from the sample submission file instead).
test_dates = ['2016-10-01','2016-11-01','2016-12-01','2017-10-01','2017-11-01','2017-12-01']
test_columns = ['201610','201611','201612','201710','201711','201712']

# Start from the sample submission and write the same averaged prediction
# vector into every column except the parcel id.
# NOTE(review): assumes the rows of xgb_pred are in the same order as the rows
# of the sample submission -- confirm.
sub = pd.read_csv('../submissions/sample_submission.csv')
prediction_cols = [name for name in sub.columns if name != 'ParcelId']
for name in prediction_cols:
    sub[name] = xgb_pred

print('Writing csv ...')
sub.to_csv('xgb_rm5features.csv', index=False, float_format='%.4f')

### Misc

In [None]:
def MAE(y, y_pred):
    """Return the mean absolute error between `y` and `y_pred`.

    Both arguments may be any array-like (list, NumPy array, pandas Series).
    They are converted to NumPy arrays first, so elements are paired by
    position; a pandas Series with a non-default index is handled correctly
    instead of being looked up by label.

    Parameters
    ----------
    y : array-like
        Observed values.
    y_pred : array-like
        Predicted values, same shape as `y`.

    Returns
    -------
    float
        mean(|y - y_pred|). NaN for empty input.

    Raises
    ------
    ValueError
        If `y` and `y_pred` do not have the same shape.
    """
    actual = np.asarray(y, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    # Check explicitly: NumPy broadcasting would otherwise silently accept
    # some mismatched shapes (e.g. a length-1 prediction vector).
    if actual.shape != predicted.shape:
        raise ValueError(
            'y and y_pred must have the same shape, got %s and %s'
            % (actual.shape, predicted.shape))
    return np.mean(np.abs(actual - predicted))
# Report the mean absolute error of the predictions against the training targets.
# NOTE(review): `y_pred` is not assigned in any visible cell of this notebook;
# presumably it held the model's predictions for the training rows -- confirm
# and define it before running this cell.
print(MAE(y_train, y_pred))

In [None]:
# Prediction periods and their column labels (defined for reference; the loop
# below takes its columns from the sample submission file instead).
test_dates = ['2016-10-01','2016-11-01','2016-12-01','2017-10-01','2017-11-01','2017-12-01']
test_columns = ['201610','201611','201612','201710','201711','201712']

# Fill every column of the sample submission except the parcel id with y_pred.
# NOTE(review): `y_pred` is not assigned in any visible cell of this notebook --
# confirm where it is meant to come from before running this cell.
sub = pd.read_csv('../submissions/sample_submission.csv')
for col in sub.columns:
    if col == 'ParcelId':
        continue
    sub[col] = y_pred

print('Writing csv ...')
sub.to_csv('xgb_cv_v2.csv', index=False, float_format='%.4f')