#### Summary

Build and explore gradient boosting models using the xgboost library.

In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
import time as time

In [4]:
# Directory holding the input CSV files, relative to the working directory.
# The trailing slash is required: file names are appended to it directly.
INPUT_DIR = "../input/"

In [5]:
# Read the training set; the final expression shows the elapsed wall-clock
# time of the read in seconds.
ts = time.time()
train = pd.read_csv(f'{INPUT_DIR}train.csv')
time.time() - ts

5.365730047225952

In [6]:
# Read the test set; the final expression shows the elapsed wall-clock
# time of the read in seconds.
ts = time.time()
test = pd.read_csv(f'{INPUT_DIR}test.csv')
time.time() - ts

71.05719113349915

In [3]:
# Log-transformed target, log(1 + target). np.log1p is used instead of
# np.log(x + 1.0) because it stays accurate for small values and matches the
# log1p-based objective/metric used in this notebook.
# NOTE: this column is derived from the target, so it must never be used as a
# model feature.
train['new_target'] = np.log1p(train['target'])

NameError: name 'train' is not defined

#### Build a basic model using xgboost

In [7]:
# Columns that must not be fed to the model as features: the row identifier,
# the target itself, and `new_target`, which is a log transform of the target
# and would leak it into the features if present.
NON_FEATURE_COLUMNS = {'target', 'ID', 'new_target'}
X_COLUMNS = [col for col in train.columns if col not in NON_FEATURE_COLUMNS]
Y_COLUMN = 'target'

In [8]:
# Plain NumPy views of the data for xgboost.
# X: one column per entry of X_COLUMNS.
# Y: selected with a list, so it is a 2-D single-column array rather than 1-D.
X = train.loc[:, X_COLUMNS].values
Y = train.loc[:, [Y_COLUMN]].values

In [9]:
# Wrap the full training data (features + labels) in xgboost's DMatrix,
# keeping the original column names as feature names.
xgb_complete_data = xgb.DMatrix(
    data=X,
    label=Y,
    feature_names=X_COLUMNS,
)

In [29]:
# Booster parameters: a small learning rate, with xgboost's default evaluation
# metric switched off.
# Earlier variant, kept for reference: {'eta': 0.01, 'eval_metric': 'rmsle'}
xgb_params = dict(eta=0.01, disable_default_eval_metric=1)

In [30]:
from typing import Tuple, Dict, List
def gradient(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray:
    '''First derivative of the squared log error with respect to the prediction.

    For the per-sample loss 1/2 * (log1p(predt) - log1p(y))**2 the derivative
    is (log1p(predt) - log1p(y)) / (predt + 1), computed element-wise.

    predt  -- current predictions; values must be greater than -1 for log1p
              to be finite.
    dtrain -- object whose get_label() returns the true target values.
    '''
    labels = dtrain.get_label()
    log_residual = np.log1p(predt) - np.log1p(labels)
    return log_residual / (predt + 1)

def hessian(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray:
    '''Second derivative of the squared log error with respect to the prediction.

    Element-wise value of
    (-log1p(predt) + log1p(y) + 1) / (predt + 1)**2.

    predt  -- current predictions; values must be greater than -1 for log1p
              to be finite.
    dtrain -- object whose get_label() returns the true target values.
    '''
    labels = dtrain.get_label()
    numerator = -np.log1p(predt) + np.log1p(labels) + 1
    denominator = np.power(predt + 1, 2)
    return numerator / denominator

def squared_log(predt: np.ndarray,
                dtrain: xgb.DMatrix) -> Tuple[np.ndarray, np.ndarray]:
    '''Squared Log Error objective. A simplified version for RMSLE used as
    objective function.

    predt  -- raw predictions for the rows of dtrain; not modified.
    dtrain -- training data whose labels are read through get_label().

    Returns the pair (grad, hess): the element-wise first and second
    derivatives of the squared log error with respect to the predictions.
    '''
    # Clamp on a copy so the caller's prediction array is never mutated.
    safe_predt = predt.copy()
    # log1p is only finite for inputs > -1. The boundary value -1 itself must
    # be clamped too: log1p(-1) is -inf and (predt + 1) would be 0, which
    # would produce infinite gradients and hessians.
    safe_predt[safe_predt <= -1] = -1 + 1e-6
    grad = gradient(safe_predt, dtrain)
    hess = hessian(safe_predt, dtrain)
    return grad, hess


def rmsle(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, float]:
    '''Root mean squared log error metric.

    predt  -- predictions for the rows of dtrain; not modified.
    dtrain -- data whose true labels are read through get_label().

    Returns the pair ('PyRMSLE', value), where value is
    sqrt(mean((log1p(y) - log1p(predt))**2)) as a Python float.
    '''
    y = dtrain.get_label()
    # Clamp on a copy so evaluating the metric has no side effect on the
    # caller's prediction array.
    safe_predt = predt.copy()
    # log1p is only finite for inputs > -1; the boundary -1 itself must be
    # clamped as well, otherwise log1p(-1) = -inf makes the metric infinite.
    safe_predt[safe_predt <= -1] = -1 + 1e-6
    elements = np.power(np.log1p(y) - np.log1p(safe_predt), 2)
    return 'PyRMSLE', float(np.sqrt(np.sum(elements) / len(y)))

In [32]:
# 5-fold cross-validation with the custom squared-log objective and the custom
# RMSLE metric; the final expression shows the elapsed wall-clock seconds.
ts = time.time()
model_xgboost = xgb.cv(params=xgb_params,
                       dtrain=xgb_complete_data,
                       num_boost_round=1000,
                       nfold=5,
                       early_stopping_rounds=5,
                       obj=squared_log,
                       feval=rmsle,
                       # Built-in progress logging replaces the deprecated
                       # xgb.callback.print_evaluation callback. It prints the
                       # same per-round train/test lines without the standard
                       # deviation. Pass an int (e.g. verbose_eval=50) to log
                       # only every N-th round.
                       verbose_eval=True,
                       show_stdv=False)
time.time() - ts

[0]	train-PyRMSLE:14.1839	test-PyRMSLE:14.1838
[1]	train-PyRMSLE:14.1747	test-PyRMSLE:14.1746
[2]	train-PyRMSLE:14.1655	test-PyRMSLE:14.1653
[3]	train-PyRMSLE:14.1563	test-PyRMSLE:14.1561
[4]	train-PyRMSLE:14.1471	test-PyRMSLE:14.1469
[5]	train-PyRMSLE:14.1378	test-PyRMSLE:14.1377
[6]	train-PyRMSLE:14.1286	test-PyRMSLE:14.1285
[7]	train-PyRMSLE:14.1194	test-PyRMSLE:14.1192
[8]	train-PyRMSLE:14.1102	test-PyRMSLE:14.11
[9]	train-PyRMSLE:14.101	test-PyRMSLE:14.1008
[10]	train-PyRMSLE:14.0917	test-PyRMSLE:14.0916
[11]	train-PyRMSLE:14.0825	test-PyRMSLE:14.0824
[12]	train-PyRMSLE:14.0733	test-PyRMSLE:14.0732
[13]	train-PyRMSLE:14.0641	test-PyRMSLE:14.0639
[14]	train-PyRMSLE:14.0549	test-PyRMSLE:14.0547
[15]	train-PyRMSLE:14.0457	test-PyRMSLE:14.0455
[16]	train-PyRMSLE:14.0364	test-PyRMSLE:14.0363
[17]	train-PyRMSLE:14.0272	test-PyRMSLE:14.0271
[18]	train-PyRMSLE:14.018	test-PyRMSLE:14.0179
[19]	train-PyRMSLE:14.0088	test-PyRMSLE:14.0087
[20]	train-PyRMSLE:13.9996	test-PyRMSLE:13.9995
[21]	t

[170]	train-PyRMSLE:12.6243	test-PyRMSLE:12.6242
[171]	train-PyRMSLE:12.6152	test-PyRMSLE:12.6151
[172]	train-PyRMSLE:12.6061	test-PyRMSLE:12.6059
[173]	train-PyRMSLE:12.597	test-PyRMSLE:12.5968
[174]	train-PyRMSLE:12.5879	test-PyRMSLE:12.5877
[175]	train-PyRMSLE:12.5788	test-PyRMSLE:12.5786
[176]	train-PyRMSLE:12.5696	test-PyRMSLE:12.5695
[177]	train-PyRMSLE:12.5605	test-PyRMSLE:12.5604
[178]	train-PyRMSLE:12.5514	test-PyRMSLE:12.5513
[179]	train-PyRMSLE:12.5423	test-PyRMSLE:12.5421
[180]	train-PyRMSLE:12.5332	test-PyRMSLE:12.533
[181]	train-PyRMSLE:12.5241	test-PyRMSLE:12.5239
[182]	train-PyRMSLE:12.515	test-PyRMSLE:12.5148
[183]	train-PyRMSLE:12.5059	test-PyRMSLE:12.5057
[184]	train-PyRMSLE:12.4968	test-PyRMSLE:12.4966
[185]	train-PyRMSLE:12.4877	test-PyRMSLE:12.4875
[186]	train-PyRMSLE:12.4786	test-PyRMSLE:12.4784
[187]	train-PyRMSLE:12.4694	test-PyRMSLE:12.4693
[188]	train-PyRMSLE:12.4603	test-PyRMSLE:12.4602
[189]	train-PyRMSLE:12.4512	test-PyRMSLE:12.4511
[190]	train-PyRMSLE:12.

[338]	train-PyRMSLE:11.1123	test-PyRMSLE:11.1121
[339]	train-PyRMSLE:11.1035	test-PyRMSLE:11.1034
[340]	train-PyRMSLE:11.0948	test-PyRMSLE:11.0946
[341]	train-PyRMSLE:11.086	test-PyRMSLE:11.0858
[342]	train-PyRMSLE:11.0772	test-PyRMSLE:11.0771
[343]	train-PyRMSLE:11.0685	test-PyRMSLE:11.0683
[344]	train-PyRMSLE:11.0597	test-PyRMSLE:11.0596
[345]	train-PyRMSLE:11.051	test-PyRMSLE:11.0508
[346]	train-PyRMSLE:11.0422	test-PyRMSLE:11.0421
[347]	train-PyRMSLE:11.0335	test-PyRMSLE:11.0333
[348]	train-PyRMSLE:11.0248	test-PyRMSLE:11.0246
[349]	train-PyRMSLE:11.0161	test-PyRMSLE:11.0159
[350]	train-PyRMSLE:11.0073	test-PyRMSLE:11.0072
[351]	train-PyRMSLE:10.9986	test-PyRMSLE:10.9985
[352]	train-PyRMSLE:10.9899	test-PyRMSLE:10.9897
[353]	train-PyRMSLE:10.9812	test-PyRMSLE:10.981
[354]	train-PyRMSLE:10.9725	test-PyRMSLE:10.9724
[355]	train-PyRMSLE:10.9638	test-PyRMSLE:10.9637
[356]	train-PyRMSLE:10.9552	test-PyRMSLE:10.955
[357]	train-PyRMSLE:10.9465	test-PyRMSLE:10.9463
[358]	train-PyRMSLE:10.9

[506]	train-PyRMSLE:9.7979	test-PyRMSLE:9.79771
[507]	train-PyRMSLE:9.79169	test-PyRMSLE:9.7915
[508]	train-PyRMSLE:9.7855	test-PyRMSLE:9.78531
[509]	train-PyRMSLE:9.77933	test-PyRMSLE:9.77915
[510]	train-PyRMSLE:9.7732	test-PyRMSLE:9.77301
[511]	train-PyRMSLE:9.76708	test-PyRMSLE:9.76689
[512]	train-PyRMSLE:9.76099	test-PyRMSLE:9.76081
[513]	train-PyRMSLE:9.75493	test-PyRMSLE:9.75474
[514]	train-PyRMSLE:9.74889	test-PyRMSLE:9.74871
[515]	train-PyRMSLE:9.74288	test-PyRMSLE:9.74269
[516]	train-PyRMSLE:9.73689	test-PyRMSLE:9.73671
[517]	train-PyRMSLE:9.73093	test-PyRMSLE:9.73075
[518]	train-PyRMSLE:9.725	test-PyRMSLE:9.72481
[519]	train-PyRMSLE:9.71909	test-PyRMSLE:9.7189
[520]	train-PyRMSLE:9.7132	test-PyRMSLE:9.71301
[521]	train-PyRMSLE:9.70734	test-PyRMSLE:9.70715
[522]	train-PyRMSLE:9.7015	test-PyRMSLE:9.70132
[523]	train-PyRMSLE:9.69569	test-PyRMSLE:9.69551
[524]	train-PyRMSLE:9.68991	test-PyRMSLE:9.68972
[525]	train-PyRMSLE:9.68415	test-PyRMSLE:9.68396
[526]	train-PyRMSLE:9.67842	t

[675]	train-PyRMSLE:9.06363	test-PyRMSLE:9.06343
[676]	train-PyRMSLE:9.06075	test-PyRMSLE:9.06056
[677]	train-PyRMSLE:9.05789	test-PyRMSLE:9.05769
[678]	train-PyRMSLE:9.05504	test-PyRMSLE:9.05484
[679]	train-PyRMSLE:9.05219	test-PyRMSLE:9.052
[680]	train-PyRMSLE:9.04937	test-PyRMSLE:9.04917
[681]	train-PyRMSLE:9.04655	test-PyRMSLE:9.04635
[682]	train-PyRMSLE:9.04374	test-PyRMSLE:9.04354
[683]	train-PyRMSLE:9.04095	test-PyRMSLE:9.04075
[684]	train-PyRMSLE:9.03816	test-PyRMSLE:9.03797
[685]	train-PyRMSLE:9.03539	test-PyRMSLE:9.0352
[686]	train-PyRMSLE:9.03263	test-PyRMSLE:9.03244
[687]	train-PyRMSLE:9.02988	test-PyRMSLE:9.02969
[688]	train-PyRMSLE:9.02715	test-PyRMSLE:9.02695
[689]	train-PyRMSLE:9.02442	test-PyRMSLE:9.02422
[690]	train-PyRMSLE:9.0217	test-PyRMSLE:9.02151
[691]	train-PyRMSLE:9.019	test-PyRMSLE:9.0188
[692]	train-PyRMSLE:9.01631	test-PyRMSLE:9.01611
[693]	train-PyRMSLE:9.01362	test-PyRMSLE:9.01343
[694]	train-PyRMSLE:9.01095	test-PyRMSLE:9.01076
[695]	train-PyRMSLE:9.00829

[843]	train-PyRMSLE:8.70378	test-PyRMSLE:8.70358
[844]	train-PyRMSLE:8.70218	test-PyRMSLE:8.70198
[845]	train-PyRMSLE:8.70058	test-PyRMSLE:8.70038
[846]	train-PyRMSLE:8.69898	test-PyRMSLE:8.69878
[847]	train-PyRMSLE:8.69739	test-PyRMSLE:8.69719
[848]	train-PyRMSLE:8.6958	test-PyRMSLE:8.6956
[849]	train-PyRMSLE:8.69422	test-PyRMSLE:8.69402
[850]	train-PyRMSLE:8.69265	test-PyRMSLE:8.69244
[851]	train-PyRMSLE:8.69107	test-PyRMSLE:8.69087
[852]	train-PyRMSLE:8.6895	test-PyRMSLE:8.6893
[853]	train-PyRMSLE:8.68794	test-PyRMSLE:8.68774
[854]	train-PyRMSLE:8.68638	test-PyRMSLE:8.68618
[855]	train-PyRMSLE:8.68482	test-PyRMSLE:8.68462
[856]	train-PyRMSLE:8.68327	test-PyRMSLE:8.68307
[857]	train-PyRMSLE:8.68173	test-PyRMSLE:8.68152
[858]	train-PyRMSLE:8.68018	test-PyRMSLE:8.67998
[859]	train-PyRMSLE:8.67864	test-PyRMSLE:8.67844
[860]	train-PyRMSLE:8.67711	test-PyRMSLE:8.67691
[861]	train-PyRMSLE:8.67558	test-PyRMSLE:8.67538
[862]	train-PyRMSLE:8.67405	test-PyRMSLE:8.67385
[863]	train-PyRMSLE:8.67

826.0873191356659

In [36]:
# Row(s) of the CV history where the mean test RMSLE reaches its minimum,
# shown as a Series indexed by boosting round.
cv_test_scores = model_xgboost['test-PyRMSLE-mean']
model_xgboost.loc[cv_test_scores == cv_test_scores.min(), 'test-PyRMSLE-mean']

999    8.497095
Name: test-PyRMSLE-mean, dtype: float64

In [37]:
# Index label (boosting round) of the first row with the minimum mean test RMSLE.
cv_test_scores = model_xgboost['test-PyRMSLE-mean']
model_xgboost.loc[cv_test_scores == cv_test_scores.min()].index[0]

999

In [None]:
# Show the installed xgboost version string.
xgb.__version__