### Summary

Build a model using xgboost and verify its performance on the validation set.


In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error

##### Get relevant data in monthly form

In [2]:
# Load the raw sales records from a path relative to the notebook's working directory.
# Later cells read the date_block_num, shop_id, item_id and item_cnt_day columns.
sales_train = pd.read_csv('input/sales_train.csv')

In [3]:
# Sum item_cnt_day per (date_block_num, shop_id, item_id) and expose the total
# as item_cnt_month. The frame is built in a single chained expression, with
# the grouping keys turned back into ordinary columns at the end.
group_keys = ['date_block_num', 'shop_id', 'item_id']
monthly_sales_data = (
    sales_train[group_keys + ['item_cnt_day']]
    .groupby(group_keys)
    .sum()
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
    .reset_index()
)

In [4]:
# Sanity check: show the column labels of the aggregated monthly frame.
monthly_sales_data.columns

Index(['date_block_num', 'shop_id', 'item_id', 'item_cnt_month'], dtype='object')

In [5]:
# Load the item catalogue; it supplies the item_id -> item_category_id mapping used below.
items = pd.read_csv('input/items.csv')

In [6]:
# Sanity check: show which columns the item catalogue provides.
items.columns

Index(['item_name', 'item_id', 'item_category_id'], dtype='object')

In [7]:
# Lookup table from item_id to item_category_id, built by walking the two
# catalogue columns in parallel.
item_id_to_item_category_id = {
    item_id: category_id
    for item_id, category_id in zip(items.item_id, items.item_category_id)
}

In [8]:
# Attach each row's category with a vectorised dictionary lookup instead of a
# per-row Python lambda. Series.map() yields NaN for item ids that are absent
# from the mapping, so those are filled with the same -1 sentinel the previous
# .get(x, -1) lookup produced, and the column is cast back to integers.
# NOTE(review): assumes item_category_id values are integers — confirm against items.csv.
monthly_sales_data['item_category_id'] = (
    monthly_sales_data['item_id']
    .map(item_id_to_item_category_id)
    .fillna(-1)
    .astype('int64')
)

In [9]:
# Hold out month block 32 for validation and block 33 for test; every other
# block goes into the training set.
VALIDATION_BLOCK, TEST_BLOCK = 32, 33
month_block = monthly_sales_data.date_block_num
train_data = monthly_sales_data[~month_block.isin([VALIDATION_BLOCK, TEST_BLOCK])]
validation_data = monthly_sales_data[month_block == VALIDATION_BLOCK]
test_data = monthly_sales_data[month_block == TEST_BLOCK]

##### Get started with xgboost.

In [10]:
# Target column, and the feature columns: every other column of the monthly
# frame, kept in the frame's own column order.
Y_COLUMN = 'item_cnt_month'
X_COLUMNS = [name for name in monthly_sales_data.columns if name != Y_COLUMN]

In [11]:
# Split the training rows into the feature frame X and the single-column
# target frame Y.
X, Y = train_data[X_COLUMNS], train_data[[Y_COLUMN]]

In [12]:
# Wrap the training features and labels in xgboost's DMatrix container.
xgb_train_data = xgb.DMatrix(data=X, label=Y, feature_names=X_COLUMNS)

In [13]:
# Booster parameters: only the learning rate (eta) is set; everything else
# stays at xgboost's defaults.
xgb_params = dict(eta=0.1)

In [14]:
# Baseline fit: 100 boosting rounds on the training DMatrix with the
# parameters chosen above.
model_1 = xgb.train(xgb_params, xgb_train_data, num_boost_round=100)

In [15]:
# Echo the trained object; xgb.train returns a Booster.
model_1

<xgboost.core.Booster at 0x127645588>

In [16]:
# Feature frame and single-column target frame for the validation rows.
new_X, new_Y = validation_data[X_COLUMNS], validation_data[[Y_COLUMN]]

In [17]:
# Features-only DMatrix for prediction; no labels are attached at this point.
xgb_validation_data = xgb.DMatrix(data=new_X, feature_names=X_COLUMNS)

In [18]:
# Predict item_cnt_month for the validation rows with the current model_1.
validation_predictions = model_1.predict(xgb_validation_data)

In [19]:
# Validation RMSE with both the true counts and the predictions clipped to
# [0, 20] before comparison. Arguments follow sklearn's (y_true, y_pred)
# order; squared error is symmetric, so the score is unaffected.
clipped_truth = np.clip(new_Y, 0, 20)
clipped_predictions = np.clip(validation_predictions, 0, 20)
np.sqrt(mean_squared_error(clipped_truth, clipped_predictions))

2.3387417865406737

##### That looks encouraging: it is the best score we have seen so far. Let us try training the model for more rounds with a smaller learning rate.

In [21]:
# Ten times smaller learning rate than the baseline run.
xgb_params = dict(eta=0.01)

In [22]:
# Retrain from scratch with the smaller learning rate and ten times as many
# boosting rounds; this rebinds model_1 to the new Booster.
model_1 = xgb.train(xgb_params, xgb_train_data, num_boost_round=1000)

In [23]:
# Re-score the validation rows with the retrained model_1.
validation_predictions = model_1.predict(xgb_validation_data)

In [24]:
# Validation RMSE with both the true counts and the predictions clipped to
# [0, 20] before comparison. Arguments follow sklearn's (y_true, y_pred)
# order; squared error is symmetric, so the score is unaffected.
clipped_truth = np.clip(new_Y, 0, 20)
clipped_predictions = np.clip(validation_predictions, 0, 20)
np.sqrt(mean_squared_error(clipped_truth, clipped_predictions))

2.3200486166464533

##### Okay, that shows diminishing returns, so let us see whether early stopping helps. Note, however, that early stopping monitors the unclipped RMSE while we score on clipped values, so it may not stop at the best round for our metric, but it should be reasonably close. Let us go on!

In [25]:
# Rebuild the validation DMatrix with labels attached so it can be used as an
# evaluation set during training.
xgb_validation_data = xgb.DMatrix(data=new_X, label=new_Y, feature_names=X_COLUMNS)

In [26]:
# Same small learning rate, now with RMSE named explicitly as the metric
# reported on the evaluation set.
xgb_params = dict(eta=0.01, eval_metric='rmse')

In [27]:
# Train for up to 1000 rounds, watching RMSE on the labelled validation
# DMatrix and stopping once it has not improved for 5 consecutive rounds.
# NOTE(review): the same validation rows are later used to report the score,
# so that score is not independent of the stopping decision.
watchlist = [(xgb_validation_data, 'eval')]
model_1 = xgb.train(
    xgb_params,
    xgb_train_data,
    num_boost_round=1000,
    evals=watchlist,
    early_stopping_rounds=5,
)

[0]	eval-rmse:18.5338
Will train until eval-rmse hasn't improved in 5 rounds.
[1]	eval-rmse:18.518
[2]	eval-rmse:18.5026
[3]	eval-rmse:18.4876
[4]	eval-rmse:18.4735
[5]	eval-rmse:18.4592
[6]	eval-rmse:18.4458
[7]	eval-rmse:18.4321
[8]	eval-rmse:18.4187
[9]	eval-rmse:18.4062
[10]	eval-rmse:18.3934
[11]	eval-rmse:18.3809
[12]	eval-rmse:18.3688
[13]	eval-rmse:18.357
[14]	eval-rmse:18.3461
[15]	eval-rmse:18.3348
[16]	eval-rmse:18.3239
[17]	eval-rmse:18.3136
[18]	eval-rmse:18.3056
[19]	eval-rmse:18.2953
[20]	eval-rmse:18.288
[21]	eval-rmse:18.2782
[22]	eval-rmse:18.2709
[23]	eval-rmse:18.2615
[24]	eval-rmse:18.2546
[25]	eval-rmse:18.2461
[26]	eval-rmse:18.2395
[27]	eval-rmse:18.2308
[28]	eval-rmse:18.2241
[29]	eval-rmse:18.2183
[30]	eval-rmse:18.2102
[31]	eval-rmse:18.2035
[32]	eval-rmse:18.1949
[33]	eval-rmse:18.189
[34]	eval-rmse:18.1806
[35]	eval-rmse:18.175
[36]	eval-rmse:18.1676
[37]	eval-rmse:18.162
[38]	eval-rmse:18.1542
[39]	eval-rmse:18.1495
[40]	eval-rmse:18.1422
[41]	eval-rmse:18

[346]	eval-rmse:17.5583
[347]	eval-rmse:17.5557
[348]	eval-rmse:17.5528
[349]	eval-rmse:17.5515
[350]	eval-rmse:17.5514
[351]	eval-rmse:17.55
[352]	eval-rmse:17.55
[353]	eval-rmse:17.5474
[354]	eval-rmse:17.546
[355]	eval-rmse:17.5434
[356]	eval-rmse:17.5407
[357]	eval-rmse:17.5395
[358]	eval-rmse:17.5384
[359]	eval-rmse:17.5372
[360]	eval-rmse:17.537
[361]	eval-rmse:17.5362
[362]	eval-rmse:17.5361
[363]	eval-rmse:17.5335
[364]	eval-rmse:17.5334
[365]	eval-rmse:17.5326
[366]	eval-rmse:17.5301
[367]	eval-rmse:17.5274
[368]	eval-rmse:17.5266
[369]	eval-rmse:17.5255
[370]	eval-rmse:17.5247
[371]	eval-rmse:17.5247
[372]	eval-rmse:17.5221
[373]	eval-rmse:17.5194
[374]	eval-rmse:17.5168
[375]	eval-rmse:17.5141
[376]	eval-rmse:17.5133
[377]	eval-rmse:17.5113
[378]	eval-rmse:17.5086
[379]	eval-rmse:17.5079
[380]	eval-rmse:17.5066
[381]	eval-rmse:17.5041
[382]	eval-rmse:17.5034
[383]	eval-rmse:17.5007
[384]	eval-rmse:17.5
[385]	eval-rmse:17.4999
[386]	eval-rmse:17.4976
[387]	eval-rmse:17.4969
[

In [28]:
# Predict using only the trees up to the limit recorded by early stopping.
# NOTE(review): ntree_limit / best_ntree_limit belong to the older xgboost
# API; newer releases use iteration_range — confirm against the installed version.
best_tree_count = model_1.best_ntree_limit
validation_predictions = model_1.predict(xgb_validation_data, ntree_limit=best_tree_count)

In [29]:
# Validation RMSE with both the true counts and the predictions clipped to
# [0, 20] before comparison. Arguments follow sklearn's (y_true, y_pred)
# order; squared error is symmetric, so the score is unaffected.
clipped_truth = np.clip(new_Y, 0, 20)
clipped_predictions = np.clip(validation_predictions, 0, 20)
np.sqrt(mean_squared_error(clipped_truth, clipped_predictions))

2.316780274224487

##### Okay, that does not give us much of an improvement, but let us check the variable importance here.

In [30]:
# Feature-importance chart for the current model_1, drawn on an explicitly
# sized axes so the labels stay readable. pyplot (plt) is imported together
# with the other dependencies in the notebook's first cell rather than here.
fig, ax = plt.subplots(1, 1, figsize=(16, 9))
xgb.plot_importance(model_1, ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x12f378d68>

##### Let us see whether cross-validation helps us here. One may question the use of cross-validation since we are dealing with time-series data; however, we are not treating this strictly as a time-series problem, but rather as a regression problem on item_cnt_month and the other columns.

In [31]:
# Re-derive the training feature frame X and single-column target frame Y
# from train_data.
X, Y = train_data[X_COLUMNS], train_data[[Y_COLUMN]]

In [32]:
# Fresh training DMatrix for the cross-validation run.
xgb_train_data = xgb.DMatrix(data=X, label=Y, feature_names=X_COLUMNS)

In [33]:
# Back to the larger learning rate of the baseline run, with RMSE as the
# reported metric.
xgb_params = dict(eta=0.1, eval_metric='rmse')

In [34]:
# 5-fold cross-validation over the training DMatrix for up to 1000 rounds,
# stopping when the test metric has not improved for 5 rounds. The callback
# prints per-round means without the standard deviation.
# NOTE: model_1 is rebound here to the table returned by xgb.cv (per-round
# train/test RMSE statistics); it is no longer a Booster after this cell.
cv_callbacks = [xgb.callback.print_evaluation(show_stdv=False)]
model_1 = xgb.cv(
    xgb_params,
    xgb_train_data,
    num_boost_round=1000,
    nfold=5,
    early_stopping_rounds=5,
    callbacks=cv_callbacks,
)

[0]	train-rmse:7.99368	test-rmse:7.96755
[1]	train-rmse:7.6766	test-rmse:7.65507
[2]	train-rmse:7.40793	test-rmse:7.39184
[3]	train-rmse:7.18181	test-rmse:7.17257
[4]	train-rmse:6.98281	test-rmse:6.98576
[5]	train-rmse:6.82234	test-rmse:6.82667
[6]	train-rmse:6.6897	test-rmse:6.69962
[7]	train-rmse:6.57976	test-rmse:6.59977
[8]	train-rmse:6.48322	test-rmse:6.5101
[9]	train-rmse:6.39822	test-rmse:6.43156
[10]	train-rmse:6.32507	test-rmse:6.36505
[11]	train-rmse:6.26638	test-rmse:6.313
[12]	train-rmse:6.20237	test-rmse:6.25762
[13]	train-rmse:6.15164	test-rmse:6.21604
[14]	train-rmse:6.10977	test-rmse:6.18031
[15]	train-rmse:6.07227	test-rmse:6.14751
[16]	train-rmse:6.02941	test-rmse:6.11304
[17]	train-rmse:5.99583	test-rmse:6.08269
[18]	train-rmse:5.97154	test-rmse:6.06165
[19]	train-rmse:5.93322	test-rmse:6.02932
[20]	train-rmse:5.90095	test-rmse:5.997
[21]	train-rmse:5.87228	test-rmse:5.9734
[22]	train-rmse:5.82793	test-rmse:5.93602
[23]	train-rmse:5.79579	test-rmse:5.91274
[24]	train

[195]	train-rmse:4.7241	test-rmse:5.08272
[196]	train-rmse:4.72084	test-rmse:5.07975
[197]	train-rmse:4.71972	test-rmse:5.0787
[198]	train-rmse:4.71864	test-rmse:5.07742
[199]	train-rmse:4.71626	test-rmse:5.07611
[200]	train-rmse:4.71459	test-rmse:5.0751
[201]	train-rmse:4.71237	test-rmse:5.07316
[202]	train-rmse:4.70829	test-rmse:5.07231
[203]	train-rmse:4.70593	test-rmse:5.07088
[204]	train-rmse:4.70378	test-rmse:5.06827
[205]	train-rmse:4.69957	test-rmse:5.06669
[206]	train-rmse:4.6976	test-rmse:5.06634
[207]	train-rmse:4.69515	test-rmse:5.06447
[208]	train-rmse:4.68874	test-rmse:5.06198
[209]	train-rmse:4.68644	test-rmse:5.06149
[210]	train-rmse:4.68263	test-rmse:5.05884
[211]	train-rmse:4.68025	test-rmse:5.05758
[212]	train-rmse:4.67886	test-rmse:5.05684
[213]	train-rmse:4.67732	test-rmse:5.05509
[214]	train-rmse:4.67495	test-rmse:5.05423
[215]	train-rmse:4.67291	test-rmse:5.05331
[216]	train-rmse:4.67207	test-rmse:5.05223
[217]	train-rmse:4.66864	test-rmse:5.05094
[218]	train-rms

[387]	train-rmse:4.31979	test-rmse:4.89148
[388]	train-rmse:4.31816	test-rmse:4.89014
[389]	train-rmse:4.31682	test-rmse:4.88891
[390]	train-rmse:4.31371	test-rmse:4.8886
[391]	train-rmse:4.31097	test-rmse:4.88788
[392]	train-rmse:4.30885	test-rmse:4.8872
[393]	train-rmse:4.3071	test-rmse:4.88681
[394]	train-rmse:4.30374	test-rmse:4.88792
[395]	train-rmse:4.30286	test-rmse:4.88751
[396]	train-rmse:4.30216	test-rmse:4.88708
[397]	train-rmse:4.30127	test-rmse:4.88638
[398]	train-rmse:4.2998	test-rmse:4.88601
[399]	train-rmse:4.29886	test-rmse:4.88548
[400]	train-rmse:4.29794	test-rmse:4.88528
[401]	train-rmse:4.29752	test-rmse:4.88497
[402]	train-rmse:4.29716	test-rmse:4.88472
[403]	train-rmse:4.29474	test-rmse:4.88405
[404]	train-rmse:4.29373	test-rmse:4.88352
[405]	train-rmse:4.29257	test-rmse:4.88297
[406]	train-rmse:4.29141	test-rmse:4.88203
[407]	train-rmse:4.28919	test-rmse:4.88083
[408]	train-rmse:4.28752	test-rmse:4.8816
[409]	train-rmse:4.28527	test-rmse:4.87852
[410]	train-rmse

In [35]:
# Show the cross-validation history: one row per boosting round with the
# mean and std of train/test RMSE across folds.
model_1

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,7.993683,0.173063,7.967547,0.716690
1,7.676595,0.164397,7.655071,0.707542
2,7.407930,0.157907,7.391842,0.697180
3,7.181811,0.151891,7.172567,0.683760
4,6.982813,0.148613,6.985764,0.661336
5,6.822337,0.146630,6.826675,0.649484
6,6.689696,0.141505,6.699616,0.640954
7,6.579756,0.141152,6.599768,0.631106
8,6.483219,0.133613,6.510103,0.624256
9,6.398223,0.132193,6.431563,0.616674


In [40]:
# Fit a second Booster on the training DMatrix with whatever xgb_params
# currently holds, for a fixed 1000 rounds (no eval set, no early stopping).
# NOTE(review): the execution counter jumps from In [35] to In [40]; confirm
# that no removed cell changed xgb_params or xgb_train_data before this ran.
model_2 = xgb.train(xgb_params, xgb_train_data, num_boost_round=1000)

In [41]:
# Echo the trained object; xgb.train returns a Booster.
model_2

<xgboost.core.Booster at 0x12cffdfd0>

In [42]:
# Predict item_cnt_month for the validation rows with model_2 (all trees used).
validation_predictions = model_2.predict(xgb_validation_data)

In [43]:
# Validation RMSE with both the true counts and the predictions clipped to
# [0, 20] before comparison. Arguments follow sklearn's (y_true, y_pred)
# order; squared error is symmetric, so the score is unaffected.
clipped_truth = np.clip(new_Y, 0, 20)
clipped_predictions = np.clip(validation_predictions, 0, 20)
np.sqrt(mean_squared_error(clipped_truth, clipped_predictions))

2.2703912594568307

##### That looks like a reasonably good jump! I am not sure what else we can do with xgboost on the raw features alone (we can create other features and then apply xgboost to them, which we will do later in a separate notebook).