### Summary

Build a model using xgboost and verify it's performance on the validation set.


In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.metrics import mean_squared_error

##### Get relevant data in monthly form

In [2]:
sales_train = pd.read_csv('input/sales_train.csv')

In [3]:
monthly_sales_data = sales_train[['date_block_num', 'shop_id', 'item_id', 'item_cnt_day']].groupby(
    ['date_block_num', 'shop_id', 'item_id']).sum()
monthly_sales_data.rename(columns={'item_cnt_day':'item_cnt_month'}, inplace=True)
monthly_sales_data.reset_index(['date_block_num', 'shop_id', 'item_id'], inplace=True)

In [4]:
monthly_sales_data.columns

Index(['date_block_num', 'shop_id', 'item_id', 'item_cnt_month'], dtype='object')

In [5]:
items = pd.read_csv('input/items.csv')

In [6]:
items.columns

Index(['item_name', 'item_id', 'item_category_id'], dtype='object')

In [7]:
item_id_to_item_category_id = dict(zip(items.item_id, items.item_category_id))

In [8]:
monthly_sales_data['item_category_id'] = monthly_sales_data['item_id'].apply(lambda x : item_id_to_item_category_id.get(x, -1))

In [9]:
train_data = monthly_sales_data[monthly_sales_data.date_block_num != 32]
validation_data = monthly_sales_data[monthly_sales_data.date_block_num == 32]

##### Get started with xgboost.

In [10]:
Y_COLUMN = 'item_cnt_month'
X_COLUMNS = [col for col in monthly_sales_data.columns if col != Y_COLUMN]

In [11]:
Y = train_data[[Y_COLUMN]]
X = train_data[X_COLUMNS]

In [12]:
xgb_train_data = xgb.DMatrix(X, Y, feature_names=X_COLUMNS)

In [13]:
xgb_params = {'eta' : 0.1}

In [17]:
model_1 = xgb.train(params=xgb_params,
                    dtrain=xgb_train_data,
                    num_boost_round=100)

In [15]:
model_1

<xgboost.core.Booster at 0x1228f4550>

In [14]:
new_X = validation_data[X_COLUMNS]
new_Y = validation_data[[Y_COLUMN]]

In [15]:
xgb_validation_data = xgb.DMatrix(new_X, feature_names=X_COLUMNS)

In [18]:
validation_predictions = model_1.predict(xgb_validation_data)

In [19]:
np.sqrt(mean_squared_error(np.clip(validation_predictions, 0 , 20),  np.clip(new_Y, 0, 20)))

2.2702181217098234

##### That looks encouraging. This is the best of what we have seen so far. Let us try training the model a bit further.

In [20]:
xgb_params = {'eta' : 0.01}

In [21]:
model_1 = xgb.train(params=xgb_params,
                    dtrain=xgb_train_data,
                    num_boost_round=1000)

In [22]:
validation_predictions = model_1.predict(xgb_validation_data)

In [23]:
np.sqrt(mean_squared_error(np.clip(validation_predictions, 0 , 20),  np.clip(new_Y, 0, 20)))

2.2645925539553553

##### Okay that looks to be showing diminishing returns and let us see early stopping would help us. Note, however that because of clipping, this may not offer us the best solution, but should be reasonably close. Let us go on !

In [24]:
xgb_validation_data = xgb.DMatrix(new_X, new_Y, feature_names=X_COLUMNS)

In [25]:
xgb_params = {'eta' : 0.01, 'eval_metric' : 'rmse'}

In [26]:
model_1 = xgb.train(params=xgb_params,
                    dtrain=xgb_train_data,
                    num_boost_round=1000,
                    evals=[(xgb_validation_data, 'eval')],
                    early_stopping_rounds=5)

[0]	eval-rmse:18.5337
Will train until eval-rmse hasn't improved in 5 rounds.
[1]	eval-rmse:18.5178
[2]	eval-rmse:18.5029
[3]	eval-rmse:18.4878
[4]	eval-rmse:18.4735
[5]	eval-rmse:18.4591
[6]	eval-rmse:18.4455
[7]	eval-rmse:18.4319
[8]	eval-rmse:18.4185
[9]	eval-rmse:18.4061
[10]	eval-rmse:18.3934
[11]	eval-rmse:18.3809
[12]	eval-rmse:18.3693
[13]	eval-rmse:18.3574
[14]	eval-rmse:18.3459
[15]	eval-rmse:18.3345
[16]	eval-rmse:18.3258
[17]	eval-rmse:18.3155
[18]	eval-rmse:18.3072
[19]	eval-rmse:18.2991
[20]	eval-rmse:18.2895
[21]	eval-rmse:18.2817
[22]	eval-rmse:18.272
[23]	eval-rmse:18.2645
[24]	eval-rmse:18.2554
[25]	eval-rmse:18.2488
[26]	eval-rmse:18.24
[27]	eval-rmse:18.2334
[28]	eval-rmse:18.2249
[29]	eval-rmse:18.2185
[30]	eval-rmse:18.2094
[31]	eval-rmse:18.2025
[32]	eval-rmse:18.1956
[33]	eval-rmse:18.188
[34]	eval-rmse:18.1812
[35]	eval-rmse:18.1737
[36]	eval-rmse:18.1673
[37]	eval-rmse:18.16
[38]	eval-rmse:18.1537
[39]	eval-rmse:18.147
[40]	eval-rmse:18.1413
[41]	eval-rmse:18.

[345]	eval-rmse:17.4764
[346]	eval-rmse:17.4759
[347]	eval-rmse:17.4747
[348]	eval-rmse:17.4749
[349]	eval-rmse:17.4747
[350]	eval-rmse:17.4745
[351]	eval-rmse:17.4745
[352]	eval-rmse:17.4735
[353]	eval-rmse:17.473
[354]	eval-rmse:17.4723
[355]	eval-rmse:17.4701
[356]	eval-rmse:17.4704
[357]	eval-rmse:17.4714
[358]	eval-rmse:17.4721
[359]	eval-rmse:17.4718
[360]	eval-rmse:17.4696
[361]	eval-rmse:17.4703
[362]	eval-rmse:17.4697
[363]	eval-rmse:17.4706
[364]	eval-rmse:17.47
[365]	eval-rmse:17.4698
Stopping. Best iteration:
[360]	eval-rmse:17.4696



In [27]:
validation_predictions = model_1.predict(xgb_validation_data, ntree_limit=model_1.best_ntree_limit)

In [28]:
np.sqrt(mean_squared_error(np.clip(validation_predictions, 0, 20), np.clip(new_Y, 0, 20)))

2.296194608869891

##### Okay , that does not look to give us something now, but let us check the variable importance here.

In [29]:
import matplotlib.pyplot as plt
fig, ax = plt.subplots(1, 1, figsize=(16, 9))
xgb.plot_importance(model_1, ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x12616cb00>

##### Let us try if cross validation helps us here. One may question the use of cross validation, since we are dealing with time series data, but we are not exactly treating this as a time series problem, but rather as a modelling problem with item_cnt_month and other parameters.

In [21]:
Y = train_data[[Y_COLUMN]]
X = train_data[X_COLUMNS]

In [22]:
xgb_train_data = xgb.DMatrix(X, Y, feature_names=X_COLUMNS)

In [23]:
xgb_params = {'eta' : 0.1, 'eval_metric' : 'rmse'}

In [35]:
model_1 = xgb.cv(params=xgb_params,
                 dtrain=xgb_train_data,
                 num_boost_round=1000,
                 nfold=5,
                 early_stopping_rounds=5,
                 callbacks=[xgb.callback.print_evaluation(show_stdv=False)])

[0]	train-rmse:8.17519	test-rmse:8.13578
[1]	train-rmse:7.8668	test-rmse:7.83215
[2]	train-rmse:7.60899	test-rmse:7.58025
[3]	train-rmse:7.39351	test-rmse:7.37076
[4]	train-rmse:7.20926	test-rmse:7.18789
[5]	train-rmse:7.05832	test-rmse:7.04036
[6]	train-rmse:6.93321	test-rmse:6.92575
[7]	train-rmse:6.82464	test-rmse:6.82752
[8]	train-rmse:6.72335	test-rmse:6.74105
[9]	train-rmse:6.64122	test-rmse:6.66217
[10]	train-rmse:6.5737	test-rmse:6.60178
[11]	train-rmse:6.49989	test-rmse:6.5383
[12]	train-rmse:6.438	test-rmse:6.48861
[13]	train-rmse:6.38363	test-rmse:6.44636
[14]	train-rmse:6.33694	test-rmse:6.40712
[15]	train-rmse:6.29158	test-rmse:6.36382
[16]	train-rmse:6.24971	test-rmse:6.33297
[17]	train-rmse:6.21501	test-rmse:6.29893
[18]	train-rmse:6.18745	test-rmse:6.27834
[19]	train-rmse:6.15075	test-rmse:6.25137
[20]	train-rmse:6.11848	test-rmse:6.22223
[21]	train-rmse:6.07578	test-rmse:6.18334
[22]	train-rmse:6.02687	test-rmse:6.14345
[23]	train-rmse:5.9973	test-rmse:6.11946
[24]	tra

[195]	train-rmse:4.80002	test-rmse:5.34552
[196]	train-rmse:4.79839	test-rmse:5.34236
[197]	train-rmse:4.79539	test-rmse:5.34109
[198]	train-rmse:4.79031	test-rmse:5.34017
[199]	train-rmse:4.78822	test-rmse:5.33894
[200]	train-rmse:4.78618	test-rmse:5.33567
[201]	train-rmse:4.78065	test-rmse:5.32852
[202]	train-rmse:4.77436	test-rmse:5.32425
[203]	train-rmse:4.7699	test-rmse:5.32276
[204]	train-rmse:4.76675	test-rmse:5.32151
[205]	train-rmse:4.76483	test-rmse:5.32047
[206]	train-rmse:4.76318	test-rmse:5.32
[207]	train-rmse:4.76024	test-rmse:5.31652
[208]	train-rmse:4.75671	test-rmse:5.3152
[209]	train-rmse:4.75389	test-rmse:5.31128
[210]	train-rmse:4.7521	test-rmse:5.31031
[211]	train-rmse:4.74599	test-rmse:5.30869
[212]	train-rmse:4.74315	test-rmse:5.30621
[213]	train-rmse:4.73943	test-rmse:5.30527
[214]	train-rmse:4.73693	test-rmse:5.30413
[215]	train-rmse:4.73552	test-rmse:5.3031
[216]	train-rmse:4.72897	test-rmse:5.30179
[217]	train-rmse:4.72405	test-rmse:5.30021
[218]	train-rmse:4

[387]	train-rmse:4.37677	test-rmse:5.10997
[388]	train-rmse:4.37445	test-rmse:5.10928
[389]	train-rmse:4.37336	test-rmse:5.10864
[390]	train-rmse:4.373	test-rmse:5.10849
[391]	train-rmse:4.37028	test-rmse:5.10814
[392]	train-rmse:4.3693	test-rmse:5.10861
[393]	train-rmse:4.36883	test-rmse:5.10832
[394]	train-rmse:4.36824	test-rmse:5.10765
[395]	train-rmse:4.3658	test-rmse:5.10679
[396]	train-rmse:4.36436	test-rmse:5.10739
[397]	train-rmse:4.36376	test-rmse:5.10712
[398]	train-rmse:4.36338	test-rmse:5.10687
[399]	train-rmse:4.36066	test-rmse:5.1061
[400]	train-rmse:4.35844	test-rmse:5.10531
[401]	train-rmse:4.35622	test-rmse:5.10505
[402]	train-rmse:4.35525	test-rmse:5.10401
[403]	train-rmse:4.35427	test-rmse:5.10386
[404]	train-rmse:4.35316	test-rmse:5.10259
[405]	train-rmse:4.35103	test-rmse:5.1019
[406]	train-rmse:4.34961	test-rmse:5.10084
[407]	train-rmse:4.34769	test-rmse:5.10137
[408]	train-rmse:4.34619	test-rmse:5.10105
[409]	train-rmse:4.34498	test-rmse:5.10049
[410]	train-rmse:

[579]	train-rmse:4.14382	test-rmse:5.00386
[580]	train-rmse:4.14151	test-rmse:5.00254
[581]	train-rmse:4.14049	test-rmse:5.00178
[582]	train-rmse:4.14001	test-rmse:5.00165
[583]	train-rmse:4.13892	test-rmse:5.00096
[584]	train-rmse:4.1369	test-rmse:5.00027
[585]	train-rmse:4.13467	test-rmse:5.00011
[586]	train-rmse:4.13408	test-rmse:4.99961
[587]	train-rmse:4.13288	test-rmse:4.99902
[588]	train-rmse:4.13213	test-rmse:4.99787
[589]	train-rmse:4.13007	test-rmse:4.99677
[590]	train-rmse:4.12882	test-rmse:4.9959
[591]	train-rmse:4.12837	test-rmse:4.99569
[592]	train-rmse:4.12798	test-rmse:4.99554
[593]	train-rmse:4.12731	test-rmse:4.99523
[594]	train-rmse:4.12604	test-rmse:4.99497
[595]	train-rmse:4.12551	test-rmse:4.99457
[596]	train-rmse:4.12475	test-rmse:4.99403
[597]	train-rmse:4.12425	test-rmse:4.9936
[598]	train-rmse:4.12362	test-rmse:4.9931
[599]	train-rmse:4.12316	test-rmse:4.99291
[600]	train-rmse:4.12208	test-rmse:4.99143
[601]	train-rmse:4.12183	test-rmse:4.99129
[602]	train-rms

[771]	train-rmse:3.95602	test-rmse:4.90869
[772]	train-rmse:3.95419	test-rmse:4.90762
[773]	train-rmse:3.95362	test-rmse:4.90724
[774]	train-rmse:3.95326	test-rmse:4.9069
[775]	train-rmse:3.95247	test-rmse:4.90652
[776]	train-rmse:3.95157	test-rmse:4.90621
[777]	train-rmse:3.95034	test-rmse:4.90718
[778]	train-rmse:3.94999	test-rmse:4.90699
[779]	train-rmse:3.94922	test-rmse:4.9061
[780]	train-rmse:3.94765	test-rmse:4.90608
[781]	train-rmse:3.94702	test-rmse:4.90589
[782]	train-rmse:3.94676	test-rmse:4.90574
[783]	train-rmse:3.94611	test-rmse:4.90568
[784]	train-rmse:3.94504	test-rmse:4.90539
[785]	train-rmse:3.94302	test-rmse:4.90474
[786]	train-rmse:3.94236	test-rmse:4.90478
[787]	train-rmse:3.94141	test-rmse:4.90413
[788]	train-rmse:3.94046	test-rmse:4.90366
[789]	train-rmse:3.9389	test-rmse:4.90278
[790]	train-rmse:3.93841	test-rmse:4.9028
[791]	train-rmse:3.93788	test-rmse:4.90267
[792]	train-rmse:3.93681	test-rmse:4.90208
[793]	train-rmse:3.93581	test-rmse:4.90113
[794]	train-rms

[963]	train-rmse:3.81087	test-rmse:4.84558


In [36]:
model_1

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,8.175192,0.226822,8.135778,0.899370
1,7.866800,0.222125,7.832152,0.878476
2,7.608992,0.216531,7.580255,0.867566
3,7.393505,0.214725,7.370764,0.854560
4,7.209263,0.211733,7.187895,0.852783
5,7.058324,0.213192,7.040357,0.845423
6,6.933207,0.221954,6.925748,0.838595
7,6.824641,0.217021,6.827521,0.848070
8,6.723354,0.211403,6.741054,0.848490
9,6.641220,0.220266,6.662170,0.827373


In [25]:
model_2 = xgb.train(params=xgb_params,
                    dtrain=xgb_train_data,
                    num_boost_round=958)

In [26]:
model_2

<xgboost.core.Booster at 0x1110307f0>

In [27]:
validation_predictions = model_2.predict(xgb_validation_data)

In [28]:
np.sqrt(mean_squared_error(np.clip(validation_predictions, 0 , 20),  np.clip(new_Y, 0, 20)))

2.0723484427163665

##### That looks like a reasonably good jump !!! I am not sure what else we can do with xgboost. Let us generate predictions on the test set now.

In [29]:
test = pd.read_csv('input/test.csv')

In [31]:
test.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [33]:
test['date_block_num'] = 33

In [35]:
test['item_category_id'] = test['item_id'].apply(lambda x : item_id_to_item_category_id.get(x, -1))

In [36]:
test['item_category_id'].describe()

count    214200.000000
mean         46.309608
std          16.716581
min           0.000000
25%          37.000000
50%          43.000000
75%          58.000000
max          83.000000
Name: item_category_id, dtype: float64

In [37]:
test.head()

Unnamed: 0,ID,shop_id,item_id,date_block_num,item_category_id
0,0,5,5037,33,19
1,1,5,5320,33,55
2,2,5,5233,33,19
3,3,5,5232,33,23
4,4,5,5268,33,20


In [38]:
test_X = test[X_COLUMNS]

In [39]:
xgb_test_predictions = xgb.DMatrix(test_X, feature_names=X_COLUMNS)

In [40]:
test_predictions = model_2.predict(xgb_test_predictions)

In [41]:
test_predictions

array([1.6810371, 2.7009327, 1.9303876, ..., 1.0022887, 1.0603472,
       1.0034266], dtype=float32)

In [45]:
test['item_cnt_month'] = np.clip(test_predictions, 0, 20)

In [50]:
test[['ID', 'item_cnt_month']].to_csv('submissions/submission_xgboost.csv', index=False)