### Summary

Build a model using xgboost and verify it's performance on the validation set.


In [95]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.metrics import mean_squared_error

##### Get relevant data in monthly form

In [96]:
sales_train = pd.read_csv('input/sales_train.csv')

In [97]:
monthly_sales_data = sales_train[['date_block_num', 'shop_id', 'item_id', 'item_cnt_day']].groupby(
    ['date_block_num', 'shop_id', 'item_id']).sum()
monthly_sales_data.rename(columns={'item_cnt_day':'item_cnt_month'}, inplace=True)
monthly_sales_data.reset_index(['date_block_num', 'shop_id', 'item_id'], inplace=True)

In [98]:
monthly_sales_data['shop_id_item_id_date_block_num'] = list(zip(monthly_sales_data.shop_id, 
                                                                monthly_sales_data.item_id,
                                                                monthly_sales_data.date_block_num))

In [99]:
shop_id_item_id_date_block_num_to_item_cnt_month = dict(zip(monthly_sales_data.shop_id_item_id_date_block_num,
                                                            monthly_sales_data.item_cnt_month))

In [100]:
monthly_sales_data.columns

Index(['date_block_num', 'shop_id', 'item_id', 'item_cnt_month',
       'shop_id_item_id_date_block_num'],
      dtype='object')

In [101]:
items = pd.read_csv('input/items.csv')

In [102]:
items.columns

Index(['item_name', 'item_id', 'item_category_id'], dtype='object')

In [103]:
item_id_to_item_category_id = dict(zip(items.item_id, items.item_category_id))

In [104]:
monthly_sales_data['item_category_id'] = monthly_sales_data['item_id'].apply(lambda x : item_id_to_item_category_id.get(x, -1))

In [105]:
monthly_sales_data.columns

Index(['date_block_num', 'shop_id', 'item_id', 'item_cnt_month',
       'shop_id_item_id_date_block_num', 'item_category_id'],
      dtype='object')

In [106]:
monthly_sales_data['prev_month_sale'] = monthly_sales_data.apply(
    lambda x : shop_id_item_id_date_block_num_to_item_cnt_month.get((x.shop_id, x.item_id, x.date_block_num - 1), 0), 
    axis=1)

In [107]:
monthly_sales_data.columns

Index(['date_block_num', 'shop_id', 'item_id', 'item_cnt_month',
       'shop_id_item_id_date_block_num', 'item_category_id',
       'prev_month_sale'],
      dtype='object')

In [108]:
train_data = monthly_sales_data[monthly_sales_data.date_block_num != 32]
validation_data = monthly_sales_data[monthly_sales_data.date_block_num == 32]

##### Let us try to find the best model using cross validation.

In [109]:
X_COLUMNS = ['date_block_num', 'shop_id', 'item_id', 'item_category_id', 'prev_month_sale']
Y_COLUMN = 'item_cnt_month'

In [110]:
Y = train_data[[Y_COLUMN]]
X = train_data[X_COLUMNS]

In [111]:
new_Y = validation_data[[Y_COLUMN]]
new_X = validation_data[X_COLUMNS]

In [112]:
xgb_train_data = xgb.DMatrix(X, Y, feature_names=X_COLUMNS)

In [113]:
xgb_validation_data = xgb.DMatrix(new_X, new_Y, feature_names=X_COLUMNS)

In [114]:
xgb_params = {'eta' : 0.1, 'eval_metric' : 'rmse'}

In [115]:
#model_1 = xgb.cv(params=xgb_params,
#                 dtrain=xgb_train_data,
#                 num_boost_round=10,
#                 nfold=5,
#                 callbacks=[xgb.callback.print_evaluation(show_stdv=False)])

[0]	train-rmse:8.05453	test-rmse:8.05347
[1]	train-rmse:7.63568	test-rmse:7.67949
[2]	train-rmse:7.27265	test-rmse:7.36524
[3]	train-rmse:6.95776	test-rmse:7.09065
[4]	train-rmse:6.68654	test-rmse:6.86855
[5]	train-rmse:6.45138	test-rmse:6.67885
[6]	train-rmse:6.25012	test-rmse:6.52323
[7]	train-rmse:6.07591	test-rmse:6.39004
[8]	train-rmse:5.92852	test-rmse:6.28126
[9]	train-rmse:5.7985	test-rmse:6.18853


In [116]:
#model_1

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,8.054535,0.214693,8.053467,0.873601
1,7.635675,0.194543,7.67949,0.829216
2,7.272653,0.17747,7.365245,0.795057
3,6.957764,0.161885,7.090654,0.760476
4,6.686538,0.148378,6.868545,0.732464
5,6.451382,0.137769,6.678851,0.703577
6,6.250123,0.128743,6.52323,0.68158
7,6.075908,0.122885,6.390039,0.662186
8,5.928516,0.117253,6.281265,0.646697
9,5.798498,0.114081,6.188533,0.640566


In [117]:
# We need to run this, but be aware of the fact that running for 4000 rounds will take some time on a normal CPU.
#model_2 = xgb.train(params=xgb_params,
#                    dtrain=xgb_train_data,
#                    num_boost_round=4000)

In [118]:
model_2

<xgboost.core.Booster at 0x12c15eba8>

In [119]:
validation_predictions = model_2.predict(xgb_validation_data)

In [120]:
np.sqrt(mean_squared_error(np.clip(validation_predictions, 0 , 20),  np.clip(new_Y, 0, 20)))

2.210531084380242

##### That looks like a reasonably good jump !!! I am not sure what else we can do with xgboost. Let us generate predictions on the test set now.

In [121]:
test = pd.read_csv('input/test.csv')

In [122]:
test.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [123]:
test['date_block_num'] = 33

In [124]:
test['item_category_id'] = test['item_id'].apply(lambda x : item_id_to_item_category_id.get(x, -1))

In [125]:
test['item_category_id'].describe()

count    214200.000000
mean         46.309608
std          16.716581
min           0.000000
25%          37.000000
50%          43.000000
75%          58.000000
max          83.000000
Name: item_category_id, dtype: float64

In [126]:
test.head()

Unnamed: 0,ID,shop_id,item_id,date_block_num,item_category_id
0,0,5,5037,33,19
1,1,5,5320,33,55
2,2,5,5233,33,19
3,3,5,5232,33,23
4,4,5,5268,33,20


In [128]:
test['prev_month_sale'] = test.apply(
    lambda x : shop_id_item_id_date_block_num_to_item_cnt_month.get((x.shop_id, x.item_id, x.date_block_num - 1), 0), 
    axis=1)

In [129]:
test_X = test[X_COLUMNS]

In [130]:
xgb_test_predictions = xgb.DMatrix(test_X, feature_names=X_COLUMNS)

In [131]:
test_predictions = model_2.predict(xgb_test_predictions)

In [132]:
test_predictions

array([1.486395 , 1.1248791, 1.6999326, ..., 1.1248791, 1.1248791,
       1.1248791], dtype=float32)

In [133]:
test['item_cnt_month'] = np.clip(test_predictions, 0, 20)

In [134]:
test[['ID', 'item_cnt_month']].to_csv('submissions/submission_gradient_boosting_using_xgboost_new_variables.csv', 
                                      index=False)