### Summary

Build a model using xgboost and verify it's performance on the validation set.


In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.metrics import mean_squared_error

##### Get relevant data in monthly form

In [2]:
INPUT_DIR = '../input/'
#INPUT_DIR = '../input/competitive-data-science-predict-future-sales/'

In [3]:
sales_train = pd.read_csv(INPUT_DIR + 'sales_train.csv')

In [5]:
sales_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
2935844,10.10.2015,33,25,7409,299.0,1.0
2935845,09.10.2015,33,25,7460,299.0,1.0
2935846,14.10.2015,33,25,7459,349.0,1.0
2935847,22.10.2015,33,25,7440,299.0,1.0
2935848,03.10.2015,33,25,7460,299.0,1.0


In [5]:
str_val = '02.01.2013'
int(str_val.split('.')[-1])

2013

In [6]:
sales_train['day'] = sales_train['date'].apply(lambda x : int(x.split('.')[0]))
sales_train['month'] = sales_train['date'].apply(lambda x : int(x.split('.')[1]))
sales_train['year'] = sales_train['date'].apply(lambda x : int(x.split('.')[-1]))


In [7]:
sales_train['date'].max()

'31.12.2014'

### Evaluate days since last transaction for every shop_id, item_id pair.

We do the same by first sorting the data, and then keeping a pointer to the previous trade date while iterating the same. I would have loved to prevent the iteration part, but simply could not do it !

In [8]:
sales_train_sorted = sales_train.sort_values(by=['year', 'month', 'day'])

In [9]:
sales_train_sorted['shop_id_item_id'] = list(zip(sales_train_sorted.shop_id,sales_train_sorted.item_id))

In [10]:
sales_train_sorted['shop_id_item_id_date'] = list(zip(sales_train_sorted.shop_id,
                                                      sales_train_sorted.item_id,
                                                      sales_train_sorted.date))

In [11]:
sales_train_sorted.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,day,month,year,shop_id_item_id,shop_id_item_id_date
7554,01.01.2013,0,19,18976,399.0,1.0,1,1,2013,"(19, 18976)","(19, 18976, 01.01.2013)"
7644,01.01.2013,0,19,18284,199.0,1.0,1,1,2013,"(19, 18284)","(19, 18284, 01.01.2013)"
7646,01.01.2013,0,19,18320,199.0,1.0,1,1,2013,"(19, 18320)","(19, 18320, 01.01.2013)"
7647,01.01.2013,0,19,18329,299.0,1.0,1,1,2013,"(19, 18329)","(19, 18329, 01.01.2013)"
7694,01.01.2013,0,19,19367,399.0,1.0,1,1,2013,"(19, 19367)","(19, 19367, 01.01.2013)"


##### The core loop, where we populate the relevant map storing the previous trade date

In [12]:
shop_id_item_id_to_cur_date = dict()
shop_id_item_id_cur_date_to_prev_date = dict()

In [13]:
for index, row in sales_train_sorted.iterrows():
    shop_id_item_id = row.shop_id_item_id
    if shop_id_item_id in shop_id_item_id_to_cur_date.keys():
        prev_date = shop_id_item_id_to_cur_date.get(shop_id_item_id)
        shop_id_item_id_date = row.shop_id_item_id_date
        shop_id_item_id_cur_date_to_prev_date[shop_id_item_id_date] = prev_date
    else:
        shop_id_item_id_to_cur_date[shop_id_item_id] = row.date



In [14]:
sales_train_sorted[sales_train_sorted['shop_id_item_id'] == (2, 2920)].head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,day,month,year,shop_id_item_id,shop_id_item_id_date
32459,01.01.2013,0,2,2920,599.0,2.0,1,1,2013,"(2, 2920)","(2, 2920, 01.01.2013)"
32460,02.01.2013,0,2,2920,598.5,-1.0,2,1,2013,"(2, 2920)","(2, 2920, 02.01.2013)"
32461,05.01.2013,0,2,2920,598.5,1.0,5,1,2013,"(2, 2920)","(2, 2920, 05.01.2013)"
32462,06.01.2013,0,2,2920,599.0,2.0,6,1,2013,"(2, 2920)","(2, 2920, 06.01.2013)"
32463,08.01.2013,0,2,2920,599.0,1.0,8,1,2013,"(2, 2920)","(2, 2920, 08.01.2013)"


In [15]:
sales_train_sorted['prev_date'] = sales_train_sorted['shop_id_item_id_date'].apply(
    lambda x : shop_id_item_id_cur_date_to_prev_date.get(x, '1.1.1'))

##### Get the difference in the number of days using python datetime module.

In [16]:
from datetime import datetime

def get_days_diff(cur_date, prev_date):
    cur_date_split = cur_date.split('.')
    cur_date_obj = datetime(int(cur_date_split[2]), 
                            int(cur_date_split[1]), 
                            int(cur_date_split[0])) # provide UTC time

    prev_date_split = prev_date.split('.')
    prev_date_obj = datetime(int(prev_date_split[2]), 
                             int(prev_date_split[1]), 
                             int(prev_date_split[0])) # provide UTC time
    
    age = cur_date_obj - prev_date_obj
    return(age.days)


In [17]:
get_days_diff('27.2.2020', '1.1.1')

737481

In [18]:
sales_train_sorted['days_since_last_trade'] = sales_train_sorted.apply(lambda x : 
                                                                       get_days_diff(x.date, x.prev_date), axis=1)

In [19]:
monthly_sales_sorted = sales_train_sorted[['date_block_num', 'shop_id', 'item_id', 'days_since_last_trade']].groupby(
    by=['date_block_num', 'shop_id', 'item_id']).min()
monthly_sales_sorted.rename(columns={'days_since_last_trade' : 'days_since_last_trade_min_month'}, inplace=True)
monthly_sales_sorted.reset_index(['date_block_num', 'shop_id', 'item_id'], inplace=True)

In [20]:
monthly_sales_sorted.head()

Unnamed: 0,date_block_num,shop_id,item_id,days_since_last_trade_min_month
0,0,0,32,18
1,0,0,33,14
2,0,0,35,734898
3,0,0,43,734898
4,0,0,51,18


In [21]:
monthly_sales_sorted['shop_id_item_id_date_block_num'] = list(zip(monthly_sales_sorted.shop_id,
                                                                  monthly_sales_sorted.item_id,
                                                                  monthly_sales_sorted.date_block_num))

##### Checking the cases where there was no previous trade.

By giving a default value of the previous trade date as 1.1.1, these cases would result in an exceptionally large value of days_since_last_trade_min_month variable. To make things sensible, we cap that value at 5000.

In [22]:
monthly_sales_sorted[monthly_sales_sorted['days_since_last_trade_min_month'] < 70000]['days_since_last_trade_min_month'].describe()

count    1.299738e+06
mean     2.378135e+02
std      2.222109e+02
min      0.000000e+00
25%      6.000000e+01
50%      1.710000e+02
75%      3.540000e+02
max      1.032000e+03
Name: days_since_last_trade_min_month, dtype: float64

In [23]:
monthly_sales_sorted.loc[(monthly_sales_sorted['days_since_last_trade_min_month'] > 70000), 
                         'days_since_last_trade_min_month'] = 5000

In [24]:
monthly_sales_sorted.head()

Unnamed: 0,date_block_num,shop_id,item_id,days_since_last_trade_min_month,shop_id_item_id_date_block_num
0,0,0,32,18,"(0, 32, 0)"
1,0,0,33,14,"(0, 33, 0)"
2,0,0,35,5000,"(0, 35, 0)"
3,0,0,43,5000,"(0, 43, 0)"
4,0,0,51,18,"(0, 51, 0)"


In [25]:
shop_id_item_id_date_block_num_to_days_since_last_trade_min = dict(
    zip(monthly_sales_sorted.shop_id_item_id_date_block_num,
        monthly_sales_sorted.days_since_last_trade_min_month))

#### Testing out multiple features.

Here, we test out multiple aggregate features in the following manner :

1. Aggregate the daily dataframe into a monthly one, using the appropriate aggregation method.

2. Concatenate shop_id, item_id,date_block_num to a tuple, so as to build an easy key for a record.

3. Build a hash map from shop_id, item_id, date_block_num to the aggregated feature.

4. Add another column to the aggregated data frame, which shows the aggregated value for the previous month (one can obtain this value from the hash map populated in step 2).

5. Do the same transformation on test data as well.

##### 1. Aggregate features to get monthly data.

In [26]:
monthly_sales_data = sales_train[['date_block_num', 'shop_id', 'item_id', 'item_cnt_day']].groupby(
    ['date_block_num', 'shop_id', 'item_id']).sum()
monthly_sales_data.rename(columns={'item_cnt_day':'item_cnt_month'}, inplace=True)
monthly_sales_data.reset_index(['date_block_num', 'shop_id', 'item_id'], inplace=True)

In [27]:
monthly_sales_data_monthly_max = sales_train[['date_block_num', 'shop_id', 'item_id', 'item_cnt_day']].groupby(
    ['date_block_num', 'shop_id', 'item_id']).max()
monthly_sales_data_monthly_max.rename(columns={'item_cnt_day':'item_cnt_day_max_per_month'}, inplace=True)
monthly_sales_data_monthly_max.reset_index(['date_block_num', 'shop_id', 'item_id'], inplace=True)

In [28]:
monthly_sales_data_monthly_min = sales_train[['date_block_num', 'shop_id', 'item_id', 'item_cnt_day']].groupby(
    ['date_block_num', 'shop_id', 'item_id']).min()
monthly_sales_data_monthly_min.rename(columns={'item_cnt_day':'item_cnt_day_min_per_month'}, inplace=True)
monthly_sales_data_monthly_min.reset_index(['date_block_num', 'shop_id', 'item_id'], inplace=True)

In [29]:
monthly_sales_data_monthly_median = sales_train[['date_block_num', 'shop_id', 'item_id', 'item_cnt_day']].groupby(
    ['date_block_num', 'shop_id', 'item_id']).median()
monthly_sales_data_monthly_median.rename(columns={'item_cnt_day':'item_cnt_day_median_per_month'}, inplace=True)
monthly_sales_data_monthly_median.reset_index(['date_block_num', 'shop_id', 'item_id'], inplace=True)

In [30]:
monthly_mean_sale_price = sales_train[['date_block_num', 'shop_id', 'item_id', 'item_price']].groupby(
    ['date_block_num', 'shop_id', 'item_id']).mean()
monthly_mean_sale_price.rename(columns={'item_price': 'item_price_monthly_mean'}, inplace=True)
monthly_mean_sale_price.reset_index(['date_block_num', 'shop_id', 'item_id'], inplace=True)

##### 2. Concatenate shop_id, item_id,date_block_num to a tuple, so as to build an easy key for a record.

In [31]:
monthly_sales_data['shop_id_item_id_date_block_num'] = list(zip(monthly_sales_data.shop_id, 
                                                                monthly_sales_data.item_id,
                                                                monthly_sales_data.date_block_num))

In [32]:
monthly_sales_data_monthly_max['shop_id_item_id_date_block_num'] = list(zip(
    monthly_sales_data_monthly_max.shop_id, 
    monthly_sales_data_monthly_max.item_id,
    monthly_sales_data_monthly_max.date_block_num))

In [33]:
monthly_sales_data_monthly_min['shop_id_item_id_date_block_num'] = list(zip(
    monthly_sales_data_monthly_min.shop_id, 
    monthly_sales_data_monthly_min.item_id,
    monthly_sales_data_monthly_min.date_block_num))

In [34]:
monthly_sales_data_monthly_median['shop_id_item_id_date_block_num'] = list(zip(
    monthly_sales_data_monthly_median.shop_id, 
    monthly_sales_data_monthly_median.item_id,
    monthly_sales_data_monthly_median.date_block_num))

In [35]:
monthly_mean_sale_price['shop_id_item_id_date_block_num'] = list(zip(
    monthly_mean_sale_price.shop_id,
    monthly_mean_sale_price.item_id,
    monthly_mean_sale_price.date_block_num))

###### 3. Build a hash map from shop_id, item_id, date_block_num to the aggregated feature.

In [36]:
shop_id_item_id_date_block_num_to_item_cnt_month = dict(zip(monthly_sales_data.shop_id_item_id_date_block_num,
                                                            monthly_sales_data.item_cnt_month))

In [37]:
shop_id_item_id_date_block_num_to_item_cnt_monthly_max = dict(
    zip(monthly_sales_data_monthly_max.shop_id_item_id_date_block_num, 
        monthly_sales_data_monthly_max.item_cnt_day_max_per_month))

In [38]:
shop_id_item_id_date_block_num_to_item_cnt_monthly_min = dict(
    zip(monthly_sales_data_monthly_min.shop_id_item_id_date_block_num, 
        monthly_sales_data_monthly_min.item_cnt_day_min_per_month))

In [39]:
shop_id_item_id_date_block_num_to_item_cnt_monthly_median = dict(
    zip(monthly_sales_data_monthly_median.shop_id_item_id_date_block_num, 
        monthly_sales_data_monthly_median.item_cnt_day_median_per_month))

In [40]:
shop_id_item_id_date_block_num_to_item_price_monthly_mean = dict(
    zip(monthly_mean_sale_price.shop_id_item_id_date_block_num, 
        monthly_mean_sale_price.item_price_monthly_mean))

In [41]:
monthly_sales_data.columns

Index(['date_block_num', 'shop_id', 'item_id', 'item_cnt_month',
       'shop_id_item_id_date_block_num'],
      dtype='object')

In [42]:
monthly_mean_sale_price.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_price_monthly_mean,shop_id_item_id_date_block_num
0,0,0,32,221.0,"(0, 32, 0)"
1,0,0,33,347.0,"(0, 33, 0)"
2,0,0,35,247.0,"(0, 35, 0)"
3,0,0,43,221.0,"(0, 43, 0)"
4,0,0,51,128.5,"(0, 51, 0)"


###### 4. Add another column to the aggregated data frame, which shows the aggregated value for the previous month (one can obtain this value from the hash map populated in step 2).


In [43]:
monthly_sales_data['prev_month_sale'] = monthly_sales_data.apply(
    lambda x : shop_id_item_id_date_block_num_to_item_cnt_month.get((x.shop_id, x.item_id, x.date_block_num - 1), 0), 
    axis=1)

In [44]:
monthly_sales_data['prev_month_price_mean'] = monthly_sales_data.apply(
    lambda x : shop_id_item_id_date_block_num_to_item_price_monthly_mean.get(
        (x.shop_id, x.item_id, x.date_block_num - 1), 0), 
    axis=1)

In [45]:
monthly_sales_data['prev_month_item_cnt_max'] = monthly_sales_data.apply(
    lambda x : shop_id_item_id_date_block_num_to_item_cnt_monthly_max.get(
        (x.shop_id, x.item_id, x.date_block_num - 1), 0), 
    axis=1)

In [46]:
monthly_sales_data['prev_month_item_cnt_min'] = monthly_sales_data.apply(
    lambda x : shop_id_item_id_date_block_num_to_item_cnt_monthly_min.get(
        (x.shop_id, x.item_id, x.date_block_num - 1), 0), 
    axis=1)

In [47]:
monthly_sales_data['prev_month_item_cnt_median'] = monthly_sales_data.apply(
    lambda x : shop_id_item_id_date_block_num_to_item_cnt_monthly_median.get(
        (x.shop_id, x.item_id, x.date_block_num - 1), 0), 
    axis=1)

In [48]:
monthly_sales_data['prev_month_days_since_last_trade_min'] = monthly_sales_data.apply(
    lambda x : shop_id_item_id_date_block_num_to_days_since_last_trade_min.get(
    (x.shop_id, x.item_id, x.date_block_num - 1), 0), axis=1)

In [49]:
monthly_sales_data.columns

Index(['date_block_num', 'shop_id', 'item_id', 'item_cnt_month',
       'shop_id_item_id_date_block_num', 'prev_month_sale',
       'prev_month_price_mean', 'prev_month_item_cnt_max',
       'prev_month_item_cnt_min', 'prev_month_item_cnt_median',
       'prev_month_days_since_last_trade_min'],
      dtype='object')

##### Add another column for item_category_id

In [50]:
items = pd.read_csv(INPUT_DIR + 'items.csv')

In [51]:
items.columns

Index(['item_name', 'item_id', 'item_category_id'], dtype='object')

In [52]:
item_id_to_item_category_id = dict(zip(items.item_id, items.item_category_id))

In [53]:
monthly_sales_data['item_category_id'] = monthly_sales_data['item_id'].apply(lambda x : item_id_to_item_category_id.get(x, -1))

In [54]:
monthly_sales_data.columns

Index(['date_block_num', 'shop_id', 'item_id', 'item_cnt_month',
       'shop_id_item_id_date_block_num', 'prev_month_sale',
       'prev_month_price_mean', 'prev_month_item_cnt_max',
       'prev_month_item_cnt_min', 'prev_month_item_cnt_median',
       'prev_month_days_since_last_trade_min', 'item_category_id'],
      dtype='object')

In [55]:
monthly_sales_data['item_cnt_month'].clip(lower=0, upper=20, inplace=True)

In [56]:
train_data = monthly_sales_data[(monthly_sales_data.date_block_num != 32) &(monthly_sales_data.date_block_num != 33)]
validation_data = monthly_sales_data[monthly_sales_data.date_block_num == 32]
test_data = monthly_sales_data[monthly_sales_data.date_block_num == 33]

##### Let us try to find the best model using cross validation.

In [57]:
#X_COLUMNS = ['date_block_num', 'shop_id', 'item_id', 'item_category_id', 'prev_month_sale', 
#             'prev_month_item_cnt_min', 'prev_month_item_cnt_max', 'prev_month_item_cnt_median']
X_COLUMNS = ['date_block_num', 'shop_id', 'item_id', 'item_category_id', 'prev_month_sale']

Y_COLUMN = 'item_cnt_month'

In [58]:
Y = train_data[[Y_COLUMN]]
X = train_data[X_COLUMNS]

In [59]:
new_Y = validation_data[[Y_COLUMN]]
new_X = validation_data[X_COLUMNS]

In [60]:
xgb_train_data = xgb.DMatrix(X, Y, feature_names=X_COLUMNS)

In [61]:
xgb_validation_data = xgb.DMatrix(new_X, new_Y, feature_names=X_COLUMNS)

In [62]:
xgb_params = {'eta' : 0.1, 'eval_metric' : 'rmse'}

In [63]:
#model_1 = xgb.cv(params=xgb_params,
#                 dtrain=xgb_train_data,
#                 num_boost_round=10,
#                 nfold=5,
#                 callbacks=[xgb.callback.print_evaluation(show_stdv=False)])

In [64]:
#model_1

In [65]:
# We need to run this, but be aware of the fact that running for 4000 rounds will take some time on a normal CPU.
model_2 = xgb.train(params=xgb_params,
                    dtrain=xgb_train_data,
                    num_boost_round=4000)

In [66]:
model_2

<xgboost.core.Booster at 0x7fba0141dcc0>

In [67]:
validation_predictions = model_2.predict(xgb_validation_data)

In [68]:
np.sqrt(mean_squared_error(np.clip(validation_predictions, 0 , 20),  np.clip(new_Y, 0, 20)))

2.0169792761545295

##### That looks like a reasonably good jump !!! I am not sure what else we can do with xgboost. Let us generate predictions on the test set now. 


##### NOTE: Since we have split the input data into training, validation and test sets, we should ideally evaluate the test error from the test set and then train the relevant model over the entire input data set before evaluating predictions on the kaggle test set. In this case, we are directly evaluating predicitons on the kaggle test set. Though this is not complete, I am still leaving the code as it is , as it will be easier for us to reuse the same, once we finalize our model.


In [69]:
test = pd.read_csv(INPUT_DIR + 'test.csv')

In [70]:
test.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [71]:
test['date_block_num'] = 34

In [72]:
test['item_category_id'] = test['item_id'].apply(lambda x : item_id_to_item_category_id.get(x, -1))

In [73]:
test['item_category_id'].describe()

count    214200.000000
mean         46.309608
std          16.716581
min           0.000000
25%          37.000000
50%          43.000000
75%          58.000000
max          83.000000
Name: item_category_id, dtype: float64

In [74]:
test.head()

Unnamed: 0,ID,shop_id,item_id,date_block_num,item_category_id
0,0,5,5037,33,19
1,1,5,5320,33,55
2,2,5,5233,33,19
3,3,5,5232,33,23
4,4,5,5268,33,20


In [75]:
test['prev_month_sale'] = test.apply(
    lambda x : shop_id_item_id_date_block_num_to_item_cnt_month.get((x.shop_id, x.item_id, x.date_block_num - 1), 0), 
    axis=1)

In [76]:
test['prev_month_price_mean'] = test.apply(
    lambda x : shop_id_item_id_date_block_num_to_item_price_monthly_mean.get((x.shop_id, x.item_id, x.date_block_num - 1), 0), 
    axis=1)

In [77]:
test['prev_month_item_cnt_max'] = test.apply(
    lambda x : shop_id_item_id_date_block_num_to_item_cnt_monthly_max.get(
        (x.shop_id, x.item_id, x.date_block_num - 1), 0), 
    axis=1)

In [78]:
test['prev_month_item_cnt_min'] = test.apply(
    lambda x : shop_id_item_id_date_block_num_to_item_cnt_monthly_min.get(
        (x.shop_id, x.item_id, x.date_block_num - 1), 0), 
    axis=1)

In [79]:
test['prev_month_item_cnt_median'] = test.apply(
    lambda x : shop_id_item_id_date_block_num_to_item_cnt_monthly_median.get(
        (x.shop_id, x.item_id, x.date_block_num - 1), 0), 
    axis=1)

In [80]:
test['prev_month_days_since_last_trade_min'] = test.apply(
    lambda x : shop_id_item_id_date_block_num_to_days_since_last_trade_min.get(
    (x.shop_id, x.item_id, x.date_block_num - 1), 0), axis=1)

In [81]:
test_X = test[X_COLUMNS] 

In [82]:
xgb_test_predictions = xgb.DMatrix(test_X, feature_names=X_COLUMNS)

In [83]:
test_predictions = model_2.predict(xgb_test_predictions)

In [84]:
test_predictions

array([1.9714772 , 1.7481686 , 1.8508744 , ..., 0.9183872 , 0.95098054,
       0.9618436 ], dtype=float32)

In [85]:
test['item_cnt_month'] = np.clip(test_predictions, 0, 20)

In [86]:
test[['ID', 'item_cnt_month']].to_csv('submission_gradient_boosting_using_xgboost_new_variables.csv', 
                                      index=False)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
