### Summary

Build a model using xgboost and verify it's performance on the validation set.


In [2]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.metrics import mean_squared_error

##### Get relevant data in monthly form

In [3]:
#INPUT_DIR = '../input/'
INPUT_DIR = '../input/competitive-data-science-predict-future-sales/'

In [4]:
sales_train = pd.read_csv(INPUT_DIR + 'sales_train.csv')

In [5]:
sales_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


##### 1. Aggregate features to get monthly data.

In [6]:
monthly_sales_data = sales_train[['date_block_num', 'shop_id', 'item_id', 'item_cnt_day']].groupby(
    ['date_block_num', 'shop_id', 'item_id']).sum()
monthly_sales_data.rename(columns={'item_cnt_day':'item_cnt_month'}, inplace=True)
monthly_sales_data.reset_index(['date_block_num', 'shop_id', 'item_id'], inplace=True)

In [7]:
monthly_sales_data['shop_id_item_id'] = list(zip(monthly_sales_data.shop_id,monthly_sales_data.item_id))

In [8]:
monthly_sales_data['shop_id_item_id_date_block_num'] = list(zip(monthly_sales_data.shop_id, 
                                                                monthly_sales_data.item_id,
                                                                monthly_sales_data.date_block_num))

In [9]:
shop_id_item_id_date_block_num_to_item_cnt_month = dict(zip(monthly_sales_data.shop_id_item_id_date_block_num,
                                                            monthly_sales_data.item_cnt_month))

In [10]:
monthly_sales_data['prev_month_sale'] = monthly_sales_data.apply(
    lambda x : shop_id_item_id_date_block_num_to_item_cnt_month.get((x.shop_id, x.item_id, x.date_block_num - 1), 0), 
    axis=1)

##### Add another column for item_category_id

In [11]:
items = pd.read_csv(INPUT_DIR + 'items.csv')

In [12]:
items.columns

Index(['item_name', 'item_id', 'item_category_id'], dtype='object')

In [13]:
item_id_to_item_category_id = dict(zip(items.item_id, items.item_category_id))

In [14]:
monthly_sales_data['item_category_id'] = monthly_sales_data['item_id'].apply(lambda x : item_id_to_item_category_id.get(x, -1))

In [15]:
monthly_sales_data.columns

Index(['date_block_num', 'shop_id', 'item_id', 'item_cnt_month',
       'shop_id_item_id', 'shop_id_item_id_date_block_num', 'prev_month_sale',
       'item_category_id'],
      dtype='object')

In [16]:
monthly_sales_data['item_cnt_month'].clip(lower=0, upper=20, inplace=True)

In [17]:
monthly_sales_data.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,shop_id_item_id,shop_id_item_id_date_block_num,prev_month_sale,item_category_id
0,0,0,32,6.0,"(0, 32)","(0, 32, 0)",0.0,40
1,0,0,33,3.0,"(0, 33)","(0, 33, 0)",0.0,37
2,0,0,35,1.0,"(0, 35)","(0, 35, 0)",0.0,40
3,0,0,43,1.0,"(0, 43)","(0, 43, 0)",0.0,40
4,0,0,51,2.0,"(0, 51)","(0, 51, 0)",0.0,57


##### Let us try to find the best model using cross validation.

In [18]:
def get_most_recent_item_cnt(shop_id, item_id, date_block_num):
    val = 0
    for i in range(date_block_num, -1, -1):
        if (shop_id, item_id, i) in shop_id_item_id_date_block_num_to_item_cnt_month.keys():
            val = shop_id_item_id_date_block_num_to_item_cnt_month.get((shop_id, item_id, i))
            break
            
    return val
            

In [19]:
monthly_sales_data['most_recent_item_cnt'] = monthly_sales_data.apply(
    lambda x : get_most_recent_item_cnt(x.shop_id, x.item_id, x.date_block_num - 1), axis=1)

In [20]:
monthly_sales_data['prev_item_cnt'] = monthly_sales_data.apply(
    lambda x : shop_id_item_id_date_block_num_to_item_cnt_month.get(
        (x.shop_id, x.item_id, x.date_block_num - 1), 0), axis=1)

In [21]:
np.sqrt(mean_squared_error(np.clip(monthly_sales_data.prev_item_cnt, 0 , 20),  
                           np.clip(monthly_sales_data.item_cnt_month, 0, 20)))

2.705738462944679

In [22]:
np.sqrt(mean_squared_error(np.clip(monthly_sales_data.most_recent_item_cnt, 0 , 20),  
                           np.clip(monthly_sales_data.item_cnt_month, 0, 20)))

2.658860835541009

#### What about those cases, when we do not have any entries in the training data.

We have couple of options here. We can take an average of the entire shop for the previous month or the average of a similar item in that same shop for the previous month (NOTE: how do you measure similarity )?
 

###### Let us start adding item_category_id to the daily training data, so that we could use that as a proxy if the corresponding item is not present.

In [26]:
sales_train['item_category_id'] = sales_train['item_id'].apply(lambda x : item_id_to_item_category_id.get(x))

In [27]:
monthly_sales_item_category = sales_train[
    ['date_block_num', 'shop_id', 'item_category_id', 'item_cnt_day']].groupby(
    ['date_block_num', 'shop_id', 'item_category_id']).sum()

monthly_sales_item_category.reset_index(['date_block_num', 'shop_id', 'item_category_id'], inplace=True)
monthly_sales_item_category.rename(columns={'item_cnt_day' : 'item_cnt_month'}, inplace=True)

In [28]:
monthly_sales_item_category.head()

Unnamed: 0,date_block_num,shop_id,item_category_id,item_cnt_month
0,0,0,2,53.0
1,0,0,3,28.0
2,0,0,4,16.0
3,0,0,5,28.0
4,0,0,6,65.0


###### So the idea is this . Let us compute the total number of items sold for the item category for a month and the total number of item_id's corresponding to that category. Dividing the two numbers would give us the average number of items sold  for a particular item_id corresponding that category.

In [29]:
monthly_sales_item_category['shop_id_item_category_id_date_block_num'] = list(
    zip(monthly_sales_item_category.shop_id,
        monthly_sales_item_category.item_category_id,
        monthly_sales_item_category.date_block_num))

In [30]:
shop_id_item_category_id_date_block_num_to_tot_cnt_per_month = dict(zip(
    monthly_sales_item_category.shop_id_item_category_id_date_block_num,
    monthly_sales_item_category.item_cnt_month))

In [31]:
monthly_num_items_per_cat = sales_train[['date_block_num', 'shop_id', 'item_category_id', 'item_id']].groupby(
    ['date_block_num', 'shop_id', 'item_category_id']).nunique()

In [32]:
monthly_num_items_per_cat.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,date_block_num,shop_id,item_category_id,item_id
date_block_num,shop_id,item_category_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,2,1,1,1,16
0,0,3,1,1,1,1
0,0,4,1,1,1,5
0,0,5,1,1,1,11
0,0,6,1,1,1,10


In [33]:
monthly_num_items_per_cat.columns

Index(['date_block_num', 'shop_id', 'item_category_id', 'item_id'], dtype='object')

In [34]:
monthly_num_items_per_cat.drop(columns=['date_block_num', 'shop_id', 'item_category_id'], inplace=True)

In [35]:
monthly_num_items_per_cat.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,item_id
date_block_num,shop_id,item_category_id,Unnamed: 3_level_1
0,0,2,16
0,0,3,1
0,0,4,5
0,0,5,11
0,0,6,10


In [36]:
monthly_num_items_per_cat.reset_index(['date_block_num', 'shop_id', 'item_category_id'], inplace=True)

In [37]:
monthly_num_items_per_cat['shop_id_item_category_id_date_block_num'] = list(
    zip(monthly_num_items_per_cat.shop_id,
        monthly_num_items_per_cat.item_category_id,
        monthly_num_items_per_cat.date_block_num,))

In [38]:
shop_id_item_category_id_date_block_num_to_num_items = dict(zip(
    monthly_num_items_per_cat.shop_id_item_category_id_date_block_num,
    monthly_num_items_per_cat.item_id))

In [39]:
shop_id_item_category_id_date_block_num_to_tot_cnt_per_month.get((25, 42, 32), 0)/\
shop_id_item_category_id_date_block_num_to_num_items.get((25, 42, 32), 1)

13.875

###### Now, let design several predictors and see how well they perform on the validation set.

In [40]:
def get_prev_count_or_category_count(shop_id, item_id, item_category_id, date_block_num):
    if (shop_id, item_id, date_block_num) in shop_id_item_id_date_block_num_to_item_cnt_month.keys():
        val = shop_id_item_id_date_block_num_to_item_cnt_month.get((shop_id, item_id, date_block_num))
    else:
        val = shop_id_item_category_id_date_block_num_to_tot_cnt_per_month.get((shop_id, 
                                                                                item_category_id, 
                                                                                date_block_num), 0)/\
        shop_id_item_category_id_date_block_num_to_num_items.get((shop_id, item_category_id, date_block_num), 1)
    return val

In [41]:
def get_most_recent_item_cnt_or_category_count(shop_id, item_id, item_category_id, date_block_num):
    val = 0
    for i in range(date_block_num, -1, -1):
        if (shop_id, item_id, i) in shop_id_item_id_date_block_num_to_item_cnt_month.keys():
            val = shop_id_item_id_date_block_num_to_item_cnt_month.get((shop_id, item_id, i))
            break
    
    val = shop_id_item_category_id_date_block_num_to_tot_cnt_per_month.get((shop_id, 
                                                                            item_category_id, 
                                                                            date_block_num), 0)/\
    shop_id_item_category_id_date_block_num_to_num_items.get((shop_id, item_category_id, date_block_num), 1)
    
    return val
            

In [42]:
monthly_sales_data['most_recent_item_cnt_or_category_count'] = monthly_sales_data.apply(
    lambda x : get_most_recent_item_cnt_or_category_count(
        x.shop_id, x.item_id, x.item_category_id, x.date_block_num - 1), axis=1)

In [43]:
monthly_sales_data['prev_count_or_category_count'] = monthly_sales_data.apply(
    lambda x : get_prev_count_or_category_count(
        x.shop_id, x.item_id, x.item_category_id, x.date_block_num - 1), axis=1)

In [44]:
np.sqrt(mean_squared_error(np.clip(monthly_sales_data.most_recent_item_cnt_or_category_count, 0 , 20),  
                           np.clip(monthly_sales_data.item_cnt_month, 0, 20)))

2.4737989864358956

In [45]:
np.sqrt(mean_squared_error(np.clip(monthly_sales_data.prev_count_or_category_count, 0 , 20),  
                           np.clip(monthly_sales_data.item_cnt_month, 0, 20)))

2.4727568520088608

##### Let us try one more refinement, where we to iteratively find the average monthly count for the same item category for the most recent month.

In [46]:
def get_most_recent_item_cnt_or_most_recent_category_count(shop_id, item_id, item_category_id, date_block_num):
    val = 0
    for i in range(date_block_num, -1, -1):
        if (shop_id, item_id, i) in shop_id_item_id_date_block_num_to_item_cnt_month.keys():
            val = shop_id_item_id_date_block_num_to_item_cnt_month.get((shop_id, item_id, i))
            break
    
    if val != 0:
        for i in range(date_block_num, -1, -1):

            if (shop_id, item_category_id, i) in shop_id_item_category_id_date_block_num_to_tot_cnt_per_month.keys():
                val = shop_id_item_category_id_date_block_num_to_tot_cnt_per_month.get((shop_id, 
                                                                                        item_category_id, 
                                                                                        i))/\
                shop_id_item_category_id_date_block_num_to_num_items.get((shop_id, item_category_id, i))
                break
    
    return val
            

In [47]:
def get_prev_month_category_count(shop_id, item_id, item_category_id, date_block_num):
    val = shop_id_item_category_id_date_block_num_to_tot_cnt_per_month.get((shop_id, 
                                                                            item_category_id, 
                                                                            date_block_num), 0)/\
    shop_id_item_category_id_date_block_num_to_num_items.get((shop_id, item_category_id, date_block_num), 1)
    
    return val
            

In [48]:
monthly_sales_data['most_recent_item_cnt_or_most_recent_category_count'] = monthly_sales_data.apply(
    lambda x : get_most_recent_item_cnt_or_most_recent_category_count(
        x.shop_id, x.item_id, x.item_category_id, x.date_block_num - 1), axis=1) 

In [49]:
monthly_sales_data['prev_month_category_count'] = monthly_sales_data.apply(
    lambda x : get_prev_month_category_count(
        x.shop_id, x.item_id, x.item_category_id, x.date_block_num - 1), axis=1) 

In [50]:
np.sqrt(mean_squared_error(np.clip(monthly_sales_data.most_recent_item_cnt_or_most_recent_category_count, 0 , 20),  
                           np.clip(monthly_sales_data.item_cnt_month, 0, 20)))

2.6751849254483018

#### Formalize what we did above to an xgboost algorithm.

In [56]:
X_COLUMNS = ['date_block_num', 'shop_id', 'item_id', 'item_category_id', 'prev_month_sale', 
             'prev_month_category_count']
#X_COLUMNS = ['date_block_num', 'shop_id', 'item_id', 'item_category_id', 'prev_count_or_category_count']

Y_COLUMN = 'item_cnt_month'

In [53]:
train_data = monthly_sales_data[(monthly_sales_data.date_block_num != 32) &(monthly_sales_data.date_block_num != 33)]
validation_data = monthly_sales_data[monthly_sales_data.date_block_num == 32]
test_data = monthly_sales_data[monthly_sales_data.date_block_num == 33]

In [54]:
train_data.date_block_num.describe()

count    1.577593e+06
mean     1.429833e+01
std      9.274825e+00
min      0.000000e+00
25%      6.000000e+00
50%      1.400000e+01
75%      2.200000e+01
max      3.200000e+01
Name: date_block_num, dtype: float64

In [55]:
X = train_data[X_COLUMNS]
Y = train_data[[Y_COLUMN]]

NameError: name 'X_COLUMNS' is not defined

In [57]:
new_X = validation_data[X_COLUMNS]
new_Y = validation_data[[Y_COLUMN]]

In [58]:
def build_xgboost_model(X, Y, new_X, new_Y, X_COLUMNS):
    xgb_train_data = xgb.DMatrix(X, Y, feature_names=X_COLUMNS)
    xgb_validation_data = xgb.DMatrix(new_X, new_Y, feature_names=X_COLUMNS)
    xgb_params = {'eta' : 0.1, 'eval_metric' : 'rmse'}
    model_obj = xgb.train(params=xgb_params,
                      dtrain=xgb_train_data,
                      num_boost_round=4000)
    validation_predictions = model_obj.predict(xgb_validation_data)
    print(np.sqrt(mean_squared_error(np.clip(validation_predictions, 0, 20), np.clip(new_Y, 0, 20))))

In [None]:
build_xgboost_model(X, Y, new_X, new_Y, X_COLUMNS)

In [59]:
X_COLUMNS = ['date_block_num', 'shop_id', 'item_id', 'item_category_id', 'prev_count_or_category_count']
X = train_data[X_COLUMNS]

In [None]:
new_X = validation_data[X_COLUMNS]

In [None]:
build_xgboost_model(X, Y, new_X, new_Y, X_COLUMNS)