### Summary

Build a model using an LSTM and verify its performance on the validation set.


In [4]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.metrics import mean_squared_error

##### Get relevant data in monthly form

In [5]:
# Comment out the relevant line depending on whether we are running in kaggle or locally.
#INPUT_DIR = '../input/'
INPUT_DIR = '../input/competitive-data-science-predict-future-sales/'

In [6]:
complete_train_data = pd.read_csv(INPUT_DIR + 'sales_train.csv')

In [7]:
# Roll the daily sales up to one row per (month, shop, item); the summed daily
# count is renamed to reflect its monthly granularity.
monthly_sales_data = (
    complete_train_data[['date_block_num', 'shop_id', 'item_id', 'item_cnt_day']]
    .groupby(['date_block_num', 'shop_id', 'item_id'])
    .sum()
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
    .reset_index()
)

In [8]:
monthly_sales_data['shop_id_item_id'] = list(zip(monthly_sales_data.shop_id, monthly_sales_data.item_id))

In [9]:
len(monthly_sales_data.shop_id_item_id.unique())

424124

In [10]:
len(monthly_sales_data[monthly_sales_data.date_block_num != 33])

1577593

In [11]:
monthly_sales_data[monthly_sales_data.date_block_num != 33].head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,shop_id_item_id
0,0,0,32,6.0,"(0, 32)"
1,0,0,33,3.0,"(0, 33)"
2,0,0,35,1.0,"(0, 35)"
3,0,0,43,1.0,"(0, 43)"
4,0,0,51,2.0,"(0, 51)"


In [12]:
complete_train_data.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [13]:
complete_train_data.date_block_num.describe()

count    2.935849e+06
mean     1.456991e+01
std      9.422988e+00
min      0.000000e+00
25%      7.000000e+00
50%      1.400000e+01
75%      2.300000e+01
max      3.300000e+01
Name: date_block_num, dtype: float64

##### Let us separate the data into training and validation sets to ensure that they do not overlap!

In [14]:
# Hold out the two most recent months: block 32 is the validation month and
# block 33 the test month; every other block is training data.
sales_train = complete_train_data.loc[~complete_train_data.date_block_num.isin([32, 33])]
sales_validation = complete_train_data.loc[complete_train_data.date_block_num == 32]
sales_test = complete_train_data.loc[complete_train_data.date_block_num == 33]

In [15]:
len(sales_train)

2831747

In [16]:
# Distinct (shop_id, item_id) pairs that have at least one sale in the training months.
train_shop_id_item_ids = set(zip(sales_train.shop_id, sales_train.item_id))

In [17]:
# Distinct (shop_id, item_id) pairs that have at least one sale in the validation month.
validation_shop_id_item_ids = set(zip(sales_validation.shop_id, sales_validation.item_id))

##### Let us see how many entries are present in the validation set that are not present in the training set.

In [18]:
diff = validation_shop_id_item_ids.difference(train_shop_id_item_ids)

In [19]:
len(diff)

5164

In [20]:
len(validation_shop_id_item_ids)

29678

In [21]:
len(train_shop_id_item_ids)

411840

##### Let us create a pivot table as it is easier to train an LSTM model on the same.

In [22]:
sales_train.date_block_num.describe()

count    2.831747e+06
mean     1.391024e+01
std      8.931705e+00
min      0.000000e+00
25%      6.000000e+00
50%      1.300000e+01
75%      2.200000e+01
max      3.100000e+01
Name: date_block_num, dtype: float64

In [23]:
data_pivot = pd.pivot_table(sales_train, 
                            index=['shop_id', 'item_id'], 
                            values='item_cnt_day', 
                            columns=['date_block_num'], 
                            aggfunc='sum',
                            fill_value=0)

In [24]:
data_pivot.reset_index(inplace=True)

In [25]:
data_pivot.head()

date_block_num,shop_id,item_id,0,1,2,3,4,5,6,7,...,22,23,24,25,26,27,28,29,30,31
0,0,30,0,31,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,31,0,11,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,32,6,10,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,33,3,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,35,1,14,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
len(data_pivot)

411840

In [27]:
# remove the unnecessary columns.
data_pivot.drop(columns=['shop_id', 'item_id'], inplace=True)

In [28]:
data_pivot.head()

date_block_num,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0,31,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,10,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,14,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
np.shape(data_pivot)

(411840, 32)

##### Now, let us get to the meat of training. Let us make a training set and train LSTM model on the same.

In [30]:
X_train = np.expand_dims(data_pivot.values[:,:-1], axis=2)

In [31]:
X_train.shape

(411840, 31, 1)

In [32]:
Y_train = data_pivot.values[:,-1:]

In [33]:
Y_train.shape

(411840, 1)

In [34]:
from keras.models import Sequential
from keras.layers import LSTM, Dense,Dropout

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [37]:
def build_lstm_model_keras(X_train, Y_train, input_len=31, epochs=10, batch_size=4096):
    """Build, compile and fit a single-layer LSTM regression model.

    The network is LSTM(64) -> Dropout(0.4) -> Dense(1), compiled with the
    Adam optimizer and a mean-squared-error loss.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, input_len, 1)
        Input sequences with one feature per time step.
    Y_train : array-like
        Regression targets, one row per sample in ``X_train``.
    input_len : int, default 31
        Number of time steps in every input sequence.
    epochs : int, default 10
        Number of passes over the training data.
    batch_size : int, default 4096
        Number of samples per gradient update.

    Returns
    -------
    The fitted Keras ``Sequential`` model.

    Raises
    ------
    ValueError
        If ``X_train`` does not have shape ``(n_samples, input_len, 1)``.
    """
    # Fail early with a readable message rather than a shape error raised from
    # inside Keras after the model has already been built and compiled.
    actual_shape = tuple(np.shape(X_train))
    if len(actual_shape) != 3 or actual_shape[1:] != (input_len, 1):
        raise ValueError(
            'X_train must have shape (n_samples, %d, 1); got %s' % (input_len, actual_shape))

    my_model = Sequential()
    my_model.add(LSTM(units=64, input_shape=(input_len, 1)))
    my_model.add(Dropout(0.4))
    my_model.add(Dense(1))
    my_model.compile(loss='mse',
                     optimizer='adam',
                     metrics=['mean_squared_error'])
    my_model.summary()
    my_model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs)
    return my_model


In [38]:
my_model = build_lstm_model_keras(X_train, Y_train, 31)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 64)                16896     
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 16,961
Trainable params: 16,961
Non-trainable params: 0
_________________________________________________________________
Epoch 1/1


##### In this section, we try to handle those cases when there is no data for the corresponding shop_id, item_id combination in the validation set

In [39]:
items = pd.read_csv(INPUT_DIR + 'items.csv')

In [40]:
item_id_to_item_category_id = dict(zip(items.item_id, items.item_category_id))

In [41]:
# Attach each sale's item category with a dictionary-backed Series.map instead
# of calling a Python lambda once per row of the full training frame.
complete_train_data['item_category_id'] = complete_train_data['item_id'].map(item_id_to_item_category_id)

In [42]:
# Total units sold per (month, shop, item category), flattened back to
# ordinary columns with the summed count renamed to its monthly meaning.
monthly_sales_item_category = (
    complete_train_data[['date_block_num', 'shop_id', 'item_category_id', 'item_cnt_day']]
    .groupby(['date_block_num', 'shop_id', 'item_category_id'])
    .sum()
    .reset_index()
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
)

In [43]:
monthly_sales_item_category['shop_id_item_category_id_date_block_num'] = list(
    zip(monthly_sales_item_category.shop_id,
        monthly_sales_item_category.item_category_id,
        monthly_sales_item_category.date_block_num))

In [44]:
shop_id_item_category_id_date_block_num_to_tot_cnt_per_month = dict(zip(
    monthly_sales_item_category.shop_id_item_category_id_date_block_num,
    monthly_sales_item_category.item_cnt_month))

In [45]:
monthly_num_items_per_cat = complete_train_data[['date_block_num', 'shop_id', 'item_category_id', 'item_id']].groupby(
    ['date_block_num', 'shop_id', 'item_category_id']).nunique()

In [46]:
# Keep only the distinct-item count. Whether groupby(...).nunique() echoes the
# grouping keys back as columns depends on the pandas version, so drop them
# only when they are present instead of raising KeyError.
monthly_num_items_per_cat.drop(columns=['date_block_num', 'shop_id', 'item_category_id'],
                               errors='ignore', inplace=True)

In [47]:
monthly_num_items_per_cat.reset_index(['date_block_num', 'shop_id', 'item_category_id'], inplace=True)

In [48]:
monthly_num_items_per_cat['shop_id_item_category_id_date_block_num'] = list(
    zip(monthly_num_items_per_cat.shop_id,
        monthly_num_items_per_cat.item_category_id,
        monthly_num_items_per_cat.date_block_num,))

In [49]:
shop_id_item_category_id_date_block_num_to_num_items = dict(zip(
    monthly_num_items_per_cat.shop_id_item_category_id_date_block_num,
    monthly_num_items_per_cat.item_id))

In [50]:
shop_id_item_category_id_date_block_num_to_tot_cnt_per_month.get((25, 42, 32), 0)/\
shop_id_item_category_id_date_block_num_to_num_items.get((25, 42, 32), 1)

13.875

In [51]:
def get_proxy_using_shop_id_and_item_category(shop_id, item_category_id, date_block_num):
    """Return the average units sold per distinct item for a shop/category/month.

    Divides the month's total count for the (shop, category) by the number of
    distinct items recorded for that same key. A missing total counts as 0 and
    a missing item count as 1, so an unseen key yields 0.
    """
    lookup_key = (shop_id, item_category_id, date_block_num)
    total_sold = shop_id_item_category_id_date_block_num_to_tot_cnt_per_month.get(lookup_key, 0)
    distinct_items = shop_id_item_category_id_date_block_num_to_num_items.get(lookup_key, 1)
    return total_sold / distinct_items

In [52]:
get_proxy_using_shop_id_and_item_category(25, 42, 32)

13.875

In [53]:
get_proxy_using_shop_id_and_item_category(25, 73, 33)

3.142857142857143

##### Now, let us make predictions on the validation set and see how far we can get.

In [54]:
# Rebuild the (shop_id, item_id) x month pivot from the training sales. The
# earlier data_pivot had its shop_id/item_id columns dropped in place, so a
# fresh copy that still carries those keys is needed to join against the
# validation rows.
data_pivot = sales_train.pivot_table(
    values='item_cnt_day',
    index=['shop_id', 'item_id'],
    columns=['date_block_num'],
    aggfunc='sum',
    fill_value=0,
)

In [55]:
validation_monthly = sales_validation[['date_block_num', 'shop_id', 'item_id', 'item_cnt_day']].groupby(
    ['date_block_num', 'shop_id', 'item_id']).sum()

In [56]:
validation_monthly.reset_index(inplace=True)

In [57]:
validation_monthly.rename(columns={'item_cnt_day' : 'item_cnt_month'}, inplace=True)

In [58]:
validation_monthly.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month
0,32,2,33,1.0
1,32,2,486,1.0
2,32,2,792,1.0
3,32,2,975,1.0
4,32,2,1090,1.0


In [59]:
len(validation_monthly)

29678

In [60]:
validation_data_with_hist = pd.merge(data_pivot, validation_monthly, on=['shop_id', 'item_id'], how='inner')

In [61]:
len(validation_data_with_hist)

24514

In [62]:
len(validation_monthly)

29678

In [63]:
validation_data_with_hist.head()

Unnamed: 0,shop_id,item_id,0,1,2,3,4,5,6,7,...,24,25,26,27,28,29,30,31,date_block_num,item_cnt_month
0,2,33,1,0,0,0,0,0,0,0,...,0,1,0,1,1,0,1,0,32,1.0
1,2,486,0,0,0,0,0,0,0,0,...,0,3,2,1,0,2,0,0,32,1.0
2,2,792,0,0,0,0,0,0,0,0,...,1,2,0,0,1,1,0,2,32,1.0
3,2,975,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,32,1.0
4,2,1090,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,32,1.0


In [64]:
# remove the unnecessary columns.
validation_data_for_pred = validation_data_with_hist.drop(columns=['shop_id', 
                                                                   'item_id', 
                                                                   'date_block_num', 
                                                                   'item_cnt_month'])

In [65]:
validation_data_for_pred.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,1,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,2,0,0,3,2,1,0,2,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,2,0,0,1,1,0,2
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [66]:
# Slide the input window forward by one month: skip block 0 and keep blocks
# 1..31, giving the same 31-step sequence length the model was trained on,
# then add a trailing feature axis.
X_validation = validation_data_for_pred.values[:, 1:, np.newaxis]

In [67]:
X_validation.shape

(24514, 31, 1)

In [68]:
validation_predictions = my_model.predict(X_validation)

In [69]:
validation_data_with_hist['predictions'] = np.clip(validation_predictions, 0, 20)

In [70]:
validation_data_with_hist.head()

Unnamed: 0,shop_id,item_id,0,1,2,3,4,5,6,7,...,25,26,27,28,29,30,31,date_block_num,item_cnt_month,predictions
0,2,33,1,0,0,0,0,0,0,0,...,1,0,1,1,0,1,0,32,1.0,0.29888
1,2,486,0,0,0,0,0,0,0,0,...,3,2,1,0,2,0,0,32,1.0,0.31228
2,2,792,0,0,0,0,0,0,0,0,...,2,0,0,1,1,0,2,32,1.0,0.778344
3,2,975,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,32,1.0,0.204213
4,2,1090,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,32,1.0,0.137968


In [71]:
np.sqrt(mean_squared_error(np.clip(validation_data_with_hist.predictions, 0 , 20),  
                           np.clip(validation_data_with_hist.item_cnt_month, 0, 20)))


2.0173358427779973

In [72]:
len(validation_data_with_hist.predictions)

24514

In [89]:
validation_monthly.date_block_num.unique()

array([32])

In [73]:
validation_data_with_hist['shop_id_item_id_date_block_num'] = list(zip(validation_data_with_hist.shop_id,
                                                                       validation_data_with_hist.item_id,
                                                                       validation_data_with_hist.date_block_num))

In [74]:
shop_id_item_id_date_block_num_to_prediction = dict(zip(validation_data_with_hist.shop_id_item_id_date_block_num,
                                                        validation_data_with_hist.predictions))

In [75]:
validation_monthly['shop_id_item_id_date_block_num'] = list(zip(validation_monthly.shop_id,
                                                                validation_monthly.item_id,
                                                                validation_monthly.date_block_num))

##### We use the predictions from the model, when we have historical data for that particular shop_id and item_id in the training set and 0 otherwise.

In [76]:
validation_monthly['predictions'] = validation_monthly['shop_id_item_id_date_block_num'].apply(
    lambda x : shop_id_item_id_date_block_num_to_prediction.get(x, 0))

In [77]:
np.sqrt(mean_squared_error(np.clip(validation_monthly.predictions, 0 , 20),  
                           np.clip(validation_monthly.item_cnt_month, 0, 20)))


2.5975882031066018

##### To validate our model, let us quantify the performance only on those cases where we have historical data.

In [78]:
validation_monthly['present_in_hist'] = validation_monthly['shop_id_item_id_date_block_num'].apply(
    lambda x : shop_id_item_id_date_block_num_to_prediction.get(x, -1))

In [79]:
np.sqrt(mean_squared_error(np.clip(validation_monthly[validation_monthly.present_in_hist != -1].predictions, 0 , 20),  
                           np.clip(validation_monthly[validation_monthly.present_in_hist != -1].item_cnt_month, 0, 20)))


2.0173358427779973

In [82]:
validation_monthly['predictions'] = validation_monthly.apply(
    lambda x : shop_id_item_id_date_block_num_to_prediction.get((x.shop_id, x.item_id, 32), 0), axis=1)

In [83]:
np.sqrt(mean_squared_error(np.clip(validation_monthly.predictions, 0 , 20),  
                           np.clip(validation_monthly.item_cnt_month, 0, 20)))


2.5975882031066018

##### Let us use the average number of items sold for the corresponding item category in the previous month as the default instead of 0.

In [84]:
# Look up each item's category with a dictionary-backed Series.map rather than
# a per-element Python lambda.
validation_monthly['item_category_id'] = validation_monthly['item_id'].map(item_id_to_item_category_id)

In [85]:
# Use the model's prediction when one exists for the (shop, item) pair in
# block 32; otherwise fall back to the shop/category average from block 31,
# the last training month, so nothing from the validation month leaks in.
validation_monthly['predictions'] = validation_monthly.apply(
    lambda row: shop_id_item_id_date_block_num_to_prediction.get(
        (row.shop_id, row.item_id, 32),
        get_proxy_using_shop_id_and_item_category(row.shop_id, row.item_category_id, 31)),
    axis=1)

In [86]:
np.sqrt(mean_squared_error(np.clip(validation_monthly.predictions, 0 , 20),  
                           np.clip(validation_monthly.item_cnt_month, 0, 20)))


2.139307100259668