### Summary

Build a model using an LSTM and verify its performance on the validation set.


In [151]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.metrics import mean_squared_error

##### Get relevant data in monthly form

In [152]:
# Comment out the relevant line depending on whether we are running in kaggle or locally.
#INPUT_DIR = '../input/'
INPUT_DIR = '../input/competitive-data-science-predict-future-sales/'

In [153]:
complete_train_data = pd.read_csv(INPUT_DIR + 'sales_train.csv')

In [154]:
# Collapse the daily sales rows into one row per (month, shop, item): sum the
# daily counts, expose the sum as 'item_cnt_month', and turn the group keys back
# into ordinary columns.
monthly_sales_data = (
    complete_train_data[['date_block_num', 'shop_id', 'item_id', 'item_cnt_day']]
    .groupby(['date_block_num', 'shop_id', 'item_id'])
    .sum()
    .rename(columns={'item_cnt_day':'item_cnt_month'})
    .reset_index()
)

In [155]:
monthly_sales_data['shop_id_item_id'] = list(zip(monthly_sales_data.shop_id, monthly_sales_data.item_id))

In [156]:
len(monthly_sales_data.shop_id_item_id.unique())

424124

In [157]:
len(monthly_sales_data[monthly_sales_data.date_block_num != 33])

1577593

In [158]:
monthly_sales_data[monthly_sales_data.date_block_num != 33].head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,shop_id_item_id
0,0,0,32,6.0,"(0, 32)"
1,0,0,33,3.0,"(0, 33)"
2,0,0,35,1.0,"(0, 35)"
3,0,0,43,1.0,"(0, 43)"
4,0,0,51,2.0,"(0, 51)"


In [159]:
complete_train_data.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [160]:
complete_train_data.date_block_num.describe()

count    2.935849e+06
mean     1.456991e+01
std      9.422988e+00
min      0.000000e+00
25%      7.000000e+00
50%      1.400000e+01
75%      2.300000e+01
max      3.300000e+01
Name: date_block_num, dtype: float64

##### Let us separate the data into training and validation sets to ensure that they do not overlap!

In [161]:
sales_validation = complete_train_data[complete_train_data.date_block_num == 33]
sales_train = complete_train_data[complete_train_data.date_block_num != 33]

In [162]:
len(sales_train)

2882335

In [163]:
# Distinct (shop_id, item_id) pairs that occur in the training months.
train_shop_id_item_ids = set(zip(sales_train['shop_id'], sales_train['item_id']))

In [164]:
# Distinct (shop_id, item_id) pairs that occur in the validation month.
validation_shop_id_item_ids = set(zip(sales_validation['shop_id'], sales_validation['item_id']))

##### Let us see how many entries are present in the validation set that are not present in the training set.

In [165]:
diff = validation_shop_id_item_ids.difference(train_shop_id_item_ids)

In [166]:
len(diff)

7120

In [167]:
len(validation_shop_id_item_ids)

31531

In [168]:
len(train_shop_id_item_ids)

417004

##### Let us create a pivot table as it is easier to train an LSTM model on the same.

In [169]:
sales_train.date_block_num.describe()

count    2.882335e+06
mean     1.422773e+01
std      9.166123e+00
min      0.000000e+00
25%      6.000000e+00
50%      1.300000e+01
75%      2.200000e+01
max      3.200000e+01
Name: date_block_num, dtype: float64

In [170]:
data_pivot = pd.pivot_table(sales_train, 
                            index=['shop_id', 'item_id'], 
                            values='item_cnt_day', 
                            columns=['date_block_num'], 
                            aggfunc='sum',
                            fill_value=0)

In [171]:
data_pivot.reset_index(inplace=True)

In [172]:
data_pivot.head()

date_block_num,shop_id,item_id,0,1,2,3,4,5,6,7,...,23,24,25,26,27,28,29,30,31,32
0,0,30,0,31,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,31,0,11,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,32,6,10,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,33,3,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,35,1,14,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [173]:
len(data_pivot)

417004

In [176]:
# remove the unnecessary columns.
data_pivot.drop(columns=['shop_id', 'item_id'], inplace=True)

In [177]:
data_pivot.head()

date_block_num,0,1,2,3,4,5,6,7,8,9,...,23,24,25,26,27,28,29,30,31,32
0,0,31,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,10,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,14,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [178]:
np.shape(data_pivot)

(417004, 33)

##### Now, let us get to the meat of training. Let us make a training set and train LSTM model on the same.

In [179]:
X_train = np.expand_dims(data_pivot.values[:,:-1], axis=2)

In [180]:
X_train.shape

(417004, 32, 1)

In [181]:
Y_train = data_pivot.values[:,-1:]

In [182]:
Y_train.shape

(417004, 1)

In [183]:
from keras.models import Sequential
from keras.layers import LSTM, Dense,Dropout

In [None]:
def build_lstm_model_keras(X_train, Y_train, input_len=32, units=64,
                           dropout_rate=0.4, batch_size=4096, epochs=10):
    """Build, compile and fit a single-layer LSTM regressor.

    The network is ``LSTM(units) -> Dropout(dropout_rate) -> Dense(1)``,
    compiled with mean-squared-error loss and the Adam optimizer. The model
    summary is printed before fitting.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, input_len, 1)
        Input sequences, one feature per time step.
    Y_train : array-like
        Regression targets passed to ``fit``.
    input_len : int, default 32
        Number of time steps per sample; used for the LSTM ``input_shape``.
    units : int, default 64
        Number of LSTM units.
    dropout_rate : float, default 0.4
        Rate of the Dropout layer placed after the LSTM.
    batch_size : int, default 4096
        Batch size passed to ``fit``.
    epochs : int, default 10
        Number of epochs passed to ``fit``.

    Returns
    -------
    The fitted ``Sequential`` model.

    Raises
    ------
    ValueError
        If ``X_train`` is not shaped (n_samples, input_len, 1).
    """
    # The notebook trains with different window lengths (32 and 33 months), so
    # fail early with a readable message when the data and input_len disagree.
    x_shape = np.shape(X_train)
    if len(x_shape) != 3 or x_shape[1] != input_len or x_shape[2] != 1:
        raise ValueError(
            'X_train must have shape (n_samples, input_len, 1) with input_len=%d, got %s'
            % (input_len, x_shape,))

    my_model = Sequential()
    my_model.add(LSTM(units=units, input_shape=(input_len, 1)))
    my_model.add(Dropout(dropout_rate))
    my_model.add(Dense(1))
    my_model.compile(loss='mse',
                     optimizer='adam',
                     metrics=['mean_squared_error'])
    my_model.summary()
    my_model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs)
    return my_model


In [None]:
my_model = build_lstm_model_keras(X_train, Y_train, 32)

##### In this section, we try to handle those cases where a shop_id, item_id combination in the validation set has no historical data in the training set

In [186]:
items = pd.read_csv(INPUT_DIR + 'items.csv')

In [187]:
item_id_to_item_category_id = dict(zip(items.item_id, items.item_category_id))

In [188]:
complete_train_data['item_category_id'] = complete_train_data['item_id'].apply(
    lambda x : item_id_to_item_category_id.get(x))

In [189]:
monthly_sales_item_category = complete_train_data[
    ['date_block_num', 'shop_id', 'item_category_id', 'item_cnt_day']].groupby(
    ['date_block_num', 'shop_id', 'item_category_id']).sum()

monthly_sales_item_category.reset_index(['date_block_num', 'shop_id', 'item_category_id'], inplace=True)
monthly_sales_item_category.rename(columns={'item_cnt_day' : 'item_cnt_month'}, inplace=True)

In [190]:
monthly_sales_item_category['shop_id_item_category_id_date_block_num'] = list(
    zip(monthly_sales_item_category.shop_id,
        monthly_sales_item_category.item_category_id,
        monthly_sales_item_category.date_block_num))

In [191]:
shop_id_item_category_id_date_block_num_to_tot_cnt_per_month = dict(zip(
    monthly_sales_item_category.shop_id_item_category_id_date_block_num,
    monthly_sales_item_category.item_cnt_month))

In [192]:
monthly_num_items_per_cat = complete_train_data[['date_block_num', 'shop_id', 'item_category_id', 'item_id']].groupby(
    ['date_block_num', 'shop_id', 'item_category_id']).nunique()

In [193]:
# Keep only the distinct-item count per group. Depending on the pandas version,
# the groupby().nunique() result may or may not carry the grouping keys as
# columns, so tolerate their absence instead of raising KeyError.
monthly_num_items_per_cat.drop(columns=['date_block_num', 'shop_id', 'item_category_id'],
                               inplace=True, errors='ignore')

In [194]:
monthly_num_items_per_cat.reset_index(['date_block_num', 'shop_id', 'item_category_id'], inplace=True)

In [195]:
monthly_num_items_per_cat['shop_id_item_category_id_date_block_num'] = list(
    zip(monthly_num_items_per_cat.shop_id,
        monthly_num_items_per_cat.item_category_id,
        monthly_num_items_per_cat.date_block_num,))

In [196]:
shop_id_item_category_id_date_block_num_to_num_items = dict(zip(
    monthly_num_items_per_cat.shop_id_item_category_id_date_block_num,
    monthly_num_items_per_cat.item_id))

In [197]:
shop_id_item_category_id_date_block_num_to_tot_cnt_per_month.get((25, 42, 32), 0)/\
shop_id_item_category_id_date_block_num_to_num_items.get((25, 42, 32), 1)

13.875

In [246]:
def get_proxy_using_shop_id_and_item_category(shop_id, item_category_id, date_block_num):
    """Return the total item count divided by the number of distinct items
    for the given (shop_id, item_category_id, date_block_num) key.

    A key missing from the totals lookup counts as 0 and a key missing from
    the distinct-item lookup counts as 1.
    """
    lookup_key = (shop_id, item_category_id, date_block_num)
    total_count = shop_id_item_category_id_date_block_num_to_tot_cnt_per_month.get(lookup_key, 0)
    distinct_items = shop_id_item_category_id_date_block_num_to_num_items.get(lookup_key, 1)
    return total_count / distinct_items

In [247]:
get_proxy_using_shop_id_and_item_category(25, 42, 32)

13.875

In [252]:
get_proxy_using_shop_id_and_item_category(25, 73, 33)

0.0

##### Now, let us make predictions on the validation set and see how far we can get.

In [200]:
data_pivot = pd.pivot_table(sales_train, 
                            index=['shop_id', 'item_id'], 
                            values='item_cnt_day', 
                            columns=['date_block_num'], 
                            aggfunc='sum',
                            fill_value=0)

In [201]:
validation_monthly = sales_validation[['date_block_num', 'shop_id', 'item_id', 'item_cnt_day']].groupby(
    ['date_block_num', 'shop_id', 'item_id']).sum()

In [202]:
validation_monthly.reset_index(inplace=True)

In [203]:
validation_monthly.rename(columns={'item_cnt_day' : 'item_cnt_month'}, inplace=True)

In [204]:
validation_monthly.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month
0,33,2,31,1.0
1,33,2,486,3.0
2,33,2,787,1.0
3,33,2,794,1.0
4,33,2,968,1.0


In [205]:
len(validation_monthly)

31531

In [206]:
validation_data_with_hist = pd.merge(data_pivot, validation_monthly, on=['shop_id', 'item_id'], how='inner')

In [207]:
len(validation_data_with_hist)

24411

In [208]:
len(validation_monthly)

31531

In [209]:
validation_data_with_hist.head()

Unnamed: 0,shop_id,item_id,0,1,2,3,4,5,6,7,...,25,26,27,28,29,30,31,32,date_block_num,item_cnt_month
0,2,31,0,4,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,33,1.0
1,2,486,0,0,0,0,0,0,0,0,...,3,2,1,0,2,0,0,1,33,3.0
2,2,787,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,33,1.0
3,2,1075,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,33,1.0
4,2,1377,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,33,1.0


In [210]:
# remove the unnecessary columns.
validation_data_for_pred = validation_data_with_hist.drop(columns=['shop_id', 
                                                                   'item_id', 
                                                                   'date_block_num', 
                                                                   'item_cnt_month'])

In [211]:
validation_data_for_pred.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23,24,25,26,27,28,29,30,31,32
0,0,4,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,3,2,1,0,2,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [212]:
# Skip the first pivot column so the remaining columns form the input window,
# then add a trailing feature axis to get (n_samples, n_steps, 1).
X_validation = validation_data_for_pred.values[:, 1:, np.newaxis]

In [213]:
X_validation.shape

(24411, 32, 1)

In [214]:
validation_predictions = my_model.predict(X_validation)

In [215]:
validation_data_with_hist['predictions'] = np.clip(validation_predictions, 0, 20)

In [216]:
validation_data_with_hist.head()

Unnamed: 0,shop_id,item_id,0,1,2,3,4,5,6,7,...,26,27,28,29,30,31,32,date_block_num,item_cnt_month,predictions
0,2,31,0,4,1,1,0,0,0,0,...,0,0,0,0,0,0,0,33,1.0,0.064369
1,2,486,0,0,0,0,0,0,0,0,...,2,1,0,2,0,0,1,33,3.0,0.359206
2,2,787,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,33,1.0,0.06433
3,2,1075,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,33,1.0,0.172842
4,2,1377,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,33,1.0,0.05151


In [217]:
np.sqrt(mean_squared_error(np.clip(validation_data_with_hist.predictions, 0 , 20),  
                           np.clip(validation_data_with_hist.item_cnt_month, 0, 20)))


2.0763016228409255

In [218]:
len(validation_data_with_hist.predictions)

24411

In [219]:
validation_data_with_hist['shop_id_item_id_date_block_num'] = list(zip(validation_data_with_hist.shop_id,
                                                                       validation_data_with_hist.item_id,
                                                                       validation_data_with_hist.date_block_num))

In [220]:
shop_id_item_id_date_block_num_to_prediction = dict(zip(validation_data_with_hist.shop_id_item_id_date_block_num,
                                                        validation_data_with_hist.predictions))

In [221]:
validation_monthly['shop_id_item_id_date_block_num'] = list(zip(validation_monthly.shop_id,
                                                                validation_monthly.item_id,
                                                                validation_monthly.date_block_num))

##### We use the predictions from the model, when we have historical data for that particular shop_id and item_id in the training set and 0 otherwise.

In [222]:
validation_monthly['predictions'] = validation_monthly['shop_id_item_id_date_block_num'].apply(
    lambda x : shop_id_item_id_date_block_num_to_prediction.get(x, 0))

In [223]:
np.sqrt(mean_squared_error(np.clip(validation_monthly.predictions, 0 , 20),  
                           np.clip(validation_monthly.item_cnt_month, 0, 20)))


2.7421426869351615

##### To validate our model, let us quantify the performance only on those cases where we have historical data.

In [224]:
validation_monthly['present_in_hist'] = validation_monthly['shop_id_item_id_date_block_num'].apply(
    lambda x : shop_id_item_id_date_block_num_to_prediction.get(x, -1))

In [225]:
np.sqrt(mean_squared_error(np.clip(validation_monthly[validation_monthly.present_in_hist != -1].predictions, 0 , 20),  
                           np.clip(validation_monthly[validation_monthly.present_in_hist != -1].item_cnt_month, 0, 20)))


2.0763016228409255

In [227]:
validation_monthly['predictions'] = validation_monthly.apply(
    lambda x : shop_id_item_id_date_block_num_to_prediction.get((x.shop_id, x.item_id, 33), 0), axis=1)

In [228]:
np.sqrt(mean_squared_error(np.clip(validation_monthly.predictions, 0 , 20),  
                           np.clip(validation_monthly.item_cnt_month, 0, 20)))


2.7421426869351615

##### Let us use the average number of items sold per item in the corresponding shop and item category for the previous month as the default instead of 0

In [241]:
validation_monthly['item_category_id'] = validation_monthly['item_id'].apply(lambda x : item_id_to_item_category_id.get(x))

In [253]:
validation_monthly['predictions'] = validation_monthly.apply(
    lambda x : shop_id_item_id_date_block_num_to_prediction.get((x.shop_id, x.item_id, 33), 
                                                                get_proxy_using_shop_id_and_item_category(
                                                                    x.shop_id, x.item_category_id, 32)), axis=1)

In [255]:
np.sqrt(mean_squared_error(np.clip(validation_monthly.predictions, 0 , 20),  
                           np.clip(validation_monthly.item_cnt_month, 0, 20)))


2.511400940760026

##### Okay, now let us make predictions on the test set 

In [16]:
data_pivot = pd.pivot_table(complete_train_data, 
                            index=['shop_id', 'item_id'], 
                            values='item_cnt_day', 
                            columns=['date_block_num'], 
                            aggfunc='sum',
                            fill_value=0)

In [17]:
data_pivot.reset_index(inplace=True)

In [18]:
data_pivot.head()

date_block_num,shop_id,item_id,0,1,2,3,4,5,6,7,...,24,25,26,27,28,29,30,31,32,33
0,0,30,0,31,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,31,0,11,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,32,6,10,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,33,3,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,35,1,14,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
test_data = pd.read_csv(INPUT_DIR + 'test.csv')

In [20]:
test_data.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [21]:
len(data_pivot)

424124

In [22]:
len(test_data)

214200

In [23]:
#data_pivot2 = pd.merge(data_pivot, test_data, on=['shop_id', 'item_id'], how='inner')

In [24]:
#len(test_data) - len(data_pivot2)

In [25]:
# remove the unnecessary columns.
data_pivot.drop(columns=['shop_id', 'item_id'], inplace=True)

In [26]:
data_pivot.head()

date_block_num,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
0,0,31,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,10,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,14,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
np.shape(data_pivot)

(424124, 34)

In [28]:
X_train = np.expand_dims(data_pivot.values[:,:-1], axis=2)

In [29]:
X_train.shape

(424124, 33, 1)

In [30]:
Y_train = data_pivot.values[:,-1:]

In [31]:
Y_train.shape

(424124, 1)

In [32]:
from keras.models import Sequential
from keras.layers import LSTM, Dense,Dropout

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [33]:
my_model = build_lstm_model_keras(X_train, Y_train, 33)


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 64)                16896     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 16,961
Trainable params: 16,961
Non-trainable params: 0
_________________________________________________________________


##### Let us (finally) make predictions on the test set !

In [39]:
data_pivot = pd.pivot_table(complete_train_data, 
                            index=['shop_id', 'item_id'], 
                            values='item_cnt_day', 
                            columns=['date_block_num'], 
                            aggfunc='sum',
                            fill_value=0)

In [40]:
test_data_with_hist = pd.merge(data_pivot, test_data, on=['shop_id', 'item_id'], how='inner')

In [41]:
# remove the unnecessary columns.
test_data_for_pred = test_data_with_hist.drop(columns=['shop_id', 'item_id', 'ID'])

In [44]:
test_data_for_pred.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
0,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,4,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,0,1,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [45]:
# Skip the first pivot column so the remaining columns form the input window,
# then add a trailing feature axis to get (n_samples, n_steps, 1).
X_test = test_data_for_pred.values[:, 1:, np.newaxis]

In [46]:
X_test.shape

(111404, 33, 1)

In [None]:
test_predictions = my_model.predict(X_test)

In [None]:
test_data_with_hist['predictions'] = np.clip(test_predictions, 0, 20)

In [None]:
# Preview the first rows only; rendering the whole merged frame floods the output.
test_data_with_hist.head()

In [None]:
id_to_predictions = dict(zip(test_data_with_hist.ID, test_data_with_hist.predictions))

In [None]:
# Show a few of the test IDs that received a model prediction (the original
# expression lacked the call parentheses and only displayed the bound method).
list(id_to_predictions.keys())[:5]

In [None]:
test_data['item_category_id'] = test_data['item_id'].apply(
    lambda x : item_id_to_item_category_id.get(x))

In [None]:
# Use the model prediction when one exists for the test ID; otherwise fall back
# to the shop/category average for month 33. The fallback is not bounded, so
# clip the final column to [0, 20], the same range applied to the model
# predictions and to the values compared in the validation RMSE.
test_data['item_cnt_month'] = test_data.apply(
    lambda x : id_to_predictions.get(x.ID, 
                                     get_proxy_using_shop_id_and_item_category(x.shop_id, x.item_category_id, 33)),
    axis=1).clip(0, 20)

In [None]:
test_data[['ID', 'item_cnt_month']].to_csv('submission_lstm_keras.csv', index=False)