In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None

import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from xgboost import XGBRegressor


In [2]:
# Read the data into variable

train = pd.read_csv('./final_project_data/sales_train.csv')
items = pd.read_csv('./final_project_data/items.csv')
item_categories = pd.read_csv('./final_project_data/item_categories.csv')
shops = pd.read_csv('./final_project_data/shops.csv')
test = pd.read_csv('./final_project_data/test.csv')

In [3]:
print(f'Shapes of data are:\n train:{train.shape}\n test:{test.shape}\n items:{items.shape}' +
     f'\n item_categories:{item_categories.shape} \n shops:{shops.shape}')

Shapes of data are:
 train:(2935849, 6)
 test:(214200, 3)
 items:(22170, 3)
 item_categories:(84, 2) 
 shops:(60, 2)


In [5]:
train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [6]:
test.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [7]:
# Copy the original dataset to temp 
tn = train.copy()
ts = test.copy()

In [8]:
ts.drop_duplicates()
print(f' before duplicate drop:{test.shape}  after duplicate drop:{ts.shape}')

 before duplicate drop:(214200, 3)  after duplicate drop:(214200, 3)


In [9]:
tn['item_revenue'] = tn['item_price'] * tn['item_cnt_day']
tn.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_revenue
0,02.01.2013,0,59,22154,999.0,1.0,999.0
1,03.01.2013,0,25,2552,899.0,1.0,899.0
2,05.01.2013,0,25,2552,899.0,-1.0,-899.0
3,06.01.2013,0,25,2554,1709.05,1.0,1709.05
4,15.01.2013,0,25,2555,1099.0,1.0,1099.0


### Copy from coursera

A good exercise is to reproduce previous_value_benchmark. As the name suggest - in this benchmark for the each shop/item pair our predictions are just monthly sales from the previous month, i.e. October 2015.

The most important step at reproducing this score is correctly aggregating daily data and constructing monthly sales data frame. You need to get lagged values, fill NaNs with zeros and clip the values into [0,20] range. If you do it correctly, you'll get precisely 1.16777 on the public leaderboard.

Generating features like this is a necessary basis for more complex models. Also, if you decide to fit some model, don't forget to clip the target into [0,20] range, it makes a big difference.

#### Yes the score I got was 1.16777

In [10]:
# Take a copy and select 33rd month i.e Oct'15 a month before Nov'15, the prediction month.
df_train = train.copy()

# Rows selected which satisfy the condition
tn_month_33 = df_train[df_train.date_block_num == 33]

# Keep only the required columns and rename daily to monthly
tn_33_reqd_cols = tn_month_33[['shop_id','item_id','item_cnt_day']]
tn_33_agg_icd = tn_33_reqd_cols.groupby(['shop_id','item_id'])['item_cnt_day'].sum().reset_index()
tn_33_agg_icd = tn_33_agg_icd.rename(columns = {'item_cnt_day' : 'item_cnt_month'})


In [11]:
# A look at the item count data
mean = tn_33_agg_icd.item_cnt_month.mean()
median = tn_33_agg_icd.item_cnt_month.median()
# print(f'mean:{mean}, median:{median}')
# print(tn_33_agg_icd.item_cnt_month.describe())
# tn_33_agg_icd.head()

In [12]:
# Map the previous month data to prediction month data.
df_merge = ts.merge(tn_33_agg_icd, on=['shop_id', 'item_id'], how='left')[['ID', 'item_cnt_month']]

# Fill the NaN's with zero. There are many NaN's as plenty of item are missing in last month
print(f'Nulls in dataset due to missing item_id in train set:\n{df_merge.isna().sum()}')

# Clip the values as given in coursera advisory
df_merge = df_merge.item_cnt_month.fillna(0).clip(0,20).reset_index()

# Rename for submission 
df_merge = df_merge.rename(columns ={'index':'ID'})

df_merge.head()


Nulls in dataset due to missing item_id in train set:
ID                     0
item_cnt_month    185520
dtype: int64


Unnamed: 0,ID,item_cnt_month
0,0,0.0
1,1,0.0
2,2,1.0
3,3,0.0
4,4,0.0


In [13]:
submission = df_merge.set_index('ID')
submission.to_csv('benchmark.csv')

In [14]:
# Trying by using the increase of last years from Oct to Nov

# Rows selected of Oct'14 and Nov'14
tn_month_21 = df_train[df_train.date_block_num == 21]
tn_month_22 = df_train[df_train.date_block_num == 22]

tn_21_agg_icd = tn_month_21.groupby(['shop_id','item_id'])['item_cnt_day'].sum().reset_index()
tn_22_agg_icd = tn_month_22.groupby(['shop_id','item_id'])['item_cnt_day'].sum().reset_index()

mean_21 = tn_21_agg_icd.item_cnt_day.mean()
mean_22 = tn_22_agg_icd.item_cnt_day.mean()

monthly_increase = mean_22 / mean_21
# print(monthly_increase)

df_merge_increase = df_merge['item_cnt_month'] * monthly_increase
df_merge_increase.head()


0    0.000000
1    0.000000
2    1.021264
3    0.000000
4    0.000000
Name: item_cnt_month, dtype: float64

In [15]:
submission = df_merge.set_index('ID')
submission.to_csv('benchmark_inc.csv')

Same score of 1.16777

### Dictionaries

1. Category
2. Shop-Item price

In [16]:
# Make dictionary of shop-item pair to item price
# And of item_id to item_category_id

# Use only the required columns
df_shop_item = tn[['shop_id', 'item_id', 'item_price']]
# New coloumn by concatinating shop item pair
df_shop_item['shop_item'] = df_shop_item['shop_id'].astype(str) +'_'+ df_shop_item['item_id'].astype(str)

# Drop all duplicates and keep the last as it is time series forecast
df_shop_item.drop_duplicates('shop_item',keep='last', inplace=True)

# Check why error.
# item_price = df_shop_item[['shop_item','item_price']].to_dict()['item_price']

df_shop_item.drop(['shop_id','item_id'], axis = 1, inplace=True)

# Make the dictionary
df_shop_item.set_index('shop_item', inplace=True)
item_price = df_shop_item.to_dict()['item_price']

# print(df_shop_item.head(2))

# We create the dictionary
item_cat = items[['item_id','item_category_id']].to_dict()['item_category_id']


### Arithmetic Series Sum

$$ S_n  = \frac{1}{2}n(a_1 +a_n) $$

In [17]:
series_sum = 0.5 * ts.ID.count() * (ts.ID[0] + ts.ID[ts.shape[0] - 1])
series_mean = 0.5 * (ts.ID[0] + ts.ID[ts.shape[0] - 1])
print(series_sum == ts.ID.sum())
print(series_mean == ts.ID.mean())

True
True


Looks as if ID is a counter. So can be dropped and brought back.

In [18]:
# The train data is daily, change to Monthly as test data is monthly
# Include category column as it is an important feature
df_tn_reduce = train.copy()
print(df_tn_reduce.shape)
df_tn_reduce = df_tn_reduce.groupby(
        ['date_block_num','shop_id','item_id','item_price'])['item_cnt_day'].sum().reset_index()


# Include Categories
df_tn_reduce['item_category_id'] = df_tn_reduce['item_id'].map(item_cat)
df_tn_reduce.sort_values(by=['date_block_num'], inplace=True)

y = df_tn_reduce['item_cnt_day']

print(df_tn_reduce.shape)
#df_tn_reduce = df_tn_reduce.drop(['date','item_cnt_day'], axis = 1)
df_tn_reduce.tail()

(2935849, 6)
(1739022, 6)


Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id
1716882,33,24,6740,999.0,1.0,30
1716881,33,24,6738,999.0,2.0,30
1716880,33,24,6710,599.0,2.0,30
1716910,33,24,7499,399.0,1.0,55
1739021,33,59,22102,1250.0,1.0,42


In [19]:
# Split the train set to train and validate
# Take oct and nov months of last 2 years as they are similar to test set which is nov month
df_train = df_tn_reduce.loc[(df_tn_reduce['date_block_num'] != 22) & (
                df_tn_reduce['date_block_num'] != 21) & (df_tn_reduce['date_block_num'] != 10
                ) & (df_tn_reduce['date_block_num'] != 11)]

df_val = df_tn_reduce.loc[(df_tn_reduce['date_block_num'] == 22) | (
                df_tn_reduce['date_block_num'] == 21) | (df_tn_reduce['date_block_num'] == 10
                ) | (df_tn_reduce['date_block_num'] == 11)]

# # Check the total count
# print(f' Total rows: {df_tn_reduce.shape[0]} Train set:{df_train.shape[0]} Validation set: {df_val.shape[0]}')
# print(df_train.shape[0] + df_val.shape[0])


In [20]:
# Take out the targets for train and validate
y_train = df_train['item_cnt_day']
x_train = df_train.drop(['item_cnt_day'], axis=1)

y_val = df_val['item_cnt_day']
x_val = df_val.drop(['item_cnt_day'], axis=1)

# # Shuffle
# y_train = y_train.sample(frac=1,random_state=1)
# x_train = x_train.sample(frac=1,random_state=1)
# y_val = y_val.sample(frac=1,random_state=1)
# x_val = x_val.sample(frac=1,random_state=1)

#y_val.shape

In [21]:
x_train.shape[1]

5

In [22]:
# Add features to test set to make it similar to train set for use in the model
# Use the 2 dictionaries of category and shop-item pair and the price

df_ts_expand = test.copy()
df_ts_expand['shop_item']=df_ts_expand["shop_id"].astype(str) +"_"+ df_ts_expand["item_id"].astype(str)
df_ts_expand['item_price'] = df_ts_expand['shop_item'].map(item_price)
df_ts_expand['item_price'].fillna(0, inplace=True)
df_ts_expand['item_category_id'] = df_tn_reduce['item_id'].map(item_cat)

df_ts_expand['date_block_num'] = 34

ts_index = df_ts_expand['ID']
df_ts_expand.drop(['ID', 'shop_item'], axis = 1, inplace = True)
df_ts_expand = df_ts_expand.reindex(
            ['date_block_num','shop_id','item_id', 'item_price', 'item_category_id'],axis=1)

df_ts_expand.head()
#df_ts_expand.isna().sum()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_category_id
0,34,5,5037,749.5,40
1,34,5,5320,0.0,37
2,34,5,5233,1199.0,40
3,34,5,5232,599.0,40
4,34,5,5268,0.0,57


In [23]:
# Implement callback function to stop training
# when accuracy reaches e.g. 
ERROR_THRESHOLD = 10850.0
class myCallback(tf.keras.callbacks.Callback): 
    def on_epoch_end(self, epoch, logs={}): 
        if(logs.get('loss') < ERROR_THRESHOLD):   
            print("\nReached %2.2f%% accuracy, so stopping training!!" %(ERROR_THRESHOLD))   
            self.model.stop_training = True
            


In [24]:
# Instantiate a callback object
callbacks = myCallback()

In [25]:
def build_model():
    model = keras.Sequential([
        layers.Dense(64, activation='relu',input_shape=[x_train.shape[1]]),
        layers.Dense(64, activation='relu'),
       layers.Dense(1)
      ])

    optimizer = 'adam'
    
    model.compile(loss='mean_absolute_error',
                optimizer=optimizer,
                metrics=['mae',])
    return model

In [26]:
keras_model = build_model()

In [27]:
EPOCHS = 5

history = keras_model.fit(
  x_train, y_train,
  epochs=EPOCHS,
    batch_size=50,
    validation_data=(x_val, y_val),
    #callbacks=[callbacks],
  )

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [28]:
# Preprocessing of test data, fit model
preds_test = keras_model.predict(df_ts_expand)

# The prediction is of type numpy.ndarray
preds_list = preds_test.tolist()

# Extract the prediction and put it in a list
prediction = []
for item in preds_list:
    prediction.append(item[0])

In [29]:
# xgb_model = XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=4)
# xgb_model.fit(x_train, y_train, 
#              early_stopping_rounds=5, 
#              eval_set=[(x_val, y_val)], 
#              verbose=False)

In [30]:
# preds = xgb_model.predict(df_ts_expand)

In [31]:
# prediction = (np.clip(preds, a_min = 0, a_max = 20)).tolist()
#prediction

In [32]:
# Save test predictions to file
output = pd.DataFrame({'ID': ts_index,
                       'item_cnt_month': prediction})


In [33]:
#output.to_csv('XbgModel.csv', index=False)

In [34]:
output.to_csv('KerasModel.csv', index=False)