# imports and constants

In [5]:
import pandas as pd

from os.path import join as join_pth
import matplotlib.pyplot as plt
import random
import seaborn as sns
# % matplotlib inline
sns.set_style('whitegrid')
import torch
from torch.utils.data.dataloader import DataLoader
import numpy as np
from src import data_loader,models,model_training,utils,data_preprocessing
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'


In [45]:
# Re-import the local `src` modules so that edits made to their source files
# are picked up without restarting the kernel.
# NOTE(review): objects built before this cell runs presumably keep pointing at
# the pre-reload classes — re-create datasets/models after reloading.
# NOTE(review): `import_module` is not used in the visible part of the notebook.
from importlib import reload,import_module


reload(data_loader)
reload(data_preprocessing)
reload(models)
reload(model_training)
reload(utils)

<module 'src.utils' from 'D:\\2022 acheivments\\Projects\\DeepLearning NanoDegree\\Rossmann-Store-Sales\\src\\utils.py'>

array([[1, 3],
       [4, 6],
       [7, 9]])

In [7]:
# Filesystem locations used throughout the notebook (all relative paths).
raw_dataset_path = "../dataset/raw"                    # raw CSV inputs are read from here
prep_dataset_path = "../dataset/prep/lstm_model"       # preprocessed train/test CSVs are stored here
nn_model_weights_pth = "../model_weights/lstm_model"   # handed to the training routine as `weights_dir`
nn_model_train_data_pth = "../train_data/lstm_model"   # handed to the training routine as `train_data_dir`

# roadmap
- data preparation: after the analysis, run the bulk preprocessing functions, which bundle all the preprocessing and feature engineering worked out in the analysis section
- start with the neural network without LSTM layers and get the best score after hyperparameter tuning
    - create a data loader for the neural network
    - find the best hyperparameters over the first epochs
    - train the model with the best hyperparameters

# Data preparation

## bulk preprocessing

In [8]:
# Read the two raw CSV files from the raw dataset directory.
stores_df = pd.read_csv(
    join_pth(raw_dataset_path, "store.csv"),
    low_memory=False,
)
stores_sales_df = pd.read_csv(
    join_pth(raw_dataset_path, "train.csv"),
    low_memory=False,
)

# Bulk preprocessing of the sales records.
stores_sales_df_prep = data_preprocessing.store_sales_prep(
    stores_sales_df=stores_sales_df,
)

# Bulk preprocessing of the store metadata.
stores_data_df_prep = data_preprocessing.store_data_prep(
    store_data_df=stores_df,
)

# Merge both preprocessed frames, then drop the closed-store rows
# (and the open column) from the merged result.
merge_prep = data_preprocessing.drop_closed_days(
    data_preprocessing.merge_store_sales(
        sales_data_df=stores_sales_df_prep,
        store_data_df=stores_data_df_prep,
    )
)



In [9]:
# Date range covered by the merged data (Jan 2013 to Jul 2015 for this dataset).
first_date = merge_prep.Date.min()
last_date = merge_prep.Date.max()
print(first_date)
print(last_date)

# Count the calendar months spanned by the data instead of hard-coding 31,
# so the figures below stay correct if the dataset changes.
n_months = (
    (last_date.year - first_date.year) * 12
    + (last_date.month - first_date.month)
    + 1
)
print(f"months {n_months}")
# Rough size of a 30% hold-out, expressed in whole months.
print(f"test months = {int(n_months*0.3)}")

2013-01-01 00:00:00
2015-07-31 00:00:00
months 31
test months = 9


## train test split

In [10]:
import datetime

# Chronological hold-out: rows dated before the boundary are used for training,
# rows on or after it for testing.
boundary = datetime.datetime.strptime("2015-02-01", "%Y-%m-%d")

# Each split is sorted by date, stripped of extra columns and one-hot encoded
# independently of the other split.
train_data = (
    merge_prep[merge_prep.Date < boundary]
    .sort_values(by='Date')
    .pipe(data_preprocessing.drop_extra_cols)
    .pipe(data_preprocessing.hot_encoding)
)

test_data = (
    merge_prep[merge_prep.Date >= boundary]
    .sort_values(by='Date')
    .pipe(data_preprocessing.drop_extra_cols)
    .pipe(data_preprocessing.hot_encoding)
)

print(f"train data size {len(train_data)}")
print(f"test data size  {len(test_data)}")

train data size 677123
test data size  167269


In [11]:
# Summarise the training frame: column names, non-null counts and dtypes.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 677123 entries, 621467 to 166885
Data columns (total 28 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Store                   677123 non-null  float64
 1   Sales                   677123 non-null  float64
 2   Promo                   677123 non-null  float64
 3   SchoolHoliday           677123 non-null  float64
 4   month                   677123 non-null  float64
 5   day                     677123 non-null  float64
 6   CompetitionDistance     677123 non-null  float64
 7   Promo2                  677123 non-null  float64
 8   Promo2Since             677123 non-null  float64
 9   CompetitionOpenSince    677123 non-null  float64
 10  isPromoMonth            677123 non-null  float64
 11  DayOfWeek_1             677123 non-null  float64
 12  DayOfWeek_2             677123 non-null  float64
 13  DayOfWeek_3             677123 non-null  float64
 14  DayOfWeek_4    

In [12]:
# Same summary for the test frame, to compare its columns with the training frame.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 167269 entries, 76916 to 0
Data columns (total 27 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Store                 167269 non-null  float64
 1   Sales                 167269 non-null  float64
 2   Promo                 167269 non-null  float64
 3   SchoolHoliday         167269 non-null  float64
 4   month                 167269 non-null  float64
 5   day                   167269 non-null  float64
 6   CompetitionDistance   167269 non-null  float64
 7   Promo2                167269 non-null  float64
 8   Promo2Since           167269 non-null  float64
 9   CompetitionOpenSince  167269 non-null  float64
 10  isPromoMonth          167269 non-null  float64
 11  DayOfWeek_1           167269 non-null  float64
 12  DayOfWeek_2           167269 non-null  float64
 13  DayOfWeek_3           167269 non-null  float64
 14  DayOfWeek_4           167269 non-null  float64
 15  D

In [13]:
# One-hot encoding was applied to each split separately, so a category that
# never occurs in the test period (here: Christmas) has no column in the test
# frame. Align the test frame to the training columns: columns missing from
# the test frame are created and filled with 0.0, and the column order is made
# identical to the training frame. Re-running this cell gives the same result.
# NOTE(review): any other column missing from the test frame would also be
# zero-filled rather than raising — check the column list below.
test_data = test_data.reindex(columns=train_data.columns, fill_value=0.0)
test_data.columns

Index(['Store', 'Sales', 'Promo', 'SchoolHoliday', 'month', 'day',
       'CompetitionDistance', 'Promo2', 'Promo2Since', 'CompetitionOpenSince',
       'isPromoMonth', 'DayOfWeek_1', 'DayOfWeek_2', 'DayOfWeek_3',
       'DayOfWeek_4', 'DayOfWeek_5', 'DayOfWeek_6', 'DayOfWeek_7',
       'StateHoliday_christmas', 'StateHoliday_easter', 'StateHoliday_public',
       'StoreType_a', 'StoreType_b', 'StoreType_c', 'StoreType_d',
       'Assortment_a', 'Assortment_b', 'Assortment_c'],
      dtype='object')

## arrange train columns

<img src="./assets/lstm_nn.png"  alt="./assets/lstm_nn.png"/>

In [16]:
# Both frames should report the same number of columns (train first, then test).
print(len(train_data.columns), len(test_data.columns), sep="\n")

28
28


In [15]:
# Preview the first rows of the encoded training frame.
train_data.head()

Unnamed: 0,Store,Sales,Promo,SchoolHoliday,month,day,CompetitionDistance,Promo2,Promo2Since,CompetitionOpenSince,...,StateHoliday_christmas,StateHoliday_easter,StateHoliday_public,StoreType_a,StoreType_b,StoreType_c,StoreType_d,Assortment_a,Assortment_b,Assortment_c
621467,682.0,3375.0,0.0,1.0,1.0,1.0,5.010635,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
386137,423.0,9643.0,0.0,1.0,1.0,1.0,7.146772,0.0,0.0,2.772589,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
700661,769.0,5035.0,0.0,1.0,1.0,1.0,6.733402,1.0,0.693147,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
77677,85.0,4220.0,0.0,1.0,1.0,1.0,7.533694,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
305081,335.0,2401.0,0.0,1.0,1.0,1.0,4.49981,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [17]:
# Preview the first rows of the encoded, column-aligned test frame.
test_data.head()

Unnamed: 0,Store,Sales,Promo,SchoolHoliday,month,day,CompetitionDistance,Promo2,Promo2Since,CompetitionOpenSince,...,StateHoliday_christmas,StateHoliday_easter,StateHoliday_public,StoreType_a,StoreType_b,StoreType_c,StoreType_d,Assortment_a,Assortment_b,Assortment_c
76916,85.0,13899.0,0.0,0.0,2.0,1.0,7.533694,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
863366,948.0,9867.0,0.0,0.0,2.0,1.0,7.26543,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
666908,733.0,18263.0,0.0,0.0,2.0,1.0,6.756932,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
237578,262.0,30525.0,0.0,0.0,2.0,1.0,7.07327,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
699900,769.0,13823.0,0.0,0.0,2.0,1.0,6.733402,1.0,3.295837,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


## save train test data

In [18]:
# Persist both splits as CSV files (without the index) in the prep directory:
# the training sales data first, then the test sales data.
for file_name, frame in (
    ("merged_sales_train.csv", train_data),
    ("merged_sales_test.csv", test_data),
):
    frame.to_csv(join_pth(prep_dataset_path, file_name), index=False)



# load data

In [19]:
# Read the preprocessed training split back from the prep directory.
file_name = "merged_sales_train.csv"
train_csv_path = join_pth(prep_dataset_path, file_name)
merged_sales_train = pd.read_csv(train_csv_path)

# Read the preprocessed test split back from the prep directory.
file_name = "merged_sales_test.csv"
test_csv_path = join_pth(prep_dataset_path, file_name)
merged_sales_test = pd.read_csv(test_csv_path)


# Train loader

In [46]:
# Both datasets are built with the same sequence length.
seq_length = 30
train_dataset = data_loader.LSTMSalesDataset(merged_sales_train, seq_length=seq_length)
test_dataset = data_loader.LSTMSalesDataset(merged_sales_test, seq_length=seq_length)

In [47]:
batch_size = 32

# Identical loader settings for both splits: fixed batch size, with the final
# incomplete batch discarded (drop_last=True).
# NOTE(review): drop_last on the test loader means the last partial batch is
# never evaluated — confirm this is intended.
loader_kwargs = dict(batch_size=batch_size, drop_last=True)
train_loader = DataLoader(train_dataset, **loader_kwargs)
test_loader = DataLoader(test_dataset, **loader_kwargs)

In [48]:
# Smoke-test the loader: draw one batch and report the shape of each tensor.
iterr = iter(train_loader)
lstm_in, nn_in, out = next(iterr)

print(
    f"lstm in shape --> {lstm_in.shape} nn in shape --> {nn_in.shape} out-> shape {out.shape}"
)



lstm in shape --> torch.Size([32, 30, 15]) nn in shape --> torch.Size([32, 26]) out-> shape torch.Size([32, 1])


# model training

In [49]:
# Hidden-layer widths of the fully connected part.
nn_hidden = [512, 256]

# Architecture descriptions for the recurrent and the fully connected parts;
# input sizes are taken from the training dataset.
lstm_architecture = dict(
    input_size=train_dataset.no_lstm_cols,
    num_layers=1,
    hidden_size=256,
)
nn_architecture = dict(
    input_size=train_dataset.no_nn_cols,
    hidden_shape=nn_hidden,
)

# NOTE(review): with num_layers=1, dropout passed to a PyTorch LSTM has no
# effect (it is applied between stacked layers only) — confirm how
# SalesLstm uses dropout_prop.
lstm_model = models.SalesLstm(lstm_architecture, nn_architecture, dropout_prop=0.5)
lstm_model.to(device)



In [50]:
# Display the model's layer structure.
lstm_model

SalesLstm(
  (lstm): LSTM(15, 256, batch_first=True, dropout=0.5)
  (fcn): Sequential(
    (0): Linear(in_features=282, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=256, bias=True)
    (3): ReLU()
    (4): Dropout(p=0.5, inplace=False)
    (5): Linear(in_features=256, out_features=1, bias=True)
    (6): ReLU()
  )
)

In [51]:

# Sanity-check forward pass on the batch drawn from the train loader.
# The model was moved to `device`, but the batch tensors are not moved anywhere
# in the notebook, so they are sent to the same device here; without this the
# call fails with a device mismatch whenever CUDA is available.
# Note: `out` now holds the model prediction and no longer the batch target.
out,lstm_hidden=lstm_model(lstm_in.to(device),nn_in.to(device))

In [52]:
# Shape of the model output for one batch.
out.shape

torch.Size([32, 1])

In [53]:
# Launch training; the call returns the training and validation loss histories.
train_losses, valid_losses = model_training.lstm_train(
    lstm_model,
    train_loader,
    test_loader,
    1,  # NOTE(review): presumably the number of epochs — confirm against lstm_train
    last_weights=True,
    train_data_dir=nn_model_train_data_pth,
    weights_dir=nn_model_weights_pth,
    device=device,
)

Testing before training
 testing [..........] time remaining (m) = 9.5 Avg Test_Loss=329156.8771186414

KeyboardInterrupt: 