# imports and constants

In [96]:
import pandas as pd

from os.path import join as join_pth
import matplotlib.pyplot as plt
import random
from IPython.core.display_functions import display
import seaborn as sns
# % matplotlib inline
sns.set_style('whitegrid')
import torch
from src import *
from src import data_loader,models
from torch.utils.data.dataloader import  DataLoader

In [97]:
from importlib import reload,import_module

# Re-import the project modules so that edits made to their source files take
# effect in the running kernel.
# NOTE(review): `data_preprocessing` is not imported by name in this notebook; it
# presumably enters the namespace through `from src import *` — confirm.
reload(data_loader)
reload(data_preprocessing)
# Last expression of the cell: the module object returned by reload() is what
# the cell displays as its output.
reload(models)

<module 'src.models' from 'D:\\2022 acheivments\\Projects\\DeepLearning NanoDegree\\Rossmann-Store-Sales\\src\\models.py'>

In [67]:
# Directory of the raw input CSVs (store.csv and train.csv are read from it).
# Both paths are relative, so they resolve against the kernel's working directory.
raw_dataset_path = "../dataset/raw"
# Directory where the preprocessed, encoded dataset (merged_sales.csv) is
# written and later read back.
prep_dataset_path="../dataset/prep"

# roadmap
- Data preparation: after the analysis, run the bulk preprocessing functions, which bundle all the preprocessing and feature engineering worked out in the analysis section
- Start with a neural network without LSTM layers and get the best score after hyperparameter tuning
    - create a data loader for the neural network
    - find the best hyperparameters over the first epochs
    - train the model with the best hyperparameters

# Data preparation
- The data was analysed, its outliers were handled and its features were engineered column by column in the [rossmann-store-sales-analysis](./Rossmann-Store-Sales.ipynb) notebook
- All of those steps are combined in the [data_preprocessing.py](../src/data_preprocessing.py) module for bulk preprocessing:
    - store data preprocessing and feature engineering
    - store_sales preprocessing and feature engineering
    - merging the store data with the store sales and adding new columns



In [49]:
# Load the two raw CSV files (store metadata first, then the sales records).
stores_df, stores_sales_df = (
    pd.read_csv(join_pth(raw_dataset_path, csv_name), low_memory=False)
    for csv_name in ("store.csv", "train.csv")
)

# Bulk preprocessing of each source on its own: sales records, then store data.
stores_sales_df_prep = data_preprocessing.store_sales_prep(
    stores_sales_df=stores_sales_df
)
stores_data_df_prep = data_preprocessing.store_data_prep(
    store_data_df=stores_df
)

# Merge the two preprocessed frames (with the merge-level preprocessing), then
# drop the closed-store rows together with the open column.
merge_prep = data_preprocessing.drop_closed_days(
    data_preprocessing.merge_store_sales(
        sales_data_df=stores_sales_df_prep,
        store_data_df=stores_data_df_prep,
    )
)



In [50]:
# Inspect the column dtypes and non-null counts of the merged frame before encoding.
# NOTE: in the recorded output StateHoliday has only 910 non-null values out of
# 844392 rows; every other listed column is fully populated.
merge_prep.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 844392 entries, 0 to 1017207
Data columns (total 15 columns):
 #   Column                Non-Null Count   Dtype   
---  ------                --------------   -----   
 0   Store                 844392 non-null  int64   
 1   DayOfWeek             844392 non-null  category
 2   Sales                 844392 non-null  int64   
 3   Promo                 844392 non-null  category
 4   StateHoliday          910 non-null     object  
 5   SchoolHoliday         844392 non-null  category
 6   month                 844392 non-null  int64   
 7   day                   844392 non-null  int64   
 8   StoreType             844392 non-null  category
 9   Assortment            844392 non-null  category
 10  CompetitionDistance   844392 non-null  float64 
 11  Promo2                844392 non-null  category
 12  Promo2Since           844392 non-null  float64 
 13  CompetitionOpenSince  844392 non-null  int64   
 14  isPromoMonth          844392 non-nu

In [53]:
# One-hot encode the categorical data of the merged frame.
# In the recorded output the result has 28 columns: DayOfWeek, StateHoliday,
# StoreType and Assortment appear as per-category indicator columns.
encoded_data=data_preprocessing.hot_encoding(merged_data=merge_prep)

In [54]:
# Check the encoded frame's columns, dtypes and non-null counts.
encoded_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 844392 entries, 0 to 1017207
Data columns (total 28 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Store                   844392 non-null  float64
 1   Sales                   844392 non-null  float64
 2   Promo                   844392 non-null  float64
 3   SchoolHoliday           844392 non-null  float64
 4   month                   844392 non-null  float64
 5   day                     844392 non-null  float64
 6   CompetitionDistance     844392 non-null  float64
 7   Promo2                  844392 non-null  float64
 8   Promo2Since             844392 non-null  float64
 9   CompetitionOpenSince    844392 non-null  float64
 10  isPromoMonth            844392 non-null  float64
 11  DayOfWeek_1             844392 non-null  float64
 12  DayOfWeek_2             844392 non-null  float64
 13  DayOfWeek_3             844392 non-null  float64
 14  DayOfWeek_4        

<p style="font-size:18px;font-weight:bold">Save preprocessed data</p>

In [55]:
# Name of the preprocessed dataset file inside prep_dataset_path.
file_name="merged_sales.csv"
# index=False keeps the DataFrame index out of the CSV.
# NOTE(review): assumes the prep_dataset_path directory already exists — to_csv
# does not create missing directories.
encoded_data.to_csv(join_pth(prep_dataset_path,file_name),index=False)

# Data Loader
- load preprocessed data csv
- create neural network model data loader

In [56]:
# Read the preprocessed CSV (same directory and file name as the save step)
# back into a DataFrame.
file_name="merged_sales.csv"
sales_dataset=pd.read_csv(join_pth(prep_dataset_path,file_name))
# Check the reloaded frame's columns, dtypes and non-null counts.
sales_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 844392 entries, 0 to 844391
Data columns (total 28 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Store                   844392 non-null  float64
 1   Sales                   844392 non-null  float64
 2   Promo                   844392 non-null  float64
 3   SchoolHoliday           844392 non-null  float64
 4   month                   844392 non-null  float64
 5   day                     844392 non-null  float64
 6   CompetitionDistance     844392 non-null  float64
 7   Promo2                  844392 non-null  float64
 8   Promo2Since             844392 non-null  float64
 9   CompetitionOpenSince    844392 non-null  float64
 10  isPromoMonth            844392 non-null  float64
 11  DayOfWeek_1             844392 non-null  float64
 12  DayOfWeek_2             844392 non-null  float64
 13  DayOfWeek_3             844392 non-null  float64
 14  DayOfWeek_4         

In [98]:
# Wrap the preprocessed frame in the project's dataset class.
# Per the recorded check that follows, iterating it yields (x, y) pairs: a
# 26-element feature tensor and a scalar tensor matching the row's Sales value.
nn_sales_dataset=data_loader.NNSalesDataset(sales_dataset)

In [99]:
# Sanity check: pull the first (features, target) pair straight from the dataset.
dataset_iter = iter(nn_sales_dataset)
x, y = next(dataset_iter)
# One item per line: the feature tensor, its shape, then the target.
print(x, x.shape, y, sep="\n")

tensor([ 1.0000,  1.0000,  7.0000, 31.0000,  1.2700,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  1.0000,
         0.0000,  0.0000])
torch.Size([26])
tensor(5263.)


In [100]:
# Show the first row at each stage side by side: merged frame, encoded frame,
# then the current feature tensor `x` and target `y`.
display(merge_prep.iloc[0], encoded_data.iloc[0], x, y)


Store                       1
DayOfWeek                   5
Sales                    5263
Promo                       1
StateHoliday              NaN
SchoolHoliday               1
month                       7
day                        31
StoreType                   c
Assortment                  a
CompetitionDistance      1.27
Promo2                      0
Promo2Since               0.0
CompetitionOpenSince        0
isPromoMonth            False
Name: 0, dtype: object

Store                        1.00
Sales                     5263.00
Promo                        1.00
SchoolHoliday                1.00
month                        7.00
day                         31.00
CompetitionDistance          1.27
Promo2                       0.00
Promo2Since                  0.00
CompetitionOpenSince         0.00
isPromoMonth                 0.00
DayOfWeek_1                  0.00
DayOfWeek_2                  0.00
DayOfWeek_3                  0.00
DayOfWeek_4                  0.00
DayOfWeek_5                  1.00
DayOfWeek_6                  0.00
DayOfWeek_7                  0.00
StateHoliday_christmas       0.00
StateHoliday_easter          0.00
StateHoliday_public          0.00
StoreType_a                  0.00
StoreType_b                  0.00
StoreType_c                  1.00
StoreType_d                  0.00
Assortment_a                 1.00
Assortment_b                 0.00
Assortment_c                 0.00
Name: 0, dtype: float64

tensor([ 1.0000,  1.0000,  7.0000, 31.0000,  1.2700,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  1.0000,
         0.0000,  0.0000])

tensor(5263.)

In [101]:
# Batch the dataset: 64 samples per batch, reshuffled on every pass.
# A dedicated, seeded generator drives the shuffle so the batch order is the
# same on every Restart & Run All (the seed value 42 is arbitrary).
nn_sales_dataloader = DataLoader(
    nn_sales_dataset,
    batch_size=64,
    shuffle=True,
    generator=torch.Generator().manual_seed(42),
)

In [102]:
# Sanity check: fetch one mini-batch from the loader and confirm the
# dimensions of the features and of the targets.
dataloader_iter = iter(nn_sales_dataloader)
x, y = next(dataloader_iter)
print(x.shape, y.shape, sep="\n")

torch.Size([64, 26])
torch.Size([64])


In [103]:
# Number of input features, read from the dataset's `no_cols` attribute
# (presumably 26, the length of the sample tensor in the recorded dataset check — confirm).
input_size=nn_sales_dataset.no_cols
# NOTE(review): presumably the widths of the hidden layers — confirm in models.SalesNN.
hidden_shape=[256,32]
# One output value per sample (the recorded forward-pass check has shape [64, 1]).
output_size=1
# NOTE(review): presumably the dropout probability — confirm in models.SalesNN.
dropout_prop=0.5
nn_model=models.SalesNN(input_size,hidden_shape,output_size,dropout_prop)

In [105]:
# Smoke test: run the model on `x` and show the output shape.
# `x` is whatever was assigned last — in the recorded run, the 64-sample batch
# from the data-loader check, giving torch.Size([64, 1]).
nn_model(x).shape

torch.Size([64, 1])