# imports and constants

In [29]:
import pandas as pd

from os.path import join as join_pth
import matplotlib.pyplot as plt
import random
from IPython.core.display_functions import display
import seaborn as sns
# % matplotlib inline
sns.set_style('whitegrid')
import torch
from src import *
from src import data_loader,models,model_training
from torch.utils.data.dataloader import  DataLoader
from sklearn.model_selection import train_test_split

In [33]:
from importlib import reload,import_module

reload(data_loader)
reload(data_preprocessing)
reload(models)
reload(model_training)

<module 'src.model_training' from 'D:\\2022 acheivments\\Projects\\DeepLearning NanoDegree\\Rossmann-Store-Sales\\src\\model_training.py'>

In [31]:
raw_dataset_path = "../dataset/raw"
prep_dataset_path="../dataset/prep"
nn_model_weights_pth="../model_weights/nn_model"
nn_model_train_data_pth="../train_data/nn_model"

# roadmap
- data preparation after analysis run bulk preprocessing functions that have all preprocessing and feature engineering done at the analysis section
- start with the neural network without lstm layers and get the best score after hyperparameters tuning
    - create data loader for the neural network
    - find the best hyperparameters for the first epochs
    - train the model on the best hyperparameters

# Data preparation
- after analysing , handling outliers the data and do feature engineering on the data columns in the [rossmann-store-sales-analysis](./Rossmann-Store-Sales.ipynb) notebook column by column
- all steps are combined in the [data_preprocessing.py](../src/data_preprocessing.py) module for bulk preprocessing
    - store data preprocessing and feature engineering
    - store_sales preprocessing and feature engineering
    - merging store data and store sales and add new columns



In [4]:
stores_df = pd.read_csv(join_pth(raw_dataset_path, "store.csv"), low_memory=False)
stores_sales_df = pd.read_csv(join_pth(raw_dataset_path, "train.csv"), low_memory=False)

# Sales bulk preprocessing
stores_sales_df_prep=data_preprocessing.store_sales_prep(stores_sales_df=stores_sales_df)

# Store data bulk preprocessing
stores_data_df_prep=data_preprocessing.store_data_prep(store_data_df=stores_df)

# merge and do bulk preprocessing
merge_prep=data_preprocessing.merge_store_sales(sales_data_df=stores_sales_df_prep,store_data_df=stores_data_df_prep)

# drop closed stores data and open column
merge_prep=data_preprocessing.drop_closed_days(merge_prep)



In [5]:
merge_prep.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 844392 entries, 0 to 1017207
Data columns (total 15 columns):
 #   Column                Non-Null Count   Dtype   
---  ------                --------------   -----   
 0   Store                 844392 non-null  int64   
 1   DayOfWeek             844392 non-null  category
 2   Sales                 844392 non-null  int64   
 3   Promo                 844392 non-null  category
 4   StateHoliday          910 non-null     object  
 5   SchoolHoliday         844392 non-null  category
 6   month                 844392 non-null  int64   
 7   day                   844392 non-null  int64   
 8   StoreType             844392 non-null  category
 9   Assortment            844392 non-null  category
 10  CompetitionDistance   844392 non-null  float64 
 11  Promo2                844392 non-null  category
 12  Promo2Since           844392 non-null  float64 
 13  CompetitionOpenSince  844392 non-null  int64   
 14  isPromoMonth          844392 non-nu

In [6]:
# hot-encode the categorical data
encoded_data=data_preprocessing.hot_encoding(merged_data=merge_prep)

In [7]:
encoded_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 844392 entries, 0 to 1017207
Data columns (total 28 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Store                   844392 non-null  float64
 1   Sales                   844392 non-null  float64
 2   Promo                   844392 non-null  float64
 3   SchoolHoliday           844392 non-null  float64
 4   month                   844392 non-null  float64
 5   day                     844392 non-null  float64
 6   CompetitionDistance     844392 non-null  float64
 7   Promo2                  844392 non-null  float64
 8   Promo2Since             844392 non-null  float64
 9   CompetitionOpenSince    844392 non-null  float64
 10  isPromoMonth            844392 non-null  float64
 11  DayOfWeek_1             844392 non-null  float64
 12  DayOfWeek_2             844392 non-null  float64
 13  DayOfWeek_3             844392 non-null  float64
 14  DayOfWeek_4        

In [39]:
train_data,test_data=train_test_split(encoded_data,test_size=0.2)



In [40]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 675513 entries, 113922 to 78624
Data columns (total 28 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Store                   675513 non-null  float64
 1   Sales                   675513 non-null  float64
 2   Promo                   675513 non-null  float64
 3   SchoolHoliday           675513 non-null  float64
 4   month                   675513 non-null  float64
 5   day                     675513 non-null  float64
 6   CompetitionDistance     675513 non-null  float64
 7   Promo2                  675513 non-null  float64
 8   Promo2Since             675513 non-null  float64
 9   CompetitionOpenSince    675513 non-null  float64
 10  isPromoMonth            675513 non-null  float64
 11  DayOfWeek_1             675513 non-null  float64
 12  DayOfWeek_2             675513 non-null  float64
 13  DayOfWeek_3             675513 non-null  float64
 14  DayOfWeek_4     

<p style="font-size:18;font-weight:bold">Save preprocessed data</p>

In [41]:
file_name="merged_sales_train.csv"
train_data.to_csv(join_pth(prep_dataset_path,file_name),index=False)

file_name="merged_sales_test.csv"
test_data.to_csv(join_pth(prep_dataset_path,file_name),index=False)

# Data Loader
- load preprocessed data csv
- create neural network model data loader

In [21]:
file_name="merged_sales_train.csv"
sales_train_dataset=pd.read_csv(join_pth(prep_dataset_path,file_name))

file_name="merged_sales_test.csv"
sales_test_dataset=pd.read_csv(join_pth(prep_dataset_path,file_name))
sales_train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 675513 entries, 0 to 675512
Data columns (total 28 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Store                   675513 non-null  float64
 1   Sales                   675513 non-null  float64
 2   Promo                   675513 non-null  float64
 3   SchoolHoliday           675513 non-null  float64
 4   month                   675513 non-null  float64
 5   day                     675513 non-null  float64
 6   CompetitionDistance     675513 non-null  float64
 7   Promo2                  675513 non-null  float64
 8   Promo2Since             675513 non-null  float64
 9   CompetitionOpenSince    675513 non-null  float64
 10  isPromoMonth            675513 non-null  float64
 11  DayOfWeek_1             675513 non-null  float64
 12  DayOfWeek_2             675513 non-null  float64
 13  DayOfWeek_3             675513 non-null  float64
 14  DayOfWeek_4         

In [22]:
nn_sales_train_dataset=data_loader.NNSalesDataset(sales_train_dataset)
nn_sales_test_dataset=data_loader.NNSalesDataset(sales_test_dataset)

In [23]:
# dataset testing
dataset_iter=iter(nn_sales_train_dataset)
x,y=next(dataset_iter)
print(x)
print(x.shape)
print(y)

tensor([ 0.0000,  0.0000, 11.0000, 28.0000,  0.7600,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000,  1.0000,
         0.0000,  0.0000])
torch.Size([26])
tensor([12142.])


In [7]:
display(merge_prep.iloc[0])
display(encoded_data.iloc[0])
display(x)
display(y)


NameError: name 'merge_prep' is not defined

In [24]:
batch_size=64
nn_sales_train_dataloader=DataLoader(nn_sales_train_dataset,batch_size=batch_size,shuffle=True)
nn_sales_test_dataloader=DataLoader(nn_sales_test_dataset,batch_size=batch_size,shuffle=True)

In [25]:
# Data loader testing
dataloader_iter=iter(nn_sales_train_dataloader)
x,y=next(dataloader_iter)
print(x.shape)
print(y.shape)

torch.Size([64, 26])
torch.Size([64, 1])


In [26]:
input_size=nn_sales_train_dataloader.dataset.no_cols
hidden_shape=[256,32]
output_size=1
dropout_prop=0.5
nn_model=models.SalesNN(input_size,hidden_shape,output_size,dropout_prop)
train_losses,valid_losses=model_training.nn_model_train(nn_model,nn_sales_train_dataloader,nn_sales_test_dataloader,25,last_weights=True,train_data_dir=nn_model_train_data_pth,weights_dir=nn_model_weights_pth)

 training epoch 1 time Taken (m) = 1.99 Avg Train_Loss=10589939.94000474263223
 Test  time Taken (m) = 2.36 Avg Test_Loss=7361836.41890868s=7361172.974412439223
 epoch 1 train_loss =10589939.94000474 test_loss=7361836.41890868 total_time= 4.35
 training epoch 2 time Taken (m) = 2.23 Avg Train_Loss=10154112.8407863631741285
 Test  time Taken (m) = 2.18 Avg Test_Loss=7256644.15384615s=7256394.986163768273
 epoch 2 train_loss =10154112.84078636 test_loss=7256644.15384615 total_time= 4.41
new minimum test loss 7256644.15384615  achieved, model weights saved 
 training epoch 3 time Taken (m) = 2.12 Avg Train_Loss=9996776.320369494798258713
 Test  time Taken (m) = 2.11 Avg Test_Loss=7085382.27690413s=7085184.851118273823
 epoch 3 train_loss =9996776.32036949 test_loss=7085382.27690413 total_time= 4.23
new minimum test loss 7085382.27690413  achieved, model weights saved 
 training epoch 4 time Taken (m) = 2.04 Avg Train_Loss=9913537.11364282482287797
 Test  time Taken (m) = 2.03 Avg Test_Los

TypeError: cannot unpack non-iterable NoneType object

In [17]:
np.expand_dims(x,axis=0)

array([[1, 2]])