### Setup

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import seaborn
import tensorflow as tf
from tensorflow import keras
%matplotlib inline

# project paths
project_root_dir = os.path.normpath(os.getcwd() + os.sep + os.pardir)

data_path = os.path.join(project_root_dir, "data")
os.makedirs(data_path, exist_ok=True)

# function for reading data
def read_data(filename, date_cols=None, file_path=data_path):
    csv_path = os.path.join(file_path, filename)
    return pd.read_csv(csv_path, parse_dates=date_cols)

# function for saving data as csv file
def save_dataframe(df, filename, file_path=data_path):
    path = os.path.join(file_path, filename)
    df.to_csv(path, index=False)

### Read Data 

In [2]:
train = read_data("TRAIN.CSV", date_cols=["Date"])
test = read_data("TEST_FINAL.csv", date_cols=["Date"])
submission = read_data("SAMPLE.csv")

In [3]:
train.head()

Unnamed: 0,ID,Store_id,Store_Type,Location_Type,Region_Code,Date,Holiday,Discount,#Order,Sales
0,T1000001,1,S1,L3,R1,2018-01-01,1,Yes,9,7011.84
1,T1000002,253,S4,L2,R1,2018-01-01,1,Yes,60,51789.12
2,T1000003,252,S3,L2,R1,2018-01-01,1,Yes,42,36868.2
3,T1000004,251,S2,L3,R1,2018-01-01,1,Yes,23,19715.16
4,T1000005,250,S2,L3,R4,2018-01-01,1,Yes,62,45614.52


In [4]:
train.groupby("Date")['Store_id'].count()

Date
2018-01-01    365
2018-01-02    365
2018-01-03    365
2018-01-04    365
2018-01-05    365
             ... 
2019-05-27    365
2019-05-28    365
2019-05-29    365
2019-05-30    365
2019-05-31    365
Name: Store_id, Length: 516, dtype: int64

In [5]:
test.head()

Unnamed: 0,ID,Store_id,Store_Type,Location_Type,Region_Code,Date,Holiday,Discount
0,T1188341,171,S4,L2,R3,2019-06-01,0,No
1,T1188342,172,S1,L1,R1,2019-06-01,0,No
2,T1188343,173,S4,L2,R1,2019-06-01,0,No
3,T1188344,174,S1,L1,R4,2019-06-01,0,No
4,T1188345,170,S1,L1,R2,2019-06-01,0,No


In [6]:
test.groupby("Date")['Store_id'].count()

Date
2019-06-01    365
2019-06-02    365
2019-06-03    365
2019-06-04    365
2019-06-05    365
             ... 
2019-07-27    365
2019-07-28    365
2019-07-29    365
2019-07-30    365
2019-07-31    365
Name: Store_id, Length: 61, dtype: int64

In [7]:
test.shape

(22265, 8)

## Prepare data for ML

In [8]:
# create day, month and year from Date column

train['Day'] = train['Date'].dt.day
test['Day'] = train['Date'].dt.day

train['Month'] = train['Date'].dt.month
test['Month'] = train['Date'].dt.month

train['Year'] = train['Date'].dt.year
test['Year'] = train['Date'].dt.year

# Drop Date column
train_date = train.pop("Date")
test_date = test.pop("Date")

In [9]:
# drop extra column from the training set
train.drop("#Order",axis=1, inplace=True)
# set the ID column as index as we are not allowed to use this column
train.set_index('ID', inplace=True)
test.set_index('ID', inplace=True)

In [10]:
# convert int cols that should be categorical
train['Holiday'] = train['Holiday'].map({0:"No", 1:"Yes"})
test['Holiday'] = test['Holiday'].map({0:"No", 1:"Yes"})

train['Day'] = train['Day'].astype(str)
test['Day'] = test['Day'].astype(str)

train['Month'] = train['Month'].astype(str)
test['Month'] = test['Month'].astype(str)

train['Year'] = train['Year'].astype(str)
test['Year'] = test['Year'].astype(str)

In [11]:
# seperate features and target
X_train = train.drop("Sales",axis=1).copy()
y_train = train['Sales'].copy()
X_test = test.copy()

In [12]:
# create preprocessing pipeline
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# select numerical and categorical columns
num_cols = X_train.select_dtypes(exclude="object").columns.tolist()
cat_cols = X_train.select_dtypes(include="object").columns.tolist()

# numerical pipeline
num_pipe = make_pipeline(SimpleImputer(strategy="mean"))

# categorical pipeline
cat_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="NA"),
    OneHotEncoder(handle_unknown="ignore", sparse=False),
)

# full pipeline for data preprocessing
full_pipe = ColumnTransformer(
    [("num", num_pipe, num_cols), ("cat", cat_pipe, cat_cols)]
)
full_pipe

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer())]),
                                 ['Store_id']),
                                ('cat',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(fill_value='NA',
                                                                strategy='constant')),
                                                 ('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False))]),
                                 ['Store_Type', 'Location_Type', 'Region_Code',
                                  'Holiday', 'Discount', 'Day', 'Month',
                                  'Year'])])

In [13]:
X_train = full_pipe.fit_transform(X_train)
X_test = full_pipe.transform(X_test)

In [14]:
full_pipe.named_transformers_.cat.named_steps.onehotencoder.categories_

[array(['S1', 'S2', 'S3', 'S4'], dtype=object),
 array(['L1', 'L2', 'L3', 'L4', 'L5'], dtype=object),
 array(['R1', 'R2', 'R3', 'R4'], dtype=object),
 array(['No', 'Yes'], dtype=object),
 array(['No', 'Yes'], dtype=object),
 array(['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19',
        '2', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29',
        '3', '30', '31', '4', '5', '6', '7', '8', '9'], dtype=object),
 array(['1', '10', '11', '12', '2', '3', '4', '5', '6', '7', '8', '9'],
       dtype=object),
 array(['2018', '2019'], dtype=object)]

In [15]:
# Get the list of categories generated by the one-hot-encoder
ohe_categories = full_pipe.named_transformers_.cat.named_steps.onehotencoder.categories_

# Create nice names for our one hot encoded features
new_ohe_features = [
    f"{col}__{val}" for col, vals in zip(cat_cols, ohe_categories) for val in vals
]

# Create a new list with all names of features
all_features = num_cols + new_ohe_features

In [16]:
# Create pandas dataframe
X_train = pd.DataFrame(X_train, columns=all_features)
X_test = pd.DataFrame(X_test, columns=all_features)

In [17]:
X_train.head()

Unnamed: 0,Store_id,Store_Type__S1,Store_Type__S2,Store_Type__S3,Store_Type__S4,Location_Type__L1,Location_Type__L2,Location_Type__L3,Location_Type__L4,Location_Type__L5,...,Month__2,Month__3,Month__4,Month__5,Month__6,Month__7,Month__8,Month__9,Year__2018,Year__2019
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,253.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,252.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,251.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,250.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [18]:
X_test.head()

Unnamed: 0,Store_id,Store_Type__S1,Store_Type__S2,Store_Type__S3,Store_Type__S4,Location_Type__L1,Location_Type__L2,Location_Type__L3,Location_Type__L4,Location_Type__L5,...,Month__2,Month__3,Month__4,Month__5,Month__6,Month__7,Month__8,Month__9,Year__2018,Year__2019
0,171.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,172.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,173.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,174.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,170.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## Train, validation and Test split

In [19]:
n = len(X_train)
X_train = X_train[0:int(n*0.7)]
y_train = y_train[0:int(n*0.7)]

X_valid = X_train[int(n*0.7):]
y_valid = y_train[int(n*0.7):]

X_test = X_test.copy()

### Normalize Data 

In [20]:
train_mean = X_train.mean()
train_std = X_train.std()

X_train = (X_train - train_mean) / train_std
X_valid = (X_valid - train_mean) / train_std
X_test = (X_test - train_mean) / train_std

## Time Series Generator

In [21]:
train_gen = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_train, y_train, length=365, sampling_rate=1, stride=1, batch_size=32
)

valid_gen = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    X_valid, y_valid, length=365, sampling_rate=1, stride=1, batch_size=32
)

# test_gen = tf.keras.preprocessing.sequence.TimeseriesGenerator(
#     X_train,length=365, sampling_rate=1, stride=1, batch_size=32
# )

ValueError: `start_index+length=365 > end_index=-1` is disallowed, as no part of the sequence would be left to be used as current step.