In [None]:
import pandas as pd
import random
import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import StandardScaler
import seaborn as sns


#### **1. GET THE DATA**

Import the data from CSV files

In [None]:
# Read the four raw CSV files into DataFrames, in the same order as the
# names on the left-hand side.
sales, sku, geo_params, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/raw/sales.csv',
        './data/raw/sku.csv',
        './data/raw/geo_params.csv',
        './data/raw/test.csv',
    )
)

#### **2. BASIC INSPECTION OF THE DATA**

##### sales

In [None]:
sales.head()

In [None]:
sales.shape

There are 4605985 rows and 6 columns. Let's check whether there are multiple records for the same sales ID

In [None]:
sales[sales.duplicated(subset=['ID'], keep= False)]

There are no multiple records for the same sales ID, which means that each sale is associated with only one product

In [None]:
sales.describe()

There are entries with price 0 and sales 0.001, which may be errors. Let's also check whether there are NaN entries

In [None]:
sales.isnull().values.any()

Let's see which columns

In [None]:
sales.isna().any()

In [None]:
# Smallest and largest value of the `date` column in the sales data,
# as a two-element list [min, max].
learning_dates = sales["date"].agg(["min", "max"]).tolist()
learning_dates

##### sku

In [None]:
sku.head()

In [None]:
sku.shape

There are 60 rows and 9 columns.

##### geoparams

In [None]:
geo_params.head()

In [None]:
geo_params.shape

##### test

In [None]:
test.head()

Convert date to same format as *sales*

In [None]:
# Parse the month/day/year strings and store them back as ISO 'YYYY-MM-DD'
# strings in a single pass. Formatting with strftime already yields strings,
# so no second to_datetime/astype(str) round-trip is needed.
# NOTE(review): a missing date now stays missing (NaN) instead of becoming
# the literal string 'NaT' — confirm the test set has no missing dates.
test['date'] = (
    pd.to_datetime(test['date'], format='%m/%d/%Y')
    .dt.strftime('%Y-%m-%d')
)
# Preview only the first rows instead of rendering the whole frame.
test.head()

In [None]:
test.shape

There are 202737 rows and 6 columns. Let's check whether there are duplicated records for the same ID

In [None]:
test[test.duplicated(subset=['ID'], keep= False)]

There are no duplicates

Let's analyze the quantitative columns

In [None]:
test.describe()

Let's see which dates the testing data covers:

In [None]:
# Smallest and largest value of the `date` column in the test data,
# as a two-element list [min, max].
testing_dates = test["date"].agg(["min", "max"]).tolist()
testing_dates

#### **3.  DATA CLEANING**

In the *sales* table there are entries with NaN values in the price and sales columns. Let's change them to 0.

In [None]:
# Replace nulls with 0. Rows are kept, not dropped, so the row count of
# `sales` does not change. Reassigning (instead of inplace=True) keeps the
# step an explicit transformation.
# NOTE(review): fillna(0) applies to every column of `sales`, not only the
# price and sales columns — confirm no other column contains nulls.
sales = sales.fillna(0)
# Confirm that there aren't any more nulls
sales.isnull().values.any()

In [None]:
sales.shape

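The records went from 4605985 to 778366. Note: this reduction only applies if the null records are removed with `dropna()`; because the cell above uses `fillna(0)`, which replaces nulls without removing rows, the row count stays at 4605985.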

Now, let's proceed to build a consolidated table. <br>
First, let's join *sales* with *geo_params* to associate each row in sales with a city

In [None]:
# Inner join on geoCluster: each sales row gains the geo_params columns;
# sales rows whose geoCluster has no match in geo_params are not kept.
sales_info = sales.merge(geo_params, on='geoCluster', how="inner")

In [None]:
sales_info

#### **4.  DATA EXPLORATION**

Let's build a time series for a specific SKU (choose one)

In [None]:
# SKU value used to filter sales_info for the time-series example.
sku_sample = 24

First, let's filter by SKU

In [None]:
# Keep only the rows of sales_info that belong to the chosen SKU.
is_sample_sku = sales_info["SKU"].eq(sku_sample)
subset = sales_info.loc[is_sample_sku]
subset

There are many geoClusters (locations) associated with a single SKU, so we have to analyze them at a more granular level

In [None]:
# Distinct geoCluster values present for the chosen SKU, in order of
# first appearance, as a NumPy array.
locations = subset['geoCluster'].drop_duplicates().to_numpy()
locations

Let's pick one to create the time series

In [None]:
# --RANDOMLY (but reproducibly)--
# A dedicated, seeded generator makes "Restart Kernel -> Run All" select the
# same geoCluster every time; change the seed to sample a different one.
geoc = random.Random(42).choice(locations)

#--PICKING A SPECIFIC GEOCLUSTER--
# geoc = 

# Filter by SKU and by the chosen geoCluster, ordered by ascending date.
# The frame is rebuilt from sales_info (not from `subset` itself) so the cell
# can be re-run safely, and sort_values returns a new frame instead of
# sorting a filtered slice in place (which can raise SettingWithCopyWarning).
subset = (
    sales_info
    .loc[(sales_info["SKU"] == sku_sample) & (sales_info["geoCluster"] == geoc)]
    .sort_values(by=['date'])
)
subset

In [None]:
#----------------------------------------------------------------
# GRAPH: sales over time for the selected SKU / geoCluster
#----------------------------------------------------------------

# size:
fig, ax = plt.subplots(figsize=(30, 6))

# title:
ax.set_title('Time Series for Sales', fontsize=20)

# x axis:
# x values: one position per record, following the current row order of
# `subset` (sorted by date in the previous step). The DataFrame index is
# not used because its labels come from the merged table and need not be
# consecutive or ordered by date.
x = list(range(len(subset)))
# x ticks: label every 7th record with its date.
my_xticks = subset['date']
ax.set_xticks(x[::7])
# rotation must be a number (or 'vertical'/'horizontal'); the string '90'
# is rejected by current matplotlib versions.
ax.set_xticklabels(my_xticks[::7].tolist(), rotation=90)
# x label
ax.set_xlabel("date", fontsize=16)

# y axis:
# y values
y = subset['sales']
# y label
ax.set_ylabel("sales", fontsize=16)

# create plot
ax.plot(x, y)
# add grids to plot
ax.grid(color='black', linestyle='-', linewidth=0.5)
# show plot
plt.show()

#----------------------------------------------------------------

#### **5.  DATA FORECASTING**

Let's build the forecasting procedure for the chosen SKU and geoCluster

First get the training and testing dates list

In [None]:
# Sorted list of the distinct dates present in the sales (training) data
training_dates = sorted(sales["date"].drop_duplicates())
# Sorted list of the distinct dates present in the test data
testing_dates = sorted(test["date"].drop_duplicates())

Choose the variables/columns to train with

In [None]:
# Names of the 5th and 6th columns of sales_info (positions 4 and 5).
# NOTE(review): the selection is positional, so it depends on the column
# order produced by the merge — confirm these are the intended features.
cols = sales_info.columns[4:6].tolist()
cols

Store them in one dataframe (convert to float so that normalization does not lose any information)

In [None]:
# Training frame: the selected columns of `subset`, cast to float.
df_for_training = subset.loc[:, cols].astype(float)
df_for_training

LSTM uses sigmoid and tanh that are sensitive to magnitude so values need to be normalized. Let's normalize as follows:


In [None]:
# Standardize the training columns. The fitted scaler is kept in `scaler`
# so the same transformation can be reused later.
scaler = StandardScaler()
df_for_training_scaled = scaler.fit(df_for_training).transform(df_for_training)
df_for_training_scaled

LSTM networks require the input data to be reshaped into n_samples x timesteps x n_features.<br>
In this example, n_features is 2 (the two columns selected in `cols`). We will use timesteps = 14 (the number of past days in each training window). 

In [None]:
# Two independent, empty accumulators: trainX collects the input windows,
# trainY the matching targets.
trainX, trainY = [], []

In [None]:
# Length of the look-back window: how many past days feed each prediction.
n_past = 14
# Forecast horizon: how many days ahead of the window the target lies.
n_future = 1

Reformat input data into a shape: (n_samples x timesteps x n_features)<br>

In [None]:
# Build the supervised-learning windows from the scaled series:
#   trainX[k] = the n_past rows preceding position i (all feature columns)
#   trainY[k] = column 0 of the row n_future steps after that window
# NOTE(review): column 0 is the first column listed in `cols` — confirm that
# it is the variable that should be forecast.
n_rows = len(df_for_training_scaled)
if n_rows < n_past + n_future:
    # Fail here with a clear message instead of producing empty arrays that
    # only break later, when the model reads trainX.shape[1].
    raise ValueError(
        'Not enough records to build a training window: got {} rows, '
        'need at least n_past + n_future = {}.'.format(n_rows, n_past + n_future)
    )

# Fresh local lists make the cell idempotent: trainX / trainY are replaced by
# arrays below, so appending to them directly would fail on a second run.
window_inputs, window_targets = [], []
for i in range(n_past, n_rows - n_future + 1):
    window_inputs.append(df_for_training_scaled[i - n_past:i, :])
    window_targets.append(df_for_training_scaled[i + n_future - 1:i + n_future, 0])

trainX, trainY = np.array(window_inputs), np.array(window_targets)

In [None]:
# Report the shapes of the windowed inputs and targets.
print(f'trainX shape == {trainX.shape}.')
print(f'trainY shape == {trainY.shape}.')

In [None]:
# Two stacked LSTM layers followed by dropout and a dense output layer.
# The input shape (timesteps, features) and the output width are read from
# the training arrays.
n_timesteps, n_features = trainX.shape[1], trainX.shape[2]
model = Sequential([
    # The first LSTM returns the full sequence so the second can consume it.
    LSTM(64, activation='relu', input_shape=(n_timesteps, n_features), return_sequences=True),
    LSTM(32, activation='relu', return_sequences=False),
    Dropout(0.2),
    Dense(trainY.shape[1]),
])

In [None]:
# Configure training (mean squared error loss, Adam optimizer), then print
# the layer-by-layer summary.
model.compile(loss='mse', optimizer='adam')
model.summary()