In [6]:
import pandas as pd
import numpy as np

import plotly.express as px

# model evaluation
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error


from prophet import Prophet
from prophet.plot import plot_plotly, plot_components_plotly, plot_cross_validation_metric, add_changepoints_to_plot

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
def preprocess_data(df:pd.DataFrame) -> pd.DataFrame:
    df.date = pd.to_datetime(df.date)
    df['day_of_week'] = df['date'].dt.day_name()
    return df

# load train dataset | convert datatype of "date" column from "object" to "datetime"
stores_df = pd.read_csv("data/data/train.csv")
stores_df = preprocess_data(stores_df)

stores_df

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,day_of_week
0,0,2013-01-01,1,AUTOMOTIVE,0.000,0,Tuesday
1,1,2013-01-01,1,BABY CARE,0.000,0,Tuesday
2,2,2013-01-01,1,BEAUTY,0.000,0,Tuesday
3,3,2013-01-01,1,BEVERAGES,0.000,0,Tuesday
4,4,2013-01-01,1,BOOKS,0.000,0,Tuesday
...,...,...,...,...,...,...,...
3000883,3000883,2017-08-15,9,POULTRY,438.133,0,Tuesday
3000884,3000884,2017-08-15,9,PREPARED FOODS,154.553,1,Tuesday
3000885,3000885,2017-08-15,9,PRODUCE,2419.729,148,Tuesday
3000886,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8,Tuesday


In [10]:
# sum up sales for the day
def sum_sales_per_day(df: pd.DataFrame, store_number:int=3) -> pd.DataFrame:
    day_level_df = df[df["store_nbr"]==store_number]\
        [
            ["date", "sales", "day_of_week"]
        ]\
            .groupby("date").agg(
                {
                    "sales": "sum",
                    "day_of_week": "first"
                }).reset_index()

    return day_level_df


day_level_df = sum_sales_per_day(stores_df)

day_level_df.head()

Unnamed: 0,date,sales,day_of_week
0,2013-01-01,0.0,Tuesday
1,2013-01-02,24060.348,Wednesday
2,2013-01-03,18570.745025,Thursday
3,2013-01-04,17392.097995,Friday
4,2013-01-05,22700.872005,Saturday


In [None]:
# visualize sales
fig = px.line(day_level_df, x='date', y=["sales"], markers=True, title="Store sales")
fig.show()

In [14]:
# plot sales per each day of week
fig = px.box(day_level_df, x='day_of_week', y="sales", color="day_of_week",
             boxmode="overlay", points='all')
fig.update_layout(
    margin=dict(l=20, r=20, t=30, b=20),
    paper_bgcolor="LightSteelBlue",
    width=1400,
    height=700,
    title='Weekdays sales distribution',
)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [16]:
from statsmodels.tsa.stattools import adfuller

adftest = adfuller(day_level_df[:30].set_index('date')['sales'].dropna()) #autolag = 'AIC', regression = 'n')
print("ADF Test Results")
print("Null Hypothesis: The series has an Unit Root")
print("P-Value:", adftest[1])

ADF Test Results
Null Hypothesis: The series has an Unit Root
P-Value: 0.5817703833310451


In [17]:
# replace Zero values on NaN
day_level_df["sales"] = day_level_df["sales"].mask(day_level_df["sales"] == float(0), None)
day_level_df.head()

Unnamed: 0,date,sales,day_of_week
0,2013-01-01,,Tuesday
1,2013-01-02,24060.348,Wednesday
2,2013-01-03,18570.745025,Thursday
3,2013-01-04,17392.097995,Friday
4,2013-01-05,22700.872005,Saturday


In [18]:
# amount of Nan values
day_level_df["sales"].isna().sum()
print(f'NaN value counts - {day_level_df["sales"].isna().sum()}')

NaN value counts - 5


In [20]:
# load holidays event
event_df = pd.read_csv("data/data/holidays_events.csv")
event_df = preprocess_data(event_df)

event_df.head()


Unnamed: 0,date,type,locale,locale_name,description,transferred,day_of_week
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False,Friday
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False,Sunday
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False,Thursday
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False,Saturday
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False,Saturday


In [21]:
# merge sales data with holidays event data
event_df['date'] = pd.to_datetime(event_df['date'])

day_level_df[day_level_df['sales'].isna()].merge(
    event_df[["date", "description"]],
    how="left"
)

Unnamed: 0,date,sales,day_of_week,description
0,2013-01-01,,Tuesday,Primer dia del ano
1,2014-01-01,,Wednesday,Primer dia del ano
2,2015-01-01,,Thursday,Primer dia del ano
3,2016-01-01,,Friday,Primer dia del ano
4,2017-01-01,,Sunday,Primer dia del ano


In [22]:
# choose filling zeroes for this dataset
day_level_df["sales"] = day_level_df["sales"].mask(day_level_df["sales"] == float(0), None)
day_level_df.fillna(0, inplace=True)

In [None]:
# Calculate Simple Moving Average
window_size = 30
MA_dataset = day_level_df.copy()
MA_dataset['MA_score'] = MA_dataset['sales'].rolling(window=window_size).mean()

# Plot actual and forecasted data
fig = px.line(MA_dataset, x='date', y=["sales", "MA_score"], markers=True, title="MA forecast")

# Show plot
fig.show()

In [24]:
# model evaluation
def evaluate_forecasting_model(actual_values:pd.Series, predicted_values:pd.Series, round_nbr:int=2) -> None:
    mape = mean_absolute_percentage_error(
        actual_values,
        predicted_values
    )
    mae = mean_absolute_error(
        actual_values,
        predicted_values
    )
    mse = mean_squared_error(
        actual_values,      
        predicted_values
    )

    print(f"MAE - {round(mae, round_nbr)}")
    print(f"MSE - {round(mse, round_nbr)}")
    print(f"MAPE - {round(mape, round_nbr)}")

evaluate_forecasting_model(
    actual_values=MA_dataset[-window_size:]['sales'],
    predicted_values=MA_dataset[-window_size:]['MA_score']
)

MAE - 4163.35
MSE - 24398107.21
MAPE - 0.13


In [25]:
# preprocess data to needed format
fbp_set = day_level_df[['date', 'sales']]
fbp_set.rename(columns={"date": "ds", "sales":"y"}, inplace=True)
fbp_set.fillna(0, inplace=True)
fbp_set.head()

Unnamed: 0,ds,y
0,2013-01-01,0.0
1,2013-01-02,24060.348
2,2013-01-03,18570.745025
3,2013-01-04,17392.097995
4,2013-01-05,22700.872005


In [26]:
# preprocess holidays dataframe
holiday_df = event_df.copy()
holiday_df.rename(columns={"date": "ds", "description":"holiday"}, inplace=True)
holiday_df = holiday_df[["ds", "holiday"]]
holiday_df.head()


Unnamed: 0,ds,holiday
0,2012-03-02,Fundacion de Manta
1,2012-04-01,Provincializacion de Cotopaxi
2,2012-04-12,Fundacion de Cuenca
3,2012-04-14,Cantonizacion de Libertad
4,2012-04-21,Cantonizacion de Riobamba


In [27]:
# split dataframe on train and test
window = 30
train, test = fbp_set[:-window], fbp_set[-window:]

In [28]:
# init and fit the model
model = Prophet(holidays=holiday_df)
model.fit(train)

# Create Future dates
future_sales_30_days = model.make_future_dataframe(periods=30, freq='D')
future_sales_180_days = model.make_future_dataframe(periods=180, freq='D')
future_sales_270_days = model.make_future_dataframe(periods=270, freq='D')
future_sales_365_days = model.make_future_dataframe(periods=365, freq='D')

# Predict sales
forecast_30_days = model.predict(future_sales_30_days)
forecast_180_days = model.predict(future_sales_180_days)
forecast_270_days = model.predict(future_sales_270_days)
forecast_365_days = model.predict(future_sales_365_days)

19:10:26 - cmdstanpy - INFO - Chain [1] start processing
19:10:26 - cmdstanpy - INFO - Chain [1] done processing


In [29]:
# evaluate forecasting for 30 days
benchmark_df = test.merge(forecast_30_days[["ds", "yhat"]], on="ds", how="left")
evaluate_forecasting_model(
    actual_values=benchmark_df['y'],
    predicted_values=benchmark_df['yhat'],
    round_nbr=3
)

MAE - 3580.227
MSE - 18256703.408
MAPE - 0.105


In [30]:
# evaluate forecasting for 180 days
benchmark_df = test.merge(forecast_180_days[["ds", "yhat"]], on="ds", how="left")
evaluate_forecasting_model(
    actual_values=benchmark_df['y'],
    predicted_values=benchmark_df['yhat'],
    round_nbr=3
)

MAE - 3580.227
MSE - 18256703.408
MAPE - 0.105


In [31]:
# evaluate forecasting for 270 days
benchmark_df = test.merge(forecast_270_days[["ds", "yhat"]], on="ds", how="left")
evaluate_forecasting_model(
    actual_values=benchmark_df['y'],
    predicted_values=benchmark_df['yhat'],
    round_nbr=3
)

MAE - 3580.227
MSE - 18256703.408
MAPE - 0.105


In [32]:
# evaluate forecasting for 365 days
benchmark_df = test.merge(forecast_365_days[["ds", "yhat"]], on="ds", how="left")
evaluate_forecasting_model(
    actual_values=benchmark_df['y'],
    predicted_values=benchmark_df['yhat'],
    round_nbr=3
)

MAE - 3580.227
MSE - 18256703.408
MAPE - 0.105
