Problem Statement: Create a predictive model that can forecast future sales of a retail store by analyzing its historical sales data, weather conditions, public holidays and promotions. The objective is to provide accurate sales forecasts for the next 3 months to help the store management optimize their inventory, staffing, and marketing strategies.
The solution should be able to identify seasonal trends and long-term trends in the sales data and adjust the forecasts accordingly. This will enable the store management to make informed decisions about the quantity and variety of products to stock, the number of employees to hire, and the timing and content of marketing campaigns.
A successful implementation of this solution will help the retail store improve profitability and customer satisfaction by enhancing its operational efficiency and meeting customer demand. Additionally, this solution can have a broader impact on the retail industry, as it can be applied to other retail stores and help them improve their sales forecasting capabilities.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np

import plotly.graph_objects as go
from prophet.plot import plot_plotly, plot_components_plotly

from prophet import Prophet
from prophet.diagnostics import cross_validation, performance_metrics

import warnings
warnings.filterwarnings('ignore')

In [None]:
sales = pd.read_csv('/content/drive/MyDrive/archive (7)/sales data-set.csv')
features = pd.read_csv('/content/drive/MyDrive/archive (7)/Features data set.csv')

In [None]:
sales

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2010-05-02,24924.50,False
1,1,1,2010-12-02,46039.49,True
2,1,1,2010-02-19,41595.55,False
3,1,1,2010-02-26,19403.54,False
4,1,1,2010-05-03,21827.90,False
...,...,...,...,...,...
421565,45,98,2012-09-28,508.37,False
421566,45,98,2012-05-10,628.10,False
421567,45,98,2012-12-10,1061.02,False
421568,45,98,2012-10-19,760.01,False


In [None]:
features

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
0,1,05/02/2010,42.31,2.572,,,,,,211.096358,8.106,False
1,1,12/02/2010,38.51,2.548,,,,,,211.242170,8.106,True
2,1,19/02/2010,39.93,2.514,,,,,,211.289143,8.106,False
3,1,26/02/2010,46.63,2.561,,,,,,211.319643,8.106,False
4,1,05/03/2010,46.50,2.625,,,,,,211.350143,8.106,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8185,45,28/06/2013,76.05,3.639,4842.29,975.03,3.00,2449.97,3169.69,,,False
8186,45,05/07/2013,77.50,3.614,9090.48,2268.58,582.74,5797.47,1514.93,,,False
8187,45,12/07/2013,79.37,3.614,3789.94,1827.31,85.72,744.84,2150.36,,,False
8188,45,19/07/2013,82.84,3.737,2961.49,1047.07,204.19,363.00,1059.46,,,False


In [None]:
sales.describe(include='all')

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
count,421570.0,421570.0,421570,421570.0,421570
unique,,,143,,2
top,,,23/12/2011,,False
freq,,,3027,,391909
mean,22.200546,44.260317,,15981.258123,
std,12.785297,30.492054,,22711.183519,
min,1.0,1.0,,-4988.94,
25%,11.0,18.0,,2079.65,
50%,22.0,37.0,,7612.03,
75%,33.0,74.0,,20205.8525,


In [None]:
sales.Date = pd.to_datetime(sales.Date)
sales.Date.dtype

dtype('<M8[ns]')

In [None]:
store_1 = sales[(sales.Store == 1) & (sales.Dept == 1)].sort_values('Date')
store_1.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
34,1,1,2010-01-10,20094.19,False
8,1,1,2010-02-04,57258.43,False
21,1,1,2010-02-07,16333.14,False
2,1,1,2010-02-19,41595.55,False
3,1,1,2010-02-26,19403.54,False


In [None]:
df = store_1[['Date', 'Weekly_Sales']]
df.columns = ['ds', 'y']

In [None]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=df['ds'],
                         y=df['y'],
                         marker_color='#9597fb'))

fig.update_layout(xaxis_title="Date",
                  yaxis_title="Weekly Sales (US$)",
                  title="Weekly Sales (US$) Throughout 2010, 2011 and 2012")
fig.show()

In [None]:
m = Prophet(interval_width=.95,
            daily_seasonality=False,
            weekly_seasonality=True).fit(df)

future = m.make_future_dataframe(periods=5, freq='W') # Forecasting 5 weeks into the future.
forecast = m.predict(future)

plot_plotly(m, forecast) # Plotting.

DEBUG:cmdstanpy:input tempfile: /tmp/tmprj06nf8d/nov1saoq.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprj06nf8d/ix3z42uc.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.9/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=36680', 'data', 'file=/tmp/tmprj06nf8d/nov1saoq.json', 'init=/tmp/tmprj06nf8d/ix3z42uc.json', 'output', 'file=/tmp/tmprj06nf8d/prophet_modelvgru9b6i/prophet_model-20230330182945.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
18:29:45 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
18:29:45 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


In [None]:
plot_components_plotly(m, forecast)

In [None]:
class suppress_stdout_stderr(object):

    def __init__(self):
        self.null_fds = [os.open(os.devnull, os.O_RDWR) for x in range(2)]
        self.save_fds = (os.dup(1), os.dup(2))

    def __enter__(self):
        os.dup2(self.null_fds[0], 1)
        os.dup2(self.null_fds[1], 2)

    def __exit__(self, *_):
        os.dup2(self.save_fds[0], 1)
        os.dup2(self.save_fds[1], 2)
        os.close(self.null_fds[0])
        os.close(self.null_fds[1])

In [None]:
def getCrossValidationData(m):
    with suppress_stdout_stderr():
        c_v = cross_validation(m,
                               initial='120W',   # Initially, the model will be trained in 120 weeks.
                               period='2W',      # After each model tested, we'll add 2 more weeks.
                               horizon ='2W',    # The forecasting will happen in a range of 2 weeks.
                               parallel="processes",   # To acellerate the cross-validation.
                              )
    return c_v

def getPerfomanceMetrics(m):
    return performance_metrics(getCrossValidationData(m),
                               rolling_window=1, # Generate metrics for the whole (100%) seen data.
                              )

In [None]:
import os

In [None]:
getPerfomanceMetrics(m).mean()

INFO:prophet:Making 14 forecasts with cutoffs between 2012-05-07 00:00:00 and 2012-11-26 00:00:00
INFO:prophet:Applying in parallel with <concurrent.futures.process.ProcessPoolExecutor object at 0x7f37c1b2e730>
DEBUG:cmdstanpy:input tempfile: /tmp/tmprj06nf8d/_bk3bhx4.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprj06nf8d/pp7e9ecr.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprj06nf8d/r2au5chy.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:input tempfile: /tmp/tmprj06nf8d/nkha3msd.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.9/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=44792', 'data', 'file=/tmp/tmprj06nf8d/pp7e9ecr.json', 'init=/tmp/tmprj06nf8d/r2au5chy.json', 'output', 'file=/tmp/tmprj06nf8d/prophet_modelpg9kwqg0/prophet_model-20230330181307.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
18:13:07 - cmdstanpy - INFO - Chain [1] start processing
DEBUG:cmdstanpy:running

horizon     14 days 00:00:00
mse         106520133.658835
rmse            10320.859153
mae              6444.558743
mape                0.278483
mdape               0.134906
smape               0.252406
coverage            0.851852
dtype: object

In [None]:
superbowls = pd.DataFrame({
  'holiday': 'superbowl',
  'ds': pd.to_datetime(['2010-02-07', '2011-02-06', '2012-02-05', '2013-02-03']),
  'lower_window': -2,
  'upper_window': 2,
})
easter = pd.DataFrame({
  'holiday': 'easter',
  'ds': pd.to_datetime(['2010-04-05', '2011-04-25', '2012-04-09', '2013-04-01']),
  'lower_window': -2,
  'upper_window': 1,
})
mothers_day = pd.DataFrame({
    'holiday': "mother's day",
    'ds': pd.to_datetime(['2010-05-09', '2011-05-08', '2012-05-13', '2013-02-12']),
    'lower_window': -3,
    'upper_window': 0,
})
fathers_day = pd.DataFrame({
    'holiday': "father's day",
    'ds': pd.to_datetime(['2010-06-19', '2011-06-19', '2012-06-17', '2013-06-16']),
    'lower_window': -3,
    'upper_window': 0,
})
halloween = pd.DataFrame({
    'holiday': "father's day",
    'ds': pd.to_datetime(['2010-10-31', '2011-10-31', '2012-10-31', '2013-10-31']),
    'lower_window': -3,
    'upper_window': 2,
})
black_friday = pd.DataFrame({
    'holiday': "black friday",
    'ds': pd.to_datetime(['2010-11-26', '2011-11-25', '2012-11-23', '2013-11-29']),
    'lower_window': 0,
    'upper_window': 0,
})
cyber_monday = pd.DataFrame({
    'holiday': "cyber monday",
    'ds': pd.to_datetime(['2010-11-29', '2011-11-28', '2012-12-26', '2013-12-02']),
    'lower_window': 0,
    'upper_window': 0,
})

holidays = pd.concat((superbowls,
                      easter,
                      mothers_day,
                      fathers_day,
                      halloween,
                      black_friday,
                      cyber_monday))

In [None]:
m = Prophet(holidays=holidays,
            interval_width=.95,
            daily_seasonality=False)
m.add_country_holidays(country_name='US')
with suppress_stdout_stderr():
    m.fit(df)
future = m.make_future_dataframe(periods=5, freq='W')
forecast = m.predict(future)
plot_plotly(m, forecast)

DEBUG:cmdstanpy:input tempfile: /tmp/tmprj06nf8d/vftss2l4.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprj06nf8d/xmlr2ql9.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.9/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=30127', 'data', 'file=/tmp/tmprj06nf8d/vftss2l4.json', 'init=/tmp/tmprj06nf8d/xmlr2ql9.json', 'output', 'file=/tmp/tmprj06nf8d/prophet_modelz5jpbpxr/prophet_model-20230330181309.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
18:13:09 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
18:13:09 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


In [None]:
getPerfomanceMetrics(m).mean()

INFO:prophet:Making 14 forecasts with cutoffs between 2012-05-07 00:00:00 and 2012-11-26 00:00:00
INFO:prophet:Applying in parallel with <concurrent.futures.process.ProcessPoolExecutor object at 0x7f37c1703400>
DEBUG:cmdstanpy:input tempfile: /tmp/tmprj06nf8d/x4ueqow6.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprj06nf8d/hhn8rpgp.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:input tempfile: /tmp/tmprj06nf8d/ju8wbvz5.json
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.9/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=39439', 'data', 'file=/tmp/tmprj06nf8d/x4ueqow6.json', 'init=/tmp/tmprj06nf8d/hhn8rpgp.json', 'output', 'file=/tmp/tmprj06nf8d/prophet_model5aysi2j2/prophet_model-20230330181309.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
18:13:09 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
DEBUG:cmdstanpy:input tempfile: /tmp/tmprj06nf8d/yzr_jqh3.json
DEB

horizon     14 days 00:00:00
mse         121199832.523461
rmse            11009.079549
mae              6881.704443
mape                0.301894
mdape               0.149187
smape               0.264729
coverage            0.777778
dtype: object

In [None]:
store_1f = features[features.Store == 1].drop(['Store', 'CPI', 'Unemployment', 'IsHoliday'], axis=1) # Picking only useful regressors for Store 1.

store_1f['Date'] = pd.to_datetime(store_1f['Date']) # Converting it to datetime type.

store_1f.fillna(0, inplace=True)# Filling null values (markdowns) with 0.

df = store_1f.merge(store_1[['Date', 'Weekly_Sales']], on='Date') # Merging a new dataframe with the regressors.

df.rename({'Date': 'ds', 'Weekly_Sales': 'y'}, axis=1, inplace=True) # Renaming columns.
store_1f.rename({'Date': 'ds'}, axis=1, inplace=True)

regressors = df.drop(['ds', 'y'], axis=1).columns # Defining regressors names iterable.

In [None]:
df.tail() # The regressors are the temperature, fuel price and markdowns 1 to 5.

Unnamed: 0,ds,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,y
138,2012-09-28,76.08,3.666,3666.27,7.64,1.65,1417.96,4744.28,18947.81
139,2012-05-10,68.55,3.617,8077.89,0.0,18.22,3617.43,3626.14,21904.47
140,2012-12-10,62.99,3.601,2086.18,0.0,8.11,602.36,5926.45,22764.01
141,2012-10-19,67.97,3.594,950.33,0.0,4.93,80.25,2312.85,24185.27
142,2012-10-26,69.16,3.506,2585.85,31.75,6.0,1057.16,1305.01,27390.81


In [None]:
store_1f

Unnamed: 0,ds,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5
0,2010-05-02,42.31,2.572,0.00,0.00,0.00,0.00,0.00
1,2010-12-02,38.51,2.548,0.00,0.00,0.00,0.00,0.00
2,2010-02-19,39.93,2.514,0.00,0.00,0.00,0.00,0.00
3,2010-02-26,46.63,2.561,0.00,0.00,0.00,0.00,0.00
4,2010-05-03,46.50,2.625,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...
177,2013-06-28,83.94,3.495,4205.98,796.70,6.84,3816.78,4812.74
178,2013-05-07,79.85,3.422,7649.99,3503.29,1766.77,9454.96,1079.89
179,2013-12-07,83.12,3.400,6089.94,1362.42,209.62,2367.42,2651.05
180,2013-07-19,79.26,3.556,3117.04,1060.39,199.05,1012.30,5381.72


In [None]:
m = Prophet(interval_width=.95,
            daily_seasonality=False)

for regressor in regressors: # Adding the regressors.
    m.add_regressor(regressor)

with suppress_stdout_stderr(): # Training.
    m.fit(df)

# The forecast must have the same regressors.
# In this case we're going to forecast until the end of March of the next year.
future = store_1f[store_1f.ds < '2013-03-31']
forecast = m.predict(future)
plot_plotly(m, forecast)


DEBUG:cmdstanpy:input tempfile: /tmp/tmprj06nf8d/m7y9ppvd.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprj06nf8d/xsdxg7zr.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.9/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=96343', 'data', 'file=/tmp/tmprj06nf8d/m7y9ppvd.json', 'init=/tmp/tmprj06nf8d/xsdxg7zr.json', 'output', 'file=/tmp/tmprj06nf8d/prophet_modelnxhbdcpv/prophet_model-20230330185123.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
18:51:23 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
18:51:23 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


In [None]:
future[(future.ds > '2013-01') & (future.ds < '2013-04')]

Unnamed: 0,ds,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5
154,2013-01-18,42.92,3.237,3772.69,3559.46,3.88,246.62,1900.4
155,2013-01-25,53.37,3.227,965.89,1097.91,0.1,225.36,1831.88
156,2013-01-02,56.46,3.244,9290.91,1359.9,265.0,20657.82,972.61
158,2013-02-15,49.66,3.475,72937.29,6665.52,47.21,13014.67,6310.18
159,2013-02-22,50.25,3.597,20107.75,3163.89,42.2,15657.3,5812.86
160,2013-01-03,48.01,3.711,10610.74,261.46,2.8,25.54,2747.59
162,2013-03-15,55.33,3.622,3808.13,0.0,15.65,2616.6,1909.17
163,2013-03-22,63.42,3.611,12553.98,0.0,495.1,6787.75,2545.66
164,2013-03-29,51.0,3.606,13067.46,0.0,384.9,122.93,3903.8
169,2013-03-05,66.66,3.386,2298.63,2.0,129.9,55.46,1301.04


In [None]:
future.tail().sort_values(by="ds")

Unnamed: 0,ds,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5
160,2013-01-03,48.01,3.711,10610.74,261.46,2.8,25.54,2747.59
169,2013-03-05,66.66,3.386,2298.63,2.0,129.9,55.46,1301.04
162,2013-03-15,55.33,3.622,3808.13,0.0,15.65,2616.6,1909.17
163,2013-03-22,63.42,3.611,12553.98,0.0,495.1,6787.75,2545.66
164,2013-03-29,51.0,3.606,13067.46,0.0,384.9,122.93,3903.8


In [None]:
getPerfomanceMetrics(m).mean()

INFO:prophet:Making 14 forecasts with cutoffs between 2012-05-07 00:00:00 and 2012-11-26 00:00:00
INFO:prophet:Applying in parallel with <concurrent.futures.process.ProcessPoolExecutor object at 0x7f37c1adfca0>
DEBUG:cmdstanpy:input tempfile: /tmp/tmprj06nf8d/z8gc93ng.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprj06nf8d/4o6dv1eo.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.9/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=6148', 'data', 'file=/tmp/tmprj06nf8d/z8gc93ng.json', 'init=/tmp/tmprj06nf8d/4o6dv1eo.json', 'output', 'file=/tmp/tmprj06nf8d/prophet_model0zawng8e/prophet_model-20230330185152.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
18:51:52 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
DEBUG:cmdstanpy:input tempfile: /tmp/tmprj06nf8d/s2th37fr.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprj06nf8d/7ouwppqn.json
DEBU

horizon     14 days 00:00:00
mse          85215167.547006
rmse             9231.206181
mae              6006.263973
mape                0.268047
mdape                0.17647
smape               0.254654
coverage            0.925926
dtype: object