# Univariate Time Series Forecasting with Exogenous Variables

Perform EDA on the dataset to extract valuable insight about the process generating the time series.

In [None]:
!pip install pycaret
!pip install kaggle

Collecting pycaret
  Downloading pycaret-3.3.2-py3-none-any.whl.metadata (17 kB)
Collecting scipy<=1.11.4,>=1.6.1 (from pycaret)
  Downloading scipy-1.11.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting joblib<1.4,>=1.2.0 (from pycaret)
  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting pyod>=1.1.3 (from pycaret)
  Downloading pyod-2.0.2.tar.gz (165 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m165.8/165.8 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting category-encoders>=2.4.0 (from pycaret)
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting deprecation>=2.1.0 (from pycaret)
  Downloading deprecation-2.1.0-py2.py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from pycare

In [None]:
# Download the Rossmann Store Sales archive (rossmann-store-sales.zip) with the Kaggle CLI.
# NOTE(review): the Kaggle CLI presumably needs API credentials configured beforehand — confirm.
!kaggle datasets download -d pratyushakar/rossmann-store-sales

Dataset URL: https://www.kaggle.com/datasets/pratyushakar/rossmann-store-sales
License(s): CC0-1.0
Downloading rossmann-store-sales.zip to /content
 72% 5.00M/6.90M [00:00<00:00, 48.4MB/s]
100% 6.90M/6.90M [00:00<00:00, 61.9MB/s]


In [None]:
!unzip rossmann-store-sales.zip

Archive:  rossmann-store-sales.zip
  inflating: store.csv               
  inflating: test.csv                
  inflating: train.csv               


In [None]:
import pandas as pd
import numpy as np

from pycaret.time_series import *

In [None]:
# Load the Rossmann training data and parse the date column.
data = pd.read_csv('train.csv')
data['Date'] = pd.to_datetime(data['Date'])

# Index by the observation date, not by the target: the following cells select
# the 'Sales' column and call .resample('D'), which needs 'Sales' to remain a
# column and the index to be datetime-like.
data = data.set_index('Date')
data.head()

Unnamed: 0_level_0,Store,DayOfWeek,Date,Customers,Open,Promo,StateHoliday,SchoolHoliday
Sales,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5263,1,5,2015-07-31,555,1,1,0,1
6064,2,5,2015-07-31,625,1,1,0,1
8314,3,5,2015-07-31,821,1,1,0,1
13995,4,5,2015-07-31,1498,1,1,0,1
4822,5,5,2015-07-31,559,1,1,0,1


In [None]:
# Synthetic 'temperature' regressor: a sine wave oscillating between 10 and 30,
# laid out across the rows in their current order.
n_rows = len(data)
data['temperature'] = 20 + 10 * np.sin(np.linspace(0, 10, n_rows))

# Collapse the data to one row per calendar day by averaging all rows that
# share the same day, keeping only the two numeric columns of interest.
numeric_cols = ['Sales', 'temperature']
daily_groups = data[numeric_cols].resample('D')
data = daily_groups.mean()

# Sanity check: the resampled index should report a daily frequency.
print(data.index.freq)

<Day>


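**Note: the convention of tagging missing values as -200 ([Reference](https://archive.ics.uci.edu/ml/datasets/air+quality)) belongs to the UCI Air Quality dataset and appears to be carried over from another example; it does not apply to the Rossmann sales data used here. If a dataset does contain such sentinel values, they should be replaced with NaN so that `pycaret` can handle the imputation appropriately (preventing leakage of data during training).**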

In [None]:
# Split the daily frame into the series to forecast (mean daily sales) and the
# mock temperature column intended as the exogenous regressor.
exogenous = data[['temperature']]
target = data[['Sales']]

# Configure the PyCaret time-series experiment: 30-step forecast horizon,
# 3 cross-validation folds and a fixed session seed.
# NOTE(review): only `target` is handed to setup(), so `exogenous` is not part
# of the experiment — confirm whether the temperature column was meant to be
# included in `data`.
ts_setup = setup(
    data=target,
    target='Sales',
    fh=30,
    fold=3,
    session_id=123,
    seasonal_period='D',
)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,Sales
2,Approach,Univariate
3,Exogenous Variables,Not Present
4,Original data shape,"(942, 1)"
5,Transformed data shape,"(942, 1)"
6,Transformed train set shape,"(912, 1)"
7,Transformed test set shape,"(30, 1)"
8,Rows with missing values,0.0%
9,Fold Generator,ExpandingWindowSplitter


In [None]:
# Keep `data` indexed by its datetime index while also exposing the dates as a
# regular column.
# reset_index() can insert the index as a column only once: on a re-run the
# column already exists and pandas raises "cannot insert ..., already exists".
# Dropping the index in that case makes the cell idempotent and leaves the
# first-run result unchanged.
date_index = pd.to_datetime(data.index)
dates_already_a_column = data.index.name in data.columns
data = data.reset_index(drop=dates_already_a_column).set_index(date_index)

In [None]:
# Finalize a Prophet model; per PyCaret's documentation, finalize_model refits
# the estimator on the full dataset, hold-out included.
final_model = finalize_model('prophet')

# The 30 daily timestamps that immediately follow the last observed date.
last_observed = data.index[-1]
first_future_day = last_observed + pd.DateOffset(days=1)
future_dates = pd.date_range(start=first_future_day, periods=30, freq='D')

# Mock temperature values for those future dates, using the same sine formula
# as the historical mock column.
# NOTE(review): no visible cell consumes `future_exogenous`, and the setup
# summary reports no exogenous variables — confirm the intended use.
future_temperature = 20 + 10 * np.sin(np.linspace(0, 10, 30))
future_exogenous = pd.DataFrame({'temperature': future_temperature}, index=future_dates)

INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmp01w8btmg/9ejfltnm.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp01w8btmg/hzn_rmb_.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.10/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=38405', 'data', 'file=/tmp/tmp01w8btmg/9ejfltnm.json', 'init=/tmp/tmp01w8btmg/hzn_rmb_.json', 'output', 'file=/tmp/tmp01w8btmg/prophet_modelj3r065u1/prophet_model-20240927194045.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
INFO:cmdstanpy:Chain [1] start processing
INFO:cmdstanpy:Chain [1] done processing


In [None]:
# Render PyCaret's built-in 'forecast' plot for the finalized model.
plot_model(final_model, plot='forecast')