In [3]:
import pandas as pd
import numpy as np
import statsmodels.tsa.seasonal
from IPython.display import display
import plotly.express as px
import plotly.graph_objects as go

In [4]:
def run_sequence_plot(x, y, title, xtitle, ytitle):
    """Build a Plotly Express line chart of ``y`` against ``x``.

    Args:
        x: Values placed on the horizontal axis.
        y: Values placed on the vertical axis.
        title: Text used as the figure title.
        xtitle: Label for the x axis.
        ytitle: Label for the y axis.

    Returns:
        The figure produced by ``px.line`` after its layout (title, axis
        labels and a fixed width of 800) has been applied. The figure is
        returned, not shown; the caller decides when to render it.
    """
    layout_options = dict(
        title=title,
        width=800,
        xaxis_title=xtitle,
        yaxis_title=ytitle,
    )
    line_chart = px.line(x=x, y=y)
    line_chart.update_layout(**layout_options)
    return line_chart

# Read parquet dataset

In [85]:
# Load the raw measurements from the parquet file.
# NOTE(review): the label-based slice and the resample below presumably rely on
# the frame having a DatetimeIndex — confirm against the parquet schema.
df = pd.read_parquet("data/household.parquet")

# Exclude data before July 2007
df = df.loc["2007-07":]

# Aggregate to one row per week using the mean of each column.
# NOTE: despite its name, `df_monthly` holds WEEKLY means (resample rule "W");
# the name is kept unchanged because later cells reference it.
df_monthly = df.resample("W").mean()

# Column that is plotted here and modelled in the following cells.
var = "Global_active_power"

# The title states the aggregation that is actually computed above (weekly
# mean); it previously claimed "Monthly 99% percentile".
fig = run_sequence_plot(
    df_monthly.index, df_monthly[var], f"Weekly mean {var}", "Time", f"{var}"
)
fig.show()

# Cross-validation split

In [35]:
# Models
from statsmodels.tsa.api import ExponentialSmoothing
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pmdarima import auto_arima


In [86]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

# Time-ordered cross-validation: 5 splits, each scored on 3 test observations.
tscv = TimeSeriesSplit(n_splits=5, test_size=3)

# Per-fold accumulators; each entry holds the values for one fold's test window.
dates = []
y_real = []
y_pred_3sm = []
y_pred_autoarima = []
y_pred_sarima = []

for train, test in tscv.split(df_monthly.index):
    # Summarise the fold instead of dumping every positional index, which
    # floods the cell output with hundreds of integers per fold.
    print(
        f"Train: {train[0]}..{train[-1]} ({len(train)} obs)\n"
        f"Test: {test[0]}..{test[-1]} ({len(test)} obs)"
    )

    # Slice once per fold; all three models are fitted on the same window.
    y_train = df_monthly[var].iloc[train]
    y_test = df_monthly[var].iloc[test]
    horizon = len(test)

    # NOTE(review): all three models use a seasonal period of 12, while
    # `df_monthly` is built with resample rule "W" — confirm that 12 is the
    # intended period for this sampling frequency.
    triple_exp = ExponentialSmoothing(
        y_train, trend="add", seasonal="add", seasonal_periods=12
    ).fit(optimized=True)
    sar = SARIMAX(
        y_train, order=(1, 0, 0), seasonal_order=(0, 1, 1, 12), trend="t"
    ).fit()
    auto_model = auto_arima(
        y_train,
        start_p=0,
        start_q=0,
        max_p=3,
        max_q=3,
        m=12,
        start_P=0,
        seasonal=True,
        D=1,
        trace=True,
        error_action="ignore",
        suppress_warnings=True,
        stepwise=True,
    )

    # Record the test timestamps, the observed values and each model's
    # forecast over the same horizon.
    dates.append(y_test.index)
    y_real.append(y_test)
    y_pred_sarima.append(sar.forecast(steps=horizon))
    y_pred_autoarima.append(auto_model.predict(n_periods=horizon))
    y_pred_3sm.append(triple_exp.forecast(horizon))




Train: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165]
Test: [166 167 168]
Performing stepwise search to minimize aic
 ARIMA(0,0,0)(0,1,1)[12] intercept   : AIC=inf, Time=0.34 sec
 ARIMA(0,0,0)(0,1,0)[12] intercept   : AIC=134.665, Time=0.05 sec
 ARIMA(1,0,0)(1,1,0)[12] intercept   : AIC=16.581, Time=0.27 sec
 ARIMA(0,0,1)(0,1,1)[12] intercept   : AIC=inf, Time=0.56 sec



Maximum Likelihood optimization failed to converge. Check mle_retvals



Performing stepwise search to minimize aic
 ARIMA(0,0,0)(0,1,1)[12] intercept   : AIC=inf, Time=0.45 sec
 ARIMA(0,0,0)(0,1,0)[12] intercept   : AIC=143.832, Time=0.05 sec
 ARIMA(1,0,0)(1,1,0)[12] intercept   : AIC=26.967, Time=0.24 sec
 ARIMA(0,0,1)(0,1,1)[12] intercept   : AIC=inf, Time=0.47 sec
 ARIMA(0,0,0)(0,1,0)[12]             : AIC=141.905, Time=0.01 sec
 ARIMA(1,0,0)(0,1,0)[12] intercept   : AIC=117.387, Time=0.04 sec
 ARIMA(1,0,0)(2,1,0)[12] intercept   : AIC=22.676, Time=0.52 sec
 ARIMA(1,0,0)(2,1,1)[12] intercept   : AIC=inf, Time=1.63 sec
 ARIMA(1,0,0)(1,1,1)[12] intercept   : AIC=inf, Time=0.66 sec
 ARIMA(0,0,0)(2,1,0)[12] intercept   : AIC=88.530, Time=0.37 sec
 ARIMA(2,0,0)(2,1,0)[12] intercept   : AIC=14.736, Time=1.07 sec
 ARIMA(2,0,0)(1,1,0)[12] intercept   : AIC=19.939, Time=0.30 sec
 ARIMA(2,0,0)(2,1,1)[12] intercept   : AIC=inf, Time=2.54 sec
 ARIMA(2,0,0)(1,1,1)[12] intercept   : AIC=inf, Time=0.71 sec
 ARIMA(3,0,0)(2,1,0)[12] intercept   : AIC=16.029, Time=1.13 s

In [87]:
# Stitch the per-fold results into one long column per series.
# np.concatenate joins the folds end to end, so it does not depend on every
# fold having the same length the way np.array(...).flatten() does.
errors_df = pd.DataFrame(
    {
        "date": np.concatenate(dates),
        "real": np.concatenate(y_real),
        "sarima": np.concatenate(y_pred_sarima),
        "auto_arima": np.concatenate(y_pred_autoarima),
        "triple_smoothing": np.concatenate(y_pred_3sm),
    }
)

display(errors_df)

# Display label -> errors_df column, in the order the scores are reported.
model_columns = {
    "Triple Smoothing": "triple_smoothing",
    "Auto ARIMA": "auto_arima",
    "SARIMA": "sarima",
}

# One report per metric; each model is scored against the observed values
# pooled over all folds.
for metric_name, metric in (
    ("MAPE", mean_absolute_percentage_error),
    ("MSE", mean_squared_error),
):
    report = [f"{metric_name}:"]
    report += [
        f"{label}: {metric(errors_df['real'], errors_df[column]):.3f}"
        for label, column in model_columns.items()
    ]
    # Trailing "\n" keeps a blank line between the two reports.
    print("\n".join(report) + "\n")

Unnamed: 0,date,real,sarima,auto_arima,triple_smoothing
0,2010-09-05,1.208106,0.938446,0.942207,0.851068
1,2010-09-12,0.928914,0.87034,0.906164,0.712751
2,2010-09-19,0.843054,0.899644,1.031286,0.73219
3,2010-09-26,0.896605,1.023497,0.986179,0.974421
4,2010-10-03,1.42235,1.048989,1.087879,1.067673
5,2010-10-10,0.882893,1.026989,1.033958,0.986048
6,2010-10-17,1.134174,0.783306,0.89278,0.866086
7,2010-10-24,1.396573,0.861496,0.992419,0.97988
8,2010-10-31,0.958963,1.106105,1.131599,1.077074
9,2010-11-07,1.059359,0.864675,1.012112,0.970891


MAPE:
Triple Smoothing: 0.146
Auto ARIMA: 0.140
SARIMA: 0.170

MSE:
Triple Smoothing: 0.046
Auto ARIMA: 0.040
SARIMA: 0.057



In [88]:
# Observed series as the base line chart.
figure = px.line(x=df_monthly.index, y=df_monthly[var])

# Overlay each model's out-of-sample forecasts: (errors_df column, legend name).
forecast_traces = (
    ("sarima", "sarima"),
    ("auto_arima", "auto arima"),
    ("triple_smoothing", "3x smoothing"),
)
for column, legend_name in forecast_traces:
    figure.add_scatter(
        x=errors_df["date"], y=errors_df[column], mode="lines", name=legend_name
    )

# `df_monthly` is produced with resample("W").mean(), so the title states
# weekly means; it previously claimed "Monthly 99% percentile".
figure.update_layout(
    title=f"Weekly mean {var}",
    width=1000,
    xaxis_title="Time",
    yaxis_title=f"{var}",
)
figure.show()