In [None]:
import pandas as pd
import numpy as np
import statsmodels.tsa.seasonal
from IPython.display import display
import plotly.express as px
import plotly.graph_objects as go

In [None]:
def run_sequence_plot(x, y, title, xtitle, ytitle, width=800):
    """Build a Plotly Express line chart of ``y`` against ``x``.

    Parameters
    ----------
    x, y
        Data forwarded to ``px.line`` for the horizontal and vertical axes.
    title
        Figure title.
    xtitle, ytitle
        Titles for the x axis and the y axis.
    width
        Figure width handed to ``update_layout``. Defaults to 800, the value
        that used to be hard-coded, so existing callers are unaffected.

    Returns
    -------
    The figure created by ``px.line`` with the layout applied. The figure is
    not shown here; the caller decides when to call ``.show()``.
    """
    figure = px.line(x=x, y=y)
    figure.update_layout(
        title=title, width=width, xaxis_title=xtitle, yaxis_title=ytitle
    )
    return figure

# Read parquet dataset

In [None]:
# Column that is plotted here and reused by the cells below.
var = "Global_active_power"

# Load the household data and exclude everything before July 2007
# (label-based slice on the index; presumably a DatetimeIndex, since the
# frame is resampled right after -- confirm against the parquet file).
df = pd.read_parquet("data/household.parquet").loc["2007-07":]

# Monthly aggregation: the 0.99 quantile of each column within every month.
df_monthly = df.resample("M").quantile(0.99)

fig = run_sequence_plot(
    x=df_monthly.index,
    y=df_monthly[var],
    title=f"Monthly 99% percentile {var}",
    xtitle="Time",
    ytitle=f"{var}",
)
fig.show()

# Cross-validation split

In [None]:
# Models
from statsmodels.tsa.api import ExponentialSmoothing
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pmdarima import auto_arima

In [None]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

# Time-ordered evaluation: 4 successive folds, each holding out 3 rows.
tscv = TimeSeriesSplit(n_splits=4, test_size=3)

# Per-fold accumulators; entry k of every list belongs to fold k.
dates, y_real = [], []
y_pred_3sm, y_pred_autoarima, y_pred_sarima = [], [], []

# Search settings passed unchanged to auto_arima on every fold.
auto_arima_options = dict(
    start_p=0,
    start_q=0,
    max_p=3,
    max_q=3,
    m=12,
    start_P=0,
    seasonal=True,
    D=1,
    trace=True,
    error_action="ignore",
    suppress_warnings=True,
    stepwise=True,
)

for train, test in tscv.split(df_monthly.index):
    print(f"Train: {train}\nTest: {test}")

    # Slice the target once per fold; all three models train on the same data.
    train_series = df_monthly[var].iloc[train]
    test_series = df_monthly[var].iloc[test]
    horizon = len(test)

    # Additive trend + additive seasonality with a 12-step season.
    triple_exp = ExponentialSmoothing(
        train_series, trend="add", seasonal="add", seasonal_periods=12
    ).fit(optimized=True)

    # Hand-specified seasonal ARIMA with a linear time trend term.
    sar = SARIMAX(
        train_series,
        order=(1, 0, 0),
        seasonal_order=(0, 1, 1, 12),
        trend="t",
    ).fit()

    # Order selection delegated to pmdarima.
    auto_model = auto_arima(train_series, **auto_arima_options)

    # Store the held-out dates/values and each model's forecast for them.
    dates.append(test_series.index)
    y_real.append(test_series)
    y_pred_sarima.append(sar.forecast(steps=horizon))
    y_pred_autoarima.append(auto_model.predict(n_periods=horizon))
    y_pred_3sm.append(triple_exp.forecast(horizon))

In [None]:
# Stitch the per-fold results into one long table with one row per held-out
# date. np.concatenate joins the folds end to end, so it also works when folds
# differ in length; np.array(...).flatten() requires equally sized folds.
fold_outputs = {
    "date": dates,
    "real": y_real,
    "sarima": y_pred_sarima,
    "auto_arima": y_pred_autoarima,
    "triple_smoothing": y_pred_3sm,
}
errors_df = pd.DataFrame(
    {column: np.concatenate(chunks) for column, chunks in fold_outputs.items()}
)

display(errors_df)

# Report label -> errors_df column, in the order the models are printed.
model_columns = {
    "Triple Smoothing": "triple_smoothing",
    "Auto ARIMA": "auto_arima",
    "SARIMA": "sarima",
}

# One printed section per metric, computed over all held-out points pooled
# together. Each section is the metric name followed by one line per model
# (3 decimals) and a trailing blank line.
metrics = {
    "MAPE": mean_absolute_percentage_error,
    "MSE": mean_squared_error,
}
for metric_name, metric_fn in metrics.items():
    report = [f"{metric_name}:"]
    report += [
        f"{label}: {metric_fn(errors_df.real, errors_df[column]):.3f}"
        for label, column in model_columns.items()
    ]
    print("\n".join(report) + "\n")

In [None]:
# Base trace: the observed monthly series.
figure = px.line(x=df_monthly.index, y=df_monthly[var])

# Overlay the cross-validation forecasts of each model as extra line traces.
# Each pair is (errors_df column, legend name).
for column, trace_name in (
    ("sarima", "sarima"),
    ("auto_arima", "auto arima"),
    ("triple_smoothing", "3x smoothing"),
):
    figure.add_scatter(
        x=errors_df.date, y=errors_df[column], mode="lines", name=trace_name
    )

layout_options = dict(
    title=f"Monthly 99% percentile {var}",
    width=1000,
    xaxis_title="Time",
    yaxis_title=f"{var}",
)
figure.update_layout(**layout_options)
figure.show()