# Univariate Models

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
from statsmodels.tsa.arima.model import ARIMA

from src.data_preprocessing.data_loader import load_data, time_split
from src.modeling.evaluation import smape, mae
from src.modeling.univariate_modeling import get_best_cv_model

In [None]:
# Relative path to the project data folder: three levels up, then into the
# shared "hfactory_magic_folders" tree.
DATA_DIR = Path(
    "..",
    "..",
    "..",
    "hfactory_magic_folders",
    "plastic_cost_prediction",
    "data",
)
# File name of the dataset that is loaded from DATA_DIR.
MAIN_FILE = "PA6_cleaned_dataset.csv"

In [None]:
# load the dataset file MAIN_FILE from DATA_DIR with the project loader
df = load_data(DATA_DIR / MAIN_FILE)

In [None]:
# per-column model results; the cells below read, for every column, a
# "selected" entry (name of the final model) plus one entry per model
# whose "preds" hold that model's predictions for the test period
models = get_best_cv_model(df)

In [None]:
# colour used for every model type in the comparison plots
color_dict = dict(baseline="grey", ARIMA="orange", ETS="blue", XGB="green")

for col in models:
    col_models = models[col]

    # hold out the final 9 observations of the NaN-free series as test data
    series = df[col].copy().dropna()
    y_train, y_test = series.iloc[:-9], series.iloc[-9:]

    # one figure per column: observed values first, predictions on top
    plt.figure(figsize=(10, 6))
    plt.plot(y_train.index, y_train, label="Training data", color="black")
    plt.plot(y_test.index, y_test, label="Test data", color="red")

    for model in col_models:
        if model == "selected":
            # this entry only names the final model, there is nothing to draw
            continue

        if model == "XGB":
            # direct approach: separate predictions that do not form a
            # continuous path, so they are drawn as single points on every
            # third test date
            plt.plot(
                y_test.index[2::3],
                col_models[model]["preds"],
                ".",
                label=model,
                color=color_dict[model],
            )
        else:
            # recursive and baseline methods predict the whole test period
            # and are drawn as a line
            plt.plot(
                y_test.index,
                col_models[model]["preds"],
                label=model,
                color=color_dict[model],
            )

    # axis labels, legend and a title that names the final model
    plt.xlabel("Time")
    plt.ylabel("Values")
    plt.legend()
    plt.title(
        f"{col} - Actual vs Predictions \n Final Model: {models[col]['selected']}"
    )
    plt.show()

## Best Univariate Model for the Target

We want to investigate the average performance of the best model across folds for each time horizon (3, 6, and 9 months), as well as look at the residuals of each fold.

Note: The time horizon is defined by the goal of the project, i.e. giving predictions for 3, 6, and 9 months into the future.

In [None]:
# column of `df` whose best model is analysed in the cells below
target = "best_price_compound"

In [None]:
# get time series for target and cv-splits as well as best model
series = df[target].dropna()
target_model = models[target][models[target]["selected"]]
spl = time_split(series)

# NOTE(review): every fold below refits an ARIMA, so this cell presumably
# assumes the selected model entry carries "order" and "seasonal_order"
# (i.e. the selected model is an ARIMA) - confirm against get_best_cv_model.

# create lists for target/preds for 3, 6, and 9 months for each fold
preds_3, preds_6, preds_9 = [], [], []
target_3, target_6, target_9 = [], [], []

# create figure for residuals of each cv
plt.figure(figsize=(10, 4))
plt.axhline(
    0, color="red", linestyle="--", linewidth=2, label="Zero Residuals"
)
plt.title("Residuals of cv folds")
plt.xlabel("Time")
plt.ylabel("Residuals (y - y_hat)")

for fold, (train_idx, test_idx) in enumerate(spl):
    # create train and test data and append target for this fold
    train = series.iloc[train_idx]
    test = series.iloc[test_idx]
    # the horizons are positions inside the fold (3rd, 6th, 9th step), so
    # use .iloc: plain `test[2]` is a label lookup on an integer index and
    # a deprecated positional fallback on any other index
    target_3.append(test.iloc[2])
    target_6.append(test.iloc[5])
    target_9.append(test.iloc[8])

    # train model for fold
    model = ARIMA(
        train,
        order=target_model["order"],
        seasonal_order=target_model["seasonal_order"],
    )
    model_fit = model.fit()

    # use model to predict on test index
    # append predictions for 3, 6, and 9 months for fold
    preds = model_fit.predict(start=test.index[0], end=test.index[-1])
    preds_3.append(preds.iloc[2])
    preds_6.append(preds.iloc[5])
    preds_9.append(preds.iloc[8])

    # calculate residuals for all test data and add layer to plot for fold;
    # only the first fold gets a legend label so that the legend shows a
    # single "Residuals" entry instead of one per fold
    residuals = test - preds
    plt.plot(
        test.index,
        residuals,
        label="Residuals" if fold == 0 else "_nolegend_",
        color="blue",
    )

# render the labels defined above and display the finished figure
plt.legend()
plt.show()

In [None]:
# (targets, predictions) collected over the folds, one pair per horizon
horizon_pairs = (
    (target_3, preds_3),
    (target_6, preds_6),
    (target_9, preds_9),
)

# sMAPE for the 3, 6, and 9 month horizons
smape_3, smape_6, smape_9 = [
    smape(np.array(y_true), np.array(y_hat)) for y_true, y_hat in horizon_pairs
]

# MAE for the 3, 6, and 9 month horizons
mae_3, mae_6, mae_9 = [
    mae(np.array(y_true), np.array(y_hat)) for y_true, y_hat in horizon_pairs
]

In [None]:
# report both metrics for every horizon, one block of lines per horizon
print("** Fold-averaged results for time horizons **")
for months, smape_value, mae_value in (
    (3, smape_3, mae_3),
    (6, smape_6, mae_6),
    (9, smape_9, mae_9),
):
    print(
        f"{months} months:\n",
        f"- sMAPE: {smape_value:.2f}% \n",
        f"- MAE: {mae_value:.2f}",
    )