# Univariate Models

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

from src.data_preprocessing.data_loader import load_data
from src.modeling.univariate_modeling import get_arima_model, get_ets_model
from src.modeling.evaluation import smape, mae

In [None]:
# Name of the CSV file that the notebook loads from DATA_DIR.
MAIN_FILE = "PA6_cleaned_dataset.csv"

# Relative path to the data folder: three directory levels up from the
# current working directory, then down into the shared project folder.
DATA_DIR = Path("..", "..", "..").joinpath(
    "hfactory_magic_folders", "plastic_cost_prediction", "data"
)

In [None]:
# Read the main CSV (DATA_DIR / MAIN_FILE) through the project's `load_data`
# helper. The result is used below as a DataFrame indexed by column name.
df = load_data(DATA_DIR / MAIN_FILE)

In [None]:
# Build the ARIMA models from `df`. The result is consumed further down as a
# mapping of column name -> fitted model (iterated with `.items()`, and each
# value exposes `.model.order`, `.fittedvalues`, `.resid` and `.summary()`).
models = get_arima_model(df)

In [None]:
# Build the ETS (exponential smoothing) models from `df`. The result is
# consumed further down as a mapping of column name -> fitted model
# (iterated with `.items()`; each value exposes `.fittedvalues`/`.summary()`).
ets_models = get_ets_model(df)

In [None]:
def _print_fit_report(name, kind, y_true, y_pred, fitted_model):
    """Print in-sample sMAPE/MAE and the model summary for one fitted model."""
    print(f"*** {name} - {kind} Model ***")
    print(f"sMAPE: {smape(y_true.values, y_pred.values):.2f}%")
    print(f"MAE: {mae(y_true.values, y_pred.values):.2f}")

    # Model summary
    print(fitted_model.summary())


def _plot_actual_vs_fitted(time_index, actual, fitted, title):
    """Plot the observed series and the in-sample fitted values together."""
    plt.figure(figsize=(10, 6))
    plt.plot(time_index, actual, label="Actual Series")
    plt.plot(time_index, fitted, label="Fitted Values", linestyle="dashed")
    plt.xlabel("Time")
    plt.ylabel("Values")
    plt.legend()
    plt.title(title)
    plt.show()


def _plot_residuals(time_index, residuals, title):
    """Plot residuals over time with a dashed red reference line at zero."""
    plt.figure(figsize=(10, 4))
    plt.plot(time_index, residuals, label="Residuals")
    plt.axhline(
        0, color="red", linestyle="--", linewidth=2, label="Zero Residuals"
    )
    plt.xlabel("Time")
    plt.ylabel("Residuals")
    plt.legend()
    plt.title(title)
    plt.show()


# In-sample diagnostics for every column: metrics, summary, fit plot and
# residual plot, first for the ETS model and then for the ARIMA model.
# NOTE(review): the two dicts are paired positionally with zip(); this assumes
# both were built over the same columns in the same order - confirm.
for (col_ets, ets_model), (col, model) in zip(
    ets_models.items(), models.items()
):
    # ETS model
    y_true_ets = df[col_ets].copy().dropna()
    y_pred_ets = ets_model.fittedvalues

    _print_fit_report(col_ets, "ETS", y_true_ets, y_pred_ets, ets_model)
    _plot_actual_vs_fitted(
        y_true_ets.index,
        y_true_ets,
        y_pred_ets,
        f"{col_ets} - Actual vs Fitted Values (ETS Model)",
    )
    # Index-aligned subtraction of the fitted values from the observations.
    _plot_residuals(
        y_true_ets.index,
        y_true_ets - y_pred_ets,
        f"{col_ets} - Residuals (ETS Model)",
    )

    print("--------------------------------")

    # ARIMA model
    # The first `d` in-sample points are produced before differencing is
    # initialised and are not meaningful predictions, so they are dropped
    # from the observations, the fitted values AND the residuals. Previously
    # the residual plot still included them, which dwarfed the real errors.
    d = model.model.order[1]
    y_true_arima = df[col].dropna().iloc[d:]
    time_arima = y_true_arima.index
    y_pred_arima = model.fittedvalues.iloc[d:]
    residuals_arima = model.resid.iloc[d:]

    _print_fit_report(col, "ARIMA", y_true_arima, y_pred_arima, model)
    _plot_actual_vs_fitted(
        time_arima,
        y_true_arima,
        y_pred_arima,
        f"{col} - Actual vs Fitted Values (ARIMA Model)",
    )
    # Same layout as the ETS residual plot (zero line, labels, legend).
    _plot_residuals(
        time_arima, residuals_arima, f"{col} - Residuals (ARIMA Model)"
    )

    print("--------------------------------")

In [None]:
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.holtwinters import ExponentialSmoothing

from src.data_preprocessing.data_loader import time_split

# Out-of-sample comparison of ARIMA vs ETS: for each column, refit both model
# specifications on every training fold from `time_split`, forecast the
# matching test fold, and store the mean sMAPE / MAE across folds.
results = {}

for (_, fitted_ets), (col, fitted_arima) in zip(
    ets_models.items(), models.items()
):
    series = df[col].copy().dropna()
    all_splits = time_split(series)

    # Specifications of the already-fitted models, reused for every fold.
    arima_spec = fitted_arima.model
    ets_spec = fitted_ets.model

    # One score per fold, averaged once all folds are done.
    arima_smapes, arima_maes = [], []
    ets_smapes, ets_maes = [], []

    for train_idx, test_idx in all_splits:
        train_set, test_set = series.iloc[train_idx], series.iloc[test_idx]
        first_label, last_label = test_set.index[0], test_set.index[-1]

        arima_fold = ARIMA(
            train_set,
            order=arima_spec.order,
            seasonal_order=arima_spec.seasonal_order,
        ).fit()
        ets_fold = ExponentialSmoothing(
            train_set,
            trend=ets_spec.trend,
            seasonal=ets_spec.seasonal,
            seasonal_periods=ets_spec.seasonal_periods,
        ).fit()

        # Forecast exactly the span covered by the test fold.
        pred_arima = arima_fold.predict(start=first_label, end=last_label)
        pred_ets = ets_fold.predict(start=first_label, end=last_label)

        arima_smapes.append(smape(test_set, pred_arima))
        arima_maes.append(mae(test_set, pred_arima))
        ets_smapes.append(smape(test_set, pred_ets))
        ets_maes.append(mae(test_set, pred_ets))

    n_folds = len(all_splits)
    results[col] = {
        "ARIMA": {
            "sMAPE": round(sum(arima_smapes) / n_folds, 2),
            "MAE": round(sum(arima_maes) / n_folds, 2),
        },
        "ETS": {
            "sMAPE": round(sum(ets_smapes) / n_folds, 2),
            "MAE": round(sum(ets_maes) / n_folds, 2),
        },
    }

In [None]:
# Bare expression: shows the per-column ARIMA/ETS cross-validation scores
# stored in `results` as the cell output.
results

In [None]:
import json

# Persist the cross-validation scores next to the notebook (relative path,
# so the file lands in the current working directory).
with open("results.json", mode="w") as out_file:
    json.dump(results, out_file)

In [None]:
from config.config_modeling import P_RANGE, Q_RANGE, SEASONAL_TERMS, D
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from src.data_preprocessing.data_loader import time_split

In [None]:
# Name of the CSV file that the notebook loads from DATA_DIR.
MAIN_FILE = "PA6_cleaned_dataset.csv"

# Relative path to the data folder: three directory levels up from the
# current working directory, then down into the shared project folder.
DATA_DIR = Path("..", "..", "..").joinpath(
    "hfactory_magic_folders", "plastic_cost_prediction", "data"
)

In [None]:
# Reload the main CSV (DATA_DIR / MAIN_FILE) through the project's
# `load_data` helper so the model-selection cells below start from `df`.
df = load_data(DATA_DIR / MAIN_FILE)

In [None]:
def grid_search_arima_cv(ts, p_values, d, q_values, seasonal):
    """
    Performs a grid search for ARIMA based on cross-validated sMAPE.

    Every candidate order (p, d, q) is fitted on each training fold returned
    by `time_split(ts)` and scored with `smape` on the matching test fold.
    The order with the lowest mean sMAPE across folds is selected; on ties
    the first candidate encountered (p outer loop, q inner loop) is kept.

    Parameters:
    - ts: pd.Series to model; folds are taken positionally with `.iloc` and
      its index labels are used as the prediction start/end
    - p_values: List of candidate values for p
    - d: d value for difference to make series stationary
    - q_values: List of candidate values for q
    - seasonal: seasonal order, passed unchanged as ARIMA's `seasonal_order`

    Returns:
    - Tuple (best_order, best_sMAPE): the best (p, d, q) and its mean sMAPE.
      If no candidate scores below infinity (e.g. an empty grid), this is
      (None, inf).
    """
    best_smape = float("inf")
    best_order = None

    # Split the series that was passed in (the previous code split the global
    # `series` instead of `ts`). Materialised once so the same folds can be
    # reused for every candidate order.
    folds = list(time_split(ts))

    for p in p_values:
        for q in q_values:
            order = (p, d, q)
            smape_total = 0.0

            for train_idx, test_idx in folds:
                train = ts.iloc[train_idx]
                test = ts.iloc[test_idx]
                model_fit = ARIMA(
                    train, order=order, seasonal_order=seasonal
                ).fit()
                # Forecast exactly the span covered by the test fold.
                preds = model_fit.predict(
                    start=test.index[0], end=test.index[-1]
                )
                smape_total += smape(test, preds)

            avg_smape = smape_total / len(folds)

            # Strict comparison: an equal score never replaces the incumbent.
            if avg_smape < best_smape:
                best_smape = avg_smape
                best_order = order

    return best_order, best_smape


# Example usage:
# Assuming `series` is a pd.Series holding the time series to model
# p_values = range(0, 4)
# d = 1
# q_values = range(0, 4)
# seasonal = (0, 0, 0, 0)  # passed to ARIMA as seasonal_order

# best_order, best_smape = grid_search_arima_cv(
#     series, p_values, d, q_values, seasonal
# )
# print(f"Best ARIMA Order: {best_order}")

In [None]:
# Cross-validated ARIMA order selection for every column of `df` that has a
# search grid configured in Q_RANGE; other columns are skipped.
results = {}

# NOTE(review): `models` is never populated in this cell, but this assignment
# rebinds the name used for the fitted ARIMA models earlier in the notebook.
# Kept as-is to avoid changing notebook state; confirm whether it is needed.
models = {}
for col in df.columns:
    if col not in Q_RANGE:
        continue

    # Convert the index to monthly periods before dropping missing values.
    # Presumably this gives statsmodels an explicit frequency for the
    # label-based predict() calls - TODO confirm.
    series = df[col].copy()
    series.index = series.index.to_period("M")
    series = series.dropna()

    best_order, best_sMAPE = grid_search_arima_cv(
        series,
        P_RANGE[col],
        D[col],
        Q_RANGE[col],
        # Columns without a configured seasonal term are fitted non-seasonal.
        SEASONAL_TERMS.get(col, (0, 0, 0, 0)),
    )

    # Key spelled "sMAPE" (was the typo "sMPAE") to match the metric name
    # used by the other results dictionary in this notebook.
    results[col] = {"order": best_order, "sMAPE": best_sMAPE}

In [None]:
# Bare expression: shows the selected ARIMA order and its score per column
# stored in `results` as the cell output.
results

In [None]:
import json

# Persist the selected orders and scores next to the notebook (relative
# path, so the file lands in the current working directory).
with open("results_arima_cv_model_selection.json", mode="w") as out_file:
    json.dump(results, out_file)