In [None]:
import numpy as np
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX

from constants import intermediate_names, processed_names
from src.data_pipelines.uni_intermediate_to_processed import build_univariate_dataset

# ARIMA / SARIMA setup
Configuration and helpers to train simple (S)ARIMA models on the grouped PC price series.

In [None]:
# Modeling configuration, grouped by concern.

# -- Time windows (both expressed in months) --
HORIZON = 3     # passed to the dataset builder as its `horizon`
TEST_SIZE = 12  # trailing observations held out for evaluation

# -- Column selection --
DATE_COL = processed_names.WIDE_DATE
TARGET_COLUMNS = intermediate_names.GROUPED_ENDOGENOUS_COLUMNS

# -- Model specification --
USE_EXOGENOUS = True
DEFAULT_ORDER = (1, 1, 1)                # non-seasonal (p, d, q)
DEFAULT_SEASONAL_ORDER = (0, 0, 0, 12)   # (P, D, Q, s); all-zero terms => no seasonal component

In [None]:
# Build the wide dataset with engineered features.
# The exogenous switch comes from the configuration cell so that the dataset
# and the models fitted on it always agree on whether exogenous data is used.
full_wide_df = build_univariate_dataset(
    horizon=HORIZON,
    group_by_pc_types=True,
    include_exogenous=USE_EXOGENOUS,
    include_differencing=True,
)
# Parse the date column and order rows chronologically; the train/test split
# further down takes the trailing rows as the hold-out window.
full_wide_df[DATE_COL] = pd.to_datetime(full_wide_df[DATE_COL])
full_wide_df = full_wide_df.sort_values(DATE_COL).reset_index(drop=True)
full_wide_df.head()

In [None]:
def build_exog_matrix(df: pd.DataFrame, target: str, use_exogenous: bool = True):
    """Select the regressor columns to feed a model for ``target``.

    Columns are chosen purely by name prefix:

    * every column whose name starts with ``target`` (features engineered from
      the target, e.g. lags / rolling stats / roc / diff);
    * when ``use_exogenous`` is true, every column whose name starts with one of
      ``intermediate_names.EXOGENOUS_COLUMNS`` (the raw exogenous series and
      anything derived from them).

    The ``target`` column itself is never returned, even when an exogenous base
    name equals or is a prefix of the target name; a series must not be used as
    its own regressor.

    Args:
        df: Wide frame holding the target, engineered and exogenous columns.
        target: Name of the series being modelled.
        use_exogenous: Whether to add the exogenous families to the selection.

    Returns:
        ``df`` restricted to the selected columns in sorted name order, or
        ``None`` when nothing matches.
    """
    # NOTE(review): prefix matching also picks up any other column that merely
    # shares the prefix (e.g. a forward-shifted copy of the target, if the
    # dataset builder emits one) - confirm against the builder's naming scheme.
    prefixes = [target]
    if use_exogenous:
        prefixes.extend(intermediate_names.EXOGENOUS_COLUMNS)
    prefix_tuple = tuple(prefixes)

    exog_cols = sorted(
        {
            col
            for col in df.columns
            # Non-string labels cannot carry a name prefix, so skip them
            # instead of failing on ``.startswith``.
            if isinstance(col, str) and col != target and col.startswith(prefix_tuple)
        }
    )
    return df[exog_cols] if exog_cols else None


def train_test_forecast(
    df: pd.DataFrame,
    target: str,
    test_size: int,
    order: tuple[int, int, int] = DEFAULT_ORDER,
    seasonal_order: tuple[int, int, int, int] = DEFAULT_SEASONAL_ORDER,
    use_exogenous: bool = True,
):
    """Fit a SARIMAX model on a chronological split and score the hold-out.

    Rows with a missing target are dropped and the frame is sorted by
    ``DATE_COL``. Rows whose regressors contain NaN or +/-inf are dropped as
    well, because SARIMAX refuses non-finite ``exog`` values; note that if such
    rows sit in the middle of the series the remaining observations are no
    longer evenly spaced. The last ``test_size`` remaining rows form the test
    window and everything before it is used for fitting.

    Args:
        df: Wide frame containing ``DATE_COL``, ``target`` and feature columns.
        target: Name of the series to model.
        test_size: Number of trailing observations held out; must be positive.
        order: Non-seasonal ``(p, d, q)`` order.
        seasonal_order: Seasonal ``(P, D, Q, s)`` order.
        use_exogenous: Forwarded to ``build_exog_matrix``.

    Returns:
        ``(fit_res, out_df, mape)`` where ``fit_res`` is the fitted results
        object, ``out_df`` has the columns ``DATE_COL``, ``"actual"`` and
        ``"forecast"`` for the test window, and ``mape`` is the mean absolute
        percentage error as a fraction (not multiplied by 100). Observations
        whose actual value is zero are excluded from the MAPE; it is NaN when
        no observation can be scored.

    Raises:
        ValueError: If ``test_size`` is not positive, or if too few usable rows
            remain to hold out ``test_size`` observations after differencing.
    """
    if test_size <= 0:
        # ``iloc[:-0]`` would silently yield an empty training set.
        raise ValueError(f"test_size must be a positive integer, got {test_size}")

    df_local = df.dropna(subset=[target]).sort_values(DATE_COL).reset_index(drop=True)

    exog_matrix = build_exog_matrix(df_local, target, use_exogenous=use_exogenous)
    if exog_matrix is not None:
        # Engineered features can be undefined (NaN) or infinite on some rows;
        # keep only rows where every regressor is finite.
        non_finite = exog_matrix.isna() | exog_matrix.isin([np.inf, -np.inf])
        usable_rows = ~non_finite.any(axis=1)
        if not usable_rows.all():
            df_local = df_local.loc[usable_rows].reset_index(drop=True)
            exog_matrix = exog_matrix.loc[usable_rows].reset_index(drop=True)

    # Regular differencing consumes d observations, seasonal differencing D * s.
    lost_to_differencing = order[1] + seasonal_order[1] * seasonal_order[3]
    if len(df_local) <= test_size + lost_to_differencing:
        raise ValueError(
            f"Not enough data for target {target} with test_size={test_size}"
        )

    train = df_local.iloc[:-test_size]
    test = df_local.iloc[-test_size:]
    train_exog = exog_matrix.iloc[:-test_size] if exog_matrix is not None else None
    test_exog = exog_matrix.iloc[-test_size:] if exog_matrix is not None else None

    model = SARIMAX(
        train[target],
        exog=train_exog,
        order=order,
        seasonal_order=seasonal_order,
        # Leave the AR / MA parameters unconstrained during estimation.
        enforce_stationarity=False,
        enforce_invertibility=False,
    )
    fit_res = model.fit(disp=False)
    forecast = fit_res.forecast(steps=len(test), exog=test_exog)

    # Compare positionally on plain arrays so the metric does not depend on the
    # forecast and the hold-out sharing identical index labels.
    actual_values = test[target].to_numpy()
    forecast_values = np.asarray(forecast)
    denom = np.where(actual_values == 0, np.nan, actual_values)
    abs_pct_err = np.abs((actual_values - forecast_values) / denom)
    if np.isnan(abs_pct_err).all():
        mape = float("nan")
    else:
        mape = float(np.nanmean(abs_pct_err))

    out_df = pd.DataFrame(
        {
            DATE_COL: test[DATE_COL].values,
            "actual": actual_values,
            "forecast": forecast_values,
        }
    )
    return fit_res, out_df, mape

In [None]:
# Evaluate every grouped PC price target with the default model specification.
# `results` collects one summary row per target; `plots` maps each successfully
# fitted target to its actual-vs-forecast frame.
results, plots = [], {}
for target in TARGET_COLUMNS:
    try:
        fit_res, eval_df, mape = train_test_forecast(
            df=full_wide_df,
            target=target,
            test_size=TEST_SIZE,
            order=DEFAULT_ORDER,
            seasonal_order=DEFAULT_SEASONAL_ORDER,
            use_exogenous=USE_EXOGENOUS,
        )
        record = {"target": target}
        record["mape"] = mape
        record["train_n"] = len(fit_res.data.endog)
        record["test_n"] = len(eval_df)
        results.append(record)
        plots[target] = eval_df
    except Exception as exc:
        # Best effort: a failing target is reported in the summary table
        # instead of aborting the remaining fits.
        results.append({"target": target, "mape": np.nan, "error": str(exc)})

pd.DataFrame(results)

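Use `plots['eu_pc_crystal_best_price']` (or any other target key) to inspect the actual vs forecast values for a specific series. Each entry is a DataFrame with the date, `actual` and `forecast` columns; targets whose fit failed have no entry in `plots` and show an `error` message in the summary table instead.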