# Modeling SARIMA par PCType / région (multi_3m)
Prévision mensuelle de `pc_price` par PCType/région avec SARIMA, évaluée par MAPE sur un split hold-out temporel, avec journalisation MLflow optionnelle.


## Imports et configuration

In [None]:
import os

from dotenv import load_dotenv
import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import seaborn as sns
from statsmodels.tsa.statespace.sarimax import SARIMAX

import constants.constants as cst
from constants.paths import PROCESSED_DATA_DIR

# Plot defaults applied to every figure created after this cell.
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (12, 4)

# Path of the horizon-3 processed file; fail right away if it is missing.
DATA_PATH = PROCESSED_DATA_DIR / "multi_3m.csv"
assert DATA_PATH.exists(), DATA_PATH

## Configurer MLflow

In [None]:
# Load variables from a .env file (if any) before reading the MLflow settings.
load_dotenv()
tracking_uri = os.getenv("MLFLOW_TRACKING_URI")
if tracking_uri:
    mlflow.set_tracking_uri(tracking_uri)
    print(f"Tracking URI: {tracking_uri}")

experiment_id = os.getenv("MLFLOW_EXPERIMENT_ID")
experiment_name = os.getenv("MLFLOW_EXPERIMENT_NAME")
# An explicitly provided MLFLOW_EXPERIMENT_ID wins over the id resolved by name.
active_experiment_id = experiment_id

if experiment_name:
    mlflow.set_experiment(experiment_name)
    exp = mlflow.get_experiment_by_name(experiment_name)
    # Use the named experiment's id only when no explicit id was given.
    if exp and not active_experiment_id:
        active_experiment_id = exp.experiment_id
    print("Experiment name:", experiment_name)

# Report which experiment id (if any) later runs will be started with.
if active_experiment_id:
    print("Using experiment_id:", active_experiment_id)
else:
    print("No experiment_id provided; runs will use the active/default experiment.")

## Charger les données

In [None]:
def load_data(horizon: int = 3) -> pd.DataFrame:
    """Read the processed dataset for one forecast horizon.

    Args:
        horizon: Number used to build the file name ``multi_{horizon}m.csv``
            inside ``PROCESSED_DATA_DIR``.

    Returns:
        The file content with the ``date`` column parsed as datetimes,
        ordered by ``pc_type``, then ``region``, then ``date``.
    """
    csv_path = PROCESSED_DATA_DIR / f"multi_{horizon}m.csv"
    frame = pd.read_csv(csv_path, parse_dates=["date"])
    return frame.sort_values(by=["pc_type", "region", "date"])


# Horizon-3 dataset kept in the module-level name `df`; preview its first rows.
df = load_data(3)
df.head()

## Disponibilité des PCType (Enum vs données)

Vérifie quelles valeurs de `PCType` sont présentes dans les données traitées, ainsi que la couverture (dates de début et de fin, nombre de lignes) par couple PCType/région.

In [None]:
from constants.constants import PCType

# Values declared by the enum versus values actually present in the data.
pc_enum = [p.value for p in PCType]
pc_seen = sorted(df.pc_type.unique())
# First date, last date and row count for every (pc_type, region) pair.
counts = (
    df.groupby(["pc_type", "region"])
    .agg(start=("date", "min"), end=("date", "max"), rows=("date", "size"))
    .reset_index()
)
# Enum values that never occur in the loaded data.
missing_enum = [p for p in pc_enum if p not in pc_seen]
print("PCType vus dans les données:", pc_seen)
print("PCType Enum manquants dans les données:", missing_enum)
counts

## Fonction MAPE
Utilisée pour les baselines et SARIMA.

In [None]:
def mape(y_true: pd.Series, y_pred: pd.Series) -> float | None:
    """MAPE fraction (pas de *100), en ignorant zéros/NaN."""
    mask = (y_true != 0) & y_true.notna() & y_pred.notna()
    if mask.sum() == 0:
        return None
    return (np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])).mean()

## Métriques alignées avec global_models

In [None]:
def compute_metrics(df_slice: pd.DataFrame, preds: pd.Series) -> dict:
    """Compute the overall MAPE and one MAPE per ``pc_type``.

    Args:
        df_slice: Rows to score; must contain ``pc_price`` and ``pc_type``.
        preds: Predictions indexed like ``df_slice``.

    Returns:
        A dict holding the overall MAPE under both ``cst.GLOBAL_MAPE`` and
        ``cst.WEIGHTED_MAPE`` (the same value is stored twice), plus one
        ``"<pc_type>_mape"`` entry per ``pc_type`` found in ``df_slice``.
    """
    overall = mape(df_slice["pc_price"], preds)
    metrics: dict[str, float | None] = {
        cst.GLOBAL_MAPE: overall,
        cst.WEIGHTED_MAPE: overall,
    }
    metrics.update(
        {
            f"{pc_val}_mape": mape(grp["pc_price"], preds.loc[grp.index])
            for pc_val, grp in df_slice.groupby("pc_type")
        }
    )
    return metrics

## Baselines MAPE par horizon (naïf et saisonnier t-12)

In [None]:
def baseline_mape(
    horizon: int,
    strategy: str = "naive",
    df_full: pd.DataFrame | None = None,
    test_df: pd.DataFrame | None = None,
) -> pd.DataFrame:
    """Score a lag-based baseline for every (pc_type, region) series.

    Args:
        horizon: Lag used by the ``"naive"`` strategy; also copied into the
            ``horizon`` column of the result.
        strategy: ``"naive"`` predicts the value ``horizon`` rows earlier,
            ``"seasonal_naive"`` the value 12 rows earlier.
        df_full: Data to score; the module-level ``df`` is used when omitted.
        test_df: When given, only dates present in it for the same
            (pc_type, region) pair are scored.

    Returns:
        One row per (pc_type, region) with columns ``pc_type``, ``region``,
        ``horizon``, ``strategy`` and ``MAPE`` (a fraction, or ``None``).

    Raises:
        ValueError: If ``strategy`` is not one of the two supported names.
    """
    source = df if df_full is None else df_full
    records = []
    for (pc, reg), group in source.groupby(["pc_type", "region"]):
        actual = group.sort_values("date").set_index("date")["pc_price"]

        # The lag is expressed in rows of the date-sorted series.
        if strategy == "naive":
            lag = horizon
        elif strategy == "seasonal_naive":
            lag = 12
        else:
            raise ValueError("strategy must be naive or seasonal_naive")
        forecast = actual.shift(lag)

        if test_df is None:
            score = mape(actual, forecast)
        else:
            same_series = (test_df.pc_type == pc) & (test_df.region == reg)
            held_out = actual.index.isin(test_df[same_series]["date"])
            score = mape(actual[held_out], forecast[held_out])

        records.append(
            {
                "pc_type": pc,
                "region": reg,
                "horizon": horizon,
                "strategy": strategy,
                "MAPE": score,
            }
        )
    return pd.DataFrame(records)

## Split temporel adaptatif
Assure un minimum d'observations train/test par (pc_type, region).

In [None]:
def adaptive_train_test_split(
    df_local: pd.DataFrame,
    group_cols: list[str],
    target_test_ratio: float = 0.2,
    min_train_samples: int = 20,
    min_test_samples: int = 5,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Split chronologically at a single date shared by all groups.

    The latest date is searched such that every group keeps at least
    ``min_train_samples`` rows strictly before it and at least
    ``min_test_samples`` rows on or after it. When no date satisfies this,
    the split falls back to a cut based on ``target_test_ratio`` of the
    distinct dates.

    Args:
        df_local: Data containing a ``date`` column and all ``group_cols``.
        group_cols: Columns whose distinct value combinations define groups.
        target_test_ratio: Share of distinct dates put in the test set by the
            fallback. Out-of-range values are clamped so that the cut always
            falls on an existing date.
        min_train_samples: Minimum number of train rows required per group.
        min_test_samples: Minimum number of test rows required per group.

    Returns:
        ``(train, test)`` where ``train`` holds rows dated before the split
        date and ``test`` the rows dated on or after it. Two empty frames
        are returned when ``df_local`` has no rows.
    """
    df_local = df_local.sort_values("date").reset_index(drop=True)
    dates = sorted(df_local["date"].unique())
    if not dates:
        # Nothing to split; avoids indexing into an empty date list below.
        return df_local, df_local.copy()

    # Group membership does not depend on the candidate date, so each
    # group's dates are collected once instead of once per candidate.
    group_dates = []
    for group in df_local[group_cols].drop_duplicates().to_dict("records"):
        rows = df_local
        for col, val in group.items():
            rows = rows[rows[col] == val]
        group_dates.append(rows["date"])

    # Walk backwards so that the first hit is the latest valid split date.
    for split_date in reversed(dates):
        if all(
            (d < split_date).sum() >= min_train_samples
            and (d >= split_date).sum() >= min_test_samples
            for d in group_dates
        ):
            print(f"Found valid global split at date: {split_date}")
            return (
                df_local[df_local["date"] < split_date],
                df_local[df_local["date"] >= split_date],
            )

    print("No valid global split found. Falling back to ratio.")
    split_idx = int(len(dates) * (1 - target_test_ratio))
    # Clamp to an existing position: a ratio <= 0 would index past the end,
    # and a ratio > 1 would yield a negative index that wraps around.
    split_idx = min(max(split_idx, 0), len(dates) - 1)
    split_date = dates[split_idx]
    return (
        df_local[df_local["date"] < split_date],
        df_local[df_local["date"] >= split_date],
    )

## Baselines sur le jeu de validation (test_df)

In [None]:
# Per-horizon store of the full frame and its train/test split.
splits = {}
baseline_tables = []
for horizon in [3, 6, 9]:
    df_h = load_data(horizon)
    train_df_h, test_df_h = adaptive_train_test_split(
        df_h,
        group_cols=["pc_type", "region"],
        target_test_ratio=0.2,
        min_train_samples=20,
        min_test_samples=5,
    )
    splits[horizon] = {"df": df_h, "train": train_df_h, "test": test_df_h}
    # Baselines are built from the full history but scored on test dates only.
    baseline_tables.append(
        baseline_mape(horizon, "naive", df_full=df_h, test_df=test_df_h)
    )
    baseline_tables.append(
        baseline_mape(horizon, "seasonal_naive", df_full=df_h, test_df=test_df_h)
    )
# One table with both strategies for every horizon.
baseline_df = pd.concat(baseline_tables, ignore_index=True)
baseline_df

## Fonctions utilitaires

## SARIMA + logging MLflow (optionnel)

In [None]:
def evaluate_sarima(
    pc_type: str,
    region: str,
    df_source: pd.DataFrame,
    order: tuple[int, int, int] = (1, 1, 0),
    seasonal_order: tuple[int, int, int, int] = (0, 0, 0, 12),
    horizon: int = 6,
    train_df: pd.DataFrame | None = None,
    test_df: pd.DataFrame | None = None,
) -> dict:
    """Fit SARIMA on one (pc_type, region) series and score it on the hold-out.

    Args:
        pc_type: Value of the ``pc_type`` column selecting the series.
        region: Value of the ``region`` column selecting the series.
        df_source: Data used for training when ``train_df`` is not given.
        order: ``(p, d, q)`` order passed to ``SARIMAX``.
        seasonal_order: ``(P, D, Q, s)`` seasonal order passed to ``SARIMAX``.
        horizon: Maximum number of forecast steps that are scored.
        train_df: Training rows; filtered on ``pc_type``/``region``.
        test_df: Hold-out rows; filtered on ``pc_type``/``region``.

    Returns:
        A dict that always has ``pc_type``, ``region``, ``MAPE`` and
        ``n_points``. On success it also has ``order``, ``seasonal_order``
        and ``horizon``. When no score could be computed ``MAPE`` is ``None``,
        ``n_points`` is 0 and a ``note`` or ``error`` key gives the reason.
    """

    def _skipped(**details) -> dict:
        """Build the result returned whenever no MAPE could be computed."""
        return {
            "pc_type": pc_type,
            "region": region,
            "MAPE": None,
            "n_points": 0,
            **details,
        }

    history = df_source if train_df is None else train_df
    train_slice = history[(history.pc_type == pc_type) & (history.region == region)]
    y_train = train_slice.sort_values("date").set_index("date")["pc_price"]

    # Without hold-out rows for this series there is nothing to score.
    if test_df is None:
        return _skipped(note="Test set empty (strict split)")
    test_slice = test_df[(test_df.pc_type == pc_type) & (test_df.region == region)]
    if test_slice.empty:
        return _skipped(note="Test set empty (strict split)")
    y_test = test_slice.sort_values("date").set_index("date")["pc_price"]

    # Require more observations than the largest autoregressive lag.
    longest_lag = max(order[0], seasonal_order[0] * seasonal_order[3])
    if len(y_train) <= longest_lag:
        return _skipped(note="train trop court")

    try:
        model = SARIMAX(
            y_train,
            order=order,
            seasonal_order=seasonal_order,
            enforce_stationarity=False,
            enforce_invertibility=False,
        )
        res = model.fit(disp=False)
        steps = min(len(y_test), horizon)
        fc = res.get_forecast(steps=steps).predicted_mean
        y_eval = y_test.iloc[:steps]
        # When statsmodels cannot infer a frequency from the train dates it
        # labels the forecast with integers. Those labels can never match the
        # test dates, so the label-based comparison in mape() would silently
        # yield no score. Pair step k with the k-th test row instead.
        if not isinstance(fc.index, (pd.DatetimeIndex, pd.PeriodIndex)):
            fc = pd.Series(fc.to_numpy(), index=y_eval.index)
        m = mape(y_eval, fc)
        return {
            "pc_type": pc_type,
            "region": region,
            "MAPE": m,
            "order": order,
            "seasonal_order": seasonal_order,
            "horizon": horizon,
            "n_points": steps,
        }
    except Exception as exc:
        # Deliberate best effort: a failing series is reported in the result
        # row instead of aborting the whole evaluation loop.
        return _skipped(error=str(exc))

In [None]:
# Evaluate SARIMA for every horizon stored in `splits` and aggregate the MAPEs.
results_all = []
agg_rows = []
for horizon, objs in splits.items():
    df_h = objs["df"]
    train_h = objs["train"]
    test_h = objs["test"]
    horizon_results = []
    # Every pc_type x region combination is tried; evaluate_sarima returns a
    # row with MAPE=None for combinations it cannot score.
    for pc in sorted(df_h.pc_type.unique()):
        for reg in sorted(df_h.region.unique()):
            res = evaluate_sarima(
                pc, reg, df_h, horizon=horizon, train_df=train_h, test_df=test_h
            )
            res["horizon"] = horizon
            horizon_results.append(res)
    if horizon_results:
        hr_df = pd.DataFrame(horizon_results)
        results_all.append(hr_df)
        # Only series with a computed MAPE take part in the aggregation.
        valid = hr_df[hr_df["MAPE"].notna()].copy()
        if not valid.empty:
            # Weight each series by its number of scored points (at least 1).
            weights = valid["n_points"].replace(0, 1)

            def wmean(series, weights=weights):
                """Return the mean of ``series`` weighted by the matching rows of ``weights``.

                ``weights`` is bound as a default argument, so the function
                keeps the weights of the horizon it was defined for.
                """
                return float(np.average(series, weights=weights.loc[series.index]))

            # Both keys receive the same weighted mean.
            agg = {
                "horizon": horizon,
                cst.GLOBAL_MAPE: wmean(valid["MAPE"]),
                cst.WEIGHTED_MAPE: wmean(valid["MAPE"]),
            }
            reg_subset = valid[valid["pc_type"] == cst.REGULAR_PC_TYPE]
            if not reg_subset.empty:
                agg["regular_mape"] = wmean(reg_subset["MAPE"])
            green_subset = valid[valid["pc_type"] == cst.GREEN_PC_TYPE]
            if not green_subset.empty:
                agg["green_mape"] = wmean(green_subset["MAPE"])
            agg_rows.append(agg)

# Per-series results for all horizons (empty frame when nothing was evaluated).
results_df = (
    pd.concat(results_all, ignore_index=True) if results_all else pd.DataFrame()
)

# One row of aggregated metrics per horizon.
agg_df = pd.DataFrame(agg_rows) if agg_rows else pd.DataFrame()

# MLflow logging per horizon: one run per row of agg_df.
# NOTE(review): the flag below is True, so logging is ON as written; set it to
# False to run this cell without creating MLflow runs.
log_horizon_mlflow = True
if log_horizon_mlflow and not agg_df.empty:
    for _, row in agg_df.iterrows():
        h = int(row["horizon"])
        # Log every non-missing aggregate except the horizon itself.
        metrics_to_log = {
            k: v for k, v in row.items() if k != "horizon" and pd.notna(v)
        }
        with mlflow.start_run(
            run_name=f"SARIMA_agg_h{h}", experiment_id=active_experiment_id
        ):
            mlflow.set_tags(
                {
                    cst.MLFLOW_MODEL_TYPE: "sarima",
                    cst.MLFLOW_MODEL_PHILOSOPHY: "statistical",
                    cst.MLFLOW_FUNCTION: "forecast",
                    cst.MLFLOW_HORIZON: h,
                }
            )
            mlflow.log_metrics(metrics_to_log)

## Run diagnostic (sans logging)

(La journalisation MLflow par série est désactivée.)