# EDA par PCType pour prévision mensuelle
Objectif : vérifier si les séries `pc_price` par `pc_type`/`region` sont utilisables pour un modèle de prévision mensuel (évalué en MAPE), avec des baselines naïves et des vérifications de complétude.

## Imports

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

from constants.paths import PROCESSED_DATA_DIR

# Plot styling applied to the figures drawn in this notebook.
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (12, 4)

# Location of the processed dataset loaded in the next cell.
DATA_PATH = PROCESSED_DATA_DIR / "multi_3m.csv"
# Fail fast with an explicit exception: unlike ``assert``, this check is not
# stripped when Python runs with -O.
if not DATA_PATH.exists():
    raise FileNotFoundError(DATA_PATH)

## Charger les données

In [None]:
# Read the processed CSV, parsing the "date" column as datetimes.
df = pd.read_csv(DATA_PATH, parse_dates=["date"])
# Bare expression: displays the first rows as the notebook cell's output.
df.head()

## Aperçu par PCType / région

In [None]:
# One row per (pc_type, region): first date, last date and number of rows.
series_groups = df.groupby(["pc_type", "region"])
span = series_groups.agg(
    start=("date", "min"),
    end=("date", "max"),
    rows=("date", "size"),
)

# Inclusive count of calendar months between the first and the last date.
years_apart = span["end"].dt.year - span["start"].dt.year
months_apart = span["end"].dt.month - span["start"].dt.month
span["months"] = years_apart * 12 + months_apart + 1
span

## Vérifier la complétude mensuelle

In [None]:
def monthly_gaps(g: pd.DataFrame) -> tuple[int, pd.DatetimeIndex]:
    """Return the number of missing calendar months and their dates.

    Every date is first snapped to the first day of its month, so series
    stamped mid-month or at month end are compared on the same monthly grid
    as series stamped at month start.

    Args:
        g: DataFrame with a 'date' column (datetimes, or values that
            ``pd.to_datetime`` can parse). Row order does not matter.

    Returns:
        A ``(count, missing)`` tuple where ``missing`` holds the month-start
        timestamps, between the first and the last observed month, that have
        no row in ``g``. An input without any valid date yields
        ``(0, empty DatetimeIndex)``.
    """
    dates = pd.to_datetime(g["date"]).dropna()
    if dates.empty:
        # pd.date_range cannot build a range from NaT bounds.
        return 0, pd.DatetimeIndex([])

    # Normalise each observation to the first day of its month.
    observed = pd.DatetimeIndex(dates).to_period("M").to_timestamp()
    full_idx = pd.date_range(observed.min(), observed.max(), freq="MS")
    missing = full_idx.difference(observed)
    return len(missing), missing

## Baselines MAPE (naïf t-1 et saisonnier t-12)

In [None]:
def mape(y_true: pd.Series, y_pred: pd.Series) -> float:
    """Return the Mean Absolute Percentage Error, in percent.

    Observations are skipped when the true value is zero or NaN, or when the
    prediction is NaN.
    """
    usable = y_true.ne(0) & y_true.notna() & y_pred.notna()
    actual = y_true[usable]
    forecast = y_pred[usable]
    relative_error = ((actual - forecast) / actual).abs()
    return relative_error.mean() * 100

## Visualiser les séries (par PCType/Region)

In [None]:
def plot_series() -> None:
    """Draw one line chart of ``pc_price`` over time for each ``pc_type``.

    Reads the module-level ``df``. Each distinct ``pc_type`` (taken in sorted
    order) gets its own figure, with one marked line per ``region``.
    """
    pc_types = sorted(df["pc_type"].unique())
    for pc_type in pc_types:
        subset = df[df["pc_type"] == pc_type]
        fig, ax = plt.subplots(figsize=(12, 4))
        sns.lineplot(
            data=subset, x="date", y="pc_price", hue="region", marker="o", ax=ax
        )
        ax.set(title=f"pc_price - {pc_type}", xlabel="date", ylabel="price")
        fig.tight_layout()
        plt.show()

## ACF rapide (24 mois)

In [None]:
def plot_acf_serie(pc_type: str, region: str, lags: int = 24) -> None:
    """Plot the autocorrelation function (ACF) of ``pc_price`` for one series.

    Selects the rows of the module-level ``df`` matching ``pc_type`` and
    ``region``, orders them by ``date`` and draws the ACF with statsmodels'
    ``plot_acf``.

    Args:
        pc_type: Value of the ``pc_type`` column to select.
        region: Value of the ``region`` column to select.
        lags: Highest lag to display. Capped at ``n - 1``, where ``n`` is the
            number of selected observations, because no autocorrelation can
            be estimated beyond that lag.

    Raises:
        ValueError: If fewer than two observations match the selection.
    """
    selection = (df.pc_type == pc_type) & (df.region == region)
    prices = df[selection].sort_values("date")["pc_price"]
    n_obs = len(prices)
    if n_obs < 2:
        raise ValueError(
            f"Need at least 2 observations for {pc_type!r}/{region!r}, "
            f"got {n_obs}."
        )

    # A short series would otherwise fail inside plot_acf with the default 24.
    max_lag = min(lags, n_obs - 1)
    fig, ax = plt.subplots(figsize=(10, 3))
    plot_acf(prices, lags=max_lag, ax=ax, title=f"ACF {pc_type} - {region}")
    fig.tight_layout()
    plt.show()


# Exemple : décommentez pour lancer
# plot_acf_serie('regular', 'europe')
# plot_acf_serie('regular', 'asia')
# plot_acf_serie('green', 'europe')
# plot_acf_serie('green', 'asia')

## PACF rapide (24 mois)
Utilise la Partial Autocorrelation Function pour repérer l'ordre AR potentiel.

In [None]:
def plot_pacf_serie(pc_type: str, region: str, lags: int = 24) -> None:
    """Plot the partial autocorrelation function (PACF) for one series.

    Selects the rows of the module-level ``df`` matching ``pc_type`` and
    ``region``, orders them by ``date`` and draws the PACF of ``pc_price``
    with statsmodels' ``plot_pacf``.

    Args:
        pc_type: Value of the ``pc_type`` column to select.
        region: Value of the ``region`` column to select.
        lags: Highest lag to display. Capped at ``n // 2 - 1``, where ``n`` is
            the number of selected observations, because statsmodels only
            computes partial autocorrelations for lags below half the sample
            size.

    Raises:
        ValueError: If fewer than four observations match the selection
            (no lag >= 1 can be estimated).
    """
    selection = (df.pc_type == pc_type) & (df.region == region)
    prices = df[selection].sort_values("date")["pc_price"]
    n_obs = len(prices)
    if n_obs < 4:
        raise ValueError(
            f"Need at least 4 observations for {pc_type!r}/{region!r}, "
            f"got {n_obs}."
        )

    # Stay strictly below half the sample size so statsmodels accepts the lag.
    max_lag = min(lags, n_obs // 2 - 1)
    fig, ax = plt.subplots(figsize=(10, 3))
    plot_pacf(prices, lags=max_lag, ax=ax, title=f"PACF {pc_type} - {region}")
    fig.tight_layout()
    plt.show()


# Exemple : décommentez pour lancer
# plot_pacf_serie('regular', 'europe')
# plot_pacf_serie('regular', 'asia')
# plot_pacf_serie('green', 'europe')
# plot_pacf_serie('green', 'asia')