In [1]:
# Step 1 — Preprocess + aggregate all CSVs in data/dataset into one table
# - Reads all *.csv files in data/dataset (expects 10, but works for any count)
# - Standardizes column names
# - Casts dtypes (safe numeric parsing)
# - Sorts by timestamp, vehicle_id
# - Saves aggregated outputs (CSV + Parquet)

from __future__ import annotations

import glob
import os
from pathlib import Path
import pandas as pd

In [2]:
DATA_DIR = Path("data/dataset 1")
OUT_DIR = Path("data/processed")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Expected schema (your feature list)
EXPECTED_COLS = [
    "timestamp","vehicle_id","rsu_id",
    "x","y","z","speed","acceleration","heading_rad","road_id",
    "rxPower_dbm","sinr_db","txPower_mw","txPowerSource","neighborCount","nearestRSU",
    "packetsSent","packetsReceived","pdr","distance_to_rsu_m","signal_age_ms","cqi","mcs",
    "rsrp_dbm","rsrq_db","connection_type","cbr","throughput_kbps","interference_dbm"
]

# Columns to parse as numeric (everything except obvious IDs/categoricals)
NUMERIC_COLS = [
    "timestamp","x","y","z","speed","acceleration","heading_rad",
    "rxPower_dbm","sinr_db","txPower_mw","neighborCount",
    "packetsSent","packetsReceived","pdr","distance_to_rsu_m","signal_age_ms",
    "cqi","mcs","rsrp_dbm","rsrq_db","cbr","throughput_kbps","interference_dbm"
]

ID_OR_CAT_COLS = [
    "vehicle_id","rsu_id","road_id","txPowerSource","nearestRSU","connection_type"
]

In [4]:
def read_one_csv(fp: Path) -> pd.DataFrame:
    df = pd.read_csv(fp)

    # Normalize column names (strip spaces, keep exact case)
    df.columns = [c.strip() for c in df.columns]

    # If some files have missing columns, add them as NA.
    missing = [c for c in EXPECTED_COLS if c not in df.columns]
    for c in missing:
        df[c] = pd.NA

    # Keep only expected columns (drops extras if any)
    df = df[EXPECTED_COLS].copy()

    # Parse numerics safely
    for c in NUMERIC_COLS:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")

    # Keep IDs as string to avoid mixed types (e.g., "veh_12" vs 12)
    for c in ID_OR_CAT_COLS:
        if c in df.columns:
            df[c] = df[c].astype("string")

    # Add file provenance (helps debugging / ablations)
    df["source_file"] = fp.name

    return df

def aggregate_dataset(data_dir: Path) -> pd.DataFrame:
    files = sorted(data_dir.glob("*.csv"))
    if not files:
        raise FileNotFoundError(f"No CSV files found in: {data_dir.resolve()}")

    dfs = []
    for i, fp in enumerate(files):
        print(i)
        if i == 10:
            break
        try:
            dfs.append(read_one_csv(fp))
        except Exception as e:
            raise RuntimeError(f"Failed reading {fp}: {e}") from e

    agg = pd.concat(dfs, ignore_index=True)

    # Basic cleanup
    # Drop rows missing the minimum required fields
    agg = agg.dropna(subset=["timestamp", "vehicle_id"]).reset_index(drop=True)

    # Optional: convert timestamp to integer ms if needed
    # If your timestamp is already in seconds or ms, keep as numeric.
    # agg["timestamp"] = agg["timestamp"].astype("int64")

    # Sort for time-series modeling
    agg = agg.sort_values(["timestamp", "vehicle_id"], kind="mergesort").reset_index(drop=True)

    return agg

if __name__ == "__main__":
    df_all = aggregate_dataset(DATA_DIR)

    # Quick sanity checks
    csv_files = sorted(DATA_DIR.glob("*.csv"))
    print(f"Read {len(csv_files)} CSV file(s) from {DATA_DIR}")
    print(f"Aggregated rows: {len(df_all):,}")
    print("Columns:", list(df_all.columns))
    print(df_all.head(3))

    # Save aggregated dataset
    out_csv = OUT_DIR / "v2x_aggregated.csv"
    out_parquet = OUT_DIR / "v2x_aggregated.parquet"

    df_all.to_csv(out_csv, index=False)
    df_all.to_parquet(out_parquet, index=False)

    print(f"Saved: {out_csv}")
    print(f"Saved: {out_parquet}")


0
1
2
3
4
5
6
7
8
9
10
Read 1386 CSV file(s) from data\dataset 1
Aggregated rows: 69,700
Columns: ['timestamp', 'vehicle_id', 'rsu_id', 'x', 'y', 'z', 'speed', 'acceleration', 'heading_rad', 'road_id', 'rxPower_dbm', 'sinr_db', 'txPower_mw', 'txPowerSource', 'neighborCount', 'nearestRSU', 'packetsSent', 'packetsReceived', 'pdr', 'distance_to_rsu_m', 'signal_age_ms', 'cqi', 'mcs', 'rsrp_dbm', 'rsrq_db', 'connection_type', 'cbr', 'throughput_kbps', 'interference_dbm', 'source_file']
   timestamp vehicle_id rsu_id          x          y      z    speed  \
0        6.7    node[0]      0  4966.7891  1400.3171  1.895  13.1719   
1        6.8    node[0]      0  4966.7450  1401.6309  1.895  13.1458   
2        6.9    node[0]      0  4966.7013  1402.9345  1.895  13.0431   

   acceleration  heading_rad   road_id  ...  signal_age_ms  cqi  mcs rsrp_dbm  \
0      131.7189      -1.6043  21159656  ...        20.7109    8   14 -69.0389   
1       -0.2613      -1.6043  21159656  ...        20.7158    8

## Step 2

In [5]:
"""
Step 2 — Stage 1 RSU Forecaster (NO trajectory prediction)

What this script does:
1) Load the preprocessed aggregated vehicle-level table from a Parquet file.
2) Build RSU-level time series by aggregating vehicle signals per (timestamp, rsu_id).
3) Create supervised learning samples using historical windows (Tin) to predict future RSU load (Tout).
4) Train a strong tabular baseline (sklearn HistGradientBoosting via MultiOutputRegressor).
5) Evaluate on a STRICT time split (no shuffling) and save artifacts.

Assumptions:
- You already produced a Parquet like: data/processed/v2x_aggregated.parquet
- timestamp is numeric and monotonic within each rsu_id (can be float seconds or integer ms).
- rsu_id exists for each record.
"""

from __future__ import annotations

from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [6]:
# -----------------------
# Config
# -----------------------
PARQUET_PATH = Path("data/processed/v2x_aggregated.parquet")
OUT_DIR = Path("data/stage1_rsu_forecaster")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Window sizes (10 Hz -> 0.1s)
Tin = 20    # 2.0 seconds history
Tout = 10   # 1.0 second horizon (multi-step)
TRAIN_RATIO = 0.7
VAL_RATIO = 0.1
TEST_RATIO = 0.2  # will be implied (1 - train - val)

RANDOM_STATE = 42

In [7]:


# -----------------------
# 1) Load
# -----------------------
df = pd.read_parquet(PARQUET_PATH)

required = {"timestamp", "rsu_id", "vehicle_id", "throughput_kbps"}
missing = required - set(df.columns)
if missing:
    raise ValueError(f"Missing required columns in parquet: {missing}")

# Basic cleaning
df = df.dropna(subset=["timestamp", "rsu_id"]).copy()

# -----------------------
# 2) Aggregate to RSU-level features (per timestamp, rsu_id)
# -----------------------
# Choose which per-vehicle columns to aggregate.
# Keep it simple, stable, and relevant to load.
agg_spec = {
    # Target ingredients / traffic
    "throughput_kbps": ["sum", "mean"],
    "packetsSent": ["sum", "mean"],
    "packetsReceived": ["sum", "mean"],

    # Density
    "vehicle_id": ["nunique"],  # num vehicles

    # Link quality
    "sinr_db": ["mean", "median"],
    "cqi": ["mean", "median"],
    "pdr": ["mean", "median"],
    "rxPower_dbm": ["mean", "median"],

    # Congestion/interference
    "cbr": ["mean", "median"],
    "interference_dbm": ["mean", "median"],

    # Geometry
    "distance_to_rsu_m": ["mean", "median"],
}

# Some columns may not exist if your preprocessing had NaNs/missing fields in some files
agg_spec = {k: v for k, v in agg_spec.items() if k in df.columns}

g = df.groupby(["timestamp", "rsu_id"], sort=False).agg(agg_spec)
g.columns = ["_".join(col).strip() for col in g.columns.to_flat_index()]
g = g.reset_index()

# Define RSU load target (what we forecast)
# Here: sum throughput_kbps of vehicles attached to RSU at time t
if "throughput_kbps_sum" not in g.columns:
    raise ValueError("Expected 'throughput_kbps_sum' after aggregation (target load).")
g = g.rename(columns={"throughput_kbps_sum": "rsu_load_kbps"})

# (Optional) fill NA for features (important for ML model stability)
feature_cols = [c for c in g.columns if c not in ["timestamp", "rsu_id", "rsu_load_kbps"]]
g[feature_cols] = g[feature_cols].fillna(0.0)
g["rsu_load_kbps"] = g["rsu_load_kbps"].fillna(0.0)

# Sort for time-based windowing
g = g.sort_values(["rsu_id", "timestamp"]).reset_index(drop=True)

# Save aggregated RSU table for inspection
g.to_parquet(OUT_DIR / "rsu_aggregated.parquet", index=False)

# -----------------------
# 3) Create supervised samples: X = past Tin steps of RSU features, y = next Tout steps of rsu_load_kbps
#    We do this per RSU, then concatenate across RSUs.
# -----------------------
def make_windows_for_one_rsu(df_r: pd.DataFrame, Tin: int, Tout: int):
    """Returns X, y, meta for one RSU (time-series windows)."""
    df_r = df_r.sort_values("timestamp").reset_index(drop=True)

    # Inputs at each t are current RSU-level features (NOT including target)
    Xmat = df_r[feature_cols].to_numpy(dtype=np.float32)
    yvec = df_r["rsu_load_kbps"].to_numpy(dtype=np.float32)

    T = len(df_r)
    n_samples = T - Tin - Tout + 1
    if n_samples <= 0:
        return None

    X = np.zeros((n_samples, Tin * len(feature_cols)), dtype=np.float32)
    y = np.zeros((n_samples, Tout), dtype=np.float32)

    # meta for later plotting/debug
    meta_ts = np.zeros((n_samples,), dtype=np.float64)

    for idx in range(n_samples):
        t0 = idx
        t_in_end = idx + Tin
        t_out_end = t_in_end + Tout

        X[idx, :] = Xmat[t0:t_in_end].reshape(-1)          # flatten Tin x F -> Tin*F
        y[idx, :] = yvec[t_in_end:t_out_end]               # next Tout steps
        meta_ts[idx] = df_r.loc[t_in_end - 1, "timestamp"] # anchor timestamp = last input time

    return X, y, meta_ts

Xs, Ys, RSUs, Tanchors = [], [], [], []
for rsu_id, df_r in g.groupby("rsu_id", sort=False):
    out = make_windows_for_one_rsu(df_r, Tin=Tin, Tout=Tout)
    if out is None:
        continue
    X, y, meta_ts = out
    Xs.append(X); Ys.append(y)
    RSUs.append(np.full((len(X),), str(rsu_id), dtype=object))
    Tanchors.append(meta_ts)

X_all = np.concatenate(Xs, axis=0)
y_all = np.concatenate(Ys, axis=0)
rsu_all = np.concatenate(RSUs, axis=0)
t_anchor_all = np.concatenate(Tanchors, axis=0)

print(f"Stage-1 dataset windows: X={X_all.shape}, y={y_all.shape} (Tout={Tout})")

# -----------------------
# 4) Strict time split (global by timestamp anchors)
#    We split by time over all RSUs to avoid leakage.
# -----------------------
# Sort windows by anchor time
order = np.argsort(t_anchor_all)
X_all = X_all[order]
y_all = y_all[order]
rsu_all = rsu_all[order]
t_anchor_all = t_anchor_all[order]

N = len(X_all)
n_train = int(N * TRAIN_RATIO)
n_val = int(N * VAL_RATIO)
n_test = N - n_train - n_val

X_train, y_train = X_all[:n_train], y_all[:n_train]
X_val, y_val     = X_all[n_train:n_train+n_val], y_all[n_train:n_train+n_val]
X_test, y_test   = X_all[n_train+n_val:], y_all[n_train+n_val:]

print(f"Split sizes: train={len(X_train)}, val={len(X_val)}, test={len(X_test)}")

# -----------------------
# 5) Train model (tabular multi-step)
# -----------------------
base = HistGradientBoostingRegressor(
    loss="absolute_error",     # robust MAE-like loss
    learning_rate=0.05,
    max_depth=6,
    max_iter=400,
    random_state=RANDOM_STATE,
)
model = MultiOutputRegressor(base, n_jobs=-1)
model.fit(X_train, y_train)

# -----------------------
# 6) Evaluate
# -----------------------
def eval_multi_step(name: str, y_true: np.ndarray, y_pred: np.ndarray):
    maes = []
    rmses = []
    for k in range(y_true.shape[1]):
        mae = mean_absolute_error(y_true[:, k], y_pred[:, k])
        rmse = mean_squared_error(y_true[:, k], y_pred[:, k], squared=False)
        maes.append(mae)
        rmses.append(rmse)
    print(f"\n[{name}] Per-horizon MAE:  {np.round(maes, 3)}")
    print(f"[{name}] Per-horizon RMSE: {np.round(rmses, 3)}")
    print(f"[{name}] Avg MAE={np.mean(maes):.4f}, Avg RMSE={np.mean(rmses):.4f}")

# Baseline: persistence (predict next Tout steps as last observed load in input window)
# Note: persistence uses ONLY aggregated series; here we approximate using y(t) at anchor time.
# Since y_all is aligned to anchor times, persistence is y_true shifted? We can compute from windows:
# A simple persistence predictor is: predict y(t+1..t+Tout) = y(t) (last observed load).
# We don't have y(t) in X, so rebuild from anchor-time load using g table:
load_lookup = g.set_index(["timestamp", "rsu_id"])["rsu_load_kbps"].to_dict()

def persistence_predict(t_anchor: np.ndarray, rsu_ids: np.ndarray, Tout: int) -> np.ndarray:
    yp = np.zeros((len(t_anchor), Tout), dtype=np.float32)
    for i in range(len(t_anchor)):
        key = (float(t_anchor[i]), str(rsu_ids[i]))
        last_load = load_lookup.get(key, 0.0)
        yp[i, :] = last_load
    return yp

# Validation predictions
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

# Persistence
t_val = t_anchor_all[n_train:n_train+n_val]
r_val = rsu_all[n_train:n_train+n_val]
y_val_persist = persistence_predict(t_val, r_val, Tout)

t_test = t_anchor_all[n_train+n_val:]
r_test = rsu_all[n_train+n_val:]
y_test_persist = persistence_predict(t_test, r_test, Tout)

eval_multi_step("VAL  (Model)", y_val, y_val_pred)
eval_multi_step("VAL  (Persist)", y_val, y_val_persist)

eval_multi_step("TEST (Model)", y_test, y_test_pred)
eval_multi_step("TEST (Persist)", y_test, y_test_persist)

# -----------------------
# 7) Save model + metadata (lightweight)
# -----------------------
import joblib

joblib.dump(model, OUT_DIR / "stage1_rsu_model.joblib")

meta = {
    "Tin": Tin,
    "Tout": Tout,
    "feature_cols": feature_cols,
    "train_ratio": TRAIN_RATIO,
    "val_ratio": VAL_RATIO,
    "parquet_path": str(PARQUET_PATH),
}
joblib.dump(meta, OUT_DIR / "stage1_meta.joblib")

print(f"\nSaved model to: {OUT_DIR / 'stage1_rsu_model.joblib'}")
print(f"Saved meta  to: {OUT_DIR / 'stage1_meta.joblib'}")
print(f"Saved RSU aggregated table to: {OUT_DIR / 'rsu_aggregated.parquet'}")


Stage-1 dataset windows: X=(41757, 400), y=(41757, 10) (Tout=10)
Split sizes: train=29229, val=4175, test=8353





[VAL  (Model)] Per-horizon MAE:  [1.238 1.992 2.228 1.855 2.561 2.238 2.218 3.085 2.928 3.096]
[VAL  (Model)] Per-horizon RMSE: [2.027 2.64  3.056 2.516 3.525 3.108 3.389 4.27  4.261 4.499]
[VAL  (Model)] Avg MAE=2.3438, Avg RMSE=3.3290

[VAL  (Persist)] Per-horizon MAE:  [0.085 0.16  0.207 0.244 0.285 0.323 0.363 0.402 0.434 0.463]
[VAL  (Persist)] Per-horizon RMSE: [0.519 0.755 0.915 1.045 1.16  1.264 1.361 1.453 1.534 1.609]
[VAL  (Persist)] Avg MAE=0.2967, Avg RMSE=1.1614

[TEST (Model)] Per-horizon MAE:  [5.131 4.566 4.948 4.82  5.1   4.178 4.429 4.746 3.55  3.735]
[TEST (Model)] Per-horizon RMSE: [5.954 5.446 5.906 5.677 6.073 5.111 5.531 5.759 4.417 4.618]
[TEST (Model)] Avg MAE=4.5202, Avg RMSE=5.4491

[TEST (Persist)] Per-horizon MAE:  [0.032 0.063 0.074 0.087 0.1   0.112 0.124 0.136 0.148 0.161]
[TEST (Persist)] Per-horizon RMSE: [0.235 0.338 0.409 0.47  0.523 0.572 0.617 0.658 0.697 0.735]
[TEST (Persist)] Avg MAE=0.1036, Avg RMSE=0.5255

Saved model to: data\stage1_rsu_for

In [8]:
"""
Step 2 — Stage 1 RSU Forecaster (NO trajectory prediction)
Add 5+ additional models, evaluate, and plot which one outperforms.

Models included (>=6 total):
1) Persistence baseline
2) Ridge (linear, strong baseline)
3) RandomForest
4) ExtraTrees
5) HistGradientBoosting (your previous default)
6) MLPRegressor
7) KNeighborsRegressor

Notes:
- All are sklearn models (no extra installs needed).
- Multi-step forecasting is handled with MultiOutputRegressor.
- Strict time split by anchor timestamp (no shuffling).
- Plots: Avg MAE and Avg RMSE on VAL and TEST.
"""

from __future__ import annotations

from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# -----------------------
# Config
# -----------------------
PARQUET_PATH = Path("data/processed/v2x_aggregated.parquet")
OUT_DIR = Path("data/stage1_rsu_forecaster_multi")
OUT_DIR.mkdir(parents=True, exist_ok=True)

Tin = 20     # 2.0s history @ 10Hz
Tout = 10    # 1.0s horizon @ 10Hz
TRAIN_RATIO = 0.7
VAL_RATIO = 0.1
RANDOM_STATE = 42

# -----------------------
# 1) Load
# -----------------------
df = pd.read_parquet(PARQUET_PATH)
df = df.dropna(subset=["timestamp", "rsu_id"]).copy()

required = {"timestamp", "rsu_id", "vehicle_id", "throughput_kbps"}
missing = required - set(df.columns)
if missing:
    raise ValueError(f"Missing required columns in parquet: {missing}")

# -----------------------
# 2) Aggregate to RSU-level features
# -----------------------
agg_spec = {
    "throughput_kbps": ["sum", "mean"],
    "packetsSent": ["sum", "mean"],
    "packetsReceived": ["sum", "mean"],
    "vehicle_id": ["nunique"],

    "sinr_db": ["mean", "median"],
    "cqi": ["mean", "median"],
    "pdr": ["mean", "median"],
    "rxPower_dbm": ["mean", "median"],

    "cbr": ["mean", "median"],
    "interference_dbm": ["mean", "median"],
    "distance_to_rsu_m": ["mean", "median"],
}
agg_spec = {k: v for k, v in agg_spec.items() if k in df.columns}

g = df.groupby(["timestamp", "rsu_id"], sort=False).agg(agg_spec)
g.columns = ["_".join(col).strip() for col in g.columns.to_flat_index()]
g = g.reset_index()

if "throughput_kbps_sum" not in g.columns:
    raise ValueError("Expected 'throughput_kbps_sum' after aggregation.")

g = g.rename(columns={"throughput_kbps_sum": "rsu_load_kbps"})

feature_cols = [c for c in g.columns if c not in ["timestamp", "rsu_id", "rsu_load_kbps"]]
g[feature_cols] = g[feature_cols].fillna(0.0)
g["rsu_load_kbps"] = g["rsu_load_kbps"].fillna(0.0)

g = g.sort_values(["rsu_id", "timestamp"]).reset_index(drop=True)
g.to_parquet(OUT_DIR / "rsu_aggregated.parquet", index=False)

# -----------------------
# 3) Windowing
# -----------------------
def make_windows_for_one_rsu(df_r: pd.DataFrame, Tin: int, Tout: int):
    df_r = df_r.sort_values("timestamp").reset_index(drop=True)
    Xmat = df_r[feature_cols].to_numpy(dtype=np.float32)
    yvec = df_r["rsu_load_kbps"].to_numpy(dtype=np.float32)

    T = len(df_r)
    n_samples = T - Tin - Tout + 1
    if n_samples <= 0:
        return None

    X = np.zeros((n_samples, Tin * len(feature_cols)), dtype=np.float32)
    y = np.zeros((n_samples, Tout), dtype=np.float32)
    meta_ts = np.zeros((n_samples,), dtype=np.float64)

    for idx in range(n_samples):
        t0 = idx
        t_in_end = idx + Tin
        t_out_end = t_in_end + Tout

        X[idx, :] = Xmat[t0:t_in_end].reshape(-1)
        y[idx, :] = yvec[t_in_end:t_out_end]
        meta_ts[idx] = df_r.loc[t_in_end - 1, "timestamp"]

    return X, y, meta_ts

Xs, Ys, RSUs, Tanchors = [], [], [], []
for rsu_id, df_r in g.groupby("rsu_id", sort=False):
    out = make_windows_for_one_rsu(df_r, Tin=Tin, Tout=Tout)
    if out is None:
        continue
    X, y, meta_ts = out
    Xs.append(X); Ys.append(y)
    RSUs.append(np.full((len(X),), str(rsu_id), dtype=object))
    Tanchors.append(meta_ts)

X_all = np.concatenate(Xs, axis=0)
y_all = np.concatenate(Ys, axis=0)
rsu_all = np.concatenate(RSUs, axis=0)
t_anchor_all = np.concatenate(Tanchors, axis=0)

print(f"Windows: X={X_all.shape}, y={y_all.shape}, features={len(feature_cols)}, Tin={Tin}, Tout={Tout}")

# -----------------------
# 4) Strict time split by anchor timestamp
# -----------------------
order = np.argsort(t_anchor_all)
X_all = X_all[order]
y_all = y_all[order]
rsu_all = rsu_all[order]
t_anchor_all = t_anchor_all[order]

N = len(X_all)
n_train = int(N * TRAIN_RATIO)
n_val = int(N * VAL_RATIO)

X_train, y_train = X_all[:n_train], y_all[:n_train]
X_val, y_val     = X_all[n_train:n_train+n_val], y_all[n_train:n_train+n_val]
X_test, y_test   = X_all[n_train+n_val:], y_all[n_train+n_val:]

t_val = t_anchor_all[n_train:n_train+n_val]
r_val = rsu_all[n_train:n_train+n_val]
t_test = t_anchor_all[n_train+n_val:]
r_test = rsu_all[n_train+n_val:]

print(f"Split sizes: train={len(X_train)}, val={len(X_val)}, test={len(X_test)}")

# -----------------------
# 5) Persistence baseline helper
# -----------------------
load_lookup = g.set_index(["timestamp", "rsu_id"])["rsu_load_kbps"].to_dict()

def persistence_predict(t_anchor: np.ndarray, rsu_ids: np.ndarray, Tout: int) -> np.ndarray:
    yp = np.zeros((len(t_anchor), Tout), dtype=np.float32)
    for i in range(len(t_anchor)):
        key = (float(t_anchor[i]), str(rsu_ids[i]))
        last_load = float(load_lookup.get(key, 0.0))
        yp[i, :] = last_load
    return yp

# -----------------------
# 6) Evaluation utilities
# -----------------------
def multi_step_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> dict:
    maes, rmses = [], []
    for k in range(y_true.shape[1]):
        maes.append(mean_absolute_error(y_true[:, k], y_pred[:, k]))
        rmses.append(mean_squared_error(y_true[:, k], y_pred[:, k], squared=False))
    return {
        "avg_mae": float(np.mean(maes)),
        "avg_rmse": float(np.mean(rmses)),
        "mae_per_h": maes,
        "rmse_per_h": rmses,
    }

def fit_predict_model(model, Xtr, ytr, Xv, Xt):
    model.fit(Xtr, ytr)
    return model.predict(Xv), model.predict(Xt)

# -----------------------
# 7) Define models (>=5 additional)
# -----------------------
models = {
    "Ridge": MultiOutputRegressor(
        Ridge(alpha=2.0, random_state=RANDOM_STATE)
    ),
    "RandomForest": MultiOutputRegressor(
        RandomForestRegressor(
            n_estimators=300,
            max_depth=None,
            min_samples_leaf=3,
            n_jobs=-1,
            random_state=RANDOM_STATE,
        ),
        n_jobs=-1
    ),
    "ExtraTrees": MultiOutputRegressor(
        ExtraTreesRegressor(
            n_estimators=500,
            max_depth=None,
            min_samples_leaf=2,
            n_jobs=-1,
            random_state=RANDOM_STATE,
        ),
        n_jobs=-1
    ),
    "HistGBDT": MultiOutputRegressor(
        HistGradientBoostingRegressor(
            loss="absolute_error",
            learning_rate=0.05,
            max_depth=6,
            max_iter=500,
            random_state=RANDOM_STATE,
        ),
        n_jobs=-1
    ),
    "MLP": MultiOutputRegressor(
        MLPRegressor(
            hidden_layer_sizes=(256, 128),
            activation="relu",
            solver="adam",
            alpha=1e-4,
            learning_rate_init=1e-3,
            max_iter=150,
            random_state=RANDOM_STATE,
            early_stopping=True,
            n_iter_no_change=10,
            validation_fraction=0.1,
        ),
        n_jobs=-1
    ),
    "KNN": MultiOutputRegressor(
        KNeighborsRegressor(
            n_neighbors=25,
            weights="distance",
            metric="minkowski",
            p=2,
        ),
        n_jobs=-1
    ),
}

# -----------------------
# 8) Run all models + baseline
# -----------------------
results = []

# Persistence
y_val_p = persistence_predict(t_val, r_val, Tout)
y_test_p = persistence_predict(t_test, r_test, Tout)
m_val = multi_step_metrics(y_val, y_val_p)
m_test = multi_step_metrics(y_test, y_test_p)
results.append({
    "model": "Persistence",
    "val_avg_mae": m_val["avg_mae"],
    "val_avg_rmse": m_val["avg_rmse"],
    "test_avg_mae": m_test["avg_mae"],
    "test_avg_rmse": m_test["avg_rmse"],
})

# Learned models
for name, mdl in models.items():
    print(f"\nTraining: {name}")
    y_val_pred, y_test_pred = fit_predict_model(mdl, X_train, y_train, X_val, X_test)

    m_val = multi_step_metrics(y_val, y_val_pred)
    m_test = multi_step_metrics(y_test, y_test_pred)

    results.append({
        "model": name,
        "val_avg_mae": m_val["avg_mae"],
        "val_avg_rmse": m_val["avg_rmse"],
        "test_avg_mae": m_test["avg_mae"],
        "test_avg_rmse": m_test["avg_rmse"],
    })

# Save results table
res_df = pd.DataFrame(results).sort_values("val_avg_mae").reset_index(drop=True)
print("\n=== Model ranking by VAL avg MAE (lower is better) ===")
print(res_df)

res_df.to_csv(OUT_DIR / "stage1_model_comparison.csv", index=False)

# -----------------------
# 9) Plot results (no explicit colors)
# -----------------------
def plot_metric(df_plot: pd.DataFrame, metric_col: str, title: str, out_path: Path):
    plt.figure(figsize=(10, 4))
    x = np.arange(len(df_plot))
    plt.bar(x, df_plot[metric_col].values)
    plt.xticks(x, df_plot["model"].values, rotation=30, ha="right")
    plt.ylabel(metric_col)
    plt.title(title)
    plt.tight_layout()
    plt.savefig(out_path, dpi=200)
    plt.close()

# Sort by each metric for readability in plots
plot_metric(res_df.sort_values("val_avg_mae"),  "val_avg_mae",  "Stage-1 RSU Forecaster — VAL Avg MAE",  OUT_DIR / "val_avg_mae.png")
plot_metric(res_df.sort_values("val_avg_rmse"), "val_avg_rmse", "Stage-1 RSU Forecaster — VAL Avg RMSE", OUT_DIR / "val_avg_rmse.png")
plot_metric(res_df.sort_values("test_avg_mae"),  "test_avg_mae",  "Stage-1 RSU Forecaster — TEST Avg MAE",  OUT_DIR / "test_avg_mae.png")
plot_metric(res_df.sort_values("test_avg_rmse"), "test_avg_rmse", "Stage-1 RSU Forecaster — TEST Avg RMSE", OUT_DIR / "test_avg_rmse.png")

print(f"\nSaved comparison CSV to: {OUT_DIR / 'stage1_model_comparison.csv'}")
print(f"Saved plots to: {OUT_DIR}")


Windows: X=(41757, 400), y=(41757, 10), features=20, Tin=20, Tout=10
Split sizes: train=29229, val=4175, test=8353

Training: Ridge





Training: RandomForest





Training: ExtraTrees


: 