In [1]:
import os

# List all .tsf files in the current directory
tsf_files = [f for f in os.listdir('.') if f.lower().endswith('.tsf')]

# Sort for readability
tsf_files.sort()

print(f"Found {len(tsf_files)} .tsf files:\n")
for f in tsf_files:
    print(f)


Found 3 .tsf files:

electricity_weekly_dataset.tsf
hospital_dataset.tsf
tourism_monthly_dataset.tsf


In [2]:
import os
import re
from pathlib import Path
from collections import defaultdict

TSF_FILES = [
    "hospital_dataset.tsf",
    "tourism_monthly_dataset.tsf",
    "electricity_weekly_dataset.tsf",
]

def parse_tsf(filepath, max_series_preview=3):
    """
    Lightweight .tsf inspector:
    - reads header comments (# ...)
    - reads @metadata fields (e.g., @frequency, @horizon, @missing, @equallength)
    - reads @attribute lines
    - samples a few @data rows to summarize series lengths and value parsing
    Does NOT load entire file into memory as a full dataframe (fast + safe).
    """
    meta = {}
    attributes = []
    header_comments = []
    in_data = False
    series_lengths = []
    series_names_preview = []
    start_timestamps_preview = []
    numeric_parse_failures = 0

    # TSF @attribute format: "@attribute <name> <type>"
    attr_pat = re.compile(r"^@attribute\s+(\S+)\s+(\S+)", re.IGNORECASE)

    with open(filepath, "r", encoding="utf-8", errors="replace") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            if line.startswith("#") and not in_data:
                header_comments.append(line)
                continue

            if line.lower().startswith("@data"):
                in_data = True
                continue

            if not in_data:
                if line.lower().startswith("@attribute"):
                    m = attr_pat.match(line)
                    if m:
                        attributes.append((m.group(1), m.group(2)))
                    continue

                if line.startswith("@"):
                    # e.g., @frequency weekly
                    parts = line.split(None, 1)
                    key = parts[0].lstrip("@").strip().lower()
                    val = parts[1].strip() if len(parts) > 1 else ""
                    meta[key] = val
                continue

            # Data row example:
            # series_name:start_timestamp:val1,val2,val3,...
            # Some datasets may omit timestamps or use different separators; handle robustly.
            if in_data:
                # stop after previewing a few series to keep it fast
                if len(series_lengths) >= 5000 and max_series_preview == 0:
                    break

                # Split first two ":" occurrences
                parts = line.split(":", 2)
                if len(parts) < 2:
                    continue

                sname = parts[0]
                series_names_preview.append(sname) if len(series_names_preview) < max_series_preview else None

                if len(parts) == 2:
                    # no timestamp field, just values
                    values_str = parts[1]
                else:
                    ts = parts[1]
                    values_str = parts[2]
                    if len(start_timestamps_preview) < max_series_preview:
                        start_timestamps_preview.append(ts)

                values = [v.strip() for v in values_str.split(",") if v.strip() != ""]
                series_lengths.append(len(values))

                # quick numeric parse check (sample first 20 values)
                for v in values[:20]:
                    try:
                        float(v)
                    except:
                        numeric_parse_failures += 1

                # only preview a few series rows (not entire file) for speed
                if len(series_lengths) >= max_series_preview and max_series_preview > 0:
                    # keep scanning to get length stats? If you want full stats, set max_series_preview=0
                    pass

    summary = {
        "file": os.path.basename(filepath),
        "meta": meta,
        "attributes": attributes,
        "header_comment_lines": len(header_comments),
        "series_count_estimate": len(series_lengths),
        "series_length_min": min(series_lengths) if series_lengths else None,
        "series_length_median": (sorted(series_lengths)[len(series_lengths)//2] if series_lengths else None),
        "series_length_max": max(series_lengths) if series_lengths else None,
        "numeric_parse_failures_in_sample": numeric_parse_failures,
        "series_name_preview": series_names_preview[:max_series_preview],
        "start_timestamp_preview": start_timestamps_preview[:max_series_preview],
    }
    return summary

def print_summary(s):
    print("=" * 90)
    print(f"FILE: {s['file']}")
    print("-" * 90)

    # Metadata fields commonly used in Monash TSF
    for k in ["relation", "frequency", "horizon", "missing", "equallength"]:
        if k in s["meta"]:
            print(f"@{k}: {s['meta'][k]}")
    # Print any additional meta keys not in the common list
    other_keys = [k for k in s["meta"].keys() if k not in {"relation","frequency","horizon","missing","equallength"}]
    if other_keys:
        print("Other @meta:")
        for k in sorted(other_keys):
            print(f"  @{k}: {s['meta'][k]}")

    print("\n@attributes:")
    if s["attributes"]:
        for name, typ in s["attributes"]:
            print(f"  - {name} ({typ})")
    else:
        print("  (none found)")

    print("\nData scan (limited):")
    print(f"  Series rows scanned: {s['series_count_estimate']}")
    print(f"  Series length min/median/max: {s['series_length_min']} / {s['series_length_median']} / {s['series_length_max']}")
    print(f"  Numeric parse failures (sampled values): {s['numeric_parse_failures_in_sample']}")

    print("\nPreview:")
    for i, name in enumerate(s["series_name_preview"]):
        ts = s["start_timestamp_preview"][i] if i < len(s["start_timestamp_preview"]) else "(no timestamp field)"
        print(f"  - series_name={name} | start_timestamp={ts}")

# --- RUN INSPECTION ---
for fn in TSF_FILES:
    path = Path("./") / fn
    if not path.exists():
        print(f"[MISSING] {fn} not found in current directory.")
        continue
    summary = parse_tsf(path, max_series_preview=3)
    print_summary(summary)


FILE: hospital_dataset.tsf
------------------------------------------------------------------------------------------
@relation: Hospital
@frequency: monthly
@missing: false
@equallength: true

@attributes:
  - series_name (string)
  - start_timestamp (date)

Data scan (limited):
  Series rows scanned: 767
  Series length min/median/max: 84 / 84 / 84
  Numeric parse failures (sampled values): 0

Preview:
  - series_name=T1 | start_timestamp=2000-01-01 00-00-00
  - series_name=T2 | start_timestamp=2000-01-01 00-00-00
  - series_name=T3 | start_timestamp=2000-01-01 00-00-00
FILE: tourism_monthly_dataset.tsf
------------------------------------------------------------------------------------------
@relation: Tourism
@frequency: monthly
@horizon: 24
@missing: false
@equallength: false

@attributes:
  - series_name (string)
  - start_timestamp (date)

Data scan (limited):
  Series rows scanned: 366
  Series length min/median/max: 91 / 330 / 333
  Numeric parse failures (sampled values): 0



In [3]:
import pandas as pd
from pathlib import Path
from datetime import datetime
import re

TSF_FILES = [
    "hospital_dataset.tsf",
    "tourism_monthly_dataset.tsf",
    "electricity_weekly_dataset.tsf",
]

def parse_tsf_to_long_df(path: str) -> pd.DataFrame:
    meta = {}
    in_data = False
    rows = []

    with open(path, "r", encoding="utf-8", errors="replace") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue

            if line.lower().startswith("@data"):
                in_data = True
                continue

            if not in_data:
                if line.startswith("@"):
                    parts = line.split(None, 1)
                    key = parts[0].lstrip("@").strip().lower()
                    val = parts[1].strip() if len(parts) > 1 else ""
                    meta[key] = val
                continue

            # data line: series_name:start_timestamp:val1,val2,...
            parts = line.split(":", 2)
            if len(parts) < 2:
                continue

            series_name = parts[0]
            if len(parts) == 2:
                start_ts = None
                values_str = parts[1]
            else:
                start_ts = parts[1]
                values_str = parts[2]

            values = [v.strip() for v in values_str.split(",") if v.strip() != ""]
            y = pd.to_numeric(pd.Series(values), errors="coerce")

            freq = meta.get("frequency", "").lower()
            if start_ts is None:
                # fallback: index-based timestamps
                t = pd.RangeIndex(len(y))
            else:
                start = pd.to_datetime(start_ts.replace("-", ":"), errors="coerce")  # tsf uses "00-00-00"
                if pd.isna(start):
                    start = pd.to_datetime(start_ts, errors="coerce")

                if freq == "monthly":
                    t = pd.date_range(start=start, periods=len(y), freq="MS")
                elif freq == "weekly":
                    t = pd.date_range(start=start, periods=len(y), freq="W-SUN")
                else:
                    # generic fallback
                    t = pd.date_range(start=start, periods=len(y), freq="D")

            df_part = pd.DataFrame({
                "dataset": Path(path).name,
                "series_name": series_name,
                "timestamp": t,
                "y": y.values
            })
            rows.append(df_part)

    out = pd.concat(rows, ignore_index=True)
    return out

dfs = {}
for fn in TSF_FILES:
    df = parse_tsf_to_long_df(fn)
    dfs[fn] = df
    print("="*80)
    print(fn)
    print("shape:", df.shape)
    print("columns:", df.columns.tolist())
    print("n_series:", df["series_name"].nunique())
    print("date_range:", df["timestamp"].min(), "->", df["timestamp"].max())
    print("missing_y:", df["y"].isna().sum())
    print(df.head(3))


hospital_dataset.tsf
shape: (64428, 4)
columns: ['dataset', 'series_name', 'timestamp', 'y']
n_series: 767
date_range: 2000-01-01 00:00:00+00:00 -> 2006-12-01 00:00:00+00:00
missing_y: 0
                dataset series_name                 timestamp   y
0  hospital_dataset.tsf          T1 2000-01-01 00:00:00+00:00  27
1  hospital_dataset.tsf          T1 2000-02-01 00:00:00+00:00  16
2  hospital_dataset.tsf          T1 2000-03-01 00:00:00+00:00  18
tourism_monthly_dataset.tsf
shape: (109280, 4)
columns: ['dataset', 'series_name', 'timestamp', 'y']
n_series: 366
date_range: 1979-01-01 00:00:00+00:00 -> 2007-09-01 00:00:00+00:00
missing_y: 0
                       dataset series_name                 timestamp  \
0  tourism_monthly_dataset.tsf          T1 1979-01-01 00:00:00+00:00   
1  tourism_monthly_dataset.tsf          T1 1979-02-01 00:00:00+00:00   
2  tourism_monthly_dataset.tsf          T1 1979-03-01 00:00:00+00:00   

           y  
0  1149.8700  
1  1053.8002  
2  1388.8798  
elect

In [4]:
import pandas as pd
from pandas.tseries.frequencies import to_offset

# -----------------------------
# CONFIG (LOCKED)
# -----------------------------
DATASETS = {
    "hospital_dataset.tsf": {
        "freq": "MS",      # monthly start
        "horizons": [6, 12],
        "min_train_periods": 48,  # 4 years
        "step": 3          # months between rollings
    },
    "tourism_monthly_dataset.tsf": {
        "freq": "MS",
        "horizons": [24],
        "min_train_periods": 96,  # 8 years (handles unequal length)
        "step": 6
    },
    "electricity_weekly_dataset.tsf": {
        "freq": "W-SUN",
        "horizons": [8],
        "min_train_periods": 104, # 2 years
        "step": 4                 # weeks
    }
}

def build_rolling_splits(df, dataset_name, cfg):
    """
    Builds rolling-origin splits PER SERIES.
    Returns a DataFrame with one row per split:
    series_name | split_id | train_end | test_start | test_end | horizon
    """
    rows = []
    freq = cfg["freq"]
    step = cfg["step"]
    min_train = cfg["min_train_periods"]

    for horizon in cfg["horizons"]:
        for s, g in df[df["dataset"] == dataset_name].groupby("series_name"):
            g = g.sort_values("timestamp")
            times = g["timestamp"].values
            n = len(times)

            # compute rolling cutoff indices
            cutoff_idxs = list(range(min_train - 1, n - horizon, step))
            for i, cut_idx in enumerate(cutoff_idxs):
                train_end = times[cut_idx]
                test_start = times[cut_idx + 1]
                test_end = times[cut_idx + horizon]

                rows.append({
                    "dataset": dataset_name,
                    "series_name": s,
                    "horizon": horizon,
                    "split_id": i,
                    "train_end": pd.Timestamp(train_end),
                    "test_start": pd.Timestamp(test_start),
                    "test_end": pd.Timestamp(test_end)
                })

    return pd.DataFrame(rows)

# -----------------------------
# BUILD SPLITS
# -----------------------------
all_splits = []
for name, cfg in DATASETS.items():
    df = dfs[name]
    splits = build_rolling_splits(df, name, cfg)
    all_splits.append(splits)

splits_df = pd.concat(all_splits, ignore_index=True)

# -----------------------------
# VERIFICATION SUMMARY
# -----------------------------
summary = (
    splits_df
    .groupby(["dataset", "horizon"])
    .agg(
        n_splits=("split_id", "nunique"),
        n_series=("series_name", "nunique"),
        first_train_end=("train_end", "min"),
        last_test_end=("test_end", "max")
    )
    .reset_index()
)

print("="*90)
print("ROLLING BACKTEST SUMMARY")
print("="*90)
print(summary)

# Leakage sanity check (should always be true)
leakage_check = (splits_df["train_end"] < splits_df["test_start"]).all()
print("\nLeakage check (train_end < test_start for all splits):", leakage_check)

print("\nSample splits:")
print(splits_df.head(5))


ROLLING BACKTEST SUMMARY
                          dataset  horizon  n_splits  n_series  \
0  electricity_weekly_dataset.tsf        8        12       321   
1            hospital_dataset.tsf        6        11       767   
2            hospital_dataset.tsf       12         9       767   
3     tourism_monthly_dataset.tsf       24        36       365   

  first_train_end last_test_end  
0      2013-12-22    2014-12-21  
1      2003-12-01    2006-12-01  
2      2003-12-01    2006-12-01  
3      1986-12-01    2007-06-01  

Leakage check (train_end < test_start for all splits): True

Sample splits:
                dataset series_name  horizon  split_id  train_end test_start  \
0  hospital_dataset.tsf          T1        6         0 2003-12-01 2004-01-01   
1  hospital_dataset.tsf          T1        6         1 2004-03-01 2004-04-01   
2  hospital_dataset.tsf          T1        6         2 2004-06-01 2004-07-01   
3  hospital_dataset.tsf          T1        6         3 2004-09-01 2004-10-01 

In [5]:
import numpy as np
import pandas as pd

# -----------------------------
# FIX TIMEZONE MISMATCH (LOCKED)
# Convert all timestamps in dfs + splits_df to tz-naive consistently
# -----------------------------
def make_tz_naive(ts: pd.Series) -> pd.Series:
    ts = pd.to_datetime(ts, errors="coerce")
    # If tz-aware, drop tz; if already naive, this is a no-op
    try:
        return ts.dt.tz_localize(None)
    except AttributeError:
        # In case it's already a DatetimeIndex-like without .dt
        return ts.dt.tz_localize(None)

# Apply to dfs
for k in list(dfs.keys()):
    dfs[k]["timestamp"] = pd.to_datetime(dfs[k]["timestamp"], errors="coerce")
    if getattr(dfs[k]["timestamp"].dt, "tz", None) is not None:
        dfs[k]["timestamp"] = dfs[k]["timestamp"].dt.tz_localize(None)

# Apply to splits_df
for col in ["train_end", "test_start", "test_end"]:
    splits_df[col] = pd.to_datetime(splits_df[col], errors="coerce")
    if getattr(splits_df[col].dt, "tz", None) is not None:
        splits_df[col] = splits_df[col].dt.tz_localize(None)

# -----------------------------
# SEASONAL NAIVE
# -----------------------------
def seasonal_naive_forecast(y_train: np.ndarray, horizon: int, season_lag: int) -> np.ndarray:
    """
    Seasonal naive forecast:
    repeats the last observed seasonal cycle.
    """
    y_train = np.asarray(y_train, dtype=float)
    if len(y_train) == 0:
        return np.full(horizon, np.nan)

    if len(y_train) < season_lag:
        # fallback to last value
        return np.repeat(y_train[-1], horizon)

    last_season = y_train[-season_lag:]  # length season_lag
    reps = int(np.ceil(horizon / season_lag))
    y_hat = np.tile(last_season, reps)[:horizon]
    return y_hat

def evaluate_seasonal_naive(df_long, splits_df, dataset_name, freq):
    season_lag = 12 if freq == "monthly" else 52
    df_ds = df_long[df_long["dataset"] == dataset_name]

    rows = []
    for _, s in splits_df[splits_df["dataset"] == dataset_name].iterrows():
        g = df_ds[df_ds["series_name"] == s["series_name"]].sort_values("timestamp")

        train = g[g["timestamp"] <= s["train_end"]]
        test  = g[(g["timestamp"] >= s["test_start"]) & (g["timestamp"] <= s["test_end"])]

        y_true = test["y"].astype(float).values
        y_hat  = seasonal_naive_forecast(train["y"].astype(float).values, int(s["horizon"]), season_lag)

        # Safety: ensure lengths match
        if len(y_true) != len(y_hat):
            # This should not happen; if it does, skip + record
            continue

        rows.append({
            "dataset": dataset_name,
            "horizon": int(s["horizon"]),
            "series_name": s["series_name"],
            "split_id": int(s["split_id"]),
            "mae": float(np.mean(np.abs(y_true - y_hat))),
            "rmse": float(np.sqrt(np.mean((y_true - y_hat) ** 2))),
        })

    return pd.DataFrame(rows)

# -----------------------------
# RUN BASELINE
# -----------------------------
baseline_df = pd.concat([
    evaluate_seasonal_naive(dfs["hospital_dataset.tsf"], splits_df, "hospital_dataset.tsf", freq="monthly"),
    evaluate_seasonal_naive(dfs["tourism_monthly_dataset.tsf"], splits_df, "tourism_monthly_dataset.tsf", freq="monthly"),
    evaluate_seasonal_naive(dfs["electricity_weekly_dataset.tsf"], splits_df, "electricity_weekly_dataset.tsf", freq="weekly"),
], ignore_index=True)

summary = (
    baseline_df
    .groupby(["dataset", "horizon"], as_index=False)
    .agg(MAE=("mae", "mean"), RMSE=("rmse", "mean"))
)

print("="*90)
print("SEASONAL NAIVE BASELINE RESULTS")
print("="*90)
print(summary)

print("\nRows evaluated:", len(baseline_df))
print("Any NaNs in metrics:", baseline_df[["mae", "rmse"]].isna().any().to_dict())


SEASONAL NAIVE BASELINE RESULTS
                          dataset  horizon           MAE          RMSE
0  electricity_weekly_dataset.tsf        8  33504.051629  39984.900267
1            hospital_dataset.tsf        6     21.819545     26.014796
2            hospital_dataset.tsf       12     21.788582     27.021620
3     tourism_monthly_dataset.tsf       24   2006.442369   2521.121274

Rows evaluated: 30358
Any NaNs in metrics: {'mae': False, 'rmse': False}


In [6]:
import numpy as np
import pandas as pd
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from joblib import Parallel, delayed
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings("ignore")

N_JOBS = -1
BATCH_SIZE = 50

DATASET = "hospital_dataset.tsf"
SEASONAL_PERIODS = 12

def ets_forecast(train_y, horizon):
    train_y = np.asarray(train_y, dtype=float)
    if len(train_y) < 2 * SEASONAL_PERIODS:
        return np.repeat(train_y[-1], horizon)

    model = ExponentialSmoothing(
        train_y,
        trend="add",
        seasonal="add",
        seasonal_periods=SEASONAL_PERIODS,
        initialization_method="estimated"
    )
    fit = model.fit(optimized=True)
    return fit.forecast(horizon)

def eval_one(row, grouped):
    g = grouped[row["series_name"]]

    train = g[g["timestamp"] <= row["train_end"]]
    test  = g[(g["timestamp"] >= row["test_start"]) & (g["timestamp"] <= row["test_end"])]

    y_true = test["y"].values
    y_hat = ets_forecast(train["y"].values, int(row["horizon"]))

    if len(y_true) != len(y_hat):
        return None

    return {
        "dataset": DATASET,
        "horizon": int(row["horizon"]),
        "series_name": row["series_name"],
        "split_id": int(row["split_id"]),
        "mae": float(np.mean(np.abs(y_true - y_hat))),
        "rmse": float(np.sqrt(np.mean((y_true - y_hat) ** 2))),
    }

# -----------------------------
# PREP
# -----------------------------
df_hosp = dfs[DATASET].copy()
df_hosp["timestamp"] = pd.to_datetime(df_hosp["timestamp"]).dt.tz_localize(None)

grouped = {
    s: g.sort_values("timestamp")
    for s, g in df_hosp.groupby("series_name")
}

tasks = splits_df[splits_df["dataset"] == DATASET].copy()
tasks["train_end"] = pd.to_datetime(tasks["train_end"]).dt.tz_localize(None)
tasks["test_start"] = pd.to_datetime(tasks["test_start"]).dt.tz_localize(None)
tasks["test_end"] = pd.to_datetime(tasks["test_end"]).dt.tz_localize(None)

print("Hospital ETS fits:", len(tasks))

# -----------------------------
# RUN
# -----------------------------
results = []
for start in tqdm(range(0, len(tasks), BATCH_SIZE), desc="Hospital ETS"):
    chunk = tasks.iloc[start:start+BATCH_SIZE]
    out = Parallel(n_jobs=N_JOBS)(
        delayed(eval_one)(row, grouped) for _, row in chunk.iterrows()
    )
    results.extend([r for r in out if r is not None])

ets_hospital_df = pd.DataFrame(results)

ets_hospital_summary = (
    ets_hospital_df
    .groupby(["dataset", "horizon"], as_index=False)
    .agg(MAE=("mae", "mean"), RMSE=("rmse", "mean"))
)

print("="*90)
print("HOSPITAL ETS RESULTS")
print("="*90)
print(ets_hospital_summary)


  from .autonotebook import tqdm as notebook_tqdm


Hospital ETS fits: 15340


Hospital ETS: 100%|██████████| 307/307 [17:01<00:00,  3.33s/it]

HOSPITAL ETS RESULTS
                dataset  horizon        MAE       RMSE
0  hospital_dataset.tsf        6  18.586508  21.778053
1  hospital_dataset.tsf       12  20.490773  24.421683





In [7]:
import os
os.environ["LIGHTGBM_VERBOSE"] = "-1"

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error

# -----------------------------
# CONFIG (LOCKED)
# -----------------------------
DATASET = "hospital_dataset.tsf"
HORIZONS = [6, 12]
LAGS = [1, 3, 6, 12]
ROLLS = [3, 6, 12]
QUANTILES = [0.1, 0.5, 0.9]

# -----------------------------
# FEATURE ENGINEERING
# -----------------------------
df = dfs[DATASET].copy()
df["timestamp"] = pd.to_datetime(df["timestamp"]).dt.tz_localize(None)
df = df.sort_values(["series_name", "timestamp"])
df["y"] = pd.to_numeric(df["y"], errors="coerce")

for lag in LAGS:
    df[f"lag_{lag}"] = df.groupby("series_name")["y"].shift(lag)

for r in ROLLS:
    df[f"roll_mean_{r}"] = df.groupby("series_name")["y"].shift(1).rolling(r).mean()
    df[f"roll_std_{r}"]  = df.groupby("series_name")["y"].shift(1).rolling(r).std()

feature_cols = [c for c in df.columns if c.startswith(("lag_", "roll_"))]

# -----------------------------
# BUILD GLOBAL TRAIN / TEST SETS
# -----------------------------
def build_global_sets(horizon):
    X_train, y_train = [], []
    X_test, y_test = [], []

    splits = splits_df[
        (splits_df["dataset"] == DATASET) &
        (splits_df["horizon"] == horizon)
    ]

    for _, s in splits.iterrows():
        g = df[df["series_name"] == s["series_name"]]

        train = g[g["timestamp"] <= s["train_end"]].dropna(subset=feature_cols)
        test  = g[(g["timestamp"] >= s["test_start"]) &
                  (g["timestamp"] <= s["test_end"])].dropna(subset=feature_cols)

        if len(train) == 0 or len(test) == 0:
            continue

        X_train.append(train[feature_cols])
        y_train.append(train["y"])

        X_test.append(test[feature_cols])
        y_test.append(test["y"])

    return (
        pd.concat(X_train, ignore_index=True),
        pd.concat(y_train, ignore_index=True),
        pd.concat(X_test, ignore_index=True),
        pd.concat(y_test, ignore_index=True)
    )

# -----------------------------
# TRAIN GLOBAL MODELS
# -----------------------------
results = []

for horizon in HORIZONS:
    Xtr, ytr, Xte, yte = build_global_sets(horizon)

    print(f"\nHorizon {horizon}: train={Xtr.shape}, test={Xte.shape}")

    for q in QUANTILES:
        model = lgb.LGBMRegressor(
            objective="quantile",
            alpha=q,
            n_estimators=300,
            learning_rate=0.05,
            num_leaves=31,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            n_jobs=-1,
            verbosity=-1
        )

        model.fit(Xtr, ytr)
        preds = model.predict(Xte)

        if q == 0.5:
            mae = mean_absolute_error(yte, preds)
            mse = mean_squared_error(yte, preds)
            rmse = float(np.sqrt(mse))
            results.append({
                "dataset": DATASET,
                "horizon": horizon,
                "model": "LightGBM (global p50)",
                "MAE": float(mae),
                "RMSE": rmse
            })

results_df = pd.DataFrame(results)

print("="*90)
print("HOSPITAL ML (GLOBAL, FAST, CLEAN)")
print("="*90)
print(results_df)



Horizon 6: train=(430287, 10), test=(50622, 10)

Horizon 12: train=(331344, 10), test=(82836, 10)
HOSPITAL ML (GLOBAL, FAST, CLEAN)
                dataset  horizon                  model        MAE       RMSE
0  hospital_dataset.tsf        6  LightGBM (global p50)  17.674858  57.376329
1  hospital_dataset.tsf       12  LightGBM (global p50)  18.052381  59.459905


In [8]:
import numpy as np
import pandas as pd

# Rebuild test set for each horizon (same as training code)
def build_global_sets_only_test(horizon):
    X_test, y_test = [], []

    splits = splits_df[
        (splits_df["dataset"] == "hospital_dataset.tsf") &
        (splits_df["horizon"] == horizon)
    ]

    for _, s in splits.iterrows():
        g = df[df["series_name"] == s["series_name"]]
        test  = g[(g["timestamp"] >= s["test_start"]) &
                  (g["timestamp"] <= s["test_end"])].dropna(subset=feature_cols)
        if len(test) == 0:
            continue
        X_test.append(test[feature_cols])
        y_test.append(test["y"])

    return pd.concat(X_test, ignore_index=True), pd.concat(y_test, ignore_index=True)

def quick_stats(arr, name):
    arr = np.asarray(arr, dtype=float)
    print(f"\n{name}")
    print("count:", len(arr))
    print("min/median/max:", np.min(arr), np.median(arr), np.max(arr))
    print("p90/p95/p99:", np.percentile(arr, 90), np.percentile(arr, 95), np.percentile(arr, 99))

for horizon in [6, 12]:
    Xte, yte = build_global_sets_only_test(horizon)
    quick_stats(yte.values, f"y_true stats (h={horizon})")

    # Refit p50 model quickly (same params as before) and get preds
    import lightgbm as lgb
    from sklearn.metrics import mean_squared_error, mean_absolute_error

    Xtr, ytr, _, _ = build_global_sets(horizon)

    model = lgb.LGBMRegressor(
        objective="quantile",
        alpha=0.5,
        n_estimators=300,
        learning_rate=0.05,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        verbosity=-1
    )
    model.fit(Xtr, ytr)
    preds = model.predict(Xte)

    quick_stats(preds, f"y_pred stats (h={horizon})")

    abs_err = np.abs(yte.values - preds)
    quick_stats(abs_err, f"|error| stats (h={horizon})")

    mae = mean_absolute_error(yte, preds)
    rmse = np.sqrt(mean_squared_error(yte, preds))
    print(f"\nMAE={mae:.3f}  RMSE={rmse:.3f}")

    # show top 10 worst errors
    worst_idx = np.argsort(abs_err)[-10:][::-1]
    worst_df = pd.DataFrame({
        "y_true": yte.values[worst_idx],
        "y_pred": preds[worst_idx],
        "abs_err": abs_err[worst_idx]
    })
    print("\nTop 10 worst errors:")
    print(worst_df)



y_true stats (h=6)
count: 50622
min/median/max: 1.0 40.0 12090.0
p90/p95/p99: 659.0 1253.0 3862.0

y_pred stats (h=6)
count: 50622
min/median/max: 5.448407845116466 40.06192431443854 11060.372240075365
p90/p95/p99: 663.9669210209174 1234.3124054914492 3916.482745301505

|error| stats (h=6)
count: 50622
min/median/max: 0.00012116444123577708 5.6394890137455995 1590.8779229571255
p90/p95/p99: 35.86512907157753 65.14987818461168 206.426182731866

MAE=17.675  RMSE=57.376

Top 10 worst errors:
   y_true        y_pred      abs_err
0    9407   7816.122077  1590.877923
1    9407   7816.122077  1590.877923
2    8899   7311.957456  1587.042544
3    8899   7311.957456  1587.042544
4    6846   8337.593563  1491.593563
5   10154   8765.282527  1388.717473
6   12090  10867.136985  1222.863015
7   12090  10867.136985  1222.863015
8    6357   7487.833060  1130.833060
9    6357   7487.833060  1130.833060

y_true stats (h=12)
count: 82836
min/median/max: 1.0 40.0 12090.0
p90/p95/p99: 659.0 1253.0 3860.

In [9]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error

DATASET = "hospital_dataset.tsf"
HORIZONS = [6, 12]

results = []

for horizon in HORIZONS:
    Xtr, ytr, Xte, yte = build_global_sets(horizon)

    # log-transform target
    ytr_log = np.log1p(ytr)

    print(f"\nHorizon {horizon}: train={Xtr.shape}, test={Xte.shape}")

    model = lgb.LGBMRegressor(
        objective="quantile",
        alpha=0.5,
        n_estimators=300,
        learning_rate=0.05,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        verbosity=-1
    )

    model.fit(Xtr, ytr_log)

    # predict and invert
    preds_log = model.predict(Xte)
    preds = np.expm1(preds_log)

    mae = mean_absolute_error(yte, preds)
    rmse = np.sqrt(mean_squared_error(yte, preds))

    results.append({
        "dataset": DATASET,
        "horizon": horizon,
        "model": "LightGBM (global p50, log target)",
        "MAE": float(mae),
        "RMSE": float(rmse)
    })

results_df = pd.DataFrame(results)

print("=" * 90)
print("HOSPITAL ML (GLOBAL, LOG TARGET)")
print("=" * 90)
print(results_df)



Horizon 6: train=(430287, 10), test=(50622, 10)

Horizon 12: train=(331344, 10), test=(82836, 10)
HOSPITAL ML (GLOBAL, LOG TARGET)
                dataset  horizon                              model  \
0  hospital_dataset.tsf        6  LightGBM (global p50, log target)   
1  hospital_dataset.tsf       12  LightGBM (global p50, log target)   

         MAE       RMSE  
0  17.340464  55.842261  
1  17.978900  60.299794  


In [10]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error

DATASET = "hospital_dataset.tsf"
HORIZONS = [6, 12]

# feature config (must match your earlier global model)
LAGS  = [1, 3, 6, 12]
ROLLS = [3, 6, 12]

# -----------------------------
# Load + tz-fix
# -----------------------------
df = dfs[DATASET].copy()
df["timestamp"] = pd.to_datetime(df["timestamp"]).dt.tz_localize(None)
df["y"] = pd.to_numeric(df["y"], errors="coerce")
df = df.sort_values(["series_name", "timestamp"])

spl = splits_df[splits_df["dataset"] == DATASET].copy()
for c in ["train_end", "test_start", "test_end"]:
    spl[c] = pd.to_datetime(spl[c]).dt.tz_localize(None)

# -----------------------------
# Build features (GUARANTEED)
# -----------------------------
for lag in LAGS:
    df[f"lag_{lag}"] = df.groupby("series_name")["y"].shift(lag)

for r in ROLLS:
    df[f"roll_mean_{r}"] = df.groupby("series_name")["y"].shift(1).rolling(r).mean()
    df[f"roll_std_{r}"]  = df.groupby("series_name")["y"].shift(1).rolling(r).std()

feature_cols = [c for c in df.columns if c.startswith(("lag_", "roll_"))]

# sanity
missing_feats = [c for c in [f"lag_{x}" for x in LAGS] + 
                 [f"roll_mean_{r}" for r in ROLLS] + [f"roll_std_{r}" for r in ROLLS]
                 if c not in df.columns]
if missing_feats:
    raise RuntimeError(f"Feature build failed, missing: {missing_feats}")

# -----------------------------
# Bucket by TRAIN history only
# -----------------------------
train_cutoff = spl["train_end"].min()
train_hist = df[df["timestamp"] <= train_cutoff]

series_median = train_hist.groupby("series_name")["y"].median()
q1, q2 = series_median.quantile([0.33, 0.66])

def size_bucket(x):
    if x <= q1:
        return "small"
    elif x <= q2:
        return "medium"
    else:
        return "large"

series_bucket = series_median.apply(size_bucket)
df["bucket"] = df["series_name"].map(series_bucket)

# -----------------------------
# Build global train/test per bucket
# -----------------------------
def build_bucket_sets(horizon, bucket):
    Xtr, ytr, Xte, yte = [], [], [], []
    splits_h = spl[spl["horizon"] == horizon]

    for _, s in splits_h.iterrows():
        g = df[(df["series_name"] == s["series_name"]) & (df["bucket"] == bucket)]
        if g.empty:
            continue

        train = g[g["timestamp"] <= s["train_end"]].dropna(subset=feature_cols)
        test  = g[(g["timestamp"] >= s["test_start"]) &
                  (g["timestamp"] <= s["test_end"])].dropna(subset=feature_cols)

        if len(train) == 0 or len(test) == 0:
            continue

        Xtr.append(train[feature_cols]); ytr.append(train["y"])
        Xte.append(test[feature_cols]);  yte.append(test["y"])

    if not Xtr:
        return None

    return (
        pd.concat(Xtr, ignore_index=True),
        pd.concat(ytr, ignore_index=True),
        pd.concat(Xte, ignore_index=True),
        pd.concat(yte, ignore_index=True),
    )

# -----------------------------
# Train 1 model per (horizon, bucket)
# -----------------------------
results = []

for horizon in HORIZONS:
    for bucket in ["small", "medium", "large"]:
        data = build_bucket_sets(horizon, bucket)
        if data is None:
            print(f"H{horizon} | {bucket:<6} | SKIP (no data)")
            continue

        Xtr, ytr, Xte, yte = data
        ytr_log = np.log1p(ytr)  # heavy-tail handling

        print(f"H{horizon} | {bucket:<6} | train={Xtr.shape} test={Xte.shape}")

        model = lgb.LGBMRegressor(
            objective="quantile",
            alpha=0.5,
            n_estimators=300,
            learning_rate=0.05,
            num_leaves=31,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            n_jobs=-1,
            verbosity=-1
        )

        model.fit(Xtr, ytr_log)
        preds = np.expm1(model.predict(Xte))

        mae = mean_absolute_error(yte, preds)
        rmse = np.sqrt(mean_squared_error(yte, preds))

        results.append({
            "dataset": DATASET,
            "horizon": horizon,
            "bucket": bucket,
            "model": "LightGBM (segmented global, log target)",
            "MAE": float(mae),
            "RMSE": float(rmse)
        })

results_df = pd.DataFrame(results).sort_values(["horizon", "bucket"])

print("=" * 100)
print("SEGMENTED GLOBAL LIGHTGBM — HOSPITAL")
print("=" * 100)
print(results_df)


H6 | small  | train=(149787, 10) test=(17622, 10)
H6 | medium | train=(134079, 10) test=(15774, 10)
H6 | large  | train=(146421, 10) test=(17226, 10)
H12 | small  | train=(115344, 10) test=(28836, 10)
H12 | medium | train=(103248, 10) test=(25812, 10)
H12 | large  | train=(112752, 10) test=(28188, 10)
SEGMENTED GLOBAL LIGHTGBM — HOSPITAL
                dataset  horizon  bucket  \
2  hospital_dataset.tsf        6   large   
1  hospital_dataset.tsf        6  medium   
0  hospital_dataset.tsf        6   small   
5  hospital_dataset.tsf       12   large   
4  hospital_dataset.tsf       12  medium   
3  hospital_dataset.tsf       12   small   

                                     model        MAE       RMSE  
2  LightGBM (segmented global, log target)  38.948141  89.263097  
1  LightGBM (segmented global, log target)   6.252343   8.555698  
0  LightGBM (segmented global, log target)   3.622809   4.793891  
5  LightGBM (segmented global, log target)  39.824063  89.118869  
4  LightGBM (seg

In [11]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error

# =============================
# SEGMENTED GLOBAL LIGHTGBM — HOSPITAL (END-TO-END)
# - tz-safe
# - builds lag/rolling features
# - buckets series by training-median y (small/medium/large)
# - trains 1 global model per (horizon, bucket) on log1p(y)
# - reports bucket metrics + weighted overall metrics
# =============================

DATASET = "hospital_dataset.tsf"
HORIZONS = [6, 12]

LAGS  = [1, 3, 6, 12]
ROLLS = [3, 6, 12]

def rmse(y_true, y_pred):
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))

# -----------------------------
# 1) Load data + tz-fix
# -----------------------------
df = dfs[DATASET].copy()
df["timestamp"] = pd.to_datetime(df["timestamp"]).dt.tz_localize(None)
df["y"] = pd.to_numeric(df["y"], errors="coerce")
df = df.sort_values(["series_name", "timestamp"]).reset_index(drop=True)

spl = splits_df[splits_df["dataset"] == DATASET].copy()
for c in ["train_end", "test_start", "test_end"]:
    spl[c] = pd.to_datetime(spl[c]).dt.tz_localize(None)

# -----------------------------
# 2) Build features (guaranteed)
# -----------------------------
for lag in LAGS:
    df[f"lag_{lag}"] = df.groupby("series_name")["y"].shift(lag)

for r in ROLLS:
    # IMPORTANT: shift(1) ensures rolling uses ONLY past values
    df[f"roll_mean_{r}"] = (
        df.groupby("series_name")["y"]
          .shift(1)
          .rolling(r)
          .mean()
          .reset_index(level=0, drop=True)
    )
    df[f"roll_std_{r}"] = (
        df.groupby("series_name")["y"]
          .shift(1)
          .rolling(r)
          .std()
          .reset_index(level=0, drop=True)
    )

feature_cols = [c for c in df.columns if c.startswith(("lag_", "roll_"))]

# -----------------------------
# 3) Bucket series by TRAIN history only
# -----------------------------
train_cutoff = spl["train_end"].min()
train_hist = df[df["timestamp"] <= train_cutoff]

series_median = train_hist.groupby("series_name")["y"].median()
q1, q2 = series_median.quantile([0.33, 0.66])

def size_bucket(v):
    if v <= q1:
        return "small"
    elif v <= q2:
        return "medium"
    else:
        return "large"

series_bucket = series_median.apply(size_bucket)
df["bucket"] = df["series_name"].map(series_bucket)

# -----------------------------
# 4) Build global train/test per bucket for a horizon
# -----------------------------
def build_bucket_sets(horizon, bucket):
    Xtr, ytr, Xte, yte = [], [], [], []

    splits_h = spl[spl["horizon"] == horizon]
    for _, s in splits_h.iterrows():
        g = df[(df["series_name"] == s["series_name"]) & (df["bucket"] == bucket)]
        if g.empty:
            continue

        train = g[g["timestamp"] <= s["train_end"]].dropna(subset=feature_cols)
        test  = g[(g["timestamp"] >= s["test_start"]) & (g["timestamp"] <= s["test_end"])].dropna(subset=feature_cols)

        if len(train) == 0 or len(test) == 0:
            continue

        Xtr.append(train[feature_cols]); ytr.append(train["y"])
        Xte.append(test[feature_cols]);  yte.append(test["y"])

    if not Xtr:
        return None

    return (
        pd.concat(Xtr, ignore_index=True),
        pd.concat(ytr, ignore_index=True),
        pd.concat(Xte, ignore_index=True),
        pd.concat(yte, ignore_index=True),
    )

# -----------------------------
# 5) Train segmented global models + compute metrics
# -----------------------------
bucket_metrics = []
bucket_counts = []  # store test row counts for weighted aggregation
models = {}         # (horizon, bucket) -> fitted model (optional)

for horizon in HORIZONS:
    for bucket in ["small", "medium", "large"]:
        data = build_bucket_sets(horizon, bucket)
        if data is None:
            print(f"H{horizon} | {bucket:<6} | SKIP (no data)")
            continue

        Xtr, ytr, Xte, yte = data
        n_test = len(yte)

        print(f"H{horizon} | {bucket:<6} | train={Xtr.shape} test={Xte.shape}")

        # log target for heavy tail
        ytr_log = np.log1p(ytr)

        model = lgb.LGBMRegressor(
            objective="quantile",
            alpha=0.5,
            n_estimators=300,
            learning_rate=0.05,
            num_leaves=31,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            n_jobs=-1,
            verbosity=-1
        )
        model.fit(Xtr, ytr_log)
        preds = np.expm1(model.predict(Xte))

        models[(horizon, bucket)] = model

        mae_val = float(mean_absolute_error(yte, preds))
        rmse_val = rmse(yte, preds)

        bucket_metrics.append({
            "dataset": DATASET,
            "horizon": horizon,
            "bucket": bucket,
            "model": "LightGBM (segmented global, log target)",
            "MAE": mae_val,
            "RMSE": rmse_val
        })
        bucket_counts.append({
            "horizon": horizon,
            "bucket": bucket,
            "n_test": int(n_test)
        })

bucket_df = pd.DataFrame(bucket_metrics).sort_values(["horizon", "bucket"]).reset_index(drop=True)
counts_df = pd.DataFrame(bucket_counts)

print("\n" + "=" * 100)
print("SEGMENTED GLOBAL LIGHTGBM — BUCKET METRICS")
print("=" * 100)
print(bucket_df)

# -----------------------------
# 6) Weighted overall metrics (per horizon)
# Weighted RMSE uses MSE aggregation:
#   RMSE_weighted = sqrt( sum(n_i * RMSE_i^2) / sum(n_i) )
# -----------------------------
weighted_rows = []
for horizon in HORIZONS:
    sub = bucket_df[bucket_df["horizon"] == horizon].merge(
        counts_df[counts_df["horizon"] == horizon],
        on=["horizon", "bucket"],
        how="inner"
    )
    total_n = sub["n_test"].sum()

    w_mae = (sub["MAE"] * sub["n_test"]).sum() / total_n
    w_rmse = float(np.sqrt(((sub["RMSE"] ** 2) * sub["n_test"]).sum() / total_n))

    weighted_rows.append({
        "dataset": DATASET,
        "horizon": horizon,
        "model": "LightGBM (segmented global, weighted)",
        "Weighted_MAE": float(w_mae),
        "Weighted_RMSE": float(w_rmse),
        "Total_Test_Rows": int(total_n)
    })

weighted_df = pd.DataFrame(weighted_rows).sort_values("horizon").reset_index(drop=True)

print("\n" + "=" * 100)
print("SEGMENTED GLOBAL LIGHTGBM — WEIGHTED OVERALL METRICS")
print("=" * 100)
print(weighted_df)


H6 | small  | train=(149787, 10) test=(17622, 10)
H6 | medium | train=(134079, 10) test=(15774, 10)
H6 | large  | train=(146421, 10) test=(17226, 10)
H12 | small  | train=(115344, 10) test=(28836, 10)
H12 | medium | train=(103248, 10) test=(25812, 10)
H12 | large  | train=(112752, 10) test=(28188, 10)

SEGMENTED GLOBAL LIGHTGBM — BUCKET METRICS
                dataset  horizon  bucket  \
0  hospital_dataset.tsf        6   large   
1  hospital_dataset.tsf        6  medium   
2  hospital_dataset.tsf        6   small   
3  hospital_dataset.tsf       12   large   
4  hospital_dataset.tsf       12  medium   
5  hospital_dataset.tsf       12   small   

                                     model        MAE       RMSE  
0  LightGBM (segmented global, log target)  38.948141  89.263097  
1  LightGBM (segmented global, log target)   6.252343   8.555698  
2  LightGBM (segmented global, log target)   3.622809   4.793891  
3  LightGBM (segmented global, log target)  39.824063  89.118869  
4  LightG

In [13]:
# ============================================================
# INTERPRETABILITY BLOCK (DROP-IN, FIXED FOR SHAP ADDITIVITY ERROR)
# - sets check_additivity=False
# - keeps everything else the same
# ============================================================

import os
import numpy as np
import pandas as pd

# ---------- REQUIRED INPUTS ----------
req = ["df", "spl", "feature_cols", "models", "DATASET", "HORIZONS"]
missing = [r for r in req if r not in globals()]
if missing:
    raise RuntimeError(
        f"Missing required variables: {missing}\n"
        "Run your segmented LightGBM training cell first (the one that creates df/spl/feature_cols/models)."
    )

OUT_DIR = "./artifacts_interpretability"
os.makedirs(OUT_DIR, exist_ok=True)

MAX_BACKGROUND = 2000
MAX_EXPLAIN    = 10000
RANDOM_SEED    = 42
rng = np.random.default_rng(RANDOM_SEED)

def build_bucket_test_matrix(horizon: int, bucket: str):
    Xte_list, yte_list = [], []
    splits_h = spl[spl["horizon"] == horizon]

    for _, s in splits_h.iterrows():
        g = df[(df["series_name"] == s["series_name"]) & (df["bucket"] == bucket)]
        if g.empty:
            continue

        test = g[(g["timestamp"] >= s["test_start"]) & (g["timestamp"] <= s["test_end"])].dropna(subset=feature_cols)
        if len(test) == 0:
            continue

        Xte_list.append(test[feature_cols])
        yte_list.append(test["y"])

    if not Xte_list:
        return None

    Xte = pd.concat(Xte_list, ignore_index=True)
    yte = pd.concat(yte_list, ignore_index=True)
    return Xte, yte

def sample_df(X: pd.DataFrame, max_rows: int):
    if len(X) <= max_rows:
        return X.reset_index(drop=True)
    idx = rng.choice(len(X), size=max_rows, replace=False)
    return X.iloc[idx].reset_index(drop=True)

# ---------- TRY SHAP ----------
use_shap = True
try:
    import shap
except Exception:
    use_shap = False

all_rows = []

if use_shap:
    print("Using SHAP TreeExplainer (check_additivity=False).")
    for horizon in HORIZONS:
        for bucket in ["small", "medium", "large"]:
            key = (horizon, bucket)
            if key not in models:
                print(f"SKIP: no model for {key}")
                continue

            data = build_bucket_test_matrix(horizon, bucket)
            if data is None:
                print(f"SKIP: no test data for H{horizon} {bucket}")
                continue

            Xte, _ = data

            X_bg = sample_df(Xte, MAX_BACKGROUND)
            X_ex = sample_df(Xte, MAX_EXPLAIN)

            model = models[key]

            # IMPORTANT FIX: disable additivity check
            explainer = shap.TreeExplainer(model, data=X_bg, feature_perturbation="interventional")
            shap_vals = explainer.shap_values(X_ex, check_additivity=False)

            imp = np.mean(np.abs(shap_vals), axis=0)
            imp_df = pd.DataFrame({"feature": feature_cols, "mean_abs_shap": imp})
            imp_df = imp_df.sort_values("mean_abs_shap", ascending=False).reset_index(drop=True)

            imp_df["dataset"] = DATASET
            imp_df["horizon"] = horizon
            imp_df["bucket"] = bucket
            all_rows.append(imp_df)

            out_path = os.path.join(OUT_DIR, f"shap_importance_{DATASET}_H{horizon}_{bucket}.csv")
            imp_df.to_csv(out_path, index=False)
            print(f"Saved: {out_path} | rows explained={len(X_ex)} | background={len(X_bg)}")

    if not all_rows:
        raise RuntimeError("No SHAP outputs produced (check models/test data).")

    shap_all = pd.concat(all_rows, ignore_index=True)

    TOPK = 15
    topk = (
        shap_all
        .sort_values(["horizon", "bucket", "mean_abs_shap"], ascending=[True, True, False])
        .groupby(["dataset", "horizon", "bucket"], as_index=False)
        .head(TOPK)
        .reset_index(drop=True)
    )

    print("\n" + "=" * 100)
    print("TOP SHAP FEATURES (mean |SHAP|) — per horizon & bucket")
    print("=" * 100)
    print(topk)

    topk_path = os.path.join(OUT_DIR, f"shap_top{TOPK}_{DATASET}.csv")
    topk.to_csv(topk_path, index=False)
    print(f"\nSaved: {topk_path}")

else:
    # fallback: permutation importance
    from sklearn.inspection import permutation_importance
    from sklearn.metrics import mean_absolute_error

    print("SHAP not available -> using permutation importance.")

    def mae_scorer(model, X, y):
        pred = np.expm1(model.predict(X))
        return mean_absolute_error(y, pred)

    def score_fn(est, X, y):
        return -mae_scorer(est, X, y)

    for horizon in HORIZONS:
        for bucket in ["small", "medium", "large"]:
            key = (horizon, bucket)
            if key not in models:
                print(f"SKIP: no model for {key}")
                continue

            data = build_bucket_test_matrix(horizon, bucket)
            if data is None:
                print(f"SKIP: no test data for H{horizon} {bucket}")
                continue

            Xte, yte = data
            X_ex = sample_df(Xte, min(MAX_EXPLAIN, 5000))
            y_ex = yte.iloc[X_ex.index].reset_index(drop=True)

            model = models[key]

            perm = permutation_importance(
                model, X_ex, y_ex,
                scoring=score_fn,
                n_repeats=5,
                random_state=RANDOM_SEED,
                n_jobs=-1
            )

            imp_df = pd.DataFrame({
                "feature": feature_cols,
                "mean_importance": perm.importances_mean,
                "std_importance": perm.importances_std
            }).sort_values("mean_importance", ascending=False).reset_index(drop=True)

            imp_df["dataset"] = DATASET
            imp_df["horizon"] = horizon
            imp_df["bucket"] = bucket
            all_rows.append(imp_df)

            out_path = os.path.join(OUT_DIR, f"perm_importance_{DATASET}_H{horizon}_{bucket}.csv")
            imp_df.to_csv(out_path, index=False)
            print(f"Saved: {out_path} | rows used={len(X_ex)}")

    perm_all = pd.concat(all_rows, ignore_index=True)
    TOPK = 15
    topk = (
        perm_all
        .sort_values(["horizon", "bucket", "mean_importance"], ascending=[True, True, False])
        .groupby(["dataset", "horizon", "bucket"], as_index=False)
        .head(TOPK)
        .reset_index(drop=True)
    )

    print("\n" + "=" * 100)
    print("TOP PERMUTATION FEATURES — per horizon & bucket")
    print("=" * 100)
    print(topk)

    topk_path = os.path.join(OUT_DIR, f"perm_top{TOPK}_{DATASET}.csv")
    topk.to_csv(topk_path, index=False)
    print(f"\nSaved: {topk_path}")

print("\nDone.")


Using SHAP TreeExplainer (check_additivity=False).




Saved: ./artifacts_interpretability\shap_importance_hospital_dataset.tsf_H6_small.csv | rows explained=10000 | background=2000




Saved: ./artifacts_interpretability\shap_importance_hospital_dataset.tsf_H6_medium.csv | rows explained=10000 | background=2000




Saved: ./artifacts_interpretability\shap_importance_hospital_dataset.tsf_H6_large.csv | rows explained=10000 | background=2000




Saved: ./artifacts_interpretability\shap_importance_hospital_dataset.tsf_H12_small.csv | rows explained=10000 | background=2000




Saved: ./artifacts_interpretability\shap_importance_hospital_dataset.tsf_H12_medium.csv | rows explained=10000 | background=2000




Saved: ./artifacts_interpretability\shap_importance_hospital_dataset.tsf_H12_large.csv | rows explained=10000 | background=2000

TOP SHAP FEATURES (mean |SHAP|) — per horizon & bucket
         feature  mean_abs_shap               dataset  horizon  bucket
0          lag_1       0.367885  hospital_dataset.tsf        6   large
1    roll_mean_3       0.305724  hospital_dataset.tsf        6   large
2         lag_12       0.111043  hospital_dataset.tsf        6   large
3   roll_mean_12       0.105898  hospital_dataset.tsf        6   large
4    roll_mean_6       0.051217  hospital_dataset.tsf        6   large
5          lag_3       0.015017  hospital_dataset.tsf        6   large
6          lag_6       0.013568  hospital_dataset.tsf        6   large
7    roll_std_12       0.004639  hospital_dataset.tsf        6   large
8     roll_std_6       0.004469  hospital_dataset.tsf        6   large
9     roll_std_3       0.004287  hospital_dataset.tsf        6   large
10   roll_mean_3       0.131671  ho

In [14]:
# ============================================================
# LLM PACKAGING BLOCK (DROP-IN)
# Saves ONLY model outputs + interpretability summaries to ./LLM_outputs
# so an LLM (DeepSeek via LangChain/LangGraph) can narrate results later.
#
# Assumes you already ran:
# - baseline_df, ets_hospital_df (optional but handled)
# - results_df (global LGBM) (optional)
# - bucket_df, weighted_df (segmented LGBM)  (recommended)
# - SHAP outputs saved under ./artifacts_interpretability (your block already does this)
# - dfs, splits_df exist
# ============================================================

import os, json, re
from pathlib import Path
import numpy as np
import pandas as pd

OUT_DIR = Path("./LLM_outputs")
OUT_DIR.mkdir(parents=True, exist_ok=True)

def _safe_write_df(df: pd.DataFrame, filename: str):
    if df is None:
        return
    if not isinstance(df, pd.DataFrame) or df.empty:
        return
    out_path = OUT_DIR / filename
    df.to_csv(out_path, index=False)
    print(f"[saved] {out_path}  rows={len(df):,}  cols={df.shape[1]}")

def _safe_write_json(obj, filename: str):
    out_path = OUT_DIR / filename
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(obj, f, indent=2, default=str)
    print(f"[saved] {out_path}")

def _maybe_get_df(name: str):
    return globals().get(name, None)

# ------------------------------------------------------------
# 1) Save core metric tables (if present in your namespace)
# ------------------------------------------------------------
_safe_write_df(_maybe_get_df("summary"), "baseline_summary.csv")  # if you reused name 'summary' later, ignore
_safe_write_df(_maybe_get_df("baseline_df"), "baseline_seasonal_naive_rows.csv")
_safe_write_df(_maybe_get_df("ets_hospital_summary") if "ets_hospital_summary" in globals() else None, "ets_hospital_summary.csv")
_safe_write_df(_maybe_get_df("ets_hospital_df"), "ets_hospital_rows.csv")
_safe_write_df(_maybe_get_df("results_df"), "lgbm_global_metrics.csv")       # your "HOSPITAL ML (GLOBAL...)" table
_safe_write_df(_maybe_get_df("bucket_df"), "lgbm_segmented_bucket_metrics.csv")
_safe_write_df(_maybe_get_df("weighted_df"), "lgbm_segmented_weighted_metrics.csv")

# ------------------------------------------------------------
# 2) Save SHAP/perm summaries (reads your artifact folder)
# ------------------------------------------------------------
ART_DIR = Path("./artifacts_interpretability")
shap_top_files = sorted(ART_DIR.glob("shap_top*_*.csv"))
shap_imp_files = sorted(ART_DIR.glob("shap_importance_*.csv"))
perm_top_files = sorted(ART_DIR.glob("perm_top*_*.csv"))
perm_imp_files = sorted(ART_DIR.glob("perm_importance_*.csv"))

def _concat_csvs(files):
    if not files:
        return None
    dfs_ = []
    for fp in files:
        try:
            df_ = pd.read_csv(fp)
            df_["source_file"] = fp.name
            dfs_.append(df_)
        except Exception as e:
            print(f"[warn] failed reading {fp}: {e}")
    if not dfs_:
        return None
    return pd.concat(dfs_, ignore_index=True)

shap_top_all = _concat_csvs(shap_top_files)
shap_imp_all = _concat_csvs(shap_imp_files)
perm_top_all = _concat_csvs(perm_top_files)
perm_imp_all = _concat_csvs(perm_imp_files)

_safe_write_df(shap_top_all, "interpret_shap_top_all.csv")
_safe_write_df(shap_imp_all, "interpret_shap_importance_all.csv")
_safe_write_df(perm_top_all, "interpret_perm_top_all.csv")
_safe_write_df(perm_imp_all, "interpret_perm_importance_all.csv")

# ------------------------------------------------------------
# 3) Build a single compact JSON "LLM packet" (best for LangGraph)
#    Keeps it small + structured (no raw full time series).
# ------------------------------------------------------------
packet = {}

# minimal dataset metadata + split summary
try:
    split_summary = (
        splits_df.groupby(["dataset", "horizon"], as_index=False)
        .agg(
            n_splits=("split_id", "nunique"),
            n_series=("series_name", "nunique"),
            first_train_end=("train_end", "min"),
            last_test_end=("test_end", "max"),
        )
    )
    packet["split_summary"] = split_summary.to_dict(orient="records")
    _safe_write_df(split_summary, "split_summary.csv")
except Exception as e:
    print(f"[warn] split summary not saved: {e}")

# store model metric tables (if available)
def _df_to_records(df):
    if df is None or not isinstance(df, pd.DataFrame) or df.empty:
        return None
    return df.replace({np.nan: None}).to_dict(orient="records")

packet["metrics"] = {
    "seasonal_naive_rows": None,     # deliberately not included (too big) unless you want it
    "seasonal_naive_summary": _df_to_records(_maybe_get_df("summary")) if isinstance(_maybe_get_df("summary"), pd.DataFrame) else None,
    "ets_hospital_summary": _df_to_records(_maybe_get_df("ets_hospital_summary")),
    "lgbm_global_metrics": _df_to_records(_maybe_get_df("results_df")),
    "lgbm_segmented_bucket_metrics": _df_to_records(_maybe_get_df("bucket_df")),
    "lgbm_segmented_weighted_metrics": _df_to_records(_maybe_get_df("weighted_df")),
}

# extract a clean "top features" table if present
top_features = None
if shap_top_all is not None and not shap_top_all.empty:
    # Normalize columns (your shap_top has: feature, mean_abs_shap, dataset, horizon, bucket)
    cols = [c for c in ["dataset", "horizon", "bucket", "feature", "mean_abs_shap"] if c in shap_top_all.columns]
    top_features = shap_top_all[cols].copy()
    top_features = top_features.sort_values(["dataset","horizon","bucket","mean_abs_shap"], ascending=[True, True, True, False])
    packet["interpretability"] = {"method": "shap", "top_features": top_features.to_dict(orient="records")}
elif perm_top_all is not None and not perm_top_all.empty:
    cols = [c for c in ["dataset", "horizon", "bucket", "feature", "mean_importance"] if c in perm_top_all.columns]
    top_features = perm_top_all[cols].copy()
    top_features = top_features.sort_values(["dataset","horizon","bucket","mean_importance"], ascending=[True, True, True, False])
    packet["interpretability"] = {"method": "permutation", "top_features": top_features.to_dict(orient="records")}
else:
    packet["interpretability"] = {"method": None, "top_features": None}

_safe_write_json(packet, "llm_packet.json")

# ------------------------------------------------------------
# 4) Write prompt templates your LLM will consume (no API calls here)
# ------------------------------------------------------------
SYSTEM_PROMPT = """You are an analytics assistant. You must ONLY interpret provided results.
You must NOT forecast new values, tune models, or invent numbers.
Use the structured JSON packet and CSV summaries as ground truth.

Output format:
1) Executive summary (5-8 bullets)
2) Model comparison (baseline vs ETS vs ML)
3) Segment insights (small/medium/large)
4) Horizon insights (6 vs 12 months where available)
5) Key drivers from interpretability (top features)
6) Limitations + next steps
"""

USER_PROMPT = """Interpret the attached project outputs.

You are given:
- split_summary.csv
- lgbm_segmented_weighted_metrics.csv (if present)
- lgbm_segmented_bucket_metrics.csv (if present)
- ets_hospital_summary.csv (if present)
- baseline_seasonal_naive_rows.csv and/or baseline_summary.csv (if present)
- interpret_shap_top_all.csv (if present)
- llm_packet.json

Rules:
- Do not hallucinate metrics; only cite what is present.
- If something is missing, say it is missing.
- Explain why the segmented model may outperform or underperform the global model.
- Use interpretability results to explain *why* the model behaves as it does.
"""

(OUT_DIR / "prompt_system.txt").write_text(SYSTEM_PROMPT, encoding="utf-8")
(OUT_DIR / "prompt_user.txt").write_text(USER_PROMPT, encoding="utf-8")
print(f"[saved] {OUT_DIR / 'prompt_system.txt'}")
print(f"[saved] {OUT_DIR / 'prompt_user.txt'}")

# ------------------------------------------------------------
# 5) Create a simple manifest for LangGraph ingestion
# ------------------------------------------------------------
manifest = {
    "llm_packet": str(OUT_DIR / "llm_packet.json"),
    "tables": {
        "split_summary": str(OUT_DIR / "split_summary.csv") if (OUT_DIR / "split_summary.csv").exists() else None,
        "baseline_summary": str(OUT_DIR / "baseline_summary.csv") if (OUT_DIR / "baseline_summary.csv").exists() else None,
        "ets_hospital_summary": str(OUT_DIR / "ets_hospital_summary.csv") if (OUT_DIR / "ets_hospital_summary.csv").exists() else None,
        "lgbm_global_metrics": str(OUT_DIR / "lgbm_global_metrics.csv") if (OUT_DIR / "lgbm_global_metrics.csv").exists() else None,
        "lgbm_segmented_bucket_metrics": str(OUT_DIR / "lgbm_segmented_bucket_metrics.csv") if (OUT_DIR / "lgbm_segmented_bucket_metrics.csv").exists() else None,
        "lgbm_segmented_weighted_metrics": str(OUT_DIR / "lgbm_segmented_weighted_metrics.csv") if (OUT_DIR / "lgbm_segmented_weighted_metrics.csv").exists() else None,
        "interpret_shap_top_all": str(OUT_DIR / "interpret_shap_top_all.csv") if (OUT_DIR / "interpret_shap_top_all.csv").exists() else None,
    },
    "prompts": {
        "system": str(OUT_DIR / "prompt_system.txt"),
        "user": str(OUT_DIR / "prompt_user.txt"),
    }
}
_safe_write_json(manifest, "manifest.json")

print("\n✅ LLM packaging complete. Next step: build the LangChain/LangGraph pipeline that reads manifest.json and generates narratives.")


[saved] LLM_outputs\baseline_summary.csv  rows=4  cols=4
[saved] LLM_outputs\baseline_seasonal_naive_rows.csv  rows=30,358  cols=6
[saved] LLM_outputs\ets_hospital_summary.csv  rows=2  cols=4
[saved] LLM_outputs\ets_hospital_rows.csv  rows=15,340  cols=6
[saved] LLM_outputs\lgbm_global_metrics.csv  rows=6  cols=6
[saved] LLM_outputs\lgbm_segmented_bucket_metrics.csv  rows=6  cols=6
[saved] LLM_outputs\lgbm_segmented_weighted_metrics.csv  rows=2  cols=6
[saved] LLM_outputs\interpret_shap_top_all.csv  rows=60  cols=6
[saved] LLM_outputs\interpret_shap_importance_all.csv  rows=60  cols=6
[saved] LLM_outputs\split_summary.csv  rows=4  cols=6
[saved] LLM_outputs\llm_packet.json
[saved] LLM_outputs\prompt_system.txt
[saved] LLM_outputs\prompt_user.txt
[saved] LLM_outputs\manifest.json

✅ LLM packaging complete. Next step: build the LangChain/LangGraph pipeline that reads manifest.json and generates narratives.


In [22]:
import os
import json
import pandas as pd
from datetime import datetime

# ---------------------------------------------------------
# CONFIG: OUTPUT SETUP
# ---------------------------------------------------------
INPUT_DIR = "./LLM_input"
os.makedirs(INPUT_DIR, exist_ok=True)

# ---------------------------------------------------------
# 1. GATHER DATA FROM MEMORY
# ---------------------------------------------------------
# We assume these variables exist from your previous cells:
# - summary (Baseline)
# - ets_hospital_summary (ETS)
# - results_df (Global LGBM)
# - bucket_df (Segmented LGBM buckets)
# - weighted_df (Segmented LGBM weighted)
# - topk (SHAP top features, optional)

data_packet = {}

def safe_save(df, filename):
    """Saves DataFrame to CSV and adds to packet list"""
    if df is not None and isinstance(df, pd.DataFrame) and not df.empty:
        path = os.path.join(INPUT_DIR, filename)
        df.to_csv(path, index=False)
        print(f"[Saved] {filename}")
        return df.to_csv(index=False) # Return string for direct prompt usage later
    return None

print("PREPARING CONTEXT FOR DEEPSEEK...\n")

# Save Baseline
data_packet['baseline'] = safe_save(globals().get('summary'), "baseline_metrics.csv")

# Save ETS
data_packet['ets'] = safe_save(globals().get('ets_hospital_summary'), "ets_metrics.csv")

# Save Global ML
data_packet['ml_global'] = safe_save(globals().get('results_df'), "ml_global_metrics.csv")

# Save Segmented ML
data_packet['ml_segmented_buckets'] = safe_save(globals().get('bucket_df'), "ml_segmented_buckets.csv")
data_packet['ml_segmented_weighted'] = safe_save(globals().get('weighted_df'), "ml_segmented_weighted.csv")

# Save Interpretability (SHAP)
# Note: 'topk' is the variable name from your SHAP cell
data_packet['shap_drivers'] = safe_save(globals().get('topk'), "shap_top_drivers.csv")

# ---------------------------------------------------------
# 2. CREATE META-CONTEXT
# ---------------------------------------------------------
context_note = {
    "project_scope": "Hospital Capacity Planning (Aggregated Time Series)",
    "dataset": "hospital_dataset.tsf",
    "horizons_months": [6, 12],
    "constraints": [
        "Do not forecast values.",
        "Compare models based on MAE (robustness) and RMSE (outlier penalty).",
        "Explain WHY segmented models might outperform global ones using the provided metrics."
    ],
    "generated_at": datetime.now().isoformat()
}

with open(os.path.join(INPUT_DIR, "context.json"), "w") as f:
    json.dump(context_note, f, indent=2)

print("\n✅ Data successfully saved to ./LLM_input")

PREPARING CONTEXT FOR DEEPSEEK...

[Saved] baseline_metrics.csv
[Saved] ets_metrics.csv
[Saved] ml_global_metrics.csv
[Saved] ml_segmented_buckets.csv
[Saved] ml_segmented_weighted.csv
[Saved] shap_top_drivers.csv

✅ Data successfully saved to ./LLM_input


In [None]:
import os
import json
from openai import OpenAI

# ---------------------------------------------------------
# CONFIG: API & OUTPUT
# ---------------------------------------------------------
# REPLACE WITH YOUR ACTUAL KEY
DEEPSEEK_API_KEY = "add_key_here" 

OUTPUT_DIR = "./Deepseek_results"
INPUT_DIR = "./LLM_input"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ---------------------------------------------------------
# 1. LOAD CONTEXT
# ---------------------------------------------------------
def read_file_content(filename):
    path = os.path.join(INPUT_DIR, filename)
    if os.path.exists(path):
        with open(path, "r") as f:
            return f.read()
    return "Data not found."

context_str = f"""
PROJECT METADATA:
{read_file_content('context.json')}

BASELINE METRICS (Seasonal Naive):
{read_file_content('baseline_metrics.csv')}

CLASSICAL METRICS (ETS):
{read_file_content('ets_metrics.csv')}

ML METRICS (Global LightGBM):
{read_file_content('ml_global_metrics.csv')}

ML METRICS (Segmented LightGBM - by size bucket):
{read_file_content('ml_segmented_buckets.csv')}

ML METRICS (Segmented LightGBM - Weighted Average):
{read_file_content('ml_segmented_weighted.csv')}

KEY DRIVERS (SHAP Importance):
{read_file_content('shap_top_drivers.csv')}
"""

# ---------------------------------------------------------
# 2. CONSTRUCT PROMPT
# ---------------------------------------------------------
system_prompt = """You are a Senior Data Science Lead presenting to Hospital Stakeholders.
Your goal is to interpret the provided forecasting results.

STRUCTURE YOUR RESPONSE AS FOLLOWS:
1. **Executive Summary**: The single most important finding (e.g., "Segmented models reduced error by X% compared to baseline...").
2. **Model Leaderboard**: Compare Seasonal Naive vs. ETS vs. ML. Use specific numbers (MAE/RMSE).
3. **Why Segmentation Matters**: Explain the difference in performance between Small vs. Large hospitals based on the metrics.
4. **Key Drivers**: Use the SHAP data to explain what drives the forecast (e.g., "Large hospitals are driven by recent trends (lag_1), while smaller ones depend on yearly averages").
5. **Strategic Recommendation**: Based *only* on the data, what should the planning team use?"""

user_prompt = f"""
Here is the experimental data from our latest run on the Hospital Dataset.
Please generate the report.

DATA CONTEXT:
{context_str}
"""

# ---------------------------------------------------------
# 3. CALL DEEPSEEK API
# ---------------------------------------------------------
print("⏳ Contacting DeepSeek API...")

client = OpenAI(
    api_key=DEEPSEEK_API_KEY, 
    base_url="https://api.deepseek.com"
)

try:
    response = client.chat.completions.create(
        model="deepseek-chat",  # or "deepseek-reasoner" for Chain of Thought
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.3, # Keep it factual
        stream=False
    )

    narrative = response.choices[0].message.content
    
    # ---------------------------------------------------------
    # 4. SAVE RESULT
    # ---------------------------------------------------------
    out_path = os.path.join(OUTPUT_DIR, "hospital_forecast_narrative.md")
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(narrative)
    
    print("-" * 80)
    print(narrative)
    print("-" * 80)
    print(f"\n✅ Report saved to: {out_path}")

except Exception as e:
    print(f"\n❌ API Error: {e}")
    print("Check your API key and internet connection.")

⏳ Contacting DeepSeek API...
--------------------------------------------------------------------------------
**1. Executive Summary**  
Segmented modeling by hospital size dramatically improves forecast accuracy for small and medium hospitals, but a global ML model fails to capture the volatility of large hospitals, resulting in high errors. The weighted average of segmented ML models still underperforms classical ETS, which remains the best overall approach for this dataset.

**2. Model Leaderboard**  
For the 6-month horizon (MAE / RMSE):  
- **Seasonal Naive**: 21.82 / 26.01  
- **ETS**: 18.59 / 21.78  
- **Segmented ML (Weighted)**: 16.46 / 52.37  

For the 12-month horizon:  
- **Seasonal Naive**: 21.79 / 27.02  
- **ETS**: 20.49 / 24.42  
- **Segmented ML (Weighted)**: 16.84 / 52.30  

**Interpretation**: ETS outperforms Seasonal Naive on both MAE and RMSE, indicating better trend and seasonality capture. The segmented ML model has a lower MAE than ETS, but its RMSE is more than