> **Prerequisites:** run `outlier_test_dataset.ipynb` first to generate `test_traces.csv` and `test_labels.csv` in the `generated_data/` directory (the paths this notebook reads from).

In [None]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from IPython.display import display

# Outlier Detection — Method Comparison

Evaluates lightweight outlier detection methods against the labelled test set produced by `outlier_test_dataset.ipynb`.  
Primary metric: **F1 score** on the outlier class.

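| Method | Principle |
|---|---|
| **z-score Δod** | Robust z-score on first differences |
| **IQR Δod** | Tukey IQR fence on first differences |
| **Hampel** | Sliding-window median/MAD identifier |
| **spline residuals** | Smooth spline fit → residual MAD z-score |
| **isolation forest** | Anomaly score from random partitioning trees |
| **ECOD** | Empirical-CDF outlier score on rolling-mean residual, OD and Δod (`pyod`) |
| **COPOD** | Copula-based outlier score on the same three features (`pyod`) |
| **LOF** | Local outlier factor on OD and Δod (`pyod`) |
| **growthcurves** | Sliding-window IQR check (`growthcurves.preprocessing.out_of_iqr`) |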

In [None]:
# Load the labelled test set (traces + per-point labels) from generated_data/.
traces_df = pd.read_csv("generated_data/test_traces.csv")
labels_df = pd.read_csv("generated_data/test_labels.csv")

# Boolean ground-truth mask taken from the "is_outlier" column.
# NOTE(review): assumed to be row-aligned with traces_df — confirm against the
# generator notebook.
gt = labels_df["is_outlier"].astype(bool)
n_out = int(gt.sum())

# One-line summary of what was loaded.
n_points = len(traces_df)
n_curves = traces_df["curve_id"].nunique()
outlier_pct = 100 * n_out / n_points
print(
    f"Loaded  {n_points:,} points · "
    f"{n_curves} curves · "
    f"{n_out} ground-truth outliers ({outlier_pct:.2f}%)"
)

In [None]:
# pip install pyod   ← needed for ECOD, COPOD, LOF
# growthcurves package must be installed: pip install -e ../


def _mad_z(x: np.ndarray) -> np.ndarray:
    """Return the absolute robust z-score of every element of *x*.

    Each element's absolute deviation from the median is divided by
    ``1.4826 * MAD``. When the MAD is below ``1e-12`` an all-zero array of
    length ``len(x)`` is returned instead of dividing.
    """
    centre = np.median(x)
    abs_dev = np.abs(x - centre)
    spread = np.median(abs_dev)
    if spread < 1e-12:
        # Degenerate spread: report "no deviation" rather than divide by ~0.
        return np.zeros(len(x))
    return abs_dev / (1.4826 * spread)


def _fold_flag(scores: np.ndarray, k: float, noise_pct: float = 50) -> np.ndarray:
    """Return a boolean mask of scores more than *k* times the noise floor.

    The noise floor is the *noise_pct*-th percentile of *scores*. If that
    floor is below ``1e-12`` nothing is flagged (all-False mask).
    """
    floor = np.percentile(scores, noise_pct)
    return np.zeros(len(scores), dtype=bool) if floor < 1e-12 else (scores / floor) > k


def _mad_flag(scores: np.ndarray, k: float) -> np.ndarray:
    """Return a boolean mask of the scores whose MAD z-score exceeds *k*."""
    robust_z = _mad_z(scores)
    return robust_z > k


def m_zscore_diff(t, od, k=8.0):
    """Flag points whose absolute first difference of *od* is unusually large.

    The first difference is padded with ``od[0]`` so the output has one entry
    per point (the first difference is 0). Thresholding is delegated to
    ``_fold_flag`` with factor *k*; despite the method's name no z-score is
    computed here. *t* is not used.
    """
    step = np.diff(od, prepend=od[0])
    return _fold_flag(np.abs(step), k)

def m_iqr_diff(t, od, k=8.0):
    """Flag points whose first difference lies far from the median difference.

    The score is ``|Δod - median(Δod)|`` (Δod padded with ``od[0]``), passed to
    ``_fold_flag`` with factor *k*. *t* is not used.
    """
    step = np.diff(od, prepend=od[0])
    centred = np.abs(step - np.median(step))
    return _fold_flag(centred, k)

def m_hampel(t, od, window=15, k=3.0):
    """Hampel identifier: flag points far from their local median.

    For each point a window of up to ``window // 2`` neighbours on each side
    (truncated at the array ends) is taken; the point is flagged when its
    deviation from the window median exceeds *k* times ``1.4826 * MAD`` of
    that window. Windows whose MAD is not above ``1e-12`` never flag.
    *t* is not used.
    """
    n = len(od)
    half = window // 2
    flags = np.zeros(n, dtype=bool)
    for i, value in enumerate(od):
        lo = max(0, i - half)
        hi = min(n, i + half + 1)
        neighbours = od[lo:hi]
        centre = np.median(neighbours)
        spread = np.median(np.abs(neighbours - centre))
        if spread > 1e-12:
            flags[i] = abs(value - centre) / (1.4826 * spread) > k
    return flags

def m_spline_residuals(t, od, k=3.5):
    """Flag points with a large absolute residual against a fitted spline.

    The spline comes from ``growthcurves`` (``fit_spline`` with
    ``smooth="fast"``); its value at *t* is exponentiated before being
    compared with *od*. Residuals are thresholded with ``_mad_flag`` at *k*.
    Returns an all-False mask when the fit returns ``None``.
    """
    from growthcurves.non_parametric import fit_spline
    from growthcurves.models import spline_from_params

    fit = fit_spline(t, od, smooth="fast")
    if fit is None:
        return np.zeros(len(t), dtype=bool)

    spline = spline_from_params(fit["params"])
    # NOTE(review): exp() suggests the spline models log(OD) — confirm against
    # growthcurves before relying on that.
    fitted_od = np.exp(spline(t))
    return _mad_flag(np.abs(od - fitted_od), k)

def m_isolation_forest(t, od, k=10.0):
    """Flag points using an isolation forest fitted on (od, Δod).

    The negated ``score_samples`` output is shifted so its minimum is 0 and
    then thresholded with ``_fold_flag`` at *k*. The forest uses a fixed
    ``random_state`` so repeated calls give the same result. *t* is not used.
    """
    from sklearn.ensemble import IsolationForest

    features = np.column_stack([od, np.diff(od, prepend=od[0])])
    forest = IsolationForest(n_estimators=200, contamination="auto", random_state=42)
    forest.fit(features)
    anomaly = -forest.score_samples(features)
    return _fold_flag(anomaly - anomaly.min(), k)

def m_ecod(t, od, k=3.5, window=15):
    """Flag points via pyod's ECOD on |rolling-mean residual|, od and Δod.

    The residual is ``od[i]`` minus the mean of a centred window of up to
    ``window // 2`` neighbours per side (truncated at the ends). The detector's
    ``decision_scores_`` are thresholded with ``_mad_flag`` at *k*.
    *t* is not used.
    """
    from pyod.models.ecod import ECOD

    n = len(od)
    half = window // 2
    residual = np.zeros(n)
    for i in range(n):
        lo, hi = max(0, i - half), min(n, i + half + 1)
        residual[i] = od[i] - od[lo:hi].mean()

    step = np.diff(od, prepend=od[0])
    features = np.column_stack([np.abs(residual), od, step])
    detector = ECOD()
    detector.fit(features)
    return _mad_flag(detector.decision_scores_, k)

def m_copod(t, od, k=3.5, window=15):
    """Flag points via pyod's COPOD on |rolling-mean residual|, od and Δod.

    The residual is ``od[i]`` minus the mean of a centred window of up to
    ``window // 2`` neighbours per side (truncated at the ends). The detector's
    ``decision_scores_`` are thresholded with ``_mad_flag`` at *k*.
    *t* is not used.
    """
    from pyod.models.copod import COPOD

    n = len(od)
    half = window // 2
    residual = np.zeros(n)
    for i in range(n):
        lo, hi = max(0, i - half), min(n, i + half + 1)
        residual[i] = od[i] - od[lo:hi].mean()

    step = np.diff(od, prepend=od[0])
    features = np.column_stack([np.abs(residual), od, step])
    detector = COPOD()
    detector.fit(features)
    return _mad_flag(detector.decision_scores_, k)

def m_lof(t, od, k=3.5):
    """Flag points via pyod's LOF (default settings) on the features (od, Δod).

    The detector's ``decision_scores_`` are thresholded with ``_mad_flag`` at
    *k*. *t* is not used.
    """
    from pyod.models.lof import LOF

    step = np.diff(od, prepend=od[0])
    features = np.column_stack([od, step])
    detector = LOF()
    detector.fit(features)
    return _mad_flag(detector.decision_scores_, k)

def m_growthcurves(t, od, window_size=15, factor=1.5):
    """Delegate to ``growthcurves.preprocessing.out_of_iqr`` (sliding-window IQR).

    *window_size* and *factor* are forwarded unchanged; *t* is not used.
    """
    from growthcurves.preprocessing import out_of_iqr

    flags = out_of_iqr(od, window_size=window_size, factor=factor)
    return flags


# Registry of detection methods: display name → callable invoked as fn(t, od)
# whose result is cast to a per-point boolean outlier mask by the evaluation
# cells. Insertion order is reused below to pair each method with its colour.
METHODS = {
    "z-score Δod":      m_zscore_diff,
    "IQR Δod":          m_iqr_diff,
    "Hampel":           m_hampel,
    "spline residuals": m_spline_residuals,
    "isolation forest": m_isolation_forest,
    "ECOD":             m_ecod,
    "COPOD":            m_copod,
    "LOF":              m_lof,
    "growthcurves":     m_growthcurves,
}

# One plot colour per method, listed in the same order as METHODS.
METHOD_COLORS = [
    "#3B82F6",  # z-score Δod    — blue
    "#F97316",  # IQR Δod        — orange
    "#10B981",  # Hampel          — green
    "#8B5CF6",  # spline res.     — purple
    "#EF4444",  # isolation f.    — red
    "#06B6D4",  # ECOD            — cyan
    "#84CC16",  # COPOD           — lime
    "#F59E0B",  # LOF             — amber
    "#0D9488",  # growthcurves    — teal
]
# zip() pairs by position and stops at the shorter input, so METHODS and
# METHOD_COLORS must keep the same length and order or a method ends up
# without (or with the wrong) colour.
color_map = dict(zip(METHODS, METHOD_COLORS))

print("Methods:", list(METHODS))


In [None]:
# Run every method on every curve; predictions[name] is a boolean Series with
# one entry per row of traces_df, in traces_df row order.
predictions: dict = {}

for name, fn in METHODS.items():
    parts = []
    for cid, grp in traces_df.groupby("curve_id", sort=False):
        t  = grp["t"].to_numpy()
        od = grp["od"].to_numpy()
        parts.append(pd.Series(fn(t, od).astype(bool), index=grp.index))
    # groupby yields rows curve by curve, so the concatenated order differs
    # from the file's row order whenever curves are interleaved in the CSV.
    # The sklearn metrics used downstream compare predictions with `gt` by
    # position, not by index label, so restore traces_df's row order here.
    # Rows that belong to no group (e.g. missing curve_id) count as unflagged.
    predictions[name] = pd.concat(parts).reindex(traces_df.index, fill_value=False)

# Quick sanity table: number of flagged points per method next to the
# (method-independent) number of ground-truth outliers.
print(f"{'Method':25s}  {'Flagged':>7}  {'GT outliers':>11}")
for name, pred in predictions.items():
    print(f"  {name:23s}  {int(pred.sum()):>7}  {int(gt.sum()):>11}")

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Score every method on the outlier (positive) class and rank by F1.
rows = []
g = gt.astype(int)  # ground truth as 0/1; identical for every method
for name, pred in predictions.items():
    p = pred.astype(int)
    row = {"method": name}
    for label, metric_fn in (
        ("F1", f1_score),
        ("precision", precision_score),
        ("recall", recall_score),
    ):
        # zero_division=0: a method that flags nothing scores 0, not an error.
        row[label] = round(metric_fn(g, p, zero_division=0), 3)
    row["flagged"] = int(p.sum())
    rows.append(row)

metrics_df = pd.DataFrame(rows)
metrics_df = metrics_df.sort_values("F1", ascending=False)
metrics_df = metrics_df.reset_index(drop=True)
display(metrics_df)

In [None]:
import time

N_REPS = 5   # repeat each measurement to reduce noise

# Extract every curve's arrays once: the curves do not change between
# repetitions, so re-running the groupby inside the timing loops is pure
# overhead (it was never part of the timed region).
curves = [
    (grp["t"].to_numpy(), grp["od"].to_numpy())
    for _, grp in traces_df.groupby("curve_id", sort=False)
]

# Untimed warm-up: the method functions import sklearn / pyod / growthcurves
# lazily on first call. Without this, that one-off import cost is charged to
# the first timed call of each method and inflates the reported std.
if curves:
    for fn in METHODS.values():
        fn(*curves[0])

# One row per (repetition, method, curve) with the wall-clock time in ms.
timing_rows = []
for _ in range(N_REPS):
    for name, fn in METHODS.items():
        for t, od in curves:
            t0 = time.perf_counter()
            fn(t, od)
            timing_rows.append({"method": name, "ms": (time.perf_counter() - t0) * 1e3})

# Median and standard deviation per method, fastest first.
timing_df = pd.DataFrame(timing_rows)
speed = (
    timing_df.groupby("method")["ms"]
    .agg(median_ms="median", std_ms="std")
    .reset_index()
    .sort_values("median_ms")
)

# Horizontal bar chart on a log x-axis, one bar per method in its own colour.
fig = go.Figure(go.Bar(
    x=speed["median_ms"],
    y=speed["method"],
    orientation="h",
    error_x=dict(type="data", array=speed["std_ms"].tolist(), visible=True),
    marker_color=[color_map[m] for m in speed["method"]],
    text=[f"{v:.2f} ms" for v in speed["median_ms"]],
    textposition="outside",
))
fig.update_layout(
    template="plotly_white",
    height=50 + len(METHODS) * 45,
    width=640,
    title=f"Runtime per curve — median ± std  ({N_REPS} reps × {traces_df['curve_id'].nunique()} curves)",
    xaxis=dict(title="ms", type="log"),
    yaxis_title="",
    margin=dict(t=50, b=40, l=140, r=90),
)
fig.show()

In [None]:
method_names  = list(METHODS)                   # registry order
ordered_names = metrics_df["method"].tolist()   # ranked by F1 (best first)

# ── Bar charts: F1 / Precision / Recall ──────────────────────────────────────
fig_bars = make_subplots(
    rows=1,
    cols=3,
    subplot_titles=["F1 score", "Precision", "Recall"],
    horizontal_spacing=0.10,
)
by_method = metrics_df.set_index("method")
for col, metric in enumerate(["F1", "precision", "recall"], start=1):
    vals = by_method.loc[ordered_names, metric].tolist()
    bar = go.Bar(
        x=ordered_names,
        y=vals,
        marker_color=[color_map[m] for m in ordered_names],
        showlegend=False,
        text=[f"{v:.2f}" for v in vals],
        textposition="outside",
    )
    fig_bars.add_trace(bar, row=1, col=col)
    # Head-room above 1.0 so the "outside" value labels are not clipped.
    fig_bars.update_yaxes(range=[0, 1.18], row=1, col=col)
fig_bars.update_layout(
    template="plotly_white",
    height=380,
    width=1100,
    title="Method comparison — ground truth test set",
)
fig_bars.show()

# ── Per-curve F1 heatmap ──────────────────────────────────────────────────────
# z[i, j] = F1 of method j restricted to the rows of curve i.
curve_ids = traces_df["curve_id"].unique()
z = np.full((len(curve_ids), len(method_names)), np.nan)
for i, cid in enumerate(curve_ids):
    idx = traces_df.loc[traces_df["curve_id"] == cid].index
    truth = gt.loc[idx].astype(int)
    for j, mname in enumerate(method_names):
        pred = predictions[mname]
        z[i, j] = f1_score(truth, pred.loc[idx].astype(int), zero_division=0)

heatmap = go.Heatmap(
    z=z,
    x=method_names,
    y=list(curve_ids),
    colorscale="RdYlGn",
    zmin=0,
    zmax=1,
    text=np.round(z, 2).astype(str),
    texttemplate="%{text}",
    colorbar=dict(title="F1"),
    hovertemplate="curve: %{y}<br>method: %{x}<br>F1: %{z:.3f}<extra></extra>",
)
fig_heat = go.Figure(heatmap)
fig_heat.update_layout(
    template="plotly_white",
    height=360,
    width=1100,
    title="Per-curve F1 score by method",
    xaxis_title="Method",
    yaxis_title="Curve",
    xaxis=dict(tickangle=-35),
    margin=dict(b=110),
)
fig_heat.show()

### 7 · Per-curve visual inspection

For each curve: the OD trace with **○ ground-truth markers** on top, then one labelled strip per method below (shared x-axis).

- **○ open circle** in a strip = this time point is in the ground truth  
- **■ filled square** = flagged by the method  
- Both at the same position = true positive; circle only = false negative; square only = false positive

In [None]:
# One figure per curve: the OD trace on top, then one thin strip per method
# showing where that method fired relative to the ground truth.
n_methods    = len(method_names)
row_heights  = [5] + [1] * n_methods  # data trace tall, flag strips short

for cid in traces_df["curve_id"].unique():
    # Rows, time/OD arrays and ground-truth mask of this curve only.
    grp  = traces_df[traces_df["curve_id"] == cid]
    t    = grp["t"].to_numpy()
    od   = grp["od"].to_numpy()
    gt_c = gt.loc[grp.index].to_numpy()

    fig = make_subplots(
        rows=1 + n_methods,
        cols=1,
        shared_xaxes=True,
        row_heights=row_heights,
        vertical_spacing=0.02,
    )

    # ── Row 1: OD trace + ground-truth markers ────────────────────────────────
    fig.add_trace(
        go.Scatter(
            x=t, y=od, mode="lines+markers",
            line=dict(color="#CBD5E1", width=1),
            marker=dict(size=4, color="#94A3B8"),
            showlegend=False,
        ),
        row=1, col=1,
    )
    # Ground-truth markers are only added when the curve has any outlier.
    if gt_c.any():
        fig.add_trace(
            go.Scatter(
                x=t[gt_c], y=od[gt_c], mode="markers",
                marker=dict(size=16, symbol="circle-open", color="black",
                            line=dict(width=2.5)),
                showlegend=False,
                hovertemplate="GT outlier<br>t=%{x:.0f}  OD=%{y:.4f}<extra></extra>",
            ),
            row=1, col=1,
        )

    # ── Rows 2+: one strip per method ─────────────────────────────────────────
    for j, mname in enumerate(method_names):
        row = j + 2  # subplot rows are 1-based and row 1 holds the OD trace
        p_c = predictions[mname].loc[grp.index].to_numpy()

        # Mark GT positions in every strip (open circles) so alignment is clear
        if gt_c.any():
            fig.add_trace(
                go.Scatter(
                    x=t[gt_c], y=np.ones(gt_c.sum()),
                    mode="markers",
                    marker=dict(size=12, symbol="circle-open", color="black",
                                line=dict(width=2)),
                    showlegend=False, hoverinfo="skip",
                ),
                row=row, col=1,
            )

        # Filled squares where this method fires
        if p_c.any():
            fig.add_trace(
                go.Scatter(
                    x=t[p_c], y=np.ones(p_c.sum()),
                    mode="markers",
                    marker=dict(size=10, symbol="square",
                                color=color_map[mname], opacity=0.9),
                    showlegend=False,
                    hovertemplate=f"{mname}<br>t=%{{x:.0f}}<extra></extra>",
                ),
                row=row, col=1,
            )

        # All markers sit at y=1; the fixed [0, 2] range centres them and the
        # axis title doubles as the strip's label.
        fig.update_yaxes(
            title_text=mname, showticklabels=False,
            range=[0, 2], row=row, col=1,
        )

    fig.update_yaxes(title_text="OD", row=1, col=1)
    # Only the bottom subplot carries the shared x-axis title.
    fig.update_xaxes(title_text="Time", row=1 + n_methods, col=1)
    fig.update_layout(
        title=f"<b>{cid}</b>  —  ○ = ground truth  ■ = flagged by method",
        template="plotly_white",
        height=300 + n_methods * 50,
        width=950,
        showlegend=False,
        margin=dict(t=45, b=35, l=140, r=15),
    )
    fig.show()

### 8 · Anomaly scores per method

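Each method row shows that method's raw anomaly score across time; the score units differ between methods.  
**Red dotted line** = the method's own threshold (fold-over-median, MAD z-score or fixed value — see `THRESH` in the cell below). **○** = ground truth outlier position.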

In [None]:
# Threshold k and type per method
# "fold"  → thresh = k × median(scores)         (sparse/heavy-tailed)
# "mad"   → thresh = median + k × 1.4826 × MAD  (shift-distributed, approx Gaussian)
# "fixed" → thresh = k                           (scores already in interpretable units)
#
# The k values repeat the default k / factor arguments of the corresponding
# m_* detection functions; they are duplicated here by hand, so a change to
# one side must be mirrored on the other for the plots to match the flags.
THRESH = {
    "z-score Δod":      ("fold",  8.0),
    "IQR Δod":          ("fold",  8.0),
    "Hampel":           ("fixed", 3.0),  # scores are per-window MAD z-scores
    "spline residuals": ("mad",   3.5),  # residuals are approx Gaussian
    "isolation forest": ("fold",  10.0),
    "ECOD":             ("mad",   3.5),
    "COPOD":            ("mad",   3.5),
    "LOF":              ("mad",   3.5),
    "growthcurves":     ("fixed", 1.5),
}
# Percentile used as the noise floor for "fold" thresholds; 50 (the median)
# matches the default noise_pct of _fold_flag.
NOISE_PCT = 50


def _hex_to_rgba(hex_color, alpha=0.12):
    """Convert a six-digit hex colour (leading ``#`` optional) to an ``rgba(...)`` string.

    The three channel values are emitted as decimal integers followed by
    *alpha* exactly as passed in.
    """
    digits = hex_color.lstrip("#")
    red, green, blue = (int(digits[pos:pos + 2], 16) for pos in (0, 2, 4))
    return f"rgba({red},{green},{blue},{alpha})"


def _thresh_value(scores, thresh_type, k):
    """Return the score level at which the threshold line is drawn.

    * ``"fixed"`` → *k* itself.
    * ``"fold"``  → *k* times the ``NOISE_PCT``-th percentile of *scores*.
    * anything else (``"mad"``) → ``median + k * 1.4826 * MAD`` of *scores*.

    Unlike ``_fold_flag`` and ``_mad_z``, a near-zero noise floor or MAD is
    not special-cased here, so in those degenerate cases the returned level
    does not reflect the "flag nothing" behaviour of the detectors.
    """
    if thresh_type == "fixed":
        return k
    if thresh_type == "fold":
        return k * np.percentile(scores, NOISE_PCT)
    # Remaining case: "mad".
    centre = np.median(scores)
    spread = np.median(np.abs(scores - centre))
    return centre + k * 1.4826 * spread


def _sc_diff(t, od):
    """Score = absolute first difference of *od* (first entry is 0). *t* is unused."""
    step = np.diff(od, prepend=od[0])
    return np.abs(step)

def _sc_iqr_diff(t, od):
    """Score = |Δod − median(Δod)|, with Δod padded so its first entry is 0. *t* is unused."""
    step = np.diff(od, prepend=od[0])
    centre = np.median(step)
    return np.abs(step - centre)

def _sc_hampel(t, od, window=15):
    """Per-point Hampel score: |od − window median| / (1.4826 × window MAD).

    The window spans up to ``window // 2`` neighbours on each side and is
    truncated at the array ends. Points whose window MAD is not above
    ``1e-12`` keep a score of 0. *t* is not used.
    """
    n = len(od)
    half = window // 2
    raw = np.zeros(n)
    for i, value in enumerate(od):
        lo = max(0, i - half)
        hi = min(n, i + half + 1)
        neighbours = od[lo:hi]
        centre = np.median(neighbours)
        spread = np.median(np.abs(neighbours - centre))
        if spread > 1e-12:
            raw[i] = abs(value - centre) / (1.4826 * spread)
    return raw

def _sc_spline(t, od):
    """Score = absolute residual of *od* against the exponentiated fitted spline.

    Uses ``growthcurves``' ``fit_spline`` (``smooth="fast"``); returns all
    zeros when the fit returns ``None``.
    """
    from growthcurves.non_parametric import fit_spline
    from growthcurves.models import spline_from_params

    fit = fit_spline(t, od, smooth="fast")
    if fit is None:
        return np.zeros(len(t))

    spline = spline_from_params(fit["params"])
    fitted_od = np.exp(spline(t))
    return np.abs(od - fitted_od)

def _sc_iforest(t, od):
    """Score = negated isolation-forest ``score_samples`` on (od, Δod), shifted to start at 0.

    The forest uses a fixed ``random_state``. *t* is not used.
    """
    from sklearn.ensemble import IsolationForest

    features = np.column_stack([od, np.diff(od, prepend=od[0])])
    forest = IsolationForest(n_estimators=200, contamination="auto", random_state=42)
    forest.fit(features)
    anomaly = -forest.score_samples(features)
    return anomaly - anomaly.min()

def _sc_ecod(t, od, window=15):
    """ECOD ``decision_scores_`` on |rolling-mean residual|, od and Δod.

    The residual is ``od[i]`` minus the mean of a centred window of up to
    ``window // 2`` neighbours per side (truncated at the ends). *t* is unused.
    """
    from pyod.models.ecod import ECOD

    n = len(od)
    half = window // 2
    residual = np.zeros(n)
    for i in range(n):
        lo, hi = max(0, i - half), min(n, i + half + 1)
        residual[i] = od[i] - od[lo:hi].mean()

    step = np.diff(od, prepend=od[0])
    features = np.column_stack([np.abs(residual), od, step])
    detector = ECOD()
    detector.fit(features)
    return detector.decision_scores_

def _sc_copod(t, od, window=15):
    """COPOD ``decision_scores_`` on |rolling-mean residual|, od and Δod.

    The residual is ``od[i]`` minus the mean of a centred window of up to
    ``window // 2`` neighbours per side (truncated at the ends). *t* is unused.
    """
    from pyod.models.copod import COPOD

    n = len(od)
    half = window // 2
    residual = np.zeros(n)
    for i in range(n):
        lo, hi = max(0, i - half), min(n, i + half + 1)
        residual[i] = od[i] - od[lo:hi].mean()

    step = np.diff(od, prepend=od[0])
    features = np.column_stack([np.abs(residual), od, step])
    detector = COPOD()
    detector.fit(features)
    return detector.decision_scores_

def _sc_lof(t, od):
    """LOF ``decision_scores_`` (pyod defaults) on the features (od, Δod). *t* is unused."""
    from pyod.models.lof import LOF

    step = np.diff(od, prepend=od[0])
    features = np.column_stack([od, step])
    detector = LOF()
    detector.fit(features)
    return detector.decision_scores_

def _sc_growthcurves(t, od, window_size=15):
    """Distance of each point outside its window's [Q1, Q3] range, in IQR units.

    Quartiles are computed (NaN-ignoring) over a centred window of up to
    ``window_size // 2`` neighbours per side, truncated at the array ends.
    Points inside the interquartile range, or whose window IQR is not above
    ``1e-12``, score 0. *t* is not used.
    """
    n = len(od)
    half = window_size // 2
    raw = np.zeros(n)
    for i, value in enumerate(od):
        window = od[max(0, i - half): min(n, i + half + 1)]
        lower_q = np.nanquantile(window, 0.25)
        upper_q = np.nanquantile(window, 0.75)
        spread = upper_q - lower_q
        if spread > 1e-12:
            excess = max(value - upper_q, lower_q - value, 0.0)
            raw[i] = excess / spread
    return raw

# Continuous-score counterpart of each detection method, keyed by the same
# display names as THRESH and color_map (both are indexed by these keys in
# the plotting loop). Each scorer is called as scorer(t, od) and returns one
# score per point.
SCORERS = {
    "z-score Δod":      _sc_diff,
    "IQR Δod":          _sc_iqr_diff,
    "Hampel":           _sc_hampel,
    "spline residuals": _sc_spline,
    "isolation forest": _sc_iforest,
    "ECOD":             _sc_ecod,
    "COPOD":            _sc_copod,
    "LOF":              _sc_lof,
    "growthcurves":     _sc_growthcurves,
}

# ── Plot ─────────────────────────────────────────────────────────────────────
# One figure per curve: the OD trace on top, then one row per scorer showing
# its score over time together with that method's threshold line.
n_sc        = len(SCORERS)
row_heights = [3] + [2] * n_sc

for cid in traces_df["curve_id"].unique():
    # Rows, time/OD arrays and ground-truth mask of this curve only.
    grp  = traces_df[traces_df["curve_id"] == cid]
    t    = grp["t"].to_numpy()
    od   = grp["od"].to_numpy()
    gt_c = gt.loc[grp.index].to_numpy()

    fig = make_subplots(
        rows=1 + n_sc, cols=1,
        shared_xaxes=True,
        row_heights=row_heights,
        vertical_spacing=0.02,
    )

    # Row 1: raw OD trace, with ground-truth outliers circled when present.
    fig.add_trace(go.Scatter(
        x=t, y=od, mode="lines+markers",
        line=dict(color="#CBD5E1", width=1),
        marker=dict(size=4, color="#94A3B8"),
        showlegend=False,
    ), row=1, col=1)
    if gt_c.any():
        fig.add_trace(go.Scatter(
            x=t[gt_c], y=od[gt_c], mode="markers",
            marker=dict(size=16, symbol="circle-open", color="black", line=dict(width=2.5)),
            showlegend=False,
        ), row=1, col=1)

    # Rows 2+: one score trace per method.
    for j, (mname, scorer) in enumerate(SCORERS.items()):
        row    = j + 2  # subplot rows are 1-based and row 1 holds the OD trace
        # Scores are recomputed here by the scorer; the boolean predictions
        # from the evaluation cells are not reused.
        scores = scorer(t, od)
        color  = color_map[mname]
        t_type, k = THRESH[mname]
        thresh = _thresh_value(scores, t_type, k)

        fig.add_trace(go.Scatter(
            x=t, y=scores, mode="lines",
            line=dict(color=color, width=1.5),
            fill="tozeroy", fillcolor=_hex_to_rgba(color),
            showlegend=False,
            hovertemplate=f"{mname}  t=%{{x:.0f}}  score=%{{y:.3g}}"
                          f"  (thresh={thresh:.3g})<extra></extra>",
        ), row=row, col=1)

        # Threshold line for this method.
        # NOTE(review): the line style is dash="dot" while the figure title
        # below describes it as "red dash".
        fig.add_hline(y=thresh, line=dict(color="red", width=1, dash="dot"),
                      row=row, col=1)

        # Ground-truth positions drawn at their score value, so it is visible
        # whether each true outlier sits above or below the threshold.
        if gt_c.any():
            fig.add_trace(go.Scatter(
                x=t[gt_c], y=scores[gt_c], mode="markers",
                marker=dict(size=10, symbol="circle-open", color="black", line=dict(width=2)),
                showlegend=False, hoverinfo="skip",
            ), row=row, col=1)

        # Axis title carries the method name and its threshold type.
        type_label = {"fold": "fold", "mad": "MAD z", "fixed": "fixed"}[t_type]
        fig.update_yaxes(title_text=f"{mname} ({type_label})", row=row, col=1,
                         title_standoff=4, rangemode="tozero")

    fig.update_yaxes(title_text="OD", row=1, col=1)
    # Only the bottom subplot carries the shared x-axis title.
    fig.update_xaxes(title_text="Time", row=1 + n_sc, col=1)
    fig.update_layout(
        title=f"<b>{cid}</b>  —  raw anomaly scores  "
              f"(red dash = threshold  |  ○ = ground truth)",
        template="plotly_white",
        height=240 + n_sc * 90,
        width=950,
        showlegend=False,
        margin=dict(t=45, b=35, l=170, r=15),
    )
    fig.show()
