# Outlier Detection — Ground Truth Dataset Builder

Constructs a labelled test dataset for benchmarking outlier detection methods.

**Strategy:** synthetic outliers with known locations are injected into real growth curves. Any genuine artifacts already present in the raw data can be annotated in addition. The combined set becomes the ground truth used to evaluate detection algorithms (precision, recall, F1).

**Outputs:**
- `test_traces.csv` — the curve data as seen by a detection algorithm (`curve_id, idx, t, od`)
- `test_labels.csv` — ground truth labels (`curve_id, idx, is_outlier`)

In [1]:
from pathlib import Path
import copy
import json

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from IPython.display import display

# Optional dependency probe: interactive click-labelling needs both plotly's
# FigureWidget and ipywidgets. HAS_WIDGET records whether both imports worked;
# the click-labelling cell checks it before building any widgets.
try:
    from plotly.graph_objects import FigureWidget
    import ipywidgets  # noqa: F401
    HAS_WIDGET = True
except ImportError:
    # Either import failing lands here; the notebook stays usable through the
    # manual (dict-based) labelling cell.
    HAS_WIDGET = False
    print("ipywidgets not available — interactive click-labeling disabled.")
    print("Use the fallback dict-based labeling cell instead.")

print(f"plotly FigureWidget available: {HAS_WIDGET}")

plotly FigureWidget available: True


In [None]:
# ── Configure paths ───────────────────────────────────────────────────────────
# Input workbooks: the example data of the TheGrowthAnalysisApp repository.
# Edit these two paths if the files are stored somewhere else on your machine.
EXAMPLE_DATA_PATH = Path("/Users/sambra/Documents/GitHub/TheGrowthAnalysisApp/example_data/example_data.xlsx")
FREDDY_DATA_PATH  = Path("/Users/sambra/Documents/GitHub/TheGrowthAnalysisApp/example_data/freddy_data.xlsx")

# Output CSVs. The paths are relative, so they resolve against the current
# working directory (normally the folder this notebook is run from).
TRACES_FILE = Path("test_traces.csv")   # curve data  (curve_id, idx, t, od)
LABELS_FILE = Path("test_labels.csv")   # bool labels (curve_id, idx, is_outlier)

# Report up front whether each input workbook can be found.
for data_path in (EXAMPLE_DATA_PATH, FREDDY_DATA_PATH):
    if data_path.exists():
        status = "✓"
    else:
        status = "✗  NOT FOUND — update path above"
    print(f"  {status}  {data_path.name}")

# Show the absolute locations the two output files will be written to.
traces_abs = TRACES_FILE.resolve()
labels_abs = LABELS_FILE.resolve()
print(f"\n  Traces → {traces_abs}")
print(f"  Labels → {labels_abs}")

## 1 · Load curves

In [3]:
def _parse_od(val) -> float:
    """Convert one raw OD cell to a float, or NaN if it cannot be parsed.

    Strings may use a European decimal comma (e.g. ``'0,0496'``); commas are
    replaced by dots before conversion. Anything ``float()`` rejects — ``None``,
    an empty string, a non-numeric text marker — yields ``np.nan`` instead of
    raising, so one bad cell cannot abort loading a whole well.
    """
    if isinstance(val, str):
        val = val.replace(",", ".")
    # The conversion sits inside the try for strings as well: previously an
    # unparseable string raised ValueError while other bad values became NaN.
    try:
        return float(val)
    except (TypeError, ValueError):
        return np.nan


def load_well(df: pd.DataFrame, well: str) -> tuple:
    """Extract one well's curve from a plate-reader table.

    The ``"Time"`` column is read as floats and every cell of column *well*
    is converted with ``_parse_od``.

    Returns:
        ``(time_array, od_array)`` as two float NumPy arrays.
    """
    time_values = df["Time"].to_numpy(dtype=float)
    parsed_od = list(map(_parse_od, df[well]))
    return time_values, np.array(parsed_od, dtype=float)


# example_data.xlsx — wells E1, G1, H1, H2, H3
df_example = pd.read_excel(EXAMPLE_DATA_PATH, sheet_name="Sheet1")
EXAMPLE_WELLS = ["E1", "G1", "H1", "H2", "H3"]

# freddy_data.xlsx — wells A8, C9, C3
# (the requested list was "a8, c9, a8, c3"; the repeated a8 is treated as a typo)
df_freddy = pd.read_excel(FREDDY_DATA_PATH, sheet_name="GP_20210930_141313_MTP01_OD")
FREDDY_WELLS = ["A8", "C9", "C3"]

# Collect every selected well under the key "<prefix>_<well>", example wells first.
curves: dict = {}
for prefix, source, frame, wells in (
    ("example", "example_data", df_example, EXAMPLE_WELLS),
    ("freddy", "freddy_data", df_freddy, FREDDY_WELLS),
):
    for well in wells:
        t_vals, od_vals = load_well(frame, well)
        curves[f"{prefix}_{well}"] = {
            "t": t_vals,
            "od": od_vals,
            "source": source,
            "well": well,
        }

# One summary line per curve: point count, finite OD range, time range.
print(f"Loaded {len(curves)} curves:\n")
for cid, curve in curves.items():
    t_vals, od_vals = curve["t"], curve["od"]
    od_finite = od_vals[np.isfinite(od_vals)]  # NaN cells are left out of min/max
    od_lo, od_hi = od_finite.min(), od_finite.max()
    print(
        f"  {cid:20s}  {len(t_vals):3d} pts  "
        f"OD [{od_lo:.4f} – {od_hi:.4f}]  "
        f"t [{t_vals.min():.0f} – {t_vals.max():.0f}]"
    )

Loaded 8 curves:

  example_E1            694 pts  OD [0.0483 – 0.4676]  t [0 – 8316]
  example_G1            694 pts  OD [0.0478 – 0.2993]  t [0 – 8316]
  example_H1            694 pts  OD [0.0470 – 0.3542]  t [0 – 8316]
  example_H2            694 pts  OD [0.0476 – 0.0662]  t [0 – 8316]
  example_H3            694 pts  OD [0.0476 – 0.1851]  t [0 – 8316]
  freddy_A8             286 pts  OD [0.0000 – 7.6210]  t [0 – 8548]
  freddy_C9             286 pts  OD [0.0150 – 8.9080]  t [0 – 8548]
  freddy_C3             286 pts  OD [0.0010 – 10.3130]  t [0 – 8548]


## 2 · Inject synthetic outliers

Two types of outlier are added per curve (counts specified in `OUTLIER_PLAN`):
- **Spike** — a single measurement raised by 2–4× the curve's OD range (its 5th–95th percentile spread) above its original value
- **Dip** — a single measurement lowered by 1–2× that same OD range below its original value (the result can be negative)

The random seed is fixed so results are reproducible.

In [4]:
# Seeded generator shared by all injections in this cell.
RNG = np.random.default_rng(42)

# Keep an untouched copy of every OD trace before anything is modified
curves_original: dict = {key: entry["od"].copy() for key, entry in curves.items()}

# Outlier counts per curve, written as (curve_id, spikes, dips) rows and
# expanded into {curve_id: {spikes, dips}}
OUTLIER_PLAN = {
    curve_id: {"spikes": n_spikes, "dips": n_dips}
    for curve_id, n_spikes, n_dips in (
        ("example_E1", 2, 1),
        ("example_G1", 1, 2),
        ("example_H1", 2, 1),
        ("example_H2", 1, 1),
        ("example_H3", 3, 0),
        ("freddy_A8", 2, 1),
        ("freddy_C9", 1, 2),
        ("freddy_C3", 2, 1),
    )
}


def inject_outliers(
    od: np.ndarray,
    n_spikes: int,
    n_dips: int,
    rng: np.random.Generator,
) -> tuple:
    """Return (modified_od, bool_mask_of_outliers).

    Picks distinct positions at random and perturbs them: each spike adds
    2–4× the OD range to the value, each dip subtracts 1–2× the OD range
    (so a dipped value may end up below zero). The OD range is the
    5th–95th percentile spread of the finite values; if that spread is
    below 1e-6 it falls back to ``max(std, 0.05)``.

    Only finite positions are eligible, so every ``True`` in the mask marks
    a real, finite perturbed value. If fewer finite positions exist than
    outliers requested, injection stops when they run out. Input with no
    finite values (including an empty array) is returned unchanged with an
    all-False mask.

    Args:
        od: 1-D sequence of OD values; it is never modified.
        n_spikes: number of upward outliers to inject.
        n_dips: number of downward outliers to inject.
        rng: generator supplying all random draws.

    Returns:
        A float copy of *od* with the outliers applied, and a boolean array
        of the same length that is True at the perturbed positions.
    """
    # Always a fresh float array: protects the caller's data and prevents
    # integer input from truncating the added offsets.
    od = np.array(od, dtype=float)
    mask = np.zeros(len(od), dtype=bool)

    finite = np.isfinite(od)
    if not finite.any():
        return od, mask

    od_finite = od[finite]
    od_range = float(np.percentile(od_finite, 95) - np.percentile(od_finite, 5))
    if od_range < 1e-6:
        # Flat curve: use a floor so injected points still stand out.
        od_range = max(float(np.std(od_finite)), 0.05)

    # Candidate positions. For all-finite input this equals list(range(n)),
    # so the sequence of random draws matches earlier runs.
    available = np.flatnonzero(finite).tolist()

    def _perturb(count: int, low: float, high: float, sign: float) -> None:
        # Draw position first, then magnitude, for each requested outlier.
        for _ in range(count):
            if not available:
                return
            idx = int(rng.choice(available))
            od[idx] += sign * rng.uniform(low, high) * od_range
            mask[idx] = True
            available.remove(idx)  # a position is used at most once

    _perturb(n_spikes, 2.0, 4.0, 1.0)
    _perturb(n_dips, 1.0, 2.0, -1.0)

    return od, mask


# Work on a deep copy so `curves` keeps the unmodified data.
curves_perturbed = copy.deepcopy(curves)
true_outlier_mask: dict = {}

# Walk OUTLIER_PLAN in its own order so the sequence of RNG draws (and thus
# the generated dataset) is unchanged for a given plan.
for cid, plan in OUTLIER_PLAN.items():
    if cid not in curves_perturbed:
        # A plan entry without a loaded curve is reported rather than
        # aborting the cell with a KeyError.
        print(f"  Warning: OUTLIER_PLAN entry '{cid}' has no matching curve — skipped")
        continue
    od_new, mask = inject_outliers(
        curves_perturbed[cid]["od"],
        plan.get("spikes", 0),
        plan.get("dips", 0),
        RNG,
    )
    curves_perturbed[cid]["od"] = od_new
    true_outlier_mask[cid] = mask

# Curves that have no plan entry receive an all-False mask, so later cells can
# look up true_outlier_mask[cid] for every curve in curves_perturbed.
for cid, data in curves_perturbed.items():
    if cid not in true_outlier_mask:
        true_outlier_mask[cid] = np.zeros(len(data["od"]), dtype=bool)

print("Synthetic outliers injected:\n")
for cid, mask in true_outlier_mask.items():
    t_arr = curves_perturbed[cid]["t"]
    print(
        f"  {cid:20s}  {mask.sum()} outlier(s)  "
        f"at t = {t_arr[mask].tolist()}"
    )

Synthetic outliers injected:

  example_E1            3 outlier(s)  at t = [708.0, 732.0, 6444.0]
  example_G1            3 outlier(s)  at t = [5796.0, 6120.0, 6336.0]
  example_H1            3 outlier(s)  at t = [3744.0, 6492.0, 6984.0]
  example_H2            2 outlier(s)  at t = [3744.0, 5352.0]
  example_H3            3 outlier(s)  at t = [1884.0, 6876.0, 7140.0]
  freddy_A8             3 outlier(s)  at t = [3028.0, 5788.0, 5998.0]
  freddy_C9             3 outlier(s)  at t = [359.0, 4228.0, 6658.0]
  freddy_C3             3 outlier(s)  at t = [6358.0, 7738.0, 7888.0]


## 3 · Visual overview

Inspect all curves for any **genuine artifacts** in the raw data that should also be included in the ground truth. Hover over suspicious points to read their index.

In [5]:
# Grid of small plots, one per perturbed curve, four per row.
curve_ids = list(curves_perturbed)
ncols = 4
nrows = -(-len(curve_ids) // ncols)  # ceiling division

fig_overview = make_subplots(
    rows=nrows,
    cols=ncols,
    subplot_titles=curve_ids,
    vertical_spacing=0.10,
    horizontal_spacing=0.06,
)

for position, cid in enumerate(curve_ids):
    # Subplot coordinates are 1-based, filled row by row.
    row = position // ncols + 1
    col = position % ncols + 1
    curve = curves_perturbed[cid]

    trace = go.Scatter(
        x=curve["t"],
        y=curve["od"],
        mode="markers+lines",
        marker={"size": 5, "color": "#3B82F6"},
        line={"color": "#93C5FD", "width": 1},
        showlegend=False,
        # pointNumber is the point's position in the trace, i.e. the idx to note down
        hovertemplate="idx=%{pointNumber}  t=%{x:.0f}  OD=%{y:.4f}<extra></extra>",
    )
    fig_overview.add_trace(trace, row=row, col=col)
    fig_overview.update_xaxes(title_text="Time", row=row, col=col)
    fig_overview.update_yaxes(title_text="OD", row=row, col=col)

fig_overview.update_layout(
    height=280 * nrows,
    width=1200,
    template="plotly_white",
    title="All curves with injected outliers — hover for index",
)
fig_overview.show()

## 4 · Annotate additional artifacts (optional)

The **injected outliers are already ground truth** — you do not need to find them.

Use this step only if the visual overview reveals genuine artifacts in the raw data (e.g. a real measurement spike that was present before injection). Marking those points ensures the ground truth reflects the actual data quality, not just the synthetic perturbations.

**Skip this section entirely if the raw data looks clean.**

### Option A — Interactive click labeling

Run the cell below. Click any point to toggle it into the ground truth (orange).  
Click again to un-toggle.

> Requires `ipywidgets` and the plotly widget extension.  
> If figures appear static, use **Option B** instead.

In [6]:
# Stores the labels the user assigns (updated live by the click handler).
# One all-False boolean array per curve, so re-running this cell clears
# every label assigned so far.
user_labels: dict = {
    cid: np.zeros(len(data["t"]), dtype=bool)
    for cid, data in curves_perturbed.items()
}

# Marker colours double as the toggle state read by the click handler.
COL_NORMAL  = "#3B82F6"  # blue
COL_OUTLIER = "#F97316"  # orange

if not HAS_WIDGET:
    print("FigureWidget not available — run Option B below instead.")
else:
    # One clickable figure per curve, showing the perturbed data
    # (injected outliers included).
    for cid, data in curves_perturbed.items():
        t  = data["t"]
        od = data["od"]
        n  = len(t)

        scatter = go.Scatter(
            x=t.tolist(),
            y=od.tolist(),
            mode="markers+lines",
            marker=dict(
                # Per-point colour list, all starting as "normal".
                color=[COL_NORMAL] * n,
                size=10,
                line=dict(color="white", width=1),
            ),
            line=dict(color="#CBD5E1", width=1),
            # Hover text carries the point index used by the manual fallback.
            text=[
                f"idx {i}\u2002 t={t[i]:.1f}\u2002 OD={od[i]:.4f}"
                for i in range(n)
            ],
            hovertemplate="%{text}<extra></extra>",
        )

        fw = FigureWidget([scatter])
        fw.update_layout(
            title=dict(
                text=f"<b>{cid}</b> — click to toggle outlier label",
                font=dict(size=12),
            ),
            template="plotly_white",
            height=260,
            width=820,
            margin=dict(t=38, b=28, l=55, r=15),
            showlegend=False,
            xaxis_title="Time",
            yaxis_title="OD",
        )

        # Factory: passing fw and cid as arguments gives each handler its own
        # figure and curve id instead of whatever the loop variables hold later.
        def _make_handler(fw_ref, cid_ref):
            """Build the click callback for one figure / curve."""
            def _handler(trace, points, selector):
                """Flip colour and label of every clicked point."""
                with fw_ref.batch_update():
                    colors = list(trace.marker.color)
                    for i in points.point_inds:
                        # Current colour decides the direction of the toggle.
                        if colors[i] == COL_NORMAL:
                            colors[i] = COL_OUTLIER
                            user_labels[cid_ref][i] = True
                        else:
                            colors[i] = COL_NORMAL
                            user_labels[cid_ref][i] = False
                    # Assign the whole list back to update the figure.
                    trace.marker.color = colors
            return _handler

        fw.data[0].on_click(_make_handler(fw, cid))
        display(fw)

    print("\nBlue = normal  |  Orange = outlier  |  Click to toggle")

FigureWidget({
    'data': [{'hovertemplate': '%{text}<extra></extra>',
              'line': {'color': '#CBD5E1', 'width': 1},
              'marker': {'color': [#3B82F6, #3B82F6, #3B82F6, ..., #3B82F6,
                                   #3B82F6, #3B82F6],
                         'line': {'color': 'white', 'width': 1},
                         'size': 10},
              'mode': 'markers+lines',
              'text': [idx 0  t=0.0  OD=0.0494, idx 1  t=12.0  OD=0.0682, idx 2 
                       t=24.0  OD=0.1051, ..., idx 691  t=8292.0  OD=0.4613, idx
                       692  t=8304.0  OD=0.4614, idx 693  t=8316.0  OD=0.4609],
              'type': 'scatter',
              'uid': 'ccc68ebf-fb60-4cbf-bf96-ead5f3819d4a',
              'x': [0.0, 12.0, 24.0, ..., 8292.0, 8304.0, 8316.0],
              'y': [0.0494, 0.0682, 0.1051, ..., 0.4613, 0.4614, 0.4609]}],
    'layout': {'height': 260,
               'margin': {'b': 28, 'l': 55, 'r': 15, 't': 38},
               'showlegend':

FigureWidget({
    'data': [{'hovertemplate': '%{text}<extra></extra>',
              'line': {'color': '#CBD5E1', 'width': 1},
              'marker': {'color': [#3B82F6, #3B82F6, #3B82F6, ..., #3B82F6,
                                   #3B82F6, #3B82F6],
                         'line': {'color': 'white', 'width': 1},
                         'size': 10},
              'mode': 'markers+lines',
              'text': [idx 0  t=0.0  OD=0.0500, idx 1  t=12.0  OD=0.0734, idx 2 
                       t=24.0  OD=0.1265, ..., idx 691  t=8292.0  OD=0.0480, idx
                       692  t=8304.0  OD=0.0481, idx 693  t=8316.0  OD=0.0480],
              'type': 'scatter',
              'uid': 'a639a118-c375-40bc-9e80-03c49c9e16b8',
              'x': [0.0, 12.0, 24.0, ..., 8292.0, 8304.0, 8316.0],
              'y': [0.05, 0.0734, 0.1265, ..., 0.048, 0.0481, 0.048]}],
    'layout': {'height': 260,
               'margin': {'b': 28, 'l': 55, 'r': 15, 't': 38},
               'showlegend': Fal

FigureWidget({
    'data': [{'hovertemplate': '%{text}<extra></extra>',
              'line': {'color': '#CBD5E1', 'width': 1},
              'marker': {'color': [#3B82F6, #3B82F6, #3B82F6, ..., #3B82F6,
                                   #3B82F6, #3B82F6],
                         'line': {'color': 'white', 'width': 1},
                         'size': 10},
              'mode': 'markers+lines',
              'text': [idx 0  t=0.0  OD=0.0644, idx 1  t=12.0  OD=0.1498, idx 2 
                       t=24.0  OD=0.2485, ..., idx 691  t=8292.0  OD=0.0473, idx
                       692  t=8304.0  OD=0.0474, idx 693  t=8316.0  OD=0.0474],
              'type': 'scatter',
              'uid': '02c622cf-db04-43cf-aa80-f8a80e165b72',
              'x': [0.0, 12.0, 24.0, ..., 8292.0, 8304.0, 8316.0],
              'y': [0.0644, 0.1498, 0.2485, ..., 0.0473, 0.0474, 0.0474]}],
    'layout': {'height': 260,
               'margin': {'b': 28, 'l': 55, 'r': 15, 't': 38},
               'showlegend':

FigureWidget({
    'data': [{'hovertemplate': '%{text}<extra></extra>',
              'line': {'color': '#CBD5E1', 'width': 1},
              'marker': {'color': [#3B82F6, #3B82F6, #3B82F6, ..., #3B82F6,
                                   #3B82F6, #3B82F6],
                         'line': {'color': 'white', 'width': 1},
                         'size': 10},
              'mode': 'markers+lines',
              'text': [idx 0  t=0.0  OD=0.0501, idx 1  t=12.0  OD=0.0494, idx 2 
                       t=24.0  OD=0.0493, ..., idx 691  t=8292.0  OD=0.0478, idx
                       692  t=8304.0  OD=0.0480, idx 693  t=8316.0  OD=0.0478],
              'type': 'scatter',
              'uid': '38a091e0-b049-4dfc-9ae9-9f736e005489',
              'x': [0.0, 12.0, 24.0, ..., 8292.0, 8304.0, 8316.0],
              'y': [0.0501, 0.0494, 0.0493, ..., 0.0478, 0.048, 0.0478]}],
    'layout': {'height': 260,
               'margin': {'b': 28, 'l': 55, 'r': 15, 't': 38},
               'showlegend': 

FigureWidget({
    'data': [{'hovertemplate': '%{text}<extra></extra>',
              'line': {'color': '#CBD5E1', 'width': 1},
              'marker': {'color': [#3B82F6, #3B82F6, #3B82F6, ..., #3B82F6,
                                   #3B82F6, #3B82F6],
                         'line': {'color': 'white', 'width': 1},
                         'size': 10},
              'mode': 'markers+lines',
              'text': [idx 0  t=0.0  OD=0.1851, idx 1  t=12.0  OD=0.0495, idx 2 
                       t=24.0  OD=0.0493, ..., idx 691  t=8292.0  OD=0.0478, idx
                       692  t=8304.0  OD=0.0479, idx 693  t=8316.0  OD=0.0477],
              'type': 'scatter',
              'uid': '1bd4c39f-8b88-47dd-b4a6-1d3c290ecee8',
              'x': [0.0, 12.0, 24.0, ..., 8292.0, 8304.0, 8316.0],
              'y': [0.1851, 0.0495, 0.0493, ..., 0.0478, 0.0479, 0.0477]}],
    'layout': {'height': 260,
               'margin': {'b': 28, 'l': 55, 'r': 15, 't': 38},
               'showlegend':

FigureWidget({
    'data': [{'hovertemplate': '%{text}<extra></extra>',
              'line': {'color': '#CBD5E1', 'width': 1},
              'marker': {'color': [#3B82F6, #3B82F6, #3B82F6, ..., #3B82F6,
                                   #3B82F6, #3B82F6],
                         'line': {'color': 'white', 'width': 1},
                         'size': 10},
              'mode': 'markers+lines',
              'text': [idx 0  t=0.0  OD=0.0010, idx 1  t=29.0  OD=0.0000, idx 2 
                       t=59.0  OD=0.0000, ..., idx 283  t=8488.0  OD=6.9440, idx
                       284  t=8518.0  OD=6.9490, idx 285  t=8548.0  OD=6.9470],
              'type': 'scatter',
              'uid': 'da067339-9778-4b8d-9b87-861d5a17509f',
              'x': [0.0, 29.0, 59.0, ..., 8488.0, 8518.0, 8548.0],
              'y': [0.001, 0.0, 0.0, ..., 6.944, 6.949, 6.947]}],
    'layout': {'height': 260,
               'margin': {'b': 28, 'l': 55, 'r': 15, 't': 38},
               'showlegend': False,
  

FigureWidget({
    'data': [{'hovertemplate': '%{text}<extra></extra>',
              'line': {'color': '#CBD5E1', 'width': 1},
              'marker': {'color': [#3B82F6, #3B82F6, #3B82F6, ..., #3B82F6,
                                   #3B82F6, #3B82F6],
                         'line': {'color': 'white', 'width': 1},
                         'size': 10},
              'mode': 'markers+lines',
              'text': [idx 0  t=0.0  OD=0.0150, idx 1  t=29.0  OD=0.0510, idx 2 
                       t=59.0  OD=0.0620, ..., idx 283  t=8488.0  OD=7.7630, idx
                       284  t=8518.0  OD=7.7700, idx 285  t=8548.0  OD=7.7710],
              'type': 'scatter',
              'uid': '53da4875-1d59-4f8f-aac8-77a3fe117a9e',
              'x': [0.0, 29.0, 59.0, ..., 8488.0, 8518.0, 8548.0],
              'y': [0.015, 0.051, 0.062, ..., 7.763, 7.77, 7.771]}],
    'layout': {'height': 260,
               'margin': {'b': 28, 'l': 55, 'r': 15, 't': 38},
               'showlegend': False,

FigureWidget({
    'data': [{'hovertemplate': '%{text}<extra></extra>',
              'line': {'color': '#CBD5E1', 'width': 1},
              'marker': {'color': [#3B82F6, #3B82F6, #3B82F6, ..., #3B82F6,
                                   #3B82F6, #3B82F6],
                         'line': {'color': 'white', 'width': 1},
                         'size': 10},
              'mode': 'markers+lines',
              'text': [idx 0  t=0.0  OD=0.0010, idx 1  t=29.0  OD=0.0180, idx 2 
                       t=59.0  OD=0.0250, ..., idx 283  t=8488.0  OD=8.7030, idx
                       284  t=8518.0  OD=8.7870, idx 285  t=8548.0  OD=8.7700],
              'type': 'scatter',
              'uid': '1822a5ca-a179-48fc-b38d-7c7fb6301bf6',
              'x': [0.0, 29.0, 59.0, ..., 8488.0, 8518.0, 8548.0],
              'y': [0.001, 0.018, 0.025, ..., 8.703, 8.787, 8.77]}],
    'layout': {'height': 260,
               'margin': {'b': 28, 'l': 55, 'r': 15, 't': 38},
               'showlegend': False,


Blue = normal  |  Orange = outlier  |  Click to toggle


### Option B — Text-based fallback

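Note the index of any genuine artifact from the hover tooltip in the overview plot, fill in the lists below, and run the cell. Running this cell resets the labels of every listed curve to exactly the indices given, so any labels set by clicking in Option A are discarded — skip this cell if you labelled with Option A.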

In [7]:
# ── Fill in outlier indices manually, then run this cell ─────────────────────
# An empty list ([]) means "no extra outliers in this curve".
# Use the idx values shown in the hover tooltip of the overview plot above.

manual_outlier_indices = {
    "example_E1": [],   # e.g. [5, 23, 67]
    "example_G1": [],
    "example_H1": [],
    "example_H2": [],
    "example_H3": [],
    "freddy_A8":  [],
    "freddy_C9":  [],
    "freddy_C3":  [],
}

# Replace user_labels with the lists above (any click-based labels are discarded)
for curve_id, wanted in manual_outlier_indices.items():
    labels = user_labels.get(curve_id)
    if labels is None:
        # Unknown curve id: nothing to label.
        continue
    labels.fill(False)
    size = len(labels)
    for i in wanted:
        if not 0 <= i < size:
            print(f"  Warning: index {i} out of range for {curve_id} (n={size})")
            continue
        labels[i] = True

n_labeled = sum(flags.sum() for flags in user_labels.values())
print(f"Applied manual labels: {n_labeled} point(s) marked as outliers.")

Applied manual labels: 0 point(s) marked as outliers.


## 5 · Save traces and labels

Run this cell **after** labeling (Option A or B).

- **`test_traces.csv`** — long-format table of curve data: `curve_id, idx, t, od`
- **`test_labels.csv`** — boolean labels: `curve_id, idx, is_outlier`

`is_outlier` is the **union** of injected outliers and any extra points you labelled manually.

In [None]:
# Long-format rows: one trace row and one label row per (curve, point).
trace_rows = []
label_rows = []

for cid, data in curves_perturbed.items():
    od_arr = data["od"]
    # Ground truth = injected outliers plus any extra user picks.
    combined = true_outlier_mask[cid] | user_labels[cid]

    for i, t_val in enumerate(data["t"]):
        trace_rows.append(
            {"curve_id": cid, "idx": i, "t": float(t_val), "od": float(od_arr[i])}
        )
        label_rows.append(
            {"curve_id": cid, "idx": i, "is_outlier": bool(combined[i])}
        )

traces_df = pd.DataFrame(trace_rows)
labels_df = pd.DataFrame(label_rows)

traces_df.to_csv(TRACES_FILE, index=False)
labels_df.to_csv(LABELS_FILE, index=False)

n_out = labels_df["is_outlier"].sum()
print(f"Traces → {TRACES_FILE.resolve()}  ({len(traces_df):,} rows)")
print(f"Labels → {LABELS_FILE.resolve()}  ({n_out} outliers / {len(labels_df):,} total)")

# Per-curve breakdown of where the labels came from.
header = f"\n  {'Curve':20s}  {'Points':>6}  {'Injected':>8}  {'User-extra':>10}  {'Combined':>8}"
print(header)
for cid in curves_perturbed:
    inj, usr = true_outlier_mask[cid], user_labels[cid]
    n_extra = (usr & ~inj).sum()      # user-labelled but not injected
    n_comb = (inj | usr).sum()
    print(f"  {cid:20s}  {len(inj):>6}  {inj.sum():>8}  {n_extra:>10}  {n_comb:>8}")

## 6 · Ground truth summary

Breakdown of what is in the saved ground truth: synthetic injections and any extra annotated artifacts.

In [None]:
# Dataset-level totals taken from the saved label table.
n_total = len(labels_df)
n_out = int(labels_df["is_outlier"].sum())
n_normal = n_total - n_out
pct_out = 100 * n_out / n_total

summary_lines = [
    f"Ground truth dataset\n{'─'*40}",
    f"  Curves          : {len(curves_perturbed)}",
    f"  Total points    : {n_total:,}",
    f"  Outliers        : {n_out}  ({pct_out:.2f}%)",
    f"  Normal          : {n_normal:,}",
]
print("\n".join(summary_lines))

# Per-curve table: synthetic injections vs. extra manual annotations.
print(f"\n  {'Curve':20s}  {'Points':>6}  {'Synthetic':>9}  {'Annotated':>9}  {'Total GT':>8}")
for cid in curves_perturbed:
    inj = true_outlier_mask[cid]
    usr = user_labels[cid]
    n_synth = int(inj.sum())
    n_annot = int((usr & ~inj).sum())   # annotated beyond synthetic
    n_gt = int((inj | usr).sum())
    print(f"  {cid:20s}  {len(inj):>6}  {n_synth:>9}  {n_annot:>9}  {n_gt:>8}")