In [1]:
import pandas as pd
import numpy as np

def extract_bn_episodes(
    df: pd.DataFrame,
    time_col: str = "time_s",          # or "tick" if you prefer
    flag_col: str = "is_bn",
    prod_cols: list[str] = None,
    score_col: str = "bottleneck_score",
    min_len: int = 2,                  # minimum episode duration in minutes
    merge_gap: int = 0                 # merge episodes separated by <= merge_gap non-bn minutes
) -> pd.DataFrame:
    """Group consecutive bottleneck-flagged rows into episodes and summarise them.

    Rows are sorted by ``time_col``; each maximal run of rows whose
    ``flag_col`` is truthy becomes a candidate episode. The sampling step is
    inferred as the most common difference between consecutive ``time_col``
    values (falling back to 60 when it cannot be inferred or is not
    positive). Durations and gaps are counted in those steps, so "minutes"
    below means "samples" and matches wall-clock minutes only when the data
    has one row per minute.

    Args:
        df: Input frame; it is not modified.
        time_col: Numeric, sortable time column.
        flag_col: Boolean-like bottleneck flag; missing values count as False.
        prod_cols: Production-rate columns used to pick the dominant item.
            When None, every column named ``production__*_per_min`` is used.
        score_col: Optional score column; when absent from ``df`` the
            ``peak_score`` / ``mean_score`` outputs are NaN.
        min_len: Episodes whose span is shorter than this many steps are
            dropped.
        merge_gap: When > 0, neighbouring runs separated by at most this many
            non-flagged steps are merged into one episode.

    Returns:
        One row per episode with columns ``episode_id``, ``start``, ``end``,
        ``duration_min``, ``dominant_item``, ``dominant_share``,
        ``peak_score``, ``mean_score`` and ``bn_minutes``, sorted by
        ``peak_score`` then ``duration_min`` (both descending). The frame is
        empty (but still has these columns) when no episode qualifies.

    Raises:
        ValueError: If no production columns are given or can be discovered.
    """
    out_columns = [
        "episode_id",
        "start",
        "end",
        "duration_min",
        "dominant_item",
        "dominant_share",
        "peak_score",
        "mean_score",
        "bn_minutes",
    ]

    d = df.copy()
    d = d.sort_values(time_col).reset_index(drop=True)

    if prod_cols is None:
        prod_cols = [
            c for c in d.columns
            if c.startswith("production__") and c.endswith("_per_min")
        ]
    if not prod_cols:
        raise ValueError("No production__*_per_min columns found. Pass prod_cols explicitly.")

    d[flag_col] = d[flag_col].fillna(False).astype(bool)

    # Sampling step of time_col, needed both for gap merging and durations.
    diffs = d[time_col].diff().dropna()
    step = int(diffs.mode().iloc[0]) if not diffs.empty else 60
    if step <= 0:
        step = 60

    # Label every maximal run of identical flag values with its own id.
    flags = d[flag_col].to_numpy()
    d["_run"] = np.r_[True, flags[1:] != flags[:-1]].cumsum()

    bn_rows = d[d[flag_col]]
    if bn_rows.empty:
        return pd.DataFrame(columns=out_columns)

    # Raw bottleneck runs as (start, end) time pairs, in chronological order.
    runs = (
        bn_rows
        .groupby("_run", as_index=False)
        .agg(start=(time_col, "min"), end=(time_col, "max"))
        .sort_values("start")
        .reset_index(drop=True)[["start", "end"]]
    )

    # Optionally bridge short non-bottleneck gaps. The gap is measured in
    # sampling steps (not raw time units) so that merge_gap keeps the same
    # meaning whether time_col holds seconds or ticks.
    if merge_gap > 0:
        spans = list(zip(runs["start"].tolist(), runs["end"].tolist()))
        merged = [list(spans[0])]
        for run_start, run_end in spans[1:]:
            missing_steps = int(round((run_start - merged[-1][1]) / step)) - 1
            if missing_steps <= merge_gap:
                merged[-1][1] = max(merged[-1][1], run_end)
            else:
                merged.append([run_start, run_end])
        runs = pd.DataFrame(merged, columns=["start", "end"])

    has_score = score_col in d.columns

    # Summarise each episode; only flagged rows inside the span contribute.
    episodes = []
    for ep_id, r in enumerate(runs.itertuples(index=False), start=1):
        mask = (d[time_col] >= r.start) & (d[time_col] <= r.end) & d[flag_col]
        chunk = d.loc[mask]
        if chunk.empty:
            continue

        first_t = chunk[time_col].min()
        last_t = chunk[time_col].max()

        # Inclusive span length in steps.
        duration_min = int((last_t - first_t) / step) + 1
        if duration_min < min_len:
            continue

        # Dominant item: largest summed production over the episode.
        sums = chunk[prod_cols].sum(axis=0)
        total = float(sums.sum())
        dom_item = str(sums.idxmax()) if total > 0 else None
        dom_share = float(sums.max() / total) if total > 0 else np.nan

        peak = float(chunk[score_col].max()) if has_score else np.nan
        mean = float(chunk[score_col].mean()) if has_score else np.nan

        episodes.append({
            "episode_id": ep_id,
            "start": int(first_t),
            "end": int(last_t),
            "duration_min": duration_min,
            "dominant_item": dom_item,
            "dominant_share": dom_share,
            "peak_score": peak,
            "mean_score": mean,
            "bn_minutes": int(len(chunk)),
        })

    # Passing explicit columns keeps the sort valid when nothing qualified.
    ep = pd.DataFrame(episodes, columns=out_columns)
    ep = ep.sort_values(
        ["peak_score", "duration_min"], ascending=[False, False]
    ).reset_index(drop=True)
    return ep

# Extract bottleneck episodes from dfA, passing every option explicitly.
# NOTE(review): dfA and prod_rate_cols are not defined in this cell; they
# presumably come from earlier notebook cells that must be run first (the
# recorded output below is a NameError for dfA).
episodes = extract_bn_episodes(
    dfA,
    time_col="time_s",
    flag_col="is_bn",
    prod_cols=prod_rate_cols,
    score_col="bottleneck_score",
    min_len=2,
    merge_gap=0
)

# Bare expression: the notebook renders the resulting frame as the cell output.
episodes

NameError: name 'dfA' is not defined