# kaggle.ipynb — 用已保存的模型在 Kaggle 上推理并生成 submission.csv

本 Notebook 默认按 **LightGBM（本地 `local.ipynb` 训练产物）** 进行推理，并自动生成 `/kaggle/working/submission.csv`。

## 你需要上传哪些内容（强烈建议做成一个 Kaggle Dataset）
把本地目录下的以下文件上传到 Kaggle，方式建议：**Create Dataset → Upload**，然后在 Notebook 里 **Add Data**。

必须：
- `models/` 目录（包含多个 `lgbm_{action}.txt`）
- `thresholds.joblib`

建议（可选）：
- `train_frame_samples_cfg.joblib`（仅用于记录超参；推理不需要）

### 你需要修改哪里
只需要改第 1 个代码单元中的：
- `ARTIFACT_DATASET_NAME`：你上传的模型数据集目录名（出现在 `/kaggle/input/<这里>`）

竞赛数据路径会自动探测（在 `/kaggle/input` 下查找同时包含 `test.csv` 和 `sample_submission.csv` 的目录；若有多个候选，优先选择含 `test_tracking/` 的目录），一般不需要改。


In [24]:
from __future__ import annotations

import os
import json
import hashlib
import subprocess
from pathlib import Path
from typing import Any

# === This is essentially the only setting you need to change ===
# Directory name under /kaggle/input of the "model artifacts" Dataset you uploaded to Kaggle.
# Example: if /kaggle/input contains a folder "mabe-lgbm-artifacts", put that name here.
ARTIFACT_DATASET_NAME = "mabe-challenge-dataset"

# Model type: "lgbm" is the default and the one this project actually uses.
# Templates for pytorch/tensorflow/onnx are provided further down (safe to ignore).
MODEL_TYPE = "lgbm"  # {"lgbm","pytorch","tensorflow","onnx"}

# Competition data: usually left as None; the notebook scans /kaggle/input for the
# directory that contains test.csv.
COMP_DATA_DIR: Path | None = None

# Artifact input directory (where the models and thresholds live); None means
# it is derived from ARTIFACT_DATASET_NAME.
ARTIFACT_DIR: Path | None = None

# Output directory (Kaggle requires outputs to be written to /kaggle/working).
WORK_DIR = Path("/kaggle/working")
WORK_DIR.mkdir(parents=True, exist_ok=True)

# Inference parameters (usually no need to change).
MEDIAN_KERNEL = 9       # median-filter kernel size for probability smoothing; should be odd
MIN_DURATION_SEC = 0.10  # segments shorter than this many seconds are dropped
MERGE_GAP_SEC = 0.05  # segments separated by at most this many seconds are merged
OUTPUT_SELF_AS_LITERAL = True  # write target_id "self" (instead of "mouse<N>") for self-directed rows

# Submission column names (fixed by the competition format; usually no need to change).
SUB_COLS = ["row_id", "video_id", "agent_id", "target_id", "action", "start_frame", "stop_frame"]

# Fail fast on a typo in MODEL_TYPE.
assert MODEL_TYPE in {"lgbm", "pytorch", "tensorflow", "onnx"}

print("MODEL_TYPE:", MODEL_TYPE)
print("WORK_DIR:", WORK_DIR)

MODEL_TYPE: lgbm
WORK_DIR: /kaggle/working


In [25]:
# 2) 检查 Kaggle 输入目录与文件是否齐全（并打印树）

def _find_comp_data_dir() -> Path:
    """Locate the competition data directory under ``/kaggle/input``.

    A directory qualifies when it directly contains both ``test.csv`` and
    ``sample_submission.csv``. When several qualify, one that also has a
    ``test_tracking`` entry wins; remaining ties are broken by name.

    Returns:
        The selected dataset directory.

    Raises:
        RuntimeError: ``/kaggle/input`` does not exist.
        FileNotFoundError: No directory under ``/kaggle/input`` qualifies.
    """
    input_root = Path("/kaggle/input")
    if not input_root.exists():
        raise RuntimeError("This notebook is intended to run on Kaggle. /kaggle/input not found.")

    marker_files = ("test.csv", "sample_submission.csv")
    matches = [
        entry
        for entry in input_root.iterdir()
        if entry.is_dir() and all((entry / marker).exists() for marker in marker_files)
    ]
    if not matches:
        raise FileNotFoundError("Could not find competition data dir containing test.csv under /kaggle/input")

    # False sorts before True, so directories that have test_tracking come first;
    # min() with this key returns the element a sort would place at index 0.
    return min(matches, key=lambda d: (not (d / "test_tracking").exists(), d.name))


def _tree(path: Path, max_depth: int = 3, max_entries: int = 300) -> None:
    """Print an indented listing of ``path`` for a quick visual check of the inputs.

    Within each directory, sub-directories are listed before files and each
    group is sorted by name. Every entry is printed as its path relative to
    ``path``, indented by two spaces per nesting level; directories get a
    trailing ``/``.

    Args:
        path: Root directory to list.
        max_depth: Deepest nesting level that is still listed; the root's
            direct children are level 0.
        max_entries: Maximum number of entries printed. Once reached, a single
            ``... (truncated)`` line is printed and the listing stops.
    """
    root = Path(path)
    print(f"\n[tree] {root}")
    printed = 0
    truncated = False

    def walk(cur: Path, depth: int) -> None:
        nonlocal printed, truncated
        if depth > max_depth:
            return
        try:
            entries = sorted(cur.iterdir(), key=lambda p: (p.is_file(), p.name))
        except OSError:
            # Best-effort listing: a missing or unreadable directory is skipped.
            return
        for entry in entries:
            if truncated:
                # A deeper level already hit the limit; unwind without printing again.
                return
            if printed >= max_entries:
                print("... (truncated)")
                truncated = True
                return
            suffix = "/" if entry.is_dir() else ""
            print("  " * depth + f"- {entry.relative_to(root)}{suffix}")
            printed += 1
            if entry.is_dir():
                walk(entry, depth + 1)

    walk(root, 0)


# Resolve both input locations unless they were pinned by hand in the config cell.
if COMP_DATA_DIR is None:
    COMP_DATA_DIR = _find_comp_data_dir()
else:
    COMP_DATA_DIR = Path(COMP_DATA_DIR)
print("Detected COMP_DATA_DIR:", COMP_DATA_DIR)

if ARTIFACT_DIR is None:
    ARTIFACT_DIR = Path("/kaggle/input") / ARTIFACT_DATASET_NAME
else:
    ARTIFACT_DIR = Path(ARTIFACT_DIR)
print("ARTIFACT_DIR:", ARTIFACT_DIR)

# List every dataset directory attached to this notebook.
print("\nAvailable /kaggle/input dirs:")
for input_dir in sorted(d for d in Path("/kaggle/input").iterdir() if d.is_dir()):
    print("-", input_dir.name)

_tree(COMP_DATA_DIR, max_depth=2)
_tree(ARTIFACT_DIR, max_depth=3)

# Paths that must exist before inference can start.
REQ = [COMP_DATA_DIR / name for name in ("test.csv", "sample_submission.csv")]
REQ += [ARTIFACT_DIR / name for name in ("thresholds.joblib", "models")]

for required_path in REQ:
    assert required_path.exists(), f"Missing required path: {required_path}"

print("\nOK: required files exist.")


Detected COMP_DATA_DIR: /kaggle/input/MABe-mouse-behavior-detection
ARTIFACT_DIR: /kaggle/input/mabe-challenge-dataset

Available /kaggle/input dirs:
- MABe-mouse-behavior-detection
- mabe-challenge-dataset

[tree] /kaggle/input/MABe-mouse-behavior-detection
- test_tracking/
  - test_tracking/AdaptableSnail/
    - test_tracking/AdaptableSnail/438887472.parquet
- train_annotation/
  - train_annotation/AdaptableSnail/
    - train_annotation/AdaptableSnail/1212811043.parquet
    - train_annotation/AdaptableSnail/1260392287.parquet
    - train_annotation/AdaptableSnail/1351098077.parquet
    - train_annotation/AdaptableSnail/1408652858.parquet
    - train_annotation/AdaptableSnail/143861384.parquet
    - train_annotation/AdaptableSnail/1596473327.parquet
    - train_annotation/AdaptableSnail/1643942986.parquet
    - train_annotation/AdaptableSnail/1717182687.parquet
    - train_annotation/AdaptableSnail/2078515636.parquet
    - train_annotation/AdaptableSnail/209576908.parquet
    - train_

In [26]:
# 3) 安装依赖（按需 pip 安装）
import importlib

def _ensure(pkg: str, pip_name: str | None = None):
    """Import ``pkg``, pip-installing it first when the import fails.

    Args:
        pkg: Importable module name.
        pip_name: Requirement string handed to pip (may pin a version);
            defaults to ``pkg``.

    Returns:
        The imported module object.

    Raises:
        subprocess.CalledProcessError: ``pip install`` exited with a non-zero status.
        ImportError: The module still cannot be imported after installation.
    """
    import sys  # local import so this helper does not depend on a module-level `import sys`

    try:
        return importlib.import_module(pkg)
    except Exception:
        name = pip_name or pkg
        print(f"Installing {name}...")
        # Install with the interpreter that is running this notebook; a bare
        # "python" on PATH may belong to a different environment.
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", name])
        # Make the import system notice packages that appeared after startup.
        importlib.invalidate_caches()
        return importlib.import_module(pkg)

# Kaggle images normally ship these; they are installed only if the import fails.
np = _ensure("numpy")
pd = _ensure("pandas")
joblib = _ensure("joblib")
scipy = _ensure("scipy")

# Required for LightGBM inference; the pinned version is used only when it has to be installed.
lgb = _ensure("lightgbm", "lightgbm==4.6.0")

from scipy.signal import medfilt

# Print the runtime versions so a run can be reproduced or debugged later.
print("Versions:")
import sys
print("python", sys.version)
print("numpy", np.__version__)
print("pandas", pd.__version__)
print("joblib", joblib.__version__)
print("scipy", scipy.__version__)
print("lightgbm", lgb.__version__)


Versions:
python 3.11.13 (main, Jun  4 2025, 08:57:29) [GCC 11.4.0]
numpy 1.26.4
pandas 2.2.3
joblib 1.5.2
scipy 1.15.3
lightgbm 4.6.0


In [27]:
# 4) Load the optional config and label map
# LightGBM inference in this project only depends on models/ + thresholds.joblib.
# This cell is an optional-config template: files are read when present and skipped otherwise.

CONFIG_PATHS = [
    ARTIFACT_DIR / "config.json",
    ARTIFACT_DIR / "config.yaml",
    ARTIFACT_DIR / "config.yml",
]

config: dict[str, Any] = {}
for p in CONFIG_PATHS:
    if not p.is_file():
        continue
    if p.suffix != ".json":
        # A YAML parser is not guaranteed to be installed, so YAML configs are
        # reported and ignored rather than parsed.
        print(f"Found {p.name} but YAML loader not included; skipping.")
        continue
    config = json.loads(p.read_text(encoding="utf-8"))
    # Announce a load only when a file was actually parsed.
    print("Loaded config from:", p)
    break

# Optional label map
LABEL_MAP_PATH = ARTIFACT_DIR / "label_map.json"
label_map: dict[str, Any] | None = None
if LABEL_MAP_PATH.exists():
    label_map = json.loads(LABEL_MAP_PATH.read_text(encoding="utf-8"))
    print("Loaded label_map.json")
else:
    print("No label_map.json (OK for this competition).")


No label_map.json (OK for this competition).


In [None]:
# 5) Load the model. LightGBM (MODEL_TYPE='lgbm') is the path this project actually uses;
# the PyTorch / TensorFlow / ONNX branches are unimplemented templates kept for reference only.


def load_lgbm_models(artifact_dir: Path) -> dict[str, lgb.Booster]:
    """Load every per-action LightGBM booster stored under ``artifact_dir / "models"``.

    Model files are expected to be named ``lgbm_<action>.txt``; the action name
    is the file stem without the leading ``lgbm_`` prefix.

    Args:
        artifact_dir: Directory that contains the ``models`` sub-directory.

    Returns:
        Mapping from action name to its loaded booster, in sorted file-name
        order. Empty when no matching file exists.
    """
    prefix = "lgbm_"
    models: dict[str, lgb.Booster] = {}
    # sorted(): glob yields entries in filesystem order, which would make the
    # dict order (and therefore "the first model") vary between machines.
    for model_path in sorted((artifact_dir / "models").glob(f"{prefix}*.txt")):
        # Strip only the leading prefix; str.replace would also remove any
        # "lgbm_" that happens to occur inside the action name itself.
        action = model_path.stem.removeprefix(prefix)
        models[action] = lgb.Booster(model_file=str(model_path))
    return models


def build_model():
    """Build the inference bundle for the configured ``MODEL_TYPE``.

    Only ``"lgbm"`` is implemented: it loads the per-action boosters and the
    thresholds file from ``ARTIFACT_DIR`` and returns a dict with the keys
    ``"models"``, ``"thresholds"`` and ``"n_features"``. The other model types
    are templates that raise ``NotImplementedError``.

    Raises:
        AssertionError: No ``lgbm_*.txt`` model was found.
        NotImplementedError: ``MODEL_TYPE`` is one of the template back-ends.
        ValueError: ``MODEL_TYPE`` is not a known model type.
    """
    if MODEL_TYPE == "pytorch":
        import torch
        # TODO: implement/import your own network definition and load its weights
        # model = YourModel(...)
        # state = torch.load(MODEL_PATH, map_location='cpu')
        # model.load_state_dict(state)
        # model.eval()
        raise NotImplementedError("PyTorch template: implement your own model loading.")

    if MODEL_TYPE == "tensorflow":
        import tensorflow as tf
        # model = tf.keras.models.load_model(MODEL_PATH)
        raise NotImplementedError("TensorFlow template: implement your own model loading.")

    if MODEL_TYPE == "onnx":
        import onnxruntime as ort
        # sess = ort.InferenceSession(MODEL_PATH, providers=[...])
        raise NotImplementedError("ONNX template: implement your own model loading.")

    if MODEL_TYPE != "lgbm":
        raise ValueError(MODEL_TYPE)

    boosters = load_lgbm_models(ARTIFACT_DIR)
    thresholds = joblib.load(ARTIFACT_DIR / "thresholds.joblib")
    assert boosters, "No lgbm_*.txt found under ARTIFACT_DIR/models"

    # IMPORTANT (feature alignment):
    # These models were trained from a numpy matrix without passing explicit feature names.
    # LightGBM therefore stores generic names like 'Column_0'..'Column_39'.
    # If we follow Booster.feature_name() at inference time, it will NOT match our engineered
    # columns (dx/dist/...), and any "fill missing columns" fallback would silently feed all-zero
    # features -> degenerate predictions (often only one action survives thresholding).
    #
    # So only the expected feature COUNT is recorded here; inference uses FEATURE_COLS order.
    expected = int(next(iter(boosters.values())).num_feature())

    # Sanity check (warning only): every model should share the same feature count.
    for action, booster in boosters.items():
        found = int(booster.num_feature())
        if found != expected:
            print(f"[WARN] action={action} num_feature differs: {found} vs {expected}")

    print("Model expects n_features:", expected)

    return {"models": boosters, "thresholds": thresholds, "n_features": expected}


# Build the bundle once when the cell runs and report how much was loaded.
bundle = build_model()
if MODEL_TYPE == "lgbm":
    print("Loaded LGBM models:", len(bundle["models"]))
    print("Threshold keys:", len(bundle["thresholds"]))


Model expects n_features: 40
Example feature cols (head): ['Column_0', 'Column_1', 'Column_2', 'Column_3', 'Column_4', 'Column_5', 'Column_6', 'Column_7', 'Column_8', 'Column_9']
Loaded LGBM models: 36
Threshold keys: 36


In [29]:
# 6) Load the test data.
# This competition: test.csv is read here; the per-video test_tracking/<lab_id>/<video_id>.parquet files are read later, during inference.


def _json_load_maybe(value: Any) -> Any:
    """Decode ``value`` as JSON, returning ``None`` when there is nothing usable.

    ``None``, a float NaN, a blank string and text that is not valid JSON all
    yield ``None``. Lists and dicts are returned unchanged. Anything else is
    converted with ``str()`` and parsed with ``json.loads``.
    """
    if value is None:
        return None
    if isinstance(value, float) and np.isnan(value):
        return None
    if isinstance(value, (list, dict)):
        return value
    text = str(value).strip()
    if not text:
        return None
    try:
        return json.loads(text)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; pathologically nested input can
        # raise RecursionError. Both mean "no usable value".
        return None


def parse_behaviors_labeled(s: Any) -> list[tuple[int, object, str]]:
    """Parse behaviors_labeled into (agent_raw, target_raw, action).

    Raw format: "mouse1,mouse2,approach" or "mouse1,self,rear".
    Keep target_raw='self' literal so submission can match public notebooks / official metric expectations.

    Args:
        s: A JSON-encoded list of such strings, an already-decoded list, or a
            missing value (``None`` / NaN / blank).

    Returns:
        One ``(agent, target, action)`` tuple per well-formed item, where
        ``agent`` is the integer after ``mouse`` and ``target`` is either that
        integer or the literal string ``"self"``. Malformed items are skipped;
        missing, undecodable or non-collection input yields ``[]``.
    """
    v = _json_load_maybe(s)
    # JSON scalars (numbers, booleans, bare strings) carry no behavior items;
    # iterating a number or boolean would raise TypeError.
    if not isinstance(v, (list, dict)):
        return []

    out: list[tuple[int, object, str]] = []
    for item in v:
        parts = str(item).split(",")
        if len(parts) != 3:
            continue
        a_raw, t_raw, action = [p.strip() for p in parts]
        if not a_raw.startswith("mouse"):
            continue
        try:
            agent = int(a_raw.replace("mouse", ""))
        except ValueError:
            continue

        if t_raw == "self":
            target: object = "self"
        else:
            if not t_raw.startswith("mouse"):
                continue
            try:
                target = int(t_raw.replace("mouse", ""))
            except ValueError:
                continue
        out.append((agent, target, str(action)))
    return out


TEST_DF = pd.read_csv(COMP_DATA_DIR / "test.csv")

# Validate the schema before any column is used, so a malformed test.csv fails
# with a clear message instead of a bare KeyError from the line below.
for _col in ("lab_id", "video_id", "behaviors_labeled"):
    if _col not in TEST_DF.columns:
        raise ValueError(f"test.csv is missing required column: {_col!r}")

# Decode the behaviors_labeled cell of every row into (agent, target, action) tuples.
TEST_DF["behaviors_labeled_list"] = TEST_DF["behaviors_labeled"].apply(parse_behaviors_labeled)

print("TEST_DF:", TEST_DF.shape)
print(TEST_DF[["lab_id", "video_id"]].head())

TEST_DF: (1, 39)
           lab_id   video_id
0  AdaptableSnail  438887472


In [30]:
# 7) Preprocessing and batching (adapt per task)
# Inference in this project covers a single test video, so features can simply be
# computed for all frames at once (no elaborate batching needed).

# Body-part names tried in order when choosing the keypoint used as the body
# center, the nose and the tail respectively (see build_mouse_state).
CENTER_PART_PRIORITY = ["body_center", "neck"]
NOSE_PART_PRIORITY = ["nose"]
TAIL_PART_PRIORITY = ["tail_base", "tail_midpoint"]


def _pick_available_part(available: set[str], priority: list[str]) -> str | None:
    """Return the first name in ``priority`` that is in ``available``, else ``None``."""
    return next((name for name in priority if name in available), None)


def _get_tracking_path(split_dir: Path, lab_id: str, video_id: int) -> Path:
    """Return ``<split_dir>/<lab_id>/<video_id>.parquet``.

    ``split_dir`` is the ``train_tracking`` or ``test_tracking`` directory of
    the Kaggle data; ``video_id`` is coerced to ``int`` for the file name.
    """
    filename = "%d.parquet" % int(video_id)
    return split_dir.joinpath(str(lab_id), filename)


def load_tracking_df(split: str, lab_id: str, video_id: int) -> pd.DataFrame:
    """Read the tracking parquet of one video and check its required columns.

    Args:
        split: ``"test"`` reads from ``test_tracking``; any other value reads
            from ``train_tracking``.
        lab_id: Lab sub-directory name.
        video_id: Video identifier used as the parquet file name.

    Returns:
        The tracking table exactly as stored in the parquet file.

    Raises:
        FileNotFoundError: The parquet file does not exist.
        ValueError: One of the columns ``video_frame``, ``mouse_id``,
            ``bodypart``, ``x``, ``y`` is absent.
    """
    subdir = "test_tracking" if split == "test" else "train_tracking"
    parquet_path = _get_tracking_path(COMP_DATA_DIR / subdir, lab_id, video_id)
    if not parquet_path.exists():
        raise FileNotFoundError(parquet_path)

    tracking = pd.read_parquet(parquet_path)
    missing = {"video_frame", "mouse_id", "bodypart", "x", "y"}.difference(tracking.columns)
    if missing:
        raise ValueError(f"Tracking parquet missing columns: {missing}")
    return tracking


def build_mouse_state(tracking_df: pd.DataFrame, fps: float, pix_per_cm: float | None, interp_limit: int = 5) -> pd.DataFrame:
    """Build a dense per-frame, per-mouse state table from long-format tracking rows.

    Args:
        tracking_df: Tracking rows with at least the columns ``video_frame``,
            ``mouse_id``, ``bodypart``, ``x`` and ``y``. The input is copied,
            not modified.
        fps: Frame rate; per-frame displacements are multiplied by it to get
            velocities.
        pix_per_cm: When positive and finite, coordinates are divided by it
            (presumably converting pixels to centimetres); otherwise the
            coordinates are used as they are.
        interp_limit: Maximum number of consecutive missing frames filled by
            interpolation per mouse.

    Returns:
        A DataFrame indexed by ``(video_frame, mouse)`` covering every frame
        from 0 to the largest frame seen, for every mouse, with the columns
        ``center_x``, ``center_y``, ``vx``, ``vy``, ``speed``, ``hx``, ``hy``,
        ``has_heading`` and ``body_len``.
    """
    df = tracking_df.copy()
    # Normalise mouse ids to integers; object-dtype ids have the "mouse" text stripped first.
    if df["mouse_id"].dtype == object:
        df["mouse"] = df["mouse_id"].astype(str).str.replace("mouse", "", regex=False).astype(int)
    else:
        df["mouse"] = df["mouse_id"].astype(int)
    df["video_frame"] = df["video_frame"].astype(int)

    # Choose which tracked body parts stand in for center / nose / tail in this video.
    available_parts = set(df["bodypart"].astype(str).unique())
    center_part = _pick_available_part(available_parts, CENTER_PART_PRIORITY)
    nose_part = _pick_available_part(available_parts, NOSE_PART_PRIORITY)
    tail_part = _pick_available_part(available_parts, TAIL_PART_PRIORITY)

    # Dense (frame, mouse) grid so that frames without detections become explicit NaN rows.
    mice = sorted(df["mouse"].unique().astype(int).tolist())
    max_frame = int(df["video_frame"].max())
    full_index = pd.MultiIndex.from_product(
        [np.arange(0, max_frame + 1, dtype=np.int32), mice],
        names=["video_frame", "mouse"],
    )

    def _extract_part(part: str) -> pd.DataFrame:
        """Return the x/y of one body part as ``<part>_x``/``<part>_y``, indexed by (video_frame, mouse)."""
        part_df = df.loc[df["bodypart"].astype(str) == part, ["video_frame", "mouse", "x", "y"]].copy()
        part_df = part_df.rename(columns={"x": f"{part}_x", "y": f"{part}_y"})
        part_df = part_df.set_index(["video_frame", "mouse"]).sort_index()
        return part_df

    def _to_cm(part_df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
        """Divide ``cols`` by ``pix_per_cm`` in place when it is a positive finite number."""
        if pix_per_cm and np.isfinite(pix_per_cm) and pix_per_cm > 0:
            part_df[cols] = part_df[cols] / float(pix_per_cm)
        return part_df

    # Center position: use the preferred center body part when it is tracked ...
    if center_part is not None:
        center_xy = _extract_part(center_part)
        center_xy = center_xy.rename(columns={f"{center_part}_x": "center_x", f"{center_part}_y": "center_y"}).reindex(full_index)
        center_xy = _to_cm(center_xy, ["center_x", "center_y"])
        out = center_xy
    else:
        # ... otherwise approximate it as the mean of a fallback set of body parts.
        if {"nose", "tail_base"}.issubset(available_parts):
            parts_for_center = ["nose", "tail_base"]
        elif {"ear_left", "ear_right"}.issubset(available_parts):
            parts_for_center = ["ear_left", "ear_right"]
        else:
            parts_for_center = [p for p in ["nose", "tail_base", "ear_left", "ear_right", "tail_tip"] if p in available_parts]
            if not parts_for_center:
                # Last resort: the first two available parts in alphabetical order.
                parts_for_center = sorted(list(available_parts))[:2]

        part_dfs = []
        for p in parts_for_center:
            d = _extract_part(p).reindex(full_index)
            d = d.rename(columns={f"{p}_x": f"x_{p}", f"{p}_y": f"y_{p}"})
            d = _to_cm(d, [f"x_{p}", f"y_{p}"])
            part_dfs.append(d)

        merged = pd.concat(part_dfs, axis=1)
        x_cols = [c for c in merged.columns if c.startswith("x_")]
        y_cols = [c for c in merged.columns if c.startswith("y_")]
        out = pd.DataFrame(index=full_index)
        # skipna=True: a frame missing some of the parts still gets the mean of the rest.
        out["center_x"] = merged[x_cols].mean(axis=1, skipna=True)
        out["center_y"] = merged[y_cols].mean(axis=1, skipna=True)

    # Fill short gaps per mouse by interpolation (up to interp_limit frames, in
    # both directions); anything still missing becomes 0.0.
    out[["center_x", "center_y"]] = (
        out.groupby(level="mouse", group_keys=False)[["center_x", "center_y"]]
        .apply(lambda g: g.interpolate(limit=interp_limit, limit_direction="both"))
        .fillna(0.0)
    )

    # Velocity = per-mouse frame-to-frame displacement times fps; the first frame of each mouse gets 0.
    out[["vx", "vy"]] = out.groupby(level="mouse", group_keys=False)[["center_x", "center_y"]].diff().fillna(0.0) * float(fps)
    out["speed"] = np.sqrt(out["vx"] ** 2 + out["vy"] ** 2)

    # Heading: unit vector pointing from the center to the nose. Defaults to
    # (0, 0) with has_heading=0 when it cannot be computed.
    out["hx"] = 0.0
    out["hy"] = 0.0
    out["has_heading"] = np.int8(0)

    nose_xy = None
    if nose_part is not None:
        nose_xy = _extract_part(nose_part).rename(columns={f"{nose_part}_x": "nose_x", f"{nose_part}_y": "nose_y"}).reindex(full_index)
        nose_xy = _to_cm(nose_xy, ["nose_x", "nose_y"])
        nose_xy[["nose_x", "nose_y"]] = (
            nose_xy.groupby(level="mouse", group_keys=False)[["nose_x", "nose_y"]]
            .apply(lambda g: g.interpolate(limit=interp_limit, limit_direction="both"))
            .fillna(0.0)
        )

        dx = nose_xy["nose_x"] - out["center_x"]
        dy = nose_xy["nose_y"] - out["center_y"]
        norm = np.sqrt(dx * dx + dy * dy)
        # Only normalise where nose and center are distinguishable, avoiding division by ~0.
        ok = norm > 1e-6
        out.loc[ok, "hx"] = (dx.loc[ok] / norm.loc[ok]).astype(float)
        out.loc[ok, "hy"] = (dy.loc[ok] / norm.loc[ok]).astype(float)
        out.loc[ok, "has_heading"] = np.int8(1)

    # Body length: nose-to-tail distance; stays 0.0 when either part is not tracked.
    out["body_len"] = 0.0
    if tail_part is not None and nose_xy is not None:
        tail_xy = _extract_part(tail_part).rename(columns={f"{tail_part}_x": "tail_x", f"{tail_part}_y": "tail_y"}).reindex(full_index)
        tail_xy = _to_cm(tail_xy, ["tail_x", "tail_y"])
        tail_xy[["tail_x", "tail_y"]] = (
            tail_xy.groupby(level="mouse", group_keys=False)[["tail_x", "tail_y"]]
            .apply(lambda g: g.interpolate(limit=interp_limit, limit_direction="both"))
            .fillna(0.0)
        )
        out["body_len"] = np.sqrt((nose_xy["nose_x"] - tail_xy["tail_x"]) ** 2 + (nose_xy["nose_y"] - tail_xy["tail_y"]) ** 2).astype(float)

    return out


def _rolling_feats(s: pd.Series, windows: tuple[int, ...]) -> dict[str, pd.Series]:
    """Compute centered rolling mean and population std of ``s`` per window size.

    Returns a dict keyed ``mean_<w>`` and ``std_<w>`` for each ``w`` in
    ``windows``, in that order. Windows shrink at the edges
    (``min_periods=1``) and any NaN std is replaced by 0.0.
    """
    feats: dict[str, pd.Series] = {}
    for win in windows:
        roller = s.rolling(window=win, min_periods=1, center=True)
        feats.update(
            {
                f"mean_{win}": roller.mean(),
                f"std_{win}": roller.std(ddof=0).fillna(0.0),
            }
        )
    return feats


def pair_features_for_frames(state: pd.DataFrame, agent: int, target: int, frames: np.ndarray) -> pd.DataFrame:
    """Compute the per-frame feature table for one (agent, target) mouse pair.

    Args:
        state: Per-frame, per-mouse state indexed by ``(video_frame, mouse)``,
            as produced by ``build_mouse_state``.
        agent: Mouse number of the acting animal.
        target: Mouse number of the animal acted upon.
        frames: Frame numbers to produce rows for; used as the result's index.

    Returns:
        A DataFrame indexed by ``frames`` holding the base pair features, their
        first differences, rolling statistics over 5 and 15 frames, and four
        body-length-relative features. Infinite and missing values are
        replaced by 0.0. When either mouse is absent from ``state`` the result
        is an all-zero table with the ``FEATURE_COLS`` columns.
    """
    # Robustness: behaviors_labeled may reference a mouse that has no tracking in
    # this video; all-zero features make the model predict background there.
    mice_present = set(state.index.get_level_values("mouse").unique().astype(int).tolist())
    if not (int(agent) in mice_present and int(target) in mice_present):
        zeros = pd.DataFrame(0.0, index=frames, columns=FEATURE_COLS)
        for flag_col in ("agent_has_heading", "target_has_heading"):
            if flag_col in zeros.columns:
                zeros[flag_col] = zeros[flag_col].astype(np.int8)
        return zeros

    agent_state = state.xs(agent, level="mouse").reindex(frames)
    target_state = state.xs(target, level="mouse").reindex(frames)

    # Offset from agent to target; the epsilon keeps the divisions below finite.
    dx = (target_state["center_x"] - agent_state["center_x"]).astype(float)
    dy = (target_state["center_y"] - agent_state["center_y"]).astype(float)
    dist = np.sqrt(dx * dx + dy * dy) + 1e-6

    rel_vx = (target_state["vx"] - agent_state["vx"]).astype(float)
    rel_vy = (target_state["vy"] - agent_state["vy"]).astype(float)
    # Relative velocity projected on the agent->target direction.
    closing = (dx * rel_vx + dy * rel_vy) / dist
    # Agent heading projected on the agent->target direction.
    cos_facing = (agent_state["hx"] * dx + agent_state["hy"] * dy) / dist

    base_columns = {
        "dx": dx,
        "dy": dy,
        "dist": dist,
        "agent_speed": agent_state["speed"].astype(float),
        "target_speed": target_state["speed"].astype(float),
        "closing": closing.astype(float),
        "cos_facing": cos_facing.astype(float),
        "agent_has_heading": agent_state["has_heading"].fillna(0).astype(np.int8),
        "target_has_heading": target_state["has_heading"].fillna(0).astype(np.int8),
        "agent_body_len": agent_state["body_len"].fillna(0.0).astype(float),
        "target_body_len": target_state["body_len"].fillna(0.0).astype(float),
    }
    feats = pd.DataFrame(base_columns, index=frames)

    # Signals that get a first difference and rolling statistics, in column order.
    dynamic_cols = ("dist", "closing", "cos_facing", "agent_speed", "target_speed")
    for col in dynamic_cols:
        feats[f"{col}_diff1"] = feats[col].diff().fillna(0.0)

    windows = (5, 15)
    for col in dynamic_cols:
        for suffix, series in _rolling_feats(feats[col], windows).items():
            feats[f"{col}_{suffix}"] = series

    # Relative scale features (these are the extra +4 features that make 40 total)
    eps = 1e-6
    relative_specs = (
        ("dist_rel_agent", "dist", "agent_body_len"),
        ("dist_rel_target", "dist", "target_body_len"),
        ("agent_speed_rel", "agent_speed", "agent_body_len"),
        ("target_speed_rel", "target_speed", "target_body_len"),
    )
    for name, numerator, denominator in relative_specs:
        feats[name] = feats[numerator] / (feats[denominator] + eps)

    return feats.replace([np.inf, -np.inf], np.nan).fillna(0.0)


# Names and ORDER of the 40 engineered feature columns fed to the models.
# run_inference_lgbm selects feats[FEATURE_COLS] to build the input matrix and
# checks len(FEATURE_COLS) against the boosters' num_feature(), so this list
# presumably has to match the column order used at training time - TODO confirm
# against the training notebook.
FEATURE_COLS = [
    "dx",
    "dy",
    "dist",
    "agent_speed",
    "target_speed",
    "closing",
    "cos_facing",
    "agent_has_heading",
    "target_has_heading",
    "agent_body_len",
    "target_body_len",
    "dist_diff1",
    "closing_diff1",
    "cos_facing_diff1",
    "agent_speed_diff1",
    "target_speed_diff1",
    "dist_mean_5",
    "dist_std_5",
    "dist_mean_15",
    "dist_std_15",
    "closing_mean_5",
    "closing_std_5",
    "closing_mean_15",
    "closing_std_15",
    "cos_facing_mean_5",
    "cos_facing_std_5",
    "cos_facing_mean_15",
    "cos_facing_std_15",
    "agent_speed_mean_5",
    "agent_speed_std_5",
    "agent_speed_mean_15",
    "agent_speed_std_15",
    "target_speed_mean_5",
    "target_speed_std_5",
    "target_speed_mean_15",
    "target_speed_std_15",
    "dist_rel_agent",
    "dist_rel_target",
    "agent_speed_rel",
    "target_speed_rel",
]


In [None]:
# 8) 推理与后处理（对齐官方 metric 约束 + self 输出）

from collections import defaultdict
import re


def _segments_from_binary(binary: np.ndarray) -> list[tuple[int, int]]:
    """Return inclusive (start, stop) index pairs for each run of 1s in a 0/1 array."""
    if binary.size == 0:
        return []
    # Zero-padding both ends turns every run into exactly one +1 and one -1 edge.
    edges = np.diff(binary.astype(np.int8), prepend=0, append=0)
    rising = np.nonzero(edges == 1)[0]
    falling = np.nonzero(edges == -1)[0]
    # A falling edge sits one position past the last 1 of its run.
    return [(int(first), int(after_last) - 1) for first, after_last in zip(rising, falling)]


def _merge_close_segments(segs: list[tuple[int, int]], gap_frames: int) -> list[tuple[int, int]]:
    """Merge inclusive segments separated by at most ``gap_frames`` frames.

    An empty input yields ``[]``. With ``gap_frames <= 0`` the input list is
    returned as it is (neither sorted nor merged). Otherwise the segments are
    sorted and neighbours whose gap is small enough are fused.
    """
    if not segs:
        return []
    if gap_frames <= 0:
        return segs

    ordered = sorted(segs)
    merged = ordered[:1]
    for start, stop in ordered[1:]:
        prev_start, prev_stop = merged[-1]
        # Number of frames strictly between the two segments.
        gap = start - prev_stop - 1
        if gap > gap_frames:
            merged.append((start, stop))
        else:
            merged[-1] = (prev_start, max(prev_stop, stop))
    return merged


def _smooth_prob(prob: np.ndarray, median_kernel: int) -> np.ndarray:
    """Median-filter a probability trace.

    The kernel size is made odd (even values are bumped up by one) and is at
    least 3. The input is cast to float32; traces shorter than the kernel are
    returned unfiltered.
    """
    # `| 1` sets the lowest bit, i.e. rounds an even size up to the next odd one.
    kernel = max(3, int(median_kernel) | 1)
    trace = prob.astype(np.float32)
    if trace.size < kernel:
        return trace
    return medfilt(trace, kernel_size=kernel)


def _postprocess_binary_to_segments(binary: np.ndarray, fps: float) -> list[tuple[int, int]]:
    """Turn a per-frame 0/1 mask into cleaned inclusive (start, stop) segments.

    Segments shorter than ``MIN_DURATION_SEC`` (at least one frame) are
    dropped, the survivors closer than ``MERGE_GAP_SEC`` are merged, and the
    minimum-length filter is applied once more to the merged result.
    """
    raw_segments = _segments_from_binary(binary)
    rate = float(fps)
    min_len = max(1, int(round(rate * float(MIN_DURATION_SEC))))
    gap_frames = int(round(rate * float(MERGE_GAP_SEC)))

    def long_enough(seg: tuple[int, int]) -> bool:
        first, last = seg
        return (last - first + 1) >= min_len

    kept = [seg for seg in raw_segments if long_enough(seg)]
    merged = _merge_close_segments(kept, gap_frames=gap_frames)
    return [seg for seg in merged if long_enough(seg)]


def _mouse_num(x: object) -> int | None:
    """Extract the mouse number from an int or a string such as ``"mouse3"``.

    Integers (including numpy integers) are returned as ``int``. Other values
    are stringified and stripped, a leading ``mouse`` marker is removed, and
    the rest is parsed as an integer. ``None``, blank text and anything that
    does not parse yield ``None``.
    """
    if x is None:
        return None
    if isinstance(x, (int, np.integer)):
        return int(x)

    text = str(x).strip()
    if text.startswith("mouse"):
        text = text.replace("mouse", "")
    try:
        return int(text) if text else None
    except Exception:
        return None


def _normalize_pair(agent_raw: object, target_raw: object) -> tuple[int, int, str, str] | None:
    """Return (agent_int, target_int_for_features, agent_id_str, target_id_str_for_submission).

    ``None`` is returned when the agent or the target cannot be resolved to a
    positive mouse number. A target of ``"self"``, or one equal to the agent,
    uses the agent's own number for features and is labelled ``"self"`` when
    ``OUTPUT_SELF_AS_LITERAL`` is true (``"mouse<N>"`` otherwise).
    """
    agent_num = _mouse_num(agent_raw)
    if agent_num is None or agent_num <= 0:
        return None

    # Keep literal self when present (preferred).
    if isinstance(target_raw, str) and target_raw.strip() == "self":
        target_num = agent_num
    else:
        target_num = _mouse_num(target_raw)
        if target_num is None or target_num <= 0:
            return None

    agent_label = f"mouse{agent_num}"
    if int(target_num) == int(agent_num):
        # Robustness: self-directed actions are sometimes encoded as mouseX,mouseX;
        # both spellings end up with the same submission label.
        target_label = "self" if OUTPUT_SELF_AS_LITERAL else agent_label
    else:
        target_label = f"mouse{target_num}"
    return int(agent_num), int(target_num), agent_label, target_label


def run_inference_lgbm(test_df: pd.DataFrame, bundle: dict[str, Any]) -> pd.DataFrame:
    """Run LightGBM inference for every video in ``test_df`` and build the submission table.

    For each row of ``test_df`` the video's tracking is loaded and turned into
    per-mouse state. For each (agent, target) pair listed in
    ``behaviors_labeled_list`` that has at least one action with a model,
    features are computed, every candidate action is scored and median
    smoothed, and on each frame the highest-scoring action among those at or
    above their threshold wins. The winning frames of each action are turned
    into segments.

    Args:
        test_df: Test metadata with ``lab_id``, ``video_id`` and
            ``behaviors_labeled_list``; ``frames_per_second`` and
            ``pix_per_cm_approx`` are read when present.
        bundle: Dict with ``"models"`` (action -> booster), ``"thresholds"``
            (action -> probability threshold) and optionally ``"n_features"``.

    Returns:
        A DataFrame with the ``SUB_COLS`` columns, sorted by video, agent
        number, target number, action and frames, with ``row_id`` renumbered
        from 0. ``stop_frame`` is exclusive.

    Raises:
        RuntimeError: ``FEATURE_COLS`` does not match the model's feature
            count, or an engineered feature column is missing.
    """
    models: dict[str, lgb.Booster] = bundle["models"]
    thresholds: dict[str, float] = bundle["thresholds"]

    # Align inference features to training feature order
    # NOTE: do NOT use Booster.feature_name() here (likely 'Column_0'..), see build_model().
    n_feat_model = int(bundle.get("n_features", next(iter(models.values())).num_feature()))
    feature_cols: list[str] = list(FEATURE_COLS)
    if len(feature_cols) != n_feat_model:
        raise RuntimeError(
            f"Feature count mismatch: model expects {n_feat_model} but FEATURE_COLS has {len(feature_cols)}"
        )
    print("Using feature_cols from: FEATURE_COLS")
    print("n_features:", len(feature_cols))

    rows: list[list[Any]] = []

    for _, meta in test_df.iterrows():
        lab_id = str(meta["lab_id"])
        video_id = int(meta["video_id"])
        # NOTE(review): `or 30.0` only replaces falsy values (None, 0); a NaN fps is
        # truthy and passes through unchanged.
        fps = float(meta.get("frames_per_second", 30.0) or 30.0)
        pix_per_cm = meta.get("pix_per_cm_approx", None)
        if pix_per_cm is not None and not np.isfinite(pix_per_cm):
            pix_per_cm = None

        tracking = load_tracking_df("test", lab_id, video_id)
        state = build_mouse_state(tracking, fps=fps, pix_per_cm=pix_per_cm, interp_limit=5)
        max_frame = int(state.index.get_level_values("video_frame").max())
        # frames starts at 0 with step 1, so array positions below equal frame numbers.
        frames = np.arange(0, max_frame + 1, dtype=np.int32)

        candidates: list[tuple[int, object, str]] = meta["behaviors_labeled_list"]

        # group by normalized pair; enforce per-pair per-frame single action (official metric constraint)
        pair_to_actions: dict[tuple[int, int, str, str], list[str]] = defaultdict(list)
        for agent_raw, target_raw, action in candidates:
            # Actions without a trained model cannot be predicted and are skipped.
            if action not in models:
                continue
            norm = _normalize_pair(agent_raw, target_raw)
            if norm is None:
                continue
            agent_i, target_i, agent_id_str, target_id_str = norm
            pair_to_actions[(agent_i, target_i, agent_id_str, target_id_str)].append(str(action))

        # Provisional per-video counter; row_id is renumbered after the final sort below.
        row_id = 0
        for (agent_i, target_i, agent_id_str, target_id_str), actions_for_pair in pair_to_actions.items():
            # De-duplicate actions while keeping their first-seen order.
            actions_unique: list[str] = list(dict.fromkeys(actions_for_pair))
            feats = pair_features_for_frames(state, agent=agent_i, target=target_i, frames=frames)

            # Defensive: engineered features must all exist. If this triggers, the notebook is out of sync.
            missing = [c for c in feature_cols if c not in feats.columns]
            if missing:
                raise RuntimeError(f"Missing engineered features in feats: {missing[:10]} (total={len(missing)})")

            X = feats[feature_cols].to_numpy(dtype=np.float32)

            probs = []
            ths = []
            for action in actions_unique:
                p = models[action].predict(X)
                p = _smooth_prob(p, median_kernel=MEDIAN_KERNEL)
                probs.append(p)
                # Actions with no stored threshold fall back to 0.5.
                ths.append(float(thresholds.get(action, 0.5)))

            P = np.vstack(probs)  # (A, T)
            # Scores below an action's threshold are set to -inf so they can never win the argmax.
            masked = P.copy()
            for i in range(masked.shape[0]):
                masked[i, masked[i] < ths[i]] = -np.inf

            winner_idx = masked.argmax(axis=0)
            winner_val = masked[winner_idx, np.arange(masked.shape[1])]
            # A frame is active only if its winning score is finite, i.e. some action cleared its threshold.
            active = np.isfinite(winner_val)

            for i, action in enumerate(actions_unique):
                binary = (active & (winner_idx == i)).astype(np.int8)
                segs = _postprocess_binary_to_segments(binary=binary, fps=fps)
                for s, e in segs:
                    rows.append(
                        [
                            row_id,
                            video_id,
                            agent_id_str,
                            target_id_str,
                            action,
                            int(s),
                            int(e + 1),  # stop_frame EXCLUSIVE
                        ]
                    )
                    row_id += 1

    sub = pd.DataFrame(rows, columns=SUB_COLS)

    # Canonical deterministic ordering
    def _agent_num(s: object) -> int:
        """Return the trailing integer of an id such as ``mouse3``, or -1 if there is none."""
        m = re.search(r"(\d+)$", str(s))
        return int(m.group(1)) if m else -1

    def _target_num(row: pd.Series) -> int:
        """Return the target's number for sorting; ``self`` sorts as the agent's own number."""
        if str(row["target_id"]) == "self":
            return int(row["agent_num"])
        m = re.search(r"(\d+)$", str(row["target_id"]))
        return int(m.group(1)) if m else -1

    if len(sub):
        # Temporary numeric keys so mouse10 sorts after mouse2; dropped again after sorting.
        sub["agent_num"] = sub["agent_id"].map(_agent_num)
        sub["target_num"] = sub.apply(_target_num, axis=1)
        sub = (
            sub.sort_values(["video_id", "agent_num", "target_num", "action", "start_frame", "stop_frame"], kind="mergesort")
            .drop(columns=["agent_num", "target_num"])
            .reset_index(drop=True)
        )

    # Final, globally unique row ids in sorted order.
    sub["row_id"] = np.arange(len(sub), dtype=np.int64)
    return sub


# Only the LightGBM path is wired up; run it over every row of test.csv and preview the result.
assert MODEL_TYPE == "lgbm", "This notebook is pre-wired for LightGBM artifacts."
submission = run_inference_lgbm(TEST_DF, bundle)
print("submission rows:", len(submission))
submission.head(10)


Using feature_cols from: bundle
n_features: 40
submission rows: 12


Unnamed: 0,row_id,video_id,agent_id,target_id,action,start_frame,stop_frame
0,0,438887472,mouse1,mouse2,chaseattack,0,18423
1,1,438887472,mouse1,mouse3,chaseattack,0,18423
2,2,438887472,mouse1,mouse4,chaseattack,0,18423
3,3,438887472,mouse2,mouse1,chaseattack,0,18423
4,4,438887472,mouse2,mouse3,chaseattack,0,18423
5,5,438887472,mouse2,mouse4,chaseattack,0,18423
6,6,438887472,mouse3,mouse1,chaseattack,0,18423
7,7,438887472,mouse3,mouse2,chaseattack,0,18423
8,8,438887472,mouse3,mouse4,chaseattack,0,18423
9,9,438887472,mouse4,mouse1,chaseattack,0,18423


In [32]:
# 9) (Optional) local validation-set evaluation and simple visualisation
# The Kaggle test set has no labels, so this is off by default.
RUN_EVAL = False

if RUN_EVAL:
    print("No evaluation implemented (test set has no labels).")


In [33]:
# 10) Generate submission.csv and save it to /kaggle/working

# Safety: ensure rear is output as target_id='self' (matches public notebooks).
if len(submission):
    is_rear = submission["action"].astype(str) == "rear"

    # Rear rows whose target was written as the agent itself are relabelled to "self".
    m_same = is_rear & (submission["target_id"].astype(str) == submission["agent_id"].astype(str))
    if m_same.any():
        submission.loc[m_same, "target_id"] = "self"

    # Anything still targeting another mouse is only reported, not changed.
    m_bad = is_rear & (submission["target_id"].astype(str) != "self")
    if m_bad.any():
        print("[WARN] rear rows have non-self target_id (showing head):")
        print(submission.loc[m_bad].head().to_string(index=False))

out_path = WORK_DIR / "submission.csv"
submission.to_csv(out_path, index=False)

has_self_target = bool((submission["target_id"] == "self").any())
print("Wrote:", out_path)
print("size (bytes):", out_path.stat().st_size)
print("has self target?", has_self_target)
print(submission.head())

Wrote: /kaggle/working/submission.csv
size (bytes): 619
has self target? False
   row_id   video_id agent_id target_id       action  start_frame  stop_frame
0       0  438887472   mouse1    mouse2  chaseattack            0       18423
1       1  438887472   mouse1    mouse3  chaseattack            0       18423
2       2  438887472   mouse1    mouse4  chaseattack            0       18423
3       3  438887472   mouse2    mouse1  chaseattack            0       18423
4       4  438887472   mouse2    mouse3  chaseattack            0       18423


In [34]:
# 11) 导出中间产物与运行信息（日志、版本、校验）

def sha256_file(p: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at ``p``.

    The file is read in 1 MiB chunks so large files are never held in memory
    as a whole.
    """
    digest = hashlib.sha256()
    with p.open("rb") as stream:
        while chunk := stream.read(1024 * 1024):
            digest.update(chunk)
    return digest.hexdigest()

# Run metadata: where the inputs came from plus every post-processing parameter
# that shapes the submission, so a given output can be traced back to its settings.
meta = {
    "artifact_dataset": ARTIFACT_DATASET_NAME,
    "artifact_dir": str(ARTIFACT_DIR),
    "comp_data_dir": str(COMP_DATA_DIR),
    "model_type": MODEL_TYPE,
    "median_kernel": MEDIAN_KERNEL,
    "min_duration_sec": MIN_DURATION_SEC,
    "merge_gap_sec": MERGE_GAP_SEC,
    "output_self_as_literal": OUTPUT_SELF_AS_LITERAL,
    "num_models": (len(bundle["models"]) if MODEL_TYPE == "lgbm" else None),
    "thresholds_keys": (len(bundle["thresholds"]) if MODEL_TYPE == "lgbm" else None),
}

# hash a few files for debugging reproducibility
hashes = {}
th_path = ARTIFACT_DIR / "thresholds.joblib"
if th_path.exists():
    hashes[str(th_path)] = sha256_file(th_path)

# hash first 3 model files (sorted by name, so the selection is deterministic)
model_files = sorted((ARTIFACT_DIR / "models").glob("lgbm_*.txt"))[:3]
for p in model_files:
    hashes[str(p)] = sha256_file(p)

meta["sha256"] = hashes

meta_path = WORK_DIR / "run_meta.json"
meta_path.write_text(json.dumps(meta, indent=2), encoding="utf-8")

# Small preview of the predictions for quick inspection next to the metadata.
preview_path = WORK_DIR / "preds_preview.csv"
submission.head(100).to_csv(preview_path, index=False)

print("Wrote:", meta_path)
print("Wrote:", preview_path)


Wrote: /kaggle/working/run_meta.json
Wrote: /kaggle/working/preds_preview.csv
