In [1]:
pip install numpy pandas opencv-python torch scipy ultralytics deep-sort-realtime lightgbm scikit-learn joblib

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os, json, math, random
from pathlib import Path
from collections import defaultdict, deque

import numpy as np
import pandas as pd
import cv2
import torch
from scipy.io import loadmat

from ultralytics import YOLO
from deep_sort_realtime.deepsort_tracker import DeepSort
from lightgbm import LGBMClassifier
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
import joblib

In [3]:
class Config:
    DATA_ROOT = Path("/kaggle/input/avenuedataset/Avenue Dataset")
    TRAIN_VIDEOS_PATH = DATA_ROOT / "training_videos"
    TEST_VIDEOS_PATH = DATA_ROOT / "testing_videos"
    GROUND_TRUTH_FOLDER_PATH = DATA_ROOT / "testing_label_mask"

    YOLO_WEIGHTS = "yolov8n.pt"
    CLASSES_TO_DETECT = [0, 24, 26, 28]  # person, backpack, handbag, suitcase
    CONFIDENCE_THRESHOLD = 0.35
    IOU_NMS = 0.5
    DEVICE = 0 if torch.cuda.is_available() else "cpu"

    # DeepSORT
    MAX_AGE = 30
    N_INIT = 3
    MAX_DIST = 0.2
    MAX_IOU_DISTANCE = 0.7
    NN_BUDGET = 100

    # Feature windows (seconds)
    STATIONARY_WINDOW_SEC = 2.0
    LOITER_WINDOW_SEC = 2.0
    LOITERING_DIST_PX = 40.0
    SPEED_THRESH_PX = 20.0
    USE_TRAIN_NEGATIVES = True
    NEGATIVE_RATIO = 10.0    
    VAL_SLICE_MAX = 2000  
    VALIDATION_ON_EVEN = True
    RANDOM_SEED = 123

    CACHE_DIR = Path("./cache_dets")
    ARTIFACT_DIR = Path("./artifacts_train")

def set_seeds(s=Config.RANDOM_SEED):
    random.seed(s); np.random.seed(s); torch.manual_seed(s)
    if torch.cuda.is_available(): torch.cuda.manual_seed_all(s)
set_seeds()

In [4]:
def iou_xyxy(a, b):
    ax1, ay1, ax2, ay2 = a; bx1, by1, bx2, by2 = b
    inter_x1, inter_y1 = max(ax1, bx1), max(ay1, by1)
    inter_x2, inter_y2 = min(ax2, bx2), min(ay2, by2)
    iw, ih = max(0, inter_x2 - inter_x1), max(0, inter_y2 - inter_y1)
    inter = iw * ih
    area_a = max(0, ax2 - ax1) * max(0, ay2 - ay1)
    area_b = max(0, bx2 - bx1) * max(0, by2 - by1)
    union = area_a + area_b - inter + 1e-6
    return inter / union if union > 0 else 0.0

In [5]:
def numeric_stem(path: Path):
    s = path.stem
    digits = "".join(ch for ch in s if ch.isdigit())
    return int(digits) if digits else 0

In [6]:
def normalize_stem_for_label(stem: str) -> str:
    return str(int(stem)) if stem.isdigit() else stem

In [7]:
def robust_load_gt(gt_path: Path):
    try:
        data = loadmat(str(gt_path), squeeze_me=True, struct_as_record=False)
    except Exception as e:
        print(f"ERROR reading MAT {gt_path.name}: {e}")
        return None
    for key in ["volLabel", "label", "gt", "labels"]:
        if key not in data:
            continue
        arr = data[key]
        if isinstance(arr, np.ndarray) and arr.dtype != object:
            flat = np.asarray(arr, dtype=np.float32).ravel()
            return (flat > 0.5).astype(np.int32)
        if isinstance(arr, (list, tuple)) or (isinstance(arr, np.ndarray) and arr.dtype == object):
            vals = []
            iterable = arr.ravel().tolist() if isinstance(arr, np.ndarray) else list(arr)
            for elem in iterable:
                if elem is None:
                    vals.append(0.0)
                else:
                    e = np.asarray(elem, dtype=np.float32)
                    vals.append(float(e.ravel()[0])) if e.size > 0 else vals.append(0.0)
            if len(vals) > 0:
                flat = np.asarray(vals, dtype=np.float32)
                return (flat > 0.5).astype(np.int32)
        try:
            val = float(np.asarray(arr).ravel()[0])
            return np.asarray([1 if val > 0.5 else 0], dtype=np.int32)
        except Exception:
            continue
    return None

In [8]:
def cache_path_for(video_path: Path):
    Config.CACHE_DIR.mkdir(parents=True, exist_ok=True)
    return Config.CACHE_DIR / f"{video_path.stem}.jsonl"

In [9]:
def write_cache(video_path: Path, per_frame_dets):
    with open(cache_path_for(video_path), "w") as f:
        for dets in per_frame_dets:
            f.write(json.dumps(dets) + "\n")

In [10]:
def read_cache(video_path: Path):
    p = cache_path_for(video_path)
    if not p.exists(): return None
    with open(p, "r") as f:
        return [json.loads(line) for line in f]

In [11]:
def smooth_scores(scores, k=9):
    k = max(1, int(k) | 1)
    from collections import deque
    out = np.zeros_like(scores)
    q, s = deque(), 0.0
    for i, v in enumerate(scores):
        q.append(v); s += v
        if len(q) > k:
            s -= q.popleft()
        out[i] = s / len(q)
    return out

In [12]:
class Detector:
    def __init__(self):
        self.model = YOLO(Config.YOLO_WEIGHTS)
        self.model.fuse()
        self.names = self.model.model.names

    def detect(self, frame_bgr):
        res = self.model.predict(
            frame_bgr, conf=Config.CONFIDENCE_THRESHOLD, iou=Config.IOU_NMS,
            device=Config.DEVICE, verbose=False
        )[0]
        dets = []
        if res.boxes is None or len(res.boxes) == 0:
            return dets, self.names
        boxes = res.boxes.xyxy.cpu().numpy()
        confs = res.boxes.conf.cpu().numpy()
        clss = res.boxes.cls.cpu().numpy().astype(int)
        for (x1, y1, x2, y2), c, k in zip(boxes, confs, clss):
            if k in Config.CLASSES_TO_DETECT:
                dets.append((float(x1), float(y1), float(x2), float(y2), float(c), int(k)))
        return dets, self.names

In [13]:
class Tracker:
    def __init__(self):
        self.trk = DeepSort(
            max_age=Config.MAX_AGE, n_init=Config.N_INIT,
            max_iou_distance=Config.MAX_IOU_DISTANCE, max_cosine_distance=Config.MAX_DIST,
            nn_budget=Config.NN_BUDGET, embedder="mobilenet",
            half=True if torch.cuda.is_available() else False, bgr=True
        )

    def update(self, frame_bgr, dets, class_names):
        inputs = []
        for (x1, y1, x2, y2, c, k) in dets:
            w = max(0.0, x2 - x1)
            h = max(0.0, y2 - y1)
            inputs.append([(float(x1), float(y1), float(w), float(h)), float(c), str(class_names[k])])
        tracks = self.trk.update_tracks(inputs, frame=frame_bgr)

        det_boxes = [(float(x1), float(y1), float(x2), float(y2)) for (x1, y1, x2, y2, _, _) in dets]
        det_classes = [int(k) for *_, k in dets]

        out = []
        for t in tracks:
            if not t.is_confirmed(): continue
            x1, y1, x2, y2 = t.to_tlbr()
            cls_id = 0
            if det_boxes:
                ious = [iou_xyxy((x1, y1, x2, y2), b) for b in det_boxes]
                j = int(np.argmax(ious))
                if ious[j] > 0.3:
                    cls_id = det_classes[j]
            out.append({
                "box": (float(x1), float(y1), float(x2), float(y2)),
                "id": int(t.track_id),
                "center": (float((x1+x2)/2), float((y1+y2)/2)),
                "cls_id": int(cls_id)
            })
        return out

In [14]:
FEATURE_ORDER = [
    "num_people","num_objects",
    "avg_speed_px","max_speed_px","avg_accel_px",
    "fast_ratio","loiter_ratio","stationary_ratio",
    "min_person_obj_px","min_inter_person_px","avg_inter_person_px",
    "min_person_obj_norm","min_inter_person_norm","avg_inter_person_norm",
    "mean_track_age","median_track_age","max_track_age",
    "person_conf_mean","person_conf_max","object_conf_mean","object_conf_max",
    "grid_mean","grid_max","grid_std",
]

In [15]:
class FeatureLogger:
    def __init__(self, fps, w, h):
        self.fps = float(fps) if fps and fps > 0 else 30.0
        self.w, self.h = int(w), int(h)
        self.img_diag = max(1.0, float(math.hypot(w, h)))
        self.hist = defaultdict(lambda: deque(maxlen=int(self.fps * 10)))
        self.first_seen = {}
        self.frame_idx = 0
        self.stationary_window = int(Config.STATIONARY_WINDOW_SEC * self.fps)
        self.loiter_window = int(Config.LOITER_WINDOW_SEC * self.fps)
        self.last_detections = []  # [(cls_id, conf)]

    def set_last_detections(self, dets):
        self.last_detections = [(int(k), float(c)) for (*_, c, k) in dets]

    def step(self, tracked_objects):
        self.frame_idx += 1
        for obj in tracked_objects:
            tid = obj["id"]
            self.hist[tid].append(obj["center"])
            if tid not in self.first_seen:
                self.first_seen[tid] = self.frame_idx
        return self._aggregate(tracked_objects)

    def _aggregate(self, objs):
        people = [o for o in objs if o["cls_id"] == 0]
        others = [o for o in objs if o["cls_id"] != 0]

        num_people, num_objects = len(people), len(others)

        speeds, accels, loiter_flags, stationary_flags = [], [], [], []
        for p in people:
            tid = p["id"]; h = self.hist[tid]
            if len(h) >= 2:
                s = float(np.linalg.norm(np.array(h[-1]) - np.array(h[-2])))
                speeds.append(s)
            if len(h) >= 3:
                s1 = float(np.linalg.norm(np.array(h[-1]) - np.array(h[-2])))
                s2 = float(np.linalg.norm(np.array(h[-2]) - np.array(h[-3])))
                accels.append(abs(s1 - s2))
            if len(h) >= self.loiter_window:
                disp = float(np.linalg.norm(np.array(h[-1]) - np.array(h[-self.loiter_window])))
                loiter_flags.append(1.0 if disp < Config.LOITERING_DIST_PX else 0.0)
            if len(h) >= self.stationary_window:
                disp = float(np.linalg.norm(np.array(h[-1]) - np.array(h[-self.stationary_window])))
                stationary_flags.append(1.0 if disp < (0.6 * Config.LOITERING_DIST_PX) else 0.0)

        avg_speed = float(np.mean(speeds)) if speeds else 0.0
        max_speed = float(np.max(speeds)) if speeds else 0.0
        avg_accel = float(np.mean(accels)) if accels else 0.0
        fast_ratio = float(np.mean([s > Config.SPEED_THRESH_PX for s in speeds])) if speeds else 0.0
        loiter_ratio = float(np.mean(loiter_flags)) if loiter_flags else 0.0
        stationary_ratio = float(np.mean(stationary_flags)) if stationary_flags else 0.0

        min_person_obj = 1e6
        for p in people:
            px, py = p["center"]
            for o in others:
                ox, oy = o["center"]
                d = float(np.hypot(px - ox, py - oy))
                if d < min_person_obj: min_person_obj = d
        if not people or not others:
            min_person_obj = 1e6

        pair_dists = []
        for i in range(len(people)):
            p1 = people[i]["center"]
            for j in range(i+1, len(people)):
                p2 = people[j]["center"]
                pair_dists.append(float(np.hypot(p1[0]-p2[0], p1[1]-p2[1])))
        min_inter_person = float(np.min(pair_dists)) if pair_dists else 1e6
        avg_inter_person = float(np.mean(pair_dists)) if pair_dists else 1e6

        ages = []
        for o in objs:
            tid = o["id"]
            ages.append(self.frame_idx - self.first_seen.get(tid, self.frame_idx) + 1)
        mean_age = float(np.mean(ages)) if ages else 0.0
        med_age = float(np.median(ages)) if ages else 0.0
        max_age = float(np.max(ages)) if ages else 0.0

        p_confs = [c for cid, c in self.last_detections if cid == 0]
        o_confs = [c for cid, c in self.last_detections if cid != 0]
        p_conf_mean = float(np.mean(p_confs)) if p_confs else 0.0
        p_conf_max = float(np.max(p_confs)) if p_confs else 0.0
        o_conf_mean = float(np.mean(o_confs)) if o_confs else 0.0
        o_conf_max = float(np.max(o_confs)) if o_confs else 0.0

        grid = np.zeros((3,3), dtype=float)
        if self.w > 0 and self.h > 0:
            for p in people:
                x, y = p["center"]
                gi = min(2, max(0, int((y / self.h) * 3)))
                gj = min(2, max(0, int((x / self.w) * 3)))
                grid[gi, gj] += 1.0
        grid_mean, grid_max, grid_std = float(grid.mean()), float(grid.max()), float(grid.std())
        diag = self.img_diag
        return {
            "num_people": float(num_people),
            "num_objects": float(num_objects),
            "avg_speed_px": avg_speed,
            "max_speed_px": max_speed,
            "avg_accel_px": avg_accel,
            "fast_ratio": fast_ratio,
            "loiter_ratio": loiter_ratio,
            "stationary_ratio": stationary_ratio,
            "min_person_obj_px": min_person_obj,
            "min_inter_person_px": min_inter_person,
            "avg_inter_person_px": avg_inter_person,
            "min_person_obj_norm": min_person_obj/diag,
            "min_inter_person_norm": min_inter_person/diag,
            "avg_inter_person_norm": avg_inter_person/diag,
            "mean_track_age": mean_age,
            "median_track_age": med_age,
            "max_track_age": max_age,
            "person_conf_mean": p_conf_mean,
            "person_conf_max": p_conf_max,
            "object_conf_mean": o_conf_mean,
            "object_conf_max": o_conf_max,
            "grid_mean": grid_mean,
            "grid_max": grid_max,
            "grid_std": grid_std,
        }

         

In [16]:
class VideoProcessor:
    def __init__(self, detector, tracker):
        self.detector = detector
        self.tracker = tracker

    def process(self, video_path: Path, use_cache=True):
        cap = cv2.VideoCapture(str(video_path))
        if not cap.isOpened():
            print(f"ERROR: cannot open {video_path}")
            return [], {}
        fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
        w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        cached = read_cache(video_path) if use_cache else None
        per_frame_dets = []
        flogger = FeatureLogger(fps=fps, w=w, h=h)
        frame_feats = []
        idx = 0
        while True:
            ret, frame = cap.read()
            if not ret: break
            if cached is not None and idx < len(cached):
                raw = cached[idx]
                dets = [(float(d["box"][0]), float(d["box"][1]), float(d["box"][2]), float(d["box"][3]),
                         float(d["conf"]), int(d["cls"])) for d in raw]
                class_names = self.detector.names
            else:
                dets, class_names = self.detector.detect(frame)
                per_frame_dets.append([{"box":[d[0],d[1],d[2],d[3]], "conf":d[4], "cls":d[5]} for d in dets])

            flogger.set_last_detections(dets)
            tracked = self.tracker.update(frame, dets, class_names)
            feats = flogger.step(tracked)
            frame_feats.append(feats)
            idx += 1

        cap.release()
        if cached is None and use_cache:
            write_cache(video_path, per_frame_dets)
        return frame_feats, {"fps": fps, "w": w, "h": h}

In [17]:
def collect_test_videos():
    return sorted([p for p in Config.TEST_VIDEOS_PATH.iterdir() if p.suffix.lower() in [".avi", ".mp4"]])

def collect_train_videos():
    p = Config.TRAIN_VIDEOS_PATH
    if not p.exists(): return []
    return sorted([p for p in p.iterdir() if p.suffix.lower() in [".avi", ".mp4"]])

In [18]:
def split_val_hold(videos):
    val, hold = [], []
    for v in videos:
        if (numeric_stem(v) % 2 == 0) == Config.VALIDATION_ON_EVEN:
            val.append(v)
        else:
            hold.append(v)
    return val, hold

def build_feature_matrix(frame_feats):
    X = np.asarray([[float(f[k]) for k in FEATURE_ORDER] for f in frame_feats], dtype=np.float32)
    X[~np.isfinite(X)] = 0.0
    return X

In [19]:
def align_labels_length(labels, length):
    y = np.zeros(length, dtype=int)
    m = min(length, len(labels))
    y[:m] = labels[:m]
    return y

In [20]:
def extract_features_with_labels(detector, tracker, videos):
    vp = VideoProcessor(detector, tracker)
    rows = []
    for v in videos:
        stem_for_label = normalize_stem_for_label(v.stem)
        gt_path = Config.GROUND_TRUTH_FOLDER_PATH / f"{stem_for_label}_label.mat"
        if not gt_path.exists():
            print(f"WARNING: expected GT missing: {gt_path.name} for {v.name}, skipping.")
            continue
        gt = robust_load_gt(gt_path)
        if gt is None or gt.size == 0:
            print(f"WARNING: empty/unknown GT schema for {gt_path.name}, skipping.")
            continue

        feats, _ = vp.process(v, use_cache=True)
        if not feats:
            print(f"WARNING: no frames for {v.name}, skipping.")
            continue

        X = build_feature_matrix(feats)
        y = align_labels_length(gt, len(X))
        df = pd.DataFrame(X, columns=FEATURE_ORDER)
        df["label"] = y
        df["video"] = v.name
        df["frame_idx"] = np.arange(len(X))
        rows.append(df)
        print(f"{v.name}: frames={len(X)} aligned={len(y)} (GT={len(gt)})")
    return pd.concat(rows, axis=0, ignore_index=True) if rows else pd.DataFrame()

        

In [21]:
def extract_features_no_labels(detector, tracker, videos):
    vp = VideoProcessor(detector, tracker)
    rows = []
    for v in videos:
        feats, _ = vp.process(v, use_cache=True)
        if not feats:
            print(f"WARNING: no frames for {v.name}, skipping.")
            continue
        X = build_feature_matrix(feats)
        df = pd.DataFrame(X, columns=FEATURE_ORDER)
        df["label"] = 0
        df["video"] = v.name
        df["frame_idx"] = np.arange(len(X))
        rows.append(df)
        print(f"{v.name}: frames={len(X)} (train negatives)")
    return pd.concat(rows, axis=0, ignore_index=True) if rows else pd.DataFrame()

In [22]:
def downsample_negatives(df, ratio=Config.NEGATIVE_RATIO, seed=Config.RANDOM_SEED):
    pos = df[df["label"] == 1]
    neg = df[df["label"] == 0]
    if len(pos) == 0 or len(neg) == 0:
        return df.sample(frac=1.0, random_state=seed).reset_index(drop=True)
    max_neg = int(ratio * len(pos))
    if len(neg) > max_neg:
        neg = neg.sample(n=max_neg, random_state=seed)
    out = pd.concat([pos, neg], axis=0, ignore_index=True)
    return out.sample(frac=1.0, random_state=seed).reset_index(drop=True)

In [23]:
def train_lgbm(X_train, y_train, X_val, y_val, seed=Config.RANDOM_SEED):
    pos = max(1, int((y_train == 1).sum()))
    neg = max(1, int((y_train == 0).sum()))
    spw = neg / pos
    clf = LGBMClassifier(
        n_estimators=1500,
        num_leaves=8,
        max_depth=3,     
        learning_rate=0.05,
        min_data_in_leaf=8,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=0.5,
        reg_alpha=0.0,
        objective="binary",
        random_state=seed,
        n_jobs=-1,
        scale_pos_weight=spw
    )
    clf.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric="auc",
        callbacks=[__import__("lightgbm").early_stopping(100, first_metric_only=True, verbose=False)]
    )
    return clf

In [24]:
def evaluate(clf, X, y_true, thr=0.5):
    scores = clf.predict_proba(X)[:, 1]
    y_pred = (scores >= thr).astype(int)
    p, r, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary", zero_division=0)
    try:
        auc = roc_auc_score(y_true, scores)
    except ValueError:
        auc = float("nan")
    return {"precision": float(p), "recall": float(r), "f1": float(f1), "auc": float(auc), "scores": scores}


In [25]:
def best_threshold(clf, X, y_true, steps=61):
    scores = clf.predict_proba(X)[:, 1]
    best = {"thr": 0.5, "f1": -1.0, "precision":0.0, "recall":0.0}
    for t in np.linspace(0.05, 0.95, steps):
        y_pred = (scores >= t).astype(int)
        p, r, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary", zero_division=0)
        if f1 > best["f1"]:
            best = {"thr": float(t), "f1": float(f1), "precision": float(p), "recall": float(r)}
    return best

In [26]:
print("Loading models...")
detector = Detector()
tracker = Tracker()
print("Models ready.")
test_videos = collect_test_videos()
assert len(test_videos) > 0, "No videos found in testing_videos/"
train_videos = collect_train_videos()
print(f"Found {len(test_videos)} test videos; {len(train_videos)} training videos.")
val_videos, hold_videos = split_val_hold(test_videos)
print("Validation side:", [v.name for v in val_videos])
print("Holdout side:   ", [v.name for v in hold_videos])
print("Extracting labeled features from validation side (testing_videos)...")
df_val = extract_features_with_labels(detector, tracker, val_videos)
assert not df_val.empty, "No validation data assembled."

Loading models...
YOLOv8n summary (fused): 72 layers, 3,151,904 parameters, 0 gradients, 8.7 GFLOPs
Models ready.
Found 21 test videos; 16 training videos.
Validation side: ['02.avi', '04.avi', '06.avi', '08.avi', '10.avi', '12.avi', '14.avi', '16.avi', '18.avi', '20.avi']
Holdout side:    ['01.avi', '03.avi', '05.avi', '07.avi', '09.avi', '11.avi', '13.avi', '15.avi', '17.avi', '19.avi', '21.avi']
Extracting labeled features from validation side (testing_videos)...
02.avi: frames=1211 aligned=1211 (GT=1211)
04.avi: frames=947 aligned=947 (GT=947)
06.avi: frames=1283 aligned=1283 (GT=1283)
08.avi: frames=36 aligned=36 (GT=36)
10.avi: frames=841 aligned=841 (GT=841)
12.avi: frames=1271 aligned=1271 (GT=1271)
14.avi: frames=507 aligned=507 (GT=507)
16.avi: frames=740 aligned=740 (GT=740)
18.avi: frames=294 aligned=294 (GT=294)
20.avi: frames=273 aligned=273 (GT=273)


In [27]:
if Config.USE_TRAIN_NEGATIVES and len(train_videos) > 0:
    print("Extracting negatives from training_videos...")
    df_train_neg = extract_features_no_labels(detector, tracker, train_videos)
else:
    df_train_neg = pd.DataFrame()
if not df_train_neg.empty:
    df_train_pool = pd.concat([df_val, df_train_neg], axis=0, ignore_index=True)
    df_train_bal = downsample_negatives(df_train_pool, ratio=Config.NEGATIVE_RATIO)
    X_train = df_train_bal[FEATURE_ORDER].to_numpy(np.float32)
    y_train = df_train_bal["label"].to_numpy(int)
    nval = min(Config.VAL_SLICE_MAX, len(df_val))
    X_val_int = df_val[FEATURE_ORDER].to_numpy(np.float32)[-nval:]
    y_val_int = df_val["label"].to_numpy(int)[-nval:]
    print(f"Training frames after downsampling: P={int((y_train==1).sum())}, N={int((y_train==0).sum())}")
else:
    Xv = df_val[FEATURE_ORDER].to_numpy(np.float32); yv = df_val["label"].to_numpy(int)
    split = int(0.8 * len(df_val))
    X_train, y_train = Xv[:split], yv[:split]
    X_val_int, y_val_int = Xv[split:], yv[split:]

Extracting negatives from training_videos...
01.avi: frames=1364 (train negatives)
02.avi: frames=1511 (train negatives)
03.avi: frames=1487 (train negatives)
04.avi: frames=1511 (train negatives)
05.avi: frames=815 (train negatives)
06.avi: frames=1511 (train negatives)
07.avi: frames=1099 (train negatives)
08.avi: frames=1017 (train negatives)
09.avi: frames=1391 (train negatives)
10.avi: frames=1223 (train negatives)
11.avi: frames=781 (train negatives)
12.avi: frames=145 (train negatives)
13.avi: frames=366 (train negatives)
14.avi: frames=510 (train negatives)
15.avi: frames=353 (train negatives)
16.avi: frames=244 (train negatives)
Training frames after downsampling: P=18, N=180


In [28]:
clf = train_lgbm(X_train, y_train, X_val_int, y_val_int)
best = best_threshold(clf, df_val[FEATURE_ORDER].to_numpy(np.float32), df_val["label"].to_numpy(int))
print(f"Validation sweep: F1={best['f1']:.4f} at thr={best['thr']:.2f} (P={best['precision']:.3f}, R={best['recall']:.3f})")

[LightGBM] [Info] Number of positive: 18, number of negative: 180
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006830 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1053
[LightGBM] [Info] Number of data points in the train set: 198, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.090909 -> initscore=-2.302585
[LightGBM] [Info] Start training from score -2.302585
Validation sweep: F1=0.9000 at thr=0.95 (P=0.818, R=1.000)


In [29]:
print("Extracting labeled features from holdout side...")
df_hold = extract_features_with_labels(detector, tracker, hold_videos)
if df_hold.empty:
    print("No holdout data available; skipping final metrics.")
    eval_hold = None
else:
    X_hold = df_hold[FEATURE_ORDER].to_numpy(np.float32)
    y_hold = df_hold["label"].to_numpy(int)
    eval_hold = evaluate(clf, X_hold, y_hold, thr=best["thr"])
    print("\n=== FINAL (Holdout) ===")
    print(f"Precision: {eval_hold['precision']:.4f}")
    print(f"Recall:    {eval_hold['recall']:.4f}")
    print(f"F1:        {eval_hold['f1']:.4f}")
    print(f"ROC-AUC:   {eval_hold['auc']:.4f}")

Extracting labeled features from holdout side...
01.avi: frames=1439 aligned=1439 (GT=1439)
03.avi: frames=923 aligned=923 (GT=923)
05.avi: frames=1007 aligned=1007 (GT=1007)
07.avi: frames=605 aligned=605 (GT=605)
09.avi: frames=1175 aligned=1175 (GT=1175)
11.avi: frames=472 aligned=472 (GT=472)
13.avi: frames=549 aligned=549 (GT=549)
15.avi: frames=1001 aligned=1001 (GT=1001)
17.avi: frames=426 aligned=426 (GT=426)
19.avi: frames=248 aligned=248 (GT=248)
21.avi: frames=76 aligned=76 (GT=76)

=== FINAL (Holdout) ===
Precision: 0.0000
Recall:    0.0000
F1:        0.0000
ROC-AUC:   0.8702


In [62]:
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

def sweep_thresholds(y_true, y_prob, steps=201):
    ts = np.linspace(0.0, 1.0, steps)
    out = []
    for t in ts:
        y_pred = (y_prob >= t).astype(int)
        p, r, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary", zero_division=0)
        out.append((float(t), float(p), float(r), float(f1)))
    return np.array(out)

import numpy as np
from sklearn.metrics import precision_recall_fscore_support


def pick_threshold(curve, metric="f1"):
    idx = {"precision": 1, "recall": 2, "f1": 3}[metric]
    best_row = curve[np.argmax(curve[:, idx])]
    best_t, best_p, best_r, best_f1 = best_row
    return {"thr": best_t, "precision": best_p, "recall": best_r, "f1": best_f1}

In [63]:

y_val_true = df_val["label"].to_numpy(np.int32)
y_val_prob = clf.predict_proba(df_val[FEATURE_ORDER].to_numpy(np.float32))[:, 1]
curve = sweep_thresholds(y_val_true, y_val_prob, steps=2001) # finer grid (0.0005 step)
best = pick_threshold(curve, metric="f1")
print(f"Best F1={best['f1']:.3f} at thr={best['thr']:.4f} (P={best['precision']:.3f}, R={best['recall']:.3f})")
best_thr=best

Best F1=0.900 at thr=0.9435 (P=0.818, R=1.000)


In [64]:
eval_hold = None
if df_hold is not None and not df_hold.empty:
    y_hold_true = df_hold["label"].to_numpy(np.int32)
    y_hold_prob = clf.predict_proba(df_hold[FEATURE_ORDER].to_numpy(np.float32))[:, 1]
    y_hold_pred = (y_hold_prob >= best['thr']).astype(int)



In [65]:
out_dir = Config.ARTIFACT_DIR
out_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(clf, out_dir / "lgbm_frame_scorer.joblib")

meta = {
    "model_type": "lightgbm",
    "threshold": float(0.9435), 
    "feature_order": list(FEATURE_ORDER),
    "val_videos": [v.name for v in val_videos],
    "holdout_videos": [v.name for v in hold_videos],
    "best_iteration": int(getattr(clf, "best_iteration_", getattr(clf, "n_estimators", 0))),
    "negative_ratio": float(Config.NEGATIVE_RATIO),
    "validation_operating_point": {
        "precision": 0.818,
        "recall": 1.000,
        "f1": 0.900,
        "threshold": 0.9435
    }
}

In [66]:
if eval_hold is not None:
    meta["final_holdout"] = {k: float(eval_hold[k]) for k in ["precision","recall","f1","auc"]}

with open(out_dir / "meta.json", "w") as f:
    json.dump(meta, f, indent=2)

val_scores = y_val_prob
df_val_out = df_val.copy()
df_val_out["score"] = val_scores
df_val_out.to_csv(out_dir / "validation_frames.csv", index=False)

if df_hold is not None and not df_hold.empty:
    hold_scores = clf.predict_proba(df_hold[FEATURE_ORDER].to_numpy(np.float32))[:, 1]
    df_hold_out = df_hold.copy()
    df_hold_out["score"] = hold_scores
    df_hold_out.to_csv(out_dir / "holdout_frames.csv", index=False)

print(f"\nArtifacts written to {out_dir.resolve()}")


Artifacts written to /kaggle/working/artifacts_train
