In [None]:
# Cell 1 — Imports & paths

%load_ext autoreload
%autoreload 2

from pathlib import Path
import numpy as np
from tqdm.auto import tqdm

import torch
import torchvision
from ultralytics import YOLO
import cv2

# Project root is the parent of the kernel's current working directory,
# resolved to an absolute path.
PROJECT_ROOT = Path("..").resolve()

# Dataset root and its validation split: images and their per-image label files.
DATA_ROOT = PROJECT_ROOT / "mug_coco_yolo"
VAL_IMAGES_DIR = DATA_ROOT / "images" / "val2017"
VAL_LABELS_DIR = DATA_ROOT / "labels" / "val2017"

# Directory the trained model weights are read from in later cells.
OUTPUT_DIR = PROJECT_ROOT / "outputs"

# Echo the resolved locations so a wrong working directory is spotted early.
print("Val images:", VAL_IMAGES_DIR)
print("Val labels:", VAL_LABELS_DIR)
print("Outputs:", OUTPUT_DIR)


In [None]:
# Cell 2 — Helper: load YOLO-format ground truth labels

def load_yolo_labels(label_path, img_w, img_h):
    """
    Read a YOLO-format label file and return its boxes as pixel-space corners.

    Each usable line has five whitespace-separated fields, `cls xc yc w h`.
    The centre/size values are scaled by `img_w` / `img_h` and converted to
    `[x1, y1, x2, y2]`. The class id is ignored; lines that do not have
    exactly five fields are skipped.

    Args:
        label_path: path to the `.txt` label file (str or Path).
        img_w, img_h: image width and height used for scaling.

    Returns:
        np.float32 array of shape [N, 4]. A missing file, or a file without
        any usable line, yields an empty array of shape [0, 4], so callers
        always receive a 2-D array.

    Raises:
        ValueError: if a five-field line contains a non-numeric coordinate.
    """
    empty = np.zeros((0, 4), dtype=np.float32)

    # Opening directly (instead of checking .exists() first) works for both
    # str and Path inputs and treats "no label file" as "no boxes".
    try:
        f = open(label_path, "r")
    except FileNotFoundError:
        return empty

    boxes = []
    with f:
        for line in f:
            parts = line.split()
            if len(parts) != 5:
                continue
            # parts[0] is the class id; only the geometry is needed here.
            xc, yc, w, h = (float(v) for v in parts[1:])
            xc *= img_w
            yc *= img_h
            w *= img_w
            h *= img_h
            boxes.append([xc - w / 2, yc - h / 2, xc + w / 2, yc + h / 2])

    if not boxes:
        # np.array([]) would be 1-D with shape (0,); keep the [N, 4] contract.
        return empty
    return np.array(boxes, dtype=np.float32)


In [None]:
# Cell 3 — IoU computation (pairwise)

def compute_iou(box1, box2):
    """
    Intersection-over-Union of two axis-aligned boxes.

    box1, box2: indexable as [x1, y1, x2, y2]

    Returns intersection area divided by union area, or 0.0 when the union
    area is zero.
    """
    def _clamped_area(left, top, right, bottom):
        # A negative extent (no overlap, or an inverted box) counts as zero.
        return max(0, right - left) * max(0, bottom - top)

    # The overlap rectangle is bounded by the innermost edges of both boxes.
    overlap = _clamped_area(
        max(box1[0], box2[0]),
        max(box1[1], box2[1]),
        min(box1[2], box2[2]),
        min(box1[3], box2[3]),
    )

    combined = (
        _clamped_area(box1[0], box1[1], box1[2], box1[3])
        + _clamped_area(box2[0], box2[1], box2[2], box2[3])
        - overlap
    )

    return overlap / combined if combined != 0 else 0.0


In [None]:
# Cell 4 — Helper: compute AP & AR from predictions and ground truths

def evaluate_predictions(preds, gts, iou_threshold=0.5):
    """
    Score a set of detections against ground truth at a single IoU threshold.

    Detections from all images are pooled and ranked by descending score.
    Each detection is compared with the ground-truth box of its own image
    that it overlaps most. It counts as a true positive when that IoU reaches
    `iou_threshold` and the box has not already been claimed by a
    higher-scoring detection; otherwise it is a false positive (there is no
    fallback to the next-best unmatched box).

    preds: list of dicts:
        {
          'image_id': str,
          'boxes': np.array [N_pred, 4],   # [x1, y1, x2, y2]
          'scores': np.array [N_pred]
        }

    gts: dict mapping image_id -> np.array [N_gt, 4]

    Returns:
      - AP: non-interpolated average precision,
            sum_k P_k * (R_k - R_{k-1}) with R_0 = 0
      - AR: recall reached once every detection has been counted
            (matched GT boxes / total GT boxes)

    Both values are 0.0 when there are no detections at all.
    """
    num_gt_total = sum(len(gt_boxes) for gt_boxes in gts.values())

    # Pool detections across images, remembering which image each came from.
    flat_preds = [
        (p["image_id"], box, score)
        for p in preds
        for box, score in zip(p["boxes"], p["scores"])
    ]
    if not flat_preds:
        return 0.0, 0.0

    # Highest confidence first; list.sort is stable, so ties keep input order.
    flat_preds.sort(key=lambda item: item[2], reverse=True)

    # For each GT box, track whether it was already "matched".
    gt_matched = {
        img_id: np.zeros(len(gt_boxes), dtype=bool)
        for img_id, gt_boxes in gts.items()
    }

    # tp[k] is 1 when the k-th ranked detection is a true positive, else 0
    # (every detection that is not a TP is a false positive).
    tp = np.zeros(len(flat_preds), dtype=np.float64)

    for rank, (img_id, box_pred, _score) in enumerate(flat_preds):
        gt_boxes = gts.get(img_id)
        if gt_boxes is None or len(gt_boxes) == 0:
            # Detection on an image without ground truth -> false positive.
            continue

        best_iou = 0.0
        best_gt_idx = -1
        for i, box_gt in enumerate(gt_boxes):
            iou = compute_iou(box_pred, box_gt)
            if iou > best_iou:
                best_iou = iou
                best_gt_idx = i

        # best_gt_idx stays -1 when nothing overlaps; never index with it.
        if (
            best_gt_idx >= 0
            and best_iou >= iou_threshold
            and not gt_matched[img_id][best_gt_idx]
        ):
            tp[rank] = 1.0
            gt_matched[img_id][best_gt_idx] = True

    cum_tp = np.cumsum(tp)
    # After k detections, TP + FP == k, so precision is cum_tp / k.
    precisions = cum_tp / np.arange(1, len(tp) + 1)
    recalls = cum_tp / max(num_gt_total, 1)

    # Rectangle-rule area under the P-R curve. Recall is already
    # non-decreasing in rank order, so no re-sorting is needed. The leading 0
    # makes the first detection's recall gain count too; without it a single
    # perfect detection would score AP = 0.
    recall_gain = np.diff(np.concatenate(([0.0], recalls)))
    ap = float(np.sum(precisions * recall_gain))

    # Final recall over the whole ranked list.
    ar = float(recalls[-1])

    return ap, ar


In [None]:
# Cell 5 — Load YOLOv8 model

# Adjust path if you trained with a different name
YOLO_WEIGHTS = OUTPUT_DIR / "mug_yolov8n_notebook" / "weights" / "best.pt"
print("YOLO weights:", YOLO_WEIGHTS, " | Exists:", YOLO_WEIGHTS.exists())

# Fail early with an explicit message instead of letting the loader attempt
# to resolve a checkpoint that is not on disk.
if not YOLO_WEIGHTS.is_file():
    raise FileNotFoundError(f"YOLO weights not found: {YOLO_WEIGHTS}")

yolo_model = YOLO(str(YOLO_WEIGHTS))


In [None]:
# Cell 6 — Load Faster R-CNN model

# torch / torchvision are already imported in the imports cell at the top.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", DEVICE)

num_classes = 2  # background + mug

# Build the architecture without detection weights, then swap the box
# predictor head for one with `num_classes` outputs so the checkpoint's
# head matches.
fasterrcnn = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights=None)
in_features = fasterrcnn.roi_heads.box_predictor.cls_score.in_features
fasterrcnn.roi_heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(
    in_features, num_classes
)

faster_rcnn_path = OUTPUT_DIR / "fasterrcnn_mug.pth"
print("Faster R-CNN weights:", faster_rcnn_path, " | Exists:", faster_rcnn_path.exists())

# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code. Only load checkpoints you produced yourself, or pass
# weights_only=True on torch versions that support it.
state = torch.load(faster_rcnn_path, map_location=DEVICE)
fasterrcnn.load_state_dict(state)
fasterrcnn.to(DEVICE)
# Inference mode; the trailing ';' keeps the notebook from echoing the whole
# module repr as the cell output.
fasterrcnn.eval();


In [None]:
# Cell 7 — Collect ground truth boxes for all val images

# sorted() accepts the glob iterator directly and gives a deterministic order.
val_image_paths = sorted(VAL_IMAGES_DIR.glob("*.jpg"))
print("Num val images:", len(val_image_paths))

# image file name -> ground-truth boxes in pixel [x1, y1, x2, y2]
gt_boxes_by_image = {}
# Names of images OpenCV could not decode; they get no ground-truth entry.
unreadable_images = []

for img_path in val_image_paths:
    img = cv2.imread(str(img_path))
    if img is None:
        # cv2.imread returned None, i.e. the image could not be loaded.
        unreadable_images.append(img_path.name)
        continue
    # Only the image size is needed to de-normalise the labels, so no colour
    # conversion is required.
    h, w = img.shape[:2]

    label_path = VAL_LABELS_DIR / (img_path.stem + ".txt")
    gt_boxes = load_yolo_labels(label_path, w, h)  # empty when there is no label file
    gt_boxes_by_image[img_path.name] = gt_boxes

if unreadable_images:
    print(
        f"Skipped {len(unreadable_images)} unreadable image(s), e.g.:",
        unreadable_images[:5],
    )


In [None]:
# Cell 8 — Run YOLOv8 on all validation images and collect predictions

yolo_preds = []
conf_threshold = 0.25

for img_path in tqdm(val_image_paths, desc="YOLOv8 val inference"):
    # One image in; take the first (only) entry of what predict() returns.
    detection = yolo_model.predict(
        source=str(img_path),
        imgsz=640,
        conf=conf_threshold,
        verbose=False,
    )[0]

    found_any = detection.boxes is not None and len(detection.boxes) > 0
    if found_any:
        boxes_xyxy = detection.boxes.xyxy.cpu().numpy().astype(np.float32)
        scores = detection.boxes.conf.cpu().numpy().astype(np.float32)
    else:
        # Keep shapes consistent even when nothing was detected.
        boxes_xyxy = np.zeros((0, 4), dtype=np.float32)
        scores = np.zeros((0,), dtype=np.float32)

    record = {
        "image_id": img_path.name,
        "boxes": boxes_xyxy,
        "scores": scores,
    }
    yolo_preds.append(record)


In [None]:
# Cell 9 — Run Faster R-CNN on all validation images and collect predictions

from torchvision.transforms.functional import to_tensor

faster_preds = []

for img_path in tqdm(val_image_paths, desc="Faster R-CNN val inference"):
    # Start from "no detections"; replaced below when the image can be read.
    boxes_xyxy = np.zeros((0, 4), dtype=np.float32)
    scores = np.zeros((0,), dtype=np.float32)

    bgr = cv2.imread(str(img_path))
    if bgr is not None:
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        batch = to_tensor(rgb).to(DEVICE).unsqueeze(0)  # [1,3,H,W]

        with torch.no_grad():
            detection = fasterrcnn(batch)[0]

        raw_boxes = detection["boxes"].detach().cpu().numpy().astype(np.float32)
        raw_scores = detection["scores"].detach().cpu().numpy().astype(np.float32)

        # Drop detections scoring below conf_threshold.
        confident = raw_scores >= conf_threshold
        boxes_xyxy = raw_boxes[confident]
        scores = raw_scores[confident]

    record = {
        "image_id": img_path.name,
        "boxes": boxes_xyxy,
        "scores": scores,
    }
    faster_preds.append(record)


In [None]:
# Cell 10 — Evaluate YOLOv8 (IoU@0.5, AP, AR)

# Score the YOLOv8 detections against the validation ground truth.
ap_yolo, ar_yolo = evaluate_predictions(
    preds=yolo_preds,
    gts=gt_boxes_by_image,
    iou_threshold=0.5,
)

print(f"YOLOv8 @ IoU=0.5 -> AP: {ap_yolo:.4f}, AR: {ar_yolo:.4f}")


In [None]:
# Cell 11 — Evaluate Faster R-CNN (IoU@0.5, AP, AR)

# Score the Faster R-CNN detections against the validation ground truth.
ap_frcnn, ar_frcnn = evaluate_predictions(
    preds=faster_preds,
    gts=gt_boxes_by_image,
    iou_threshold=0.5,
)

print(f"Faster R-CNN @ IoU=0.5 -> AP: {ap_frcnn:.4f}, AR: {ar_frcnn:.4f}")


In [None]:
# Cell 12 — Summary comparison

import pandas as pd

# One row per detector, built column-wise; column order matches the dict order.
df = pd.DataFrame(
    {
        "model": ["YOLOv8", "Faster R-CNN"],
        "IoU@0.5": [0.5, 0.5],
        "AP": [ap_yolo, ap_frcnn],
        "AR": [ar_yolo, ar_frcnn],
    }
)

df
