In [3]:
import os
import json
from collections import defaultdict

# ==========================================
# CONFIG
# ==========================================
# Minimum box IoU for a prediction to be matched to a ground-truth box.
IOU_THRESHOLD = 0.5

# Parent directory; every sub-folder is treated as one dataset.
root_dataset = "./flatbug-dataset"

# Datasets to evaluate. Folder names are lower-cased before being looked up
# in this set, so every entry must be written in lower case.
# Uncomment a line to include that dataset in the run.
datasets_to_eval = {
    # "alus",
    # "bioscan",
    # "diversityscanner",
    "nhm-beetles-crops",
    # "artaxor",
    # "collembolai",
    # "gernat2018",
    # "cao2022",
    # "sittinger2023",
    # "amarathunga2022",
    # "biodiscover-arm",
}


# ==========================================
# IoU FUNCTION
# ==========================================
def iou(box1, box2):
    """Intersection-over-union of two COCO-style boxes ``[x, y, w, h]``.

    Returns 0 when the union area is zero (e.g. two zero-sized boxes).
    """
    left1, top1, width1, height1 = box1
    left2, top2, width2, height2 = box2

    # Extent of the overlap rectangle along each axis, clamped at zero when
    # the boxes do not overlap on that axis.
    overlap_w = max(0, min(left1 + width1, left2 + width2) - max(left1, left2))
    overlap_h = max(0, min(top1 + height1, top2 + height2) - max(top1, top2))
    intersection = overlap_w * overlap_h

    total_area = width1 * height1 + width2 * height2 - intersection
    return intersection / total_area if total_area != 0 else 0


# ==========================================
# GLOBAL COUNTS
# ==========================================
# Running totals of true positives, false positives and false negatives,
# accumulated over every evaluated dataset.
TP_global = FP_global = FN_global = 0


# ==========================================
# PROCESS EACH DATASET
# ==========================================
for dataset_name in sorted(os.listdir(root_dataset)):

    # Folder names are lower-cased before the lookup, so the names listed in
    # `datasets_to_eval` must be lower-case.
    if dataset_name.lower() not in datasets_to_eval:
        continue

    dataset_path = os.path.join(root_dataset, dataset_name)

    print("\n==================================================")
    print(f" Evaluating DATASET: {dataset_name}")
    print("==================================================")

    gt_path = os.path.join(dataset_path, "instances_default.json")
    pred_path = os.path.join(dataset_path, "sam3_results_BB.json")

    if not os.path.isfile(gt_path) or not os.path.isfile(pred_path):
        print(f"❌ Missing GT or SAM3 file in {dataset_name}, skipping.")
        continue

    # Context managers close the files deterministically instead of leaving
    # the handles open until garbage collection.
    with open(gt_path, encoding="utf-8") as fh:
        gt = json.load(fh)
    with open(pred_path, encoding="utf-8") as fh:
        pred = json.load(fh)

    # Group ground-truth and predicted annotations by image id.
    gt_by_image = defaultdict(list)
    for ann in gt["annotations"]:
        gt_by_image[ann["image_id"]].append(ann)

    pred_by_image = defaultdict(list)
    for ann in pred["annotations"]:
        pred_by_image[ann["image_id"]].append(ann)

    # Dataset-level counts
    TP = 0
    FP = 0
    FN = 0

    # ==========================================
    # PER-IMAGE EVALUATION
    # ==========================================
    # Visit images that have ground truth AND images that only have
    # predictions: a prediction on an image without any ground-truth box is a
    # false positive and must not be dropped from the precision denominator.
    image_ids = list(gt_by_image) + [i for i in pred_by_image if i not in gt_by_image]

    for img_id in image_ids:

        gt_objs = gt_by_image.get(img_id, [])

        # Greedy matching, most confident prediction first. Predictions
        # without a "score" field are treated as fully confident (1.0).
        pred_objs = sorted(
            pred_by_image.get(img_id, []),
            key=lambda ann: ann.get("score", 1.0),
            reverse=True,
        )

        matched_gt = set()  # indices of GT boxes already claimed

        for p in pred_objs:
            best_iou = 0
            best_gt = None

            # Matching is class-agnostic: category_id is not compared.
            for idx, g in enumerate(gt_objs):
                if idx in matched_gt:
                    continue

                iou_val = iou(p["bbox"], g["bbox"])

                if iou_val > best_iou:
                    best_iou = iou_val
                    best_gt = idx

            # `best_gt is not None` keeps a prediction with no overlapping GT
            # from being counted as a match when IOU_THRESHOLD is set to 0.
            if best_gt is not None and best_iou >= IOU_THRESHOLD:
                TP += 1
                matched_gt.add(best_gt)
            else:
                FP += 1

        # Every ground-truth box left unmatched is a miss.
        FN += len(gt_objs) - len(matched_gt)

    # ==========================================
    # DATASET METRICS
    # ==========================================
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    print("\n--- Results for dataset:", dataset_name, "---")
    print(f"TP = {TP}")
    print(f"FP = {FP}")
    print(f"FN = {FN}")
    print(f"Precision = {precision:.4f}")
    print(f"Recall    = {recall:.4f}")
    print(f"F1 Score  = {f1:.4f}")

    # Add to global totals
    TP_global += TP
    FP_global += FP
    FN_global += FN


# ==========================================
# GLOBAL METRICS (ALL DATASETS)
# ==========================================
# Scores computed from the counts pooled over all evaluated datasets.
# Each ratio falls back to 0 when its denominator would be zero.
if TP_global + FP_global > 0:
    precision_global = TP_global / (TP_global + FP_global)
else:
    precision_global = 0

if TP_global + FN_global > 0:
    recall_global = TP_global / (TP_global + FN_global)
else:
    recall_global = 0

if precision_global + recall_global > 0:
    f1_global = 2 * precision_global * recall_global / (precision_global + recall_global)
else:
    f1_global = 0

# Report: banner, raw counts, then the derived scores.
print("\n==================================================")
print("     OVERALL BOUNDING-BOX EVALUATION METRICS")
print("==================================================")

print(f"TP = {TP_global}")
print(f"FP = {FP_global}")
print(f"FN = {FN_global}")
print("-----------------------------------------------")
print(f"Precision = {precision_global:.4f}")
print(f"Recall    = {recall_global:.4f}")
print(f"F1 Score  = {f1_global:.4f}")



 Evaluating DATASET: NHM-beetles-crops
❌ Missing GT or SAM3 file in NHM-beetles-crops, skipping.

     OVERALL BOUNDING-BOX EVALUATION METRICS
TP = 0
FP = 0
FN = 0
-----------------------------------------------
Precision = 0.0000
Recall    = 0.0000
F1 Score  = 0.0000


In [None]:
# ==========================

# Metrics using the IoU of segmentation masks

# ==========================



import os
import json
import numpy as np
from collections import defaultdict
import cv2

# ==========================
# CONFIG
# ==========================
# Minimum mask IoU for a prediction to be matched to a ground-truth instance.
IOU_THRESHOLD = 0.5

# Parent directory; every sub-folder is treated as one dataset.
root_dataset = "./flatbug-dataset"

# Datasets to evaluate. Folder names are lower-cased before being looked up
# in this set, so every entry must be written in lower case.
datasets_to_eval = {
    # --- enabled for this run ---
    "pinoy2023",
    "sticky-pi",
    "ubc-pitfall-traps",
    # --- currently disabled ---
    # "nhm-beetles-crops",
    # "cao2022",
    # "gernat2018",
    # "sittinger2023",
    # "amarathunga2022",
    # "biodiscover-arm",
    # "mothitor",
    # "dirt",
    # "diopsis",
    # "ami-traps",
    # "amt",
    # "pematoeuropep",
    # "abram2023",
    # "antrax",
    # --- TODO ---
    # "alus",
    # "bioscan",
    # "diversityscanner",
    # "artaxor",
    # "collembolai",
    # "ubc-scanned-sticky-cards",  # lower-cased: a capitalised entry never matches
}

# ==========================
# UTILITIES
# ==========================

def polygons_to_mask(polygons, height, width):
    """Rasterise a list of polygons into a single binary mask.

    Args:
        polygons: iterable of polygons, each a coordinate sequence that can
            be reshaped to ``(-1, 2)`` (e.g. ``[x1, y1, x2, y2, ...]``).
            Empty or ``None`` entries are ignored.
        height: number of rows of the output mask.
        width: number of columns of the output mask.

    Returns:
        ``np.uint8`` array of shape ``(height, width)`` holding 1 inside any
        polygon and 0 elsewhere.

    A polygon that cannot be converted or drawn is skipped with a
    ``RuntimeWarning`` rather than being dropped silently, so a corrupt
    annotation file is visible in the notebook output.
    """
    import warnings

    mask = np.zeros((height, width), dtype=np.uint8)
    for poly in polygons:
        if not poly:
            continue

        # Coordinates are cast to int32 (fractional parts are truncated).
        # The conversion fails for ragged/non-numeric/NaN input and the
        # reshape fails for an odd number of coordinates.
        try:
            pts = np.array(poly, dtype=np.int32).reshape(-1, 2)
        except (TypeError, ValueError, OverflowError) as exc:
            warnings.warn(
                f"Skipping malformed polygon ({exc})", RuntimeWarning, stacklevel=2
            )
            continue

        # Only OpenCV's own error is tolerated here; anything else (such as a
        # missing cv2 import) is a real bug and is allowed to surface.
        try:
            cv2.fillPoly(mask, [pts], 1)
        except cv2.error as exc:
            warnings.warn(
                f"Skipping polygon OpenCV could not draw ({exc})",
                RuntimeWarning,
                stacklevel=2,
            )
            continue
    return mask


def seg_to_mask(segmentation, height, width):
    """Convert a COCO segmentation (polygon list or RLE dict) to a binary mask.

    Args:
        segmentation: ``None``, a list of polygons (each a list/tuple of
            coordinates) or an RLE dict understood by ``pycocotools``.
        height: image height in pixels.
        width: image width in pixels.

    Returns:
        ``np.uint8`` mask, or ``None`` when the segmentation is missing, has
        an unsupported layout, or cannot be decoded. An empty list yields an
        all-zero ``(height, width)`` mask.

    RLE decoding problems (including a missing ``pycocotools`` install) are
    reported with a ``RuntimeWarning`` instead of being swallowed, because
    otherwise every RLE annotation would silently turn into ``None``.
    """
    import warnings

    if segmentation is None:
        return None

    if isinstance(segmentation, list):
        if len(segmentation) == 0:
            return np.zeros((height, width), dtype=np.uint8)
        # Only the nested "list of polygons" layout is supported; a flat
        # list of numbers is rejected.
        if all(isinstance(part, (list, tuple)) for part in segmentation):
            return polygons_to_mask(segmentation, height, width)
        return None

    if isinstance(segmentation, dict):
        try:
            from pycocotools import mask as mask_utils
        except ImportError:
            warnings.warn(
                "pycocotools is not installed; RLE segmentations cannot be "
                "decoded and are skipped",
                RuntimeWarning,
                stacklevel=2,
            )
            return None

        try:
            rle = segmentation
            # Uncompressed RLE stores `counts` as a list of ints and has to
            # be converted to the compressed form before `decode` accepts it.
            if isinstance(rle.get("counts"), list):
                rle = mask_utils.frPyObjects(rle, height, width)
            return mask_utils.decode(rle).astype(np.uint8)
        except Exception as exc:  # best effort: one bad RLE must not abort the run
            warnings.warn(
                f"Could not decode RLE segmentation ({exc})",
                RuntimeWarning,
                stacklevel=2,
            )
            return None

    return None


def mask_iou(mask1, mask2):
    """Intersection-over-union of two binary masks of identical shape.

    Any non-zero element counts as foreground. Returns 0.0 when both masks
    are empty.
    """
    overlap = np.count_nonzero(np.logical_and(mask1, mask2))
    combined = np.count_nonzero(np.logical_or(mask1, mask2))
    if combined > 0:
        return float(overlap) / float(combined)
    return 0.0
# ==========================
# GLOBAL COUNTERS
# ==========================
# Running totals of true positives, false positives and false negatives,
# accumulated over every evaluated dataset.
TP_global = FP_global = FN_global = 0
# ==========================
# DATASET EVALUATION
# =========================
def _compact_mask(mask):
    """Crop a 2-D binary mask to the bounding box of its foreground.

    Returns ``(crop, y0, x0, area)``: ``crop`` is a boolean copy of the
    foreground window, ``(y0, x0)`` is its top-left corner in image
    coordinates and ``area`` is the number of foreground pixels. An all-zero
    mask yields an empty crop with ``area == 0``.
    """
    foreground = np.asarray(mask) != 0
    rows = np.flatnonzero(foreground.any(axis=1))
    cols = np.flatnonzero(foreground.any(axis=0))
    if rows.size == 0:
        return np.zeros((0, 0), dtype=bool), 0, 0, 0
    y0, x0 = int(rows[0]), int(cols[0])
    # Copy the window so the full-resolution array can be freed afterwards.
    crop = foreground[y0:int(rows[-1]) + 1, x0:int(cols[-1]) + 1].copy()
    return crop, y0, x0, int(crop.sum())


def _compact_iou(a, b):
    """IoU of two masks given in the ``_compact_mask`` representation.

    Only the window where both bounding boxes overlap is compared; the union
    follows from the two areas. The value equals the IoU of the full-size
    masks, and is 0.0 when the boxes are disjoint or both masks are empty.
    """
    crop_a, ay, ax, area_a = a
    crop_b, by, bx, area_b = b
    y0, x0 = max(ay, by), max(ax, bx)
    y1 = min(ay + crop_a.shape[0], by + crop_b.shape[0])
    x1 = min(ax + crop_a.shape[1], bx + crop_b.shape[1])
    if y1 <= y0 or x1 <= x0:
        return 0.0
    inter = int(
        np.logical_and(
            crop_a[y0 - ay:y1 - ay, x0 - ax:x1 - ax],
            crop_b[y0 - by:y1 - by, x0 - bx:x1 - bx],
        ).sum()
    )
    union = area_a + area_b - inter
    return inter / union if union > 0 else 0.0


for dataset_name in sorted(os.listdir(root_dataset)):
    # Folder names are lower-cased before the lookup, so the names listed in
    # `datasets_to_eval` must be lower-case.
    if dataset_name.lower() not in datasets_to_eval:
        print(f"Skipping folder: {dataset_name}")
        continue

    dataset_path = os.path.join(root_dataset, dataset_name)
    gt_file = os.path.join(dataset_path, "instances_default.json")
    # Alternative prediction file used earlier: "sam3_results.json"
    pred_file = os.path.join(dataset_path, "sam3_results_pyramid_v2.json")

    if not os.path.isfile(gt_file) or not os.path.isfile(pred_file):
        print(f"❌ Missing GT or SAM3 predictions in {dataset_name}, skipping.")
        continue

    print(f"\n======================\nEVALUATING DATASET: {dataset_name}\n======================")

    # Context managers close the files deterministically.
    with open(gt_file, encoding="utf-8") as fh:
        gt = json.load(fh)
    with open(pred_file, encoding="utf-8") as fh:
        pred = json.load(fh)

    # Annotations are grouped by file_name rather than image_id. The
    # id -> file_name table is built once instead of scanning the image list
    # for every annotation; `setdefault` keeps the first image for an id.
    gt_image_sizes = {}
    file_name_by_id = {}
    for im in gt.get("images", []):
        gt_image_sizes[im["file_name"]] = (im["height"], im["width"])
        file_name_by_id.setdefault(im["id"], im["file_name"])

    gt_by_file = defaultdict(list)
    for ann in gt["annotations"]:
        file_name = file_name_by_id.get(ann["image_id"])
        if file_name:
            gt_by_file[file_name].append(ann)

    pred_by_file = defaultdict(list)
    for ann in pred["annotations"]:
        file_name = ann.get("file_name")
        if file_name:
            pred_by_file[file_name].append(ann)

    # dataset counters
    TP = FP = FN = 0
    skipped_pred_count = 0
    missing_size_count = 0

    # Visit images with ground truth AND images that only have predictions:
    # a prediction on an image without any annotated instance is a false
    # positive and must not be left out of the precision denominator.
    image_files = list(gt_by_file) + [name for name in pred_by_file if name not in gt_by_file]

    for file_name in image_files:
        # Reached when predictions reference a file the GT "images" list
        # does not contain, so no size is available to rasterise polygons.
        if file_name not in gt_image_sizes:
            missing_size_count += 1
            print(f"WARNING: {file_name} missing size info, skipping")
            continue

        H, W = gt_image_sizes[file_name]

        # Masks are reduced to their bounding-box crop right after decoding,
        # so only one full-resolution mask is alive at a time.
        gt_compact = []
        for g in gt_by_file.get(file_name, []):
            gmask = seg_to_mask(g.get("segmentation"), H, W)
            gt_compact.append(None if gmask is None else _compact_mask(gmask))

        scored_preds = []
        for p in pred_by_file.get(file_name, []):
            pmask = seg_to_mask(p.get("segmentation"), H, W)
            if pmask is None:
                # Undecodable predictions are reported but count as neither
                # TP nor FP.
                skipped_pred_count += 1
                continue
            # Predictions without a "score" are treated as fully confident.
            scored_preds.append((p.get("score", 1.0), _compact_mask(pmask)))

        # Greedy matching, most confident prediction first (stable sort keeps
        # the file order among equal scores).
        scored_preds.sort(key=lambda item: item[0], reverse=True)

        matched_gt = set()
        for _score, pmask in scored_preds:
            best_iou = 0.0
            best_gi = None

            # Matching is class-agnostic: category_id is not compared.
            for gi, gmask in enumerate(gt_compact):
                if gmask is None or gi in matched_gt:
                    continue
                # Named `iou_val` so it does not overwrite the `iou` function
                # defined by the bounding-box cell in the same kernel.
                iou_val = _compact_iou(pmask, gmask)
                if iou_val > best_iou:
                    best_iou = iou_val
                    best_gi = gi

            if best_gi is not None and best_iou >= IOU_THRESHOLD:
                TP += 1
                matched_gt.add(best_gi)
            else:
                FP += 1

        # Unmatched ground-truth instances with a decodable mask are misses.
        FN += sum(
            1
            for gi, gmask in enumerate(gt_compact)
            if gmask is not None and gi not in matched_gt
        )

    # dataset metrics
    precision = TP / (TP + FP) if TP + FP > 0 else 0
    recall = TP / (TP + FN) if TP + FN > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

    print(f"TP={TP} FP={FP} FN={FN} Precision={precision:.4f} Recall={recall:.4f} F1={f1:.4f}")
    print(f"Skipped pred masks: {skipped_pred_count}, Missing image sizes: {missing_size_count}")

    # update global totals
    TP_global += TP
    FP_global += FP
    FN_global += FN

# ==========================
# Overall metrics
# ==========================
# Scores computed from the counts pooled over all evaluated datasets.
# Each ratio falls back to 0 when its denominator would be zero.
if TP_global + FP_global > 0:
    precision_global = TP_global / (TP_global + FP_global)
else:
    precision_global = 0

if TP_global + FN_global > 0:
    recall_global = TP_global / (TP_global + FN_global)
else:
    recall_global = 0

if precision_global + recall_global > 0:
    f1_global = 2 * precision_global * recall_global / (precision_global + recall_global)
else:
    f1_global = 0

# Report: banner, raw counts, derived scores, closing rule.
print("\n======================")
print("OVERALL SEGMENTATION EVALUATION")
print(f"TP={TP_global} FP={FP_global} FN={FN_global}")
print(f"Precision={precision_global:.4f} Recall={recall_global:.4f} F1={f1_global:.4f}")
print("======================")

Skipping folder: .ipynb_checkpoints
Skipping folder: .~lock.metadata.csv#
Skipping folder: ALUS
Skipping folder: AMI-traps
Skipping folder: AMT
Skipping folder: ArTaxOr
Skipping folder: BIOSCAN
Skipping folder: CollembolAI
Skipping folder: DIRT
Skipping folder: Diopsis
Skipping folder: DiversityScanner
Skipping folder: Mothitor
Skipping folder: NHM-beetles-crops
Skipping folder: PeMaToEuroPep
Skipping folder: abram2023
Skipping folder: amarathunga2022
Skipping folder: anTraX
Skipping folder: biodiscover-arm
Skipping folder: cao2022
Skipping folder: gernat2018
Skipping folder: metadata.csv

EVALUATING DATASET: pinoy2023


In [None]:
# WARNING: `pkill -f` matches against the full command line, so this signals
# every process of this user whose command line contains "ipykernel" - all
# running notebook kernels, including the one executing this cell.
# Prefer "Kernel > Interrupt/Restart" for a single stuck kernel.
!pkill -f ipykernel

In [2]:
!nvidia-smi

Sun Dec 14 14:07:36 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.163.01             Driver Version: 550.163.01     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA TITAN Xp                Off |   00000000:03:00.0 Off |                  N/A |
| 23%   25C    P8              9W /  250W |       2MiB /  12288MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA TITAN Xp                Off |   00

In [3]:
!nvidia-smi pmon -c 1

    0          -     -      -      -      -      -      -      -    -              
# gpu         pid   type     sm    mem    enc    dec    jpg    ofa    command 
# Idx           #    C/G      %      %      %      %      %      %    name 
    1          -     -      -      -      -      -      -      -    -              


In [None]:
# WARNING: sends SIGKILL to every process of this user whose name matches
# "python" - this kernel, other notebooks and any unrelated Python jobs -
# without giving them a chance to clean up. Leftover debugging cell; do not
# run as part of "Run All".
!pkill -9 python