In [65]:
import os
import cv2
import torch
import numpy as np
from models.detector import Detector

# ---------------- CONFIG ----------------
IMG_PATH = "/Users/anirudhmamgain/Desktop/Object_detection_from_scratch/Dataset/train/images/IMG_8471_jpg.rf.ae5a71d679103fd7cba244719b0e3157.jpg"
CHECKPOINT = "/Users/anirudhmamgain/Desktop/Object_detection_from_scratch/checkpoints/new_last_model.pth"

IMG_SIZE = 416
GRID_SIZE = 52
NUM_CLASSES = 7

OBJ_THRESH = 0.9
SCORE_THRESH = 0.2
NMS_IOU_THRESH = 0.2
MAX_DETECTIONS = 10


CLASS_NAMES = [
    "fish", "jellyfish", "penguin",
    "puffin", "shark", "starfish", "stingray"
]

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")


# ---------------- NMS ----------------
def nms(boxes, scores, iou_thresh):
    if len(boxes) == 0:
        return []

    boxes = torch.tensor(boxes, dtype=torch.float32)
    scores = torch.tensor(scores, dtype=torch.float32)

    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]

    areas = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)
    order = scores.argsort(descending=True)

    keep = []

    while order.numel() > 0:
        i = order[0].item()
        keep.append(i)

        if order.numel() == 1:
            break

        xx1 = torch.maximum(x1[i], x1[order[1:]])
        yy1 = torch.maximum(y1[i], y1[order[1:]])
        xx2 = torch.minimum(x2[i], x2[order[1:]])
        yy2 = torch.minimum(y2[i], y2[order[1:]])

        inter = (xx2 - xx1).clamp(0) * (yy2 - yy1).clamp(0)
        iou = inter / (areas[i] + areas[order[1:]] - inter + 1e-6)

        order = order[1:][iou < iou_thresh]

    return keep


# ---------------- LOAD MODEL ----------------
model = Detector(num_classes=NUM_CLASSES)
model.load_state_dict(torch.load(CHECKPOINT, map_location=device))
model.to(device)
model.eval()

# ---------------- LOAD IMAGE ----------------
img = cv2.imread(IMG_PATH)
orig_h, orig_w = img.shape[:2]

img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
img_resized = cv2.resize(img_rgb, (IMG_SIZE, IMG_SIZE))

img_tensor = torch.from_numpy(img_resized).float() / 255.0
img_tensor = img_tensor.permute(2, 0, 1).unsqueeze(0).to(device)

# ---------------- INFERENCE ----------------
with torch.no_grad():
    preds = model(img_tensor)

preds = preds.permute(0, 2, 3, 1)[0]  # [52, 52, 5 + C]

boxes = []
scores = []
labels = []

# ---------------- DECODE ----------------
for gy in range(GRID_SIZE):
    for gx in range(GRID_SIZE):
        obj = torch.sigmoid(preds[gy, gx, 4]).item()
        if obj < OBJ_THRESH:
            continue

        cls_probs = torch.sigmoid(preds[gy, gx, 5:])
        cls_id = torch.argmax(cls_probs).item()
        cls_conf = cls_probs[cls_id].item()

        score = obj * cls_conf
        if score < SCORE_THRESH:
            continue

        cx, cy, w, h = torch.sigmoid(preds[gy, gx, :4]).cpu().numpy()

        cx = cx * orig_w
        cy = cy * orig_h
        w  = w  * orig_w
        h  = h  * orig_h

        MAX_BOX_RATIO = 0.4
        w = min(w, MAX_BOX_RATIO * orig_w)
        h = min(h, MAX_BOX_RATIO * orig_h)

        x1 = int(cx - w / 2)
        y1 = int(cy - h / 2)
        x2 = int(cx + w / 2)
        y2 = int(cy + h / 2)



        boxes.append([x1, y1, x2, y2])
        scores.append(score)
        labels.append(cls_id)

# ---------------- PER-CLASS NMS ----------------
final_indices = []

for cls in range(NUM_CLASSES):
    cls_boxes = []
    cls_scores = []
    cls_map = []

    for i, lbl in enumerate(labels):
        if lbl == cls:
            cls_boxes.append(boxes[i])
            cls_scores.append(scores[i])
            cls_map.append(i)

    keep = nms(cls_boxes, cls_scores, NMS_IOU_THRESH)

    for k in keep:
        final_indices.append(cls_map[k])

# ---------------- LIMIT MAX DETECTIONS ----------------
final_indices = sorted(
    final_indices,
    key=lambda i: scores[i],
    reverse=True
)[:MAX_DETECTIONS]

# ---------------- DRAW ----------------
for i in final_indices:
    x1, y1, x2, y2 = boxes[i]
    cls_id = labels[i]
    score = scores[i]

    cv2.rectangle(img, (x1, y1), (x2, y2), (0, 0, 255), 2)
    text = f"{CLASS_NAMES[cls_id]} {score:.2f}"
    cv2.putText(
        img,
        text,
        (x1, max(y1 - 5, 10)),
        cv2.FONT_HERSHEY_SIMPLEX,
        0.7,
        (0, 0, 255),
        2
    )

# ---------------- SAVE ----------------
os.makedirs("outputs", exist_ok=True)
out_path = "outputs/result3.jpg"
cv2.imwrite(out_path, img)

print(f"Inference complete. Saved to {out_path}")


Inference complete. Saved to outputs/result3.jpg


### map evaluation

In [None]:
!pip install torchmetrics


In [28]:
def decode_predictions(
    preds,
    img_size=416,
    num_classes=7,
    obj_thresh=0.7,
    score_thresh=0.6,
    max_detections=5
):
    preds = preds.permute(0, 2, 3, 1)  # [B, S, S, 5+C]
    B, S, _, _ = preds.shape

    all_outputs = []

    for b in range(B):
        boxes = []
        scores = []
        labels = []

        for gy in range(S):
            for gx in range(S):
                obj = torch.sigmoid(preds[b, gy, gx, 4]).item()
                if obj < obj_thresh:
                    continue

                cls_probs = torch.sigmoid(preds[b, gy, gx, 5:])
                cls_id = torch.argmax(cls_probs).item()
                cls_conf = cls_probs[cls_id].item()

                score = obj * cls_conf
                if score < score_thresh:
                    continue

                cx, cy, w, h = torch.sigmoid(
                    preds[b, gy, gx, :4]
                ).cpu().numpy()

                # IMAGE-normalized decode (matches your dataset)
                cx *= img_size
                cy *= img_size
                w  *= img_size
                h  *= img_size

                x1 = cx - w / 2
                y1 = cy - h / 2
                x2 = cx + w / 2
                y2 = cy + h / 2

                boxes.append([x1, y1, x2, y2])
                scores.append(score)
                labels.append(cls_id)

        if len(boxes) > 0:
            boxes = torch.tensor(boxes, dtype=torch.float32)
            scores = torch.tensor(scores, dtype=torch.float32)
            labels = torch.tensor(labels, dtype=torch.long)

            # -------- MAX DETECTIONS --------
            scores_sorted, idx = scores.sort(descending=True)
            idx = idx[:max_detections]

            boxes = boxes[idx]
            scores = scores_sorted[:max_detections]
            labels = labels[idx]
        else:
            boxes = torch.zeros((0, 4))
            scores = torch.zeros((0,))
            labels = torch.zeros((0,), dtype=torch.long)

        all_outputs.append({
            "boxes": boxes,
            "scores": scores,
            "labels": labels
        })

    return all_outputs


In [34]:
import torch
from torchmetrics.detection.mean_ap import MeanAveragePrecision
from torch.utils.data import DataLoader
from dataloader.data_load import UnderwaterDataset

metric = MeanAveragePrecision(iou_type="bbox", iou_thresholds=[0.25])

val_dataset = UnderwaterDataset(
    img_dir="/Users/anirudhmamgain/Desktop/Object_detection_from_scratch/Dataset/valid/images",
    label_dir="/Users/anirudhmamgain/Desktop/Object_detection_from_scratch/Dataset/valid/labels"
)

val_loader = DataLoader(
    val_dataset,
    batch_size=8,
    shuffle=False,
)

model.eval()

with torch.no_grad():
    for imgs, targets in val_loader:
        imgs = imgs.to(device)
        preds = model(imgs)

        outputs = decode_predictions(
            preds,
            img_size=416,
            num_classes=7,
            obj_thresh=0.05,
            score_thresh=0.05,
            max_detections=100
        )

        gt = []
        for t in targets:
            obj_mask = t[..., 4] == 1
            if obj_mask.sum() == 0:
                gt.append({
                    "boxes": torch.zeros((0, 4)),
                    "labels": torch.zeros((0,), dtype=torch.long)
                })
                continue

            cxcywh = t[..., 0:4][obj_mask].clone()
            labels = torch.argmax(t[..., 5:], dim=-1)[obj_mask]

            cx = cxcywh[:, 0] * 416
            cy = cxcywh[:, 1] * 416
            w  = cxcywh[:, 2] * 416
            h  = cxcywh[:, 3] * 416

            x1 = cx - w / 2
            y1 = cy - h / 2
            x2 = cx + w / 2
            y2 = cy + h / 2

            boxes = torch.stack([x1, y1, x2, y2], dim=1)

            gt.append({
                "boxes": boxes,
                "labels": labels
            })

        metric.update(outputs, gt)

results = metric.compute()
print(results)


{'map': tensor(2.3653e-06), 'map_50': tensor(-1.), 'map_75': tensor(-1.), 'map_small': tensor(-1.), 'map_medium': tensor(-1.), 'map_large': tensor(2.3653e-06), 'mar_1': tensor(0.0022), 'mar_10': tensor(0.0028), 'mar_100': tensor(0.0028), 'mar_small': tensor(-1.), 'mar_medium': tensor(-1.), 'mar_large': tensor(0.0028), 'map_per_class': tensor(-1.), 'mar_100_per_class': tensor(-1.), 'classes': tensor([0, 1, 2, 3, 4, 5, 6], dtype=torch.int32)}


In [35]:
import time
import torch

model.eval()

device = torch.device("mps")
model.to(device)

dummy = torch.randn(1, 3, 416, 416).to(device)

for _ in range(20):
    _ = model(dummy)

if device.type == "mps":
    torch.mps.synchronize()

start = time.time()
num_runs = 100

for _ in range(num_runs):
    _ = model(dummy)

if device.type == "mps":
    torch.mps.synchronize()

end = time.time()

fps = num_runs / (end - start)
latency = (end - start) / num_runs * 1000

print(f"FPS: {fps:.2f}")
print(f"Latency per image: {latency:.2f} ms")


FPS: 125.39
Latency per image: 7.98 ms


In [37]:
import time
import torch

model.eval()

device = "cpu"
model.to(device)

dummy = torch.randn(1, 3, 416, 416).to(device)

for _ in range(20):
    _ = model(dummy)


start = time.time()
num_runs = 100

for _ in range(num_runs):
    _ = model(dummy)


end = time.time()

fps = num_runs / (end - start)
latency = (end - start) / num_runs * 1000

print(f"FPS: {fps:.2f}")
print(f"Latency per image: {latency:.2f} ms")


FPS: 25.14
Latency per image: 39.77 ms


In [38]:
import os
import torch

temp_path = "temp_model.pth"
torch.save(model.state_dict(), temp_path)

size_mb = os.path.getsize(temp_path) / (1024 * 1024)
print(f"Model size: {size_mb:.2f} MB")

os.remove(temp_path)


Model size: 6.03 MB


In [39]:
import time
import torch
from torchmetrics.detection.mean_ap import MeanAveragePrecision
from torch.utils.data import DataLoader
from dataloader.data_load import UnderwaterDataset

# ---------------- CONFIG ----------------
IMG_SIZE = 416
NUM_CLASSES = 7
DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

CONFIGS = {
    "Speed": {
        "obj_thresh": 0.85,
        "score_thresh": 0.80,
        "nms_iou": 0.30,
        "max_det": 3
    },
    "Balanced": {
        "obj_thresh": 0.70,
        "score_thresh": 0.60,
        "nms_iou": 0.50,
        "max_det": 5
    },
    "Recall": {
        "obj_thresh": 0.50,
        "score_thresh": 0.50,
        "nms_iou": 0.70,
        "max_det": 10
    }
}

# ---------------- DATA ----------------
val_dataset = UnderwaterDataset(
    img_dir="Dataset/valid/images",
    label_dir="Dataset/valid/labels"
)

val_loader = DataLoader(
    val_dataset,
    batch_size=8,
    shuffle=False
)

model.eval()
model.to(DEVICE)

# ---------------- FPS MEASUREMENT ----------------
def measure_fps():
    dummy = torch.randn(1, 3, IMG_SIZE, IMG_SIZE).to(DEVICE)

    for _ in range(10):
        _ = model(dummy)

    if DEVICE.type == "mps":
        torch.mps.synchronize()

    start = time.time()
    runs = 100
    for _ in range(runs):
        _ = model(dummy)

    if DEVICE.type == "mps":
        torch.mps.synchronize()

    return runs / (time.time() - start)

# ---------------- MAIN LOOP ----------------
results = []

for name, cfg in CONFIGS.items():
    metric = MeanAveragePrecision(iou_type="bbox", iou_thresholds=[0.25])
    total_detections = 0
    total_images = 0

    with torch.no_grad():
        for imgs, targets in val_loader:
            imgs = imgs.to(DEVICE)
            preds = model(imgs)

            outputs = decode_predictions(
                preds,
                img_size=IMG_SIZE,
                num_classes=NUM_CLASSES,
                obj_thresh=cfg["obj_thresh"],
                score_thresh=cfg["score_thresh"],
                max_detections=cfg["max_det"]
            )

            gt = []
            for t in targets:
                obj_mask = t[..., 4] == 1
                if obj_mask.sum() == 0:
                    gt.append({
                        "boxes": torch.zeros((0, 4)),
                        "labels": torch.zeros((0,), dtype=torch.long)
                    })
                    continue

                cxcywh = t[..., 0:4][obj_mask]
                labels = torch.argmax(t[..., 5:], dim=-1)[obj_mask]

                cx = cxcywh[:, 0] * IMG_SIZE
                cy = cxcywh[:, 1] * IMG_SIZE
                w  = cxcywh[:, 2] * IMG_SIZE
                h  = cxcywh[:, 3] * IMG_SIZE

                boxes = torch.stack([
                    cx - w / 2,
                    cy - h / 2,
                    cx + w / 2,
                    cy + h / 2
                ], dim=1)

                gt.append({"boxes": boxes, "labels": labels})

            metric.update(outputs, gt)

            for o in outputs:
                total_detections += o["boxes"].shape[0]
                total_images += 1

    metrics = metric.compute()
    fps = measure_fps()

    results.append({
        "Config": name,
        "mAP@0.25": metrics["map"].item(),
        "FPS": fps,
        "Avg Detections": total_detections / total_images
    })

# ---------------- PRINT RESULTS ----------------
print("\nTrade-off Results\n")
for r in results:
    print(
        f"{r['Config']:>8} | "
        f"mAP@0.25: {r['mAP@0.25']:.4f} | "
        f"FPS: {r['FPS']:.2f} | "
        f"Avg Det/Image: {r['Avg Detections']:.2f}"
    )



Trade-off Results

   Speed | mAP@0.25: 0.0001 | FPS: 125.60 | Avg Det/Image: 3.00
Balanced | mAP@0.25: 0.0000 | FPS: 125.49 | Avg Det/Image: 5.00
  Recall | mAP@0.25: 0.0000 | FPS: 122.90 | Avg Det/Image: 10.00
