In [12]:
# %% Imports
import os, time, json, urllib.request, shutil
from pathlib import Path
from typing import List, Dict

import torch
import pandas as pd
import matplotlib.pyplot as plt

from ultralytics import YOLO  # YOLOv* and YOLO‑World
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

In [13]:
# ─── Cell 1: Setup paths ─────────────────────────────────────────────────────────
yolov11_weight   = "weights/bestyolo.pt"
yoloworld_weight = "weights/yolov8l-worldv2.pt"   # ← download and point here
owlv2_weight     = "weights/owlv2.pt"       # ← download and point here

image_folder      = "dataset/final/images"
yolo_annot_folder = "dataset/final/labels"


In [14]:
# ─── Cell 2: Data preprocess ────────────────────────────────────────────────────
import os
import pandas as pd
from PIL import Image

# gather all image paths
image_paths = sorted([
    os.path.join(image_folder, f)
    for f in os.listdir(image_folder)
    if f.lower().endswith((".jpg", ".jpeg", ".png"))
])

# build corresponding YOLO-label paths
label_paths = [
    os.path.join(yolo_annot_folder, os.path.splitext(os.path.basename(p))[0] + ".txt")
    for p in image_paths
]

# YOLO DataFrame
yolo_df = pd.DataFrame({
    "image_path": image_paths,
    "label_path": label_paths
})

# Build a COCO‐style DataFrame from YOLO-format labels
coco_rows = []
for img_path, lbl_path in zip(image_paths, label_paths):
    img = Image.open(img_path)
    w, h = img.size
    with open(lbl_path, "r") as f:
        for line in f:
            cls, xc, yc, bw, bh = map(float, line.split())
            # convert normalized center→pixel xywh
            x1 = (xc - bw/2) * w
            y1 = (yc - bh/2) * h
            coco_rows.append({
                "image_path": img_path,
                "category_id": int(cls),
                "bbox": [x1, y1, bw * w, bh * h]
            })

coco_df = pd.DataFrame(coco_rows)


In [15]:
# ─── Cell X: GPU memory used in GB ─────────────────────────────────────────────
import subprocess

def get_gpu_mem_gb():
    """
    Returns the currently used GPU memory (first GPU) in gigabytes.
    """
    # query only “memory.used” (in MiB)
    out = subprocess.check_output([
        "nvidia-smi",
        "--query-gpu=memory.used",
        "--format=csv,noheader,nounits"
    ])
    used_mib = float(out.decode().strip().split()[0])
    used_gb  = used_mib / 1024.0
    return used_gb

In [16]:
# ─── Cell 3: Create data.yaml with absolute path ───────────────────────────────
import yaml

# ← hard-code your project root here:
project_root = r"C:\Users\chrap\Documents\code\vision_pipeline\model_evaluation"

# number of classes (as a pure Python int)
num_classes = int(coco_df.category_id.max()) + 1

data_cfg = {
    'path':  project_root,           # absolute root
    'train': 'dataset/final/images',       # relative to path/
    'val':   'dataset/final/images',       # same for validation
    'nc':    num_classes,            # number of classes
    #'names': [str(i) for i in range(num_classes)]
    'names': ["graffiti", "trash"]
}

with open('data.yaml', 'w') as f:
    yaml.safe_dump(data_cfg, f, sort_keys=False)

# Optional: verify contents
print(yaml.safe_load(open("data.yaml")))

{'path': 'C:\\Users\\chrap\\Documents\\code\\vision_pipeline\\model_evaluation', 'train': 'dataset/final/images', 'val': 'dataset/final/images', 'nc': 2, 'names': ['graffiti', 'trash']}


In [17]:
# ─── Cell DEBUG: sanity-check your dataframes ─────────────────────────────────
import os

print("Number of images:", len(yolo_df))
print("Number of unique images in coco_df (with at least one GT box):",
      coco_df.image_path.nunique())

# Check for any images where the label file doesn’t exist or is empty
missing_lbl = [p for p, l in zip(yolo_df.image_path, yolo_df.label_path)
               if not os.path.isfile(l)]
empty_lbl   = [l for l in yolo_df.label_path if os.path.isfile(l) and os.path.getsize(l)==0]
print("Images missing a .txt label file:", missing_lbl)
print("Label files present but empty:", empty_lbl)

# Show a few entries
print("\nyolo_df head:\n", yolo_df.head())
print("\ncoco_df head:\n", coco_df.head())


Number of images: 89
Number of unique images in coco_df (with at least one GT box): 52
Images missing a .txt label file: []
Label files present but empty: ['dataset/final/labels\\clean_blank10.txt', 'dataset/final/labels\\clean_blank12.txt', 'dataset/final/labels\\clean_blank13.txt', 'dataset/final/labels\\clean_blank16.txt', 'dataset/final/labels\\clean_blank19.txt', 'dataset/final/labels\\clean_blank23.txt', 'dataset/final/labels\\clean_blank25.txt', 'dataset/final/labels\\clean_blank27.txt', 'dataset/final/labels\\clean_blank28.txt', 'dataset/final/labels\\clean_blank29.txt', 'dataset/final/labels\\clean_blank30.txt', 'dataset/final/labels\\clean_blank31.txt', 'dataset/final/labels\\clean_blank32.txt', 'dataset/final/labels\\clean_blank33.txt', 'dataset/final/labels\\clean_blank35.txt', 'dataset/final/labels\\clean_blank37.txt', 'dataset/final/labels\\clean_blank38.txt', 'dataset/final/labels\\clean_blank39.txt', 'dataset/final/labels\\clean_blank40.txt', 'dataset/final/labels\\clea

In [18]:
# ─── Cell 4: YOLOv11 inference ─────────────────────────────────────────────────
import time
import subprocess
from ultralytics import YOLO

def infer_yolov11(df, weights, data_yaml='data.yaml'):
    model = YOLO(weights)
    _ = model(df.image_path.iloc[:1].tolist())  # warm up

    start = time.time()
    results = model.val(data=data_yaml, verbose=False)
    end = time.time()

    gpu_util   = get_gpu_mem_gb()
    map50      = results.box.ap50        # was map50
    map50_95   = results.box.ap          # was map50_95
    avg_ms     = (end - start) * 1000 / len(df)

    return gpu_util, map50, map50_95, avg_ms


In [19]:
def infer_yoloworld_eval(df, weights, conf=0.30, save_vis=True, vis_dir="yolo_world_vis"):
    class_names = ["graffiti", "trash"]
    model = YOLO(weights)
    model.set_classes(class_names)
    _ = model(df.image_path.iloc[:1].tolist())  # warmup

    # inference
    results = model.predict(source=df.image_path.tolist(), conf=conf, verbose=False)

    # save visualizations
    if save_vis:
        os.makedirs(vis_dir, exist_ok=True)
        for img_path, res in zip(df.image_path.tolist(), results):
            im_arr = res.plot()
            cv2.imwrite(os.path.join(vis_dir, os.path.basename(img_path)), im_arr)

    # build preds & targets
    preds, targets = [], []
    for img_path, res in zip(df.image_path.tolist(), results):
        boxes  = torch.tensor(res.boxes.xyxy.cpu())
        scores = torch.tensor(res.boxes.conf.cpu())
        labels = torch.tensor(res.boxes.cls.cpu().long())  # assume 0=graffiti, 1=trash
        preds.append({"boxes": boxes, "scores": scores, "labels": labels})

        gt = coco_df[coco_df.image_path == img_path]
        if not gt.empty:
            xywh = torch.tensor(gt.bbox.tolist())
            gt_boxes = torch.cat([xywh[:, :2], xywh[:, :2] + xywh[:, 2:]], dim=1)
            gt_labels = torch.tensor(gt.category_id.tolist(), dtype=torch.int64)
        else:
            gt_boxes  = torch.zeros((0,4), dtype=torch.float32)
            gt_labels = torch.zeros((0,), dtype=torch.int64)
        targets.append({"boxes": gt_boxes, "labels": gt_labels})

    # evaluate
    metric = MeanAveragePrecision(iou_type="bbox")
    metric.update(preds, targets)
    m = metric.compute()

    # GPU usage
    out = subprocess.check_output(["nvidia-smi", "--query-gpu=memory.used", "--format=csv,noheader,nounits"])
    gpu_mem_gb = float(out.decode().strip()) / 1024.0
    avg_ms = (time.time() - start) * 1000 / len(df)

    return gpu_mem_gb, m["map_50"].item(), m["map"].item(), avg_ms


In [20]:
# ─── Cell X: OWLv2 inference & mAP for graffiti+trash on GPU with viz toggle ───
import os
import time
import subprocess
import torch
from transformers import Owlv2Processor, Owlv2ForObjectDetection
from torchmetrics.detection.mean_ap import MeanAveragePrecision
from PIL import Image
import cv2
import numpy as np

def infer_owlv2(df,
                model_id="google/owlv2-base-patch16-ensemble",
                text_labels=None,
                conf_threshold=0.05,
                save_vis=False,
                vis_dir="owlv2_vis"):
    """
    Runs OWLv2 on df.image_path using GPU if available, computes open-vocab mAP vs. coco_df GT,
    measures GPU mem (GB) and avg inference time (ms/image).
    If save_vis=True, saves annotated images to vis_dir.
    """
    # default to graffiti+trash
    if text_labels is None:
        text_labels = ["graffiti", "trash"]
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # load processor & model
    processor = Owlv2Processor.from_pretrained(model_id)
    model     = Owlv2ForObjectDetection.from_pretrained(model_id).to(device)
    model.eval()

    # warm-up
    img0 = Image.open(df.image_path.iloc[0]).convert("RGB")
    inputs = processor(text=[text_labels], images=img0, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        _ = model(**inputs)

    preds, targets = [], []
    start = time.time()

    for img_path in df.image_path.tolist():
        img = Image.open(img_path).convert("RGB")
        inputs = processor(text=[text_labels], images=img, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)

        # post-process
        proc = processor.post_process_grounded_object_detection(
            outputs=outputs,
            threshold=conf_threshold,
            target_sizes=torch.tensor([(img.height, img.width)]),
            text_labels=text_labels
        )[0]

        # visualize if asked
        if save_vis:
            os.makedirs(vis_dir, exist_ok=True)
            vis = np.array(img)[:, :, ::-1].copy()  # RGB→BGR
            for box in proc["boxes"]:
                x1,y1,x2,y2 = map(int, box.tolist())
                cv2.rectangle(vis, (x1,y1), (x2,y2), (0,255,0), 2)
            cv2.imwrite(os.path.join(vis_dir, os.path.basename(img_path)), vis)

        # build predictions
        boxes  = proc["boxes"].cpu()
        scores = proc["scores"].cpu()
        labels = torch.tensor([text_labels.index(lbl) for lbl in proc["text_labels"]],
                              dtype=torch.int64)
        preds.append({"boxes": boxes, "scores": scores, "labels": labels})

        # build targets
        gt = coco_df[coco_df.image_path == img_path]
        if not gt.empty:
            xywh      = torch.tensor(gt.bbox.tolist())
            x1y1      = xywh[:, :2]
            wh        = xywh[:, 2:]
            x2y2      = x1y1 + wh
            gt_boxes  = torch.cat([x1y1, x2y2], dim=1)
            gt_labels = torch.tensor(gt.category_id.tolist(), dtype=torch.int64)
        else:
            gt_boxes  = torch.zeros((0,4), dtype=torch.float32)
            gt_labels = torch.zeros((0,),   dtype=torch.int64)
        targets.append({"boxes": gt_boxes, "labels": gt_labels})

    end = time.time()

    # compute mAP
    metric = MeanAveragePrecision(iou_type="bbox")
    metric.update(preds, targets)
    mAPs     = metric.compute()
    map50    = mAPs["map_50"].item()
    map50_95 = mAPs["map"].item()

    # GPU memory used in GB
    out = subprocess.check_output([
        "nvidia-smi", "--query-gpu=memory.used", "--format=csv,noheader,nounits"
    ]).decode().strip()
    gpu_mem_gb = float(out) / 1024.0

    avg_ms = (end - start) * 1000 / len(df)

    return gpu_mem_gb, map50, map50_95, avg_ms

# Example:
# gpu_gb, m50, m5095, avg_ms = infer_owlv2(
#     yolo_df,
#     text_labels=["graffiti","trash"],
#     conf_threshold=0.1,
#     save_vis=True,
#     vis_dir="owlv2_multi_vis"
# )


In [21]:
# ─── Cell 6: Run all three and collect ─────────────────────────────────────────
metrics = {}
metrics["owlv2"]    = infer_owlv2(yolo_df)
metrics["yolov11"]  = infer_yolov11(yolo_df,  yolov11_weight)
metrics["yolo-world"] = infer_yoloworld_eval(yolo_df, yoloworld_weight)

# unpack into lists for plotting
models     = list(metrics.keys())
gpu_utils  = [metrics[m][0] for m in models]
maps50     = [metrics[m][1] for m in models]
maps50_95  = [metrics[m][2] for m in models]
speeds_ms  = [metrics[m][3] for m in models]



0: 320x640 (no detections), 23.0ms
Speed: 1.7ms preprocess, 23.0ms inference, 0.7ms postprocess per image at shape (1, 3, 320, 640)
Ultralytics 8.3.127  Python-3.10.16 torch-2.6.0+cu126 CUDA:0 (NVIDIA GeForce RTX 4090, 24564MiB)
[34m[1mval: [0mFast image access  (ping: 0.00.0 ms, read: 601.7187.5 MB/s, size: 45.7 KB)


[34m[1mval: [0mScanning C:\Users\chrap\Documents\code\vision_pipeline\model_evaluation\dataset\final\labels... 89 images, 37 backgrounds, 0 corrupt: 100%|██████████| 89/89 [00:00<00:00, 1888.83it/s]

[34m[1mval: [0mNew cache created: C:\Users\chrap\Documents\code\vision_pipeline\model_evaluation\dataset\final\labels.cache



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 6/6 [00:02<00:00,  2.12it/s]


                   all         89        133          0          0          0          0
Speed: 0.9ms preprocess, 5.9ms inference, 0.0ms loss, 1.1ms postprocess per image
Results saved to [1mruns\detect\val4[0m

0: 320x640 (no detections), 17.4ms
Speed: 1.1ms preprocess, 17.4ms inference, 0.7ms postprocess per image at shape (1, 3, 320, 640)


  boxes = torch.tensor(res.boxes.xyxy.cpu())
  scores = torch.tensor(res.boxes.conf.cpu())


In [22]:
# ─── Cell 7: Plot bar charts (with inline display, robust scalar conversion) ─────
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

def to_scalar(x):
    """Convert x to a single float:
    - None      → 0.0
    - scalar    → float(x)
    - array/list → float(first element) if possible
    """
    if x is None:
        return 0.0
    # torch tensor?
    try:
        if hasattr(x, 'item'):
            return float(x.item())
    except:
        pass
    # numpy array or list/tuple
    if isinstance(x, (list, tuple, np.ndarray)):
        if len(x) > 0:
            return float(x[0])
        else:
            return 0.0
    # fallback
    try:
        return float(x)
    except:
        return 0.0

# sanitize metrics
gpu_utils_clean  = [to_scalar(x) for x in gpu_utils]
maps50_clean     = [to_scalar(x) for x in maps50]
maps50_95_clean  = [to_scalar(x) for x in maps50_95]
speeds_ms_clean  = [to_scalar(x) for x in speeds_ms]

def plot_bar(vals, ylabel, title):
    plt.figure()
    bars = plt.bar(models, vals)
    plt.ylabel(ylabel)
    plt.title(title)
    for bar, v in zip(bars, vals):
        plt.text(bar.get_x() + bar.get_width()/2,
                 bar.get_height(),
                 f"{v:.2f}",
                 ha='center', va='bottom')
    plt.tight_layout()
    plt.show()

# GPU memory utilization (% stays same name though it’s GB if you used get_gpu_mem_gb)
plot_bar(gpu_utils_clean, "GPU memory used (GB)", "GPU Memory Utilization")

# mAP@50
plot_bar(maps50_clean, "mAP@50", "mAP@50 Comparison")

# mAP@50–95
plot_bar(maps50_95_clean, "mAP@50–95", "mAP@50–95 Comparison")

# avg inference speed
plot_bar(speeds_ms_clean, "Avg. inference (ms/frame)", "Inference Speed")


TypeError: only length-1 arrays can be converted to Python scalars