The cell below was run only once, to draw a random sample of images from the dataset; it is left commented out.

In [1]:
# NOTE: one-off sampling step, left commented out. It copies a random fraction
# of the PNG frames found under `base` into the flat folder `sample_dir`.
# Caveats if this is ever re-enabled:
#   - random.sample is called without a seed, so each run draws a different sample.
#   - files are copied to sample_dir / src.name, so frames from different
#     sub-folders that share a file name would overwrite each other.
# import random, shutil
# from pathlib import Path

# # Paths
# base = Path("/Users/brandonbyrd/Documents/Big Data/Big Data Project/tools/caltech-ped-converter/data/images")
# sample_dir = Path("/Users/brandonbyrd/Documents/Big Data/Big Data Project/derived/caltech_yolo/sample_images")
# sample_dir.mkdir(parents=True, exist_ok=True)

# # How many images or what fraction
# fraction = 0.01   # 1% of the dataset (~2360 images)
# imgs = sorted(p for p in base.rglob("*.png"))
# sample = random.sample(imgs, int(len(imgs) * fraction))

# # Copy sampled images
# for src in sample:
#     dst = sample_dir / src.name
#     shutil.copy2(src, dst)

# print(f"Sampled {len(sample)} images to {sample_dir}")


In [2]:
from ultralytics import YOLO
from pathlib import Path
import pandas as pd

# === CONFIG ===
# Folder holding the sampled frames that are fed to the detector.
INPUT_DIR = Path("/Users/brandonbyrd/Documents/Big Data/Big Data Project/derived/caltech_yolo/sample_images")
# INPUT_DIR   = Path("/Users/brandonbyrd/Documents/Big Data/Big Data Project/test_images")  # earlier test folder; appears superseded by the sample folder above
OUTPUT_DIR  = Path("/Users/brandonbyrd/Documents/Big Data/Big Data Project/runs/yolo_test")  # passed to predict() as `project`
IMG_SIZE    = 640           # passed to predict() as `imgsz`
CONF_THRESH = 0.25          # passed to predict() as `conf`
RECURSIVE   = True          # used by the folder scans in later cells; the `images` list itself comes from a non-recursive glob
SAVE_PARQUET= True          # NOTE(review): only referenced by commented-out cells in this notebook
PERSON_ONLY = False         # set True to filter to COCO class 0 (person)
NAME        = "trial_run"   # subfolder under OUTPUT_DIR (passed to predict() as `name`)

# Lower-case file suffixes that are treated as images.
IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".webp"}

def list_images(root: Path, recursive: bool = True) -> list[Path]:
    """Return the image files located at ``root`` as a sorted list of paths.

    Args:
        root: Either a single file or a directory to scan.
        recursive: When ``root`` is a directory, walk the whole tree (True)
            or look only at its direct children (False).

    Returns:
        ``[root]`` when ``root`` is itself an image file, ``[]`` when it is a
        file with any other suffix, otherwise the sorted regular files below
        ``root`` whose suffix (case-insensitive) is in ``IMG_EXTS``.

    A missing directory is not special-cased; whatever ``rglob``/``iterdir``
    does for it is what the caller gets.
    """
    if root.is_file():
        # A single file is either an image or nothing; never try to iterate it.
        return [root] if root.suffix.lower() in IMG_EXTS else []
    entries = root.rglob("*") if recursive else root.iterdir()
    # is_file() keeps out directories that merely carry an image-like suffix.
    return sorted(p for p in entries if p.is_file() and p.suffix.lower() in IMG_EXTS)

# Upper bound on how many sample images go through the detector in this trial run.
MAX_IMAGES = 100

# images = list_images(INPUT_DIR, RECURSIVE)
# Only PNGs directly inside INPUT_DIR are considered; sorting before slicing
# makes the chosen subset the same on every run.
images = sorted(INPUT_DIR.glob("*.png"))[:MAX_IMAGES]
print(f"Found {len(images)} images in {INPUT_DIR.resolve()}")
# Show a handful of paths as a quick sanity check.
for img_path in images[:5]:
    print("  ", img_path)

# Make sure the output root exists, then load the detector weights.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
model = YOLO("yolo11n.pt")
print("Classes:", model.names)


Found 100 images in /Users/brandonbyrd/Documents/Big Data/Big Data Project/derived/caltech_yolo/sample_images
   /Users/brandonbyrd/Documents/Big Data/Big Data Project/derived/caltech_yolo/sample_images/set00_V000_1245.png
   /Users/brandonbyrd/Documents/Big Data/Big Data Project/derived/caltech_yolo/sample_images/set00_V000_1267.png
   /Users/brandonbyrd/Documents/Big Data/Big Data Project/derived/caltech_yolo/sample_images/set00_V000_1366.png
   /Users/brandonbyrd/Documents/Big Data/Big Data Project/derived/caltech_yolo/sample_images/set00_V000_1374.png
   /Users/brandonbyrd/Documents/Big Data/Big Data Project/derived/caltech_yolo/sample_images/set00_V000_1402.png
Classes: {0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus', 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light', 10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench', 14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow', 20: 'elephant', 21: 'bear', 22: 'zebra'

In [3]:
from pathlib import Path
import collections

# Inventory of what is inside INPUT_DIR, grouped by file extension.
# All scans sit behind one is_dir() guard so a missing (or non-directory)
# input path reports zeros instead of failing part-way through the cell.
scan_root = INPUT_DIR
if scan_root.is_dir():
    entries = scan_root.rglob("*") if RECURSIVE else scan_root.iterdir()
    # Single walk of the tree; only regular files are counted.
    exts = collections.Counter(q.suffix.lower() for q in entries if q.is_file())
    # Check for .seq/.vbb (Caltech originals) -- always searched recursively.
    n_seq = len(list(scan_root.rglob("*.seq")))
    n_vbb = len(list(scan_root.rglob("*.vbb")))
else:
    exts = collections.Counter()
    n_seq = n_vbb = 0

print("File extensions counts:", exts)
# Reuse the shared IMG_EXTS set rather than repeating the suffix list here.
print("Total image candidates (jpg/png/jpeg/bmp/webp):", sum(exts[e] for e in IMG_EXTS))
print("SEQ files:", n_seq)
print("VBB files:", n_vbb)


File extensions counts: Counter({'.png': 2366})
Total image candidates (jpg/png/jpeg/bmp/webp): 2366
SEQ files: 0
VBB files: 0


In [4]:
from PIL import Image
# Sanity check: confirm PIL can parse one of the candidate images.
# The listing is not sorted, so "first" is whatever the filesystem yields first.
imgs = [q for q in (INPUT_DIR.rglob("*") if RECURSIVE else INPUT_DIR.iterdir())
        if q.suffix.lower() in IMG_EXTS]
print("First few:", imgs[:3])
if imgs:
    # Open inside a context manager so the file handle is released even when
    # verify() raises on a broken file.
    with Image.open(imgs[0]) as first_image:
        first_image.verify()
    print("PIL can read first image ")
else:
    print("No readable images found ")


First few: [PosixPath('/Users/brandonbyrd/Documents/Big Data/Big Data Project/derived/caltech_yolo/sample_images/set03_V007_793.png'), PosixPath('/Users/brandonbyrd/Documents/Big Data/Big Data Project/derived/caltech_yolo/sample_images/set04_V011_271.png'), PosixPath('/Users/brandonbyrd/Documents/Big Data/Big Data Project/derived/caltech_yolo/sample_images/set05_V009_1309.png')]
PIL can read first image 


In [5]:
# ---- Inference ----
# Options forwarded to model.predict(); outputs are written under OUTPUT_DIR / NAME.
predict_kwargs = dict(
    imgsz=IMG_SIZE,
    conf=CONF_THRESH,
    save=True,            # saves annotated JPGs
    save_txt=True,        # saves YOLO txt labels
    save_conf=True,       # include confidences in txt
    project=str(OUTPUT_DIR),
    name=NAME,
    exist_ok=True,        # reuse the run folder if it already exists
    verbose=False
)
if PERSON_ONLY:
    predict_kwargs["classes"] = [0]  # COCO 'person'

# stream=True hands back a generator, so the actual work happens while the
# loop below consumes it.
results = model.predict(source=[str(p) for p in images], stream=True, **predict_kwargs)
print("Outputs ->", (OUTPUT_DIR / NAME).resolve())

# ---- Build detections table ----
# Fixed column order; also gives the table its schema when there are no rows.
DETECTION_COLUMNS = [
    "image_path", "model", "class_id", "class_name", "conf",
    "x1", "y1", "x2", "y2", "width", "height", "inf_ms",
]

rows = []
n_images = 0
n_images_without_boxes = 0
for res in results:
    n_images += 1
    h, w = res.orig_shape
    # fallback to 0.0 if key missing
    inf_ms = float(res.speed.get("inference", 0.0)) if hasattr(res, "speed") else 0.0
    img_path = res.path
    rows_before = len(rows)
    for b in res.boxes:
        cls_id = int(b.cls)
        conf   = float(b.conf)
        x1, y1, x2, y2 = map(float, b.xyxy[0].tolist())
        rows.append({
            "image_path": img_path,
            "model": "yolo",
            "class_id": cls_id,
            "class_name": model.names.get(cls_id, str(cls_id)),
            "conf": conf,
            "x1": x1, "y1": y1, "x2": x2, "y2": y2,
            "width": w, "height": h,
            "inf_ms": inf_ms
        })
    if len(rows) == rows_before:
        # This image contributes no rows, so it will be absent from any
        # per-image statistic derived from the table.
        n_images_without_boxes += 1

# Passing columns keeps the schema intact even if nothing was detected at all.
df = pd.DataFrame(rows, columns=DETECTION_COLUMNS)
print(f"Images processed: {n_images} ({n_images_without_boxes} without detections)")
print("Detections table:", df.shape)
display(df.head())


Outputs -> /Users/brandonbyrd/Documents/Big Data/Big Data Project/runs/yolo_test/trial_run
Results saved to [1m/Users/brandonbyrd/Documents/Big Data/Big Data Project/runs/yolo_test/trial_run[0m
99 labels saved to /Users/brandonbyrd/Documents/Big Data/Big Data Project/runs/yolo_test/trial_run/labels
Detections table: (696, 12)


Unnamed: 0,image_path,model,class_id,class_name,conf,x1,y1,x2,y2,width,height,inf_ms
0,/Users/brandonbyrd/Documents/Big Data/Big Data...,yolo,2,car,0.833985,94.38913,167.853546,168.443298,219.774567,640,480,88.661248
1,/Users/brandonbyrd/Documents/Big Data/Big Data...,yolo,7,truck,0.686377,198.053925,127.238297,289.849609,210.586868,640,480,88.661248
2,/Users/brandonbyrd/Documents/Big Data/Big Data...,yolo,5,bus,0.571295,390.046021,152.36879,424.602173,189.992722,640,480,88.661248
3,/Users/brandonbyrd/Documents/Big Data/Big Data...,yolo,2,car,0.539656,2.976372,138.293793,103.015976,243.879303,640,480,88.661248
4,/Users/brandonbyrd/Documents/Big Data/Big Data...,yolo,2,car,0.4912,300.276611,169.153076,317.732544,187.916199,640,480,88.661248


In [6]:
pip install "pandas==2.1.4" "numpy<2" --force-reinstall


Collecting pandas==2.1.4
  Using cached pandas-2.1.4-cp310-cp310-macosx_10_9_x86_64.whl.metadata (18 kB)
Collecting numpy<2
  Using cached numpy-1.26.4-cp310-cp310-macosx_10_9_x86_64.whl.metadata (61 kB)
Collecting python-dateutil>=2.8.2 (from pandas==2.1.4)
  Using cached python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting pytz>=2020.1 (from pandas==2.1.4)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.1 (from pandas==2.1.4)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting six>=1.5 (from python-dateutil>=2.8.2->pandas==2.1.4)
  Using cached six-1.17.0-py2.py3-none-any.whl.metadata (1.7 kB)
Using cached pandas-2.1.4-cp310-cp310-macosx_10_9_x86_64.whl (11.7 MB)
Using cached numpy-1.26.4-cp310-cp310-macosx_10_9_x86_64.whl (20.6 MB)
Using cached python_dateutil-2.9.0.post0-py2.py3-none-any.whl (229 kB)
Using cached pytz-2025.2-py2.py3-none-any.whl (509 kB)
Using cached six-1.17.0-py2.py3-none

In [7]:
# Write the detections table as CSV inside the run folder.
out_csv = OUTPUT_DIR / NAME / "yolo_detections.csv"
# Create the run folder if needed so the write does not depend on predict()
# having saved something there first.
out_csv.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_csv, index=False)
print("Saved:", out_csv.resolve())

Saved: /Users/brandonbyrd/Documents/Big Data/Big Data Project/runs/yolo_test/trial_run/yolo_detections.csv


In [8]:
# NOTE: disabled CSV export targeting the same yolo_detections.csv path as the
# active save cell. As written it is gated on SAVE_PARQUET even though it writes a CSV.
# # ---- Save CSV next to the run outputs ----
# if SAVE_PARQUET and len(df):
#     out_csv = OUTPUT_DIR / NAME / "yolo_detections.csv"
#     df.to_csv(out_csv, index=False)
#     print("Saved:", out_csv.resolve())


In [9]:
# NOTE: disabled optional steps -- a Parquet export and an inline preview of a few
# annotated JPGs. If re-enabled, `from IPython.display import Image` would rebind the
# name `Image`, which an earlier cell imports from PIL.
# # ---- Save Parquet next to the run outputs ----
# if SAVE_PARQUET and len(df):
#     out_pq = OUTPUT_DIR / NAME / "yolo_detections.parquet"
#     df.to_parquet(out_pq, index=False)
#     print("Saved:", out_pq.resolve())

# # ---- Preview a few annotated images ----
# from IPython.display import Image, display
# import glob

# for img_path in glob.glob(str(OUTPUT_DIR / NAME / "*.jpg"))[:3]:
#     display(Image(filename=img_path))


In [10]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shutil, glob

# ---------- paths ----------
# Input CSV, the folder holding YOLO's annotated JPGs, and the figure output folder.
CSV_PATH   = Path("/Users/brandonbyrd/Documents/Big Data/Big Data Project/runs/yolo_test/trial_run/yolo_detections.csv")
RUN_IMGDIR = Path("/Users/brandonbyrd/Documents/Big Data/Big Data Project/runs/yolo_test/trial_run")
FIGDIR     = Path("/Users/brandonbyrd/Documents/Big Data/Big Data Project/reports/figures")
FIGDIR.mkdir(parents=True, exist_ok=True)

# ---------- load ----------
# Read the saved detections and append the box-geometry columns in one chain.
# Later entries of assign() may use columns created by earlier ones.
df = pd.read_csv(CSV_PATH).assign(
    bbox_w=lambda d: d["x2"] - d["x1"],
    bbox_h=lambda d: d["y2"] - d["y1"],
    bbox_area=lambda d: d["bbox_w"] * d["bbox_h"],
    img_area=lambda d: d["width"] * d["height"],
    bbox_area_norm=lambda d: d["bbox_area"] / d["img_area"],
)

# One row per image path with the number of detection rows it has.
per_img = (
    df.groupby("image_path")
      .size()
      .reset_index(name="detections")
)

def _label_save_close(fig, ax, xlabel, ylabel, title, filename):
    """Label ``ax``, tighten the layout, save ``fig`` into FIGDIR at 200 dpi and close it."""
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(FIGDIR / filename, dpi=200)
    plt.close(fig)


# ---------- 1) detections per image ----------
fig, ax = plt.subplots()
sns.histplot(per_img["detections"], bins=20, kde=False, ax=ax)
_label_save_close(
    fig, ax,
    "Detections per image", "Count",
    f"Distribution of detections per image (n={len(per_img)})",
    "det_per_image.png",
)

# ---------- 2) confidence distribution ----------
fig, ax = plt.subplots()
sns.histplot(df["conf"], bins=25, kde=True, stat="density", ax=ax)
_label_save_close(
    fig, ax,
    "Confidence", "Density",
    "Detection confidence distribution",
    "confidence_hist.png",
)

# ---------- 3) bbox area (normalized) ----------
# The tail is clipped at 0.25 so the small-object mode stays visible.
fig, ax = plt.subplots()
sns.histplot(df["bbox_area_norm"].clip(upper=0.25), bins=30, ax=ax)
_label_save_close(
    fig, ax,
    "Normalized bbox area (area / image area)", "Count",
    "Normalized bounding box area (clipped at 0.25 for visibility)",
    "bbox_area_norm.png",
)

# ---------- 4) confidence vs bbox area ----------
fig, ax = plt.subplots()
ax.scatter(df["bbox_area_norm"], df["conf"], s=6, alpha=0.4)
_label_save_close(
    fig, ax,
    "Normalized bbox area", "Confidence",
    "Confidence vs. box size",
    "conf_vs_area.png",
)

# ---------- 5) inference latency (per image) ----------
# Every box row of an image repeats that image's inf_ms, so keep one value per image.
lat = df.groupby("image_path")["inf_ms"].first().reset_index()
fig, ax = plt.subplots()
sns.histplot(lat["inf_ms"], bins=25, kde=False, ax=ax)
_label_save_close(
    fig, ax,
    "Per-image inference time (ms)", "Count",
    "YOLOv11n inference time (CPU)",
    "latency_hist.png",
)

# ---------- 6) class distribution (bar) ----------
cls_counts = df["class_name"].value_counts().sort_values(ascending=False)
fig, ax = plt.subplots()
sns.barplot(x=cls_counts.index, y=cls_counts.values, ax=ax)
# Slant the class names so long labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
_label_save_close(
    fig, ax,
    "Class", "Detections",
    "Class frequency",
    "class_bar.png",
)

# ---------- copy a few annotated images from the YOLO run folder ----------
# Take the first three .jpg files (sorted by path) from the run directory and
# copy them into FIGDIR with an "example_" prefix, for use in the paper.
annotated = sorted(glob.glob(str(RUN_IMGDIR / "*.jpg")))[:3]
picked = [FIGDIR / ("example_" + Path(src).name) for src in annotated]
for src, dst in zip(annotated, picked):
    shutil.copy2(src, dst)

# Headline numbers. per_img and lat are both built by grouping df on image_path,
# so only images with at least one detection row are counted here.
summary = dict(
    num_images=len(per_img),
    num_detections=len(df),
    mean_det_per_image=per_img["detections"].mean(),
    median_det_per_image=per_img["detections"].median(),
    mean_latency_ms=lat["inf_ms"].mean(),
)
summary, picked


({'num_images': 99,
  'num_detections': 696,
  'mean_det_per_image': 7.03030303030303,
  'median_det_per_image': 6.0,
  'mean_latency_ms': 88.66124833999945},
 [PosixPath('/Users/brandonbyrd/Documents/Big Data/Big Data Project/reports/figures/example_set00_V000_1245.jpg'),
  PosixPath('/Users/brandonbyrd/Documents/Big Data/Big Data Project/reports/figures/example_set00_V000_1267.jpg'),
  PosixPath('/Users/brandonbyrd/Documents/Big Data/Big Data Project/reports/figures/example_set00_V000_1366.jpg')])