YOLO Cropper + CSV Manifest (Research Pipeline)
-----------------------------------------------
In this section I use a trained YOLO detector to propose lesion crops from full-field
mammograms and persist them to disk, while also writing a structured CSV manifest that
pairs each crop with its source image, label, detector confidence, and metadata.

Why I generate model-derived crops
 - to bootstrap ROI-centric classifiers without manual box annotation
 - to enable fast ablations on ROI-only vs full-image training
 - to create a compact artifact for qualitative review and downstream experiments

 What I output
 - JPEG crops saved under `data/YOLO_CROPS/`
 - a `yolo_crops.csv` manifest with columns:
   `image_path, yolo_cropped_image_path, label, label_name, det_conf, is_fallback,
    laterality_LEFT, laterality_RIGHT, view_CC, view_MLO`

 Design choices
 - optional fallback center-crop when no detections are found (disabled by default)
 - bounding boxes are expanded around their center by a configurable factor (`BOX_EXPAND`, default 1.20) to preserve diagnostic context
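 - per-image I/O (image read, inference, crop write) is guarded: a failure is printed as a `[WARN]` line and that image or crop is skipped, so failures can be traced cleanly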

In [2]:
# === YOLO cropper: run detector on full images, save crops, write a CSV ===
import os
import cv2
import math
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from ultralytics import YOLO

# -------------------- CONFIG --------------------
# Project root: the parent of the current working directory, so this assumes
# the notebook is launched from a subfolder of the project (e.g. notebooks/).
PROJECT_DIR = Path().resolve().parent

DATA_DIR    = PROJECT_DIR / "data"
MODELS_DIR  = PROJECT_DIR / "models"   # change if you stored weights elsewhere

INPUT_CSV   = DATA_DIR / "calc_preprocessed_clean.csv"   # input manifest of full images
WEIGHTS_PATH = MODELS_DIR / "yolo_best_model.pt"         # YOLO detector weights
OUTPUT_DIR  = DATA_DIR / "YOLO_CROPS"                    # crops are written into this folder
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)            # created eagerly, as a side effect of running the config
OUTPUT_CSV  = OUTPUT_DIR / "yolo_crops.csv"              # output CSV file with crop paths + labels

# Debug prints to verify paths (informational only; nothing is enforced here)
print("CSV exists:", INPUT_CSV.exists())
print("Weights exist:", WEIGHTS_PATH.exists())
print("Output dir:", OUTPUT_DIR)

CONF_THRES = 0.25       # YOLO confidence threshold
IOU_THRES = 0.5         # NMS IoU threshold
MAX_DETS_PER_IMAGE = 10 # limit number of saved crops per image
BOX_EXPAND = 1.20       # expand each detected box by this factor for context (1.0 = no expand)
DEVICE = 0              # set to 0 for GPU, or "cpu" if no GPU

# If you want to save *at least* one fallback crop when YOLO finds nothing:
SAVE_FALLBACK = False   # True/False
FALLBACK_SIZE = 512     # pixel size of square center crop if no detections and SAVE_FALLBACK=True

# ------------------------------------------------

def ensure_dir(p: Path) -> Path:
    """Make sure the directory `p` exists and hand the same object back.

    Missing parent directories are created as well, and an already existing
    directory is not an error. Returning `p` lets callers write
    `out = ensure_dir(some_path)` in a single expression.
    """
    p.mkdir(exist_ok=True, parents=True)
    return p

def expand_box(x1, y1, x2, y2, scale, W, H):
    """Expand an [x1, y1, x2, y2] box around its center by `scale`, clamped to the image.

    The returned box is meant for NumPy slicing (`img[ny1:ny2, nx1:nx2]`), so
    `nx2`/`ny2` are *exclusive* ends: they may equal `W`/`H`, which lets a crop
    reach the last column/row of the image.

    Args:
        x1, y1, x2, y2: Box corners in pixel coordinates (floats are fine).
        scale: Multiplier applied to the box width and height (1.0 = no change).
        W, H: Image width and height in pixels; both must be at least 1.

    Returns:
        Tuple `(nx1, ny1, nx2, ny2)` of ints with `0 <= nx1 < nx2 <= W` and
        `0 <= ny1 < ny2 <= H`, i.e. always a non-empty region inside the image.

    Raises:
        ValueError: If `W` or `H` is smaller than 1 (no valid crop exists).
    """
    if W < 1 or H < 1:
        raise ValueError(f"Image size must be at least 1x1, got W={W}, H={H}")

    cx = (x1 + x2) / 2.0
    cy = (y1 + y2) / 2.0
    half_w = (x2 - x1) * scale / 2.0
    half_h = (y2 - y1) * scale / 2.0

    # Start coordinates are clamped at 0; end coordinates are exclusive slice
    # bounds, so the largest legal value is W / H (not W - 1 / H - 1).
    nx1 = max(0, int(round(cx - half_w)))
    ny1 = max(0, int(round(cy - half_h)))
    nx2 = min(W, int(round(cx + half_w)))
    ny2 = min(H, int(round(cy + half_h)))

    # A box that lies entirely past the right/bottom edge is pulled back onto
    # the last pixel, and every box is forced to span at least one pixel.
    nx1, ny1 = min(nx1, W - 1), min(ny1, H - 1)
    nx2, ny2 = max(nx2, nx1 + 1), max(ny2, ny1 + 1)
    return nx1, ny1, nx2, ny2

def safe_imread(p: str):
    """Load the image at path `p` with `cv2.imread`, raising instead of returning None.

    `cv2.imread` signals failure by returning None; this wrapper converts that
    into a `FileNotFoundError` carrying the path so the caller can log it.
    """
    loaded = cv2.imread(p)
    if loaded is not None:
        return loaded
    raise FileNotFoundError(f"Failed to read image: {p}")

def save_crop(img, box_xyxy, save_path: Path):
    """Slice the pixel box out of `img`, write it to `save_path`, return the path as str.

    `box_xyxy` is (x1, y1, x2, y2); each value is truncated to int and used as
    a slice bound, so x2/y2 are exclusive. The parent directory of `save_path`
    is created if needed. Raises IOError when `cv2.imwrite` reports failure.
    """
    left, top, right, bottom = (int(v) for v in box_xyxy)
    region = img[top:bottom, left:right]

    # Make sure the destination folder exists before writing.
    save_path.parent.mkdir(parents=True, exist_ok=True)

    out_file = str(save_path)
    if cv2.imwrite(out_file, region):
        return out_file
    raise IOError(f"Failed to write crop: {save_path}")

def label_name_from_int(v: int) -> str:
    """Return the manifest's human-readable class name for a numeric label.

    A value whose integer form is 1 maps to "malignant"; every other value
    maps to "benign".
    """
    if int(v) == 1:
        return "malignant"
    return "benign"

# Load model. The config section only *prints* whether the weights exist, so
# fail fast here with a clear message instead of relying on the loader's error.
if not WEIGHTS_PATH.exists():
    raise FileNotFoundError(f"YOLO weights not found: {WEIGHTS_PATH}")
model = YOLO(WEIGHTS_PATH)

# Read input manifest of full images (and metadata)
df = pd.read_csv(INPUT_CSV)

# Column checks: the cropping loop reads the image path, the label, and the
# one-hot laterality/view columns for every row, so all of them are required.
# Checking up front avoids a KeyError in the middle of the run.
required_cols = {
    "image_path",
    "label",
    "laterality_LEFT",
    "laterality_RIGHT",
    "view_CC",
    "view_MLO",
}
missing = required_cols - set(df.columns)
if missing:
    raise ValueError(f"CSV is missing required columns: {missing}. Present: {list(df.columns)}")

rows_out = []  # collect output rows (one dict per saved crop)
OUTPUT_DIR = ensure_dir(OUTPUT_DIR)

# Iterate over each full image and generate 0..K crops based on YOLO detections.

# Output-name stem -> first source image that used it. Crop file names are
# derived from the source file's basename, so two *different* source files
# that share a basename would otherwise overwrite each other's crops.
stem_owner = {}

for idx, row in tqdm(df.iterrows(), total=len(df), desc="YOLO cropping"):
    full_path = str(row["image_path"])

    # Parse the label and the laterality/view metadata carried into the output
    # CSV. A bad value (e.g. NaN) skips this row instead of aborting the run.
    try:
        lbl = int(row["label"])
        row_common = {
            "image_path": full_path,
            "label": lbl,
            "label_name": label_name_from_int(lbl),
            "laterality_LEFT": int(row["laterality_LEFT"]),
            "laterality_RIGHT": int(row["laterality_RIGHT"]),
            "view_CC": int(row["view_CC"]),
            "view_MLO": int(row["view_MLO"]),
        }
    except (TypeError, ValueError) as e:
        print(f"[WARN] Skipping {full_path}: invalid label/metadata ({e})")
        continue

    # Load the full image
    try:
        img = safe_imread(full_path)
    except Exception as e:
        print(f"[WARN] {e}")
        continue

    H, W = img.shape[:2]

    # Pick the file-name stem for this image's crops; disambiguate with the
    # row index only when another source file already claimed the same stem.
    stem = Path(full_path).stem
    if stem_owner.setdefault(stem, full_path) != full_path:
        stem = f"{stem}_row{idx}"

    # Run the detector to get proposed boxes (class-agnostic here). The
    # already-decoded BGR array is passed in so the image is not read from
    # disk a second time and the boxes refer to exactly the pixels in `img`.
    try:
        results = model.predict(
            source=img,
            conf=CONF_THRES,
            iou=IOU_THRES,
            max_det=MAX_DETS_PER_IMAGE,
            verbose=False,
            device=DEVICE,
        )
    except Exception as e:
        print(f"[WARN] Inference failed on {full_path}: {e}")
        continue

    # Parse detections (may be empty)
    if len(results) == 0 or results[0].boxes is None or results[0].boxes.data is None:
        det_boxes = []
        det_confs = []
    else:
        b = results[0].boxes
        # b.xyxy (n,4), b.conf (n,)
        det_boxes = b.xyxy.cpu().numpy() if hasattr(b.xyxy, "cpu") else b.xyxy.numpy()
        det_confs = b.conf.cpu().numpy() if hasattr(b.conf, "cpu") else b.conf.numpy()

    # Optional fallback: save a center crop if no detection (disabled by default)
    if len(det_boxes) == 0 and SAVE_FALLBACK:
        side = min(FALLBACK_SIZE, min(W, H))
        cx, cy = W // 2, H // 2
        x1 = max(0, cx - side // 2)
        y1 = max(0, cy - side // 2)
        x2 = min(W, x1 + side)
        y2 = min(H, y1 + side)
        # adjust in case we hit bounds
        x1 = max(0, x2 - side)
        y1 = max(0, y2 - side)

        save_path = OUTPUT_DIR / f"{stem}_fallback.jpg"
        try:
            saved = save_crop(img, (x1, y1, x2, y2), save_path)
            rows_out.append({
                **row_common,
                "yolo_cropped_image_path": saved,
                "det_conf": None,       # no detector score for a fallback crop
                "is_fallback": True,
            })
        except Exception as e:
            print(f"[WARN] Failed to save fallback crop for {full_path}: {e}")

    # Save each detected crop (boxes are expanded slightly to retain context)
    for k, (xyxy, conf) in enumerate(zip(det_boxes, det_confs)):
        x1, y1, x2, y2 = map(float, xyxy)
        ex1, ey1, ex2, ey2 = expand_box(x1, y1, x2, y2, BOX_EXPAND, W, H)

        save_path = OUTPUT_DIR / f"{stem}_det{k+1}_c{conf:.2f}.jpg"

        try:
            saved = save_crop(img, (ex1, ey1, ex2, ey2), save_path)
            rows_out.append({
                **row_common,
                "yolo_cropped_image_path": saved,
                "det_conf": float(conf),
                "is_fallback": False,
            })
        except Exception as e:
            print(f"[WARN] Failed to save crop for {full_path} det#{k+1}: {e}")

# Write the manifest to disk: one row per saved crop, with a fixed column
# order so downstream code can rely on the layout even when no crops exist.
manifest_columns = [
    "image_path", "yolo_cropped_image_path",
    "label", "label_name",
    "det_conf", "is_fallback",
    "laterality_LEFT", "laterality_RIGHT",
    "view_CC", "view_MLO",
]
out_df = pd.DataFrame(rows_out, columns=manifest_columns)
out_df.to_csv(OUTPUT_CSV, index=False)
print(f"[DONE] Saved {len(out_df)} YOLO crops to {OUTPUT_DIR} and manifest to {OUTPUT_CSV}")

CSV exists: True
Weights exist: True
Output dir: C:\Users\PC\Desktop\final project\CBIS-DDSM\calc\YOLO model\data\YOLO_CROPS


YOLO cropping: 100%|██████████| 1871/1871 [04:50<00:00,  6.44it/s]

[DONE] Saved 1609 YOLO crops to C:\Users\PC\Desktop\final project\CBIS-DDSM\calc\YOLO model\data\YOLO_CROPS and manifest to C:\Users\PC\Desktop\final project\CBIS-DDSM\calc\YOLO model\data\YOLO_CROPS\yolo_crops.csv



