In [None]:
# for downloading the dataset
!pip install roboflow

import os

from roboflow import Roboflow

# Read the API key from the environment so the secret never lands in version
# control. NOTE(review): a key was previously hard-coded on this line and has
# been committed; it should be revoked/rotated in the Roboflow dashboard.
api_key = os.environ.get("ROBOFLOW_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the ROBOFLOW_API_KEY environment variable before running this cell"
    )

# Download version 25 of the buoy/boat project in the "yolo26" export format.
rf = Roboflow(api_key=api_key)
project = rf.workspace("autoboat-at-virginia-tech").project("buoy-and-boat-cv-h8o5p")
version = project.version(25)
dataset = version.download("yolo26")

In [None]:
# Augment buoy hue: randomly jitter HSV values inside buoy bounding boxes only.
# NOTE: this cell was generated with ChatGPT; it works as-is.

import argparse
import cv2
import numpy as np
import random
from pathlib import Path
import hashlib

# ================= CONFIG =================
# Anchor dataset paths to the script location. `__file__` is not defined when
# this code runs as a notebook cell, so fall back to the working directory.
try:
    ROOT = Path(__file__).resolve().parent
except NameError:
    ROOT = Path.cwd()
SPLITS = ["train", "valid", "test"]  # All splits to process

DEFAULT_OUT_BASE = ROOT  # Output base directory
DEFAULT_LOG_DIR = ROOT / "regression_logs"

BUOY_CLASS_ID = 1  # label rows whose class equals this id are augmented
VALID_EXTS = {".jpg", ".jpeg", ".png"}  # compared against the lower-cased suffix

# ---- Tight Maritime HSV (SAFE) ----
# HSV_H is a fraction that apply_hsv multiplies by 180, i.e. OpenCV's 8-bit
# hue scale (0-179), not degrees on the colour wheel.
HSV_H = 0.12   # hue shift of ±0.12 × 180 ≈ ±22 OpenCV hue units
HSV_S = 0.25   # saturation gain drawn from [0.75, 1.25] (±25%)
HSV_V = 0.15   # value gain drawn from [0.85, 1.15] (±15%)

HSV_PROB = 0.7  # probability that a given buoy box is jittered

# -----------------------------------------
def parse_args(argv=None):
    """Parse the command-line options for the augmentation run.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, as before.

    Returns:
        argparse.Namespace with ``out_base`` and ``log_dir`` as ``Path`` objects.
    """
    parser = argparse.ArgumentParser(description="Buoy-only augmentation for all splits")
    parser.add_argument("--out-base", type=Path, default=DEFAULT_OUT_BASE, help="Output base directory")
    parser.add_argument("--log-dir", type=Path, default=DEFAULT_LOG_DIR, help="Regression log directory")

    # Jupyter/Colab kernels leave their own flags (e.g. "-f <connection file>")
    # in sys.argv; a strict parse_args() would exit on them. Tolerate unknown
    # arguments but report them so a mistyped option is still visible.
    args, ignored = parser.parse_known_args(argv)
    if ignored:
        print(f"[warn] ignoring unrecognized arguments: {ignored}")
    return args

# =========================================

def load_labels(path):
    """Read a YOLO label file into a list of ``[class, x, y, w, h]`` float rows.

    Lines with fewer than five whitespace-separated fields, or whose first
    five fields are not all parseable as floats, are skipped. Any fields past
    the fifth (e.g. a confidence column) are discarded.
    """
    rows = []
    with open(path) as handle:
        for raw in handle:
            # Truncate first: a line with >= 5 fields always yields exactly 5.
            fields = raw.split()[:5]
            if len(fields) != 5:
                continue
            try:
                row = [float(token) for token in fields]
            except ValueError:
                continue
            rows.append(row)
    return rows

def save_labels(path, labels):
    """Write YOLO label rows to *path*, one row per line.

    Args:
        path: Destination text file; it is overwritten if it already exists.
        labels: Iterable of rows shaped ``[class_id, x, y, w, h]``.

    The class id is written as an integer (``1``, not ``1.000000``) because
    YOLO label files carry an integer class index; the remaining values are
    written with six decimal places. Empty rows are skipped.
    """
    with open(path, "w") as f:
        for row in labels:
            if len(row) == 0:
                continue
            cls, *coords = row
            fields = [str(int(cls))]
            fields.extend(f"{value:.6f}" for value in coords)
            f.write(" ".join(fields) + "\n")

def apply_hsv(roi):
    """Return a colour-jittered copy of a BGR image patch.

    The patch is converted to HSV, the hue is shifted by a random offset of up
    to ``HSV_H * 180`` (wrapping modulo 180), saturation and value are each
    scaled by a random gain within ``1 ± HSV_S`` / ``1 ± HSV_V`` and clipped
    to [0, 255], and the result is converted back to BGR.
    Assumes an 8-bit patch (the result is cast to uint8) -- TODO confirm.
    """
    pixels = cv2.cvtColor(roi, cv2.COLOR_BGR2HSV).astype(np.float32)

    # Draw in hue -> saturation -> value order so the RNG sequence is stable.
    hue_shift = random.uniform(-HSV_H, HSV_H) * 180
    sat_gain = random.uniform(1 - HSV_S, 1 + HSV_S)
    val_gain = random.uniform(1 - HSV_V, 1 + HSV_V)

    pixels[..., 0] = np.mod(pixels[..., 0] + hue_shift, 180)
    pixels[..., 1] = np.clip(pixels[..., 1] * sat_gain, 0, 255)
    pixels[..., 2] = np.clip(pixels[..., 2] * val_gain, 0, 255)

    as_bytes = pixels.astype(np.uint8)
    return cv2.cvtColor(as_bytes, cv2.COLOR_HSV2BGR)

def buoy_only_augment(img, labels):
    """Jitter the colour of buoy boxes in a copy of *img*.

    Each label row is ``(class, x_center, y_center, width, height)`` with the
    box given as fractions of the image size. Rows whose class is not
    ``BUOY_CLASS_ID`` are ignored, as are boxes narrower or shorter than
    8 pixels after clamping to the image. Every remaining box is passed
    through ``apply_hsv`` with probability ``HSV_PROB``.

    Returns:
        ``(augmented_image, boxes)`` where ``boxes`` lists the pixel
        rectangles ``(x1, y1, x2, y2)`` of all eligible buoy boxes, whether or
        not the jitter was actually applied to them.
    """
    height, width = img.shape[:2]
    canvas = img.copy()
    eligible = []

    for row in labels:
        cls, cx, cy, box_w, box_h = row
        if int(cls) != BUOY_CLASS_ID:
            continue

        # Normalised centre/size -> pixel corners, clamped to the image.
        left = max(0, int((cx - box_w / 2) * width))
        top = max(0, int((cy - box_h / 2) * height))
        right = min(width, int((cx + box_w / 2) * width))
        bottom = min(height, int((cy + box_h / 2) * height))

        too_small = (right - left) < 8 or (bottom - top) < 8
        if too_small:
            continue

        if random.random() < HSV_PROB:
            patch = canvas[top:bottom, left:right]
            canvas[top:bottom, left:right] = apply_hsv(patch)

        eligible.append((left, top, right, bottom))

    return canvas, eligible

def regression_check(original, augmented, buoy_boxes):
    """Return True when *augmented* differs from *original* only inside boxes.

    A pixel counts as changed when any channel differs by more than 2. The
    check fails (returns False) if any changed pixel lies outside every
    ``(x1, y1, x2, y2)`` rectangle in *buoy_boxes*.
    """
    allowed = np.zeros(original.shape[:2], dtype=bool)
    for left, top, right, bottom in buoy_boxes:
        allowed[top:bottom, left:right] = True

    # Widen before subtracting so uint8 differences cannot wrap around.
    delta = np.abs(original.astype(int) - augmented.astype(int))
    touched = (delta > 2).any(axis=2)

    outside_hits = touched & ~allowed
    return not bool(outside_hits.any())

def _write_sample(img, labels, out_img_path, out_lbl_path):
    """Write an image and its label file; return True only if the image was written.

    The label file is written only after the image write succeeds, so the
    output tree never holds a label file without its image.
    """
    if not cv2.imwrite(str(out_img_path), img):
        print(f"[error] failed to write: {out_img_path.name}")
        return False
    save_labels(out_lbl_path, labels)
    return True


def main():
    """Augment buoy regions for every split and write the results.

    For each split in ``SPLITS`` the images under ``ROOT/<split>/images`` are
    paired with ``ROOT/<split>/labels/<stem>.txt`` and written to
    ``<out_base>/<split>_aug/{images,labels}``:

    * images without a buoy label are written out unmodified;
    * images with buoys are passed through ``buoy_only_augment``;
    * if ``regression_check`` finds changes outside the buoy boxes, the
      augmented image is stored in the log directory as
      ``<split>_<stem>_FAIL.jpg`` and the original image is written instead.

    Images with no label file, or that OpenCV cannot read, are skipped and
    counted. A summary line is printed per split and once for all splits.
    """
    args = parse_args()

    out_base = args.out_base.expanduser().resolve()
    log_dir = args.log_dir.expanduser().resolve()
    log_dir.mkdir(parents=True, exist_ok=True)

    counter_names = (
        "total",
        "saved",
        "no_buoys",
        "missing_labels",
        "unreadable",
        "regression_failures",
    )
    grand = dict.fromkeys(counter_names, 0)

    for split in SPLITS:
        img_dir = ROOT / split / "images"
        lbl_dir = ROOT / split / "labels"
        out_img_dir = out_base / f"{split}_aug" / "images"
        out_lbl_dir = out_base / f"{split}_aug" / "labels"

        if not img_dir.exists():
            print(f"[skip] Split '{split}' images dir not found: {img_dir}")
            continue
        if not lbl_dir.exists():
            print(f"[skip] Split '{split}' labels dir not found: {lbl_dir}")
            continue

        out_img_dir.mkdir(parents=True, exist_ok=True)
        out_lbl_dir.mkdir(parents=True, exist_ok=True)

        stats = dict.fromkeys(counter_names, 0)

        # iterdir() yields entries in arbitrary order; sort so the processing
        # order (and therefore the log output) is the same on every run.
        for img_path in sorted(img_dir.iterdir()):
            if not img_path.is_file() or img_path.suffix.lower() not in VALID_EXTS:
                continue

            stats["total"] += 1

            lbl_path = lbl_dir / f"{img_path.stem}.txt"
            if not lbl_path.exists():
                stats["missing_labels"] += 1
                print(f"[skip] missing label for: {img_path.name}")
                continue

            img = cv2.imread(str(img_path))
            if img is None:
                stats["unreadable"] += 1
                print(f"[skip] cannot read image: {img_path}")
                continue

            labels = load_labels(lbl_path)
            has_buoys = any(int(lbl[0]) == BUOY_CLASS_ID for lbl in labels)

            if not has_buoys:
                # Nothing to augment: pass the original pixels through.
                stats["no_buoys"] += 1
                img_out = img
            else:
                # Apply augmentation to buoy regions only
                img_out, buoy_boxes = buoy_only_augment(img, labels)

                if not regression_check(img, img_out, buoy_boxes):
                    stats["regression_failures"] += 1
                    fail_path = log_dir / f"{split}_{img_path.stem}_FAIL.jpg"
                    if not cv2.imwrite(str(fail_path), img_out):
                        print(f"[error] failed to write regression log: {fail_path}")
                    # Keep the sample in the dataset, but un-augmented.
                    img_out = img

            if _write_sample(
                img_out,
                labels,
                out_img_dir / img_path.name,
                out_lbl_dir / lbl_path.name,
            ):
                stats["saved"] += 1

        print(
            f"✅ {split}: total={stats['total']}, saved={stats['saved']}, "
            f"no_buoys={stats['no_buoys']}, missing_labels={stats['missing_labels']}, "
            f"unreadable={stats['unreadable']}, regression_failures={stats['regression_failures']}"
        )

        for name in counter_names:
            grand[name] += stats[name]

    print(
        f"\n🎉 All splits done. total={grand['total']}, saved={grand['saved']}, "
        f"no_buoys={grand['no_buoys']}, missing_labels={grand['missing_labels']}, "
        f"unreadable={grand['unreadable']}, regression_failures={grand['regression_failures']}"
    )


if __name__ == "__main__":
    main()
