### Data Quality Check

This Python notebook combs through the available training data to identify data issues, including misclassified/mislabelled samples, and removes (quarantines) training samples that prevent effective training of the YOLOv5 model due to exploding gradients.

#### Import Statements

In [47]:
import os
import yaml
import shutil
import cv2

#### Configuration Parameters

In [None]:
# Where rejected files are moved to, and the dataset that is being audited
quarantine_root = "../quarantine"
dataset_root = "../data"

# Dataset splits (sub-folders of dataset_root) to process
splits = ["train"]
# Image extensions tried when pairing an image with a label file
exts = [".jpg", ".jpeg", ".png"]
# Boxes whose label width or height is below this value are reported as tiny
min_box_size = 0.01

#### Helper Functions

In [None]:
# Load data.yaml and create class mapping
with open(os.path.join(dataset_root, "data.yaml"), "r", encoding="utf-8") as f:
    data_cfg = yaml.safe_load(f)

# `names` may be a plain list (position = class ID) or a {class_id: name}
# mapping; normalise both to (class_id, name) pairs so the lookup below
# works for either layout.
names_cfg = data_cfg["names"]
if isinstance(names_cfg, dict):
    id_name_pairs = sorted((int(cid), name) for cid, name in names_cfg.items())
else:
    id_name_pairs = list(enumerate(names_cfg))

class_names = [name for _, name in id_name_pairs]
num_classes = len(class_names)

# Map plural and singular forms to same ID (accounts for filename inconsistencies)
class_lookup = {}
for idx, name in id_name_pairs:
    name_lower = str(name).lower().strip()
    class_lookup[name_lower] = idx
    # "ants" -> "ant", but leave words such as "grass" untouched
    if name_lower.endswith("s") and not name_lower.endswith("ss"):
        class_lookup[name_lower[:-1]] = idx

# Alias for the "catterpillar" filename prefix; only registered when the
# target class exists, so the lookup never stores a None class ID.
if "caterpillars" in class_lookup:
    class_lookup["catterpillar"] = class_lookup["caterpillars"]

In [None]:
def get_expected_class_from_filename(fname):
    """Extract the expected class prefix from a filename (before '-').

    Args:
        fname: File name or path (str or os.PathLike). Both '/' and '\\'
            are accepted as directory separators.

    Returns:
        The lower-cased, stripped text that precedes the first '-' in the
        file's base name (extension removed). If the name contains no '-',
        the whole extension-less base name is returned.
    """
    # Normalise separators *before* taking the basename: on POSIX,
    # os.path.basename does not treat '\\' as a separator, so a Windows-style
    # path would otherwise keep its directory components in the prefix.
    normalized = os.fspath(fname).replace("\\", "/")
    stem = os.path.splitext(os.path.basename(normalized))[0]
    return stem.split("-")[0].strip().lower()

def load_label_classes(label_file):
    """Collect the distinct class IDs found in a YOLO .txt label file.

    The first whitespace-separated token of every non-blank line is parsed as
    an integer class ID; lines whose first token is not an integer are
    skipped. A missing file yields an empty set.
    """
    if not os.path.exists(label_file):
        return set()

    found_ids = set()
    with open(label_file, "r") as handle:
        for raw_line in handle:
            tokens = raw_line.split()
            if not tokens:
                continue
            try:
                class_id = int(tokens[0])
            except ValueError:
                continue
            found_ids.add(class_id)
    return found_ids

def find_image(label_file, images_dir):
    """Return the path of the image in images_dir that shares label_file's base name.

    Each extension in `exts` is tried in order; the first existing path is
    returned, or None when no candidate exists.
    """
    stem, _ = os.path.splitext(os.path.basename(label_file))
    candidates = (os.path.join(images_dir, stem + ext) for ext in exts)
    return next((path for path in candidates if os.path.exists(path)), None)

def find_label(image_file, labels_dir):
    """Return the .txt label path in labels_dir matching image_file's base name, or None if absent."""
    stem, _ = os.path.splitext(os.path.basename(image_file))
    candidate = os.path.join(labels_dir, stem + ".txt")
    if os.path.exists(candidate):
        return candidate
    return None

def check_label_file(label_file, images_dir):
    """Validate one YOLO label file and the image paired with it.

    Returns:
        (bad, tiny_boxes) where `bad` is True when the file has no non-blank
        lines, any line is malformed (not 5 fields, non-numeric values, class
        ID outside [0, num_classes), or x/y/w/h outside the accepted 0..1
        ranges), no matching image is found, or the image cannot be read.
        `tiny_boxes` lists every parsed (cls, x, y, w, h) whose width or
        height is below `min_box_size`.
    """
    with open(label_file) as handle:
        rows = [text for text in (raw.strip() for raw in handle) if text]

    tiny_boxes = []
    if not rows:
        # An empty label file is treated as bad
        return True, tiny_boxes

    bad = False
    for row in rows:
        fields = row.split()
        if len(fields) != 5:
            bad = True
            continue
        try:
            cls = int(fields[0])
            x, y, w, h = (float(value) for value in fields[1:])
        except ValueError:
            bad = True
            continue

        class_ok = 0 <= cls < num_classes
        # Centre may sit on the border (0 or 1); width/height must be strictly positive
        coords_ok = 0 <= x <= 1 and 0 <= y <= 1 and 0 < w <= 1 and 0 < h <= 1
        if not class_ok:
            bad = True
        if not coords_ok:
            bad = True

        if w < min_box_size or h < min_box_size:
            tiny_boxes.append((cls, x, y, w, h))

    img_path = find_image(label_file, images_dir)
    if not img_path:
        return True, tiny_boxes

    # cv2.imread yields None for unreadable files
    img = cv2.imread(img_path)
    if img is None:
        return True, tiny_boxes
    if img.shape[0] == 0 or img.shape[1] == 0:
        return True, tiny_boxes

    return bad, tiny_boxes

def validate_expected_class(image_path, label_path):
    """Check that the label file contains the class implied by the image's filename prefix.

    Returns False when the prefix is not present in `class_lookup` (or maps
    to None); otherwise returns whether the mapped class ID appears among the
    class IDs read from label_path.
    """
    prefix = get_expected_class_from_filename(image_path)
    expected_id = class_lookup.get(prefix)
    if expected_id is None:
        return False
    return expected_id in load_label_classes(label_path)

#### Quarantine Run

In [None]:
def _move_to_quarantine(src_path):
    """Move src_path under quarantine_root, mirroring its path relative to dataset_root.

    Returns the path of the file relative to dataset_root.
    """
    rel_path = os.path.relpath(src_path, dataset_root)
    dest_path = os.path.join(quarantine_root, rel_path)
    os.makedirs(os.path.dirname(dest_path), exist_ok=True)
    shutil.move(src_path, dest_path)
    return rel_path


for split in splits:
    print(f"\n Processing {split} dataset")
    labels_dir = os.path.join(dataset_root, split, "labels")
    images_dir = os.path.join(dataset_root, split, "images")

    # Number of individual files moved; a rejected sample contributes two
    # (its label and its image), matching the "files moved" summary below.
    quarantined_count = 0

    # Quarantine images with missing labels
    for path, _, files in os.walk(images_dir):
        for f in files:
            if not any(f.endswith(ext) for ext in exts):
                continue
            img_path = os.path.join(path, f)
            if find_label(img_path, labels_dir) is not None:
                continue
            rel_img_path = _move_to_quarantine(img_path)
            quarantined_count += 1
            print(f"No label for image quarantined: {rel_img_path}")

    # Validate label files and expected class
    for path, _, files in os.walk(labels_dir):
        for f in files:
            if not f.endswith(".txt"):
                continue

            full_label = os.path.join(path, f)
            img_path = find_image(full_label, images_dir)
            if img_path is None:
                # Label without a matching image: left in place
                continue

            bad, tiny_boxes = check_label_file(full_label, images_dir)
            has_expected_class = validate_expected_class(img_path, full_label)

            if not (bad or tiny_boxes or not has_expected_class):
                continue

            # Quarantine label
            _move_to_quarantine(full_label)
            quarantined_count += 1

            # Quarantine image
            if os.path.exists(img_path):
                _move_to_quarantine(img_path)
                quarantined_count += 1

            if not has_expected_class:
                print(f"Expected class missing. Quarantined: {split}/{f}")

            for cls, x, y, w, h in tiny_boxes:
                print(f"Tiny box quarantined in {split}/{f}: class={cls}, x={x}, y={y}, w={w}, h={h}")

            if bad and not tiny_boxes:
                print(f"Bad label quarantined: {split}/{f}")

    print(f"{quarantined_count} files moved to quarantine from {split}.")


Processing train dataset
Expected class missing. Quarantined: train/ants-1-_jpg.rf.8227b4d5f1cbbd72c290c1ca6012a337.txt
Expected class missing. Quarantined: train/ants-1-_jpg.rf.cb38695892dc2ca4d3ee97d3f8a06ba7.txt
Expected class missing. Quarantined: train/ants-1-_jpg.rf.ef0ce7e104418cd6d0e0f57c45de1f35.txt
Expected class missing. Quarantined: train/ants-43-_jpg.rf.5c01ce87bf9d12aeb114c5b660d01bcc.txt
Expected class missing. Quarantined: train/ants-43-_jpg.rf.8c614663f0ab9bb2857426408333970f.txt
Expected class missing. Quarantined: train/ants-43-_jpg.rf.fe6e6f03ee483c3d50ae4a46294bc2ae.txt
Expected class missing. Quarantined: train/ants-44-_jpg.rf.4882d220b4e0699c42a1b408855d7b10.txt
Expected class missing. Quarantined: train/ants-44-_jpg.rf.998f5b8fb25ff0c4a7a1a303b5e62032.txt
Expected class missing. Quarantined: train/ants-44-_jpg.rf.f5e9eae2086f547bfcac9d2791dbcb3e.txt
Expected class missing. Quarantined: train/ants-442-_jpg.rf.79ac8a62d66f98caf11a3b661a647b95.txt
Expected class m