In [2]:
import os,random,csv
from pathlib import Path
from PIL import Image

In [None]:
# Seed the global RNG once so the shuffles performed later start from a fixed state.
random.seed(42)

# Per-class input folders and the folder that receives the processed copies.
MED_ROOT = Path("datasets//medical")
NON_ROOT = Path("datasets//non_medical")
OUT_ROOT = Path("datasets//clean_data")

# Lower-case file suffixes that count as images.
IMG_EXTS = {
    ".jpg",
    ".jpeg",
    ".png",
    ".bmp",
    ".tif",
    ".tiff",
}

# Fractions for (train, val, test), unpacked in that order by main().
SPLITS = (0.8, 0.1, 0.1)

# Size passed to Image.resize for every saved image.
TARGET_SIZE = (224, 224)

# When true, images are converted to RGB mode before resizing.
CONVERT_TO_RGB = True

In [6]:
def list_images(root: Path, exts=None):
    """Return the image files found anywhere under ``root``, in sorted order.

    Args:
        root: Directory to search recursively.
        exts: Optional collection of lower-case suffixes (with the leading
            dot) to accept. Defaults to the module-level ``IMG_EXTS``.

    Returns:
        A sorted list of ``Path`` objects. Sorting makes the result
        independent of the order in which the filesystem yields entries, so
        a seeded shuffle applied afterwards gives the same result on every
        run and machine.
    """
    wanted = IMG_EXTS if exts is None else exts
    return sorted(
        p
        for p in root.rglob("*")
        # The suffix test is cheap and runs first; is_file() then rejects
        # directories that merely have an image-like name.
        if p.suffix.lower() in wanted and p.is_file()
    )

def ensure_dir(p: Path):
    """Create directory ``p`` and any missing parents; do nothing if it already exists."""
    p.mkdir(exist_ok=True, parents=True)

def save_img(src: Path, dst: Path):
    """Load the image at ``src``, normalise it, and write the result to ``dst``.

    The image is converted to RGB when ``CONVERT_TO_RGB`` is true, then
    resized to ``TARGET_SIZE`` with bilinear resampling. The output format
    is whatever ``Image.save`` selects for ``dst``.

    Args:
        src: Path of the image to read.
        dst: Path of the file to write; its parent directory must exist.
    """
    # Opening inside a ``with`` block closes the source file even when one of
    # the steps below raises, so handles do not pile up over a large batch.
    with Image.open(src) as source:
        img = source.convert("RGB") if CONVERT_TO_RGB else source
        img = img.resize(TARGET_SIZE, Image.BILINEAR)
        img.save(dst)

def split_indices(n, train, val, test):
    """Shuffle the indices ``0..n-1`` and cut them into three consecutive groups.

    Args:
        n: Number of items to index.
        train: Fraction of ``n`` for the first group (size truncated to an int).
        val: Fraction of ``n`` for the second group (size truncated to an int).
        test: Accepted for symmetry but not read; the third group is simply
            whatever remains after the first two.

    Returns:
        A tuple ``(train_idx, val_idx, test_idx)`` of index lists.
    """
    order = list(range(n))
    random.shuffle(order)
    train_end = int(train * n)
    val_end = train_end + int(val * n)
    return order[:train_end], order[train_end:val_end], order[val_end:]

def write_split(files, split, cls):
    """Process ``files`` and save them into ``OUT_ROOT / split / cls``.

    Each file keeps its own name unless that name was already used earlier
    in the same call. Sources from different folders can share a file name,
    and writing them all under the bare name would overwrite earlier images
    silently. On a clash, ``_1``, ``_2``, ... is appended to the stem.

    Args:
        files: Iterable of source image paths.
        split: Name of the split sub-directory (for example ``"train"``).
        cls: Name of the class sub-directory (for example ``"medical"``).
    """
    out_dir = OUT_ROOT / split / cls
    ensure_dir(out_dir)

    # Names are tracked lower-cased so a clash is also caught on
    # case-insensitive filesystems.
    taken = set()
    for f in files:
        name = f.name
        counter = 1
        while name.lower() in taken:
            name = f"{f.stem}_{counter}{f.suffix}"
            counter += 1
        taken.add(name.lower())
        save_img(f, out_dir / name)

def write_manifest(split):
    """Write ``OUT_ROOT / "<split>_manifest.csv"`` listing the images of one split.

    The CSV has a ``path,label`` header followed by one row per image file
    found under ``OUT_ROOT / split / <class>``. The path is absolute and
    the label is 1 for ``medical`` and 0 for ``non_medical``. Rows are
    sorted by path within each class so the file is identical between runs.

    NOTE: the manifest is built by scanning the output folders, so any image
    already present there (for instance from an earlier run) is listed too.

    Args:
        split: Name of the split sub-directory (for example ``"train"``).
    """
    rows = []
    for cls, label in [("medical", 1), ("non_medical", 0)]:
        class_dir = OUT_ROOT / split / cls
        images = sorted(
            p
            for p in class_dir.rglob("*")
            if p.suffix.lower() in IMG_EXTS and p.is_file()
        )
        rows.extend([str(p.resolve()), label] for p in images)

    # An explicit encoding keeps non-ASCII paths writable regardless of the
    # platform's default locale encoding.
    manifest_path = OUT_ROOT / f"{split}_manifest.csv"
    with open(manifest_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["path", "label"])
        writer.writerows(rows)

def split_and_map(files, train, val, test):
    """Randomly partition ``files`` into train, validation, and test lists.

    The index groups come from ``split_indices``; each group is then mapped
    back to the corresponding entries of ``files``.

    Args:
        files: Indexable sequence of items to partition.
        train: Fraction forwarded to ``split_indices`` for the first group.
        val: Fraction forwarded to ``split_indices`` for the second group.
        test: Forwarded to ``split_indices`` unchanged.

    Returns:
        A tuple of three lists: ``(train_files, val_files, test_files)``.
    """
    index_groups = split_indices(len(files), train, val, test)
    return tuple([files[i] for i in group] for group in index_groups)

def main():
    """Build a class-balanced train/val/test copy of the two image folders.

    Steps:
      1. Collect images under ``MED_ROOT`` and ``NON_ROOT``.
      2. Shuffle each list and truncate both to the size of the smaller one,
         so both classes contribute the same number of images.
      3. Partition each class using the fractions in ``SPLITS``.
      4. Write the processed images under ``OUT_ROOT / split / class``.
      5. Write one manifest CSV per split.

    Raises:
        RuntimeError: If either input folder yields no images.
    """
    med_files = list_images(MED_ROOT)
    non_files = list_images(NON_ROOT)
    if not med_files or not non_files:
        raise RuntimeError("No images found. Check MED_ROOT and NON_ROOT.")

    # Balance the classes by keeping a random subset of the larger one.
    n = min(len(med_files), len(non_files))
    random.shuffle(med_files)
    med_files = med_files[:n]
    random.shuffle(non_files)
    non_files = non_files[:n]
    print(f"[info] Using {n} medical and {n} non_medical images.")

    train, val, test = SPLITS
    med_parts = split_and_map(med_files, train, val, test)
    non_parts = split_and_map(non_files, train, val, test)

    split_names = ["train", "val", "test"]
    for name, med_part, non_part in zip(split_names, med_parts, non_parts):
        print(f"[write] {name}...")
        write_split(med_part, name, "medical")
        write_split(non_part, name, "non_medical")

    for name in split_names:
        write_manifest(name)

    # Report the real output location instead of a hard-coded folder name.
    print(f"[done] Cleaned dataset is under {OUT_ROOT}/(train|val|test)/(medical|non_medical)")
if __name__ == "__main__":
    main()

[info] Using 3707 medical and 3707 non_medical images.
[write] train...
[write] val...
[write] test...
[done] Cleaned dataset is under data_clean/(train|val|test)/(medical|non_medical)
