# 🧪 MMOTU Dataset Preprocessing Notebook
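This notebook preprocesses the MMOTU dataset for multi-modal ovarian tumor ultrasound segmentation. It normalizes file extensions to lowercase, finds the sample IDs that are present in both the 2D and CEUS sets, converts the paired images and masks to `.npy`, splits the IDs into train/val/test, and defines a simple paired augmentation.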

In [12]:
from pathlib import Path
import os

# === Configuration ===
# Everything (raw data and preprocessed output) lives under this root.
DATA_ROOT = Path('/root/autodl-tmp')

# Raw 2D ultrasound images and their annotations.
_RAW_2D_ROOT = DATA_ROOT / 'OTU_2d'
TWO_D_IMG_DIR = _RAW_2D_ROOT / 'images'
TWO_D_MASK_DIR = _RAW_2D_ROOT / 'annotations'

# Raw CEUS images and annotations (stored in the OTU_3d folder).
_RAW_CEUS_ROOT = DATA_ROOT / 'OTU_3d'
CEUS_IMG_DIR = _RAW_CEUS_ROOT / 'images'
CEUS_MASK_DIR = _RAW_CEUS_ROOT / 'annotations'

# Destination folders for the converted .npy arrays.
OUT_ROOT = DATA_ROOT / 'preprocessed_mmOTU'
_OUT_2D_ROOT = OUT_ROOT / '2d'
_OUT_CEUS_ROOT = OUT_ROOT / '3d'
OUT_2D_IMG = _OUT_2D_ROOT / 'images'
OUT_2D_MASK = _OUT_2D_ROOT / 'masks'
OUT_CEUS_IMG = _OUT_CEUS_ROOT / 'images'
OUT_CEUS_MASK = _OUT_CEUS_ROOT / 'masks'

# Create the output tree up front; safe to re-run because existing folders are accepted.
for out_dir in (OUT_2D_IMG, OUT_2D_MASK, OUT_CEUS_IMG, OUT_CEUS_MASK):
    out_dir.mkdir(parents=True, exist_ok=True)

# Target size handed to the image/mask loaders (they pass it to PIL's resize).
IMG_SIZE = (256, 256)
# Train / val / test fractions. The split cell uses the first two entries and
# assigns whatever remains to the test set.
SPLIT_RATIO = (0.6, 0.2, 0.2)


## Step 1: Normalize file extensions to lowercase

In [13]:
def lowercase_extensions(folder: Path, exts=(".JPG", ".PNG", ".JPEG")):
    """Lowercase the trailing extension of files directly inside ``folder``.

    Args:
        folder: Directory whose regular files are inspected (not recursive).
        exts: Extensions to normalize. A file is renamed when its name ends
            with one of them; the first matching entry wins.

    Returns:
        The number of files that were actually renamed.

    Only the trailing extension is rewritten. An earlier occurrence of the
    same text inside the name (e.g. ``scan.PNG_copy.PNG``) is left untouched.
    A rename that would overwrite a different existing file is skipped.
    """
    renamed = 0
    for fname in os.listdir(folder):
        src = folder / fname
        if not src.is_file():
            continue

        ext = next((e for e in exts if fname.endswith(e)), None)
        if ext is None:
            continue

        # Slice off the suffix instead of str.replace, which would also
        # rewrite the same text anywhere else in the file name.
        dst = folder / (fname[:len(fname) - len(ext)] + ext.lower())
        if src == dst:
            continue

        # os.rename silently replaces an existing destination on POSIX.
        # samefile() keeps case-only renames working on case-insensitive
        # filesystems, where dst "exists" because it is src itself.
        if dst.exists() and not os.path.samefile(src, dst):
            print(f"⚠️ Skipped {fname}: {dst.name} already exists")
            continue

        os.rename(src, dst)
        renamed += 1
    return renamed

# Normalize all four raw folders and report how many files each one needed renamed.
print("Renamed:")
for label, raw_dir in (
    ("2D images:", TWO_D_IMG_DIR),
    ("2D masks :", TWO_D_MASK_DIR),
    ("CEUS images:", CEUS_IMG_DIR),
    ("CEUS masks :", CEUS_MASK_DIR),
):
    print(label, lowercase_extensions(raw_dir))


Renamed:
2D images: 0
2D masks : 0
CEUS images: 0
CEUS masks : 0


## Step 2: Extract common IDs between 2D and CEUS

In [14]:
def collect_ids_2d(img_dir: Path, mask_dir: Path):
    """Return the IDs that have both a 2D image and a binary mask.

    Args:
        img_dir: Folder containing ``<id>.jpg`` images.
        mask_dir: Folder containing ``<id>_binary.png`` masks.

    Returns:
        Set of ID strings present in both folders.

    A mask counts only when its stem *ends* with ``_binary``, and only that
    trailing marker is stripped. This matches the ``<id>_binary.png`` path the
    preprocessing step builds, so every returned ID maps to a real mask file.
    """
    marker = '_binary'
    imgs = {p.stem for p in img_dir.glob('*.jpg')}
    masks = {
        p.stem[:len(p.stem) - len(marker)]
        for p in mask_dir.glob('*.png')
        if p.stem.endswith(marker)
    }
    return imgs & masks

def collect_ids_ceus(img_dir: Path, mask_dir: Path):
    """Return the IDs that have both a CEUS image and a CEUS mask.

    Images may be ``<id>.jpg`` or ``<id>.png``; masks are ``<id>.png``.
    The result is the set of stems found on both sides.
    """
    image_stems = set()
    for pattern in ('*.jpg', '*.png'):
        image_stems.update(path.stem for path in img_dir.glob(pattern))
    mask_stems = {path.stem for path in mask_dir.glob('*.png')}
    return image_stems.intersection(mask_stems)

# An ID is usable only when both modalities have a complete image/mask pair.
ids_2d = collect_ids_2d(img_dir=TWO_D_IMG_DIR, mask_dir=TWO_D_MASK_DIR)
ids_ceus = collect_ids_ceus(img_dir=CEUS_IMG_DIR, mask_dir=CEUS_MASK_DIR)
# Sort so downstream steps see the IDs in a deterministic order.
common_ids = sorted(ids_2d.intersection(ids_ceus))
print(f"✅ Found {len(common_ids)} valid paired IDs.")


✅ Found 170 valid paired IDs.


## Step 3: Preprocess images and masks to `.npy`

In [15]:
import numpy as np
from PIL import Image
from tqdm import tqdm

def load_gray(path, size):
    """Load an image as 8-bit grayscale, resize it to ``size`` and scale to [0, 1].

    Args:
        path: Image file to open with PIL.
        size: Target size passed straight to ``Image.resize``.

    Returns:
        Array of the resized pixel values divided by 255.0.
    """
    grayscale = Image.open(path).convert("L")
    resized = grayscale.resize(size)
    pixels = np.array(resized)
    return pixels / 255.0

def load_mask(path, size):
    """Load a segmentation mask, resize it to ``size`` and binarize it.

    Args:
        path: Mask file to open with PIL.
        size: Target size passed to ``Image.resize``.

    Returns:
        ``uint8`` array with 1 where the (first-channel) value exceeds 10
        and 0 elsewhere.
    """
    with Image.open(path) as im:
        # Masks are label maps, so they must not be interpolated. A smoothing
        # filter creates in-between values along the border, and the low
        # threshold below would turn those into foreground, growing the mask.
        m = np.array(im.resize(size, resample=Image.NEAREST))
    if m.ndim == 3:
        # Multi-channel mask: only the first channel is used.
        m = m[..., 0]
    return (m > 10).astype(np.uint8)

converted = 0
failed_ids = []  # IDs whose conversion raised, kept so they can be inspected or excluded later
for idx in tqdm(common_ids):
    try:
        # The CEUS image may be stored as .jpg or .png; prefer .jpg when it exists.
        ceus_path = CEUS_IMG_DIR / f"{idx}.jpg"
        if not ceus_path.exists():
            ceus_path = CEUS_IMG_DIR / f"{idx}.png"

        # Load all four arrays before writing anything, so a sample that fails
        # to load in one modality does not leave orphan .npy files for the other.
        img2d = load_gray(TWO_D_IMG_DIR / f"{idx}.jpg", IMG_SIZE)
        msk2d = load_mask(TWO_D_MASK_DIR / f"{idx}_binary.png", IMG_SIZE)
        imgceus = load_gray(ceus_path, IMG_SIZE)
        mskceus = load_mask(CEUS_MASK_DIR / f"{idx}.png", IMG_SIZE)

        # 2D
        np.save(OUT_2D_IMG / f"{idx}.npy", img2d)
        np.save(OUT_2D_MASK / f"{idx}.npy", msk2d)

        # CEUS
        np.save(OUT_CEUS_IMG / f"{idx}.npy", imgceus)
        np.save(OUT_CEUS_MASK / f"{idx}.npy", mskceus)

        converted += 1
    except Exception as e:
        # Best effort: report the failing sample and carry on with the rest.
        print(f"❌ Error on ID {idx}: {e}")
        failed_ids.append(idx)

print(f"✅ Preprocessed {converted} samples to {OUT_ROOT}")
if failed_ids:
    print(f"⚠️ {len(failed_ids)} IDs failed and have no complete output: {failed_ids}")


100%|██████████| 170/170 [00:03<00:00, 53.68it/s]

✅ Preprocessed 170 samples to /root/autodl-tmp/preprocessed_mmOTU





## Step 4: Split into train/val/test

In [16]:
import random

# Shuffle a sorted copy rather than common_ids itself. Shuffling the shared
# list in place means a second run of this cell starts from an already-shuffled
# order and yields a different split despite the fixed seed.
random.seed(42)
shuffled_ids = sorted(common_ids)
random.shuffle(shuffled_ids)

n = len(shuffled_ids)
n_train = int(SPLIT_RATIO[0] * n)
n_val = int(SPLIT_RATIO[1] * n)

# Test takes the remainder, so any rounding loss from int() lands in the test set.
train_ids = shuffled_ids[:n_train]
val_ids = shuffled_ids[n_train:n_train+n_val]
test_ids = shuffled_ids[n_train+n_val:]

def save_list(name, ids):
    """Write ``ids`` to ``OUT_ROOT/<name>_ids.txt``, one ID per line.

    Args:
        name: Split name used as the file-name prefix.
        ids: Iterable of ID strings.
    """
    target = OUT_ROOT / f"{name}_ids.txt"
    with open(target, "w") as handle:
        handle.writelines(sample_id + "\n" for sample_id in ids)

# Persist each split as <name>_ids.txt under OUT_ROOT.
for split_name, split_ids in (("train", train_ids), ("val", val_ids), ("test", test_ids)):
    save_list(split_name, split_ids)

print(f"✅ Split complete: {len(train_ids)} train, {len(val_ids)} val, {len(test_ids)} test")


✅ Split complete: 102 train, 34 val, 34 test


## Step 5: Define simple data augmentation

In [17]:
from PIL import ImageEnhance

def _to_uint8_image(arr):
    """Turn an array scaled to [0, 1] into an 8-bit PIL image (round, then clip)."""
    # Round instead of truncating: a value such as k/255 can come back as
    # k - epsilon after multiplying by 255, and truncation would drop it to k-1.
    # Clip so out-of-range inputs saturate instead of wrapping in the uint8 cast.
    scaled = np.rint(np.asarray(arr) * 255.0)
    return Image.fromarray(np.clip(scaled, 0, 255).astype(np.uint8))


def augment_pair(x2d, xceus, y):
    """Apply the same random geometric augmentation to a 2D/CEUS/mask triple.

    Each step fires independently with probability 0.5, using the global
    ``random`` module: horizontal flip (all three), rotation by a single angle
    in [-10, 10] degrees (all three), and a contrast change with factor in
    [0.8, 1.2] (CEUS image only).

    Args:
        x2d: 2D image array, expected to be scaled to [0, 1].
        xceus: CEUS image array, expected to be scaled to [0, 1].
        y: Mask array with 0 for background and 1 for foreground.

    Returns:
        Tuple ``(x2d_aug, xceus_aug, y_aug)``: two ``float32`` arrays scaled
        back to [0, 1] and a ``uint8`` mask of 0/1.
    """
    img2d = _to_uint8_image(x2d)
    imgc = _to_uint8_image(xceus)
    msk = _to_uint8_image(y)

    # flip
    if random.random() < 0.5:
        img2d = img2d.transpose(Image.FLIP_LEFT_RIGHT)
        imgc = imgc.transpose(Image.FLIP_LEFT_RIGHT)
        msk = msk.transpose(Image.FLIP_LEFT_RIGHT)

    # rotate: one shared angle keeps images and mask aligned
    if random.random() < 0.5:
        angle = random.uniform(-10, 10)
        img2d = img2d.rotate(angle)
        imgc = imgc.rotate(angle)
        # Explicit nearest-neighbour so the mask is never interpolated.
        msk = msk.rotate(angle, resample=Image.NEAREST)

    # contrast CEUS
    if random.random() < 0.5:
        imgc = ImageEnhance.Contrast(imgc).enhance(random.uniform(0.8, 1.2))

    return (
        np.array(img2d).astype(np.float32)/255.0,
        np.array(imgc).astype(np.float32)/255.0,
        (np.array(msk) > 10).astype(np.uint8)
    )
