In [1]:
import os
import re
import shutil
from pathlib import Path
from PIL import Image, ImageOps

In [2]:
DATA_DIR = Path("/home/alex/data")

In [3]:
# CIFAR-100 superclass to subclass mapping
cifar100_superclass_map = {
    "aquatic_mammals": ["beaver", "dolphin", "otter", "seal", "whale"],
    "fish": ["aquarium_fish", "flatfish", "ray", "shark", "trout"],
    "flowers": ["orchid", "poppy", "rose", "sunflower", "tulip"],
    "food_containers": ["bottle", "bowl", "can", "cup", "plate"],
    "fruit_and_vegetables": ["apple", "mushroom", "orange", "pear", "sweet_pepper"],
    "household_electrical_devices": ["clock", "keyboard", "lamp", "telephone", "television"],
    "household_furniture": ["bed", "chair", "couch", "table", "wardrobe"],
    "insects": ["bee", "beetle", "butterfly", "caterpillar", "cockroach"],
    "large_carnivores": ["bear", "leopard", "lion", "tiger", "wolf"],
    "large_man_made_outdoor_things": ["bridge", "castle", "house", "road", "skyscraper"],
    "large_natural_outdoor_scenes": ["cloud", "forest", "mountain", "plain", "sea"],
    "large_omnivores_and_herbivores": ["camel", "cattle", "chimpanzee", "elephant", "kangaroo"],
    "medium_sized_mammals": ["fox", "porcupine", "possum", "raccoon", "skunk"],
    "non_insect_invertebrates": ["crab", "lobster", "snail", "spider", "worm"],
    "people": ["baby", "boy", "girl", "man", "woman"],
    "reptiles": ["crocodile", "dinosaur", "lizard", "snake", "turtle"],
    "small_mammals": ["hamster", "mouse", "rabbit", "shrew", "squirrel"],
    "trees": ["maple_tree", "oak_tree", "palm_tree", "pine_tree", "willow_tree"],
    "vehicles_1": ["bicycle", "bus", "motorcycle", "pickup_truck", "train"],
    "vehicles_2": ["lawn_mower", "rocket", "streetcar", "tank", "tractor"],
}

# Reverse mapping: subclass -> superclass
subclass_to_superclass = {
    subclass: superclass
    for superclass, subclasses in cifar100_superclass_map.items()
    for subclass in subclasses
}

In [4]:
# ================= CONFIGURATION =================
# Put the path to your top-level folder here
ROOT_DIR = "/Users/alexc/data/ms_cifar100_ai_data"

# Output directory (Optional: separate output is safer than overwriting)
# If None, it will overwrite or save alongside originals (risky).
# I recommend setting a new path to verify results first.
OUTPUT_DIR = "/Users/alexc/data/ms_cifar100_ai_data_cleaned"
# =================================================

In [5]:
def strip_metadata_and_process(img_path, output_path, is_gemini, is_chatgpt):
    try:
        with Image.open(img_path) as img:
            # 1. Convert to RGB (Standardize mode)
            if img.mode in ("RGBA", "P"):
                img = img.convert("RGB")

            # 2. Deep Copy to strip metadata
            data = list(img.getdata())
            clean_img = Image.new(img.mode, img.size)
            clean_img.putdata(data)

            # 3. Process Dimensions (Both Gemini & ChatGPT)
            if is_gemini or is_chatgpt:
                width, height = clean_img.size

                # --- Step A: Calculate Center Square Crop ---
                # This fixes the aspect ratio for landscape/portrait images
                # ensuring we don't "squish" the image when resizing.
                min_dim = min(width, height)

                left = (width - min_dim) / 2
                top = (height - min_dim) / 2
                right = (width + min_dim) / 2
                bottom = (height + min_dim) / 2

                # Define the initial square box
                crop_box = (left, top, right, bottom)

                # --- Step B: Extra Watermark Trim (Gemini Only) ---
                # Only Gemini gets the extra 2% zoom to remove the corner star.
                # ChatGPT images are just center-cropped to square.
                if is_gemini:
                    zoom_factor = 0.02
                    margin = min_dim * zoom_factor
                    # Contract the box by the margin
                    crop_box = (left + margin, top + margin, right - margin, bottom - margin)

                # Apply the crop
                clean_img = clean_img.crop(crop_box)

                # --- Step C: Resize to Target 1024x1024 ---
                clean_img = clean_img.resize((1024, 1024), Image.Resampling.LANCZOS)

            # 4. Save to new path
            output_path.parent.mkdir(parents=True, exist_ok=True)
            clean_img.save(output_path, "PNG", optimize=True)

            print(f"Processed: {output_path.name}")
            return True

    except Exception as e:
        print(f"Error processing {img_path.name}: {e}")
        return False

def main():
    root_path = Path(ROOT_DIR)
    output_base = Path(OUTPUT_DIR) if OUTPUT_DIR else root_path

    # Walk through all directories
    for subdir, dirs, files in os.walk(root_path):
        subdir_path = Path(subdir)
        counter = 1

        for filename in files:
            if not filename.lower().endswith(('.png', '.jpg', '.jpeg', '.webp')):
                continue

            # Identify Source
            is_gemini = "Gemini_Generated_Image" in filename
            is_chatgpt = "ChatGPT Image" in filename

            if not (is_gemini or is_chatgpt):
                continue

            # Rename logic
            parent_name = subdir_path.name
            safe_parent_name = re.sub(r'[^\w\-_]', '_', parent_name)
            new_filename = f"{safe_parent_name}_{counter:03d}.png"
            counter += 1

            # Path logic
            input_file = subdir_path / filename
            rel_path = subdir_path.relative_to(root_path)
            output_file = output_base / rel_path / new_filename

            # Pass both flags to the processor
            strip_metadata_and_process(input_file, output_file, is_gemini, is_chatgpt)


In [6]:
def sync_dir_structure(target_dir_path: str):
    """
    Reorganize a flat directory of subclass folders into CIFAR-100 superclass hierarchy.

    Args:
        target_dir_path: Path to directory containing subclass folders at top level
    """
    target_path = Path(target_dir_path)

    moved, skipped = 0, []
    # Iterate through immediate subdirectories
    for subdir in target_path.iterdir():
        if not subdir.is_dir():
            continue

        subclass_name = subdir.name
        if subclass_name not in subclass_to_superclass: # Check if this is a known CIFAR-100 subclass
            skipped.append(subclass_name)
            continue

        superclass_name = subclass_to_superclass[subclass_name]

        # Create superclass directory if needed
        superclass_dir = target_path / superclass_name
        superclass_dir.mkdir(exist_ok=True)

        # Move subclass directory under superclass
        new_location = superclass_dir / subclass_name
        shutil.move(str(subdir), str(new_location))

        moved += 1
        print(f"Moved: {subclass_name} -> {superclass_name}/{subclass_name}")

    # Report results
    print(f"\nCompleted: {moved} directories moved")
    if skipped:
        print(f"Skipped {len(skipped)} unrecognized directories: {skipped}")

disorganized_dir = DATA_DIR / "synthetic_cifar100_research/cifar100_512x512_master"
sync_dir_structure(target_dir_path=disorganized_dir)

Moved: poppy -> flowers/poppy
Moved: skunk -> medium_sized_mammals/skunk
Moved: whale -> aquatic_mammals/whale
Moved: crab -> non_insect_invertebrates/crab
Moved: seal -> aquatic_mammals/seal
Moved: lizard -> reptiles/lizard
Moved: caterpillar -> insects/caterpillar
Moved: shark -> fish/shark
Moved: bed -> household_furniture/bed
Moved: crocodile -> reptiles/crocodile
Moved: rabbit -> small_mammals/rabbit
Moved: mountain -> large_natural_outdoor_scenes/mountain
Moved: bottle -> food_containers/bottle
Moved: palm_tree -> trees/palm_tree
Moved: snake -> reptiles/snake
Moved: plate -> food_containers/plate
Moved: telephone -> household_electrical_devices/telephone
Moved: beaver -> aquatic_mammals/beaver
Moved: bicycle -> vehicles_1/bicycle
Moved: rose -> flowers/rose
Moved: bee -> insects/bee
Moved: cloud -> large_natural_outdoor_scenes/cloud
Moved: forest -> large_natural_outdoor_scenes/forest
Moved: man -> people/man
Moved: tractor -> vehicles_2/tractor
Moved: skyscraper -> large_man_ma