In [None]:
import os
import re
from pathlib import Path
from PIL import Image, ImageOps

In [None]:
# ================= CONFIGURATION =================
# Top-level folder that main() walks recursively looking for generated images.
# NOTE: this is a machine-specific absolute path; change it before running
# the notebook anywhere else.
ROOT_DIR = "/Users/alexc/data/ms_cifar100_ai_data"

# Base folder for the cleaned PNGs; the sub-folder layout found under ROOT_DIR
# is reproduced beneath it.
# If set to None (or an empty string), main() falls back to ROOT_DIR and the
# renamed PNGs are written next to the originals (risky: an existing file with
# the same generated name would be overwritten).
# A separate path is recommended so results can be verified first.
OUTPUT_DIR = "/Users/alexc/data/ms_cifar100_ai_data_cleaned"
# =================================================

In [None]:
def strip_metadata_and_process(img_path, output_path, is_gemini, is_chatgpt,
                               target_size=1024, gemini_trim=0.02):
    """Write a metadata-free RGB PNG copy of an image, optionally squared and resized.

    Steps:
      1. Apply the EXIF orientation to the pixels, so discarding the metadata
         afterwards does not leave the picture rotated or mirrored.
      2. Convert every non-RGB mode to RGB. CMYK cannot be written as PNG at
         all, and L/LA/P/RGBA would otherwise yield mixed-mode outputs.
      3. Rebuild the image from its raw pixel bytes, so nothing from the source
         file's ``info`` dictionary (EXIF, text chunks, ICC profile, ...) is
         carried over to the saved file.
      4. If ``is_gemini`` or ``is_chatgpt`` is true: center-crop to a square,
         trim an extra border for Gemini images only, and resize to
         ``target_size`` x ``target_size`` with LANCZOS resampling.
      5. Save as an optimized PNG, creating parent directories as needed.

    Args:
        img_path: Path (or path string) of the source image.
        output_path: Path (or path string) of the PNG to write.
        is_gemini: True for Gemini images; enables the crop/resize and the
            extra border trim that removes the corner watermark.
        is_chatgpt: True for ChatGPT images; enables the crop/resize only.
        target_size: Edge length in pixels of the square output (default 1024).
        gemini_trim: Fraction of the square's side trimmed from each edge for
            Gemini images (default 0.02). Must satisfy 0 <= gemini_trim < 0.5.

    Returns:
        True if the image was written, False if anything failed while opening,
        processing or saving it (the error is printed, not raised, so a batch
        run keeps going).

    Raises:
        ValueError: If ``target_size`` or ``gemini_trim`` is out of range.
    """
    if target_size < 1:
        raise ValueError(f"target_size must be >= 1, got {target_size}")
    if not 0 <= gemini_trim < 0.5:
        raise ValueError(f"gemini_trim must be in [0, 0.5), got {gemini_trim}")

    # Accept plain strings as well as Path objects (.name / .parent are used below).
    img_path = Path(img_path)
    output_path = Path(output_path)

    try:
        with Image.open(img_path) as img:
            # 1. Bake the EXIF orientation into the pixel data before the tag is dropped.
            upright = ImageOps.exif_transpose(img)

            # 2. Standardize the mode: everything becomes 3-channel RGB.
            rgb = upright if upright.mode == "RGB" else upright.convert("RGB")

            # 3. Rebuild from raw bytes: same pixels, empty `info`, and no
            #    per-pixel Python objects as with getdata()/putdata().
            clean_img = Image.frombytes("RGB", rgb.size, rgb.tobytes())

            # 4. Process dimensions (both Gemini & ChatGPT)
            if is_gemini or is_chatgpt:
                width, height = clean_img.size

                # --- Step A: center square crop ---
                # Integer arithmetic keeps the box exactly square; float edges
                # are rounded independently by Pillow and can differ by 1 px.
                side = min(width, height)
                left = (width - side) // 2
                top = (height - side) // 2

                # --- Step B: extra watermark trim (Gemini only) ---
                if is_gemini:
                    margin = round(side * gemini_trim)
                    # Never trim away the whole image on tiny inputs.
                    margin = max(0, min(margin, (side - 1) // 2))
                    left += margin
                    top += margin
                    side -= 2 * margin

                clean_img = clean_img.crop((left, top, left + side, top + side))

                # --- Step C: resize to the target square ---
                clean_img = clean_img.resize(
                    (target_size, target_size), Image.Resampling.LANCZOS
                )

            # 5. Save to new path
            output_path.parent.mkdir(parents=True, exist_ok=True)
            clean_img.save(output_path, "PNG", optimize=True)

            print(f"Processed: {output_path.name}")
            return True

    except Exception as e:
        # Deliberately broad: one unreadable file must not abort the whole batch.
        print(f"Error processing {img_path.name}: {e}")
        return False

def main():
    """Walk ROOT_DIR and write a cleaned, renamed PNG for every Gemini/ChatGPT image.

    For each directory under ROOT_DIR, files with an image extension whose name
    contains "Gemini_Generated_Image" or "ChatGPT Image" are passed to
    strip_metadata_and_process(). Outputs are named
    ``<sanitized directory name>_<NNN>.png`` and stored under the same relative
    sub-path beneath OUTPUT_DIR (or beneath ROOT_DIR when OUTPUT_DIR is falsy).

    Directories and files are visited in sorted order so the NNN numbering is
    the same on every run and every machine; os.walk alone gives no ordering
    guarantee. A number is assigned per matching file, so a file that fails to
    process leaves a gap rather than shifting later names.

    Prints a message and returns without doing anything if ROOT_DIR is not an
    existing directory. Prints a processed/failed summary at the end.
    """
    root_path = Path(ROOT_DIR)
    output_base = Path(OUTPUT_DIR) if OUTPUT_DIR else root_path

    # os.walk silently yields nothing for a missing path; say so explicitly.
    if not root_path.is_dir():
        print(f"ROOT_DIR is not an existing directory: {root_path}")
        return

    image_exts = ('.png', '.jpg', '.jpeg', '.webp')
    processed = 0
    failed = 0

    # Walk through all directories
    for subdir, dirs, files in os.walk(root_path):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirs.sort()

        subdir_path = Path(subdir)
        rel_path = subdir_path.relative_to(root_path)

        # The output prefix depends only on the directory, so compute it once.
        safe_parent_name = re.sub(r'[^\w\-_]', '_', subdir_path.name)
        counter = 1

        for filename in sorted(files):
            if not filename.lower().endswith(image_exts):
                continue

            # Identify source
            is_gemini = "Gemini_Generated_Image" in filename
            is_chatgpt = "ChatGPT Image" in filename

            if not (is_gemini or is_chatgpt):
                continue

            # Rename logic
            new_filename = f"{safe_parent_name}_{counter:03d}.png"
            counter += 1

            # Path logic
            input_file = subdir_path / filename
            output_file = output_base / rel_path / new_filename

            # Pass both flags to the processor and track the outcome
            if strip_metadata_and_process(input_file, output_file, is_gemini, is_chatgpt):
                processed += 1
            else:
                failed += 1

    print(f"Done: {processed} processed, {failed} failed")

# Entry point: start the batch only when this code runs as the main module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()


In [None]:
def sync_dir_structure(target_dir_path):
    """Placeholder: not implemented yet.

    Accepts ``target_dir_path`` but performs no work and returns None.

    NOTE(review): the name suggests this is meant to mirror a directory layout
    into ``target_dir_path`` -- confirm the intended behavior before
    implementing.
    """
    return None
