In [16]:
"""
-Arman Bazarchi-
Formatted_Images

# Goal: Copy all images from Persistent Landing to Formatted Zone
#       (the originals are left in place), convert every image to JPEG,
#       and preserve the folder structure
#       kingdom/class/family/specie/uuid.jpg
...

"""

from minio import Minio
from PIL import Image, UnidentifiedImageError
import io
import re
import os
from tqdm import tqdm

# ==============================
# 1️⃣ Configuration
# ==============================
# Source bucket/prefix holding the raw images, and the destination
# bucket/prefix that receives the JPEG copies.
LANDING_ZONE = "landing-zone"
PERSISTENT_PREFIX = "persistent_landing/images"
FORMATTED_PREFIX = "images"  # top-level folder in formatted-zone bucket
FORMATTED_ZONE = "formatted-zone"

# Connection settings can be overridden through environment variables so that
# real credentials never have to be written into the script; the fallbacks are
# the original local-development values, so existing setups keep working.
client = Minio(
    os.environ.get("MINIO_ENDPOINT", "localhost:9000"),
    access_key=os.environ.get("MINIO_ACCESS_KEY", "admin"),
    secret_key=os.environ.get("MINIO_SECRET_KEY", "password123"),
    secure=False,
)

# Ensure formatted-zone bucket exists
if not client.bucket_exists(FORMATTED_ZONE):
    client.make_bucket(FORMATTED_ZONE)
    print("✅ Created formatted-zone bucket")
else:
    print("✅ Formatted-zone bucket already exists")

# ==============================
# 2️⃣ Scan existing formatted images
# ==============================
# Collect the UUIDs of images already present in the formatted zone so the
# conversion loop can skip them. The pattern must match the *whole* object
# name: only keys that really end in "<uuid>.jpg" count, so a stray object
# such as "<uuid>.jpg.bak" does not mark an image as already formatted.
formatted_name_pattern = re.compile(r".*/([a-f0-9\-]+)\.jpg")

existing_formatted_uuids = set()
for obj in client.list_objects(FORMATTED_ZONE, prefix=FORMATTED_PREFIX + "/", recursive=True):
    match = formatted_name_pattern.fullmatch(obj.object_name)
    if match:
        existing_formatted_uuids.add(match.group(1))

print(f"📦 Found {len(existing_formatted_uuids)} existing formatted images.")

# ==============================
# 3️⃣ List all files under Persistent Landing (any extension)
# ==============================
# The listing is materialised into a list so its length can be reported and
# tqdm knows the total up front.
persistent_files = list(client.list_objects(LANDING_ZONE, prefix=PERSISTENT_PREFIX, recursive=True))
print(f"🔍 Found {len(persistent_files)} files in Persistent Landing.")

# Compiled once, outside the loop. The prefix is escaped so it is always
# matched literally, whatever characters it may contain.
# Expected key shape: <PERSISTENT_PREFIX>/<one or more folders>/<uuid>.<ext>
persistent_name_pattern = re.compile(
    rf"{re.escape(PERSISTENT_PREFIX)}/.+?/([a-f0-9\-]+)\.\w+$"
)

processed = 0
skipped = 0
failed = 0

for obj in tqdm(persistent_files, desc="Processing images"):
    try:
        # Extract UUID from filename (e.g., *.jpg, *.png, *.webp, etc.)
        match = persistent_name_pattern.match(obj.object_name)
        if not match:
            print(f"⚠️ Skipping invalid filename: {obj.object_name}")
            skipped += 1
            continue

        img_uuid = match.group(1)

        # Skip if already processed
        if img_uuid in existing_formatted_uuids:
            skipped += 1
            continue

        # ==============================
        # 4️⃣ Download image
        # ==============================
        response = client.get_object(LANDING_ZONE, obj.object_name)
        try:
            img_bytes = response.read()
        finally:
            # Always close the stream and hand the connection back to the
            # pool, even when read() fails part-way through.
            response.close()
            response.release_conn()

        # ==============================
        # 5️⃣ Convert to JPEG (always)
        # ==============================
        try:
            img = Image.open(io.BytesIO(img_bytes))
            buf = io.BytesIO()
            # JPEG has no alpha/palette support, so normalise to RGB first.
            img.convert("RGB").save(buf, format="JPEG")
            img_bytes = buf.getvalue()
        except UnidentifiedImageError:
            print(f"⚠️ Could not identify as image: {obj.object_name}")
            failed += 1
            continue

        # ==============================
        # 6️⃣ Construct formatted path
        # ==============================
        # The pattern above guarantees the key starts with PERSISTENT_PREFIX,
        # so swap only that leading prefix; str.replace() would also rewrite
        # any later occurrence of the same text inside the folder names.
        relative_path = obj.object_name[len(PERSISTENT_PREFIX):]
        formatted_path = FORMATTED_PREFIX + relative_path
        formatted_path = re.sub(r"\.\w+$", ".jpg", formatted_path)  # force .jpg extension

        # ==============================
        # 7️⃣ Upload formatted image
        # ==============================
        client.put_object(
            FORMATTED_ZONE,
            formatted_path,
            data=io.BytesIO(img_bytes),
            length=len(img_bytes),
            content_type="image/jpeg"
        )

        # Remember the UUID so a duplicate later in the listing is skipped.
        existing_formatted_uuids.add(img_uuid)
        processed += 1
        print(f"✅ Uploaded: {formatted_path}")

    except Exception as e:
        # Best-effort batch job: report the failure and move on to the next
        # object instead of aborting the whole run.
        print(f"⚠️ Error processing {obj.object_name}: {e}")
        failed += 1
        continue

# ==============================
# 8️⃣ Summary
# ==============================
print("\n✅ All Persistent Landing images processed.")
print(f"🟢 Processed successfully: {processed}")
print(f"🟡 Skipped (existing/invalid): {skipped}")
print(f"🔴 Failed conversions: {failed}")


✅ Formatted-zone bucket already exists
📦 Found 736 existing formatted images.
🔍 Found 736 files in Persistent Landing.


Processing images: 100%|█████████████████████████████████████████████████████████| 736/736 [00:00<00:00, 367151.25it/s]


✅ All Persistent Landing images processed.
🟢 Processed successfully: 0
🟡 Skipped (existing/invalid): 736
🔴 Failed conversions: 0



