In [2]:
"""
-Arman Bazarchi-
Trusted Zone — Images notebook
Goal:
 - Read images from formatted-zone/images/
 - Only keep images referenced in trusted metadata
 - Apply generic cleaning tasks:
    * Remove duplicates by UUID
    * Skip images with invalid names or missing metadata
    * Normalize format (re-encode as JPEG) and cap resolution (longest side at most 1024 px, aspect ratio preserved)
 - Save cleaned images into trusted-zone/images/
"""

from minio import Minio
from PIL import Image
import io, os, shutil, re
import pandas as pd
from tqdm import tqdm

# -----------------------
# 1️⃣ Configuration
# -----------------------
# Connection settings can be overridden through environment variables so that
# real credentials never have to be written into the notebook. The fallbacks
# are the local development values, so running without any environment set up
# behaves as before.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "localhost:9000")
ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "admin")
SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "password123")

# Source bucket (formatted images) and destination bucket (trusted images).
FORMATTED_ZONE = "formatted-zone"
TRUSTED_ZONE = "trusted-zone"

# Object-name prefixes used inside the buckets above.
FORMATTED_IMG_PREFIX = "images/"
TRUSTED_IMG_PREFIX = "images/"
TRUSTED_META_PREFIX = "metadata/"

# Open the MinIO connection using the endpoint and credentials configured above
# (secure=False is passed, as in the rest of this local setup).
client = Minio(
    MINIO_ENDPOINT,
    access_key=ACCESS_KEY,
    secret_key=SECRET_KEY,
    secure=False,
)

# The trusted bucket is created on first use so later uploads have a target.
if not client.bucket_exists(TRUSTED_ZONE):
    client.make_bucket(TRUSTED_ZONE)
    print(f"✅ Created trusted zone bucket: {TRUSTED_ZONE}")

# -----------------------
# 2️⃣ Load trusted metadata
# -----------------------
# Collect the names of all trusted metadata CSVs stored under the metadata prefix.
trusted_meta_files = [
    obj.object_name
    for obj in client.list_objects(TRUSTED_ZONE, prefix=TRUSTED_META_PREFIX, recursive=True)
    if obj.object_name.lower().endswith(".csv") and "trusted_metadata_" in obj.object_name
]

if not trusted_meta_files:
    raise SystemExit("⚠️ No trusted metadata found.")

# Take the latest trusted metadata CSV: names are sorted in descending order and
# the first one is used (presumably the names embed a sortable timestamp —
# TODO confirm against the notebook that writes them).
trusted_meta_files.sort(reverse=True)
latest_meta_file = trusted_meta_files[0]

# Read the CSV directly into memory instead of going through a fixed-name file
# in the working directory: nothing is left behind if parsing fails, and the
# HTTP response is always released.
meta_response = client.get_object(TRUSTED_ZONE, latest_meta_file)
try:
    trusted_df = pd.read_csv(io.BytesIO(meta_response.read()))
finally:
    meta_response.close()
    meta_response.release_conn()

# Fail with a clear message rather than a bare KeyError when the schema is off.
if "uuid" not in trusted_df.columns:
    raise SystemExit(f"⚠️ Trusted metadata {latest_meta_file} has no 'uuid' column.")

# Keep set of valid UUIDs, as stripped strings so they compare cleanly with the
# UUIDs later extracted from object names.
valid_uuids = set(trusted_df["uuid"].dropna().astype(str).str.strip())
print(f"📦 Found {len(valid_uuids)} valid UUIDs in trusted metadata.")

# -----------------------
# 3️⃣ Scan existing images in Trusted Zone (to avoid duplicates)
# -----------------------
# A trusted image is identified by the UUID in its object name, whatever file
# extension it was stored under, so the pattern accepts any extension rather
# than only ".jpg". Compiled once because it is applied to every listed object.
TRUSTED_IMG_NAME_RE = re.compile(r".*/([a-f0-9\-]+)\.\w+$", re.IGNORECASE)

existing_trusted_uuids = set()
for obj in client.list_objects(TRUSTED_ZONE, prefix=TRUSTED_IMG_PREFIX, recursive=True):
    match = TRUSTED_IMG_NAME_RE.match(obj.object_name)
    if match:
        existing_trusted_uuids.add(match.group(1))

print(f"📦 Found {len(existing_trusted_uuids)} existing images in Trusted Zone.")

# -----------------------
# 4️⃣ Process formatted images
# -----------------------
# For every formatted image whose UUID is listed in the trusted metadata and is
# not yet present in the Trusted Zone: download it, re-encode it as JPEG if
# needed, cap its longest side at max_dim, and upload it to the Trusted Zone.
# Failures are reported per image and do not stop the run.

# Longest allowed side in pixels; larger images are shrunk, aspect ratio kept.
max_dim = 1024

# "<anything>/<uuid>.<ext>"; case-insensitive like the trusted-zone scan, so the
# same file name is treated the same way on both sides.
FORMATTED_IMG_NAME_RE = re.compile(r".*/([a-f0-9\-]+)\.\w+$", re.IGNORECASE)

formatted_images = list(client.list_objects(FORMATTED_ZONE, prefix=FORMATTED_IMG_PREFIX, recursive=True))
print(f"🔍 Found {len(formatted_images)} images in Formatted Zone.")

failed_count = 0

for obj in tqdm(formatted_images, desc="Processing images"):
    try:
        # Extract UUID from filename
        match = FORMATTED_IMG_NAME_RE.match(obj.object_name)
        if not match:
            print(f"⚠️ Skipping invalid filename: {obj.object_name}")
            continue

        img_uuid = match.group(1)

        # Skip if not in trusted metadata or already in trusted images
        if img_uuid not in valid_uuids or img_uuid in existing_trusted_uuids:
            continue

        # Download image; always release the connection, even if the read fails
        data = client.get_object(FORMATTED_ZONE, obj.object_name)
        try:
            img_bytes = data.read()
        finally:
            data.close()
            data.release_conn()

        img = Image.open(io.BytesIO(img_bytes))
        needs_reencode = False

        # Non-JPEG sources are converted to RGB first: modes such as RGBA or P
        # cannot be written as JPEG, and the resize step below must operate on
        # the converted image, not on the original one.
        if (img.format or "").upper() != "JPEG":
            img = img.convert("RGB")
            needs_reencode = True

        # Cap the longest side at max_dim while preserving aspect ratio
        if max(img.size) > max_dim:
            img.thumbnail((max_dim, max_dim))
            needs_reencode = True

        # Encode once, after all transformations; untouched JPEGs keep their
        # original bytes.
        if needs_reencode:
            buf = io.BytesIO()
            img.save(buf, format="JPEG")
            img_bytes = buf.getvalue()

        # Construct trusted image path: swap only the leading prefix, and give
        # the object a .jpg extension because the uploaded bytes are always
        # JPEG (see content_type below). The regex above guarantees the name
        # ends in ".<ext>", so rsplit always finds the extension dot.
        trusted_name = obj.object_name.replace(FORMATTED_IMG_PREFIX, TRUSTED_IMG_PREFIX, 1)
        trusted_path = trusted_name.rsplit(".", 1)[0] + ".jpg"

        # Upload to Trusted Zone
        client.put_object(
            TRUSTED_ZONE,
            trusted_path,
            data=io.BytesIO(img_bytes),
            length=len(img_bytes),
            content_type="image/jpeg"
        )

        # Remember the UUID so a second file with the same UUID is skipped
        existing_trusted_uuids.add(img_uuid)
        print(f"✅ Uploaded trusted image: {trusted_path}")

    except Exception as e:
        # Best-effort per image: report the failure and move on to the next one
        failed_count += 1
        print(f"⚠️ Error processing {obj.object_name}: {e}")

# Only claim full success when no image failed.
if failed_count:
    print(f"⚠️ Finished with {failed_count} image(s) that could not be processed.")
else:
    print("✅ All trusted images processed and uploaded.")


📦 Found 736 valid UUIDs in trusted metadata.
📦 Found 736 existing images in Trusted Zone.
🔍 Found 736 images in Formatted Zone.


Processing images: 100%|█████████████████████████████████████████████████████████| 736/736 [00:00<00:00, 143795.78it/s]

✅ All trusted images processed and uploaded.



