In [20]:
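"""
-Arman Bazarchi-
Persistent_Landing Zone

Copies the images listed in the Temporal Landing metadata CSV into the
Persistent Landing area of the same MinIO bucket, stored as
images/<kingdom>/<class>/<family>/<species>/<uuid>.jpg. Images whose UUID is
already present in the Persistent Landing area are skipped. Afterwards one
timestamped metadata CSV per (kingdom, class) pair is written under metadata/,
merged with the latest existing file for that pair when there is one.
"""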


from minio import Minio
import pandas as pd
import io
import os
from datetime import datetime
import re
import sys

# ==============================
# 1️⃣ Configuration
# ==============================
# Connection settings. Each one can be overridden through an environment
# variable so that real credentials do not have to live in the source; the
# fallbacks are the values this script has always used.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "localhost:9000")
ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "admin")
SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "password123")

# Layout inside MinIO: a single bucket with two top-level key prefixes that
# act as "sub-buckets".
ROOT_BUCKET = "landing-zone"                 # Main bucket
TEMP_PREFIX = "temporal-landing"             # Source (subbucket)
PERSIST_PREFIX = "persistent_landing"        # Destination (subbucket)

# Key of the metadata CSV that describes the images waiting in the source area.
TEMP_METADATA_PATH = f"{TEMP_PREFIX}/metadata/metadata_final.csv"

# ==============================
# 2️⃣ Connect to MinIO
# ==============================
# Single MinIO client shared by every step below. secure=False is passed
# explicitly, i.e. the connection is configured without TLS.
client = Minio(MINIO_ENDPOINT, access_key=ACCESS_KEY, secret_key=SECRET_KEY, secure=False)

# ==============================
# 3️⃣ Validate buckets
# ==============================
def _prefix_exists(prefix):
    """Return True when ROOT_BUCKET holds at least one key under ``prefix/``.

    The listing is restricted to the prefix and consumed lazily, so the check
    stops at the first hit instead of walking the bucket's whole top level.
    """
    listing = client.list_objects(ROOT_BUCKET, prefix=f"{prefix}/", recursive=False)
    return any(True for _ in listing)


# Nothing can be copied without the bucket itself, so abort early.
if not client.bucket_exists(ROOT_BUCKET):
    sys.exit(f"❌ ERROR: Root bucket '{ROOT_BUCKET}' does not exist in MinIO.")

# The source area must already contain data; this script never creates it.
temporal_exists = _prefix_exists(TEMP_PREFIX)
if not temporal_exists:
    sys.exit(f"❌ ERROR: 'Temporal_Landing' does not exist inside '{ROOT_BUCKET}' bucket.")

# Create Persistent_Landing if missing. Prefixes only exist while they hold an
# object, so a small ".init" marker is written to make the folder visible.
persistent_exists = _prefix_exists(PERSIST_PREFIX)
if not persistent_exists:
    client.put_object(
        ROOT_BUCKET,
        f"{PERSIST_PREFIX}/.init",
        data=io.BytesIO(b"init"),
        length=4,
        content_type="text/plain",
    )
    print(f"✅ Created 'Persistent_Landing' inside '{ROOT_BUCKET}'.")

# ==============================
# 4️⃣ Load metadata from Temporal_Landing
# ==============================
# Local scratch copy of the Temporal metadata CSV; it is deleted again once
# the metadata step at the end of the script has finished.
LOCAL_METADATA = "temp_metadata.csv"

print("📥 Loading metadata from Temporal_Landing...")
try:
    client.fget_object(ROOT_BUCKET, TEMP_METADATA_PATH, LOCAL_METADATA)
except Exception as e:
    # Without the metadata file there is nothing to drive the copy, so stop.
    sys.exit(f"❌ ERROR: Failed to find metadata at {TEMP_METADATA_PATH} → {e}")
else:
    # Parsing errors are deliberately not caught: they propagate unchanged.
    metadata_df = pd.read_csv(LOCAL_METADATA)
    print(f"✅ Loaded metadata with {len(metadata_df)} records.")

# ==============================
# 5️⃣ Scan existing Persistent images by UUID
# ==============================
print("🔍 Checking existing images in Persistent_Landing...")

# Persistent image keys end in "<uuid>.jpg" below one or more folders; the
# capture group extracts the UUID so duplicates can be detected regardless of
# which folder the image was filed under.
persisted_image_re = re.compile(rf"{PERSIST_PREFIX}/images/.+?/([a-f0-9\-]+)\.jpg")
persisted_objects = client.list_objects(
    ROOT_BUCKET, prefix=f"{PERSIST_PREFIX}/images/", recursive=True
)
uuid_hits = (persisted_image_re.match(item.object_name) for item in persisted_objects)
existing_persistent_uuids = {hit.group(1) for hit in uuid_hits if hit}

print(f"📦 Found {len(existing_persistent_uuids)} existing images in Persistent_Landing.")

# Column filled in by the copy step; rows that are not copied keep None.
metadata_df["persistent_path"] = None

# ==============================
# 6️⃣ Move images from Temporal → Persistent
# ==============================
def _path_component(value):
    """Return *value* as a folder name: spaces become underscores.

    Missing values (None / NaN from the CSV) map to "Unknown" instead of the
    literal string "nan".
    """
    if value is None or pd.isna(value):
        return "Unknown"
    return str(value).replace(" ", "_")


# Not used by the copy loop itself; kept so the module-level name stays
# available to any code that reads it.
timestamp = datetime.now().strftime("%Y_%m_%d_%H:%M")
moved_records = []

for idx, row in metadata_df.iterrows():
    img_uuid = row.get("uuid")
    src_path = row.get("temporal_path")

    # Rows without a UUID cannot be de-duplicated or named; NaN must be
    # tested explicitly because it is truthy.
    if img_uuid is None or pd.isna(img_uuid) or not img_uuid:
        continue

    # Skip if image already exists in Persistent
    if img_uuid in existing_persistent_uuids:
        print(f"⏩ Skipping duplicate UUID: {img_uuid}")
        continue

    if src_path is None or pd.isna(src_path):
        print(f"⚠️ Error processing {img_uuid}: missing temporal_path")
        continue

    kingdom = _path_component(row.get("kingdom", "Unknown"))
    cls = _path_component(row.get("class", "Unknown"))
    family = _path_component(row.get("family", "Unknown"))
    specie = _path_component(row.get("species", "Unknown"))

    # Destination path using UUID
    dest_path = f"{PERSIST_PREFIX}/images/{kingdom}/{cls}/{family}/{specie}/{img_uuid}.jpg"

    try:
        # Download image from Temporal; always hand the connection back to
        # the pool, even when reading fails half-way.
        response = client.get_object(ROOT_BUCKET, src_path)
        try:
            image_bytes = response.read()
        finally:
            response.close()
            response.release_conn()

        # Upload image to Persistent
        client.put_object(
            ROOT_BUCKET,
            dest_path,
            data=io.BytesIO(image_bytes),
            length=len(image_bytes),
            content_type="image/jpeg",
        )
    except Exception as e:
        # Best effort: one broken image must not stop the whole batch.
        print(f"⚠️ Error processing {src_path}: {e}")
        continue

    # Record the new location only once the object really exists there.
    metadata_df.loc[idx, "persistent_path"] = dest_path
    moved_records.append(row)
    existing_persistent_uuids.add(img_uuid)

    print(f"✅ Moved {src_path} → {dest_path}")

# ==============================
# 7️⃣ Save or merge metadata files by (Kingdom, Class)
# ==============================
if not metadata_df.empty:
    print("🧠 Updating kingdom-class based metadata files...")

    persistent_metadata_dir = f"{PERSIST_PREFIX}/metadata/"
    # One timestamp for the whole run so every file written by it matches.
    timestamp = datetime.now().strftime("%Y_%m_%d_%H:%M")

    # NOTE(review): groupby drops rows whose kingdom or class is NaN by
    # default, so such rows never reach a metadata file — confirm intended.
    for (kingdom_name, cls_name), group in metadata_df.groupby(["kingdom", "class"]):
        kingdom_safe = str(kingdom_name).replace(" ", "_")
        cls_safe = str(cls_name).replace(" ", "_")

        # Always create a new filename with current timestamp for updated metadata
        metadata_filename = f"{kingdom_safe}_{cls_safe}_metadata_{timestamp}.csv"
        persistent_metadata_path_new = f"{persistent_metadata_dir}{metadata_filename}"

        # Look for any existing metadata file for this kingdom+class. The
        # literal part is escaped so names containing regex metacharacters
        # cannot change what the pattern matches.
        name_pattern = re.compile(
            re.escape(f"{persistent_metadata_dir}{kingdom_safe}_{cls_safe}_metadata_") + r".*\.csv"
        )
        existing_metadata_files = sorted(
            (
                obj.object_name
                for obj in client.list_objects(
                    ROOT_BUCKET, prefix=persistent_metadata_dir, recursive=True
                )
                if name_pattern.match(obj.object_name)
            ),
            reverse=True,  # zero-padded timestamps sort chronologically
        )
        existing_metadata_path = existing_metadata_files[0] if existing_metadata_files else None

        if existing_metadata_path is not None:
            # Read the latest existing metadata straight into memory.
            response = client.get_object(ROOT_BUCKET, existing_metadata_path)
            try:
                existing_df = pd.read_csv(io.BytesIO(response.read()))
            finally:
                response.close()
                response.release_conn()

            # Merge Temporal rows that are not already in Persistent
            new_rows = group[~group["uuid"].isin(existing_df["uuid"])]
            merged_df = pd.concat([existing_df, new_rows], ignore_index=True)
        else:
            # No existing metadata → just use all rows from Temporal
            merged_df = group

        # Save merged metadata with updated timestamp
        payload = merged_df.to_csv(index=False).encode("utf-8")
        client.put_object(
            ROOT_BUCKET,
            persistent_metadata_path_new,
            data=io.BytesIO(payload),
            length=len(payload),
            content_type="text/csv",
        )

        # Remove the superseded file only after the replacement is stored,
        # and never when both share a name (re-run within the same minute),
        # because that would delete the file that was just written.
        if existing_metadata_path is not None and existing_metadata_path != persistent_metadata_path_new:
            client.remove_object(ROOT_BUCKET, existing_metadata_path)

        print(f"📤 Updated/uploaded metadata for '{kingdom_safe}-{cls_safe}' → {persistent_metadata_path_new}")

else:
    print("⚠️ No new images moved; skipping metadata upload.")

# Cleanup local temp in both branches so an empty run leaves nothing behind.
if os.path.exists(LOCAL_METADATA):
    os.remove(LOCAL_METADATA)
    print("🧹 Cleaned up local metadata files.")

print("✅ Persistent Landing Zone completed successfully.")


✅ Created 'Persistent_Landing' inside 'landing-zone'.
📥 Loading metadata from Temporal_Landing...
✅ Loaded metadata with 736 records.
🔍 Checking existing images in Persistent_Landing...
📦 Found 0 existing images in Persistent_Landing.
✅ Moved temporal-landing/images/d0caebda-85d3-46fc-978e-58ca4c7d5cfd.jpg → persistent_landing/images/Animalia/Squamata/Colubridae/constrictor/d0caebda-85d3-46fc-978e-58ca4c7d5cfd.jpg
✅ Moved temporal-landing/images/cc3e8cd0-b993-4fdf-ae35-27880edf1597.jpg → persistent_landing/images/Animalia/Squamata/Colubridae/sipedon/cc3e8cd0-b993-4fdf-ae35-27880edf1597.jpg
✅ Moved temporal-landing/images/bafa2323-81f5-4783-bb58-6b5d26a2e1b7.jpg → persistent_landing/images/Animalia/Squamata/Colubridae/constrictor/bafa2323-81f5-4783-bb58-6b5d26a2e1b7.jpg
✅ Moved temporal-landing/images/9b074b8a-3fa2-4da5-9925-7f28ab1b9e36.jpg → persistent_landing/images/Animalia/Squamata/Colubridae/dendrophila/9b074b8a-3fa2-4da5-9925-7f28ab1b9e36.jpg
✅ Moved temporal-landing/images/521e2