In [1]:
"""
-Arman Bazarchi-
Persistent_Landing Zone
here we retrieve the stored raw data in temporal-landing and place them more organized in persistent-landing.
connects to minIO and creates a persistent_landing subbucket in landing-zone bucket,
raises an error if the temporal_landing subbucket or landing-zone bucket does not exist.
saves text data in a csv file in a folder 'metadata',
saves each image in its specie folder inside its family folder, inside its class, inside the kingdom it belongs to.
so we ensure organized data having easy access to each one.
example path of each image: landing-zone/persistent_landing/images/{kingdom}/{class}/{family}/{specie}/{img_uuid}.jpg
it avoids storing duplicate data in persistent_landing.
in end removes temporary files from local storage and delets the temporal_landing as we hav now moved the data to persistent.

"""


from minio import Minio
import pandas as pd
import io
import os
from datetime import datetime
import re
import sys

# ==============================
# 1.  Configuration
# ==============================
# The connection settings can be overridden through environment variables so
# that real credentials never have to be written into this file; the fallback
# values are the local development defaults.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "localhost:9000")
ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "admin")
SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "password123")
ROOT_BUCKET = "landing-zone"                 # Main bucket
TEMP_PREFIX = "temporal-landing"             # Source (subbucket)
PERSIST_PREFIX = "persistent_landing"        # Destination (subbucket)
# Object key of the metadata CSV that is read from the temporal zone.
TEMP_METADATA_PATH = f"{TEMP_PREFIX}/metadata/metadata_final.csv"


#  Connect to MinIO
# secure=False means the endpoint is contacted over plain HTTP (no TLS).
client = Minio(MINIO_ENDPOINT, access_key=ACCESS_KEY, secret_key=SECRET_KEY, secure=False)


#  Validate buckets
def _prefix_present(prefix):
    """Return True when a top-level entry of ROOT_BUCKET starts with ``prefix/``."""
    for entry in client.list_objects(ROOT_BUCKET, recursive=False):
        if entry.object_name.startswith(f"{prefix}/"):
            return True
    return False


# The root bucket must already exist; this script never creates it.
if not client.bucket_exists(ROOT_BUCKET):
    sys.exit("❌ ERROR: Root bucket 'Landing' does not exist in MinIO.")

# The source zone must be there as well, otherwise there is nothing to move.
temporal_exists = _prefix_present(TEMP_PREFIX)
if not temporal_exists:
    sys.exit("❌ ERROR: 'Temporal_Landing' does not exist inside 'Landing' bucket.")

# Create Persistent_Landing if missing
persistent_exists = _prefix_present(PERSIST_PREFIX)
if not persistent_exists:
    # A small placeholder object is written under the prefix so that the
    # prefix shows up in listings.
    init_marker = b"init"
    client.put_object(
        ROOT_BUCKET,
        f"{PERSIST_PREFIX}/.init",
        data=io.BytesIO(init_marker),
        length=len(init_marker),
        content_type="text/plain",
    )
    print(f"✅ Created 'Persistent_Landing' inside '{ROOT_BUCKET}'.")

# ==============================
# 2. Load metadata from Temporal_Landing
# ==============================
# Local file the metadata CSV is downloaded to before pandas reads it.
LOCAL_METADATA = "temp_metadata.csv"
print("📥 Loading metadata from Temporal_Landing...")

try:
    client.fget_object(ROOT_BUCKET, TEMP_METADATA_PATH, LOCAL_METADATA)
except Exception as download_error:
    # Without the metadata file nothing can be processed, so stop here.
    sys.exit(f"❌ ERROR: Failed to find metadata at {TEMP_METADATA_PATH} → {download_error}")
else:
    metadata_df = pd.read_csv(LOCAL_METADATA)

print(f"✅ Loaded metadata with {len(metadata_df)} records.")

# ==============================
# 3. Scan existing Persistent images by UUID
# ==============================
print("🔍 Checking existing images in Persistent_Landing...")

# The capture group is the file name (without ".jpg") of an object stored
# below the persistent images prefix; it is used as the image UUID.
persisted_image_pattern = re.compile(rf"{PERSIST_PREFIX}/images/.+?/([a-f0-9\-]+)\.jpg")
persisted_listing = client.list_objects(
    ROOT_BUCKET, prefix=f"{PERSIST_PREFIX}/images/", recursive=True
)
uuid_hits = (persisted_image_pattern.match(item.object_name) for item in persisted_listing)
existing_persistent_uuids = {hit.group(1) for hit in uuid_hits if hit}

print(f"📦 Found {len(existing_persistent_uuids)} existing images in Persistent_Landing.")

# Column that will receive the destination key of every image that gets moved.
metadata_df["persistent_path"] = None

# ==============================
# 4. Move images from Temporal -> Persistent
# ==============================
def _path_component(value):
    """Return *value* as a folder-safe name, or "Unknown" when it is missing.

    "Missing" covers both an absent column (None) and an empty CSV cell, which
    pandas loads as NaN and which str() would otherwise turn into a "nan"
    folder.
    """
    if value is None or pd.isna(value):
        return "Unknown"
    return str(value).replace(" ", "_")


timestamp = datetime.now().strftime("%Y_%m_%d_%H:%M")
moved_records = []

for idx, row in metadata_df.iterrows():
    # Read the identifiers before the try block so that the error message
    # below always refers to the row that is being processed.
    img_uuid = row.get("uuid")
    object_name = row.get("temporal_path")
    src_path = object_name

    try:
        # An empty CSV cell is NaN, which is truthy, so it needs its own check.
        if pd.isna(img_uuid) or not img_uuid:
            continue

        # Skip if image already exists in Persistent
        if img_uuid in existing_persistent_uuids:
            print(f"⏩ Skipping duplicate UUID: {img_uuid}")
            continue

        if pd.isna(src_path) or not src_path:
            print(f"⚠️ Error processing {object_name}: missing temporal_path")
            continue

        kingdom = _path_component(row.get("kingdom"))
        cls = _path_component(row.get("class"))
        family = _path_component(row.get("family"))
        specie = _path_component(row.get("species"))

        # Destination path
        dest_path = f"{PERSIST_PREFIX}/images/{kingdom}/{cls}/{family}/{specie}/{img_uuid}.jpg"

        # Download image from Temporal; the connection is released even when
        # reading the body fails.
        response = client.get_object(ROOT_BUCKET, src_path)
        try:
            image_bytes = response.read()
        finally:
            response.close()
            response.release_conn()

        # Upload image to Persistent
        client.put_object(
            ROOT_BUCKET,
            dest_path,
            data=io.BytesIO(image_bytes),
            length=len(image_bytes),
            content_type="image/jpeg",
        )

        # Record the move only once the upload has succeeded, so the metadata
        # never points at an object that was not stored.
        metadata_df.loc[idx, "persistent_path"] = dest_path
        moved_records.append(row)
        existing_persistent_uuids.add(img_uuid)

        print(f"✅ Moved {src_path} → {dest_path}")

    except Exception as e:
        # Best effort: one failing image must not stop the remaining ones.
        print(f"⚠️ Error processing {object_name}: {e}")
        continue

# ==============================
# 5. Save or merge metadata files by (Kingdom, Class)
# One metadata CSV is kept per class of a kingdom, so that classes or kingdoms
# stored in the future each get their own separate file for easy access.
# Errors in this section are deliberately not caught: a failure stops the
# script before the Temporal_Landing cleanup below can run.
# ==============================
def _read_csv_object(object_path):
    """Download a CSV object from ROOT_BUCKET and return it as a DataFrame."""
    response = client.get_object(ROOT_BUCKET, object_path)
    try:
        return pd.read_csv(io.BytesIO(response.read()))
    finally:
        response.close()
        response.release_conn()


def _taxon_labels(frame, column):
    """Return ``frame[column]`` with missing values replaced by "Unknown".

    groupby() drops rows whose key is NaN, so the keys are filled first to
    keep those rows in a metadata file.
    """
    values = frame[column].astype(object)
    return values.where(values.notna(), "Unknown")


if not metadata_df.empty:
    print("🧠 Updating kingdom-class based metadata files...")
    persistent_metadata_dir = f"{PERSIST_PREFIX}/metadata/"

    # Only describe images that really are in Persistent_Landing: rows whose
    # copy failed (or that had no UUID) are left out of the metadata files.
    persisted_df = metadata_df[metadata_df["uuid"].isin(existing_persistent_uuids)]
    group_keys = [
        _taxon_labels(persisted_df, "kingdom"),
        _taxon_labels(persisted_df, "class"),
    ]

    for (kingdom_name, cls_name), group in persisted_df.groupby(group_keys):
        kingdom_safe = str(kingdom_name).replace(" ", "_")
        cls_safe = str(cls_name).replace(" ", "_")

        #  create a new filename with current timestamp for updated metadata
        timestamp = datetime.now().strftime("%Y_%m_%d_%H:%M")
        metadata_filename = f"{kingdom_safe}_{cls_safe}_metadata_{timestamp}.csv"
        persistent_metadata_path_new = f"{persistent_metadata_dir}{metadata_filename}"

        # Look for any existing metadata file for this kingdom+class. The
        # names are escaped so that regex metacharacters in them are literal.
        file_pattern = re.compile(
            re.escape(f"{persistent_metadata_dir}{kingdom_safe}_{cls_safe}_metadata_")
            + r".*\.csv"
        )
        existing_metadata_files = [
            obj.object_name
            for obj in client.list_objects(
                ROOT_BUCKET, prefix=persistent_metadata_dir, recursive=True
            )
            if file_pattern.match(obj.object_name)
        ]

        existing_metadata_path = None
        if existing_metadata_files:
            # Take the latest metadata file; the zero-padded timestamp makes
            # the lexicographic maximum the most recent one.
            existing_metadata_path = max(existing_metadata_files)
            existing_df = _read_csv_object(existing_metadata_path)

            # Merge Temporal rows that are not already in Persistent
            new_rows = group[~group["uuid"].isin(existing_df["uuid"])]
            merged_df = pd.concat([existing_df, new_rows], ignore_index=True)
        else:
            # No existing metadata -> use all rows from Temporal
            merged_df = group

        # Save merged metadata with updated timestamp
        payload = merged_df.to_csv(index=False).encode("utf-8")
        client.put_object(
            ROOT_BUCKET,
            persistent_metadata_path_new,
            data=io.BytesIO(payload),
            length=len(payload),
            content_type="text/csv",
        )

        # Remove the superseded file only after the new one is stored, so a
        # failed upload cannot lose the existing metadata. A rerun within the
        # same minute produces the same name, which was just overwritten.
        if existing_metadata_path and existing_metadata_path != persistent_metadata_path_new:
            client.remove_object(ROOT_BUCKET, existing_metadata_path)

        print(f"📤 Updated/uploaded metadata for '{kingdom_safe}-{cls_safe}' → {persistent_metadata_path_new}")

else:
    print("⚠️ No new images moved; skipping metadata upload.")

# Cleanup local temp (in both branches, so the download never lingers)
if os.path.exists(LOCAL_METADATA):
    os.remove(LOCAL_METADATA)
    print("🧹 Cleaned up local metadata files.")

# ==============================
# 6. Cleanup Temporal Landing (files only)
# Source images whose UUID is not present in Persistent_Landing are kept,
# together with the temporal metadata CSV, so that a later run can retry them
# instead of losing the data.
# ==============================
print("🧹 Cleaning up Temporal_Landing zone (files only)...")

try:
    temporal_objects = list(client.list_objects(ROOT_BUCKET, prefix=f"{TEMP_PREFIX}/", recursive=True))
    if not temporal_objects:
        print("⚠️ Temporal_Landing is already empty.")
    else:
        # Rows that have a UUID but whose image is not in Persistent_Landing
        # can still be retried; rows without a UUID are never moved by this
        # script, so their files are cleaned up as before.
        retryable = metadata_df["uuid"].notna() & ~metadata_df["uuid"].isin(existing_persistent_uuids)
        protected = set(metadata_df.loc[retryable, "temporal_path"].dropna())

        present = {obj.object_name for obj in temporal_objects}
        if protected & present:
            # The retry needs the metadata file, so keep it as well.
            protected.add(TEMP_METADATA_PATH)

        folder_markers = {f"{TEMP_PREFIX}/", f"{TEMP_PREFIX}"}
        deleted_count = 0
        kept_count = 0
        for obj in temporal_objects:
            # Skip the temporal-landing folder itself
            if obj.object_name in folder_markers:
                continue

            if obj.object_name in protected:
                print(f"⏭️ Kept (not yet in Persistent_Landing): {obj.object_name}")
                kept_count += 1
                continue

            client.remove_object(ROOT_BUCKET, obj.object_name)
            print(f"🗑️ Deleted file: {obj.object_name}")
            deleted_count += 1

        print(f"✅ Cleaned up {deleted_count} files from Temporal-Landing (folders kept).")
        if kept_count:
            print(f"⚠️ Kept {kept_count} files in Temporal-Landing for a later retry.")
except Exception as e:
    # Best effort: a failed cleanup only leaves files behind in the temporal zone.
    print(f"⚠️ Warning: Failed to fully clean Temporal_Landing → {e}")



print("✅ Persistent Landing Zone completed successfully.")



📥 Loading metadata from Temporal_Landing...
✅ Loaded metadata with 1817 records.
🔍 Checking existing images in Persistent_Landing...
📦 Found 3115 existing images in Persistent_Landing.
✅ Moved temporal-landing/images/eb0a490a-ed13-4dbc-9653-7d0bf82149d0.jpg → persistent_landing/images/Animalia/Squamata/Viperidae/catenatus/eb0a490a-ed13-4dbc-9653-7d0bf82149d0.jpg
✅ Moved temporal-landing/images/67d697dc-a59d-498b-8fc2-c8d2f98da3ba.jpg → persistent_landing/images/Animalia/Squamata/Colubridae/amoenus/67d697dc-a59d-498b-8fc2-c8d2f98da3ba.jpg
✅ Moved temporal-landing/images/4d5a2761-1837-4fb6-bff9-808bf3ed0636.jpg → persistent_landing/images/Animalia/Squamata/Colubridae/obsoletus/4d5a2761-1837-4fb6-bff9-808bf3ed0636.jpg
✅ Moved temporal-landing/images/709ae45d-4ef0-4d2f-b208-7369dafbcab8.jpg → persistent_landing/images/Animalia/Squamata/Colubridae/spiloides/709ae45d-4ef0-4d2f-b208-7369dafbcab8.jpg
✅ Moved temporal-landing/images/aaadd699-6450-43a0-a9ee-101c18498dd9.jpg → persistent_landing/

🧹 Cleaning up Temporal_Landing zone (files only)...
🗑️ Deleted file: temporal-landing/.init
🗑️ Deleted file: temporal-landing/images/0004e8f7-2c8d-4342-8c04-89852367a2d7.jpg
🗑️ Deleted file: temporal-landing/images/00269ea4-650c-42e9-88a8-1c00656e78d8.jpg
🗑️ Deleted file: temporal-landing/images/00636035-da4f-4453-be61-170cca45ccca.jpg
🗑️ Deleted file: temporal-landing/images/00aac900-8316-4db0-9995-7bea17d6d9d3.jpg
🗑️ Deleted file: temporal-landing/images/00b58a52-ff0f-4d5f-96e1-6f3a85d46fc0.jpg
🗑️ Deleted file: temporal-landing/images/01568de4-e3c9-4c9e-87ca-e8c848b54aad.jpg
🗑️ Deleted file: temporal-landing/images/0158760c-c915-4e0f-874d-a19dfbee64e5.jpg
🗑️ Deleted file: temporal-landing/images/019a61a6-c79c-419e-a6f9-3c37379aef9b.jpg
🗑️ Deleted file: temporal-landing/images/01e97792-ca98-49f8-8cca-81094c957ed0.jpg
🗑️ Deleted file: temporal-landing/images/023ffe86-2f1c-4a11-ad4d-93fc2827a91d.jpg
🗑️ Deleted file: temporal-landing/images/02595bb3-474a-4f62-be9a-f44090d9ed1d.jpg
🗑️ Del