In [1]:
"""
-Arman Bazarchi-
Exploitation Zone — Images notebook

 - Read images from trusted-zone/images/...
 - Only embed images whose UUID exists in metadata_embeddings to cross-check with text data
 - Avoid duplicates by checking existing uuids in image_embeddings collection
 - Store embeddings persistently for similarity search
 - Enrich image embeddings with metadata (species, family, and other columns that we need)
   to have easier access to this data

 - If we embed all images into a single collection, the collection becomes huge, but
   at query time the user can simply ask "what life is in this image?" and the search
   runs over the whole collection to find the images most similar to it.
   However,
   if we embed images into separate collections based on class, for example, then
   the user must ask "which reptile" or "which mammal", i.e. specify the class themselves.
   The query would then be faster because it only searches that specific class, but that
   is not aligned with our aim for this project, which is to ask "which animal is this?".

   It would make sense to separate them by kingdom (Animalia, Plantae, Fungi), since a user can
   certainly specify that much, but we are not aiming to store anything other than animals in
   this project. It would be the better choice for a bigger project!
   

# - removes any temporary file from local storage
"""


from minio import Minio
from PIL import Image
import io, os, re, tempfile
import chromadb
import pandas as pd
from datetime import datetime
from langchain_experimental.open_clip import OpenCLIPEmbeddings  

# -----------------------
# 1. Configuration
# -----------------------
def process_exploitation_images(
    MINIO = "localhost:9000",
    ACCESS_KEY = "admin",
    SECRET_KEY = "password123"):
    """Embed trusted-zone images with OpenCLIP and store them in ChromaDB.

    Steps:
      1. Connect to MinIO and load the newest ``metadata/*.csv`` from the
         trusted zone to build a UUID -> taxonomy lookup.
      2. Collect the ids already present in the ``metadata_embeddings`` and
         ``image_embeddings`` Chroma collections.
      3. Select images under ``images/`` whose UUID is in the metadata
         collection and not yet in the image collection.
      4. Embed each candidate and add it, with enriched metadata, to the
         image collection in batches so finished work survives a later failure.

    Args:
        MINIO: MinIO endpoint as ``host:port``.
        ACCESS_KEY: MinIO access key.
        SECRET_KEY: MinIO secret key.
            NOTE(review): the defaults are development credentials; pass real
            values from the environment or a secrets store in shared setups.

    Returns:
        None. Progress and a final summary are printed.

    Raises:
        SystemExit: if the trusted-zone bucket is missing, no metadata CSV is
            found, or the metadata collection holds no ids.
    """
    TRUSTED_ZONE = "trusted-zone"
    IMAGES_PREFIX = "images/"
    IMAGE_COLLECTION_NAME = "image_embeddings"
    # Number of embeddings buffered before they are written to Chroma.
    ADD_BATCH_SIZE = 256

    # set the working directory
    try:
        SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) # in orchestrated
    except NameError:
        SCRIPT_DIR = os.getcwd() # in notebook

    CHROMA_DB = os.path.join(SCRIPT_DIR, "../Exploitation-Zone/exploitation_db")

    #  Connect to MinIO & check Trusted Zone
    client = Minio(MINIO, access_key=ACCESS_KEY, secret_key=SECRET_KEY, secure=False)

    # raise error trusted zone does not exist
    if not client.bucket_exists(TRUSTED_ZONE):
        raise SystemExit(f" Trusted zone bucket '{TRUSTED_ZONE}' does not exist. Cannot continue.")

    # -----------------------
    # 2. Load metadata CSV for enrichment
    # -----------------------
    print(" Loading metadata CSV for enrichment...")

    metadata_objs = [
        obj.object_name for obj in client.list_objects(TRUSTED_ZONE, prefix="metadata/", recursive=True)
        if obj.object_name.lower().endswith(".csv")]

    if not metadata_objs:
        raise SystemExit(" No metadata CSV files found in trusted-zone.")

    # The "latest" file is the lexicographically greatest object name
    # (same choice as sorting in reverse and taking the first element).
    latest_meta = max(metadata_objs)
    print(f" Loading metadata: {latest_meta}")

    metadata_df = pd.read_csv(io.BytesIO(_read_minio_object(client, TRUSTED_ZONE, latest_meta)))
    metadata_lookup = _build_metadata_lookup(metadata_df)
    print(f" Loaded metadata for {len(metadata_lookup)} records")

    #  Connect to ChromaDB and collections
    chroma_client = chromadb.PersistentClient(path=CHROMA_DB)

    # Metadata embeddings collection
    metadata_collection = chroma_client.get_or_create_collection(
        name="metadata_embeddings",
        metadata={"description": "Embeddings for trusted metadata records"}
    )

    # Image embeddings collection
    image_collection = chroma_client.get_or_create_collection(
        name=IMAGE_COLLECTION_NAME,
        metadata={"description": "Embeddings for trusted images"}
    )

    # -----------------------
    # 3. Get existing UUIDs here 'ids' from metadata_embeddings
    # -----------------------
    meta_ids = _collect_ids(metadata_collection)
    if not meta_ids:
        raise SystemExit(" No UUIDs found in metadata_embeddings. Cannot proceed with image embedding.")
    print(f" UUIDs available from metadata_embeddings: {len(meta_ids)}")

    # -----------------------
    # 4. Get existing UUIDs from image_embeddings
    # -----------------------
    existing_image_ids = _collect_ids(image_collection)
    print(f" Existing embeddings in image collection: {len(existing_image_ids)}")

    #  List images in trusted-zone/images/
    trusted_objects = list(client.list_objects(TRUSTED_ZONE, prefix=IMAGES_PREFIX, recursive=True))
    print(f" Scanned Trusted Zone images: found {len(trusted_objects)} files under '{IMAGES_PREFIX}'.")

    # Extract UUIDs and filter: must exist in metadata, not already embedded
    uuid_re = re.compile(r".*/([a-f0-9\-]{36})\.\w+$", flags=re.I)
    candidates = []
    queued_ids = set()
    for obj in trusted_objects:
        m = uuid_re.search(obj.object_name)
        if not m:
            continue
        uid = m.group(1)
        if uid not in meta_ids:        # skip images not in metadata
            continue
        if uid in existing_image_ids:  # skip already embedded images
            continue
        if uid in queued_ids:
            # Same UUID stored under two object names (e.g. two extensions):
            # keep the first so the collection never receives duplicate ids.
            continue
        queued_ids.add(uid)
        candidates.append((uid, obj.object_name))

    print(f" Candidates for embedding: {len(candidates)} images (present in metadata, not yet embedded).")

    # -----------------------
    # 5. Generate embeddings and store
    # -----------------------
    if not candidates:
        print(" No new images to embed. Exiting.")
    else:
        #  OpenCLIP embedder
        clip_embd = OpenCLIPEmbeddings(
            model_name="ViT-B-32",              # we use ViT-B-32 because dataset is huge later can change to ViT-g-14 for higher quality
            checkpoint="laion2b_s34b_b79k"
        )

        uuids, docs, embeddings = [], [], []
        added_total = 0

        for uid, path in candidates:
            try:
                img_bytes = _read_minio_object(client, TRUSTED_ZONE, path)
                embedding = _embed_image_bytes(clip_embd, img_bytes, path)
            except Exception as e:
                # Best effort: one unreadable image must not abort the run.
                print(f" Failed to process {path}: {e}")
                continue

            if embedding is None or len(embedding) == 0:
                print(f" No embedding generated for {path}")
                continue

            # Buffer data for insertion
            uuids.append(uid)
            docs.append(path)
            embeddings.append(embedding)
            print(f"Success embedding of {uid}")

            if len(uuids) >= ADD_BATCH_SIZE:
                added_total += _flush_embeddings(image_collection, uuids, docs, embeddings, metadata_lookup)

        # Write whatever is left in the buffer.
        added_total += _flush_embeddings(image_collection, uuids, docs, embeddings, metadata_lookup)

        if added_total:
            print(f" Added {added_total} new image embeddings with enriched metadata to collection '{IMAGE_COLLECTION_NAME}'.")
        else:
            print(" No valid embeddings generated.")

    #  Summary
    total_count = image_collection.count()
    print(f"Collection '{IMAGE_COLLECTION_NAME}' now contains {total_count} total records.")
    print(" Exploitation Images processing complete.")


def _read_minio_object(client, bucket, object_name):
    """Return the full body of a MinIO object, always releasing the connection."""
    resp = client.get_object(bucket, object_name)
    try:
        return resp.read()
    finally:
        resp.close()
        resp.release_conn()


def _clean_meta_value(value):
    """Normalise a CSV cell to a plain str/int/float/bool; missing values become ''."""
    if value is None:
        return ""
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    if hasattr(value, "item"):
        # numpy scalar -> native Python scalar
        value = value.item()
    if isinstance(value, (str, int, float, bool)):
        return value
    return str(value)


def _build_metadata_lookup(metadata_df):
    """Map each non-missing UUID in the metadata frame to its enrichment fields."""
    # lookup key -> CSV column it is read from
    columns = {
        'species': 'species',
        'family': 'family',
        'class': 'class',
        'kingdom': 'kingdom',
        'path': 'formatted_path',
        'scientific_name': 'scientific_name',
        'common': 'common',
        'genus': 'genus',
        'order': 'order',
        'phylum': 'phylum',
    }
    lookup = {}
    for record in metadata_df.to_dict("records"):
        uuid_val = _clean_meta_value(record.get('uuid'))
        if uuid_val == "":
            continue
        # A later row with the same UUID overwrites an earlier one.
        lookup[str(uuid_val)] = {
            key: _clean_meta_value(record.get(column, ''))
            for key, column in columns.items()
        }
    return lookup


def _collect_ids(collection, page_size=500):
    """Page through a Chroma collection and return the set of all its ids."""
    ids = set()
    offset = 0
    while True:
        # include=[] skips documents/metadatas; only the ids are needed here.
        page = collection.get(limit=page_size, offset=offset, include=[])
        if not page["ids"]:
            break
        ids.update(page["ids"])
        offset += page_size
    return ids


def _embed_image_bytes(embedder, img_bytes, object_name):
    """Embed raw image bytes via a temp file that is removed even on failure."""
    suffix = os.path.splitext(object_name)[1] or ".jpg"
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp_file:
            tmp_path = tmp_file.name
            tmp_file.write(img_bytes)
        vectors = embedder.embed_image([tmp_path])
        return vectors[0] if vectors else None
    finally:
        if tmp_path is not None and os.path.exists(tmp_path):
            os.remove(tmp_path)


def _enriched_metadata(uid, path, metadata_lookup):
    """Build the Chroma metadata dict for one image from the CSV lookup."""
    meta_info = metadata_lookup.get(uid, {})
    return {
        "path": path,
        "species": meta_info.get('species', ''),
        "family": meta_info.get('family', ''),
        "class": meta_info.get('class', ''),
        "kingdom": meta_info.get('kingdom', ''),
        "scientific_name": meta_info.get('scientific_name', ''),
        "common": meta_info.get('common', ''),
        "genus": meta_info.get('genus', ''),
        "order": meta_info.get('order', ''),
        "phylum": meta_info.get('phylum', ''),
        # The lookup keeps the CSV path under 'path'; the previous
        # 'trusted_path' key was never populated, so this was always ''.
        "trusted_path": meta_info.get('path', ''),
    }


def _flush_embeddings(collection, uuids, docs, embeddings, metadata_lookup):
    """Add the buffered embeddings to the collection, empty the buffers, return the count."""
    if not uuids:
        return 0
    collection.add(
        ids=list(uuids),
        embeddings=list(embeddings),
        metadatas=[_enriched_metadata(uid, path, metadata_lookup) for uid, path in zip(uuids, docs)],
        documents=list(docs)
    )
    flushed = len(uuids)
    uuids.clear()
    docs.clear()
    embeddings.clear()
    return flushed
    
    
# Run the image-embedding pipeline with its default connection settings.
process_exploitation_images()
    


📥 Loading metadata CSV for enrichment...
📂 Loading metadata: metadata/trusted_metadata_2025_10_24_05_52_59.csv
✅ Loaded metadata for 6098 records
📊 UUIDs available from metadata_embeddings: 6098
📊 Existing embeddings in image collection: 4932
📂 Scanned Trusted Zone images: found 6098 files under 'images/'.
🎯 Candidates for embedding: 1166 images (present in metadata, not yet embedded).
Success embedding of 327bfdb5-c2ad-4cf6-a078-5a00cae3ddc4
Success embedding of e60f5bd1-b134-4dfe-83ea-fa08b882ce33
Success embedding of cc4447d6-6047-4e61-b202-4efa89f0c224
Success embedding of 38cc660f-9560-423d-aeb6-4edb7c286b99
Success embedding of 492425e8-6a9f-406d-8ca4-4ba2924f0a6f
Success embedding of 9f207d4c-cbe4-4771-814a-60c4be894605
Success embedding of a76d1023-a49a-4227-a12c-d0e0864ea165
Success embedding of b5d61b57-2cc2-49a9-9842-d9429348d397
Success embedding of b6403418-af4b-4059-90e3-4caaa4f8c333
Success embedding of b6b2d4b7-c631-4427-a22c-2a551238c0e7
Success embedding of c9686dd2-3