In [17]:
"""
-Arman Bazarchi-
Exploitation Zone — Images notebook
Goal:
 - Read images from trusted-zone/images/
 - Only embed images whose UUID exists in metadata_embeddings
 - Avoid duplicates by checking existing uuids in image_embeddings collection
 - Store embeddings persistently for similarity search
"""

# -----------------------
# 1️⃣ Imports & Setup
# -----------------------
from minio import Minio
from PIL import Image
import io, os, re, tempfile
import chromadb
from datetime import datetime
from langchain_experimental.open_clip import OpenCLIPEmbeddings  # ✅ NEW import

# -----------------------
# 2️⃣ Configuration
# -----------------------
# MinIO connection settings. Each value can be overridden through an
# environment variable so credentials do not have to live in the notebook;
# the defaults are the values this notebook has always used.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "localhost:9000")
ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "admin")
SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "password123")

# Bucket and key prefix that hold the images to embed.
TRUSTED_ZONE = "trusted-zone"
IMAGES_PREFIX = "images/"

# On-disk location of the persistent ChromaDB store and the name of the
# collection that receives the image embeddings.
CHROMA_DB_DIR = "exploitation_db"
IMAGE_COLLECTION_NAME = "image_embeddings"

# -----------------------
# 3️⃣ Connect to MinIO & check Trusted Zone
# -----------------------
# Plain (non-TLS) connection using the endpoint and credentials configured above.
client = Minio(
    MINIO_ENDPOINT,
    access_key=ACCESS_KEY,
    secret_key=SECRET_KEY,
    secure=False,
)

# Nothing can be embedded without the trusted-zone bucket, so stop right away
# when it is missing.
trusted_zone_present = client.bucket_exists(TRUSTED_ZONE)
if trusted_zone_present is False or not trusted_zone_present:
    raise SystemExit(f"⚠️ Trusted zone bucket '{TRUSTED_ZONE}' does not exist. Cannot continue.")

# -----------------------
# 4️⃣ Connect to ChromaDB and collections
# -----------------------
# Persistent client: the store lives on disk under CHROMA_DB_DIR.
chroma_client = chromadb.PersistentClient(path=CHROMA_DB_DIR)

# Collection of metadata embeddings; its IDs are the set of UUIDs that are
# allowed to receive an image embedding.
metadata_collection = chroma_client.get_or_create_collection(
    name="metadata_embeddings",
    metadata=dict(description="Embeddings for trusted metadata records"),
)

# Target collection for the image embeddings produced by this notebook.
image_collection = chroma_client.get_or_create_collection(
    name=IMAGE_COLLECTION_NAME,
    metadata=dict(description="Embeddings for trusted images"),
)

# -----------------------
# 5️⃣ Get existing UUIDs from metadata_embeddings
# -----------------------
# The IDs collected here act as the allow-list: an image is embedded only if
# its UUID is present in the metadata collection.
meta_existing_count = metadata_collection.count()
meta_ids = set()

if meta_existing_count > 0:
    batch_size = 500
    offset = 0
    while True:
        # include=[] requests IDs only. Without it every page would also load
        # the stored documents and metadatas, which are never read here.
        batch = metadata_collection.get(limit=batch_size, offset=offset, include=[])
        if not batch["ids"]:
            break  # walked past the last page
        meta_ids.update(batch["ids"])
        offset += batch_size

if not meta_ids:
    raise SystemExit("⚠️ No UUIDs found in metadata_embeddings. Cannot proceed with image embedding.")
print(f"📊 UUIDs available from metadata_embeddings: {len(meta_ids)}")

# -----------------------
# 6️⃣ Get existing UUIDs from image_embeddings
# -----------------------
# IDs already present in the image collection; they are skipped later so a
# re-run of the notebook does not embed the same image twice.
image_existing_count = image_collection.count()
existing_image_ids = set()

if image_existing_count > 0:
    batch_size = 500
    offset = 0
    while True:
        # include=[] requests IDs only, so the stored documents and metadatas
        # are not loaded just to be thrown away.
        batch = image_collection.get(limit=batch_size, offset=offset, include=[])
        if not batch["ids"]:
            break  # walked past the last page
        existing_image_ids.update(batch["ids"])
        offset += batch_size

print(f"📊 Existing embeddings in image collection: {len(existing_image_ids)}")

# -----------------------
# 6️⃣ List images in trusted-zone/images/
# -----------------------
trusted_objects = list(client.list_objects(TRUSTED_ZONE, prefix=IMAGES_PREFIX, recursive=True))
print(f"📂 Scanned Trusted Zone images: found {len(trusted_objects)} files under '{IMAGES_PREFIX}'.")

# Extract UUIDs and filter: must exist in metadata, not already embedded.
# The pattern accepts object names ending in "/<36 hex-or-dash chars>.<ext>";
# group 1 is the UUID part of the file name.
uuid_re = re.compile(r".*/([a-f0-9\-]{36})\.\w+$", flags=re.I)
candidates = []
queued_ids = set()  # UUIDs already placed in `candidates` during this scan
for obj in trusted_objects:
    m = uuid_re.search(obj.object_name)
    if not m:
        continue
    uid = m.group(1)
    if uid not in meta_ids:      # skip images not in metadata
        continue
    if uid in existing_image_ids:  # skip already embedded images
        continue
    if uid in queued_ids:
        # The same UUID stored under two extensions (e.g. .jpg and .png) would
        # put a duplicate ID into the insert, so only the first file is kept.
        print(f"⚠️ Skipping {obj.object_name}: UUID {uid} is already queued from another file.")
        continue
    queued_ids.add(uid)
    candidates.append((uid, obj.object_name))

print(f"🎯 Candidates for embedding: {len(candidates)} images (present in metadata, not yet embedded).")

# -----------------------
# 7️⃣ Generate embeddings and store
# -----------------------
# Number of rows sent to Chroma per add() call. Inserting in bounded batches
# keeps each call small (Chroma limits how many records one call may carry —
# the exact limit depends on the installation) and persists progress as the
# run advances, so an interrupted run can resume from what was already stored.
_INSERT_BATCH_SIZE = 256


def _download_object_bytes(minio_client, bucket: str, object_name: str) -> bytes:
    """Return the full content of one MinIO object.

    The HTTP response is closed and its connection released even when the
    read fails, so a bad object cannot leak a pooled connection.
    """
    response = minio_client.get_object(bucket, object_name)
    try:
        return response.read()
    finally:
        response.close()
        response.release_conn()


def _embed_image_bytes(embedder, img_bytes: bytes, suffix: str):
    """Embed raw image bytes by handing the embedder a temporary file path.

    The bytes are written to a named temp file, which is closed before the
    embedder is called so it can be reopened by path on every platform.
    The file is removed afterwards whether or not embedding succeeds.

    Returns the first element of ``embedder.embed_image([path])``.
    """
    tmp_file = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
    tmp_path = tmp_file.name
    try:
        with tmp_file:
            tmp_file.write(img_bytes)
        return embedder.embed_image([tmp_path])[0]
    finally:
        try:
            os.remove(tmp_path)
        except OSError as cleanup_err:
            # Cleanup is best-effort; report it without masking the real outcome.
            print(f"⚠️ Could not remove temp file {tmp_path}: {cleanup_err}")


def _add_image_batch(collection, ids, paths, vectors) -> None:
    """Insert one batch of embeddings; each object path is stored as document and metadata."""
    collection.add(
        ids=ids,
        embeddings=vectors,
        metadatas=[{"path": p} for p in paths],
        documents=paths,
    )


if not candidates:
    print("⚠️ No new images to embed. Exiting.")
else:
    # Initialize OpenCLIP embedder ✅
    clip_embd = OpenCLIPEmbeddings(
        model_name="ViT-B-32",              # or "ViT-g-14" for higher quality
        checkpoint="laion2b_s34b_b79k"
    )

    uuids, docs, embeddings = [], [], []
    inserted = 0  # how many leading entries of the lists are already in Chroma

    for uid, path in candidates:
        try:
            img_bytes = _download_object_bytes(client, TRUSTED_ZONE, path)

            # Keep the object's own extension on the temp file; fall back to
            # ".jpg" (the previous fixed suffix) when the name has none.
            suffix = os.path.splitext(path)[1] or ".jpg"
            embedding = _embed_image_bytes(clip_embd, img_bytes, suffix)

            if embedding is None or len(embedding) == 0:
                print(f"⚠️ No embedding generated for {path}")
                continue

            # Store data for insertion
            uuids.append(uid)
            docs.append(path)
            embeddings.append(embedding)
            print(f"Success embedding of {uid}")

        except Exception as e:
            # Best-effort per image: one bad file must not stop the whole run.
            print(f"⚠️ Failed to process {path}: {e}")
            continue

        # Flush outside the try so a storage error is not mistaken for an
        # image-processing failure and silently skipped.
        if len(uuids) - inserted >= _INSERT_BATCH_SIZE:
            _add_image_batch(
                image_collection, uuids[inserted:], docs[inserted:], embeddings[inserted:]
            )
            inserted = len(uuids)

    # ✅ Add whatever is still pending to ChromaDB
    if len(uuids) > inserted:
        _add_image_batch(
            image_collection, uuids[inserted:], docs[inserted:], embeddings[inserted:]
        )
        inserted = len(uuids)

    if inserted:
        print(f"✅ Added {inserted} new image embeddings to collection '{IMAGE_COLLECTION_NAME}'.")
    else:
        print("⚠️ No valid embeddings generated.")

# -----------------------
# 8️⃣ Summary
# -----------------------
# Report the size of the image collection after this run, then the closing banner.
total_count = image_collection.count()
print(
    f"📊 Collection '{IMAGE_COLLECTION_NAME}' now contains {total_count} total records.",
    "✅ Exploitation Images processing complete.",
    sep="\n",
)


📊 UUIDs available from metadata_embeddings: 736
📊 Existing embeddings in image collection: 736
📂 Scanned Trusted Zone images: found 736 files under 'images/'.
🎯 Candidates for embedding: 0 images (present in metadata, not yet embedded).
⚠️ No new images to embed. Exiting.
📊 Collection 'image_embeddings' now contains 736 total records.
✅ Exploitation Images processing complete.
