In [4]:
"""
-Arman Bazarchi-
Exploitation Zone — Metadata notebook
Goal:
 - Read trusted metadata CSV(s) from trusted-zone/metadata/
 - Create text embeddings for metadata using ChromaDB
 - Avoid duplicates by checking existing uuids in the collection
 - Store embeddings persistently for similarity search
"""

# -----------------------
# 1️⃣ Imports & Setup
# -----------------------
from minio import Minio
import pandas as pd
import chromadb
from chromadb.utils import embedding_functions
import os, io
from datetime import datetime

# -----------------------
# 2️⃣ Configuration
# -----------------------
# MinIO connection settings. Each value can be overridden through an
# environment variable so credentials do not have to be edited into the
# notebook; the fallbacks are the original local-development values.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "localhost:9000")
ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "admin")
SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "password123")

# Bucket and object-name prefix under which the trusted metadata CSVs live.
TRUSTED_ZONE = "trusted-zone"
TRUSTED_META_PREFIX = "metadata/"

# Directory of the persistent ChromaDB store and the collection used in it.
CHROMA_DB_DIR = "exploitation_db"
COLLECTION_NAME = "metadata_embeddings"

# -----------------------
# 3️⃣ Connect to MinIO
# -----------------------
# Plain-HTTP client (secure=False) built from the configuration constants.
client = Minio(MINIO_ENDPOINT, access_key=ACCESS_KEY, secret_key=SECRET_KEY, secure=False)

# Nothing can be read without the trusted bucket, so stop the run early.
if not client.bucket_exists(TRUSTED_ZONE):
    raise SystemExit(f"‚ö†Ô∏è Trusted zone bucket '{TRUSTED_ZONE}' does not exist. Cannot continue.")


# Collect the names of every CSV object under the metadata prefix.
metadata_objs = [
    obj.object_name for obj in client.list_objects(TRUSTED_ZONE, prefix=TRUSTED_META_PREFIX, recursive=True)
    if obj.object_name.lower().endswith(".csv")
]

if not metadata_objs:
    raise SystemExit("‚ö†Ô∏è No trusted metadata files found in trusted-zone.")

# "Latest" is decided by object name: after a descending sort the first entry
# wins. NOTE(review): this is only chronological if the names embed a
# sortable timestamp (e.g. trusted_metadata_YYYY_MM_DD_HH_MM_SS.csv) — confirm.
metadata_objs.sort(reverse=True)
latest_meta = metadata_objs[0]
print(f"üìÇ Loading trusted metadata: {latest_meta}")

# Download the object into memory. The response is closed and its connection
# released in `finally` so a failed read does not leak the pooled connection.
resp = client.get_object(TRUSTED_ZONE, latest_meta)
try:
    data = resp.read()
finally:
    resp.close()
    resp.release_conn()

# Parse the downloaded bytes as CSV.
metadata_df = pd.read_csv(io.BytesIO(data))
print(f"‚úÖ Loaded trusted metadata with {len(metadata_df)} rows.")

# -----------------------
# 4️⃣ Combine text columns for embedding
# -----------------------
# Taxonomy / naming columns that are concatenated into the text to embed.
text_cols = ["kingdom", "phylum", "class", "order", "family", "genus", "species", "scientific_name", "common"]

# Missing values become empty strings, and every value is cast to str before
# joining: " ".join raises TypeError on any non-string cell, which happens
# when pandas infers a numeric dtype for one of these columns.
metadata_df["combined_text"] = (
    metadata_df[text_cols]
    .fillna("")
    .astype(str)
    .agg(" ".join, axis=1)
    .str.strip()
)

# Drop rows without valid UUID or text
metadata_df = metadata_df.dropna(subset=["uuid", "combined_text"])
metadata_df = metadata_df[metadata_df["combined_text"].str.len() > 0]

# The uuid is later used as the ChromaDB record id, and ids must be unique
# within a single add() call, so keep only the first row for each uuid.
metadata_df = metadata_df.drop_duplicates(subset="uuid", keep="first")
print(f"üßπ Cleaned metadata for embedding: {len(metadata_df)} valid rows.")

# -----------------------
# 5️⃣ Connect to ChromaDB (persistent)
# -----------------------
# Open the on-disk ChromaDB store located at CHROMA_DB_DIR.
chroma_client = chromadb.PersistentClient(path=CHROMA_DB_DIR)

# Fetch the embeddings collection, creating it on first use.
collection = chroma_client.get_or_create_collection(
    COLLECTION_NAME,
    metadata=dict(description="Embeddings for trusted metadata records"),
)

# Chroma's default embedding function, called explicitly when embedding text.
text_embedder = embedding_functions.DefaultEmbeddingFunction()

# -----------------------
# 6️⃣ Avoid duplicates — check existing UUIDs
# -----------------------
# Record count before this run; the summary section reports against it.
existing_count = collection.count()

# Page through the collection to collect every stored id.
existing_ids = set()
if existing_count > 0:
    batch_size = 500
    offset = 0
    while True:
        # include=[] asks for ids only; without it every page would also
        # carry the documents and metadata of each record.
        batch = collection.get(limit=batch_size, offset=offset, include=[])
        page_ids = batch["ids"]
        if not page_ids:
            break
        existing_ids.update(page_ids)
        offset += len(page_ids)
        # A short page is the last one, so skip the extra empty request.
        if len(page_ids) < batch_size:
            break

print(f"üìä Existing embeddings in collection: {len(existing_ids)}")

# Keep only rows whose uuid is not stored yet. Collection ids are strings,
# so compare on the string form of the uuid column.
new_df = metadata_df[~metadata_df["uuid"].astype(str).isin(existing_ids)]
print(f"üÜï New records to embed: {len(new_df)}")

# -----------------------
# 7️⃣ Generate and add embeddings
# -----------------------
if new_df.empty:
    print("‚ö†Ô∏è No new metadata to embed. No data was added.")
else:
    texts = new_df["combined_text"].tolist()
    # ChromaDB ids must be strings.
    uuids = new_df["uuid"].astype(str).tolist()
    # One metadata dict per row, holding every column of the row.
    metadatas = new_df.to_dict(orient="records")

    print("üß† Generating embeddings...")

    # Embed and store in fixed-size batches: a single add() with every row
    # can exceed ChromaDB's maximum batch size on large inputs, and batching
    # also bounds the memory held by the embedding vectors. Batches that were
    # already written are skipped on a rerun by the existing-id check.
    add_batch_size = 500
    for start in range(0, len(uuids), add_batch_size):
        stop = start + add_batch_size
        chunk_texts = texts[start:stop]
        collection.add(
            ids=uuids[start:stop],
            embeddings=text_embedder(chunk_texts),
            metadatas=metadatas[start:stop],
            documents=chunk_texts,
        )

    print(f"‚úÖ Added {len(uuids)} new embeddings to collection '{COLLECTION_NAME}'.")

# -----------------------
# 8️⃣ Summary
# -----------------------
# Compare the collection size now with the size recorded before the run.
final_count = collection.count()
added_count = final_count - existing_count

# Report lines, printed one per call in this order.
summary_lines = [
    "\nüìä ===== Summary =====",
    f"üóÇÔ∏è Collection: {COLLECTION_NAME}",
    f"üîπ Previously had: {existing_count}",
    f"üîπ New added: {added_count}",
    f"üîπ Total now: {final_count}",
    "=======================",
    "‚úÖ Exploitation Metadata processing complete.",
]
for line in summary_lines:
    print(line)


üìÇ Loading trusted metadata: metadata/trusted_metadata_2025_10_13_02_48_03.csv
‚úÖ Loaded trusted metadata with 736 rows.
üßπ Cleaned metadata for embedding: 736 valid rows.
üìä Existing embeddings in collection: 736
üÜï New records to embed: 0
‚ö†Ô∏è No new metadata to embed. No data was added.

üìä ===== Summary =====
üóÇÔ∏è Collection: metadata_embeddings
üîπ Previously had: 736
üîπ New added: 0
üîπ Total now: 736
‚úÖ Exploitation Metadata processing complete.
