In [1]:
"""
-Arman Bazarchi-
Exploitation Zone — Metadata notebook

 - Read the latest trusted metadata CSV from the trusted-zone bucket
 - Merge the values of the needed columns of each row into a single string for embedding
 - We keep the taxonomy up to kingdom so the model can hold and integrate data from different kingdoms
 - (Animalia, Plantae, Fungi)
 - Create text embeddings for the text metadata using ChromaDB
 - Avoid duplicates by checking existing UUIDs in the collection
 - Store embeddings persistently for similarity search
 - Embeddings are stored in the Chroma directory 'exploitation_db', in the 'metadata_embeddings' collection.
"""


from minio import Minio
import pandas as pd
import chromadb
from chromadb.utils import embedding_functions
import os, io
from datetime import datetime

# -----------------------
# 1. Configuration
# -----------------------
# MinIO connection settings. Each value can be overridden through an
# environment variable so real credentials do not have to live in the
# notebook; the fallbacks are the local development defaults.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "localhost:9000")
ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "admin")
SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "password123")

# Bucket and key prefix that hold the trusted metadata CSV files.
TRUSTED_ZONE = "trusted-zone"
TRUSTED_META_PREFIX = "metadata/"

# ChromaDB storage directory and the collection that receives the embeddings.
CHROMA_DB_DIR = "exploitation_db"
COLLECTION_NAME = "metadata_embeddings"


#  Connect to MinIO
client = Minio(MINIO_ENDPOINT, access_key=ACCESS_KEY, secret_key=SECRET_KEY, secure=False)

# Fail fast if the trusted-zone bucket is missing
if not client.bucket_exists(TRUSTED_ZONE):
    raise SystemExit(f"⚠️ Trusted zone bucket '{TRUSTED_ZONE}' does not exist. Cannot continue.")


# List all metadata CSVs from trusted-zone, sorted in reverse lexicographic
# order so the first entry is the "latest" one.
# NOTE(review): this assumes object names embed a sortable timestamp
# (e.g. trusted_metadata_YYYY_MM_DD_HH_MM_SS.csv) — confirm with the producer.
metadata_objs = sorted(
    (
        obj.object_name
        for obj in client.list_objects(TRUSTED_ZONE, prefix=TRUSTED_META_PREFIX, recursive=True)
        if obj.object_name.lower().endswith(".csv")
    ),
    reverse=True,
)

# Stop if no metadata is available
if not metadata_objs:
    raise SystemExit("⚠️ No trusted metadata files found in trusted-zone.")

# Use the latest trusted metadata
latest_meta = metadata_objs[0]
print(f"📂 Loading trusted metadata: {latest_meta}")

# Download to memory. The response is closed and its connection released in a
# finally clause so a failed read does not leak the connection.
resp = client.get_object(TRUSTED_ZONE, latest_meta)
try:
    data = resp.read()
finally:
    resp.close()
    resp.release_conn()
metadata_df = pd.read_csv(io.BytesIO(data))
print(f"✅ Loaded trusted metadata with {len(metadata_df)} rows.")

# -----------------------
# 2. Combine text columns for embedding
# -----------------------
text_cols = ["kingdom", "phylum", "class", "order", "family", "genus", "species", "scientific_name", "common"]

# Fail with a clear message (rather than a bare KeyError) when the CSV lacks a
# column this step relies on.
missing_cols = [col for col in ["uuid", *text_cols] if col not in metadata_df.columns]
if missing_cols:
    raise SystemExit(f"⚠️ Trusted metadata is missing required columns: {missing_cols}")

# Blank out missing values, then cast everything to str: " ".join raises a
# TypeError if any of these columns was parsed as a non-string type.
metadata_df["combined_text"] = (
    metadata_df[text_cols]
    .fillna("")
    .astype(str)
    .agg(" ".join, axis=1)
    .str.strip()
)

# Drop rows without a UUID or without any text. combined_text is never NaN
# after the join above, so only the empty-string check is needed for it.
metadata_df = metadata_df.dropna(subset=["uuid"])
metadata_df = metadata_df[metadata_df["combined_text"].str.len() > 0]
print(f"🧹 Cleaned metadata for embedding: {len(metadata_df)} valid rows.")


# Default text embedding function, used later to vectorise the combined text
text_embedder = embedding_functions.DefaultEmbeddingFunction()

# Open the persistent ChromaDB store located at CHROMA_DB_DIR
chroma_client = chromadb.PersistentClient(path=CHROMA_DB_DIR)

# Fetch the embeddings collection, creating it if it does not exist yet
collection = chroma_client.get_or_create_collection(
    COLLECTION_NAME,
    metadata=dict(description="Embeddings for trusted metadata records"),
)

# -----------------------
# 3. Avoid duplicates — check existing UUIDs
# -----------------------
existing_count = collection.count()

# Page through the collection and gather the ids (record UUIDs) already stored.
existing_ids = set()
if existing_count > 0:
    batch_size = 500
    offset = 0
    while True:
        # include=[] requests ids only, so documents and metadatas are not
        # loaded just to be thrown away.
        batch = collection.get(limit=batch_size, offset=offset, include=[])
        if not batch["ids"]:
            break
        existing_ids.update(batch["ids"])
        offset += batch_size

print(f"📊 Existing embeddings in collection: {len(existing_ids)}")

# Keep only rows whose UUID is not stored yet. Duplicate UUIDs inside the CSV
# are collapsed to their first occurrence, because adding the same id twice in
# one call is rejected by the collection.
new_df = metadata_df[~metadata_df["uuid"].isin(existing_ids)]
new_df = new_df.drop_duplicates(subset="uuid", keep="first")
print(f"🆕 New records to embed: {len(new_df)}")

# -----------------------
# 4. Generate and add embeddings
# -----------------------
if new_df.empty:
    print("⚠️ No new metadata to embed. No data was added.")
else:
    texts = new_df["combined_text"].tolist()
    uuids = new_df["uuid"].tolist()
    metadatas = new_df.to_dict(orient="records")

    print("🧠 Generating embeddings...")

    # Embed and insert in fixed-size chunks: a single add() call with every
    # row fails once the row count exceeds the collection's maximum batch
    # size, and chunking also bounds the number of embeddings held in memory.
    # If a later chunk fails, the chunks already stored are skipped on the
    # next run by the UUID check in step 3.
    add_batch_size = 1000
    for start in range(0, len(uuids), add_batch_size):
        stop = start + add_batch_size
        batch_texts = texts[start:stop]
        collection.add(
            ids=uuids[start:stop],
            embeddings=text_embedder(batch_texts),
            metadatas=metadatas[start:stop],
            documents=batch_texts,
        )

    print(f"✅ Added {len(uuids)} new embeddings to collection '{COLLECTION_NAME}'.")


#  Summary: compare the collection size now with the size measured before
#  this run to report how many records were added.
final_count = collection.count()
added_count = final_count - existing_count

summary_lines = [
    "\n📊 ===== Summary =====",
    f"🗂️ Collection: {COLLECTION_NAME}",
    f"🔹 Previously had: {existing_count}",
    f"🔹 New added: {added_count}",
    f"🔹 Total now: {final_count}",
    "=======================",
    "✅ Exploitation Metadata processing complete.",
]
print("\n".join(summary_lines))


📂 Loading trusted metadata: metadata/trusted_metadata_2025_10_20_00_18_19.csv
✅ Loaded trusted metadata with 4932 rows.
🧹 Cleaned metadata for embedding: 4932 valid rows.
📊 Existing embeddings in collection: 0
🆕 New records to embed: 4932
🧠 Generating embeddings...
✅ Added 4932 new embeddings to collection 'metadata_embeddings'.

📊 ===== Summary =====
🗂️ Collection: metadata_embeddings
🔹 Previously had: 0
🔹 New added: 4932
🔹 Total now: 4932
✅ Exploitation Metadata processing complete.
