In [None]:
%load_ext autoreload
%autoreload 2

from dotenv import load_dotenv
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings

import utils.extractors as extractors
import utils.millers as millers

# Read key/value pairs from a local .env file (if one is found) into the process
# environment before any data is fetched.
# NOTE(review): presumably utils.extractors reads its connection settings from
# these variables — confirm against that module.
load_dotenv()

In [None]:
# Step 1: Load a sample of transactions to index (limit=1_000 is passed to the extractor).
df = extractors.fetch_sample_transactions(limit=1_000)

# Basic sanity check: fail fast and name whichever columns are absent.
# 'description' is the column the embedding and query cells actually read, so it
# is validated here instead of surfacing later as a KeyError.
# 'transaction_description' was already required by this check and is kept.
# NOTE(review): confirm whether both description columns are really expected.
required_columns = {'transaction_description', 'description', 'category_name'}
missing_columns = required_columns - set(df.columns)
assert not missing_columns, f"Missing required columns: {sorted(missing_columns)}"

In [None]:
# Step 2: Generate embeddings
# Name of the sentence-transformers checkpoint used for every embedding in this notebook.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Coerce every description to str so the encoder only ever receives text
# (note: this also stringifies any missing values rather than dropping them).
description_series = df['description'].astype(str)
descriptions = description_series.tolist()

# Encode all descriptions in one call; convert_to_numpy=True asks for a NumPy array back.
embeddings = model.encode(descriptions, convert_to_numpy=True)

In [None]:
# Step 3: Initialize Chroma DB as a persistent local store.
# The data is written under CHROMA_DB_PATH (relative to the working directory),
# so it outlives the kernel and is still there on the next run.
CHROMA_DB_PATH = "./chroma_db"
chroma_client = chromadb.PersistentClient(path=CHROMA_DB_PATH)

In [None]:
# Open the collection that holds the transaction embeddings, creating it on first use.
COLLECTION_NAME = "transactions"
collection = chroma_client.get_or_create_collection(name=COLLECTION_NAME)

In [None]:
# Build one metadata dict per transaction, in the same row order as df.
# reset_index() turns the frame's index into a regular column, so every record
# carries that index value alongside 'category_name'.
category_frame = df[['category_name']].reset_index()
metadata = category_frame.to_dict(orient='records')

# Preview the first ten records.
metadata[:10]


In [None]:
# Step 4: Insert into Chroma
# Each record is sent with an id, its precomputed embedding, the description text
# (stored as the document) and its metadata dict.

# The four inputs are parallel sequences; fail fast if an upstream cell was re-run
# on a different sample and they drifted apart.
assert len(descriptions) == len(embeddings) == len(metadata), \
    "descriptions, embeddings and metadata must have the same length"

# upsert rather than add: the ids are positional ("txn_0", "txn_1", ...) and the
# client is persistent, so re-running this cell reuses ids that already exist on
# disk. Depending on the Chroma version, add() either rejects or silently skips
# existing ids, leaving stale documents behind; upsert overwrites them instead.
# NOTE(review): records left over from an earlier, larger run are not removed here.
collection.upsert(
    documents=descriptions,
    embeddings=embeddings,
    metadatas=metadata,
    ids=[f"txn_{i}" for i in range(len(descriptions))]
)

print(f"✅ Added {len(descriptions)} transactions to Chroma DB.")

# Find Nearest Neighbors

In [None]:
# Pick one transaction (by position) to use as the example query.
QUERY_ROW = 22
query_text = df.iloc[QUERY_ROW]['description']

# Show the text that will be searched for.
query_text

In [None]:
# Embed the query text with the same model that produced the stored embeddings.
query_embedding = model.encode([query_text], convert_to_numpy=True)

# Retrieve the 5 nearest stored transactions for the query embedding.
results = collection.query(
    query_embeddings=query_embedding,
    n_results=5
)

# Result lists are nested per query; [0] selects the single query issued above.
# Chroma returns distances (smaller = closer), so the column is labelled as a
# distance rather than a score. The category is passed through str() so a
# missing/None value prints instead of raising on the width format spec.
# NOTE(review): the query text was itself indexed above, so it is expected to
# show up as the closest match.
for doc, meta, distance in zip(results["documents"][0], results["metadatas"][0], results["distances"][0]):
    print(f"→ Match: {doc:25} | Category: {str(meta['category_name']):25} | Distance: {distance:.4f}")