# Embedding & Retrieval Evaluation

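This notebook benchmarks embedding models and chunk sizes for retrieval quality. Chunks come from a sample PDF processed by your project's ingestion pipeline (with a tiny built-in corpus as a fallback), and collected feedback serves as a small labeled set. The path to your project's Chroma index is configured as `STORAGE_DIR`, but the cells below rank freshly embedded chunks in memory rather than querying that index. Metrics: Precision@K, Recall@K, and MRR (mean reciprocal rank).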

In [None]:
# Standard imports
import os
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

# Project imports (uses your existing pipeline)
from app.feedback_manager import _load_feedback
from app.ingest import process_pdf
from app.embeddings import TextImageEmbedder

# Config
# `__file__` is not defined inside a Jupyter kernel, so resolving the project root
# from it raises NameError on a fresh "Run All". Fall back to the working directory
# when this code runs as a notebook rather than as an exported script.
try:
    _THIS_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # assumes the kernel's working directory is the notebook's own folder
    # (e.g. <project>/notebooks) — TODO confirm for your launch setup
    _THIS_DIR = os.getcwd()
BASE_DIR = os.path.dirname(_THIS_DIR)  # project root = parent of this file's folder

PDF_PATH = os.path.join(BASE_DIR, "samples", "vdoc_rag_test.pdf")  # replace with a real sample PDF path
STORAGE_DIR = os.path.join(BASE_DIR, "storage", "chroma_db")

# Sentence-transformers model names, plus a local path to a fine-tuned model.
MODELS_TO_TEST = [
    "all-MiniLM-L6-v2",
    "multi-qa-MiniLM-L6-cos-v1",
    "paraphrase-MiniLM-L3-v2",
    os.path.join(BASE_DIR, "models", "vdoc_feedback_tuned", "latest"),
]
CHUNK_SIZES = [200, 500, 800]  # in characters
TOP_K = 5  # number of chunks retrieved per query (the K in Precision@K / Recall@K)

print("Notebook configured. If the tuned model path does not exist, it will be skipped in runs.")

: 

In [None]:
# Build the labeled query/answer set, preferring collected feedback when there is any
feedback = _load_feedback()
print(f"Loaded {len(feedback)} feedback entries.")
if not feedback:
    # No feedback collected yet: use a tiny hand-written test set instead
    sample_queries = [
        "What is the trend in yearly sales?",
        "Who scored highest in the table?",
        "What is the event date?",
    ]
    sample_answers = ["increasing", "Charlie", "November 20, 2025"]
else:
    # Each feedback entry is indexed by 'question' and 'answer'
    sample_queries = [entry['question'] for entry in feedback]
    sample_answers = [entry['answer'] for entry in feedback]

    # Preview the ten most recent rows to show the feedback structure
    display(
        pd.DataFrame(feedback)[['timestamp','question','answer','correctness']].tail(10)
    )

In [None]:
# Helper: process the PDF into chunks (optional - heavy).
def load_chunks(pdf_path):
    """Run the project's PDF pipeline on ``pdf_path`` and return the chunk texts.

    Args:
        pdf_path: Filesystem path of the PDF to process.

    Returns:
        A list holding the ``'text'`` value of every item yielded by ``process_pdf``.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
    """
    pdf_missing = not os.path.exists(pdf_path)
    if pdf_missing:
        raise FileNotFoundError(f"PDF not found: {pdf_path}")

    print("Processing PDF into chunks (this may take a while)...")
    texts = []
    for doc in process_pdf(pdf_path):
        texts.append(doc['text'])
    return texts

# Prefer real chunks from the sample PDF. Any failure (missing file, pipeline error)
# is reported and replaced by a three-sentence corpus so the notebook still runs.
try:
    chunks = load_chunks(PDF_PATH)
    print(f"Total chunks from PDF: {len(chunks)}")
except Exception as err:
    print("Could not process PDF, falling back to feedback-derived tiny corpus:", err)
    # One sentence per fallback query, each containing that query's reference answer
    chunks = [
        "Yearly sales have been increasing steadily from 2018 to 2024, with a notable jump in 2021.",
        "Charlie achieved the highest score in the table with 98 points.",
        "The event will be held on November 20, 2025 at the downtown auditorium.",
    ]
    print(f"Using fallback chunks: {len(chunks)} items")

In [None]:
# Evaluation function (Precision@K, Recall@K, MRR)
def evaluate_model(model_name, chunks, queries, answers, chunk_size, top_k=TOP_K):
    """Score one embedding model at one chunk size with Precision@K, Recall@K and MRR.

    Each text in ``chunks`` is cut into consecutive ``chunk_size``-character pieces.
    The pieces and the queries are embedded with ``model_name`` and, for every query,
    the ``top_k`` most similar pieces are retrieved. A piece counts as relevant when
    it contains the query's reference answer as a case-insensitive substring.

    Args:
        model_name: Model name or local path passed to ``SentenceTransformer``.
        chunks: Iterable of source text strings.
        queries: Query strings, aligned by position with ``answers``.
        answers: Reference answer strings, aligned by position with ``queries``.
        chunk_size: Piece length in characters; must be positive.
        top_k: Number of pieces retrieved per query; must be positive.

    Returns:
        A dict with keys ``model``, ``chunk_size``, ``precision``, ``recall`` and
        ``mrr`` (each metric is the mean over the scored queries), or ``None`` when
        the run is skipped: the model is an absolute path that does not exist, no
        usable query/answer pair remains, or there is no text to retrieve from.

    Raises:
        ValueError: If ``chunk_size`` or ``top_k`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if top_k <= 0:
        raise ValueError(f"top_k must be positive, got {top_k}")

    print(f"\n🧠 Evaluating {model_name} (chunk size {chunk_size})")
    # Skip model if path does not exist (for tuned model)
    if os.path.isabs(model_name) and not os.path.exists(model_name):
        print(f"- Skipping (path not found): {model_name}")
        return None

    # Keep only pairs with real text on both sides. A blank answer is a substring of
    # every chunk and would mark everything relevant; a non-string answer (e.g. None)
    # would crash on .lower(). Answers are lowercased once here for matching.
    labeled_pairs = [
        (q, ans.lower())
        for q, ans in zip(queries, answers)
        if isinstance(q, str) and isinstance(ans, str) and q.strip() and ans.strip()
    ]
    if not labeled_pairs:
        print("- Skipping (no usable query/answer pairs)")
        return None

    # Split chunks by size
    split_chunks = [
        ch[i:i + chunk_size]
        for ch in chunks
        for i in range(0, len(ch), chunk_size)
    ]
    if not split_chunks:
        print("- Skipping (no chunk text to retrieve from)")
        return None

    # Load the model only after the cheap checks above have passed.
    model = SentenceTransformer(model_name)
    chunk_embeddings = model.encode(split_chunks, normalize_embeddings=True, show_progress_bar=False)

    # Lowercase the corpus once instead of once per query.
    lowered_chunks = [c.lower() for c in split_chunks]

    precision_scores, recall_scores, mrr_scores = [], [], []

    for q, ans in tqdm(labeled_pairs, desc=f"Evaluating {model_name}"):
        qvec = model.encode([q], normalize_embeddings=True)
        sims = cosine_similarity(qvec, chunk_embeddings)[0]
        # Indices of the top_k most similar pieces, best first.
        top_indices = np.argsort(sims)[::-1][:top_k]

        # Relevance of every piece for this answer; its sum is the recall denominator.
        relevance = [ans in c for c in lowered_chunks]
        hits = [relevance[i] for i in top_indices]
        num_hits = sum(hits)

        precision_scores.append(num_hits / top_k)
        # max(1, ...) keeps recall at 0 (not a division error) when nothing is relevant.
        recall_scores.append(num_hits / max(1, sum(relevance)))
        # Reciprocal rank of the first relevant piece, or 0.0 if none was retrieved.
        mrr_scores.append(
            next((1.0 / rank for rank, hit in enumerate(hits, start=1) if hit), 0.0)
        )

    return {
        "model": model_name,
        "chunk_size": chunk_size,
        "precision": float(np.mean(precision_scores)),
        "recall": float(np.mean(recall_scores)),
        "mrr": float(np.mean(mrr_scores)),
    }

In [None]:
# Benchmark every (model, chunk size) combination and tabulate the outcomes
results = []
for model_name in MODELS_TO_TEST:
    for cs in CHUNK_SIZES:
        res = evaluate_model(model_name, chunks, sample_queries, sample_answers, cs)
        if not res:
            # evaluate_model returned nothing for this combination (e.g. model skipped)
            continue
        results.append(res)

df = pd.DataFrame(results)
if df.empty:
    print("No results to show (models may have been skipped).")
else:
    display(df)

In [None]:
# Visualization: one figure per metric, one line per model, chunk size on the x-axis
if not df.empty:
    # (df column, display name, marker). The "@K" in titles and labels follows TOP_K
    # so the plots stay correct if the configured K is changed from 5.
    metric_specs = (
        ("precision", "Precision", "o"),
        ("recall", "Recall", "s"),
    )
    for column, metric_label, marker in metric_specs:
        fig, ax = plt.subplots(figsize=(8, 5))
        for m in df['model'].unique():
            subset = df[df['model'] == m]
            ax.plot(subset['chunk_size'], subset[column], marker=marker, label=f"{m} ({metric_label})")
        ax.set_title(f"{metric_label}@{TOP_K} vs Chunk Size")
        ax.set_xlabel('Chunk Size (characters)')
        ax.set_ylabel(f"{metric_label}@{TOP_K}")
        ax.legend()
        ax.grid(True)
        plt.show()

In [None]:
# Save results to CSV for reporting
output_csv = os.path.join(BASE_DIR, 'notebooks', 'embedding_benchmark_results.csv')
if not df.empty:
    # to_csv does not create missing folders, so make sure the target directory exists.
    os.makedirs(os.path.dirname(output_csv), exist_ok=True)
    df.to_csv(output_csv, index=False)
    print(f"✅ Benchmark results saved to {output_csv}")
else:
    print("No data to save.")