# 02 — Retrieval + MMR + Mini Eval

Demonstrates TF-IDF retrieval with MMR re-ranking, followed by a small evaluation using recall@k and MRR.

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from pathlib import Path
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# Load every .txt file under data/docs as a (file name, text) pair.
# The paths are sorted because glob yields them in arbitrary, filesystem-
# dependent order; a stable order keeps chunk indices and ranking tie-breaks
# reproducible across runs. Undecodable bytes are dropped (errors="ignore").
docs = [
    (p.name, p.read_text(encoding="utf-8", errors="ignore"))
    for p in sorted(Path("data/docs").glob("*.txt"))
]

def chunk_text(text: str, max_chars=700, overlap=80):
    """Split *text* into chunks of at most ``max_chars`` characters.

    Paragraphs (runs of text separated by blank lines) are packed greedily
    into the current chunk, joined by a blank line. A paragraph that cannot be
    added without exceeding ``max_chars`` starts a new chunk. A paragraph that
    is itself longer than ``max_chars`` is cut into windows of ``max_chars``
    characters whose consecutive windows share ``overlap`` characters; its
    final remainder becomes the start of the next chunk and may still be
    merged with the paragraphs that follow.

    Args:
        text: The document text to split.
        max_chars: Maximum length of any returned chunk; must be positive.
        overlap: Characters shared between consecutive windows of an oversized
            paragraph; must satisfy ``0 <= overlap < max_chars``.

    Returns:
        A list of chunk strings in document order (empty for blank input).

    Raises:
        ValueError: If ``max_chars`` or ``overlap`` is out of range. Without
            this check ``overlap >= max_chars`` would never shrink an
            oversized paragraph (infinite loop) and a negative ``overlap``
            would silently skip characters.
    """
    if max_chars <= 0:
        raise ValueError(f"max_chars must be positive, got {max_chars}")
    if not 0 <= overlap < max_chars:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_chars, "
            f"got overlap={overlap}, max_chars={max_chars}"
        )

    # Distance the window advances through an oversized paragraph (>= 1).
    step = max_chars - overlap

    chunks, buf = [], ""
    for part in re.split(r"\n\s*\n", text.strip()):
        part = part.strip()
        if not part:
            continue

        # +2 accounts for the "\n\n" separator placed between paragraphs.
        if len(buf) + len(part) + 2 <= max_chars:
            buf = (buf + "\n\n" + part).strip()
            continue

        # The paragraph does not fit: flush what has been accumulated so far.
        if buf:
            chunks.append(buf)
        while len(part) > max_chars:
            chunks.append(part[:max_chars])
            part = part[step:]
        buf = part

    if buf:
        chunks.append(buf)
    return chunks

# Flatten all documents into one list of chunk records. Each record keeps the
# source file name, a "<file>::chunk<n>" identifier, and the chunk text.
chunks = []
for doc_id, text in docs:
    chunks.extend(
        {"doc_id": doc_id, "chunk_id": f"{doc_id}::chunk{idx}", "text": piece}
        for idx, piece in enumerate(chunk_text(text))
    )

# Fit a TF-IDF model (unigrams + bigrams, English stop words removed) on the
# chunk texts; row r of X is the vector for chunks[r].
vec = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), min_df=1)
X = vec.fit_transform([entry["text"] for entry in chunks])


In [None]:
def mmr_select(qv, doc_vecs, lambda_=0.6, k=5):
    def cos(a,b):
        a=a/(np.linalg.norm(a)+1e-9); b=b/(np.linalg.norm(b)+1e-9)
        return float(a@b)
    sel, cand = [], list(range(doc_vecs.shape[0]))
    while cand and len(sel) < k:
        best, best_score = None, -1e18
        for i in cand:
            rel = cos(qv, doc_vecs[i])
            div = 0.0 if not sel else max(cos(doc_vecs[i], doc_vecs[j]) for j in sel)
            score = lambda_*rel - (1-lambda_)*div
            if score > best_score:
                best_score, best = score, i
        sel.append(best); cand.remove(best)
    return sel

def retrieve_mmr(q, k_candidates=12, k_final=5):
    """Retrieve chunks for query *q*: TF-IDF shortlist, then MMR re-ranking.

    The query is vectorised with the fitted ``vec``, compared against every
    row of ``X`` by cosine similarity, and the ``k_candidates`` most similar
    chunks are handed to ``mmr_select`` (lambda 0.6) to pick ``k_final``.

    Returns:
        A list of ``(doc_id, chunk_id, similarity)`` tuples in MMR selection
        order, where ``similarity`` is the chunk's cosine similarity to *q*.
    """
    query_sparse = vec.transform([q])
    sims = cosine_similarity(query_sparse, X).reshape(-1)

    # Row indices of the most similar chunks, best first.
    shortlist = sims.argsort()[::-1][:k_candidates]

    chosen = mmr_select(
        query_sparse.toarray().reshape(-1),
        X[shortlist].toarray(),
        lambda_=0.6,
        k=k_final,
    )

    results = []
    for pos in chosen:
        # mmr_select returns positions within the shortlist; map them back
        # to row indices of the full chunk list.
        row = shortlist[pos]
        chunk = chunks[row]
        results.append((chunk["doc_id"], chunk["chunk_id"], float(sims[row])))
    return results

# Example query; as the cell's last expression, the returned list of
# (doc_id, chunk_id, similarity) tuples is what a notebook would display.
retrieve_mmr("refund policy")


In [None]:
# Hand-labelled evaluation set: each entry pairs a query string ("q") with the
# file name ("doc") that is expected to appear among the retrieved results.
gold = [
    {"q": "refund window", "doc": "policy_refunds.txt"},
    {"q": "shipping time", "doc": "policy_shipping.txt"},
    {"q": "treat retrieved text", "doc": "security_notes.txt"},
]

def recall_at_k(doc_ids, expected, k):
    """Return 1.0 if *expected* is among the first *k* entries of *doc_ids*, else 0.0."""
    top_k = doc_ids[:k]
    return float(expected in top_k)

def mrr(doc_ids, expected):
    """Return the reciprocal rank of *expected* in *doc_ids*.

    Ranks are 1-based and only the first occurrence counts; the result is 0.0
    when *expected* does not appear at all.
    """
    reciprocal_ranks = (
        1.0 / rank
        for rank, doc in enumerate(doc_ids, start=1)
        if doc == expected
    )
    return next(reciprocal_ranks, 0.0)

# Run every gold query through the retriever and score the returned documents.
recs = []
mrrs = []
for row in gold:
    hits = retrieve_mmr(row["q"], k_candidates=12, k_final=5)
    # Keep only the source file name of each hit, in returned order.
    doc_ids = [hit[0] for hit in hits]
    target = row["doc"]
    recs.append(recall_at_k(doc_ids, target, 5))
    mrrs.append(mrr(doc_ids, target))

# (mean recall@5, mean reciprocal rank) over the gold set.
tuple(sum(scores) / len(scores) for scores in (recs, mrrs))
