# 02 — Retrieval Analysis

This notebook compares the three retrieval strategies implemented in CertiRAG:

| Strategy | Description |
|----------|-------------|
| BM25-only | Sparse lexical matching |
| Dense-only | Embedding cosine similarity |
| Hybrid (RRF) | BM25 + Dense with Reciprocal Rank Fusion |

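We measure overlap, rank correlation, and qualitative retrieval on sample queries. Note that in this LITE-mode demo the dense index and the query vectors are random placeholder embeddings, so the dense and hybrid results illustrate the mechanics of the pipeline rather than real retrieval quality.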

In [None]:
import sys, os
sys.path.insert(0, os.path.abspath(".."))

import numpy as np
from collections import Counter

from certirag.config import CertiRAGConfig, ExecutionMode
from certirag.ingest.chunker import DocumentChunker
from certirag.ingest.indexer import BM25Index, DenseIndex, ChunkStore
from certirag.utils import set_all_seeds

# Fix the seed (42) through the project helper before anything stochastic runs.
# Presumably this also seeds NumPy's global RNG — TODO confirm in certirag.utils.
set_all_seeds(42)
# The whole notebook runs against a LITE-mode configuration.
cfg = CertiRAGConfig(execution_mode=ExecutionMode.LITE)

## 1. Build corpus from sample documents

In [None]:
# Toy corpus: five short landmark descriptions, keyed by the document id that
# is passed to the chunker as `doc_id`.
DOCUMENTS = {
    "eiffel": "The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris. It is named after the engineer Gustave Eiffel, whose company designed and built the tower from 1887 to 1889. The tower is 330 metres tall and the tallest structure in Paris.",
    "colosseum": "The Colosseum is an oval amphitheatre in the centre of the city of Rome. Built of travertine limestone, tuff, and brick-faced concrete, it was the largest amphitheatre ever built. Construction began under the emperor Vespasian in AD 72 and was completed in AD 80.",
    "taj_mahal": "The Taj Mahal is an Islamic ivory-white marble mausoleum on the right bank of the river Yamuna in Agra. It was commissioned in 1631 by the Mughal emperor Shah Jahan to house the tomb of his favourite wife, Mumtaz Mahal. The tomb is the centrepiece of a 17-hectare complex.",
    "great_wall": "The Great Wall of China is a series of fortifications built along the historical northern borders of China. The wall spans approximately 21,196 kilometres. Several walls were built from as early as the 7th century BC, with selective stretches later joined by Qin Shi Huang.",
    "machu_picchu": "Machu Picchu is a 15th-century Inca citadel situated on a mountain ridge above the Sacred Valley in Peru. It was built in the classical Inca style, with polished dry-stone walls. Most archaeologists believe it was constructed for the Inca emperor Pachacuti around 1450.",
}

chunker = DocumentChunker(config=cfg)
store = ChunkStore()
all_chunks = []

# Chunk each document in turn; every chunk is registered in the lookup store
# and appended to the flat, corpus-ordered list used when building the indices.
for doc_id, doc_text in DOCUMENTS.items():
    for chunk in chunker.chunk_document(doc_text, doc_id=doc_id, source="wiki"):
        store.add(chunk)
        all_chunks.append(chunk)

# Summary: total count, then one line per chunk with a 60-character preview.
print(f"Total chunks: {len(all_chunks)}")
for chunk in all_chunks:
    preview = chunk.text[:60]
    print(f"  {chunk.chunk_id} ({chunk.doc_id}): {preview}...")

## 2. Build indices

In [None]:
# BM25 index over all chunks
bm25 = BM25Index()
bm25.build(all_chunks)
print(f"BM25 index built with {bm25.corpus_size} documents")

# Dense index (using random embeddings for LITE mode demo).
# The vectors are drawn from a dedicated, locally seeded generator instead of
# NumPy's global state, so re-running this cell (or running cells out of
# order) always rebuilds exactly the same index.
dim = 64
dense = DenseIndex(dim=dim)
embedding_rng = np.random.default_rng(42)
embeddings = embedding_rng.standard_normal((len(all_chunks), dim)).astype(np.float32)
# Normalise each row to unit length for cosine similarity
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
embeddings = embeddings / norms
# Row i of `embeddings` belongs to chunk_ids[i]; both follow all_chunks order.
chunk_ids = [c.chunk_id for c in all_chunks]
dense.build(chunk_ids, embeddings)
print(f"Dense index built with dim={dim}")

## 3. Compare retrieval strategies

In [None]:
# One natural-language question per document in the sample corpus.
TEST_QUERIES = [
    "How tall is the Eiffel Tower?",
    "Who built the Colosseum?",
    "When was the Taj Mahal commissioned?",
    "How long is the Great Wall of China?",
    "Where is Machu Picchu located?",
]

# Number of results requested from each retriever per query.
TOP_K = 3

def rrf_fuse(list_a, list_b, k=60):
    """Merge two ranked result lists with Reciprocal Rank Fusion.

    Both inputs are sequences of ``(doc_id, score)`` pairs. The incoming
    scores are ignored; only positions matter. An entry at 0-based position
    ``rank`` adds ``1 / (k + rank + 1)`` to the fused score of its id, so
    earlier positions contribute more and ``k`` dampens the gap between ranks.

    Returns a list of ``(doc_id, fused_score)`` pairs sorted by fused score,
    highest first; ids with equal scores keep first-seen order.
    """
    fused = Counter()
    for ranking in (list_a, list_b):
        for rank, (doc_id, _) in enumerate(ranking):
            fused[doc_id] += 1.0 / (k + rank + 1)
    # most_common() with no argument is a stable descending sort of all items.
    return fused.most_common()

# Side-by-side table of the top-1 hit per strategy for every test query.
print(f"{'Query':<45} {'BM25 Top-1':<20} {'Dense Top-1':<20} {'RRF Top-1':<20}")
print("=" * 105)

for q in TEST_QUERIES:
    bm25_results = bm25.search(q, top_k=TOP_K)

    # For demo: use a random query embedding. The generator is seeded from the
    # query's UTF-8 bytes, so a given query always maps to the same vector no
    # matter how often or in which order cells are executed.
    q_rng = np.random.default_rng(list(q.encode("utf-8")))
    q_emb = q_rng.standard_normal(dim).astype(np.float32)
    q_emb = q_emb / np.linalg.norm(q_emb)
    dense_results = dense.search(q_emb, top_k=TOP_K)

    rrf_results = rrf_fuse(bm25_results, dense_results)

    # Each result is an (id, score) pair; show a dash when a strategy has no hits.
    bm25_top = bm25_results[0][0] if bm25_results else "—"
    dense_top = dense_results[0][0] if dense_results else "—"
    rrf_top = rrf_results[0][0] if rrf_results else "—"

    print(f"{q:<45} {bm25_top:<20} {dense_top:<20} {rrf_top:<20}")

## 4. Rank overlap analysis

Measure Jaccard overlap between BM25 and Dense top-k result sets.

In [None]:
def jaccard(set_a, set_b):
    """Return the Jaccard index |A ∩ B| / |A ∪ B| of two sets.

    Two empty sets are treated as identical and yield 1.0, which also avoids
    dividing by an empty union.
    """
    if set_a or set_b:
        shared = set_a & set_b
        combined = set_a | set_b
        return len(shared) / len(combined)
    return 1.0

# Jaccard@TOP_K between the BM25 and Dense result sets, one value per query.
overlaps = []
for q in TEST_QUERIES:
    bm25_ids = {cid for cid, _ in bm25.search(q, top_k=TOP_K)}
    # Demo query embedding, seeded from the query's UTF-8 bytes so that a given
    # query always maps to the same vector regardless of global RNG state or
    # cell execution order.
    q_rng = np.random.default_rng(list(q.encode("utf-8")))
    q_emb = q_rng.standard_normal(dim).astype(np.float32)
    q_emb = q_emb / np.linalg.norm(q_emb)
    dense_ids = {cid for cid, _ in dense.search(q_emb, top_k=TOP_K)}
    j = jaccard(bm25_ids, dense_ids)
    overlaps.append(j)
    print(f"Query: {q[:40]:<40}  Jaccard@{TOP_K}: {j:.3f}  BM25∩Dense: {bm25_ids & dense_ids}")

print(f"\nMean Jaccard@{TOP_K}: {np.mean(overlaps):.3f}")
print("\nNote: Dense uses random embeddings in LITE demo — overlap will be low.")
print("In FULL mode with real embeddings, expect higher complementarity.")

## 5. RRF score distribution

In [None]:
# Show full RRF rankings for one query
q = TEST_QUERIES[0]
# Ask both retrievers for as many results as there are chunks, so the fused
# list is built from their complete rankings rather than a top-k cut.
bm25_results = bm25.search(q, top_k=len(all_chunks))
# Demo query embedding, seeded from the query's UTF-8 bytes so that this query
# always maps to the same vector regardless of global RNG state or cell
# execution order.
q_rng = np.random.default_rng(list(q.encode("utf-8")))
q_emb = q_rng.standard_normal(dim).astype(np.float32)
q_emb = q_emb / np.linalg.norm(q_emb)
dense_results = dense.search(q_emb, top_k=len(all_chunks))
rrf_results = rrf_fuse(bm25_results, dense_results)

print(f"Query: {q}\n")
print(f"{'Rank':<6} {'Chunk ID':<25} {'RRF Score':<12} {'Doc ID':<15}")
print("-" * 58)
for rank, (chunk_id, score) in enumerate(rrf_results, 1):
    # Resolve the chunk to its parent document; print "?" if the store has no entry.
    chunk = store.get(chunk_id)
    doc_id = chunk.doc_id if chunk else "?"
    print(f"{rank:<6} {chunk_id:<25} {score:.6f}    {doc_id:<15}")