# Demo: Hybrid Search (Qdrant + BM25)

This notebook demonstrates `rag.retrieval.HybridRetriever`, which combines dense (Qdrant) and sparse (BM25) retrieval and merges the two rankings with Reciprocal Rank Fusion (RRF).

In [None]:
%load_ext autoreload
%autoreload 2

from datasets import load_dataset
from langchain_core.documents import Document

from rag.retrieval import HybridRetriever, HybridConfig

## Load Data

In [None]:
# Fetch the passage corpus, then drop rows whose passage is empty/missing
# or is the literal string "nan".
corpus = load_dataset("rag-datasets/rag-mini-bioasq", "text-corpus", split="passages")
corpus = corpus.filter(lambda row: row["passage"] and row["passage"] != "nan")

# Wrap every remaining passage in a Document, keeping the dataset id as
# "doc_id" metadata.
docs = []
for record in corpus:
    docs.append(
        Document(page_content=record["passage"], metadata={"doc_id": record["id"]})
    )
print(f"Loaded {len(docs):,} documents")

## Index & Search

In [None]:
# NOTE(review): dense_weight=0.5 presumably balances the dense and sparse
# rankings equally - confirm against HybridConfig.
config = HybridConfig(collection_name="bioasq-hybrid-demo", dense_weight=0.5)
retriever = HybridRetriever(config)

# Index the documents loaded above.
# NOTE(review): force_recreate=True presumably rebuilds an existing
# collection from scratch - confirm against HybridRetriever.index.
retriever.index(docs, force_recreate=True)

In [None]:
query = "What genes are associated with Hirschsprung disease?"

print(f"Query: {query}\n")
print("=" * 60)

# Print the top five hits, each with its doc_id and a preview made of the
# first 200 characters of its text.
top_hits = retriever.search(query, k=5)
for rank, hit in enumerate(top_hits, start=1):
    print(f"\n[{rank}] doc_id={hit.metadata['doc_id']}")
    preview = hit.page_content[:200] + "..."
    print(preview)

## Compare: Dense vs Sparse vs Hybrid

In [None]:
def get_doc_ids(results, k=5):
    """Return the ``doc_id`` metadata values of the first ``k`` results.

    Args:
        results: Sliceable sequence of objects exposing a ``metadata``
            mapping that contains a ``"doc_id"`` key.
        k: Number of leading results to keep. Defaults to 5, the cutoff
            that was previously hard-coded.

    Returns:
        A list of ``doc_id`` values in the same order as ``results``.
    """
    return [d.metadata["doc_id"] for d in results[:k]]

# Run the same query through each search mode and print the leading doc ids
# side by side; the labels are padded so the id lists line up.
for label, search_fn in (
    ("Dense (Qdrant):", retriever.search_dense),
    ("Sparse (BM25): ", retriever.search_sparse),
    ("Hybrid (RRF):  ", retriever.search),
):
    print(label, get_doc_ids(search_fn(query)))

## Evaluation (optional)

In [None]:
import ast

from rag.evaluation import evaluate_retriever

# Load the test split holding the questions and their relevant passage ids.
qrels_ds = load_dataset("rag-datasets/rag-mini-bioasq", "question-answer-passages", split="test")
queries = qrels_ds["question"]
# Each "relevant_passage_ids" entry is parsed as a Python literal and turned
# into a set. ast.literal_eval accepts only literals, so text from the
# downloaded dataset can never execute code the way eval() would.
qrels = [set(ast.literal_eval(ids)) for ids in qrels_ds["relevant_passage_ids"]]

# Wrap for evaluation
class RetrieverWrapper:
    """Expose one search mode of a retriever through ``invoke(query)``.

    ``method`` selects the mode: ``"dense"`` calls ``search_dense``,
    ``"sparse"`` calls ``search_sparse``, and any other value calls
    ``search``.
    """

    def __init__(self, retriever, method="hybrid"):
        self.method = method
        self.retriever = retriever

    def invoke(self, query):
        """Run ``query`` through the selected search mode with ``k=50``."""
        mode = self.method
        if mode == "dense":
            return self.retriever.search_dense(query, k=50)
        if mode == "sparse":
            return self.retriever.search_sparse(query, k=50)
        # Every other value falls back to the default (hybrid) search.
        return self.retriever.search(query, k=50)

# Evaluate each search mode on the same queries and relevance sets.
for method in ("dense", "sparse", "hybrid"):
    wrapper = RetrieverWrapper(retriever, method)
    metrics = evaluate_retriever(
        wrapper,
        queries,
        qrels,
        k=10,
        n_samples=100,
    )
    print(f"{method:8s}: {metrics}")