# Imports & Setup

In [1]:
from typing import List, Dict
from collections import defaultdict
import json
import numpy as np

from qdrant_client import QdrantClient
from qdrant_client.models import SparseVector
from sentence_transformers import SentenceTransformer

In [2]:
import sys
import os

# Add project root to path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Connect to Qdrant

In [3]:
q_client = QdrantClient(url="http://localhost:6333")
DENSE_COLLECTION = "google-io-transcripts"  # If you have dense-only collection
SPARSE_COLLECTION = "sparse_collection"      # If you have sparse-only collection
HYBRID_COLLECTION = "hybrid_collection"      # Recommended for hybrid search

# Load Models (Dense + Sparse)

#### Dense Embedding Model

In [4]:
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [5]:
def embed_query(text: str):
    return embedding_model.encode(text).tolist()

# Sparse Query Builder

In [6]:
import pickle
from retrieval.retrievers.BM25 import BM25Encoder

In [7]:
# In run_index_sparse.py, run_index_hybrid.py, run_query_hybrid.py, and notebook:

with open("../data/models/bm25_encoder.pkl", "rb") as f:
    bm25_encoder = pickle.load(f)

#### Query Encoding Functions

In [8]:
def tokenize(text: str):
    """Must match tokenization used during indexing"""
    return text.lower().split()

def bm25_query(query: str):
    """Encode query as sparse vector using BM25"""
    tokens = tokenize(query)
    return bm25_encoder.encode_query(tokens)

# Dense & Sparse Retrieval Functions

In [9]:
def dense_retrieve(query: str, k: int = 10):
    """Retrieve using dense embeddings only"""
    vector = embed_query(query)
    hits = q_client.query_points(
        collection_name=DENSE_COLLECTION,
        query=vector,
        limit=k,
    )
    return hits.points


In [10]:
def sparse_retrieve(query: str, k: int = 10):
    """Retrieve using sparse BM25 vectors only"""
    sparse_vec = bm25_query(query)
    hits = q_client.query_points(
        collection_name=SPARSE_COLLECTION,
        query=sparse_vec,
        using="text",
        limit=k,
    )
    return hits.points

In [83]:
def hybrid_retrieve_rrf(query: str, k_dense: int = 50, k_sparse: int = 50, k: int = 10):
    """
    Hybrid retrieval using Reciprocal Rank Fusion (RRF)
    Retrieves from separate dense and sparse collections, then fuses
    """
    # Get results from both retrievers
    dense_results = dense_retrieve(query, k=k_dense)
    sparse_results = sparse_retrieve(query, k=k_sparse)
    
    # Apply RRF fusion
    return rrf([dense_results, sparse_results], k_final=k)

In [86]:
from qdrant_client.models import Prefetch

In [87]:
def hybrid_retrieve(query: str, k: int = 10):
    """Native hybrid retrieval using Qdrant's built-in hybrid search"""
    dense_vector = embed_query(query)
    sparse_vector = bm25_query(query)
    
    response = q_client.query_points(
        collection_name=HYBRID_COLLECTION,
        prefetch=[
            Prefetch(
                query=sparse_vector,
                using="text",
                limit=50,
            )
        ],
        query=dense_vector,
        using="dense",
        limit=k,
    )
    
    # Convert to dictionary format
    results = []
    for point in response.points:
        results.append({
            "id": point.id,
            "score": point.score,
            "payload": point.payload
        })
    
    return results

# Update the collection name
HYBRID_COLLECTION = "hybrid_collection"

# Reciprocal Rank Fusion

In [13]:
def rrf(rankings: List, k_final: int = 10, k_rrf: int = 60):
    """
    Reciprocal Rank Fusion algorithm
    
    Args:
        rankings: List of result lists from different retrievers
        k_final: Number of final results to return
        k_rrf: RRF constant (typically 60)
    """
    scores = defaultdict(float)
    payloads = {}
    
    for ranking in rankings:
        for rank, hit in enumerate(ranking):
            # Use hit.id for Qdrant point ID
            doc_id = hit.id
            scores[doc_id] += 1.0 / (k_rrf + rank + 1)
            payloads[doc_id] = hit.payload
    
    # Sort by score and return top k_final
    fused_results = sorted(
        [
            {"id": doc_id, "score": score, "payload": payloads[doc_id]}
            for doc_id, score in scores.items()
        ],
        key=lambda x: x["score"],
        reverse=True
    )
    
    return fused_results[:k_final]

# Hybrid Retrieval

In [84]:
# # Choose Your Hybrid Retrieval Method

# Use this for RRF-based hybrid (works with separate collections)
# hybrid_retrieve = hybrid_retrieve_rrf

# OR use this for native Qdrant hybrid (requires hybrid_collection)
hybrid_retrieve = hybrid_retrieve_native

### Manual Sanity Check

In [20]:
# Check collections
collections = q_client.get_collections()

print("Available collections:")
for collection in collections.collections:
    print(f"  - {collection.name}")

Available collections:
  - google-io-transcripts
  - hybrid_collection
  - sparse_collection


In [89]:
query = "What is Gemini?"
print(f"Query: {query}\n")

results = hybrid_retrieve(query, k=5)

for i, r in enumerate(results, 1):
    print(f"{i}. Score: {r['score']:.4f}")
    print(f"   Text: {r['payload']['text'][:500]}...")
    print()

Query: What is Gemini?

1. Score: 0.4081
   Text: [MUSIC PLAYING] [CHEERING] [AUDIENCE SCREAMING] LUCIANO MARTINS: Hey, folks. Good morning. It's a pleasure to
be here with you all. I'm Luciano Martins. I'm Brazilian. [CHEERING] I'm an AI Developer
Advocate at Google DeepMind. And I'm here with
my friend Shrestha. SHRESTHA BASU
MALLICK: Thank you. Luciano Hi, everyone. I am Shreshta Basu Mallick. I'm the Product Lead for
the Gemini Developer API. And it looks like I should have
brought an Indian contingent here. [LAUGHING] Thank you. LUCIANO MA...

2. Score: 0.3980
   Text: when we think about the key
standard benchmarks, Shrestha? SHRESTHA BASU
MALLICK: Yeah, we'll go to the benchmarks
in a second. But the message
that I want everyone to take away from this
slide is whether it's a super complex task, or whether
it's on-device processing, you now have a Gemini model
that you can use for it. And it's pretty powerful. And it's pretty price effective. LUCIANO MARTINS: Yeah,
maybe it is al

# Evaluation

In [90]:
# Load ground truth
with open("../data/eval/ground_truth_gpt5nano.json") as f:
    ground_truth = json.load(f)

In [91]:
print(f"Loaded {len(ground_truth)} evaluation queries")

Loaded 10 evaluation queries


#### Evaluation metrics

In [93]:
# Debug: see what's actually in the payload
query = "What is Gemini?"
results = hybrid_retrieve(query, k=5)

print("First result payload:")
print(results[0]["payload"])
print("\nAvailable keys:")
print(results[0]["payload"].keys())

First result payload:
{'doc_id': 'gHHjDRDNUNU__chunk_000', 'text': "[MUSIC PLAYING] [CHEERING] [AUDIENCE SCREAMING] LUCIANO MARTINS: Hey, folks. Good morning. It's a pleasure to\nbe here with you all. I'm Luciano Martins. I'm Brazilian. [CHEERING] I'm an AI Developer\nAdvocate at Google DeepMind. And I'm here with\nmy friend Shrestha. SHRESTHA BASU\nMALLICK: Thank you. Luciano Hi, everyone. I am Shreshta Basu Mallick. I'm the Product Lead for\nthe Gemini Developer API. And it looks like I should have\nbrought an Indian contingent here. [LAUGHING] Thank you. LUCIANO MARTINS: Yay! OK, so the idea of\nthis conversation is we want to share with you\nsome of the new things you have available to develop your\nsolutions using Gemini models and the Gemini API. How many developers\nyou have here? Amazing. OK, so we can start talking\nabout the Gemini models universe. We started Gemini\nby the end of 2023. And since then, we\nhave done a lot of work together between many different\nGoogle DeepMi

In [61]:
# Debug the actual structure
query = "What is Gemini?"
results = hybrid_retrieve(query, k_final=5)

print(f"Type of results: {type(results)}")
print(f"Number of results: {len(results)}")
print(f"\nFirst result type: {type(results[0])}")
print(f"First result: {results[0]}")
print(f"\nFirst result keys: {results[0].keys() if isinstance(results[0], dict) else 'Not a dict'}")

Type of results: <class 'list'>
Number of results: 5

First result type: <class 'dict'>
First result: {'id': 42, 'score': 0.01639344262295082, 'payload': {'doc_id': 'gHHjDRDNUNU__chunk_022', 'video_id': 'gHHjDRDNUNU', 'title': 'Google I/O 2025 - Keynote', 'timestamp_start': 2746.57, 'timestamp_end': 2767.29, 'text': 'more of the other features and\npossibilities with the models. And the last one is the Gemini\nCookbook, a GitHub repository that our team curates and keep\nupdating with a lot of sample experience for you. Thank you so much, Shrestha. Thank you so much you all. SHRESTHA BASU MALLICK: Thank you\nall for coming out to hear us. [CHEERING] [MUSIC PLAYING]', 'source': 'youtube', 'speaker': 'unknown'}}

First result keys: dict_keys(['id', 'score', 'payload'])


In [62]:
# Check the payload structure
print(f"Payload keys: {results[0]['payload'].keys()}")
print(f"Full payload: {results[0]['payload']}")

Payload keys: dict_keys(['doc_id', 'video_id', 'title', 'timestamp_start', 'timestamp_end', 'text', 'source', 'speaker'])
Full payload: {'doc_id': 'gHHjDRDNUNU__chunk_022', 'video_id': 'gHHjDRDNUNU', 'title': 'Google I/O 2025 - Keynote', 'timestamp_start': 2746.57, 'timestamp_end': 2767.29, 'text': 'more of the other features and\npossibilities with the models. And the last one is the Gemini\nCookbook, a GitHub repository that our team curates and keep\nupdating with a lot of sample experience for you. Thank you so much, Shrestha. Thank you so much you all. SHRESTHA BASU MALLICK: Thank you\nall for coming out to hear us. [CHEERING] [MUSIC PLAYING]', 'source': 'youtube', 'speaker': 'unknown'}


In [63]:
def get_original_id(result):
    """Helper to get document ID"""
    return result["payload"]["doc_id"]

In [64]:
def recall_at_k(results: List[Dict], relevant_ids: List[str], k: int) -> int:
    """Binary recall: 1 if any relevant doc in top-k, else 0"""
    retrieved = [r["payload"]["doc_id"] for r in results[:k]]
    return int(any(rid in retrieved for rid in relevant_ids))

In [65]:
def mrr(results: List[Dict], relevant_ids: List[str]) -> float:
    """Mean Reciprocal Rank: 1/rank of first relevant doc"""
    for i, r in enumerate(results, start=1):
        if r["payload"]["doc_id"] in relevant_ids:
            return 1.0 / i
    return 0.0

In [66]:
def precision_at_k(results: List[Dict], relevant_ids: List[str], k: int) -> float:
    """Proportion of retrieved docs that are relevant"""
    retrieved = [r["payload"]["doc_id"] for r in results[:k]]
    relevant_retrieved = sum(1 for rid in retrieved if rid in relevant_ids)
    return relevant_retrieved / k if k > 0 else 0.0

# Run evaluation

In [75]:
# # Debug: Check what's actually being returned
# query = ground_truth[0]["query"]
# print(f"Test query: {query}\n")

# results = hybrid_retrieve(query, k_final=5)

# print(f"Number of results: {len(results)}")
# print(f"\nFirst result:")
# print(f"  Type: {type(results[0])}")
# print(f"  Keys: {results[0].keys() if isinstance(results[0], dict) else 'Not a dict'}")
# print(f"  Full result: {results[0]}")

# if isinstance(results[0], dict) and 'payload' in results[0]:
#     print(f"\nPayload keys: {results[0]['payload'].keys()}")
#     print(f"Full payload: {results[0]['payload']}")

Test query: What were the main topics and components discussed in Google's AI Stack for Developers session at Google I/O, including foundation models and the developer frameworks mentioned?

Number of results: 5

First result:
  Type: <class 'dict'>
  Keys: dict_keys(['id', 'score', 'payload'])
  Full result: {'id': 0, 'score': 0.01639344262295082, 'payload': {'doc_id': '4TE-KFXvhAk__chunk_000', 'video_id': '4TE-KFXvhAk', 'title': "Google I/O 2025 - What's New in AI", 'timestamp_start': 0.0, 'timestamp_end': 139.04, 'text': "[MUSIC PLAYING] JOANA CARRASQUEIRA:\nHello, everyone. My name is Joana Carrasqueira,\nand I lead Developer Relations at Google DeepMind. JOSH GORDON: Hi, everyone. I'm Josh. JOANA CARRASQUEIRA:\nAnd we're very excited to welcome you to\nour session, Google's AI Stack for Developers. We'll start by giving you a\nquick overview of Google's AI stack. Who's at I/O for the first time? Can I see some hands up? Oh, OK. Welcome to Google\nI/O. It's a pleasure to have you w

In [94]:
# In your notebook, run the evaluation
recalls_5 = []
recalls_10 = []
mrrs = []
precisions_5 = []

for idx, item in enumerate(ground_truth):
    query = item["query"]
    relevant_ids = item["relevant_doc_ids"]
    
    # Retrieve
    results = hybrid_retrieve(query, k=5)
    
    # Skip if no results
    if not results:
        continue
    
    # Calculate metrics
    try:
        retrieved_5 = [r["payload"]["doc_id"] for r in results[:5]]
        retrieved_10 = [r["payload"]["doc_id"] for r in results[:10]]
        
        recalls_5.append(int(any(rid in retrieved_5 for rid in relevant_ids)))
        recalls_10.append(int(any(rid in retrieved_10 for rid in relevant_ids)))
        
        # MRR
        mrr_score = 0.0
        for i, r in enumerate(results, start=1):
            if r["payload"]["doc_id"] in relevant_ids:
                mrr_score = 1.0 / i
                break
        mrrs.append(mrr_score)
        
        # Precision@5
        relevant_count = sum(1 for rid in retrieved_5 if rid in relevant_ids)
        precisions_5.append(relevant_count / 5.0)
        
    except (KeyError, TypeError) as e:
        print(f"Error on query {idx}: {e}")
        continue

print(f"\n{'='*50}")
print("EVALUATION RESULTS")
print(f"{'='*50}")
print(f"Queries evaluated: {len(recalls_5)}/{len(ground_truth)}")
print(f"Recall@5:      {np.mean(recalls_5):.4f}")
print(f"Recall@10:     {np.mean(recalls_10):.4f}")
print(f"MRR:           {np.mean(mrrs):.4f}")
print(f"Precision@5:   {np.mean(precisions_5):.4f}")
print(f"{'='*50}")


EVALUATION RESULTS
Queries evaluated: 10/10
Recall@5:      0.6000
Recall@10:     0.6000
MRR:           0.3667
Precision@5:   0.1200


#### Optional: Compare Dense vs Sparse vs Hybrid

In [95]:
print("\n" + "="*50)
print("ABLATION STUDY")
print("="*50)


ABLATION STUDY


##### Dense only

In [99]:
dense_recalls = []
dense_mrrs = []
for item in ground_truth:
    results_dense = dense_retrieve(item["query"], k=5)
    # Convert to same format as RRF results
    results_formatted = [
        {"payload": hit.payload, "score": hit.score}
        for hit in results_dense
    ]
    dense_recalls.append(recall_at_k(results_formatted, item["relevant_doc_ids"], k=5))
    dense_mrrs.append(mrr(results_formatted, item["relevant_doc_ids"]))

print(f"\nDense Only:")
print(f"  Recall@5: {np.mean(dense_recalls):.4f}")
print(f"  MRR:      {np.mean(dense_mrrs):.4f}")


Dense Only:
  Recall@5: 0.8000
  MRR:      0.6250


##### Sparse only

In [100]:
sparse_recalls = []
sparse_mrrs = []
for item in ground_truth:
    results_sparse = sparse_retrieve(item["query"], k=5)
    results_formatted = [
        {"payload": hit.payload, "score": hit.score}
        for hit in results_sparse
    ]
    sparse_recalls.append(recall_at_k(results_formatted, item["relevant_doc_ids"], k=5))
    sparse_mrrs.append(mrr(results_formatted, item["relevant_doc_ids"]))

KeyError: 'doc_id'

In [98]:
print(f"\nSparse Only:")
print(f"  Recall@5: {np.mean(sparse_recalls):.4f}")
print(f"  MRR:      {np.mean(sparse_mrrs):.4f}")

print(f"\nHybrid (RRF):")
print(f"  Recall@5: {np.mean(recalls_5):.4f}")
print(f"  MRR:      {np.mean(mrrs):.4f}")


Sparse Only:
  Recall@5: nan
  MRR:      nan

Hybrid (RRF):
  Recall@5: 0.6000
  MRR:      0.3667


# Some checks

In [101]:
# Compare collection sizes
collections = q_client.get_collections()
for coll in collections.collections:
    info = q_client.get_collection(coll.name)
    print(f"{coll.name}: {info.points_count} documents")

hybrid_collection: 99 documents
google-io-transcripts: 99 documents
sparse_collection: 99 documents


In [102]:
# Dense-only retrieval using the hybrid_collection
def dense_only_retrieve(query: str, k: int = 10):
    """Retrieve using ONLY dense vectors from hybrid_collection"""
    dense_vector = embed_query(query)
    
    response = q_client.query_points(
        collection_name="hybrid_collection",  # Same collection as hybrid!
        query=dense_vector,
        using="dense",
        limit=k,
    )
    
    results = []
    for point in response.points:
        results.append({
            "id": point.id,
            "score": point.score,
            "payload": point.payload
        })
    
    return results

# Evaluate dense-only on hybrid_collection
dense_recalls = []
dense_mrrs = []

for item in ground_truth:
    results = dense_only_retrieve(item["query"], k=10)
    
    if not results:
        continue
    
    retrieved_5 = [r["payload"]["doc_id"] for r in results[:5]]
    
    dense_recalls.append(int(any(rid in retrieved_5 for rid in item["relevant_doc_ids"])))
    
    mrr_score = 0.0
    for i, r in enumerate(results, start=1):
        if r["payload"]["doc_id"] in item["relevant_doc_ids"]:
            mrr_score = 1.0 / i
            break
    dense_mrrs.append(mrr_score)

print("\n" + "="*60)
print("FAIR COMPARISON (same collection, same data)")
print("="*60)
print(f"\nDense Only (hybrid_collection):")
print(f"  Recall@5: {np.mean(dense_recalls):.4f}")
print(f"  MRR:      {np.mean(dense_mrrs):.4f}")

print(f"\nHybrid (sparse prefetch → dense rerank):")
print(f"  Recall@5: 0.6000")
print(f"  MRR:      0.3667")

print(f"\nDifference:")
print(f"  Recall@5: {np.mean(dense_recalls) - 0.6000:+.4f}")
print(f"  MRR:      {np.mean(dense_mrrs) - 0.3667:+.4f}")
print("="*60)


FAIR COMPARISON (same collection, same data)

Dense Only (hybrid_collection):
  Recall@5: 0.8000
  MRR:      0.6393

Hybrid (sparse prefetch → dense rerank):
  Recall@5: 0.6000
  MRR:      0.3667

Difference:
  Recall@5: +0.2000
  MRR:      +0.2726


#### Comment:

Excellent! Now we have a clear answer: The hybrid search is hurting performance on this dataset. Dense-only is significantly better (80% vs 60% recall).
This is happening because the sparse prefetch is filtering out relevant documents before the dense reranking can find them.
Let's diagnose and fix this:

In [106]:
# Analyze where hybrid fails vs dense succeeds
query = ground_truth[0]["query"]
relevant_ids = ground_truth[0]["relevant_doc_ids"]

print(f"Query: {query[:100]}...\n")
print(f"Relevant doc IDs: {relevant_ids}\n")

# Dense-only results
dense_results = dense_only_retrieve(query, k=10)
print("=== DENSE-ONLY TOP 10 ===")
for i, r in enumerate(dense_results[:10], 1):
    is_relevant = "✓" if r["payload"]["doc_id"] in relevant_ids else "✗"
    print(f"{i}. {is_relevant} {r['payload']['doc_id']} (score: {r['score']:.4f})")

# Sparse-only results from hybrid_collection
sparse_vector = bm25_query(query)
sparse_results = q_client.query_points(
    collection_name="hybrid_collection",  # Use hybrid_collection
    query=sparse_vector,
    using="text",
    limit=50,
)
print("\n=== SPARSE TOP 50 ===")
relevant_in_sparse = []
for i, hit in enumerate(sparse_results.points[:50], 1):
    is_relevant = "✓" if hit.payload["doc_id"] in relevant_ids else "✗"
    if is_relevant == "✓":
        relevant_in_sparse.append(i)
        print(f"{i}. {is_relevant} {hit.payload['doc_id']} (score: {hit.score:.4f})")

if not relevant_in_sparse:
    print("❌ NO RELEVANT DOCS IN SPARSE TOP-50!")
else:
    print(f"\n✓ Relevant docs found at positions: {relevant_in_sparse}")

# Hybrid results
hybrid_results = hybrid_retrieve(query, k=10)
print("\n=== HYBRID TOP 10 ===")
for i, r in enumerate(hybrid_results[:10], 1):
    is_relevant = "✓" if r["payload"]["doc_id"] in relevant_ids else "✗"
    print(f"{i}. {is_relevant} {r['payload']['doc_id']} (score: {r['score']:.4f})")

Query: What were the main topics and components discussed in Google's AI Stack for Developers session at Go...

Relevant doc IDs: ['4TE-KFXvhAk__chunk_000']

=== DENSE-ONLY TOP 10 ===
1. ✓ 4TE-KFXvhAk__chunk_000 (score: 0.6397)
2. ✗ 4TE-KFXvhAk__chunk_017 (score: 0.5994)
3. ✗ 4TE-KFXvhAk__chunk_019 (score: 0.5812)
4. ✗ 4TE-KFXvhAk__chunk_001 (score: 0.5470)
5. ✗ gHHjDRDNUNU__chunk_007 (score: 0.5117)
6. ✗ 4TE-KFXvhAk__chunk_006 (score: 0.4782)
7. ✗ 4TE-KFXvhAk__chunk_003 (score: 0.4651)
8. ✗ Uh-7YX8tkxI__chunk_000 (score: 0.4489)
9. ✗ 4TE-KFXvhAk__chunk_007 (score: 0.4397)
10. ✗ o7Bv4r08FBM__chunk_000 (score: 0.4363)

=== SPARSE TOP 50 ===
❌ NO RELEVANT DOCS IN SPARSE TOP-50!

=== HYBRID TOP 10 ===
1. ✗ 4TE-KFXvhAk__chunk_017 (score: 0.5994)
2. ✗ 4TE-KFXvhAk__chunk_006 (score: 0.4782)
3. ✗ 4TE-KFXvhAk__chunk_003 (score: 0.4651)
4. ✗ Uh-7YX8tkxI__chunk_000 (score: 0.4489)
5. ✗ 4TE-KFXvhAk__chunk_007 (score: 0.4397)
6. ✗ gHHjDRDNUNU__chunk_017 (score: 0.4155)
7. ✗ gHHjDRDNUNU__chunk_006 