In [1]:
"""
Load environment variables and validate required secrets.

"""

import os
from dotenv import load_dotenv
 
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

if not OPENAI_API_KEY or not OPENAI_API_KEY.strip():
    raise EnvironmentError(
        "OPENAI_API_KEY not found. Please set it in the .env file."
    )

print("Environment variables loaded successfully.")


Environment variables loaded successfully.


In [2]:
"""
Load and validate project configuration from parameters.toml.

"""

from pathlib import Path
import tomllib

CONFIG_PATH = Path("parameters.toml")

if not CONFIG_PATH.exists():
    raise FileNotFoundError(
        "parameters.toml not found. Please ensure it exists at project root."
    )

with open(CONFIG_PATH, "rb") as f:
    config = tomllib.load(f)

print("Configuration loaded successfully.")
config


Configuration loaded successfully.


{'llm': {'provider': 'openai', 'chat_model': 'gpt-4o-mini'},
 'rag': {'corpus_path': '../data/AI Engineering.pdf',
  'chunk_size': 512,
  'chunk_overlap': 64,
  'top_k': 5},
 'vector_store': {'type': 'qdrant',
  'mode': 'local',
  'url': 'http://localhost:6333',
  'collection_name': 'knowledge_base_chunks',
  'storage_path': './qdrant_storage'},
 'dense_vector': {'name': 'dense',
  'model': 'sentence-transformers/all-MiniLM-L6-v2',
  'distance': 'cosine'},
 'sparse_vector': {'enabled': True, 'name': 'sparse', 'model': 'Qdrant/bm25'}}

In [3]:
import fitz  # PyMuPDF
from pathlib import Path
from typing import List, Dict


class PDFDocumentLoader:
    """
    Responsible for:
    1. Opening a PDF safely
    2. Counting pages
    3. Extracting page-wise text with metadata
    """

    def __init__(self, pdf_path: str):
        self.pdf_path = Path(pdf_path)

        if not self.pdf_path.exists():
            raise FileNotFoundError(f"PDF not found at path: {self.pdf_path}")

    def get_total_pages(self) -> int:
        """Return total number of pages in the PDF"""
        with fitz.open(self.pdf_path) as pdf_doc:
            return pdf_doc.page_count

    def extract_documents(self) -> List[Dict]:
        """
        Extract text page-wise and return documents list
        Each item is RAG-ready (text + metadata)
        """
        documents = []

        with fitz.open(self.pdf_path) as pdf_doc:
            total_pages = pdf_doc.page_count

            for page_index in range(total_pages):
                page = pdf_doc.load_page(page_index)
                text = page.get_text("text")

                # Skip empty / non-text pages
                if not text or not text.strip():
                    continue

                documents.append(
                    {
                        "text": text.strip(),
                        "metadata": {
                            "source": self.pdf_path.name,
                            "page": page_index + 1,
                            "char_count": len(text)
                        }
                    }
                )

        return documents


In [4]:
loader = PDFDocumentLoader(config["rag"]["corpus_path"])

total_pages = loader.get_total_pages()
print(f"Total pages in PDF: {total_pages}")

documents = loader.extract_documents()
print(f"Extracted text from {len(documents)} pages")


Total pages in PDF: 535
Extracted text from 529 pages


In [5]:
from typing import List, Dict


class DocumentInspector:
    """
    Utility class to inspect extracted documents
    before chunking and embedding.
    """

    def __init__(self, documents: List[Dict]):
        if not documents:
            raise ValueError("No documents provided for inspection.")
        self.documents = documents

    def preview(
        self,
        sample_size: int = 3,
        max_chars: int = 500
    ) -> None:
        """
        Print a small sample of extracted text with metadata.

        :param sample_size: Number of documents to inspect
        :param max_chars: Max characters to print per document
        """
        print(f"\nInspecting {min(sample_size, len(self.documents))} document(s):\n")

        for idx, doc in enumerate(self.documents[:sample_size], start=1):
            text_preview = doc["text"][:max_chars]

            print("=" * 80)
            print(f"Sample #{idx}")
            print(f"Source : {doc['metadata'].get('source')}")
            print(f"Page   : {doc['metadata'].get('page')}")
            print(f"Chars  : {doc['metadata'].get('char_count')}")
            print("-" * 80)
            print(text_preview)
            print("..." if len(doc["text"]) > max_chars else "")
            print("=" * 80)


In [6]:
inspector = DocumentInspector(documents)
inspector.preview(sample_size=2, max_chars=400)



Inspecting 2 document(s):

Sample #1
Source : AI Engineering.pdf
Page   : 1
Chars  : 73
--------------------------------------------------------------------------------
Chip Huyen
 AI Engineering
Building Applications 
with Foundation Models

Sample #2
Source : AI Engineering.pdf
Page   : 2
Chars  : 2340
--------------------------------------------------------------------------------
9
7 8 1 0 9 8 1 6 6 3 0 4
5 7 9 9 9
ISBN:   978-1-098-16630-4
US $79.99	   CAN $99.99
DATA
Foundation models have enabled many new AI use cases while lowering the barriers to entry for 
building AI products. This has transformed AI from an esoteric discipline into a powerful development 
tool that anyone can use—including those with no prior AI experience.
In this accessible guide, author Chip Huy
...


In [7]:
import re
from typing import List, Dict


class TextNormalizer:
    """
    Normalize raw PDF text to improve downstream
    chunking and retrieval quality.
    """

    _MULTIPLE_NEWLINES = re.compile(r"\n{3,}")
    _MULTIPLE_SPACES = re.compile(r"[ \t]{2,}")
    _SPACE_BEFORE_NEWLINE = re.compile(r"[ \t]+\n")
    _LINE_WRAP = re.compile(r"(?<!\n)\n(?!\n)")

    def normalize_text(self, text: str) -> str:
        if not text:
            return text

        # Normalize line endings
        text = text.replace("\r\n", "\n").replace("\r", "\n")

        # Remove trailing spaces before newlines
        text = self._SPACE_BEFORE_NEWLINE.sub("\n", text)

        # Remove line-wrapped newlines (inside paragraphs)
        text = self._LINE_WRAP.sub(" ", text)

        # Collapse excessive newlines (keep paragraphs)
        text = self._MULTIPLE_NEWLINES.sub("\n\n", text)

        # Collapse repeated spaces
        text = self._MULTIPLE_SPACES.sub(" ", text)

        return text.strip()

    def normalize_documents(self, documents: List[Dict]) -> List[Dict]:
        return [
            {
                "text": self.normalize_text(doc["text"]),
                "metadata": doc["metadata"]
            }
            for doc in documents
        ]


In [8]:
normalizer = TextNormalizer()
normalized_documents = normalizer.normalize_documents(documents)
normalized_documents[1]

{'text': '9 7 8 1 0 9 8 1 6 6 3 0 4 5 7 9 9 9 ISBN: 978-1-098-16630-4 US $79.99 CAN $99.99 DATA Foundation models have enabled many new AI use cases while lowering the barriers to entry for building AI products. This has transformed AI from an esoteric discipline into a powerful development tool that anyone can use—including those with no prior AI experience. In this accessible guide, author Chip Huyen discusses AI engineering: the process of building applications with readily available foundation models. AI application developers will discover how to navigate the AI landscape, including models, datasets, evaluation benchmarks, and the seemingly infinite number of application patterns. The book also introduces a practical framework for developing an AI application and efficiently deploying it. • Understand what AI engineering is and how it differs from traditional machine learning engineering • Learn the process for developing an AI application, the challenges at each step, and approac

In [9]:
class DocumentFilter:
    """
    Filters out low-value or noisy pages
    before chunking and embedding.
    """

    def __init__(self, min_char_count: int = 200):
        self.min_char_count = min_char_count

    def is_useful(self, doc: dict) -> bool:
        text = doc["text"].lower()

        # Filter very small pages
        if doc["metadata"]["char_count"] < self.min_char_count:
            return False

        # Filter common front-matter patterns
        noise_markers = [
            "isbn",
            "copyright",
            "all rights reserved",
            "table of contents",
            "price",
            "publisher"
        ]

        if any(marker in text for marker in noise_markers):
            return False

        return True

    def filter_documents(self, documents: list) -> list:
        return [doc for doc in documents if self.is_useful(doc)]


In [10]:
filterer = DocumentFilter(min_char_count=200)
clean_documents = filterer.filter_documents(normalized_documents)

print(f"Before filtering: {len(normalized_documents)}")
print(f"After filtering : {len(clean_documents)}")


Before filtering: 529
After filtering : 484


In [11]:
clean_documents[1]

{'text': 'This is the definitive segue into AI engineering from one of the greats of ML engineering! Chip has seen through successful projects and careers at every stage of a company and for the first time ever condensed her expertise for new AI Engineers entering the field. —swyx, Curator, AI.Engineer AI Engineering is a practical guide that provides the most up-to-date information on AI development, making it approachable for novice and expert leaders alike. This book is an essential resource for anyone looking to build robust and scalable AI systems. —Vicki Reyzelman, Chief AI Solutions Architect, Mave Sparks AI Engineering is a comprehensive guide that serves as an essential reference for both understanding and implementing AI systems in practice. —Han Lee, Director—Data Science, Moody’s AI Engineering is an essential guide for anyone building software with Generative AI! It demystifies the technology, highlights the importance of evaluation, and shares what should be done to achie

In [12]:
from typing import List, Dict
import math


class TextChunker:
    """
    Chunk normalized documents into token-aware chunks
    suitable for dense + sparse retrieval.
    """

    def __init__(
        self,
        chunk_size: int = 300,
        overlap: int = 50
    ):
        if overlap >= chunk_size:
            raise ValueError("overlap must be smaller than chunk_size")

        self.chunk_size = chunk_size
        self.overlap = overlap

    def _estimate_tokens(self, text: str) -> int:
        """
        Approximate token count.
        (Roughly 4 chars per token for English)
        """
        return math.ceil(len(text) / 4)

    def _split_into_chunks(self, text: str) -> List[str]:
        """
        Split text into overlapping chunks.
        """
        words = text.split()
        chunks = []

        start = 0
        while start < len(words):
            end = start + self.chunk_size
            chunk_words = words[start:end]
            chunks.append(" ".join(chunk_words))

            start = end - self.overlap
            if start < 0:
                start = 0

        return chunks

    def chunk_documents(self, documents: List[Dict]) -> List[Dict]:
        """
        Convert documents into chunks while preserving metadata.
        """
        chunked_docs = []

        for doc in documents:
            text = doc["text"]
            base_metadata = doc["metadata"]

            chunks = self._split_into_chunks(text)

            for idx, chunk_text in enumerate(chunks):
                chunked_docs.append(
                    {
                        "text": chunk_text,
                        "metadata": {
                            **base_metadata,
                            "chunk_id": f"{base_metadata['source']}_p{base_metadata['page']}_c{idx}",
                            "chunk_index": idx,
                            "chunk_char_count": len(chunk_text),
                            "chunk_token_estimate": self._estimate_tokens(chunk_text)
                        }
                    }
                )

        return chunked_docs


In [13]:
chunker = TextChunker(
    chunk_size=300,   # tokens (approx)
    overlap=50
)

chunks = chunker.chunk_documents(clean_documents)

print(f"Total chunks created: {len(chunks)}")
print(chunks[10])


Total chunks created: 855
{'text': '• Various neural network architectures, including feedforward, recurrent, and transformer. • Metrics such as accuracy, F1, precision, recall, cosine similarity, and cross entropy. If you don’t know them yet, don’t worry—this book has either brief, high-level explanations or pointers to resources that can get you up to speed. Who This Book Is For This book is for anyone who wants to leverage foundation models to solve real-world problems. This is a technical book, so the language of this book is geared toward technical roles, including AI engineers, ML engineers, data scientists, engineering managers, and technical product managers. This book is for you if you can relate to one of the following scenarios: • You’re building or optimizing an AI application, whether you’re starting from scratch or looking to move beyond the demo phase into a production-ready stage. You may also be facing issues like hallucinations, security, latency, or costs, and need t

In [15]:
from qdrant_client import QdrantClient

client = QdrantClient(url="http://localhost:6333")
print(client.get_collections())


collections=[CollectionDescription(name='knowledge_base_chunks')]


In [16]:
import tomllib
from pathlib import Path
from typing import Dict


class ConfigLoader:
    def __init__(self, path: str):
        self.path = Path(path)

    def load(self) -> Dict:
        with self.path.open("rb") as f:
            return tomllib.load(f)


In [17]:
from qdrant_client import models
print(models.Document)


<class 'qdrant_client.http.models.models.Document'>


In [18]:
import tomllib
from pathlib import Path
from typing import Dict


class ConfigLoader:
    def __init__(self, path: str):
        self.path = Path(path)

    def load(self) -> Dict:
        with self.path.open("rb") as f:
            return tomllib.load(f)


In [19]:
from qdrant_client import QdrantClient, models


class QdrantHybridCollectionManager:
    """
    Manages lifecycle of a hybrid (dense + sparse) Qdrant collection.
    Responsible ONLY for schema, not data ingestion.
    """

    def __init__(self, client: QdrantClient, config: dict):
        self.client = client

        self.collection_name = config["vector_store"]["collection_name"]

        self.dense_cfg = config["dense_vector"]
        self.sparse_cfg = config["sparse_vector"]

    def recreate_collection(self) -> None:
        """
        Delete existing collection (if any) and create a fresh hybrid collection.
        """

        if self.client.collection_exists(self.collection_name):
            self.client.delete_collection(self.collection_name)

        self.client.create_collection(
            collection_name=self.collection_name,
            vectors_config={
                self.dense_cfg["name"]: models.VectorParams(
                    size=self._dense_vector_size(),
                    distance=models.Distance[
                        self.dense_cfg["distance"].upper()
                    ],
                )
            },
            sparse_vectors_config={
                self.sparse_cfg["name"]: models.SparseVectorParams()
            },
        )

    def _dense_vector_size(self) -> int:
        """
        Resolve dense vector size using FastEmbed.
        """
        return self.client.get_embedding_size(
            self.dense_cfg["model"]
        )


In [20]:
from qdrant_client import QdrantClient

# Load config
config = ConfigLoader("parameters.toml").load()

# Init Qdrant client
client = QdrantClient(
    url=config["vector_store"]["url"]
)

# Manage collection
collection_manager = QdrantHybridCollectionManager(
    client=client,
    config=config
)

collection_manager.recreate_collection()

print("Hybrid collection created (dense + sparse)")


Hybrid collection created (dense + sparse)


In [21]:
from typing import List, Dict
from qdrant_client import models


class HybridDocumentBuilder:
    """
    Converts text chunks into FastEmbed-backed Qdrant documents
    (dense + sparse).
    """

    def __init__(self, dense_cfg: dict, sparse_cfg: dict):
        self.dense_name = dense_cfg["name"]
        self.dense_model = dense_cfg["model"]

        self.sparse_name = sparse_cfg["name"]
        self.sparse_model = sparse_cfg["model"]

    def build(self, chunks: List[Dict]):
        """
        Returns:
            documents: list of {vector_name: models.Document}
            payloads:  list of metadata dicts
        """
        documents = []
        payloads = []

        for chunk in chunks:
            text = chunk["text"]

            documents.append(
                {
                    self.dense_name: models.Document(
                        text=text,
                        model=self.dense_model,
                    ),
                    self.sparse_name: models.Document(
                        text=text,
                        model=self.sparse_model,
                    ),
                }
            )

            payloads.append(
                {
                    **chunk["metadata"],
                    "text": text,
                }
            )

        return documents, payloads


In [22]:
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct
import uuid


class QdrantIngestor:
    """
    Handles ingestion of hybrid documents into Qdrant.
    """

    def __init__(self, client: QdrantClient, collection_name: str):
        self.client = client
        self.collection = collection_name

    def ingest(self, documents: list, payloads: list) -> None:
        if len(documents) != len(payloads):
            raise ValueError("Documents and payloads length mismatch")

        points = []

        for i in range(len(documents)):
            vector = documents[i]

            # validation
            if not isinstance(vector, dict):
                raise ValueError(
                    f"Expected hybrid vector dict, got {type(vector)} at index {i}"
                )

            if "dense" not in vector or "sparse" not in vector:
                raise ValueError(
                    f"Hybrid vector must contain 'dense' and 'sparse' keys at index {i}"
                )

            points.append(
                PointStruct(
                    id=str(uuid.uuid4()),
                    vector={
                        "dense": vector["dense"],
                        "sparse": vector["sparse"],
                    },
                    payload=payloads[i],
                )
            )

        # SAFE, explicit ingestion
        self.client.upsert(
            collection_name=self.collection,
            points=points,
        )


In [23]:
from qdrant_client import QdrantClient

# Load config
config = ConfigLoader("parameters.toml").load()

client = QdrantClient(
    url=config["vector_store"]["url"]
)

# Build hybrid documents
builder = HybridDocumentBuilder(
    dense_cfg=config["dense_vector"],
    sparse_cfg=config["sparse_vector"],
)

documents, payloads = builder.build(chunks)

# Ingest into Qdrant
ingestor = QdrantIngestor(
    client=client,
    collection_name=config["vector_store"]["collection_name"],
)

ingestor.ingest(documents, payloads)

print("Ingestion completed (dense + sparse)")


ResponseHandlingException: timed out

In [None]:
from qdrant_client import models


class HybridQueryBuilder:
    """
    Builds dense + sparse query objects for FastEmbed-based hybrid search.
    """

    def __init__(self, dense_cfg: dict, sparse_cfg: dict):
        self.dense_name = dense_cfg["name"]
        self.dense_model = dense_cfg["model"]

        self.sparse_name = sparse_cfg["name"]
        self.sparse_model = sparse_cfg["model"]

    def build_prefetch(self, query: str, limit: int):
        """
        Builds prefetch queries for dense + sparse retrieval.
        """
        return [
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model=self.dense_model,
                ),
                using=self.dense_name,
                limit=limit,
            ),
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model=self.sparse_model,
                ),
                using=self.sparse_name,
                limit=limit,
            ),
        ]


In [None]:
from qdrant_client import QdrantClient, models
from typing import List, Dict


class HybridSearcher:
    """
    Executes hybrid search (Dense + Sparse + RRF) against Qdrant.
    """

    def __init__(
        self,
        client: QdrantClient,
        collection_name: str,
        query_builder: HybridQueryBuilder,
    ):
        self.client = client
        self.collection = collection_name
        self.query_builder = query_builder

    def search(
        self,
        query: str,
        top_k: int = 5,
        prefetch_k: int = 20,
    ) -> List[Dict]:
        """
        Perform hybrid search using Reciprocal Rank Fusion (RRF).
        """

        prefetch = self.query_builder.build_prefetch(
            query=query,
            limit=prefetch_k,
        )

        result = self.client.query_points(
            collection_name=self.collection,
            query=models.FusionQuery(
                fusion=models.Fusion.RRF
            ),
            prefetch=prefetch,
            limit=top_k,
            with_payload=True,
        )

        return [point.payload for point in result.points]


In [None]:
from qdrant_client import QdrantClient

# Load config
config = ConfigLoader("parameters.toml").load()

client = QdrantClient(
    url=config["vector_store"]["url"]
)

# Build query components
query_builder = HybridQueryBuilder(
    dense_cfg=config["dense_vector"],
    sparse_cfg=config["sparse_vector"],
)

searcher = HybridSearcher(
    client=client,
    collection_name=config["vector_store"]["collection_name"],
    query_builder=query_builder,
)

# Run hybrid search
results = searcher.search(
    query="What is prompt engineering?",
    top_k=config["rag"]["top_k"],
)

results[:3]


[{'source': 'AI Engineering.pdf',
  'page': 235,
  'char_count': 1981,
  'chunk_id': 'AI Engineering.pdf_p235_c0',
  'chunk_index': 0,
  'chunk_char_count': 1951,
  'chunk_token_estimate': 488,
  'text': '1 In its short existence, prompt engineering has managed to generate an incredible amount of animosity. Com‐ plaints about how prompt engineering is not a real thing have gathered thousands of supporting comments; see 1, 2, 3, 4. When I told people that my upcoming book has a chapter on prompt engineering, many rolled their eyes. CHAPTER 5 Prompt Engineering Prompt engineering refers to the process of crafting an instruction that gets a model to generate the desired outcome. Prompt engineering is the easiest and most com‐ mon model adaptation technique. Unlike finetuning, prompt engineering guides a model’s behavior without changing the model’s weights. Thanks to the strong base capabilities of foundation models, many people have successfully adapted them for applications using prompt

In [None]:
from typing import List, Dict


class RAGPromptBuilder:
    """
    Builds a grounded RAG prompt from retrieved chunks.
    """

    SYSTEM_PROMPT = (
        "You are a helpful AI assistant. "
        "Answer the question using ONLY the provided context. "
        "If the answer is not contained in the context, say "
        "'I don't know based on the provided document.'"
    )

    def build(self, query: str, contexts: List[Dict]) -> List[Dict]:
        context_text = "\n\n".join(
            f"[Context {i+1}]\n{ctx['text']}"
            for i, ctx in enumerate(contexts)
        )

        user_prompt = f"""
Question:
{query}

Context:
{context_text}

Answer:
""".strip()

        return [
            {"role": "system", "content": self.SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ]


In [None]:
from openai import OpenAI
from tenacity import retry, stop_after_attempt, wait_exponential


class OpenAIAnswerGenerator:
    """
    Generates answers using OpenAI chat models.
    """

    def __init__(self, model: str):
        self.client = OpenAI()
        self.model = model

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(min=2, max=10),
    )
    def generate(self, messages: list) -> str:
        response = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            temperature=0.2,
        )

        return response.choices[0].message.content.strip()


In [None]:
class RAGPipeline:
    """
    Full RAG pipeline:
    Query → Hybrid Search → OpenAI Answer
    """

    def __init__(
        self,
        searcher,
        prompt_builder: RAGPromptBuilder,
        generator: OpenAIAnswerGenerator,
        top_k: int,
    ):
        self.searcher = searcher
        self.prompt_builder = prompt_builder
        self.generator = generator
        self.top_k = top_k

    def answer(self, query: str) -> str:
        contexts = self.searcher.search(
            query=query,
            top_k=self.top_k,
        )

        messages = self.prompt_builder.build(
            query=query,
            contexts=contexts,
        )

        return self.generator.generate(messages)


In [None]:
# Load config
config = ConfigLoader("parameters.toml").load()

# Build RAG components
prompt_builder = RAGPromptBuilder()

generator = OpenAIAnswerGenerator(
    model=config["llm"]["chat_model"]
)

rag = RAGPipeline(
    searcher=searcher, 
    prompt_builder=prompt_builder,
    generator=generator,
    top_k=config["rag"]["top_k"],
)


answer = rag.answer(
    "What is reward model?"
)

print(answer)


A reward model takes in a (prompt, response) pair and scores how good the response is given the prompt. It outputs a score indicating the quality of the response, and has been successfully used in reinforcement learning from human feedback (RLHF) for many years.


In [None]:
answer = rag.answer(
    "what is agent"
)

print(answer)

An agent is anything that can perceive its environment through sensors and act upon that environment through actuators. It is characterized by the environment it operates in and the set of actions it can perform.


In [None]:
# Evaluation Runner

In [None]:
!pip install pandas

In [24]:
import time
from statistics import mean
import pandas as pd
from qdrant_client import QdrantClient, models


In [31]:
client = QdrantClient(url="http://localhost:6333")

COLLECTION = "knowledge_base_chunks"
TOP_K = 5
RELEVANCE_THRESHOLD = 0.6


queries = [
    "What is prompt engineering?",
    "Explain vector databases",
    "What is retrieval augmented generation?",
    "Why hybrid search is better?",
]


In [32]:
def dense_only(query: str):
    return client.query_points(
        collection_name=COLLECTION,
        query=models.Document(
            text=query,
            model="sentence-transformers/all-MiniLM-L6-v2",
        ),
        using="dense",
        limit=TOP_K,
    ).points


In [33]:
def sparse_only(query: str):
    return client.query_points(
        collection_name=COLLECTION,
        query=models.Document(
            text=query,
            model="Qdrant/bm25",
        ),
        using="sparse",
        limit=TOP_K,
    ).points


In [34]:
def hybrid(query: str):
    return client.query_points(
        collection_name=COLLECTION,
        query=models.FusionQuery(
            fusion=models.Fusion.RRF
        ),
        prefetch=[
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model="sentence-transformers/all-MiniLM-L6-v2",
                ),
                using="dense",
                limit=20,
            ),
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model="Qdrant/bm25",
                ),
                using="sparse",
                limit=20,
            ),
        ],
        limit=TOP_K,
    ).points


In [35]:
def evaluate(name, retriever_fn):
    latencies = []
    avg_scores = []
    max_scores = []
    relevant_ratios = []

    for q in queries:
        start = time.perf_counter()
        results = retriever_fn(q)
        latency = (time.perf_counter() - start) * 1000
        latencies.append(latency)

        scores = [p.score for p in results if p.score is not None]

        if scores:
            avg_scores.append(mean(scores))
            max_scores.append(max(scores))
            relevant_ratios.append(
                sum(s >= RELEVANCE_THRESHOLD for s in scores) / len(scores)
            )
        else:
            avg_scores.append(0)
            max_scores.append(0)
            relevant_ratios.append(0)

    return {
        "retriever": name,
        "avg_latency_ms": round(mean(latencies), 2),
        "avg_similarity_score": round(mean(avg_scores), 4),
        "max_similarity_score": round(mean(max_scores), 4),
        "relevant_ratio": round(mean(relevant_ratios), 2),
    }


In [36]:
results = []

results.append(evaluate("Dense Only", dense_only))
results.append(evaluate("Sparse Only", sparse_only))
results.append(evaluate("Hybrid (RRF)", hybrid))


NOTE:
The similarity scores shown below are NOT directly comparable across retrievers.

Dense retrievers use cosine similarity (bounded between 0 and 1).
Sparse retrievers (BM25 / SPLADE) return relevance scores based on term statistics and are unbounded.
Hybrid retrievers use rank fusion (RRF), where the final score is a fusion score and does not represent semantic similarity.

Therefore, absolute score values should not be compared across retrieval strategies.
Evaluation should be interpreted in terms of ranking quality and latency, not raw scores.


In [None]:
df = pd.DataFrame(results)
df


Unnamed: 0,retriever,avg_latency_ms,avg_similarity_score,max_similarity_score,relevant_ratio
0,Dense Only,921.99,0.5598,0.6565,0.35
1,Sparse Only,307.88,3.8873,4.1842,1.0
2,Hybrid (RRF),38.23,0.4796,0.7917,0.2
