In [1]:
# Install dependencies into *this notebook kernel* (prefer %pip over !python -m pip)
%pip install -q pypdf langchain langchain_community langchain_openai langchain_chroma rank_bm25

Note: you may need to restart the kernel to use updated packages.


#### Initialize OpenAI LLM

In [2]:
import os
from langchain_openai import ChatOpenAI

# Read the API key from the environment so it never appears in the notebook itself.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Chat model used by the rest of the notebook.
# NOTE(review): confirm this model honours temperature=0; some model families only
# accept their default temperature.
chat_model = ChatOpenAI(
    model="gpt-5-nano-2025-08-07",
    temperature=0,
    openai_api_key=OPENAI_API_KEY,
)

#### Initialize Embedding Model

In [3]:
from langchain_openai import OpenAIEmbeddings
# Embedding model handed to the vector store; uses the same API key variable as the chat model.
embedding_model = OpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_key=OPENAI_API_KEY,
)

#### Load PDF

In [4]:
from langchain_community.document_loaders import PyPDFLoader

# The path is relative, so it resolves against the kernel's current working directory.
loader = PyPDFLoader("Research.pdf")

# Load the PDF into `docs`, which the splitting step consumes.
docs = loader.load()




#### Split Document into chunks

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter configured with chunk_size=400 and chunk_overlap=50.
# NOTE(review): units are presumably characters (the splitter's default length function) — confirm.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=50)

# Split the loaded documents; `chunks` feeds both the vector store and the BM25 retriever.
chunks = text_splitter.split_documents(docs)

In [6]:
# Sanity check: how many chunks the splitter produced.
len(chunks)

15

#### Create Semantic Search Retriever

In [7]:
from langchain_chroma import Chroma

# Embed every chunk with the embedding model and index the result in a Chroma store.
vector_store = Chroma.from_documents(documents=chunks, embedding=embedding_model)

# Retriever view over the store, configured with k=2 through search_kwargs.
vector_store_retriver = vector_store.as_retriever(
    search_kwargs={"k": 2},
)

In [8]:
# Display the retriever's repr to confirm its configuration (search_kwargs).
vector_store_retriver

VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x000002C313432570>, search_kwargs={'k': 2})

#### Create Keyword Search Retriever

In [9]:
from langchain_community.retrievers import BM25Retriever

# Build the BM25 retriever over the same chunks, with k=2 supplied at construction
# time so the cell produces a fully configured retriever in a single statement.
keyword_retriever = BM25Retriever.from_documents(chunks, k=2)

In [10]:
# Display the retriever's repr to confirm its configuration (k).
keyword_retriever

BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x000002C35A44AEA0>, k=2)

#### Create Hybrid search Retriever

In [11]:
# Hybrid search: combine semantic + keyword retrievers (no Embedchain client required)

from collections import defaultdict
from typing import Any

try:
    # BaseRetriever is available in most LangChain installs via langchain-core
    from langchain_core.retrievers import BaseRetriever
except ImportError:
    # Only a genuinely missing package triggers the fallback; any other failure raised
    # while importing langchain-core is a real problem and is allowed to surface.
    # With the fallback, `hybrid_search(...)` can still be called directly, but a class
    # that derives from plain `object` does not get a keyword-argument constructor.
    BaseRetriever = object

def _retrieve(retriever, query: str):
    """Compatibility helper: works across different LangChain retriever interfaces."""
    if hasattr(retriever, "invoke"):
        return retriever.invoke(query)
    return retriever.get_relevant_documents(query)

def _doc_key(doc):
    # stable key for de-duplication across retrievers
    meta_items = tuple(sorted((doc.metadata or {}).items()))
    return (doc.page_content, meta_items)

def hybrid_search(
    query: str,
    *,
    vector_retriever,
    keyword_retriever,
    k: int = 4,
    weights=(0.5, 0.5),
):
    """Merge semantic and keyword results for ``query`` into one ranked list.

    Both retrievers are queried with ``query``. A document at zero-based position
    ``rank`` in a result list adds ``weight * 1 / (rank + 1)`` to its score, using
    ``weights[0]`` for the vector retriever and ``weights[1]`` for the keyword
    retriever. Documents sharing the same ``_doc_key`` count as one entry and
    their contributions are summed.

    Returns the documents ordered by descending score, cut with a ``[:k]`` slice.
    """
    # Order matters: index 0 pairs with weights[0], index 1 with weights[1].
    result_lists = (
        _retrieve(vector_retriever, query),
        _retrieve(keyword_retriever, query),
    )

    fused_scores = defaultdict(float)
    doc_lookup = {}

    for source_index, hits in enumerate(result_lists):
        for position, hit in enumerate(hits, start=1):
            identity = _doc_key(hit)
            # For duplicates, the most recently seen copy is the one returned.
            doc_lookup[identity] = hit
            fused_scores[identity] += weights[source_index] * (1.0 / position)

    # sorted() is stable, so documents with equal scores keep first-seen order.
    best_first = sorted(fused_scores, key=fused_scores.get, reverse=True)
    return [doc_lookup[identity] for identity in best_first[:k]]

class HybridRetriever(BaseRetriever):
    """Retriever that fuses a vector retriever and a keyword retriever.

    Every query is delegated to ``hybrid_search`` with this instance's two
    retrievers, ``k`` and ``weights``.
    """

    # Annotated with typing.Any: the builtin function `any` is not a type, so using
    # it as a field annotation gives the base class nothing valid to work with.
    vector_retriever: Any
    keyword_retriever: Any
    # Passed to hybrid_search as `k`, which slices the ranked list with [:k].
    k: int = 4
    # Passed to hybrid_search as `weights`: index 0 scales vector hits, index 1 keyword hits.
    weights: tuple = (0.5, 0.5)

    def _get_relevant_documents(self, query: str, *, run_manager=None):
        """Return the fused ranking for ``query``.

        ``run_manager`` is accepted for interface compatibility and is not used.
        """
        return hybrid_search(
            query,
            vector_retriever=self.vector_retriever,
            keyword_retriever=self.keyword_retriever,
            k=self.k,
            weights=self.weights,
        )

# Combine the two retrievers built in the earlier cells, weighting them equally.
ensemble_retriever = HybridRetriever(
    k=4,
    weights=(0.5, 0.5),
    keyword_retriever=keyword_retriever,
    vector_retriever=vector_store_retriver,
)

# Bare expression on the last line so the notebook displays the retriever's repr.
ensemble_retriever

  warn(


HybridRetriever(vector_retriever=VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x000002C313432570>, search_kwargs={'k': 2}), keyword_retriever=BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x000002C35A44AEA0>, k=2))