In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [8]:
documents = [
    "This is a list which containing sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords,",
    "Keyword-based search relies on sparse embeddings."
]

In [3]:
query = "keyword-based search"

In [10]:
import re
def preprocess(text):
  text = text.lower()
  text = re.sub(r'[^\w\s]', '', text)
  return text

In [12]:
preprocessedQuery = preprocess(query)
preprocessedQuery

'keywordbased search'

In [11]:
preproceesed = [preprocess(doc) for doc in documents]
preproceesed

['this is a list which containing sample documents',
 'keywords are important for keywordbased search',
 'document analysis involves extracting keywords',
 'keywordbased search relies on sparse embeddings']

In [17]:
vector = TfidfVectorizer()
x = vector.fit_transform(preproceesed)
x.toarray()

array([[0.        , 0.        , 0.37796447, 0.        , 0.37796447,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.37796447, 0.        , 0.        , 0.37796447, 0.        ,
        0.        , 0.37796447, 0.        , 0.        , 0.37796447,
        0.37796447],
       [0.        , 0.4533864 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.4533864 , 0.4533864 , 0.        ,
        0.        , 0.35745504, 0.35745504, 0.        , 0.        ,
        0.        , 0.        , 0.35745504, 0.        , 0.        ,
        0.        ],
       [0.46516193, 0.        , 0.        , 0.46516193, 0.        ,
        0.        , 0.46516193, 0.        , 0.        , 0.46516193,
        0.        , 0.        , 0.36673901, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.43671931, 0.        , 0.        , 0.       

In [21]:
vector.get_feature_names_out()

array(['analysis', 'are', 'containing', 'document', 'documents',
       'embeddings', 'extracting', 'for', 'important', 'involves', 'is',
       'keywordbased', 'keywords', 'list', 'on', 'relies', 'sample',
       'search', 'sparse', 'this', 'which'], dtype=object)

In [18]:
x.toarray()[0]

array([0.        , 0.        , 0.37796447, 0.        , 0.37796447,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.37796447, 0.        , 0.        , 0.37796447, 0.        ,
       0.        , 0.37796447, 0.        , 0.        , 0.37796447,
       0.37796447])

In [19]:
embeddedQuery = vector.transform([preprocessedQuery])
embeddedQuery.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.70710678, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.70710678, 0.        , 0.        ,
        0.        ]])

In [33]:
similarity = cosine_similarity(x, embeddedQuery)
similarity

array([[0.        ],
       [0.50551777],
       [0.        ],
       [0.48693426]])

In [34]:
np.argsort(similarity, axis = 0)


array([[0],
       [2],
       [3],
       [1]])

In [35]:
rankedIndices = np.argsort(similarity, axis=0)[::-1].flatten()

In [36]:
rankedIndices

array([1, 3, 2, 0])

In [38]:
rankedDocs = [documents[i] for i in rankedIndices]

In [39]:
for i, doc in enumerate(rankedDocs):
  print(f"Rank {i+1}: {doc}")

Rank 1: Keywords are important for keyword-based search.
Rank 2: Keyword-based search relies on sparse embeddings.
Rank 3: Document analysis involves extracting keywords,
Rank 4: This is a list which containing sample documents.


# Using Langchain

In [1]:
path = "/content/1.pdf"

In [None]:
%pip install pypdf
%pip install langchain_community

In [5]:
from google.colab import files
uploaded = files.upload()

Saving 1.pdf to 1.pdf


In [6]:
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader(path)

In [9]:
docs = loader.load()

In [10]:
%pip install langchain



In [18]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=30)
chunks = splitter.split_documents(docs)

In [None]:
%pip install langchain-huggingface

In [24]:
from langchain_huggingface import HuggingFaceEmbeddings
embedding = HuggingFaceEmbeddings(model_name = 'sentence-transformers/all-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [15]:
import os
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"

In [25]:
from google.colab import userdata
userdata.get('HUGGINGFACEHUB_API_TOKEN')

'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'

In [None]:
%pip install chromadb

In [17]:
from langchain_community.vectorstores import Chroma

In [27]:
vectorStore = Chroma.from_documents(chunks, embedding)

In [32]:
similarityRetriever = vectorStore.as_retriever(search_type = "similarity", search_kwargs = {"k": 3})

In [29]:
%pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [35]:
from langchain_community.retrievers import BM25Retriever

In [38]:
keywordRetriever = BM25Retriever.from_documents(chunks)
keywordRetriever.k = 3

In [43]:
def hybrid_retrieve(
    query,
    denseRetriever,
    sparseRetriever,
    denseWeight=0.5,
    sparseWeight=0.5,
    rrf_k=60
):
    scores = {}

    # Dense retrieval
    denseDocs = denseRetriever.invoke(query)
    for rank, doc in enumerate(denseDocs):
        key = doc.page_content
        scores[key] = scores.get(key, 0) + denseWeight / (rank + 1 + rrf_k)

    # Sparse retrieval
    sparse_docs = sparseRetriever.invoke(query)
    for rank, doc in enumerate(sparse_docs):
        key = doc.page_content
        scores[key] = scores.get(key, 0) + sparseWeight / (rank + 1 + rrf_k)

    # Sort by final score
    ranked_docs = sorted(scores.items(), key=lambda x: x[1], reverse=True)

    return [doc for doc, _ in ranked_docs]


In [46]:
query = "WHAT ARE THE DIFFERENCES IN THE UNINSURED RATE BY STATE IN 2022"

results = hybrid_retrieve(
    query=query,
    denseRetriever=similarityRetriever,
    sparseRetriever=keywordRetriever,
    denseWeight=0.5,
    sparseWeight=0.5
)

for i, r in enumerate(results):
    print(f"{i+1}. {r[:100]}")


1. decreases (Appendix Table B-2). 
DIFFERENCES IN THE 
UNINSURED RATE BY STATE 
IN 2022
In 2022, unins
2. CHANGES IN THE UNINSURED 
RATE BY STATE FROM 2021 
TO 2022
From 2021 to 2022, uninsured rates 
decre
3. percent), and New Mexico had 
the highest (Figure 4). 
• Twenty-seven states had lower 
uninsured ra
4. Medicaid coverage was 22.7 per-
cent in the group of states that 
expanded Medicaid eligibility and 


In [50]:
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [52]:
parser = StrOutputParser()

In [51]:
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace, HuggingFacePipeline
llm = HuggingFaceEndpoint(
    repo_id = "Qwen/Qwen3-4B-Instruct-2507",
    task = "text-generation",
)
model = ChatHuggingFace(llm = llm)


In [60]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import PromptTemplate

similarityPrompt = PromptTemplate(
    template="""
    Answer the question using only the context below.
    Context:
    {context}
    Question:
    {question}
    """,
    input_variables=["context", "question"]
)

similarityChain = (
    {
        "context": lambda x: "\n\n".join(
            doc.page_content
            for doc in similarityRetriever.invoke(x["question"])
        ),
        "question": RunnablePassthrough()
    }
    | similarityPrompt
    | model
    | parser
)


In [62]:
response1 = similarityChain.invoke({
    "question": "WHAT ARE THE DIFFERENCES IN THE UNINSURED RATE BY STATE IN 2022"
})
response1


'The differences in the uninsured rate by state in 2022 ranged from a low of 2.4 percent to a higher rate in New Mexico, which had the highest uninsured rate (Figure 4). Twenty-seven states had lower uninsured rates in 2022 compared to 2021, while Maine was the only state whose uninsured rate increased.'

In [63]:
sparsePrompt = PromptTemplate(
    template="""
    Answer the question using only the context below.
    Context:
    {context}
    Question:
    {question}
    """,
    input_variables=["context", "question"]
)

sparseChain = (
    {
        "context": lambda x: "\n\n".join(
            doc.page_content
            for doc in keywordRetriever.invoke(x["question"])
        ),
        "question": RunnablePassthrough()
    }
    | similarityPrompt
    | model
    | parser
)


In [65]:
response2 = sparseChain.invoke({
    "question": "WHAT ARE THE DIFFERENCES IN THE UNINSURED RATE BY STATE IN 2022"
})
response2

'In 2022, uninsured rates at the time of interview ranged across states from a low of 2.4 percent.'

In [66]:
hybridPrompt = PromptTemplate(
    template="""
Answer the question using the hybrid-retrieved context below.

Context:
{context}

Question:
{question}
""",
    input_variables=["context", "question"]
)
hybridChain = (
    {
        "context": lambda x: "\n\n".join(
            hybrid_retrieve(
                x["question"],
                similarityRetriever,
                keywordRetriever
            )
        ),
        "question": RunnablePassthrough()
    }
    | hybridPrompt
    | model
    | parser
)


In [67]:
response3 = hybridChain.invoke({
    "question": "WHAT ARE THE DIFFERENCES IN THE UNINSURED RATE BY STATE IN 2022"
})
response3

'In 2022, the uninsured rates varied across states, ranging from a low of 2.4 percent to a higher rate in New Mexico, which had the highest uninsured rate (specific value not provided in the context). The exact rates for each state are not fully detailed, but it is noted that the uninsured rate in Maine increased from 2021 to 2022, while 27 states experienced a decrease in their uninsured rates. The specific values beyond the low of 2.4 percent and the high in New Mexico are not explicitly listed in the provided context.'