In [1]:
!pip install langchain langchain-ollama langchain-huggingface langchain-community faiss-cpu rank-bm25 datasets tqdm gradio torch sentence-transformers

Collecting langchain-huggingface
  Using cached langchain_huggingface-1.0.1-py3-none-any.whl.metadata (2.1 kB)
Collecting huggingface-hub<1.0.0,>=0.33.4 (from langchain-huggingface)
  Using cached huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
INFO: pip is looking at multiple versions of langchain-huggingface to determine which version is compatible with other requirements. This could take a while.
Collecting langchain-huggingface
  Downloading langchain_huggingface-1.0.0-py3-none-any.whl.metadata (2.1 kB)
  Downloading langchain_huggingface-0.3.1-py3-none-any.whl.metadata (996 bytes)
Collecting huggingface-hub>=0.33.4 (from langchain-huggingface)
  Using cached huggingface_hub-1.1.4-py3-none-any.whl.metadata (13 kB)
Downloading langchain_huggingface-0.3.1-py3-none-any.whl (27 kB)
Using cached huggingface_hub-0.36.0-py3-none-any.whl (566 kB)
Installing collected packages: huggingface-hub, langchain-huggingface
  Attempting uninstall: huggingface-hub
    Found existing install

In [2]:
# Cell 2: Imports & Setup
import numpy as np
from tqdm import tqdm
from datasets import load_dataset

from langchain_ollama import ChatOllama
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from rank_bm25 import BM25Okapi
import gradio as gr

print("Setting up Ollama + Models...")
llm = ChatOllama(model="llama3:latest", temperature=0.0)
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

Setting up Ollama + Models...



In [3]:
# Cell 3: Load HotpotQA dataset
print("Loading HotpotQA dataset...")
dataset = load_dataset("hotpot_qa", "fullwiki", split="validation")

documents = []
questions = []

print("Processing documents...")
for item in tqdm(dataset):
    questions.append({
        "question": item["question"],
        "answer": item["answer"],
        "supporting_titles": set(item["supporting_facts"]["title"])
    })
    
    for title, sentences in zip(item["context"]["title"], item["context"]["sentences"]):
        for sent in sentences:
            documents.append(Document(
                page_content=sent.strip(),
                metadata={"title": title}
            ))

# Remove duplicate sentences
unique_docs = {doc.page_content: doc for doc in documents}
documents = list(unique_docs.values())
print(f"Total unique documents: {len(documents)} | Questions: {len(questions)}")

Loading HotpotQA dataset...
Processing documents...


100%|██████████| 7405/7405 [00:18<00:00, 402.42it/s]


Total unique documents: 273710 | Questions: 7405


In [4]:
# Cell 4: Build FAISS + BM25 indexes
print("Building FAISS + BM25 indexes...")
vectorstore = FAISS.from_documents(documents, embeddings)
tokenized_corpus = [doc.page_content.lower().split() for doc in documents]
bm25 = BM25Okapi(tokenized_corpus)
print("Indexes ready!")

Building FAISS + BM25 indexes...
Indexes ready!


In [8]:
# Cell 5: Prompts & Chains
decompose_template = "Break this question into 2-3 simple sub-questions:\nQuestion: {question}\nSub-questions:\n1."
generate_template = """Using only the context below, answer the question.

Context:
{context}

Question: {question}
Answer:"""

decompose_prompt = PromptTemplate.from_template(decompose_template)
generate_prompt = PromptTemplate.from_template(generate_template)

decompose_chain = decompose_prompt | llm
generate_chain = generate_prompt | llm

In [9]:
# Cell 6: Hybrid Retrieval Function
def retrieve(query, k=8):
    # Dense (FAISS)
    dense_docs = vectorstore.similarity_search(query, k=k)
    # Sparse (BM25)
    bm25_scores = bm25.get_scores(query.lower().split())
    bm25_docs = [documents[i] for i in bm25_scores.argsort()[::-1][:k]]
    
    # Merge & deduplicate
    seen = set()
    merged = []
    for doc in dense_docs + bm25_docs:
        if doc.page_content not in seen:
            seen.add(doc.page_content)
            merged.append(doc)
    return merged[:k]

In [12]:
# Cell 7 ko replace kar de iss se (sirf ye part change kar):

def multi_hop_rag(question):
    print(f"\nQuestion: {question}")
    
    # Step 1: Decompose
    sub_text = decompose_chain.invoke({"question": question}).content
    subs = []
    for line in sub_text.split("\n"):
        line = line.strip()
        if line and any(c.isdigit() for c in line[:3]):
            sub_q = line.split(".", 1)[-1].strip(" :-")
            if sub_q:
                subs.append(sub_q)
    
    if not subs:
        subs = [question]
    subs = subs[:3]
    print("Sub-questions:", subs)
    
    # Step 2: Retrieve + Strict context limiting
    context_parts = []
    for sq in subs:
        docs = retrieve(sq, k=5)  # k=6 → k=5 kar diya
        context_parts.extend([d.page_content for d in docs])
    
    # YE LINE SABSE ZAROORI HAI → 8k-10k tokens max
    context = "\n\n".join(context_parts)
    context = context[:24000]  # 32k → 24k (safe zone for 8B on CPU/GPU)
    
    # Optional: Agar phir bhi crash ho to 16000 kar dena
    # context = context[:16000]
    
    print(f"Final context length: ~{len(context.split())} words")
    
    # Step 3: Final answer
    answer = generate_chain.invoke({"question": question, "context": context}).content
    print("Answer generated!")
    return answer

In [None]:
# Cell 9: Gradio Interface
def gradio_fn(question):
    return multi_hop_rag(question)

demo = gr.Interface(
    fn=gradio_fn,
    inputs=gr.Textbox(label="Ask a Multi-Hop Question", lines=2, 
                      placeholder="e.g. Which magazine named the discoverer of general relativity Person of the Century in 1999?"),
    outputs=gr.Textbox(label="Answer"),
    title="Multi-Hop RAG with Llama3 (Ollama)",
    description="HotpotQA FullWiki • Hybrid Retrieval (FAISS + BM25) • 100% Local & Private",
    examples=[
        ["Which magazine named the discoverer of general relativity as Person of the Century in 1999?"],
        ["Who directed the film that won the Academy Award for Best Picture in 1994?"],
        ["Are the birthplace of Barack Obama and the capital of Hawaii the same state?"],
        ["What is the name of the university whose football team is called the Crimson Tide?"],
    ],
    allow_flagging="never",
    theme=gr.themes.Soft()
)

demo.launch(share=True)  # share=True → public link, share=False → localhost only



* Running on local URL:  http://127.0.0.1:7861
* Running on public URL: https://38a7fbb7e4ffa1ee25.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)





Question: Which magazine named the discoverer of general relativity as Person of the Century in 1999?
Sub-questions: ['Who was named Person of the Century by a magazine in 1999?', 'What was the name of the magazine that made this designation?']
Final context length: ~116 words
Answer generated!


In [15]:
# Cell 13: FAST Retrieval Evaluation (sirf 200 questions pe — 10–15 min max)

def evaluate_retrieval_fast(sample_size=200, top_ks=[1, 3, 5, 8, 10]):
    print(f"FAST EVALUATION on {sample_size} questions (out of {len(questions)})...")
    
    import random
    indices = random.sample(range(len(questions)), sample_size)
    
    results = defaultdict(list)
    
    for idx in tqdm(indices, desc="Evaluating"):
        q = questions[idx]["question"]
        true_titles = get_ground_truth_titles(idx)
        
        retrieved_docs = retrieve(q, k=max(top_ks))
        retrieved_titles_list = [doc.metadata["title"] for doc in retrieved_docs]
        
        for k in top_ks:
            retrieved_k = set(retrieved_titles_list[:k])
            relevant_retrieved = len(retrieved_k & true_titles)
            total_relevant = len(true_titles)
            
            # Precision, Recall
            precision = relevant_retrieved / k if k > 0 else 0
            recall = relevant_retrieved / total_relevant if total_relevant > 0 else 0
            results[f"Precision@{k}"].append(precision)
            results[f"Recall@{k}"].append(recall)
            
            # NDCG
            dcg = sum((1 / math.log2(r+1)) for r in range(1, k+1) if retrieved_titles_list[r-1] in true_titles)
            idcg = sum(1 / math.log2(i+1) for i in range(1, min(k, total_relevant)+1))
            ndcg = dcg / idcg if idcg > 0 else 0
            results[f"NDCG@{k}"].append(ndcg)
    
    # Print table
    print("\n" + "="*60)
    print(f"RETRIEVAL METRICS (Sample={sample_size} questions)")
    print("="*60)
    print(f"{'Metric':<12} {'K=1':>8} {'K=3':>8} {'K=5':>8} {'K=8':>8} {'K=10':>8}")
    print("-"*60)
    for metric in ["Precision@", "Recall@", "NDCG@"]:
        row = f"{metric:<12}"
        for k in top_ks:
            key = f"{metric}{k}"
            if key in results:
                avg = sum(results[key]) / len(results[key])
                row += f"{avg*100:8.2f}%"
        print(row)
    
   


evaluate_retrieval_fast(sample_size=200, top_ks=[1, 3, 5, 8, 10])

FAST EVALUATION on 200 questions (out of 7405)...


Evaluating:  16%|█▌        | 32/200 [01:32<08:34,  3.06s/it]


Question: Are the birthplace of Barack Obama and the capital of Hawaii the same state?


Evaluating:  20%|██        | 40/200 [04:23<1:15:04, 28.15s/it]

Sub-questions: ["Is Barack Obama's birthplace located in the state of Hawaii?"]


Evaluating:  20%|██        | 41/200 [04:36<1:02:37, 23.63s/it]

Final context length: ~46 words


Evaluating:  23%|██▎       | 46/200 [07:33<1:39:44, 38.86s/it]

Answer generated!


Evaluating: 100%|██████████| 200/200 [17:29<00:00,  5.25s/it] 



RETRIEVAL METRICS (Sample=200 questions)
Metric            K=1      K=3      K=5      K=8     K=10
------------------------------------------------------------
Precision@     53.50%   23.50%   15.90%   10.62%    8.80%
Recall@        26.75%   35.25%   39.75%   42.50%   44.00%
NDCG@          53.50%   43.14%   47.64%   50.64%   51.83%


MRR: 100%|██████████| 200/200 [07:03<00:00,  2.12s/it]


Mean Reciprocal Rank (MRR): 0.5996



