In [None]:
%pip install flashrank

In [None]:
%pip install langchain langchain-community langchain-huggingface

In [None]:
%pip install pypdf tiktoken

In [None]:
%pip install chromadb

In [None]:
%pip install rank_bm25

In [None]:
%pip install sentence-transformers

## STEP 1 (DATA INGESTION)

In [8]:
from google.colab import files
uploaded = files.upload()

Saving nlp.pdf to nlp.pdf


In [9]:
filePath = "/content/nlp.pdf"

In [10]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [11]:
def loadDocs(path:str):
  loader = PyPDFLoader(path)
  docs = loader.load()
  return docs

In [12]:
docs = loadDocs(filePath)
print(len(docs))
print(docs[0])

19
page_content='Retrieval-Augmented Generation for
Knowledge-Intensive NLP Tasks
Patrick Lewis†‡, Ethan Perez⋆,
Aleksandra Piktus†, Fabio Petroni†, Vladimir Karpukhin†, Naman Goyal†, Heinrich Küttler†,
Mike Lewis†, Wen-tau Yih†, Tim Rocktäschel†‡, Sebastian Riedel†‡, Douwe Kiela†
†Facebook AI Research;‡University College London;⋆New York University;
plewis@fb.com
Abstract
Large pre-trained language models have been shown to store factual knowledge
in their parameters, and achieve state-of-the-art results when ﬁne-tuned on down-
stream NLP tasks. However, their ability to access and precisely manipulate knowl-
edge is still limited, and hence on knowledge-intensive tasks, their performance
lags behind task-speciﬁc architectures. Additionally, providing provenance for their
decisions and updating their world knowledge remain open research problems. Pre-
trained models with a differentiable access mechanism to explicit non-parametric
memory have so far been only investigated for extracti

In [13]:
def splitDocs(docs):
  splitter = RecursiveCharacterTextSplitter(
      chunk_size = 400,
      chunk_overlap = 150
  )
  chunks = splitter.split_documents(docs)
  return chunks

In [29]:
chunks = splitDocs(docs)
print(len(chunks))
print(chunks[0])

262
page_content='Retrieval-Augmented Generation for
Knowledge-Intensive NLP Tasks
Patrick Lewis†‡, Ethan Perez⋆,
Aleksandra Piktus†, Fabio Petroni†, Vladimir Karpukhin†, Naman Goyal†, Heinrich Küttler†,
Mike Lewis†, Wen-tau Yih†, Tim Rocktäschel†‡, Sebastian Riedel†‡, Douwe Kiela†
†Facebook AI Research;‡University College London;⋆New York University;
plewis@fb.com
Abstract' metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2021-04-13T00:48:38+00:00', 'author': '', 'keywords': '', 'moddate': '2021-04-13T00:48:38+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': '/content/nlp.pdf', 'total_pages': 19, 'page': 0, 'page_label': '1'}


In [15]:
from langchain_huggingface import HuggingFaceEmbeddings, ChatHuggingFace, HuggingFaceEndpoint

In [None]:
# HF API 

In [None]:
embedding = HuggingFaceEmbeddings(model_name = 'sentence-transformers/all-MiniLM-L6-v2')

In [19]:
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace, HuggingFacePipeline
llm = HuggingFaceEndpoint(
    repo_id = "meta-llama/Llama-3.2-3B-Instruct",
    task = "text-generation",
)
model = ChatHuggingFace(llm = llm)

In [20]:
parser = StrOutputParser()

### STEP 2 (SELF QUERYING ANALYZER)

In [21]:
from langchain_core.prompts import PromptTemplate
import json
selfQueryPrompt = PromptTemplate(
    template="""
You are a query analyzer.

Your task is to split the question into:
1. semantic_query → what should be searched semantically
2. filters → structured metadata constraints

Allowed filters:
- page (integer)

RULES:
- If the question mentions "page X", extract page = X
- Remove filter-related words from the semantic query
- If no filters apply, return an empty filters object

EXAMPLES:

Question:
"What is discussed on page 6 about Jeopardy Question Generation?"

Output:
{{
  "semantic_query": "Jeopardy Question Generation",
  "filters": {{ "page": 6 }}
}}

Question:
"Explain Jeopardy Question Generation"

Output:
{{
  "semantic_query": "Jeopardy Question Generation",
  "filters": {{}}
}}

NOW ANALYZE THIS QUESTION:

Question:
{question}

Return ONLY valid JSON.
""",
    input_variables=["question"]
)


In [22]:
def selfQueryAnalyzer(question):
    chain = selfQueryPrompt | model | parser
    response = chain.invoke({"question": question})

    try:
        parsed = json.loads(response)
        semanticQuery = parsed.get("semantic_query", question)
        filters = parsed.get("filters", {})
    except Exception as e:
        print("Parsing failed:", e)
        semanticQuery = question
        filters = {}

    return semanticQuery, filters


In [23]:
q1 = "What is discussed on page 6 about Jeopardy Question Generation?"
q2 = "Explain Jeopardy Question Generation"
q3 = "What are the results in Table 2?"
q4 = "What are the models discussed on page no 3,4 of this research paper?"
semantic_query, filters = selfQueryAnalyzer(q1)

print("Semantic Query:", semantic_query)
print("Filters:", filters)


Semantic Query: Jeopardy Question Generation
Filters: {'page': 6}


### STEP 3 (HYPOTHETICAL DOCUMENT EMBEDDING)

In [30]:
HyDEPrompt = PromptTemplate(
    template="""
You are generating a hypothetical passage that could appear inside a research paper.

STRICT RULES:
- Write in academic / research-paper style
- Use terminology that would realistically appear in the given document
- Do NOT introduce methods, models, or techniques not mentioned in the paper
- Do NOT explain generally like a textbook
- Do NOT mention GANs, templates, or unrelated NLP methods
- Focus on factual, descriptive language

TASK:
Write a short paragraph (5–7 sentences) that could plausibly appear in the paper
to address the following topic.

Topic:
{semantic_query}

Hypothetical passage:
""",
    input_variables=["semantic_query"]
)


def GenerateHyDE(semantic_query):
    chain = HyDEPrompt | model | parser
    hypotheticalDoc = chain.invoke(
        {"semantic_query": semantic_query}
    )
    return hypotheticalDoc.strip()

In [31]:
question = "What is discussed on page 6 about Jeopardy Question Generation?"

semantic_query, filters = selfQueryAnalyzer(question)

hydeDoc = GenerateHyDE(semantic_query)

print("Semantic Query:")
print()
print(semantic_query)
print()
print("HyDE Output:")
print(hydeDoc)
print(len(hydeDoc))

Semantic Query:

Jeopardy Question Generation

HyDE Output:
The Jeopardy question generation task has garnered significant attention in recent years, with various studies investigating the optimization of question form and difficulty. Research has shown that the performance of Jeopardy contestants is strongly correlated with the quality and relevance of the generated questions (Keller et al., 2019). In an effort to improve question quality, this study employs a revised version of the "Question Form Generator" (QFG) algorithm, which incorporates a weighted combination of linguistic features, including part-of-speech tags and sentence structures, to produce more informative and nuanced questions. By analyzing the correlation between question difficulty and contestant performance, we have identified a significant relationship between the QFG's ability to generate questions with higher median difficulty and top-performing contestants. Our findings suggest that the QFG algorithm has the pot

## STEP 4 (MERGER RETRIEVER)

In [38]:
from langchain_community.vectorstores import Chroma
vectorStore = Chroma.from_documents(chunks, embedding)
denseRetriever = vectorStore.as_retriever(search_type="similarity", search_kwargs={"k": 10})

In [40]:
from langchain_community.retrievers import BM25Retriever
sparseRetriever = BM25Retriever.from_documents(chunks)
sparseRetriever.k = 10

In [41]:
def denseRetrieverWithHyDE(semantic_query, hyde_doc, denseRetriever):
    combined_query = semantic_query + "\n" + hyde_doc
    return denseRetriever.invoke(combined_query)

In [42]:
def sparseRetrieverOnly(semantic_query, sparseRetriever):
    return sparseRetriever.invoke(semantic_query)

In [43]:
def metadataRetriever(all_chunks, filters):
    if not filters:
        return []

    results = []
    for doc in all_chunks:
        keep = True
        for key, value in filters.items():
            if doc.metadata.get(key) != value:
                keep = False
                break
        if keep:
            results.append(doc)

    return results

In [45]:
def MergerRetriever(
    semantic_query,
    hyde_doc,
    denseRetriever,
    sparseRetriever,
    all_chunks,
    filters,
    weights=(0.4, 0.4, 0.2),
    rrf_k=60
):
    scores = {}
    doc_map = {}

    # Dense
    dense_docs = denseRetrieverWithHyDE(
        semantic_query, hyde_doc, denseRetriever
    )
    for rank, doc in enumerate(dense_docs):
        key = doc.page_content
        doc_map[key] = doc
        scores[key] = scores.get(key, 0) + weights[0] / (rank + 1 + rrf_k)

    # Sparse
    sparse_docs = sparseRetrieverOnly(
        semantic_query, sparseRetriever
    )
    for rank, doc in enumerate(sparse_docs):
        key = doc.page_content
        doc_map[key] = doc
        scores[key] = scores.get(key, 0) + weights[1] / (rank + 1 + rrf_k)

    # Metadata
    meta_docs = metadataRetriever(all_chunks, filters)
    for rank, doc in enumerate(meta_docs):
        key = doc.page_content
        doc_map[key] = doc
        scores[key] = scores.get(key, 0) + weights[2] / (rank + 1 + rrf_k)

    ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    return [doc_map[k] for k, _ in ranked]


In [48]:
question = "What is discussed on page 6 about Jeopardy Question Generation?"

semantic_query, filters = selfQueryAnalyzer(question)
hyde_doc = GenerateHyDE(semantic_query)

mergedDocs = MergerRetriever(
    semantic_query=semantic_query,
    hyde_doc=hyde_doc,
    denseRetriever=denseRetriever,
    sparseRetriever=sparseRetriever,
    all_chunks=chunks,
    filters=filters
)

print(len(mergedDocs))
print(mergedDocs[0].page_content[:300])


28
recent approaches that use specialised pre-training objectives on TriviaQA [24]. Despite these being
extractive tasks, we ﬁnd that unconstrained generation outperforms previous extractive approaches.
For knowledge-intensive generation, we experiment with MS-MARCO [1] and Jeopardy question
generation


## STEP 5 (FLASH RERANKING)


In [None]:
!pip install flashrank

In [52]:
from flashrank.Ranker import Ranker, RerankRequest

In [51]:
from flashrank.Ranker import Ranker, RerankRequest
def reranking(query, passages, choice):
    if choice == "Nano":
        ranker = Ranker()
    elif choice == "Small":
        ranker = Ranker(model_name="ms-marco-MiniLM-L-12-v2", cache_dir="/opt")
    elif choice == "Medium":
        ranker = Ranker(model_name="rank-T5-flan", cache_dir="/opt")
    elif choice == "Large":
        ranker = Ranker(model_name="ms-marco-MultiBERT-L-12", cache_dir="/opt")

    rerankRequest = RerankRequest(
        query=query,
        passages=passages
    )

    results = ranker.rerank(rerankRequest)
    return results


In [53]:
passages = [
    {"id": i, "text": doc.page_content}
    for i, doc in enumerate(mergedDocs)
]
passages[0]

{'id': 0,
 'text': 'recent approaches that use specialised pre-training objectives on TriviaQA [24]. Despite these being\nextractive tasks, we ﬁnd that unconstrained generation outperforms previous extractive approaches.\nFor knowledge-intensive generation, we experiment with MS-MARCO [1] and Jeopardy question\ngeneration, and we ﬁnd that our models generate responses that are more factual, speciﬁc, and'}

In [54]:
rerankingResults = reranking(
    query=semantic_query,
    passages=passages,
    choice="Medium"
)
print(len(rerankingResults))
rerankingResults[0]

rank-T5-flan.zip: 100%|██████████| 73.7M/73.7M [00:01<00:00, 54.0MiB/s]


28


{'id': 4,
 'text': 'Fact Veriﬁcation: Fact Query\nsupports\t(y)\nQuestion Generation\nFact Veriﬁcation:\nLabel Generation\nDocument\nIndex\nDefine\t"middle\tear"(x)\nQuestion Answering:\nQuestion Query\nThe\tmiddle\tear\tincludes\nthe\ttympanic\tcavity\tand\nthe\tthree\tossicles.\t\t(y)\nQuestion Answering:\nAnswer GenerationRetriever pη\n(Non-Parametric)\nz4\nz3\nz2\nz1\nd(z)\nJeopardy Question\nGeneration:\nAnswer Query',
 'score': np.float32(0.65737104)}

### STEP 6 (CROSS-ENCODERS) RERANKING AGAIN FOR MORE ACCURATE RESULTS

In [55]:
flashDocs = [mergedDocs[item["id"]] for item in rerankingResults]
print(flashDocs[0])

page_content='Fact Veriﬁcation: Fact Query
supports	(y)
Question Generation
Fact Veriﬁcation:
Label Generation
Document
Index
Define	"middle	ear"(x)
Question Answering:
Question Query
The	middle	ear	includes
the	tympanic	cavity	and
the	three	ossicles.		(y)
Question Answering:
Answer GenerationRetriever pη
(Non-Parametric)
z4
z3
z2
z1
d(z)
Jeopardy Question
Generation:
Answer Query' metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2021-04-13T00:48:38+00:00', 'author': '', 'keywords': '', 'moddate': '2021-04-13T00:48:38+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': '/content/nlp.pdf', 'total_pages': 19, 'page': 1, 'page_label': '2'}


In [None]:
from sentence_transformers import CrossEncoder

crossEncoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
pairs = [[question, doc.page_content] for doc in flashDocs]

scores = crossEncoder.predict(pairs)
crossReranked = list(zip(scores, flashDocs))
crossReranked = sorted(crossReranked, reverse=True)


In [57]:
print(len(pairs))
print()
pairs[0]

28



['What is discussed on page 6 about Jeopardy Question Generation?',
 'Fact Veriﬁcation: Fact Query\nsupports\t(y)\nQuestion Generation\nFact Veriﬁcation:\nLabel Generation\nDocument\nIndex\nDefine\t"middle\tear"(x)\nQuestion Answering:\nQuestion Query\nThe\tmiddle\tear\tincludes\nthe\ttympanic\tcavity\tand\nthe\tthree\tossicles.\t\t(y)\nQuestion Answering:\nAnswer GenerationRetriever pη\n(Non-Parametric)\nz4\nz3\nz2\nz1\nd(z)\nJeopardy Question\nGeneration:\nAnswer Query']

In [58]:
print(len(crossReranked))
print(crossReranked[0][0])   # Relevence score
print(crossReranked[0][1].page_content[:200])

28
3.2308133
BART for Jeopardy question generation. Following recent work on diversity-promoting decoding
[33, 59, 39], we also investigate generation diversity by calculating the ratio of distinct ngrams to
total


In [59]:
K = 5
finalRerankedDocs = [doc for _, doc in crossReranked[:K]]
print(len(finalRerankedDocs))
print(finalRerankedDocs[0].page_content[:200])

5
BART for Jeopardy question generation. Following recent work on diversity-promoting decoding
[33, 59, 39], we also investigate generation diversity by calculating the ratio of distinct ngrams to
total


### STEP 7 (WINDOW SEARCH RETRIEVER)

In [60]:
import re
from collections import defaultdict

def BuildSentenceIndex(allChunks):
    pageSentences = defaultdict(list)

    for chunk in allChunks:
        page = chunk.metadata.get("page")
        sentences = re.split(r'(?<=[.!?])\s+', chunk.page_content)

        for s in sentences:
            clean = s.strip()
            if clean:
                pageSentences[page].append(clean)

    return pageSentences


sentenceIndex = BuildSentenceIndex(chunks)

In [63]:
from langchain_core.documents import Document

def SentenceWindowRetriever(rankedChunks,sentenceIndex,windowSize=2):
    expandedDocs = []

    for doc in rankedChunks:
        page = doc.metadata.get("page")
        if page not in sentenceIndex:
            expandedDocs.append(doc)
            continue

        fullSentences = sentenceIndex[page] # Sentences from entire page not chunk

        chunkSentences = re.split(r'(?<=[.!?])\s+',doc.page_content) # Splitting retreived chunk into sentences

        indices = []

        # locating sentence positions in full document
        for i, sent in enumerate(fullSentences):
            for cs in chunkSentences:
                if cs.strip() and cs.strip() in sent:
                    indices.append(i)

        if not indices:
            expandedDocs.append(doc)
            continue

        start = max(0, min(indices) - windowSize)
        end = min(len(fullSentences), max(indices) + windowSize + 1)

        windowed_sentences = fullSentences[start:end]

        expandedDocs.append(
            Document(
                page_content=" ".join(dict.fromkeys(windowed_sentences)),
                metadata=doc.metadata
            )
        )

    return expandedDocs


In [65]:
print("Original length:", len(finalRerankedDocs[0].page_content))

windowDocs = SentenceWindowRetriever(
    rankedChunks=finalRerankedDocs,
    sentenceIndex=sentenceIndex,
    windowSize=2
)

print("Windowed length:", len(windowDocs[0].page_content))

print("\nORIGINAL:\n", finalRerankedDocs[0].page_content)
print()
print("AFTER WINDOWING")
print()
print(windowDocs[0].page_content)


Original length: 384
Windowed length: 1075

ORIGINAL:
 BART for Jeopardy question generation. Following recent work on diversity-promoting decoding
[33, 59, 39], we also investigate generation diversity by calculating the ratio of distinct ngrams to
total ngrams generated by different models. Table 5 shows that RAG-Sequence’s generations are
more diverse than RAG-Token’s, and both are signiﬁcantly more diverse than BART without needing

AFTER WINDOWING

We ﬁnd that the top retrieved document is from a gold article
in 71% of cases, and a gold article is present in the top 10 retrieved articles in 90% of cases. in 71% of cases, and a gold article is present in the top 10 retrieved articles in 90% of cases. 4.5 Additional Results
Generation Diversity Section 4.3 shows that RAG models are more factual and speciﬁc than
BART for Jeopardy question generation. Following recent work on diversity-promoting decoding BART for Jeopardy question generation. Following recent work on diversity-promoti

In [66]:
len(windowDocs)

5

### STEP 8 (PARENT DOCUMENT RETREIVER)

In [67]:
from collections import defaultdict
from langchain_core.documents import Document

def ParentDocumentRetriever(ranked_docs,all_chunks):  # Topk docs and all chunks

    page_to_chunks = defaultdict(list)

    for chunk in all_chunks:
        page = chunk.metadata.get("page")
        page_to_chunks[page].append(chunk)

    parent_docs = []
    seen_pages = set()

    for doc in ranked_docs:
        page = doc.metadata.get("page")

        if page in seen_pages:
            continue

        seen_pages.add(page)

        page_chunks = page_to_chunks.get(page, [])

        merged_text = " ".join(
            chunk.page_content for chunk in page_chunks
        )

        parent_docs.append(
            Document(
                page_content=merged_text,
                metadata=doc.metadata
            )
        )

    return parent_docs


In [68]:
parentDocs = ParentDocumentRetriever(
    ranked_docs=windowDocs,
    all_chunks=chunks
)
print(len(parentDocs))
print("Windowed length:", len(windowDocs[0].page_content))
print("Parent length:", len(parentDocs[0].page_content))
print()
print("PARENT DOC SAMPLE:")
print()
print(parentDocs[0].page_content[:1000])


5
Windowed length: 1075
Parent length: 5803

PARENT DOC SAMPLE:

Document 1: his works are considered classics of American
literature ... His wartime experiences formed the basis for his novel
”A Farewell to Arms” (1929) ...
Document 2: ... artists of the 1920s ”Lost Generation” expatriate
community . His debut novel, ”The Sun Also Rises” , was published
in 1926.
BOS
”
TheSunAlso
R ises
” is a
novel
by this
author
of ” A
Farewellto
Arms
”
Doc 1
Doc 2
Doc 3 in 1926.
BOS
”
TheSunAlso
R ises
” is a
novel
by this
author
of ” A
Farewellto
Arms
”
Doc 1
Doc 2
Doc 3
Doc 4
Doc 5
Figure 2: RAG-Token document posteriorp(zi|x,yi,y−i) for each generated token for input “Hem-
ingway" for Jeopardy generation with 5 retrieved documents. The posterior for document 1 is high ingway" for Jeopardy generation with 5 retrieved documents. The posterior for document 1 is high
when generating “A Farewell to Arms" and for document 2 when generating “The Sun Also Rises".
Table 3: Examples from generation tasks. 

In [69]:
print("Windowed chunk pages:", [doc.metadata["page"] for doc in windowDocs])
print("Parent doc pages:", [doc.metadata["page"] for doc in parentDocs])

Windowed chunk pages: [6, 7, 18, 1, 4]
Parent doc pages: [6, 7, 18, 1, 4]


### STEP 9 (CONTEXT COMPRESSION)

In [70]:
import numpy as np
import re
from langchain_core.documents import Document

def ContextualCompression(
    query,
    docs,
    embeddings,
    similarity_threshold=0.45
):
    queryEmbedding = embeddings.embed_query(query)
    compressedDocs = []

    for doc in docs:
        sentences = re.split(r'(?<=[.!?])\s+', doc.page_content)
        keptSentences = []

        for sent in sentences:
            sent_embedding = embeddings.embed_query(sent)
            similarity = np.dot(queryEmbedding, sent_embedding) / (
                np.linalg.norm(queryEmbedding) * np.linalg.norm(sent_embedding)
            )

            if similarity >= similarity_threshold:
                keptSentences.append(sent)

        if keptSentences:
            compressedDocs.append(
                Document(
                    page_content=" ".join(keptSentences),
                    metadata=doc.metadata
                )
            )
        else:
            compressedDocs.append(doc)

    return compressedDocs


In [71]:
compressedDocs = ContextualCompression(
    query=question,
    docs=parentDocs,
    embeddings=embedding
)


print(len(compressedDocs))
print(compressedDocs[0].page_content)

5
The posterior for document 1 is high ingway" for Jeopardy generation with 5 retrieved documents. 4.5 Additional Results
Generation Diversity Section 4.3 shows that RAG models are more factual and speciﬁc than
BART for Jeopardy question generation. Following recent work on diversity-promoting decoding BART for Jeopardy question generation.


### STEP 10 (LongContextReorder) For Solving Lost in Middle Phenomenon

In [72]:
finalDocs = compressedDocs

In [73]:
from langchain_community.document_transformers import LongContextReorder
reorder = LongContextReorder()
finalDocsReordered = reorder.transform_documents(finalDocs)

for i, doc in enumerate(finalDocsReordered):
    print(f"({i+1}) --> {doc.page_content}")


(1) --> The posterior for document 1 is high ingway" for Jeopardy generation with 5 retrieved documents. 4.5 Additional Results
Generation Diversity Section 4.3 shows that RAG models are more factual and speciﬁc than
BART for Jeopardy question generation. Following recent work on diversity-promoting decoding BART for Jeopardy question generation.
(2) --> Table 7: Number of instances in the datasets used. *A hidden subset of this data is used for evaluation
Task Train Development Test
Natural Questions 79169 8758 3611
TriviaQA 78786 8838 11314
WebQuestions 3418 362 2033
CuratedTrec 635 134 635
Jeopardy Question Generation 97392 13714 26849
MS-MARCO 153726 12468 101093*
FEVER-3-way 145450 10000 10000
FEVER-2-way 96966 6666 6666 Jeopardy Question Generation 97392 13714 26849
MS-MARCO 153726 12468 101093*
FEVER-3-way 145450 10000 10000
FEVER-2-way 96966 6666 6666
parameters. The best performing "closed-book" (parametric only) open-domain QA model is T5-11B
with 11 Billion trainable paramet

### STEP 9 (AUGMENTATION)

In [74]:
def buildContext(docs):
    return "\n\n---\n\n".join(doc.page_content for doc in docs)
context = buildContext(finalDocsReordered)
context

'The posterior for document 1 is high ingway" for Jeopardy generation with 5 retrieved documents. 4.5 Additional Results\nGeneration Diversity Section 4.3 shows that RAG models are more factual and speciﬁc than\nBART for Jeopardy question generation. Following recent work on diversity-promoting decoding BART for Jeopardy question generation.\n\n---\n\nTable 7: Number of instances in the datasets used. *A hidden subset of this data is used for evaluation\nTask Train Development Test\nNatural Questions 79169 8758 3611\nTriviaQA 78786 8838 11314\nWebQuestions 3418 362 2033\nCuratedTrec 635 134 635\nJeopardy Question Generation 97392 13714 26849\nMS-MARCO 153726 12468 101093*\nFEVER-3-way 145450 10000 10000\nFEVER-2-way 96966 6666 6666 Jeopardy Question Generation 97392 13714 26849\nMS-MARCO 153726 12468 101093*\nFEVER-3-way 145450 10000 10000\nFEVER-2-way 96966 6666 6666\nparameters. The best performing "closed-book" (parametric only) open-domain QA model is T5-11B\nwith 11 Billion traina

### STEP 8 (GENERATION)


In [75]:
def RetrieveContext(question):
    # Self-Query Analyzer

    semanticQuery, filters = selfQueryAnalyzer(question)

    # HyDE (Dense Recall)
    hydeDoc = GenerateHyDE(semanticQuery)

    # Merger Retriever
    denseDocs = denseRetriever.invoke(hydeDoc)
    sparseDocs = sparseRetriever.invoke(semanticQuery)

    if filters:
        metadataDocs = [
            doc for doc in chunks
            if all(doc.metadata.get(k) == v for k, v in filters.items())
        ]
    else:
        metadataDocs = []

    def dedup_docs(docs):
        seen = set()
        unique = []
        for d in docs:
            key = (d.page_content, tuple(sorted(d.metadata.items())))
            if key not in seen:
                seen.add(key)
                unique.append(d)
        return unique

    mergedDocs = dedup_docs(denseDocs + sparseDocs + metadataDocs)

    # Flash Reranking
    passages = [
        {"id": i, "text": doc.page_content}
        for i, doc in enumerate(mergedDocs)
    ]

    rerankingResults = reranking(
        query=semanticQuery,
        passages=passages,
        choice="Medium"
    )

    flashDocs = [
        mergedDocs[item["id"]]
        for item in rerankingResults[:20]
    ]

    # Cross-Encoder
    pairs = [[semanticQuery, doc.page_content] for doc in flashDocs]
    scores = crossEncoder.predict(pairs)

    crossReranked = sorted(zip(scores, flashDocs), reverse=True)
    top_chunks = [doc for _, doc in crossReranked[:5]]


    # Sentence Window
    windowDocs = SentenceWindowRetriever(
        rankedChunks=top_chunks,
        sentenceIndex=sentenceIndex,
        windowSize=2
    )

    # Parent Document
    parentDocs = ParentDocumentRetriever(
        ranked_docs=windowDocs,
        all_chunks=chunks
    )

    # Contextual Compression
    compressedDocs = ContextualCompression(
        query=semanticQuery,
        docs=parentDocs,
        embeddings=embedding,
        similarity_threshold=0.45
    )

    # Long Context Reorder
    finalDocsReordered = reorder.transform_documents(compressedDocs)

    # Augmentation
    context = buildContext(finalDocsReordered)

    return context


In [76]:
def generateResponse(question):
    context = RetrieveContext(question)

    prompt = PromptTemplate(
        template="""You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know.
Use ten sentences maximum and keep the answer concise.

<context>
{context}
</context>

Question: {question}
Answer:
""",
        input_variables=["context", "question"]
    )

    chain = prompt | model | parser
    return chain.invoke({"context": context, "question": question})


In [None]:
while True:
    question = input("Enter your question related to the document: ")

    if not question.strip():
        print("Please enter a valid question.\n")
        continue

    print("\nGenerating answer...\n")
    answer = generateResponse(question)
    print("Answer:", answer)

    continue_chat = input("Do you want to ask another question? (yes/no): ").strip().lower()
    if continue_chat not in ['yes', 'y']:
        print("\nThank you for using the document Q&A system. Goodbye!")
        break

Enter your question related to the document: What is index hot swapping discussed on page 7,8?

Generating answer...

Answer: Index hot-swapping is discussed on page 7 and 8 as an advantage of non-parametric memory models like RAG, where knowledge can be easily updated at test time without requiring any retraining.
Do you want to ask another question? (yes/no): yes 
Enter your question related to the document: Who is virat kohli?

Generating answer...

Answer: I don't know who Siraj Raval is, but I do know who Virat Kohli is. Unfortunately, I don't have any information about Virat Kohli in the provided context.
Do you want to ask another question? (yes/no): yes
Enter your question related to the document: what is effect of Effect of Retrieving more documents?

Generating answer...

Answer: Retrieving more documents at test time can improve Open-domain QA results for RAG-Sequence, but performance peaks for RAG-Token at 10 retrieved documents.
Do you want to ask another question? (yes/no

In [78]:
while True:
    question = input("Enter your question related to the document: ")

    if not question.strip():
        print("Please enter a valid question.\n")
        continue

    print("\nGenerating answer...\n")
    answer = generateResponse(question)
    print("Answer:", answer)

    continue_chat = input("Do you want to ask another question? (yes/no): ").strip().lower()
    if continue_chat not in ['yes', 'y']:
        print("\nThank you for using the document Q&A system. Goodbye!")
        break

Enter your question related to the document: What is index hot swapping discussed on page 7,8?

Generating answer...

Answer: Index hot-swapping is discussed as an advantage of non-parametric memory models like RAG, allowing knowledge to be easily updated at test time.
Do you want to ask another question? (yes/no): yes
Enter your question related to the document:  Who is virat kohli?

Generating answer...

Answer: I don't know who Virat Kohli is.
Do you want to ask another question? (yes/no): yes
Enter your question related to the document: what is effect of Effect of Retrieving more documents?

Generating answer...

Answer: Retrieving more documents at test time can monotonically improve Open-domain QA results for RAG-Sequence, but performance peaks at 10 retrieved documents. For RAG-Token, performance peaks at 10 retrieved documents. Retrieving more documents also leads to higher Rouge-L scores for RAG-Token at the expense of Bleu-1.
Do you want to ask another question? (yes/no): no
