In [None]:
%pip install flashrank

In [None]:
%pip install langchain langchain-community langchain-huggingface

In [None]:
%pip install pypdf tiktoken

In [None]:
%pip install chromadb

In [None]:
%pip install rank_bm25

In [None]:
%pip install sentence-transformers

## STEP 1 (DATA INGESTION)

In [7]:
# Colab-only: open the upload widget so the PDF can be placed in the runtime
# (the recorded output shows it being saved as nlp.pdf).
from google.colab import files
uploaded = files.upload()

Saving nlp.pdf to nlp.pdf


In [8]:
# Location of the uploaded PDF inside the Colab runtime.
filePath = "/content/nlp.pdf"

In [9]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [10]:
def loadDocs(path:str):
  """Read the PDF at ``path`` with ``PyPDFLoader`` and return what it loads.

  Args:
      path: Filesystem path of the PDF.

  Returns:
      The list produced by ``PyPDFLoader(path).load()`` (19 entries for the
      19-page sample PDF used in this notebook).
  """
  return PyPDFLoader(path).load()

In [11]:
# Load the PDF, then check how many documents came back and what the first one looks like.
docs = loadDocs(filePath)
print(len(docs))
print(docs[0])

19
page_content='Retrieval-Augmented Generation for
Knowledge-Intensive NLP Tasks
Patrick Lewis†‡, Ethan Perez⋆,
Aleksandra Piktus†, Fabio Petroni†, Vladimir Karpukhin†, Naman Goyal†, Heinrich Küttler†,
Mike Lewis†, Wen-tau Yih†, Tim Rocktäschel†‡, Sebastian Riedel†‡, Douwe Kiela†
†Facebook AI Research;‡University College London;⋆New York University;
plewis@fb.com
Abstract
Large pre-trained language models have been shown to store factual knowledge
in their parameters, and achieve state-of-the-art results when ﬁne-tuned on down-
stream NLP tasks. However, their ability to access and precisely manipulate knowl-
edge is still limited, and hence on knowledge-intensive tasks, their performance
lags behind task-speciﬁc architectures. Additionally, providing provenance for their
decisions and updating their world knowledge remain open research problems. Pre-
trained models with a differentiable access mechanism to explicit non-parametric
memory have so far been only investigated for extracti

In [12]:
def splitDocs(docs, chunk_size=400, chunk_overlap=150):
  """Split loaded documents into overlapping chunks.

  Args:
      docs: Documents to split (as returned by ``loadDocs``).
      chunk_size: Maximum chunk length handed to the splitter. Defaults to
          the 400 this notebook was written with.
      chunk_overlap: Overlap between consecutive chunks handed to the
          splitter. Defaults to 150.

  Returns:
      The chunk list from ``RecursiveCharacterTextSplitter.split_documents``.
  """
  splitter = RecursiveCharacterTextSplitter(
      chunk_size=chunk_size,
      chunk_overlap=chunk_overlap,
  )
  return splitter.split_documents(docs)

In [13]:
# Chunk the loaded pages; `chunks` is the corpus used by every retriever below.
chunks = splitDocs(docs)
print(len(chunks))
print(chunks[0])

262
page_content='Retrieval-Augmented Generation for
Knowledge-Intensive NLP Tasks
Patrick Lewis†‡, Ethan Perez⋆,
Aleksandra Piktus†, Fabio Petroni†, Vladimir Karpukhin†, Naman Goyal†, Heinrich Küttler†,
Mike Lewis†, Wen-tau Yih†, Tim Rocktäschel†‡, Sebastian Riedel†‡, Douwe Kiela†
†Facebook AI Research;‡University College London;⋆New York University;
plewis@fb.com
Abstract' metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2021-04-13T00:48:38+00:00', 'author': '', 'keywords': '', 'moddate': '2021-04-13T00:48:38+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': '/content/nlp.pdf', 'total_pages': 19, 'page': 0, 'page_label': '1'}


In [14]:
from langchain_huggingface import HuggingFaceEmbeddings, ChatHuggingFace, HuggingFaceEndpoint

In [None]:
# HF API KEY
# Take the token from the environment, or prompt for it, so it is never stored
# in the notebook and a fresh kernel can still authenticate.
import os
from getpass import getpass

if not (os.environ.get("HUGGINGFACEHUB_API_TOKEN") or os.environ.get("HF_TOKEN")):
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face API token: ")

In [None]:
# Sentence-embedding model; used below to build the Chroma vector store and
# again for contextual compression.
embedding = HuggingFaceEmbeddings(model_name = 'sentence-transformers/all-MiniLM-L6-v2')

In [18]:
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace, HuggingFacePipeline
# Llama 3.2 3B Instruct served through a Hugging Face endpoint, wrapped as a
# chat model. Presumably authenticates with the Hugging Face token from the
# "HF API KEY" cell — confirm.
llm = HuggingFaceEndpoint(
    repo_id = "meta-llama/Llama-3.2-3B-Instruct",
    task = "text-generation",
)
model = ChatHuggingFace(llm = llm)

In [19]:
# Turns the chat model's message into a plain string at the end of each chain.
parser = StrOutputParser()

### STEP 2 (SELF QUERYING ANALYZER)

In [20]:
from langchain_core.prompts import PromptTemplate
import json
# Prompt that asks the model to split a question into a semantic query and
# metadata filters, returned as JSON. The doubled braces ({{ }}) are literal
# braces in the rendered prompt; {question} is the only template variable.
# Only a `page` filter is described to the model.
selfQueryPrompt = PromptTemplate(
    template="""
You are a query analyzer.

Your task is to split the question into:
1. semantic_query → what should be searched semantically
2. filters → structured metadata constraints

Allowed filters:
- page (integer)

RULES:
- If the question mentions "page X", extract page = X
- Remove filter-related words from the semantic query
- If no filters apply, return an empty filters object

EXAMPLES:

Question:
"What is discussed on page 6 about Jeopardy Question Generation?"

Output:
{{
  "semantic_query": "Jeopardy Question Generation",
  "filters": {{ "page": 6 }}
}}

Question:
"Explain Jeopardy Question Generation"

Output:
{{
  "semantic_query": "Jeopardy Question Generation",
  "filters": {{}}
}}

NOW ANALYZE THIS QUESTION:

Question:
{question}

Return ONLY valid JSON.
""",
    input_variables=["question"]
)


In [21]:
def selfQueryAnalyzer(question):
    """Split a user question into a semantic search query and metadata filters.

    The question is sent through ``selfQueryPrompt | model | parser`` and the
    reply is expected to contain a JSON object with ``semantic_query`` and
    ``filters`` keys. Chat models frequently wrap the JSON in a Markdown code
    fence or surround it with a sentence, so only the outermost ``{...}``
    span of the reply is parsed.

    Args:
        question: The raw user question.

    Returns:
        A ``(semanticQuery, filters)`` tuple. ``semanticQuery`` falls back to
        ``question`` and ``filters`` to ``{}`` whenever the reply cannot be
        parsed or a field is missing or has the wrong type.

    NOTE(review): the chunk metadata shown in this notebook has ``page: 6``
    next to ``page_label: '7'``, so the ``page`` value looks 0-based while
    users will usually mean the printed page number — confirm which one the
    filter should target.
    """
    chain = selfQueryPrompt | model | parser
    response = chain.invoke({"question": question})

    # Defaults double as the fallback for every failure path below.
    semanticQuery = question
    filters = {}

    try:
        start = response.find("{")
        end = response.rfind("}")
        if start == -1 or end <= start:
            raise ValueError("no JSON object found in model output")

        # Anything sliced from '{' to '}' that parses is necessarily a dict.
        parsed = json.loads(response[start:end + 1])

        candidateQuery = parsed.get("semantic_query")
        if isinstance(candidateQuery, str) and candidateQuery.strip():
            semanticQuery = candidateQuery

        candidateFilters = parsed.get("filters")
        if isinstance(candidateFilters, dict):
            filters = candidateFilters
    except (ValueError, TypeError, AttributeError) as e:
        # json.JSONDecodeError is a ValueError; keep the best-effort fallback.
        print("Parsing failed:", e)

    # Models sometimes quote numbers; chunk metadata stores the page as an int.
    page = filters.get("page")
    if isinstance(page, str) and page.strip().isdigit():
        filters["page"] = int(page.strip())

    return semanticQuery, filters


In [23]:
# Sample questions for the analyzer: with a page reference (q1), without one
# (q2), a table reference (q3) and several pages at once (q4). Only q1 is run.
q1 = "What is discussed on page 6 about Jeopardy Question Generation?"
q2 = "Explain Jeopardy Question Generation"
q3 = "What are the results in Table 2?"
q4 = "What are the models discussed on page no 3,4 of this research paper?"
semantic_query, filters = selfQueryAnalyzer(q1)

print("Semantic Query:", semantic_query)
print("Filters:", filters)


Semantic Query: Jeopardy Question Generation
Filters: {'page': 6}


## STEP 3 (HYBRID SEARCH)

In [24]:
from langchain_community.vectorstores import Chroma
# Dense side of the hybrid search: embed every chunk into Chroma and fetch the
# 20 most similar chunks per query.
# NOTE(review): re-running this cell in the same kernel may add the chunks to
# the store a second time — confirm against the Chroma defaults in use.
vectorStore = Chroma.from_documents(chunks, embedding)
similarityRetriever = vectorStore.as_retriever(search_type="similarity", search_kwargs={"k": 20})

In [25]:
from langchain_community.retrievers import BM25Retriever
# Sparse (keyword) side of the hybrid search over the same chunks; k matches
# the dense retriever so both contribute 20 candidates.
keywordRetriever = BM25Retriever.from_documents(chunks)
keywordRetriever.k = 20

In [26]:
def _metadataMatches(actual, expected):
    """Return True when metadata value ``actual`` satisfies filter value ``expected``.

    ``expected`` may be a single value or a list/tuple/set of accepted values.
    Values that differ only in type (``6`` vs ``"6"``) are treated as equal,
    because filters come from parsed LLM output.
    """
    if isinstance(expected, (list, tuple, set, frozenset)):
        return any(_metadataMatches(actual, option) for option in expected)
    if actual == expected:
        return True
    if actual is None or expected is None:
        return False
    return str(actual).strip() == str(expected).strip()


def hybridRetriever(
    query,
    denseRetriever,
    sparseRetriever,
    filters=None,
    denseWeight=0.5,
    sparseWeight=0.5,
    rrf_k=60
):
    """Fuse dense and sparse retrieval results, then apply metadata filters.

    Each retriever's ranked list contributes ``weight / (rank + 1 + rrf_k)``
    to a document's score (ranks start at 0). Documents are keyed by their
    ``page_content``, so identical text returned by both retrievers is merged
    into one entry.

    Args:
        query: Text passed to both retrievers' ``invoke``.
        denseRetriever: Object whose ``invoke(query)`` returns ranked docs.
        sparseRetriever: Object whose ``invoke(query)`` returns ranked docs.
        filters: Optional mapping of metadata key to the required value, or
            to a list/tuple/set of accepted values.
        denseWeight: Weight of the dense ranking in the fused score.
        sparseWeight: Weight of the sparse ranking in the fused score.
        rrf_k: Constant added to the rank in the score denominator.

    Returns:
        Documents ordered by fused score, highest first. With ``filters``,
        only matching documents are returned; if none match, the unfiltered
        list is returned instead (best-effort fallback kept from the original).
    """
    scores = {}
    doc_map = {}

    # Dense first, then sparse: on identical text the sparse doc object wins,
    # exactly as before.
    for retriever, weight in (
        (denseRetriever, denseWeight),
        (sparseRetriever, sparseWeight),
    ):
        for rank, doc in enumerate(retriever.invoke(query)):
            key = doc.page_content
            doc_map[key] = doc
            scores[key] = scores.get(key, 0) + weight / (rank + 1 + rrf_k)

    # sorted() is stable, so equal scores keep first-seen order.
    ranked_keys = sorted(scores, key=scores.get, reverse=True)
    retrieved_docs = [doc_map[key] for key in ranked_keys]

    # Metadata filtering
    if filters:
        filtered_docs = [
            doc
            for doc in retrieved_docs
            if all(
                _metadataMatches(doc.metadata.get(key), value)
                for key, value in filters.items()
            )
        ]
        if filtered_docs:
            return filtered_docs

    return retrieved_docs


In [27]:
# End-to-end check of steps 2-3: analyze the question, then run the hybrid
# search with the extracted semantic query and filters.
question = "What is mentioned about Jeopardy Question Generation on page 6?"
semantic_query, filters = selfQueryAnalyzer(question)

hybridResults = hybridRetriever(
    query=semantic_query,
    denseRetriever=similarityRetriever,
    sparseRetriever=keywordRetriever,
    filters=filters
)

print(len(hybridResults))
print(hybridResults[0])

4
page_content='in 71% of cases, and a gold article is present in the top 10 retrieved articles in 90% of cases.
4.5 Additional Results
Generation Diversity Section 4.3 shows that RAG models are more factual and speciﬁc than
BART for Jeopardy question generation. Following recent work on diversity-promoting decoding' metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2021-04-13T00:48:38+00:00', 'author': '', 'keywords': '', 'moddate': '2021-04-13T00:48:38+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': '/content/nlp.pdf', 'total_pages': 19, 'page': 6, 'page_label': '7'}


## STEP 4 (FLASH RERANKING)


In [None]:
!pip install flashrank

In [29]:
from flashrank.Ranker import Ranker, RerankRequest

In [30]:
from flashrank.Ranker import Ranker, RerankRequest
# Constructor arguments for each supported FlashRank model size.
_RANKER_OPTIONS = {
    "Nano": {},
    "Small": {"model_name": "ms-marco-MiniLM-L-12-v2", "cache_dir": "/opt"},
    "Medium": {"model_name": "rank-T5-flan", "cache_dir": "/opt"},
    "Large": {"model_name": "ms-marco-MultiBERT-L-12", "cache_dir": "/opt"},
}

# Rankers already built in this kernel, keyed by choice, so repeated calls do
# not load the model again.
_rankerCache = {}


def reranking(query, passages, choice):
    """Rerank ``passages`` against ``query`` with a FlashRank model.

    Args:
        query: The search query.
        passages: List of ``{"id": ..., "text": ...}`` dicts.
        choice: One of ``"Nano"``, ``"Small"``, ``"Medium"`` or ``"Large"``.

    Returns:
        Whatever ``Ranker.rerank`` returns for the request (in this notebook,
        a list of passage dicts with an added ``score``).

    Raises:
        ValueError: If ``choice`` is not a supported model size. Previously
            this surfaced as an UnboundLocalError on ``ranker``.
    """
    if choice not in _RANKER_OPTIONS:
        raise ValueError(
            f"Unknown reranker choice {choice!r}; expected one of {sorted(_RANKER_OPTIONS)}"
        )

    ranker = _rankerCache.get(choice)
    if ranker is None:
        ranker = Ranker(**_RANKER_OPTIONS[choice])
        _rankerCache[choice] = ranker

    rerankRequest = RerankRequest(
        query=query,
        passages=passages
    )

    return ranker.rerank(rerankRequest)


In [31]:
# FlashRank input format. Each id is the passage's position in hybridResults,
# which is how reranked results are mapped back to documents later.
passages = [
    {"id": i, "text": doc.page_content}
    for i, doc in enumerate(hybridResults)
]
passages[0]

{'id': 0,
 'text': 'in 71% of cases, and a gold article is present in the top 10 retrieved articles in 90% of cases.\n4.5 Additional Results\nGeneration Diversity Section 4.3 shows that RAG models are more factual and speciﬁc than\nBART for Jeopardy question generation. Following recent work on diversity-promoting decoding'}

In [32]:
# First-pass rerank of the hybrid results with the "Medium" FlashRank model.
rerankingResults = reranking(
    query=semantic_query,
    passages=passages,
    choice="Medium"
)
print(len(rerankingResults))
rerankingResults[0]

rank-T5-flan.zip: 100%|██████████| 73.7M/73.7M [00:02<00:00, 36.4MiB/s]


4


{'id': 1,
 'text': 'ingway" for Jeopardy generation with 5 retrieved documents. The posterior for document 1 is high\nwhen generating “A Farewell to Arms" and for document 2 when generating “The Sun Also Rises".\nTable 3: Examples from generation tasks. RAG models generate more speciﬁc and factually accurate\nresponses. ‘?’ indicates factually incorrect responses, * indicates partially correct responses.',
 'score': np.float32(0.52198154)}

## STEP 5 (CROSS-ENCODERS) RERANKING AGAIN FOR MORE ACCURATE RESULTS

In [33]:
# Map the reranked passages back to their documents via the positional id.
flashDocs = [hybridResults[item["id"]] for item in rerankingResults]
print(flashDocs[0])

page_content='ingway" for Jeopardy generation with 5 retrieved documents. The posterior for document 1 is high
when generating “A Farewell to Arms" and for document 2 when generating “The Sun Also Rises".
Table 3: Examples from generation tasks. RAG models generate more speciﬁc and factually accurate
responses. ‘?’ indicates factually incorrect responses, * indicates partially correct responses.' metadata={'creator': 'LaTeX with hyperref', 'producer': 'pdfTeX-1.40.21', 'creationdate': '2021-04-13T00:48:38+00:00', 'author': '', 'page_label': '7', 'subject': '', 'source': '/content/nlp.pdf', 'total_pages': 19, 'moddate': '2021-04-13T00:48:38+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'keywords': '', 'trapped': '/False', 'page': 6, 'title': ''}


In [None]:
from sentence_transformers import CrossEncoder

# Second-pass rerank: score each (question, chunk) pair with a cross-encoder.
# `crossEncoder` is reused by RetrieveContext further down.
crossEncoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
pairs = [[question, doc.page_content] for doc in flashDocs]

scores = crossEncoder.predict(pairs)
crossReranked = list(zip(scores, flashDocs))
# Sort on the score only: sorting the bare tuples falls through to comparing
# the documents whenever two scores tie, which raises TypeError.
crossReranked = sorted(crossReranked, key=lambda pair: pair[0], reverse=True)


In [35]:
# Inspect the cross-encoder input: one [question, chunk text] pair per chunk.
print(len(pairs))
print()
pairs[0]

4



['What is mentioned about Jeopardy Question Generation on page 6?',
 'ingway" for Jeopardy generation with 5 retrieved documents. The posterior for document 1 is high\nwhen generating “A Farewell to Arms" and for document 2 when generating “The Sun Also Rises".\nTable 3: Examples from generation tasks. RAG models generate more speciﬁc and factually accurate\nresponses. ‘?’ indicates factually incorrect responses, * indicates partially correct responses.']

In [36]:
# Best-scoring entry after the cross-encoder pass.
print(len(crossReranked))
print(crossReranked[0][0])   # Relevance score
print(crossReranked[0][1].page_content[:200])

4
1.7910727
in 71% of cases, and a gold article is present in the top 10 retrieved articles in 90% of cases.
4.5 Additional Results
Generation Diversity Section 4.3 shows that RAG models are more factual and spec


In [37]:
# Keep at most K chunks. Only 4 are printed below because the reranked list
# holds just 4 entries for this question.
K = 5
finalRerankedDocs = [doc for _, doc in crossReranked[:K]]
print(len(finalRerankedDocs))
print(finalRerankedDocs[0].page_content[:200])

4
in 71% of cases, and a gold article is present in the top 10 retrieved articles in 90% of cases.
4.5 Additional Results
Generation Diversity Section 4.3 shows that RAG models are more factual and spec


### STEP 6 (WINDOW SEARCH RETRIEVER)

In [62]:
import re
from collections import defaultdict

def BuildSentenceIndex(allChunks):
    """Collect the sentences of every chunk under the chunk's ``page`` value.

    Each chunk's text is split after ``.``, ``!`` or ``?`` followed by
    whitespace; pieces are stripped and empty ones dropped. Sentences are
    appended in chunk order, and no de-duplication is done, so text shared by
    overlapping chunks appears once per chunk.

    Args:
        allChunks: Iterable of objects with ``metadata`` (a dict) and
            ``page_content`` (a string).

    Returns:
        A ``defaultdict(list)`` mapping page value (``None`` when the chunk
        has no ``page``) to its list of sentences.
    """
    sentencesByPage = defaultdict(list)
    boundary = re.compile(r'(?<=[.!?])\s+')

    for chunk in allChunks:
        pageKey = chunk.metadata.get("page")
        stripped = (piece.strip() for piece in boundary.split(chunk.page_content))
        kept = [piece for piece in stripped if piece]
        # Only touch the mapping when there is something to add, so pages
        # without sentences never get an empty entry.
        if kept:
            sentencesByPage[pageKey].extend(kept)

    return sentencesByPage


# Built from the overlapping chunks, so a page's list can contain repeated or
# partial sentences (visible in the windowed output further down).
sentenceIndex = BuildSentenceIndex(chunks)

In [90]:
from langchain_core.documents import Document

def SentenceWindowRetriever(rankedChunks,sentenceIndex,windowSize=2):
    """Expand each retrieved chunk with neighbouring sentences from its page.

    For every chunk, the page's sentence list is scanned for sentences that
    contain any (stripped, non-empty) sentence of the chunk as a substring.
    The span from the first to the last such sentence, widened by
    ``windowSize`` sentences on both sides and clipped to the page, becomes
    the new text; repeated sentences inside that span are kept once.

    Args:
        rankedChunks: Retrieved documents, in ranking order.
        sentenceIndex: Mapping of page value to that page's sentence list.
        windowSize: Number of extra sentences to include on each side.

    Returns:
        One document per input, in the same order. A chunk is passed through
        unchanged when its page is missing from ``sentenceIndex`` or none of
        its sentences can be located; otherwise a new ``Document`` carrying
        the chunk's own metadata object is returned.
    """
    expandedDocs = []

    for doc in rankedChunks:
        pageKey = doc.metadata.get("page")
        if pageKey not in sentenceIndex:
            expandedDocs.append(doc)
            continue

        # Sentences of the whole page, not just of this chunk.
        pageSentences = sentenceIndex[pageKey]

        # Sentences of the retrieved chunk, stripped, blanks removed.
        pieces = [piece.strip() for piece in re.split(r'(?<=[.!?])\s+',doc.page_content)]
        fragments = [piece for piece in pieces if piece]

        # Positions on the page where any chunk sentence occurs (ascending).
        hits = [
            position
            for position, sentence in enumerate(pageSentences)
            if any(fragment in sentence for fragment in fragments)
        ]
        if not hits:
            expandedDocs.append(doc)
            continue

        lower = max(0, hits[0] - windowSize)
        upper = min(len(pageSentences), hits[-1] + windowSize + 1)
        window = pageSentences[lower:upper]

        # dict.fromkeys drops repeated sentences while keeping their order.
        expandedDocs.append(
            Document(
                page_content=" ".join(dict.fromkeys(window)),
                metadata=doc.metadata
            )
        )

    return expandedDocs


In [68]:
print("Original length:", len(finalRerankedDocs[0].page_content))

# Keyword names must match SentenceWindowRetriever's signature
# (rankedChunks / sentenceIndex / windowSize); the previous snake_case names
# raise TypeError on a fresh Restart & Run All.
windowDocs = SentenceWindowRetriever(
    rankedChunks=finalRerankedDocs,
    sentenceIndex=sentenceIndex,
    windowSize=2
)

print("Windowed length:", len(windowDocs[0].page_content))

print("\nORIGINAL:\n", finalRerankedDocs[0].page_content)
print()
print("AFTER WINDOWING")
print()
print(windowDocs[0].page_content)


Original length: 301
Windowed length: 1205

ORIGINAL:
 in 71% of cases, and a gold article is present in the top 10 retrieved articles in 90% of cases.
4.5 Additional Results
Generation Diversity Section 4.3 shows that RAG models are more factual and speciﬁc than
BART for Jeopardy question generation. Following recent work on diversity-promoting decoding

AFTER WINDOWING

We also analyze whether documents retrieved by RAG correspond to documents annotated as gold
evidence in FEVER. We calculate the overlap in article titles between the topk documents retrieved
by RAG and gold evidence annotations. We ﬁnd that the top retrieved document is from a gold article
in 71% of cases, and a gold article is present in the top 10 retrieved articles in 90% of cases. in 71% of cases, and a gold article is present in the top 10 retrieved articles in 90% of cases. 4.5 Additional Results
Generation Diversity Section 4.3 shows that RAG models are more factual and speciﬁc than
BART for Jeopardy question 

In [76]:
# One windowed document per reranked chunk.
len(windowDocs)

4

### STEP 7 (PARENT DOCUMENT RETRIEVER)

In [70]:
from collections import defaultdict
from langchain_core.documents import Document

def ParentDocumentRetriever(ranked_docs,all_chunks):  # Topk docs and all chunks
    """Replace retrieved documents with the full text of their pages.

    All chunks are grouped by their ``page`` metadata value. For each distinct
    page among ``ranked_docs`` (first occurrence wins, ranking order kept), the
    texts of that page's chunks are joined with single spaces. Chunk texts are
    joined as-is, so text repeated across chunks is repeated in the result.

    Args:
        ranked_docs: Retrieved documents, in ranking order. Only their
            metadata is used.
        all_chunks: Every chunk of the corpus.

    Returns:
        One ``Document`` per distinct page, carrying the metadata object of
        the first ranked document found on that page.
    """
    chunksByPage = defaultdict(list)
    for piece in all_chunks:
        chunksByPage[piece.metadata.get("page")].append(piece)

    parent_docs = []
    emittedPages = set()

    for hit in ranked_docs:
        pageKey = hit.metadata.get("page")
        if pageKey not in emittedPages:
            emittedPages.add(pageKey)

            # .get() so an unknown page yields empty text without adding a key.
            siblings = chunksByPage.get(pageKey, [])
            pageText = " ".join([sibling.page_content for sibling in siblings])

            parent_docs.append(
                Document(
                    page_content=pageText,
                    metadata=hit.metadata
                )
            )

    return parent_docs


In [77]:
# Expand to page level. All windowed docs sit on the same page for this
# question, so they collapse into a single parent document.
parentDocs = ParentDocumentRetriever(
    ranked_docs=windowDocs,
    all_chunks=chunks
)
print(len(parentDocs))
print("Windowed length:", len(windowDocs[0].page_content))
print("Parent length:", len(parentDocs[0].page_content))
print()
print("PARENT DOC SAMPLE:")
print()
print(parentDocs[0].page_content[:1000])


1
Windowed length: 1205
Parent length: 5803

PARENT DOC SAMPLE:

Document 1: his works are considered classics of American
literature ... His wartime experiences formed the basis for his novel
”A Farewell to Arms” (1929) ...
Document 2: ... artists of the 1920s ”Lost Generation” expatriate
community . His debut novel, ”The Sun Also Rises” , was published
in 1926.
BOS
”
TheSunAlso
R ises
” is a
novel
by this
author
of ” A
Farewellto
Arms
”
Doc 1
Doc 2
Doc 3 in 1926.
BOS
”
TheSunAlso
R ises
” is a
novel
by this
author
of ” A
Farewellto
Arms
”
Doc 1
Doc 2
Doc 3
Doc 4
Doc 5
Figure 2: RAG-Token document posteriorp(zi|x,yi,y−i) for each generated token for input “Hem-
ingway" for Jeopardy generation with 5 retrieved documents. The posterior for document 1 is high ingway" for Jeopardy generation with 5 retrieved documents. The posterior for document 1 is high
when generating “A Farewell to Arms" and for document 2 when generating “The Sun Also Rises".
Table 3: Examples from generation tasks. 

In [79]:
# Confirm the page-level de-duplication: several windowed chunks, one parent page.
print("Windowed chunk pages:", [doc.metadata["page"] for doc in windowDocs])
print("Parent doc pages:", [doc.metadata["page"] for doc in parentDocs])

Windowed chunk pages: [6, 6, 6, 6]
Parent doc pages: [6]


### STEP 7 (CONTEXT COMPRESSION)

In [81]:
import numpy as np
import re
from langchain_core.documents import Document

def ContextualCompression(
    query,
    docs,
    embeddings,
    similarity_threshold=0.45
):
    """Keep only the sentences of each document that are similar to the query.

    Every document is split after ``.``, ``!`` or ``?`` followed by
    whitespace. Each sentence is embedded with ``embeddings.embed_query`` and
    kept when its cosine similarity to the query embedding is at least
    ``similarity_threshold``.

    Args:
        query: Text the sentences are compared against.
        docs: Documents to compress.
        embeddings: Object exposing ``embed_query(text)``.
        similarity_threshold: Minimum cosine similarity for a sentence to be
            kept.

    Returns:
        One entry per input document, in order: a new ``Document`` made of the
        kept sentences joined by spaces (same metadata object), or the
        original document unchanged when no sentence reaches the threshold.
    """
    queryVector = embeddings.embed_query(query)

    def cosineToQuery(vector):
        # Cosine similarity between the query embedding and ``vector``.
        return np.dot(queryVector, vector) / (
            np.linalg.norm(queryVector) * np.linalg.norm(vector)
        )

    compressedDocs = []

    for doc in docs:
        relevant = []
        for sentence in re.split(r'(?<=[.!?])\s+', doc.page_content):
            if cosineToQuery(embeddings.embed_query(sentence)) >= similarity_threshold:
                relevant.append(sentence)

        if not relevant:
            # Nothing passed the threshold: fall back to the full document.
            compressedDocs.append(doc)
            continue

        compressedDocs.append(
            Document(
                page_content=" ".join(relevant),
                metadata=doc.metadata
            )
        )

    return compressedDocs


In [82]:
# Compress the parent page down to the sentences closest to the question.
# Note: this demo compares against the raw `question`; RetrieveContext below
# passes the analyzer's semantic query instead.
compressedDocs = ContextualCompression(
    query=question,
    docs=parentDocs,
    embeddings=embedding
)


print(len(compressedDocs))
print(compressedDocs[0].page_content)

1
The posterior for document 1 is high ingway" for Jeopardy generation with 5 retrieved documents. 4.5 Additional Results
Generation Diversity Section 4.3 shows that RAG models are more factual and speciﬁc than
BART for Jeopardy question generation. Following recent work on diversity-promoting decoding BART for Jeopardy question generation.


### STEP 8 (LongContextReorder) for mitigating the "lost in the middle" phenomenon

In [83]:
# The compressed documents are the input to the reordering step.
finalDocs = compressedDocs

In [84]:
from langchain_community.document_transformers import LongContextReorder
# Reorder the documents with LongContextReorder; `reorder` is reused by
# RetrieveContext below. With a single document, as here, the order cannot change.
reorder = LongContextReorder()
finalDocsReordered = reorder.transform_documents(finalDocs)

for i, doc in enumerate(finalDocsReordered):
    print(f"({i+1}) --> {doc.page_content}")


(1) --> The posterior for document 1 is high ingway" for Jeopardy generation with 5 retrieved documents. 4.5 Additional Results
Generation Diversity Section 4.3 shows that RAG models are more factual and speciﬁc than
BART for Jeopardy question generation. Following recent work on diversity-promoting decoding BART for Jeopardy question generation.


### STEP 9 (AUGMENTATION)

In [85]:
def buildContext(docs):
    """Join the ``page_content`` of ``docs`` into one context string.

    Documents are separated by a ``---`` line with a blank line on either
    side. An empty ``docs`` yields an empty string.
    """
    separator = "\n\n---\n\n"
    texts = [doc.page_content for doc in docs]
    return separator.join(texts)
# Context string that will be placed in the answer prompt.
context = buildContext(finalDocsReordered)
context

'The posterior for document 1 is high ingway" for Jeopardy generation with 5 retrieved documents. 4.5 Additional Results\nGeneration Diversity Section 4.3 shows that RAG models are more factual and speciﬁc than\nBART for Jeopardy question generation. Following recent work on diversity-promoting decoding BART for Jeopardy question generation.'

### STEP 10 (GENERATION)


In [87]:
def RetrieveContext(
    question,
    rerankerChoice="Medium",
    topK=5,
    windowSize=2,
    similarityThreshold=0.45
):
    """Run the whole retrieval pipeline for ``question`` and return the context.

    Stages, in order: self-query analysis, hybrid retrieval, FlashRank
    reranking, cross-encoder reranking, top-K cut, sentence-window expansion,
    page-level parent expansion, contextual compression, long-context
    reordering, and joining into a single string.

    Relies on notebook-level objects defined in earlier cells:
    ``similarityRetriever``, ``keywordRetriever``, ``crossEncoder``,
    ``sentenceIndex``, ``chunks``, ``embedding`` and ``reorder``.

    Args:
        question: The raw user question.
        rerankerChoice: FlashRank model size passed to ``reranking``.
        topK: Number of chunks kept after the cross-encoder pass.
        windowSize: Sentences added on each side by the window retriever.
        similarityThreshold: Minimum cosine similarity used by the
            contextual compression step.

    Returns:
        The context string for the answer prompt; ``""`` when retrieval
        returns no documents.
    """
    # Self-Query Analyzer
    semanticQuery, filters = selfQueryAnalyzer(question)

    # Hybrid Retrieval (small chunks)
    hybridResults = hybridRetriever(
        query=semanticQuery,
        denseRetriever=similarityRetriever,
        sparseRetriever=keywordRetriever,
        filters=filters
    )
    if not hybridResults:
        # Nothing retrieved: skip the rerankers instead of handing them
        # empty batches. Same value buildContext gives for no documents.
        return ""

    # Flash Reranking
    passages = [
        {"id": i, "text": doc.page_content}
        for i, doc in enumerate(hybridResults)
    ]
    rerankingResults = reranking(semanticQuery, passages, rerankerChoice)

    # The passage id is the position in hybridResults.
    flashDocs = [hybridResults[item["id"]] for item in rerankingResults]

    # Cross-Encoder Reranking
    pairs = [[semanticQuery, doc.page_content] for doc in flashDocs]
    scores = crossEncoder.predict(pairs)
    # Sort on the score only: sorting the bare (score, doc) tuples compares
    # the documents whenever two scores tie, which raises TypeError.
    crossReranked = sorted(
        zip(scores, flashDocs),
        key=lambda pair: pair[0],
        reverse=True
    )

    # Top-K most relevant chunks
    top_chunks = [doc for _, doc in crossReranked[:topK]]

    #  Sentence Window Retriever (local context repair)
    windowDocs = SentenceWindowRetriever(
        rankedChunks=top_chunks,
        sentenceIndex=sentenceIndex,
        windowSize=windowSize
    )

    # Parent Document Retriever (small → big)
    # Only the metadata of windowDocs is read here; the parent text is rebuilt
    # from `chunks` for each distinct page.
    parentDocs = ParentDocumentRetriever(
        ranked_docs=windowDocs,
        all_chunks=chunks
    )

    # Contextual Compression (on parent docs)
    compressedDocs = ContextualCompression(
        query=semanticQuery,
        docs=parentDocs,
        embeddings=embedding,
        similarity_threshold=similarityThreshold
    )

    # Long Context Reorder
    finalDocsReordered = reorder.transform_documents(compressedDocs)

    # Augmentation
    return buildContext(finalDocsReordered)


In [88]:
def generateResponse(question):
    """Answer ``question`` from the retrieved document context.

    Retrieves the context with ``RetrieveContext``, fills the answer prompt
    with it and the question, and runs ``prompt | model | parser``.

    Args:
        question: The user's question.

    Returns:
        The parsed output of the chain.
    """
    retrievedContext = RetrieveContext(question)

    answerTemplate = """You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know.
Use ten sentences maximum and keep the answer concise.

<context>
{context}
</context>

Question: {question}
Answer:
"""
    answerPrompt = PromptTemplate(
        template=answerTemplate,
        input_variables=["context", "question"]
    )

    answerChain = answerPrompt | model | parser
    promptInputs = {"context": retrievedContext, "question": question}
    return answerChain.invoke(promptInputs)


In [92]:
# Interactive Q&A loop: blank input is re-prompted; after each answer the
# user chooses whether to continue, and anything but "yes"/"y" ends the loop.
while True:
    question = input("Enter your question related to the document: ")

    if question.strip():
        print("\nGenerating answer...\n")
        answer = generateResponse(question)
        print("Answer:", answer)

        continue_chat = input("Do you want to ask another question? (yes/no): ").strip().lower()
        if continue_chat in ('yes', 'y'):
            continue

        print("\nThank you for using the document Q&A system. Goodbye!")
        break
    else:
        print("Please enter a valid question.\n")

Enter your question related to the document: What is index hot swapping discussed on page 7,8?

Generating answer...

Answer: Index hot-swapping is discussed on page 7 and 8 as an advantage of non-parametric memory models like RAG, where knowledge can be easily updated at test time without requiring any retraining.
Do you want to ask another question? (yes/no): yes 
Enter your question related to the document: Who is virat kohli?

Generating answer...

Answer: I don't know who Siraj Raval is, but I do know who Virat Kohli is. Unfortunately, I don't have any information about Virat Kohli in the provided context.
Do you want to ask another question? (yes/no): yes
Enter your question related to the document: what is effect of Effect of Retrieving more documents?

Generating answer...

Answer: Retrieving more documents at test time can improve Open-domain QA results for RAG-Sequence, but performance peaks for RAG-Token at 10 retrieved documents.
Do you want to ask another question? (yes/no