In [None]:
!pip install flashrank

In [None]:
!pip install langchain langchain-community langchain-huggingface

In [None]:
!pip install pypdf tiktoken

In [None]:
%pip install chromadb

In [None]:
%pip install rank_bm25

## STEP 1 (DATA INGESTION)

In [6]:
# Colab-only step: prompt for a file upload into the runtime.
# The recorded output of this cell shows nlp.pdf being saved.
from google.colab import files
uploaded = files.upload()

Saving nlp.pdf to nlp.pdf


In [7]:
# Path of the PDF that the rest of the notebook ingests.
# NOTE(review): presumably where Colab stores the upload above — confirm if the file name changes.
filePath = "/content/nlp.pdf"

In [8]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [9]:
def loadDocs(path:str):
  """Load the PDF at ``path`` with ``PyPDFLoader`` and return whatever its ``load()`` yields."""
  return PyPDFLoader(path).load()

In [10]:
# Load the PDF and sanity-check the result: number of loaded documents, then the first one.
docs = loadDocs(filePath)
print(len(docs))
print(docs[0])

19
page_content='Retrieval-Augmented Generation for
Knowledge-Intensive NLP Tasks
Patrick Lewis†‡, Ethan Perez⋆,
Aleksandra Piktus†, Fabio Petroni†, Vladimir Karpukhin†, Naman Goyal†, Heinrich Küttler†,
Mike Lewis†, Wen-tau Yih†, Tim Rocktäschel†‡, Sebastian Riedel†‡, Douwe Kiela†
†Facebook AI Research;‡University College London;⋆New York University;
plewis@fb.com
Abstract
Large pre-trained language models have been shown to store factual knowledge
in their parameters, and achieve state-of-the-art results when ﬁne-tuned on down-
stream NLP tasks. However, their ability to access and precisely manipulate knowl-
edge is still limited, and hence on knowledge-intensive tasks, their performance
lags behind task-speciﬁc architectures. Additionally, providing provenance for their
decisions and updating their world knowledge remain open research problems. Pre-
trained models with a differentiable access mechanism to explicit non-parametric
memory have so far been only investigated for extracti

In [11]:
def splitDocs(docs):
  """Split ``docs`` into chunks with ``RecursiveCharacterTextSplitter``.

  The splitter is configured with ``chunk_size=400`` and ``chunk_overlap=150``;
  the list returned by ``split_documents`` is passed back unchanged.
  """
  return RecursiveCharacterTextSplitter(
      chunk_size=400,
      chunk_overlap=150,
  ).split_documents(docs)

In [12]:
# Chunk the loaded documents and inspect the chunk count and the first chunk.
chunks = splitDocs(docs)
print(len(chunks))
print(chunks[0])

262
page_content='Retrieval-Augmented Generation for
Knowledge-Intensive NLP Tasks
Patrick Lewis†‡, Ethan Perez⋆,
Aleksandra Piktus†, Fabio Petroni†, Vladimir Karpukhin†, Naman Goyal†, Heinrich Küttler†,
Mike Lewis†, Wen-tau Yih†, Tim Rocktäschel†‡, Sebastian Riedel†‡, Douwe Kiela†
†Facebook AI Research;‡University College London;⋆New York University;
plewis@fb.com
Abstract' metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2021-04-13T00:48:38+00:00', 'author': '', 'keywords': '', 'moddate': '2021-04-13T00:48:38+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': '/content/nlp.pdf', 'total_pages': 19, 'page': 0, 'page_label': '1'}


In [13]:
from langchain_huggingface import HuggingFaceEmbeddings, ChatHuggingFace, HuggingFaceEndpoint

In [None]:
# HF API 

In [None]:
# Embedding model shared by the Chroma vector store and the contextual-compression step.
embedding = HuggingFaceEmbeddings(model_name = 'sentence-transformers/all-MiniLM-L6-v2')

In [23]:
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace, HuggingFacePipeline
# Text-generation endpoint for the zephyr-7b-beta repo.
# NOTE(review): presumably needs a Hugging Face API token in the environment — confirm before a fresh run.
llm = HuggingFaceEndpoint(
    repo_id = "HuggingFaceH4/zephyr-7b-beta",
    task = "text-generation",
)
# Chat-model wrapper around the endpoint; this is the `model` used in the generation chain.
model = ChatHuggingFace(llm = llm)

In [22]:
# Output parser placed last in the generation chain (prompt | model | parser).
parser = StrOutputParser()

## STEP 2 (HYBRID SEARCH)

In [30]:
from langchain_community.vectorstores import Chroma
# Index every chunk in a Chroma vector store using the embedding model defined earlier.
vectorStore = Chroma.from_documents(chunks, embedding)
# Dense retriever: similarity search configured to return k=20 matches per query.
similarityRetriever = vectorStore.as_retriever(search_type="similarity", search_kwargs={"k": 20})

In [31]:
from langchain_community.retrievers import BM25Retriever
# Sparse (keyword) retriever built over the same chunks as the vector store.
keywordRetriever = BM25Retriever.from_documents(chunks)
# Use the same k as the dense retriever so both contribute 20 candidates.
keywordRetriever.k = 20

In [32]:
def hybridRetriever(
    query,
    denseRetriever,
    sparseRetriever,
    denseWeight=0.5,
    sparseWeight=0.5,
    rrf_k=60
):
    """Fuse dense and sparse retrieval results with weighted reciprocal-rank scoring.

    Both retrievers are invoked with ``query``. A document at 1-based position
    ``p`` in a retriever's result list contributes ``weight / (p + rrf_k)`` to
    its fused score. Documents are identified by their ``page_content``, so the
    same text returned by both retrievers accumulates both contributions.

    Args:
        query: Query passed to each retriever's ``invoke``.
        denseRetriever: Object whose ``invoke(query)`` returns ranked documents.
        sparseRetriever: Object whose ``invoke(query)`` returns ranked documents.
        denseWeight: Weight applied to the dense retriever's contributions.
        sparseWeight: Weight applied to the sparse retriever's contributions.
        rrf_k: Constant added to the rank in the denominator.

    Returns:
        A list of documents ordered by fused score, highest first. When the
        same text is seen more than once, the most recently seen document
        object is the one returned.
    """
    fusedScores = {}   # page_content -> accumulated score
    latestDoc = {}     # page_content -> last document object seen with that text

    # Dense results are processed before sparse ones, so a text returned by
    # both ends up represented by the sparse retriever's document object.
    weightedSources = (
        (denseRetriever, denseWeight),
        (sparseRetriever, sparseWeight),
    )
    for retriever, weight in weightedSources:
        for position, hit in enumerate(retriever.invoke(query), start=1):
            text = hit.page_content
            latestDoc[text] = hit
            fusedScores[text] = fusedScores.get(text, 0) + weight / (position + rrf_k)

    # sorted() is stable, so equal scores keep first-seen order.
    orderedTexts = sorted(fusedScores, key=fusedScores.get, reverse=True)
    return [latestDoc[text] for text in orderedTexts]  # List[Document]

In [37]:
# Example query reused by the demonstration cells that follow.
query = "What is Jeopardy Question Generation?"

# Fuse dense and sparse candidates with equal weights.
results = hybridRetriever(
    query=query,
    denseRetriever= similarityRetriever,
    sparseRetriever=keywordRetriever,
    denseWeight=0.5,
    sparseWeight=0.5
)
# Number of distinct fused candidates, then the top-ranked one.
print(len(results))
print(results[0])



22
page_content='eration. Rather than use questions from standard open-domain QA tasks, which typically consist
of short, simple questions, we propose the more demanding task of generating Jeopardy questions.
Jeopardy is an unusual format that consists of trying to guess an entity from a fact about that entity.
For example, “The World Cup” is the answer to the question “In 1986 Mexico scored as the ﬁrst' metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2021-04-13T00:48:38+00:00', 'author': '', 'keywords': '', 'moddate': '2021-04-13T00:48:38+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': '/content/nlp.pdf', 'total_pages': 19, 'page': 4, 'page_label': '5'}


## STEP 3 (FLASH RERANKING)


In [None]:
!pip install flashrank

In [39]:
from flashrank.Ranker import Ranker, RerankRequest

In [40]:
# Maps each supported size label to the keyword arguments passed to flashrank's Ranker.
# "Nano" uses the Ranker defaults.
_RANKER_CONFIGS = {
  "Nano": {},
  "Small": {"model_name": "ms-marco-MiniLM-L-12-v2", "cache_dir": "/opt"},
  "Medium": {"model_name": "rank-T5-flan", "cache_dir": "/opt"},
  "Large": {"model_name": "ms-marco-MultiBERT-L-12", "cache_dir": "/opt"},
}

# Rankers already constructed in this kernel session, keyed by size label, so
# repeated calls do not rebuild the same Ranker.
_RANKER_CACHE = {}


def reranking(query,passages,choice):
  """Rerank ``passages`` against ``query`` with a FlashRank ranker.

  Args:
    query: Query text passed to ``RerankRequest``.
    passages: Passages passed to ``RerankRequest``; the notebook supplies a
      list of ``{"id": ..., "text": ...}`` dicts.
    choice: Ranker size label: "Nano", "Small", "Medium" or "Large".

  Returns:
    Whatever ``Ranker.rerank`` returns for the request.

  Raises:
    ValueError: If ``choice`` is not one of the supported labels. Previously an
      unknown label fell through every branch and failed later with an
      unbound ``ranker`` variable.
  """
  if choice not in _RANKER_CONFIGS:
    supported = ", ".join(_RANKER_CONFIGS)
    raise ValueError(f"Unknown ranker choice {choice!r}; expected one of: {supported}")

  ranker = _RANKER_CACHE.get(choice)
  if ranker is None:
    ranker = Ranker(**_RANKER_CONFIGS[choice])
    _RANKER_CACHE[choice] = ranker

  return ranker.rerank(RerankRequest(query=query, passages=passages))

In [45]:
# Re-run hybrid retrieval with the default weights to get the candidates to rerank.
hybridResults = hybridRetriever(
    query=query,
    denseRetriever=similarityRetriever,
    sparseRetriever=keywordRetriever
)

# Convert to id/text dicts for the reranker. Each id is the document's position
# in hybridResults, which lets reranked items be mapped back to their Documents.
passages = [
    {"id": i, "text": doc.page_content}
    for i, doc in enumerate(hybridResults)
]
passages[0]

{'id': 0,
 'text': 'eration. Rather than use questions from standard open-domain QA tasks, which typically consist\nof short, simple questions, we propose the more demanding task of generating Jeopardy questions.\nJeopardy is an unusual format that consists of trying to guess an entity from a fact about that entity.\nFor example, “The World Cup” is the answer to the question “In 1986 Mexico scored as the ﬁrst'}

In [46]:
# Rerank the fused candidates with the "Medium" ranker and show the top result.
rerankingResults = reranking(query, passages, "Medium")
rerankingResults[0]

{'id': 16,
 'text': 'T5-11B+SSM[52] 36.6 - /60.5 44.7 -\nOpen\nBook\nREALM [20] 40.4 - / - 40.7 46.8\nDPR [26] 41.5 57.9/ - 41.1 50.6\nRAG-Token 44.1 55.2/66.1 45.5 50.0\nRAG-Seq. 44.5 56.8/68.0 45.2 52.2\nTable 2: Generation and classiﬁcation Test Scores.\nMS-MARCO SotA is [4], FEVER-3 is [68] and\nFEVER-2 is [ 57] *Uses gold context/evidence.\nBest model without gold access underlined.\nModel Jeopardy MSMARCO FVR3 FVR2',
 'score': np.float32(0.56376076)}

## STEP 4 (CROSS-ENCODER RERANKING): A SECOND RERANKING PASS FOR MORE ACCURATE RESULTS

In [47]:
# Map each reranked item back to its original Document (with metadata) via its id,
# which is the document's index in hybridResults.
flash_docs = [hybridResults[item["id"]] for item in rerankingResults]
print(flash_docs[0])

page_content='T5-11B+SSM[52] 36.6 - /60.5 44.7 -
Open
Book
REALM [20] 40.4 - / - 40.7 46.8
DPR [26] 41.5 57.9/ - 41.1 50.6
RAG-Token 44.1 55.2/66.1 45.5 50.0
RAG-Seq. 44.5 56.8/68.0 45.2 52.2
Table 2: Generation and classiﬁcation Test Scores.
MS-MARCO SotA is [4], FEVER-3 is [68] and
FEVER-2 is [ 57] *Uses gold context/evidence.
Best model without gold access underlined.
Model Jeopardy MSMARCO FVR3 FVR2' metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2021-04-13T00:48:38+00:00', 'author': '', 'keywords': '', 'moddate': '2021-04-13T00:48:38+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': '/content/nlp.pdf', 'total_pages': 19, 'page': 5, 'page_label': '6'}


In [52]:
from sentence_transformers import CrossEncoder

crossEncoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

# Score every (query, passage) pair with the cross-encoder.
pairs = [[query, doc.page_content] for doc in flash_docs]
scores = crossEncoder.predict(pairs)

# Sort on the score alone. Sorting the bare (score, Document) tuples would fall
# back to comparing Document objects whenever two scores tie, and Documents do
# not support ordering, so a tie would raise TypeError.
crossReranked = sorted(
    zip(scores, flash_docs),
    key=lambda scoredDoc: scoredDoc[0],
    reverse=True,
)
crossReranked[0]

(np.float32(6.8061676),
 Document(metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2021-04-13T00:48:38+00:00', 'author': '', 'keywords': '', 'moddate': '2021-04-13T00:48:38+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': '/content/nlp.pdf', 'total_pages': 19, 'page': 18, 'page_label': '19'}, page_content='Jeopardy Question Generation 97392 13714 26849\nMS-MARCO 153726 12468 101093*\nFEVER-3-way 145450 10000 10000\nFEVER-2-way 96966 6666 6666\nparameters. The best performing "closed-book" (parametric only) open-domain QA model is T5-11B\nwith 11 Billion trainable parameters. The T5 model with the closest number of parameters to our'))

In [57]:
# Inspect the cross-encoder ranking: candidate count, best score, start of the best passage.
print(len(crossReranked))
print(crossReranked[0][0])   # Relevance score
print(crossReranked[0][1].page_content[:200])

22
6.8061676
Jeopardy Question Generation 97392 13714 26849
MS-MARCO 153726 12468 101093*
FEVER-3-way 145450 10000 10000
FEVER-2-way 96966 6666 6666
parameters. The best performing "closed-book" (parametric only) 


In [63]:
# Number of top cross-encoder results kept for the later stages.
K = 4
# Drop the scores and keep only the Documents of the K best entries.
finalRerankedDocs = [doc for _, doc in crossReranked[:K]]
print(len(finalRerankedDocs))
print(finalRerankedDocs[0].page_content[:200])

4
Jeopardy Question Generation 97392 13714 26849
MS-MARCO 153726 12468 101093*
FEVER-3-way 145450 10000 10000
FEVER-2-way 96966 6666 6666
parameters. The best performing "closed-book" (parametric only) 


### STEP 5 (CONTEXT COMPRESSION)

In [105]:
import numpy as np
import re
from langchain_core.documents import Document

def ContextualCompression(
    query,
    docs,
    embeddings,
    similarity_threshold=0.45
):
    """Keep only the sentences of each document that are similar to the query.

    Each document's text is split at whitespace that follows ``.``, ``!`` or
    ``?``. Every non-blank fragment is embedded with ``embeddings.embed_query``
    and compared to the query embedding by cosine similarity. Fragments scoring
    at least ``similarity_threshold`` are kept and re-joined with single spaces.

    Args:
        query: Query text to compare sentences against.
        docs: Iterable of documents exposing ``page_content`` and ``metadata``.
        embeddings: Object providing ``embed_query(text)`` that returns a vector.
        similarity_threshold: Minimum cosine similarity for a sentence to be kept.

    Returns:
        A list with one entry per input document, in input order. A document
        with at least one kept sentence is replaced by a new ``Document``
        holding only those sentences and a shallow copy of the original
        metadata. A document with no kept sentence is returned unchanged, so
        no retrieved context is lost entirely.
    """
    queryEmbedding = np.asarray(embeddings.embed_query(query), dtype=float)
    # The query norm is identical for every sentence, so compute it once.
    queryNorm = np.linalg.norm(queryEmbedding)
    compressedDocs = []

    for doc in docs:
        keptSentences = []

        for sent in re.split(r'(?<=[.!?])\s+', doc.page_content):
            # Blank fragments carry no content; skip the embedding call.
            if not sent.strip():
                continue

            sentEmbedding = np.asarray(embeddings.embed_query(sent), dtype=float)
            denominator = queryNorm * np.linalg.norm(sentEmbedding)
            # A zero-length vector has no direction. Treat it as "not similar"
            # instead of dividing by zero and comparing against NaN.
            if denominator == 0:
                continue

            similarity = np.dot(queryEmbedding, sentEmbedding) / denominator
            if similarity >= similarity_threshold:
                keptSentences.append(sent)

        if keptSentences:
            compressedDocs.append(
                Document(
                    page_content=" ".join(keptSentences),
                    # Copy so later edits to one document's metadata do not
                    # leak into the other.
                    metadata=dict(doc.metadata)
                )
            )
        else:
            compressedDocs.append(doc)

    return compressedDocs


In [82]:
# Compress the top reranked documents against the example query, using the
# default similarity threshold.
compressedDocs = ContextualCompression(
    query=query,
    docs=finalRerankedDocs,
    embeddings=embedding)
# One entry per input document; print the first two to compare with the originals.
print(len(compressedDocs))
print(compressedDocs[0].page_content)
print()
print(compressedDocs[1].page_content)

4
Jeopardy Question Generation 97392 13714 26849
MS-MARCO 153726 12468 101093*
FEVER-3-way 145450 10000 10000
FEVER-2-way 96966 6666 6666
parameters. The best performing "closed-book" (parametric only) open-domain QA model is T5-11B
with 11 Billion trainable parameters. The T5 model with the closest number of parameters to our

Question Answering:
Answer GenerationRetriever pη
(Non-Parametric)
z4
z3
z2
z1
d(z)
Jeopardy Question
Generation:
Answer Query
Figure 1: Overview of our approach.


### STEP 6 (LongContextReorder): Mitigating the "Lost in the Middle" Phenomenon

In [85]:
# The compressed documents are the input to the reordering step below.
finalDocs = compressedDocs

In [87]:
from langchain_community.document_transformers import LongContextReorder
# Reorder the final documents with LongContextReorder; `reorder` is reused by generateResponse.
reorder = LongContextReorder()
finalDocsReordered = reorder.transform_documents(finalDocs)

# Show the documents in their new order, numbered from 1.
for i, doc in enumerate(finalDocsReordered):
    print(f"({i+1}) --> {doc.page_content}")


(1) --> Question Answering:
Answer GenerationRetriever pη
(Non-Parametric)
z4
z3
z2
z1
d(z)
Jeopardy Question
Generation:
Answer Query
Figure 1: Overview of our approach.
(2) --> 4.3 Jeopardy Question Generation
Table 2 shows that RAG-Token performs better than RAG-Sequence on Jeopardy question generation,
with both models outperforming BART on Q-BLEU-1. 4 shows human evaluation results, over 452
pairs of generations from BART and RAG-Token. Evaluators indicated that BART was more factual
(3) --> For example, “The World Cup” is the answer to the question “In 1986 Mexico scored as the ﬁrst
country to host this international sports competition twice.” As Jeopardy questions are precise,
factual statements, generating Jeopardy questions conditioned on their answer entities constitutes a
challenging knowledge-intensive generation task.
(4) --> Jeopardy Question Generation 97392 13714 26849
MS-MARCO 153726 12468 101093*
FEVER-3-way 145450 10000 10000
FEVER-2-way 96966 6666 6666
parameters. T

### STEP 7 (AUGMENTATION)

In [88]:
def buildContext(docs):
    """Join the ``page_content`` of every document into one context string.

    Consecutive documents are separated by a ``---`` line with a blank line
    on each side.
    """
    separator = "\n\n---\n\n"
    texts = [doc.page_content for doc in docs]
    return separator.join(texts)
# Build the prompt context from the reordered documents and display it.
context = buildContext(finalDocsReordered)
context

'Question Answering:\nAnswer GenerationRetriever pη\n(Non-Parametric)\nz4\nz3\nz2\nz1\nd(z)\nJeopardy Question\nGeneration:\nAnswer Query\nFigure 1: Overview of our approach.\n\n---\n\n4.3 Jeopardy Question Generation\nTable 2 shows that RAG-Token performs better than RAG-Sequence on Jeopardy question generation,\nwith both models outperforming BART on Q-BLEU-1. 4 shows human evaluation results, over 452\npairs of generations from BART and RAG-Token. Evaluators indicated that BART was more factual\n\n---\n\nFor example, “The World Cup” is the answer to the question “In 1986 Mexico scored as the ﬁrst\ncountry to host this international sports competition twice.” As Jeopardy questions are precise,\nfactual statements, generating Jeopardy questions conditioned on their answer entities constitutes a\nchallenging knowledge-intensive generation task.\n\n---\n\nJeopardy Question Generation 97392 13714 26849\nMS-MARCO 153726 12468 101093*\nFEVER-3-way 145450 10000 10000\nFEVER-2-way 96966 6666

### STEP 8 (GENERATION)


In [96]:
def generateResponse(question, rerankerChoice="Medium", flashTopK=8, finalK=4):
    """
    Complete RAG pipeline with hybrid retrieval, flash reranking,
    cross-encoder reranking, and long context reordering.

    The contextual-compression step demonstrated earlier in the notebook is
    not part of this pipeline.

    Relies on notebook-level objects defined in earlier cells:
    ``similarityRetriever``, ``keywordRetriever``, ``crossEncoder``,
    ``reorder``, ``model`` and ``parser``.

    Args:
        question: The user's question.
        rerankerChoice: Size label passed to ``reranking``.
        flashTopK: Number of flash-reranked candidates passed to the
            cross-encoder.
        finalK: Number of cross-encoder results used to build the context.

    Returns:
        The output of the ``prompt | model | parser`` chain for the question.
    """
    # Step 1: Hybrid Retrieval
    hybridResults = hybridRetriever(
        query=question,
        denseRetriever=similarityRetriever,
        sparseRetriever=keywordRetriever
    )

    # Step 2: Flash Reranking
    # Each id is the document's index in hybridResults, so reranked items can
    # be mapped back to the retrieved Document objects.
    passages = [
        {"id": i, "text": doc.page_content}
        for i, doc in enumerate(hybridResults)
    ]
    rerankingResults = reranking(question, passages, rerankerChoice)
    flashDocs = [hybridResults[item["id"]] for item in rerankingResults[:flashTopK]]

    # Step 3: Cross-Encoder Reranking
    pairs = [[question, doc.page_content] for doc in flashDocs]
    crossEncoderScores = crossEncoder.predict(pairs)
    # Sort on the score only; Document objects cannot be ordered, so a tie
    # must not fall through to comparing the second tuple element.
    crossReranked = sorted(
        zip(crossEncoderScores, flashDocs),
        key=lambda scoredDoc: scoredDoc[0],
        reverse=True,
    )
    finalDocs = [doc for _, doc in crossReranked[:finalK]]

    # Step 4: Long Context Reorder
    finalDocsReordered = reorder.transform_documents(finalDocs)

    # Step 5: Build Context
    context = buildContext(finalDocsReordered)

    # Step 6: Generation
    prompt = PromptTemplate(
        template="""You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know.
Use ten sentences maximum and keep the answer concise.

<context>
{context}
</context>

Question: {question}
Answer:
""",
        input_variables=["context", "question"]
    )

    chain = prompt | model | parser
    return chain.invoke({"context": context, "question": question})

In [101]:
# Interactive Q&A loop: keep answering questions until the user declines to continue.
while True:
    question = input("Enter your question related to the document: ")

    if question.strip():
        print("\nGenerating answer...\n")
        answer = generateResponse(question)
        print("Answer:", answer)

        continue_chat = input("Do you want to ask another question? (yes/no): ").strip().lower()
        if continue_chat == 'yes' or continue_chat == 'y':
            continue

        # Any other reply ends the session.
        print("\nThank you for using the document Q&A system. Goodbye!")
        break
    else:
        # Blank input: ask again without calling the pipeline.
        print("Please enter a valid question.\n")

Enter your question related to the document: What is NLP and RAG ?

Generating answer...

Answer: NLP, or Natural Language Processing, is a field in computer science that deals with the interaction between computers and human language. RAG, short for Research on Automatous Generation of Responses to Questions, is a language model that uses machine learning to generate human-like responses to questions based on given texts. It has been shown to be more factual and effective in generating correct answers than a state-of-the-art generation model called BART (Bidirectional Encoder Representations from Transformers) in 42.7% of cases, and in 17.1% of cases, both models outperformed BART in a factuality evaluation by human evaluators. RAG also has the ability to generate correct answers in cases where an extractive model would not, achieving 11.8% accuracy in such scenarios. It has potential applications in various scenarios such as aiding with factual information retrieval and improving the