In [None]:
!pip install flashrank

In [2]:
!pip install langchain langchain-community langchain-huggingface

In [None]:
!pip install pypdf tiktoken

In [None]:
%pip install chromadb

In [6]:
%pip install rank_bm25



In [None]:
%pip install sentence-transformers

## STEP 1 (DATA INGESTION)

In [None]:
from google.colab import files
uploaded = files.upload()

In [8]:
filePath = "/content/nlp.pdf"

In [9]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [10]:
def loadDocs(path:str):
  loader = PyPDFLoader(path)
  docs = loader.load()
  return docs

In [11]:
docs = loadDocs(filePath)
print(len(docs))
print(docs[0])

19
page_content='Retrieval-Augmented Generation for
Knowledge-Intensive NLP Tasks
Patrick Lewis†‡, Ethan Perez⋆,
Aleksandra Piktus†, Fabio Petroni†, Vladimir Karpukhin†, Naman Goyal†, Heinrich Küttler†,
Mike Lewis†, Wen-tau Yih†, Tim Rocktäschel†‡, Sebastian Riedel†‡, Douwe Kiela†
†Facebook AI Research;‡University College London;⋆New York University;
plewis@fb.com
Abstract
Large pre-trained language models have been shown to store factual knowledge
in their parameters, and achieve state-of-the-art results when ﬁne-tuned on down-
stream NLP tasks. However, their ability to access and precisely manipulate knowl-
edge is still limited, and hence on knowledge-intensive tasks, their performance
lags behind task-speciﬁc architectures. Additionally, providing provenance for their
decisions and updating their world knowledge remain open research problems. Pre-
trained models with a differentiable access mechanism to explicit non-parametric
memory have so far been only investigated for extracti

In [12]:
def splitDocs(docs):
  splitter = RecursiveCharacterTextSplitter(
      chunk_size = 400,
      chunk_overlap = 150
  )
  chunks = splitter.split_documents(docs)
  return chunks

In [13]:
chunks = splitDocs(docs)
print(len(chunks))
print(chunks[0])

262
page_content='Retrieval-Augmented Generation for
Knowledge-Intensive NLP Tasks
Patrick Lewis†‡, Ethan Perez⋆,
Aleksandra Piktus†, Fabio Petroni†, Vladimir Karpukhin†, Naman Goyal†, Heinrich Küttler†,
Mike Lewis†, Wen-tau Yih†, Tim Rocktäschel†‡, Sebastian Riedel†‡, Douwe Kiela†
†Facebook AI Research;‡University College London;⋆New York University;
plewis@fb.com
Abstract' metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2021-04-13T00:48:38+00:00', 'author': '', 'keywords': '', 'moddate': '2021-04-13T00:48:38+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': '/content/nlp.pdf', 'total_pages': 19, 'page': 0, 'page_label': '1'}


In [18]:
from langchain_huggingface import HuggingFaceEmbeddings, ChatHuggingFace, HuggingFaceEndpoint

In [None]:
# HF API

In [None]:
embedding = HuggingFaceEmbeddings(model_name = 'sentence-transformers/all-MiniLM-L6-v2')

In [24]:
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace, HuggingFacePipeline
llm = HuggingFaceEndpoint(
    repo_id = "meta-llama/Llama-3.2-3B-Instruct",
    task = "text-generation",
)
model = ChatHuggingFace(llm = llm)

In [25]:
parser = StrOutputParser()

### STEP 2 (SELF QUERYING ANALYZER)

In [26]:
from langchain_core.prompts import PromptTemplate
import json
selfQueryPrompt = PromptTemplate(
    template="""
You are a query analyzer.

Your task is to split the question into:
1. semantic_query → what should be searched semantically
2. filters → structured metadata constraints

Allowed filters:
- page (integer)

RULES:
- If the question mentions "page X", extract page = X
- Remove filter-related words from the semantic query
- If no filters apply, return an empty filters object

EXAMPLES:

Question:
"What is discussed on page 6 about Jeopardy Question Generation?"

Output:
{{
  "semantic_query": "Jeopardy Question Generation",
  "filters": {{ "page": 6 }}
}}

Question:
"Explain Jeopardy Question Generation"

Output:
{{
  "semantic_query": "Jeopardy Question Generation",
  "filters": {{}}
}}

NOW ANALYZE THIS QUESTION:

Question:
{question}

Return ONLY valid JSON.
""",
    input_variables=["question"]
)


In [27]:
def selfQueryAnalyzer(question):
    chain = selfQueryPrompt | model | parser
    response = chain.invoke({"question": question})

    try:
        parsed = json.loads(response)
        semanticQuery = parsed.get("semantic_query", question)
        filters = parsed.get("filters", {})
    except Exception as e:
        print("Parsing failed:", e)
        semanticQuery = question
        filters = {}

    return semanticQuery, filters


In [30]:
q1 = "What is discussed on page 6 about Jeopardy Question Generation?"
q2 = "Explain Jeopardy Question Generation"
q3 = "What are the results in Table 2?"
q4 = "What are the models discussed on page no 3,4 of this research paper?"
semantic_query, filters = selfQueryAnalyzer(q4)

print("Semantic Query:", semantic_query)
print("Filters:", filters)


Semantic Query: models
Filters: {'page': '3,4'}


## STEP 3 (HYBRID SEARCH)

In [31]:
from langchain_community.vectorstores import Chroma
vectorStore = Chroma.from_documents(chunks, embedding)
similarityRetriever = vectorStore.as_retriever(search_type="similarity", search_kwargs={"k": 20})

In [32]:
from langchain_community.retrievers import BM25Retriever
keywordRetriever = BM25Retriever.from_documents(chunks)
keywordRetriever.k = 20

In [33]:
def hybridRetriever(
    query,
    denseRetriever,
    sparseRetriever,
    filters=None,
    denseWeight=0.5,
    sparseWeight=0.5,
    rrf_k=60
):
    scores = {}
    doc_map = {}

    denseDocs = denseRetriever.invoke(query)
    for rank, doc in enumerate(denseDocs):
        key = doc.page_content
        doc_map[key] = doc
        scores[key] = scores.get(key, 0) + denseWeight / (rank + 1 + rrf_k)

    sparseDocs = sparseRetriever.invoke(query)
    for rank, doc in enumerate(sparseDocs):
        key = doc.page_content
        doc_map[key] = doc
        scores[key] = scores.get(key, 0) + sparseWeight / (rank + 1 + rrf_k)

    ranked_docs = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    retrieved_docs = [doc_map[content] for content, _ in ranked_docs]

    # Metadata filtering
    if filters:
        filtered_docs = []
        for doc in retrieved_docs:
            keep = True
            for key, value in filters.items():
                if doc.metadata.get(key) != value:
                    keep = False
                    break
            if keep:
                filtered_docs.append(doc)

        if filtered_docs:
            return filtered_docs

    return retrieved_docs


In [36]:
question = "What is mentioned about Memory-based Architectures on page 9?"
semantic_query, filters = selfQueryAnalyzer(question)

hybridResults = hybridRetriever(
    query=semantic_query,
    denseRetriever=similarityRetriever,
    sparseRetriever=keywordRetriever,
    filters=filters
)

print(len(hybridResults))
print(hybridResults[0])

37
page_content='can be ﬁne-tuned for strong performance on a variety of tasks.
Memory-based Architectures Our document index can be seen as a large external memory for
neural networks to attend to, analogous to memory networks [64, 55]. Concurrent work [14] learns
to retrieve a trained embedding for each entity in the input, rather than to retrieve raw text as in our' metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2021-04-13T00:48:38+00:00', 'author': '', 'keywords': '', 'moddate': '2021-04-13T00:48:38+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': '/content/nlp.pdf', 'total_pages': 19, 'page': 8, 'page_label': '9'}


## STEP 4 (FLASH RERANKING)


In [None]:
!pip install flashrank

In [38]:
from flashrank.Ranker import Ranker, RerankRequest

In [39]:
from flashrank.Ranker import Ranker, RerankRequest
def reranking(query, passages, choice):
    if choice == "Nano":
        ranker = Ranker()
    elif choice == "Small":
        ranker = Ranker(model_name="ms-marco-MiniLM-L-12-v2", cache_dir="/opt")
    elif choice == "Medium":
        ranker = Ranker(model_name="rank-T5-flan", cache_dir="/opt")
    elif choice == "Large":
        ranker = Ranker(model_name="ms-marco-MultiBERT-L-12", cache_dir="/opt")

    rerankRequest = RerankRequest(
        query=query,
        passages=passages
    )

    results = ranker.rerank(rerankRequest)
    return results


In [40]:
passages = [
    {"id": i, "text": doc.page_content}
    for i, doc in enumerate(hybridResults)
]
passages[0]

{'id': 0,
 'text': 'can be ﬁne-tuned for strong performance on a variety of tasks.\nMemory-based Architectures Our document index can be seen as a large external memory for\nneural networks to attend to, analogous to memory networks [64, 55]. Concurrent work [14] learns\nto retrieve a trained embedding for each entity in the input, rather than to retrieve raw text as in our'}

In [41]:
rerankingResults = reranking(
    query=semantic_query,
    passages=passages,
    choice="Medium"
)
print(len(rerankingResults))
rerankingResults[0]

rank-T5-flan.zip: 100%|██████████| 73.7M/73.7M [00:01<00:00, 48.1MiB/s]


37


{'id': 33,
 'text': 'distributed representations, which makes the memory both (i) human-readable, lending a form of\ninterpretability to our model, and (ii) human-writable, enabling us to dynamically update the model’s\nmemory by editing the document index. This approach has also been used in knowledge-intensive\ndialog, where generators have been conditioned on retrieved text directly, albeit obtained via TF-IDF',
 'score': np.float32(0.650604)}

## STEP 5 (CROSS-ENCODERS) RERANKING AGAIN FOR MORE ACCURATE RESULTS

In [42]:
flashDocs = [hybridResults[item["id"]] for item in rerankingResults]
print(flashDocs[0])

page_content='distributed representations, which makes the memory both (i) human-readable, lending a form of
interpretability to our model, and (ii) human-writable, enabling us to dynamically update the model’s
memory by editing the document index. This approach has also been used in knowledge-intensive
dialog, where generators have been conditioned on retrieved text directly, albeit obtained via TF-IDF' metadata={'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'page': 8, 'producer': 'pdfTeX-1.40.21', 'creationdate': '2021-04-13T00:48:38+00:00', 'subject': '', 'title': '', 'keywords': '', 'trapped': '/False', 'source': '/content/nlp.pdf', 'moddate': '2021-04-13T00:48:38+00:00', 'creator': 'LaTeX with hyperref', 'total_pages': 19, 'page_label': '9', 'author': ''}


In [43]:
from sentence_transformers import CrossEncoder

crossEncoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
pairs = [[question, doc.page_content] for doc in flashDocs]

scores = crossEncoder.predict(pairs)
crossReranked = list(zip(scores, flashDocs))
crossReranked = sorted(crossReranked, reverse=True)


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/105 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: cross-encoder/ms-marco-MiniLM-L-6-v2
Key                          | Status     |  | 
-----------------------------+------------+--+-
bert.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

In [44]:
print(len(pairs))
print()
pairs[0]

37



['What is mentioned about Memory-based Architectures on page 9?',
 'distributed representations, which makes the memory both (i) human-readable, lending a form of\ninterpretability to our model, and (ii) human-writable, enabling us to dynamically update the model’s\nmemory by editing the document index. This approach has also been used in knowledge-intensive\ndialog, where generators have been conditioned on retrieved text directly, albeit obtained via TF-IDF']

In [45]:
print(len(crossReranked))
print(crossReranked[0][0])   # Relevence score
print(crossReranked[0][1].page_content[:200])

37
3.2602353
can be ﬁne-tuned for strong performance on a variety of tasks.
Memory-based Architectures Our document index can be seen as a large external memory for
neural networks to attend to, analogous to memor


In [46]:
K = 5
finalRerankedDocs = [doc for _, doc in crossReranked[:K]]
print(len(finalRerankedDocs))
print(finalRerankedDocs[0].page_content[:200])

5
can be ﬁne-tuned for strong performance on a variety of tasks.
Memory-based Architectures Our document index can be seen as a large external memory for
neural networks to attend to, analogous to memor


### STEP 6 (PARENT DOCUMENT RETRIEVER)

In [49]:
from collections import defaultdict
from langchain_core.documents import Document

def ParentDocumentRetriever(ranked_docs,all_chunks):  # Topk docs and all chunks

    page_to_chunks = defaultdict(list)

    for chunk in all_chunks:
        page = chunk.metadata.get("page")
        page_to_chunks[page].append(chunk)

    parent_docs = []
    seen_pages = set()

    for doc in ranked_docs:
        page = doc.metadata.get("page")

        if page in seen_pages:
            continue

        seen_pages.add(page)

        page_chunks = page_to_chunks.get(page, [])

        merged_text = " ".join(
            chunk.page_content for chunk in page_chunks
        )

        parent_docs.append(
            Document(
                page_content=merged_text,
                metadata=doc.metadata
            )
        )

    return parent_docs


In [51]:
print("Top-ranked chunk pages:")
print([doc.metadata["page"] for doc in finalRerankedDocs])

parentDocs = ParentDocumentRetriever(finalRerankedDocs, chunks)

print("\nParent document pages:")
print([doc.metadata["page"] for doc in parentDocs])


Top-ranked chunk pages:
[8, 0, 1, 0, 8]

Parent document pages:
[8, 0, 1]


In [57]:
len(parentDocs)

3

In [52]:
print("Chunk length:", len(finalRerankedDocs[0].page_content))
print("Parent length:", len(parentDocs[0].page_content))

Chunk length: 353
Parent length: 5637


In [56]:
print(finalRerankedDocs[0])
print()
print(parentDocs[0].page_content)

page_content='can be ﬁne-tuned for strong performance on a variety of tasks.
Memory-based Architectures Our document index can be seen as a large external memory for
neural networks to attend to, analogous to memory networks [64, 55]. Concurrent work [14] learns
to retrieve a trained embedding for each entity in the input, rather than to retrieve raw text as in our' metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2021-04-13T00:48:38+00:00', 'author': '', 'keywords': '', 'moddate': '2021-04-13T00:48:38+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': '/content/nlp.pdf', 'total_pages': 19, 'page': 8, 'page_label': '9'}

General-Purpose Architectures for NLP Prior work on general-purpose architectures for NLP
tasks has shown great success without the use of retrieval. A single, pre-trained language model
has been shown to 

### STEP 7 (CONTEXT COMPRESSION)

In [50]:
import numpy as np
import re
from langchain_core.documents import Document

def ContextualCompression(
    query,
    docs,
    embeddings,
    similarity_threshold=0.45
):
    queryEmbedding = embeddings.embed_query(query)
    compressedDocs = []

    for doc in docs:
        sentences = re.split(r'(?<=[.!?])\s+', doc.page_content)
        keptSentences = []

        for sent in sentences:
            sent_embedding = embeddings.embed_query(sent)
            similarity = np.dot(queryEmbedding, sent_embedding) / (
                np.linalg.norm(queryEmbedding) * np.linalg.norm(sent_embedding)
            )

            if similarity >= similarity_threshold:
                keptSentences.append(sent)

        if keptSentences:
            compressedDocs.append(
                Document(
                    page_content=" ".join(keptSentences),
                    metadata=doc.metadata
                )
            )
        else:
            compressedDocs.append(doc)

    return compressedDocs


In [61]:
compressedDocs = ContextualCompression(
    query=question,
    docs=parentDocs,
    embeddings=embedding
)


print(len(compressedDocs))
print(compressedDocs[0].page_content)

3
Memory-based Architectures Our document index can be seen as a large external memory for
neural networks to attend to, analogous to memory networks [64, 55].


### STEP 8 (LongContextReorder) For Solving Lost in Middle Phenomenon

In [60]:
finalDocs = compressedDocs

In [63]:
from langchain_community.document_transformers import LongContextReorder
reorder = LongContextReorder()
finalDocsReordered = reorder.transform_documents(finalDocs)

for i, doc in enumerate(finalDocsReordered):
    print(f"({i+1}) --> {doc.page_content}")


(1) --> Memory-based Architectures Our document index can be seen as a large external memory for
neural networks to attend to, analogous to memory networks [64, 55].
(2) --> There has been extensive previous work proposing architectures to enrich systems with non-parametric
memory which are trained from scratch for speciﬁc tasks, e.g. memory networks [ 64, 55], stack-
augmented networks [25] and memory layers [ 30].
(3) --> Retrieval-Augmented Generation for
Knowledge-Intensive NLP Tasks
Patrick Lewis†‡, Ethan Perez⋆,
Aleksandra Piktus†, Fabio Petroni†, Vladimir Karpukhin†, Naman Goyal†, Heinrich Küttler†,
Mike Lewis†, Wen-tau Yih†, Tim Rocktäschel†‡, Sebastian Riedel†‡, Douwe Kiela†
†Facebook AI Research;‡University College London;⋆New York University;
plewis@fb.com
Abstract †Facebook AI Research;‡University College London;⋆New York University;
plewis@fb.com
Abstract
Large pre-trained language models have been shown to store factual knowledge
in their parameters, and achieve state-of-

### STEP 9 (AUGMENTATION)

In [64]:
def buildContext(docs):
    return "\n\n---\n\n".join(doc.page_content for doc in docs)
context = buildContext(finalDocsReordered)
context

'Memory-based Architectures Our document index can be seen as a large external memory for\nneural networks to attend to, analogous to memory networks [64, 55].\n\n---\n\nThere has been extensive previous work proposing architectures to enrich systems with non-parametric\nmemory which are trained from scratch for speciﬁc tasks, e.g. memory networks [ 64, 55], stack-\naugmented networks [25] and memory layers [ 30].\n\n---\n\nRetrieval-Augmented Generation for\nKnowledge-Intensive NLP Tasks\nPatrick Lewis†‡, Ethan Perez⋆,\nAleksandra Piktus†, Fabio Petroni†, Vladimir Karpukhin†, Naman Goyal†, Heinrich Küttler†,\nMike Lewis†, Wen-tau Yih†, Tim Rocktäschel†‡, Sebastian Riedel†‡, Douwe Kiela†\n†Facebook AI Research;‡University College London;⋆New York University;\nplewis@fb.com\nAbstract †Facebook AI Research;‡University College London;⋆New York University;\nplewis@fb.com\nAbstract\nLarge pre-trained language models have been shown to store factual knowledge\nin their parameters, and achiev

### STEP 8 (GENERATION)


In [65]:
def RetrieveContext(question):
    # Component 1: Self-Query Analyzer
    semanticQuery, filters = selfQueryAnalyzer(question)

    # Component 2: Hybrid Retrieval (small chunks)
    hybridResults = hybridRetriever(
        query=semanticQuery,
        denseRetriever=similarityRetriever,
        sparseRetriever=keywordRetriever,
        filters=filters
    )

    # Component 3: Flash Reranking
    passages = [{"id": i, "text": doc.page_content} for i, doc in enumerate(hybridResults)]
    rerankingResults = reranking(semanticQuery, passages, "Medium")

    flashDocs = [hybridResults[item["id"]] for item in rerankingResults]

    # Component 4: Cross Encoder Reranking
    pairs = [[semanticQuery, doc.page_content] for doc in flashDocs]
    scores = crossEncoder.predict(pairs)
    crossReranked = sorted(zip(scores, flashDocs), reverse=True)

    top_chunks = [doc for _, doc in crossReranked[:5]]

    # Component 5: Parent Document Retriever (small → big)
    parentDocs = ParentDocumentRetriever(
        ranked_docs=top_chunks,
        all_chunks=chunks
    )

    #  Component 6: Contextual Compression (on parents)
    compressedDocs = ContextualCompression(
        query=semanticQuery,
        docs=parentDocs,
        embeddings=embedding
    )

    #  Component 7: Long Context Reorder
    finalDocsReordered = reorder.transform_documents(compressedDocs)

    # Component 8: Augmentation
    context = buildContext(finalDocsReordered)

    return context


In [68]:
def generateResponse(question):
    context = RetrieveContext(question)

    prompt = PromptTemplate(
        template="""You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know.
Use ten sentences maximum and keep the answer concise.

<context>
{context}
</context>

Question: {question}
Answer:
""",
        input_variables=["context", "question"]
    )

    chain = prompt | model | parser
    return chain.invoke({"context": context, "question": question})


In [69]:
while True:
    question = input("Enter your question related to the document: ")

    if not question.strip():
        print("Please enter a valid question.\n")
        continue

    print("\nGenerating answer...\n")
    answer = generateResponse(question)
    print("Answer:", answer)

    continue_chat = input("Do you want to ask another question? (yes/no): ").strip().lower()
    if continue_chat not in ['yes', 'y']:
        print("\nThank you for using the document Q&A system. Goodbye!")
        break

Enter your question related to the document: How does RAG differ from standard sequence-to-sequence models?

Generating answer...

Answer: RAG differs from standard sequence-to-sequence models by using a retrieval-augmented approach, where a dense vector index of Wikipedia is accessed with a pre-trained neural retriever to retrieve text documents and use them as additional context when generating the target sequence.
Do you want to ask another question? (yes/no): yes
Enter your question related to the document: What does the paper say on page 5 about Jeopardy Question Generation?

Generating answer...

Answer: I don't know.
Do you want to ask another question? (yes/no): yes
Enter your question related to the document: What is the role of Wikipedia in RAG experiments?

Generating answer...

Answer: Wikipedia serves as a single, non-parametric knowledge source for RAG experiments. It is used to provide evidence for claims and test the model's ability to reason over this evidence.
Do you 