<a href="https://colab.research.google.com/github/apoorvapu/data_science/blob/main/RAG_Question_Answer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Generative AI in Research: using RAG+LLM to understand a journal paper well

leverage RAG for answering questions based on a journal paper.

Download a paper and convert to .txt file in a directory named "data". Use this .txt file and evaluate if the RAG technique is giving good results.

In [16]:
!pip install pymupdf requests



In [18]:
import os
import requests
import fitz  # PyMuPDF

# Sample arXiv paper IDs related to diffusion models
arxiv_ids = [
    "2210.05274",  # Equivariant 3D-Conditional Diffusion Models for Molecular Linker Design
]

def download_pdf(arxiv_id, output_folder):
    url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    pdf_path = os.path.join(output_folder, f"{arxiv_id}.pdf")
    response = requests.get(url)
    with open(pdf_path, "wb") as f:
        f.write(response.content)
    print(f"Downloaded {arxiv_id}")
    return pdf_path

def pdf_to_text(pdf_path, txt_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(text)
    print(f"Converted to text: {txt_path}")

def main():
    data_dir = "data"
    os.makedirs(data_dir, exist_ok=True)

    for arxiv_id in arxiv_ids:
        pdf_path = download_pdf(arxiv_id, data_dir)
        txt_path = os.path.join(data_dir, f"{arxiv_id}.txt")
        pdf_to_text(pdf_path, txt_path)

if __name__ == "__main__":
    main()


Downloaded 2210.05274
Converted to text: data/2210.05274.txt


In [19]:
# Install dependencies
!pip install langchain langchain_community faiss-cpu sentence-transformers transformers networkx matplotlib spacy
#!python -m spacy download en_core_web_sm



In [45]:
import os, glob
import gc
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings, HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import networkx as nx
import matplotlib.pyplot as plt
import spacy
import re
gc.collect()
import torch
torch.cuda.empty_cache()

In [117]:
def clean_text(text):
    import re
    # Remove inline citations like [14], [14, 27], and (author year)
    text = re.sub(r"\[[0-9,\s]+\]", "", text)
    text = re.sub(r"\(.*?\d{4}.*?\)", "", text)  # Remove citations like (Author, 2020)

    # Remove URLs
    text = re.sub(r"http\S+|www\.\S+", "", text)

    # Remove LaTeX math expressions
    text = re.sub(r"\$.*?\$", "", text)

    # Remove repeated words
    text = re.sub(r"\b(\w+)( \1\b)+", r"\1", text)

    # Remove special characters
    text = re.sub(r"[^a-zA-Z0-9.,;:?!\s]", "", text)

    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Pattern for common author name formats
    pattern = r"\b([A-Z][a-z]+(?: [A-Z][a-z]+)*,? (?:[A-Z]\.)?\b(?: and [A-Z][a-z]+(?: [A-Z][a-z]+)*,? (?:[A-Z]\.)?)+|\b(?:et al\.)\b)"
    # Replace any matched author names with an empty string
    cleaned_text = re.sub(pattern, '', text)
    # Remove extra spaces after author removal
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return text

def clean_text_remove_references(text):
    # Remove extra newlines and empty lines
    text = "\n".join([line.strip() for line in text.splitlines() if line.strip() != ""])

    # Cut off everything after 'References' or 'Bibliography'
    refs_keywords = ["references", "bibliography"]
    for keyword in refs_keywords:
        idx = text.lower().find(keyword)
        if idx != -1:
            text = text[:idx]  # Keep text before the references section
            break

    # Remove sections related to funding or acknowledgments
    text = re.sub(r".*grant agreement.*|.*funding.*|.*acknowledg(e)?ment.*", "", text, flags=re.IGNORECASE)

    return text


In [94]:
# 2. Load the long journal paper
file_path = "data/2210.05274.txt"  # Path to your journal paper

loader = TextLoader(file_path)
documents = loader.load()

for doc in documents:
    doc.page_content = clean_text(doc.page_content)

raw_text = "\n".join([doc.page_content for doc in documents])
clean_text = clean_text_remove_references(raw_text)

# 3. Split into manageable chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=250,  # approx 1000 characters per chunk
    chunk_overlap=20
)

In [95]:
# 7. Ask Questions
def ask_question(question):
    result = qa_chain.run(question)
    return result

In [96]:
from transformers import AutoTokenizer, pipeline
docs = text_splitter.create_documents([raw_text])

# 4. Build vector database (FAISS)
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(docs, embedding_model)


In [105]:
from langchain.prompts import PromptTemplate
# Set up Language Model (use a smaller one to reduce memory consumption)
model_name = 'google/flan-t5-large'  # Switching to a smaller version of FLAN-T5
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

text2text_gen = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=300,  # <-- important! specifies the size of outputs (answer) that LLM should give you
    repetition_penalty=1.2,   # Important to discourage endless repeats
    temperature=0.7,           # A bit more randomness
    top_p=0.9,
    do_sample=True
    )

llm = HuggingFacePipeline(pipeline=text2text_gen)

# Custom prompt template
custom_prompt_template = """
You are a helpful academic assistant. Read the following article content carefully. Then, answer the given question completely and concisely.

=== Article Content ===
{context}

=== Question ===
{question}

=== Answer ===
"""

custom_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=custom_prompt_template,
)

# 6. Build RetrievalQA Chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5}), # Retrieve top 10 chunks for question answer
    chain_type="stuff",
    chain_type_kwargs={"prompt": custom_prompt}  # Use the custom prompt
)



Device set to use cpu


In [106]:
print("Q: What is the paper about?\nA:", ask_question("What is the paper about?"))


Q: What is the paper about?
A: Diffusion model for generative models


In [107]:
print("Q: What are the main contributions?\nA:", ask_question("What are the main contributions?"))


Q: What are the main contributions?
A: a new method for the design of candidate structures from pharmacophoric hypotheses. Journal of medicinal chemistry, 3624: 38633870, 1993.


In [108]:
print("Q: What are the key themes?\nA:", ask_question("What are the key themes discussed?"))


Q: What are the key themes?
A: Existing approaches are either based on syntactic pattern recognition or on autoregressive models


In [109]:
print("Q: What are the key results?\nA:", ask_question("What are the key results?"))

Q: What are the key results?
A: The following metrics assess the chemical relevance of the generated molecules. The last three metrics evaluate the standard generative properties of the methods. Method QED SA Rings Valid, Unique, Novel, GEOM and combine results removing duplicates. Overall, we obtain 41,907 molecules and 285,142 fragmentations that are randomly split in train 282,602 examples, valida tion 1,250 examples and test 1,290 examples sets. Pockets Dataset In order to assess


In [110]:
print(ask_question("What is the name of the chaperone used in the case study? Don't make up the result if you don't know - just say NO."))


RA


In [111]:
# 2. Create a strong prompt template
qa_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a helpful academic assistant.

Read the following article carefully.

=== Article Content ===
{context}

=== Question ===
{question}

=== Answer ===
"""
)

# 3. Build the RetrievalQA Chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3}),
    chain_type="stuff",
    chain_type_kwargs={"prompt": qa_prompt},
)

In [112]:
qa_chain.run("What are the main contributions?")


'E n equivariant normalizing ows. arXiv preprint arXiv:2105.09016, 2021. Chunquan Sheng and Wannian Zhang. New lead structures in for helpful feedback and insightful dis cussions. Ilia Igashov has received funding from the European Unions Horizon 2020 research and innovation programme under the Marie Skodowska-Curie grant agreement No 945363. Clement Vignac would like to thank and Renxiao Wang. Comparative assessment of scoring functions: The CASF2016 update. Journal of Chemical Information and Modeling, 592:895913, November 2018. David C Thompson, R Aldrin Denny, Ramaswamy Nilakantan, Christine Humblet, Diane Joseph'

# NOT doing a good job understanding the paper.

# Trying to see if it can understand a novel story written by ChatGPT.

the story does not exist anywhere on internet.


In [113]:
# Define the story
story = """
The Echoes of Arlandria

In the distant kingdom of Arlandria, nestled between jagged mountains and dark, mist-filled forests, there was a legend—a legend of an ancient city that had vanished without a trace. The city, once the pride of the kingdom, was said to have been built by the first rulers, whose bloodline was rumored to possess the ability to manipulate the very fabric of time itself. The city was called Rivakhar.

But Rivakhar had disappeared overnight, leaving behind only a haunting silence. No one knew where it had gone, nor why it vanished. Over time, the tale faded into myth, and the people of Arlandria moved on with their lives, leaving the mystery to the realms of imagination.

That is, until a young woman named Liora stumbled upon an old, tattered map in the attic of her late grandmother’s cottage. The map depicted the mountains surrounding Arlandria, with a strange, glowing mark near the heart of the forest, where no human had ventured in centuries. Liora, a curious and determined soul, had always felt a deep connection to the unknown, as if something—or someone—was calling her.

Driven by an unexplainable urge, Liora set out on a journey to find the lost city. She was joined by Caden, a quiet scholar from the kingdom's library who had long been fascinated by the legend of Rivakhar, and Verrin, a skilled tracker from the village who had lost his brother to the forests long ago.

The journey took them through treacherous terrain, where the air grew thick with magic, and the forest seemed to whisper secrets long forgotten. They encountered strange creatures—twisted shadows that flickered in and out of existence, glowing insects that left trails of light, and an ancient wolf who seemed to know their every step.

As they ventured deeper into the forest, they uncovered forgotten ruins and ancient symbols etched into the earth. Liora began to dream of Rivakhar—vivid, surreal dreams of a city that floated above the ground, surrounded by shimmering rivers of light. In these dreams, she could hear the voices of the city's inhabitants, calling for help.

One night, while camping under the stars, Liora awoke to a soft voice calling her name. It was the voice of her grandmother, long passed. The voice told her of a great choice: She was the heir of the Rivakhari bloodline, and the key to unlocking the city's return.

The next day, they reached the heart of the forest, where the map’s glowing mark had led them. But as they arrived, they discovered an ancient temple, buried beneath the roots of a massive tree. Inside the temple, they found a pedestal with a crystal that pulsed with an ethereal light. The crystal, however, was cracked—its power waning.

It was here that they learned the truth: Rivakhar had not disappeared; it had simply folded into another dimension, trapped between time and space. The bloodline of the rulers had the power to bring it back, but only if the heir could restore the crystal and use their blood to fuel the magic that bound the city.

But there was a catch. The ruler’s bloodline had been cursed long ago. Anyone who tried to restore Rivakhar would be forced to choose: save the city or save themselves. If the city returned, its magic would drain the life of the one who called it back, trapping them in its eternity.

Liora stood before the crystal, her heart heavy with the weight of the decision. She could feel the pulse of the city’s magic, calling her, but she also felt the warmth of her companions behind her—their hope, their belief in her.

In a moment of clarity, Liora made her choice.

She pressed her hand to the crystal, her blood mingling with its magic. A burst of light engulfed her, and the world seemed to warp around her. For a moment, everything stood still, and then…

Rivakhar reappeared, rising from the earth, glowing brighter than the sun. The ancient city, with its grand towers and shining rivers, was restored.

But Liora did not return with it. Her body, now made of stardust and light, stood as a sentinel at the heart of the city, her soul bound to the city forever.

The people of Arlandria, hearing the legends once again, would tell stories of the brave young woman who gave everything to restore Rivakhar—the city that never truly disappeared, but had waited for the right soul to bring it back.

The End.
"""

# Save the story to a text file
with open("TheEchoesOfArlandria.txt", "w") as file:
    file.write(story)

print("Story saved successfully as 'TheEchoesOfArlandria.txt'")


Story saved successfully as 'TheEchoesOfArlandria.txt'


In [140]:
import openai
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA

# 1. Load the long journal paper
file_path = "TheEchoesOfArlandria.txt"  # Path to your journal paper
loader = TextLoader(file_path)
documents = loader.load()

raw_text = "\n".join([doc.page_content for doc in documents])

# 3. Split into manageable chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=250,  # approx 1000 characters per chunk
    chunk_overlap=20
)

docs = text_splitter.create_documents([raw_text])

# 4. Build vector database (FAISS)
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(docs, embedding_model)

model_name = 'google/flan-t5-large'  # Switching to a smaller version of FLAN-T5
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

text2text_gen = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=300,  # <-- important! specifies the size of outputs (answer) that LLM should give you
    )

llm = HuggingFacePipeline(pipeline=text2text_gen)

# Custom prompt template
custom_prompt_template = """
You are a helpful assistant. Read the following content carefully. Then, answer the given question completely and concisely.

=== Content ===
{context}

=== Question ===
{question}

=== Answer ===
"""

custom_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=custom_prompt_template,
)

# 6. Build RetrievalQA Chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5}), # Retrieve top 10 chunks for question answer
    chain_type="stuff",
    chain_type_kwargs={"prompt": custom_prompt}  # Use the custom prompt
)


Device set to use cpu


In [141]:
print(ask_question("Who is the main character in the story?"))


Liora


In [142]:
print(ask_question("What was the city called in the legend?"))


Rivakhar


In [143]:
question = "What happened to the city Rivakhar?"
ask_question(question)

'It had folded into another dimension.'

In [144]:
print(ask_question("Write a summary of this story."))


Liora dreamed of the city


# It understands this story and answers properly.