#RAG System for Lung Cancer Information Retrieval
RAG-based chatbot tailored to lung cancer, offering dynamic responses by integrating retrieved information with LLM and prompt to generate the desired output. This enhances user interaction and ensures access to the latest advancements and research findings in real-time.

In [None]:
#installing packages
!pip install langchain
!pip install langchain-community
!pip install PyPDF2 chromadb text-generation langchain sentence-transformers
!pip install pypdf

In [3]:
#imports
from langchain_community.document_loaders import PyPDFLoader
import langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from chromadb.utils import embedding_functions
from langchain_community.embeddings import HuggingFaceEmbeddings
from text_generation import InferenceAPIClient,Client
from langchain import hub
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.llms.huggingface_endpoint import HuggingFaceEndpoint
import os

from langchain.text_splitter import CharacterTextSplitter # For chunks creation - slpits the text into chunks
from PyPDF2 import PdfReader # For reading the content in pdf
import chromadb # Vector data base for creating embeddings and storing in a collection
from chromadb.utils import embedding_functions # Provides different embedding functions
from langchain.embeddings import HuggingFaceEmbeddings # For creating hugging face embeddings
from langchain.vectorstores import Chroma

Connecting Colab with Google Drive

In [4]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [5]:
#Ask the user to paste the path to the uploaded folder
folder_path = input("Please paste the path to the uploaded folder: ")

#Validate the path
if not os.path.exists(folder_path):
  raise ValueError("Invalid folder path. Please check and try again.")

#List the contents of the folder using the provided path
folder_contents = os.listdir(folder_path)
print(folder_contents)

#Proceed with further processing using the files in the folder_path


Please paste the path to the uploaded folder: /content/drive/MyDrive/healthcare_RAG_data
['What Is Lung Cancer_ _ Types of Lung Cancer _ American Cancer Society.pdf', 'The pathogenesis of mesothelioma - ScienceDirect (1).pdf', 'lung-cancer-where-are-we-today.pdf', 'Lung Cancer_ Types, Stages, Symptoms, Diagnosis & Treatment.pdf', 'lung cancer research paper (1).pdf', 'Lung cancer - Symptoms and causes - Mayo Clinic.pdf', 'Cancer - 2014 - Byers - Small cell lung cancer  Where do we go from here (1).pdf', 'CA A Cancer J Clinicians - 2019 - Carbone - Mesothelioma  Scientific clues for prevention  diagnosis  and therapy.pdf', 'Basic Information About Lung Cancer _ CDC.pdf', 'Worldwide Overview of the Current Status.pdf', '.ipynb_checkpoints']


The code below combines several PDF files from a folder into one. It looks for PDFs in a folder, reads each one, grabs their pages, and puts them all together into a new PDF file

In [7]:
import os
import PyPDF2

# User-defined function to merge PDFs in a folder
def merge_pdfs(folder_path, output_file_name="MergedFiles1.pdf"):

  # Get a list of all PDF files in the folder
  pdf_files = [os.path.join(folder_path, filename) for filename in os.listdir(folder_path) if filename.endswith(".pdf")]

  if not pdf_files:
    print("No PDF files found in the specified folder.")
    return

  # Create a new PdfFileWriter object to store merged PDF
  pdf_writer = PyPDF2.PdfWriter()

  # Loop through each PDF file and append its pages to the writer
  for pdf_file in pdf_files:
    with open(pdf_file, 'rb') as file:
      pdf_reader = PyPDF2.PdfReader(file)
      for page_num in range(len(pdf_reader.pages)):
        page_obj = pdf_reader.pages[page_num]
        pdf_writer.add_page(page_obj)

  # Write the merged PDF to a new file
  with open(os.path.join(folder_path, output_file_name), 'wb') as output_file:
    pdf_writer.write(output_file)

  print(f"Merged PDF files successfully and saved as '{output_file_name}' in the folder.")

# Replace this with the actual path to your uploaded folder
folder_path = "/content/drive/MyDrive/healthcare_RAG_data"

# Call the merge_pdfs function
merge_pdfs(folder_path)


Merged PDF files successfully and saved as 'MergedFiles1.pdf' in the folder.


In [None]:
from google.colab import files
files.download('/content/drive/MyDrive/RAG BASED/MergedFiles1.pdf')

In [9]:
loader = PyPDFLoader("/content/drive/MyDrive/healthcare_RAG_data/MergedFiles1.pdf")
text = loader.load_and_split()

Chunking the Data

In [10]:
def get_chunk(text):

    text_splitter = RecursiveCharacterTextSplitter(
        # separator="\n",  # Adjust separator if needed
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )
    chunks = text_splitter.split_documents(text)
    return chunks


In [11]:
text_chunks=get_chunk(text)

#Creating Embeddings and Storing in VectorDB
1.Using all-MiniLM-L6-v2 embeddings from HuggingFace

2.Storing Embeddings in Chroma DB

In [12]:
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
db = Chroma.from_documents(text_chunks,embeddings,persist_directory="chroma_persist")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Using different Embedding techniques

In [None]:
# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/msmarco-distilbert-base-v2")
# db = Chroma.from_documents(text_chunks,embeddings,persist_directory="chroma_persist")

Using different Vector DBs for example Faiss

In [None]:
# from langchain_community.vectorstores import FAISS
# from langchain_text_splitters import RecursiveCharacterTextSplitter


# text_splitter = RecursiveCharacterTextSplitter()
# vector = FAISS.from_documents(text_chunks, embeddings)

In [13]:
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})

Example of getting related Documents using Retriever as per user query

In [14]:
retrieved_docs1 = retriever.invoke("What are the considerations for treating elderly patients with SCLC?")

In [15]:
retrieved_docs1

[Document(page_content='viewed in this light, that is, there must be a toxicity-to-beneﬁting the studies to be stopped early (250, 251).ratio.The optimal management of the elderly with SCLC is anA Phase II study of irinotecan plus cisplatin yielded a CR ofimportant issue as 40% of those who present with the disease29% and an overall response rate of 86% with a median survivalare over 70 years old. Studies that investigated this area suggestof 13.2 months in patients with extensive disease SCLC (269).that a reasonably high initial dose of chemotherapy is importantThis has led to an RCT of irinotecan and cisplatin versus etopo-and that the elderly tolerate radiotherapy well (274–276). In-side and cisplatin in patients with extensive disease SCLC. Thedeed, elderly patients with good performance status and normalstudy was halted early because of a signiﬁcant survival advantageorgan function do as well with optimal chemotherapy dosesfor the patients randomized to irinotecan plus cisplatin (

Paste your HuggingFace API key here

In [16]:
HUGGINGFACEHUB_API_TOKEN = ""

using Mistral-7B-Instruct (llm) as generator

In [17]:
repo_id = "mistralai/Mistral-7B-Instruct-v0.1"

llm = HuggingFaceEndpoint(repo_id=repo_id,huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,temperature=0.05)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Other LLM models can be explored like Falcon,Mistral-7B etc

In [None]:
# repo_id = "tiiuae/falcon-7b"

# llm = HuggingFaceEndpoint(repo_id=repo_id,huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,temperature=0.05)

In [18]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

This code builds a pipeline that uses a factual passage and large language model to answer medical questions in a user-friendly way.

It retrieves relevant info, formats it, generates an answer, and outputs it as a string.

In [19]:
from langchain_core.prompts import PromptTemplate

template = """You are a helpful and informative lung cancer expert bot that answers questions using text from the reference passage included below. \
  Be sure to respond in a complete sentence, being comprehensive, including all relevant background information.
  However, you are talking to a non-technical audience, so be sure to break down complicated concepts and
  strike a friendly and converstional tone.
  If the passage is irrelevant to the answer, you may ignore it.

{context}

Question: {question}

Helpful Answer:"""
custom_rag_prompt = PromptTemplate.from_template(template)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | custom_rag_prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke("What are the considerations for treating elderly patients with SCLC?")

"\n\nWhen it comes to treating elderly patients with small cell lung cancer (SCLC), there are several factors to consider. First and foremost, the optimal management of the elderly with SCLC is an area of active research, and there are currently no proven methods for early detection of the disease.\n\nIn terms of treatment, the vigorous response of SCLC to frontline chemotherapy and radiation is contrasted by its subsequent resistance to second-line and subsequent therapies after disease recurrence. Limited-stage (LS) SCLC, which is cancer confined to the thorax in a single radiation field, is typically treated with concurrent chemoradiation, while extensive-stage (ES) SCLC is treated with chemotherapy alone.\n\nWhen it comes to chemotherapy, a reasonably high initial dose is important for elderly patients with SCLC. Studies have shown that elderly patients with good performance status and normal organ function do as well with optimal chemotherapy doses as their younger counterparts. H

#Benchmarking RAG System
Using Cosine Similarity

In [21]:
import pandas as pd
df=pd.read_csv("/content/merged_file_new.csv")

Database contains Question, Ground Truth, Retriever Answer and Generator Answer

In [22]:
df.head()

Unnamed: 0,Question,Answer,Retriever answer,Generated answer
0,What potential drug targets are currently bein...,Novel drug targets under investigation in clin...,Document(page_content='(and rebiopsies at the ...,Several potential drug targets are currently ...
1,What recent advances in SCLC research have con...,"Several recent advances in SCLC research, incl...",[Document(page_content='Current Barriers and C...,Several recent advances in SCLC research have...
2,Does a chest X-ray show lung cancer?,X-rays arenâ€™t as good as CT scans for showin...,[Document(page_content='been updated to assess...,"\n\nA chest X-ray can detect lung cancer, but ..."
3,Who Should Be Screened for Lung Cancer?,Lung cancer screening is recommended only for ...,"[Document(page_content='2/25/24, 5:54 PM Basic...",Lung cancer screening is recommended only for...
4,Explain Lung cancer screening,You can increase your chances of catching canc...,[Document(page_content='Better understanding o...,Lung cancer screening refers to the process o...


In [23]:
df.isnull().sum()

Question            0
Answer              0
Retriever answer    0
Generated answer    0
dtype: int64

Pre-processing

In [None]:
import pandas as pd


# List to store modified retriever answers
modified_retriever_answers = []

# Iterate through all the entries in the 'Retriever answer' column
for index, row in df.iterrows():
    retriever_answer = row['Retriever answer']

    # Remove "[Document(page_content=" from the retriever answer
    retriever_answer = retriever_answer.replace("[Document(page_content=", "")

    # Remove "\n" from the retriever answer
    retriever_answer = retriever_answer.replace("\n", "")

    # Append the modified retriever answer to the list
    modified_retriever_answers.append(retriever_answer)

# Add the modified retriever answers to the DataFrame
df['Modified Retriever Answer'] = modified_retriever_answers

# Print the DataFrame with modified retriever answers
print(df)


In [None]:
import pandas as pd
import re



# Function to remove metadata part from the modified retriever answer
def remove_metadata(retriever_answer):
    # Use regular expression to match the metadata part
    return re.sub(r', metadata=.*\)', '', retriever_answer)

# Apply the remove_metadata function to each entry in the 'modified_retriever_answers' column
df['Modified Retriever Answer'] = df['Modified Retriever Answer'].apply(remove_metadata)

# Print the DataFrame with modified retriever answers
print(df)


In [27]:
df.head(5)

Unnamed: 0,Question,Answer,Retriever answer,Generated answer,Modified Retriever Answer
0,What potential drug targets are currently bein...,Novel drug targets under investigation in clin...,Document(page_content='(and rebiopsies at the ...,Several potential drug targets are currently ...,Document(page_content='(and rebiopsies at the ...
1,What recent advances in SCLC research have con...,"Several recent advances in SCLC research, incl...",[Document(page_content='Current Barriers and C...,Several recent advances in SCLC research have...,'Current Barriers and Challenges in Translatio...
2,Does a chest X-ray show lung cancer?,X-rays arenâ€™t as good as CT scans for showin...,[Document(page_content='been updated to assess...,"\n\nA chest X-ray can detect lung cancer, but ...",'been updated to assess the incidence of lung ...
3,Who Should Be Screened for Lung Cancer?,Lung cancer screening is recommended only for ...,"[Document(page_content='2/25/24, 5:54 PM Basic...",Lung cancer screening is recommended only for...,"'2/25/24, 5:54 PM Basic Information About Lung..."
4,Explain Lung cancer screening,You can increase your chances of catching canc...,[Document(page_content='Better understanding o...,Lung cancer screening refers to the process o...,'Better understanding of genetic predispositio...


In [28]:
# Remove the 'Retriever answer' column from the DataFrame
df = df.drop(columns=['Retriever answer'])

# Print the DataFrame after removing the 'Retriever answer' column
df.head(5)


Unnamed: 0,Question,Answer,Generated answer,Modified Retriever Answer
0,What potential drug targets are currently bein...,Novel drug targets under investigation in clin...,Several potential drug targets are currently ...,Document(page_content='(and rebiopsies at the ...
1,What recent advances in SCLC research have con...,"Several recent advances in SCLC research, incl...",Several recent advances in SCLC research have...,'Current Barriers and Challenges in Translatio...
2,Does a chest X-ray show lung cancer?,X-rays arenâ€™t as good as CT scans for showin...,"\n\nA chest X-ray can detect lung cancer, but ...",'been updated to assess the incidence of lung ...
3,Who Should Be Screened for Lung Cancer?,Lung cancer screening is recommended only for ...,Lung cancer screening is recommended only for...,"'2/25/24, 5:54 PM Basic Information About Lung..."
4,Explain Lung cancer screening,You can increase your chances of catching canc...,Lung cancer screening refers to the process o...,'Better understanding of genetic predispositio...


Generator Benchmarking

Used BERT tokenizer for Embeddings

In [29]:
import pandas as pd
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import torch

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to get BERT embeddings for a given text
def get_bert_embedding(text):
    tokens = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**tokens)
    embeddings = outputs['last_hidden_state']
    # Take the mean along the sequence length dimension to get a single vector
    embeddings = torch.mean(embeddings, dim=1)
    return embeddings.numpy()

# Function to calculate cosine similarity between two vectors
def cosine_similarity_score(vector1, vector2):
    return cosine_similarity(vector1.reshape(1, -1), vector2.reshape(1, -1))[0][0]

# List to store cosine similarity values
cosine_similarities = []

# Calculate cosine similarity for each pair of entries in the DataFrame
for index, row in df.iterrows():
    answer = row['Answer']
    generator_answer = row['Generated answer']

    # Get BERT embeddings for answer and generator_answer
    answer_embedding = get_bert_embedding(answer)
    generator_answer_embedding = get_bert_embedding(generator_answer)

    # Calculate cosine similarity
    similarity = cosine_similarity_score(answer_embedding, generator_answer_embedding)
    cosine_similarities.append(similarity)

# Print the list of cosine similarity values
print("Cosine Similarities:", cosine_similarities)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Cosine Similarities: [0.89000005, 0.90225524, 0.870137, 0.8971708, 0.81383157, 0.9525388, 0.95817655, 0.9278368, 0.9039205, 0.9113158, 0.9377673, 0.9427082, 0.90808725, 0.93354654, 0.9512731, 0.93486977, 0.91726613, 0.97283375, 0.9385454, 0.97539467, 0.9230012, 0.95142245, 0.9331231, 0.92610765, 0.85460573, 0.9071871, 0.9620664, 0.8966062, 0.9150687, 0.81964713, 0.96358883, 0.9097562, 0.9071934, 0.8516847, 0.9444643]


In [30]:
sum = 0
for i in cosine_similarities:
  sum = sum+i
l = len(cosine_similarities)
avg = sum/l
avg

0.9172856654439654

Retirever Benchmarking

In [31]:
import pandas as pd
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import torch

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to get BERT embeddings for a given text
def get_bert_embedding(text):
    tokens = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**tokens)
    embeddings = outputs['last_hidden_state']
    # Take the mean along the sequence length dimension to get a single vector
    embeddings = torch.mean(embeddings, dim=1)
    return embeddings.numpy()

# Function to calculate cosine similarity between two vectors
def cosine_similarity_score(vector1, vector2):
    return cosine_similarity(vector1.reshape(1, -1), vector2.reshape(1, -1))[0][0]



# List to store cosine similarity values
cosine_similarities1 = []

# Calculate cosine similarity for each pair of entries in the DataFrame
for index, row in df.iterrows():
    answer = row['Answer']
    retriever_answer = row['Modified Retriever Answer']

    # Get BERT embeddings for answer and generator_answer
    answer_embedding = get_bert_embedding(answer)
    retriever_answer_embedding = get_bert_embedding(retriever_answer)

    # Calculate cosine similarity
    similarity = cosine_similarity_score(answer_embedding,retriever_answer_embedding)
    cosine_similarities1.append(similarity)

# Print the list of cosine similarity values
print("Cosine Similarities:", cosine_similarities1)



Cosine Similarities: [0.7601464, 0.7572893, 0.8269283, 0.76513886, 0.7468462, 0.79164267, 0.72982275, 0.59746635, 0.7964464, 0.7478242, 0.76003534, 0.80072147, 0.78702694, 0.84066653, 0.86307013, 0.82522684, 0.8230679, 0.7612903, 0.93521696, 0.9218618, 0.8792752, 0.90518564, 0.7823363, 0.80151296, 0.8758538, 0.79394984, 0.8996692, 0.72691417, 0.86815274, 0.76222396, 0.8118925, 0.63914853, 0.8197434, 0.8228714, 0.7696135]


In [32]:
sum = 0
for i in cosine_similarities1 :
  sum = sum+i
l = len(cosine_similarities1)
avg1 = sum/l
avg1

0.7998879637037005