# Advanced Retrieval-Augmented Generation (RAG) System for Medical Question Answering on Cardiac Anatomy

In [1]:
!pip install faiss-cpu sentence-transformers transformers rank_bm25 langchain --quiet
import nltk
nltk.download('stopwords')

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m39.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m36.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
## This section loads all necessary Python libraries for building a Retrieval-Augmented Generation (RAG) pipeline
import json
import numpy as np
import pandas as pd
import torch
import faiss
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, AutoModelForSequenceClassification
from rank_bm25 import BM25Okapi
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
!pip install gdown
!pip install pymupdf
# Download the file from Google Drive using gdown
!gdown --id 1Ek7g9yGFB3iusI-ytqsmwi2foBvnPsGL -O heart_anatomy.pdf

import fitz  # PyMuPDF

pdf_filename = "heart_anatomy.pdf"
doc = fitz.open(pdf_filename)
docs_read = ""
for page in doc:
    docs_read += page.get_text()

print("First 1000 characters of extracted text:\n")
print(docs_read[:1000])


Collecting pymupdf
  Downloading pymupdf-1.26.0-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.0-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m86.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.0
Downloading...
From: https://drive.google.com/uc?id=1Ek7g9yGFB3iusI-ytqsmwi2foBvnPsGL
To: /content/heart_anatomy.pdf
100% 259k/259k [00:00<00:00, 113MB/s]
First 1000 characters of extracted text:

Introduction 
The heart is a muscular organ that serves to collect deoxygenated blood from all parts of the 
body, carries it to the lungs to be oxygenated and release carbon dioxide. Then, it transports the 
oxygenated blood from the lungs and distributes it to all the body parts 
• 
The heart pumps around 7,200 litres of blood in a day throughout the body. 
• 
The heart is situat

In [4]:
# Semantic Chunking
splitter = RecursiveCharacterTextSplitter(chunk_size=350, chunk_overlap=30)
chunks = splitter.create_documents([docs_read])
chunk_texts = [doc.page_content for doc in chunks]

In [5]:
# Embedding Model
embed_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
embeddings = embed_model.encode(chunk_texts, show_progress_bar=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# FAISS Index
dimension = embeddings[0].shape[0]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings).astype('float32'))

In [7]:
# BM25 Index
tokenized_corpus = [doc.split() for doc in chunk_texts]
bm25 = BM25Okapi(tokenized_corpus)

In [8]:
# Reranker
rerank_tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-reranker-base")
rerank_model = AutoModelForSequenceClassification.from_pretrained("BAAI/bge-reranker-base")
rerank_model.eval()

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/799 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=

In [9]:
# Rerank Function
def rerank_passages(query, passages, top_k=3):
    pairs = [[query, p] for p in passages]
    inputs = rerank_tokenizer(pairs, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        scores = rerank_model(**inputs).logits.squeeze(-1)
    sorted_indices = torch.argsort(scores, descending=True)
    return [passages[i] for i in sorted_indices[:top_k]]

In [10]:
# Generator
gen_model_name = "google/flan-t5-xl"
tokenizer = AutoTokenizer.from_pretrained(gen_model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(gen_model_name)
text_gen = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/53.0k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.45G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cuda:0


In [11]:
# Step-back Prompting
def generate_step_back_query(original_query):
    prompt = f"""You are a medical assistant helping clarify medical questions.
Given the following user question, rewrite it or break it into a clearer sub-question that helps retrieve the right medical information.

ORIGINAL QUESTION:
{original_query}

CLARIFIED or SIMPLIFIED QUESTION:"""
    return text_gen(prompt, max_new_tokens=100)[0]["generated_text"].strip()

In [12]:
# Hybrid Retrieval with Step-Back
def hybrid_retrieve_stepback(query, k=5):
    clarified_query = generate_step_back_query(query)
    print("Step-Back Reformulated Query:", clarified_query)
    query_vector = embed_model.encode([clarified_query])
    _, faiss_indices = index.search(np.array(query_vector), k)
    vector_hits = [chunk_texts[i] for i in faiss_indices[0]]
    bm25_hits = bm25.get_top_n(clarified_query.split(), chunk_texts, n=10)
    combined = list(set(bm25_hits + vector_hits))
    return rerank_passages(clarified_query, combined, top_k=k)

In [13]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

domain_words = set(['heart'])

def get_meaningful_tokens(query):
    return [w for w in query.lower().split() if w not in stop_words and w not in domain_words]

def generate_rag_standard_answer(query, similarity_threshold=0.3):
    query_tokens = query.lower().split()
    key_tokens = get_meaningful_tokens(query)
    all_scores = bm25.get_scores(query_tokens)
    query_vector = embed_model.encode([query])
    _, faiss_indices = index.search(np.array(query_vector), 5)
    vector_hits = [chunk_texts[i] for i in faiss_indices[0]]
    bm25_hits = bm25.get_top_n(query_tokens, chunk_texts, n=10)
    combined = list(set(bm25_hits + vector_hits))
    reranked = rerank_passages(query, combined, top_k=5)
    context = "\n".join(reranked)
    num_key_tokens_in_context = sum(token in context.lower() for token in key_tokens)
    if num_key_tokens_in_context < min(2, len(key_tokens)):
        return "Sorry, the provided documents do not contain relevant information for your query."
    prompt = f"""
You are MedicalBot, a medical assistant. Answer the user's question strictly using ONLY the information in CONTEXT below.
If the context does NOT contain an answer, respond: "Sorry, the provided documents do not contain relevant information for your query."
Do NOT use your own knowledge or make up information. Cite medical facts clearly.

CONTEXT:
{context}

QUESTION:
{query}

ANSWER:
"""
    return text_gen(prompt, max_new_tokens=350)[0]["generated_text"].strip()


In [14]:
def generate_rag_answer_stepback(query, similarity_threshold=0.3):
    stepback_query = generate_step_back_query(query)
    print("Step-Back Reformulated Query:", stepback_query)
    stepback_tokens = stepback_query.lower().split()
    key_tokens = get_meaningful_tokens(stepback_query)
    all_scores = bm25.get_scores(stepback_tokens)
    query_vector = embed_model.encode([stepback_query])
    _, faiss_indices = index.search(np.array(query_vector), 5)
    vector_hits = [chunk_texts[i] for i in faiss_indices[0]]
    bm25_hits = bm25.get_top_n(stepback_tokens, chunk_texts, n=10)
    combined = list(set(bm25_hits + vector_hits))
    reranked = rerank_passages(stepback_query, combined, top_k=5)
    context = "\n".join(reranked)
    num_key_tokens_in_context = sum(token in context.lower() for token in key_tokens)
    if num_key_tokens_in_context < min(2, len(key_tokens)):
        return "Sorry, the provided documents do not contain relevant information for your query."
    prompt = f"""
You are MedicalBot, a medical assistant. Answer the user's question strictly using ONLY the information in CONTEXT below.
If the context does NOT contain an answer, respond: "Sorry, the provided documents do not contain relevant information for your query."
Do NOT use your own knowledge or make up information. Cite medical facts clearly.

CONTEXT:
{context}

QUESTION:
{query}

ANSWER:
"""
    return text_gen(prompt, max_new_tokens=350)[0]["generated_text"].strip()


In [15]:
# Baseline
def generate_baseline_answer(query):
    prompt = f"Answer this medical question clearly:\n\n{query}"
    return text_gen(prompt, max_new_tokens=300)[0]["generated_text"].strip()

# Evaluation
def evaluate_quality(baseline, rag, query):
    baseline_len = len(baseline.split())
    rag_len = len(rag.split())
    depth_better = rag_len > baseline_len
    query_words = query.lower().split()
    baseline_match = sum(1 for word in query_words if word in baseline.lower())
    rag_match = sum(1 for word in query_words if word in rag.lower())
    relevance_better = rag_match >= baseline_match
    accuracy_check = ("not" not in rag.lower()) and (rag_match > 0)
    return accuracy_check, relevance_better, depth_better

In [16]:
# Test Queries
test_queries = [
    "What is the anatomical position and orientation of the heart in the human body?",
    "Which chambers form the base of the heart, and how is it oriented?",
    "What anatomical structures border the heart anteriorly, posteriorly, and laterally?",
    "What is the significance of the interventricular grooves in separating heart chambers?",
    "How big is the brain?",
    "Which part of the heart forms the apex, and what is its clinical significance?",
    "How are the surfaces of the heart defined and which chambers contribute to each?",
    "What are the three layers of the heart wall, and what are their functions?",
    "How do the subendocardial and subepicardial layers contribute to the heart’s structure?",
    "What is the flow of blood through the four chambers of the heart?",
    "What is the functional significance of each heart chamber?",
    "What are the four valves of the heart, and how do they regulate blood flow?",
    "How do atrioventricular valves differ from semilunar valves?",
    "What is the role of the tricuspid and mitral valves in heart function?",
    "What are the main coronary arteries, and which regions of the heart do they supply?",
    "How do coronary veins drain blood from the heart?",
    "What is the function of the coronary sinus and thebesian veins?",
    "How does the sinoatrial node function as the heart’s natural pacemaker?",
    "What role does the atrioventricular node play in cardiac conduction?",
    "How does the autonomic nervous system influence heart rate?",
    "Why is the point of maximal impulse (PMI) important in physical exams?",
    "How does posture affect the position of the heart’s base relative to vertebrae?",
    "How does the sympathetic nervous system increase heart rate during exercise?",
    "What happens to blood flow if the mitral valve becomes defective?",
    "How do the heart’s anatomical surfaces relate to surrounding organs like the lungs and diaphragm?",
    "How does air pollution contribute to heart disease?"
]

results = []
for query in test_queries:
    print(f"\n====================\nQuery: {query}")
    baseline = generate_baseline_answer(query)
    rag_standard = generate_rag_standard_answer(query)
    rag_stepback = generate_rag_answer_stepback(query)

    accuracy, relevance, depth = evaluate_quality(baseline, rag_stepback, query)
    results.append({
        "Query": query,
        "Baseline": baseline,
        "RAG_Standard": rag_standard,
        "RAG_StepBack": rag_stepback,
        "Accuracy": accuracy,
        "Relevance": relevance,
        "Depth": depth
    })

# Create DataFrame
results_df = pd.DataFrame(results)
results_df


Query: What is the anatomical position and orientation of the heart in the human body?
Step-Back Reformulated Query: What is the anatomical position and orientation of the heart in the human body?

Query: Which chambers form the base of the heart, and how is it oriented?


Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors


Step-Back Reformulated Query: Which chambers form the base of the heart, and how is it oriented?

Query: What anatomical structures border the heart anteriorly, posteriorly, and laterally?


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Step-Back Reformulated Query: What anatomical structures border the heart anteriorly, posteriorly, and laterally?

Query: What is the significance of the interventricular grooves in separating heart chambers?
Step-Back Reformulated Query: What is the significance of the interventricular grooves in separating heart chambers?

Query: How big is the brain?
Step-Back Reformulated Query: What is the size of the brain?

Query: Which part of the heart forms the apex, and what is its clinical significance?
Step-Back Reformulated Query: Which part of the heart forms the apex, and what is its clinical significance?

Query: How are the surfaces of the heart defined and which chambers contribute to each?
Step-Back Reformulated Query: How are the surfaces of the heart defined and which chambers contribute to each?

Query: What are the three layers of the heart wall, and what are their functions?
Step-Back Reformulated Query: What are the three layers of the heart wall, and what are their functions?

Unnamed: 0,Query,Baseline,RAG_Standard,RAG_StepBack,Accuracy,Relevance,Depth
0,What is the anatomical position and orientatio...,The heart is located in the left ventricle of ...,The apex is typically located at the level of ...,The apex is typically located at the level of ...,True,True,True
1,"Which chambers form the base of the heart, and...",The base of the heart consists of the left atr...,The base is formed primarily by the atria (the...,The base is formed primarily by the atria (the...,True,True,True
2,What anatomical structures border the heart an...,"aorta, mitral valve, aorta lateralis, aorta su...",thoracic duct • Superficially : bifurcation of...,thoracic duct • Superficially : bifurcation of...,True,True,True
3,What is the significance of the interventricul...,The interventricular grooves separate the left...,"Sorry, the provided documents do not contain r...","Sorry, the provided documents do not contain r...",False,False,True
4,How big is the brain?,The brain is about the size of a tennis ball.,"Sorry, the provided documents do not contain r...","Sorry, the provided documents do not contain r...",False,False,True
5,"Which part of the heart forms the apex, and wh...",The apex of the heart forms the apex of the he...,"The apex is formed by the left ventricle, and ...","The apex is formed by the left ventricle, and ...",True,True,True
6,How are the surfaces of the heart defined and ...,The cardiac surfaces are defined as the left a...,Main contributors: The right atrium and right ...,Main contributors: The right atrium and right ...,True,True,True
7,"What are the three layers of the heart wall, a...",The three layers of the heart wall are the end...,Epicardium - the outer layer of the wall of th...,Epicardium - the outer layer of the wall of th...,True,False,True
8,How do the subendocardial and subepicardial la...,The subendocardial and subepicardial layers co...,The rest of the heart is composed mainly of th...,The rest of the heart is composed mainly of th...,True,False,True
9,What is the flow of blood through the four cha...,The flow of blood through the four chambers of...,Venous blood returning from the body drains in...,Venous blood returning from the body drains in...,True,True,True


In [17]:
# from google.colab import files
# results_df.to_csv('results.csv', index=False)
# files.download('results.csv')