# **MediQuest RAG with Vector Database**

In [1]:
!pip install -U transformers torch accelerate pandas datasets
!pip install -U langchain langchain-community langchain-core langchain-huggingface faiss-cpu

Collecting transformers
  Downloading transformers-4.57.0-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.4/41.4 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting pandas
  Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-4.1.1-py3-none-any.whl.metadata (18 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-21.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Downloading transformers-4.57.0-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (12.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m

In [2]:
!pip install -U langchain-community



In [3]:
!pip install -U langchain-text-splitters



In [4]:
# Import Libraries
import torch
import os
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFacePipeline
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_core.output_parsers import StrOutputParser
from datasets import load_dataset
from huggingface_hub import login
from google.colab import userdata

In [5]:
# --- 1. CONFIGURATION AND AUTHENTICATION ---
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
EMBED_MODEL_NAME = "BAAI/bge-small-en-v1.5"
LLM_MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
INDEX_PATH = "faiss_medical_index"

HF_TOKEN = userdata.get("HF_TOKEN")
if HF_TOKEN is None:
    raise ValueError("HF_TOKEN not found in Colab secrets. Please set it!")
login(token=HF_TOKEN, add_to_git_credential=True)

print(f"Running on device: {DEVICE}")

Running on device: cuda


In [6]:
# --- 2. DATA LOADING, REFORMATTING, and CHUNKING PubMedQA Dataset
# Load a sample of the dataset (first 500 rows)
dataset = load_dataset("pubmed_qa", "pqa_labeled", split="train").select(range(500))

documents_for_rag = []
for item in dataset:
    question = item['question']
    context_text = "\n".join(item['context']['contexts'])
    answer = item['final_decision']

    combined_content = f"Question: {question}\nContext Abstract: {context_text}\nAnswer: {answer}"

    new_doc = Document(page_content=combined_content)
    documents_for_rag.append(new_doc)

text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=50,
    separators=["\n\n", "\n", ". ", " ", ""])

all_chunks = text_splitter.split_documents(documents_for_rag)

README.md: 0.00B [00:00, ?B/s]

pqa_labeled/train-00000-of-00001.parquet:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [7]:
# --- 3. EMBEDDING AND FAISS INDEXING ---
embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL_NAME,
    model_kwargs={'device': DEVICE},encode_kwargs={'normalize_embeddings': True})

if os.path.exists(INDEX_PATH):
    vectorstore = FAISS.load_local(
        folder_path=INDEX_PATH, embeddings=embeddings, allow_dangerous_deserialization=True)
else:
    vectorstore = FAISS.from_documents(all_chunks, embeddings)
    vectorstore.save_local(INDEX_PATH)

retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [8]:
# --- 4. LLM LOADING ---
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(LLM_MODEL_NAME,
    dtype=torch.bfloat16 if DEVICE == "cuda" and torch.cuda.is_available() else torch.float32,
    device_map="auto",token=HF_TOKEN)

pipe = pipeline("text-generation",model=model,tokenizer=tokenizer,max_new_tokens=256,
    temperature=0.1, do_sample=True,eos_token_id=tokenizer.eos_token_id)

llm = HuggingFacePipeline(pipeline=pipe)

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Device set to use cuda:0


In [9]:
# --- 5. RAG CHAIN DEFINITION ---
prompt_template_string = """You are an expert medical assistant. Use the following CONTEXT provided by the knowledge retrieval system to answer the user's QUESTION accurately.
If the CONTEXT does not contain the answer, you MUST state clearly and politely that you cannot find the answer in the provided documents.

CONTEXT:
{context}

QUESTION:
{question}

Answer:"""

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

def format_llama_prompt(data):
    messages = [
        {"role": "user", "content": prompt_template_string.format(
            context=data["context"],
            question=data["question"]
        )}
    ]
    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

rag_chain = (
    RunnableParallel(
        context=retriever | format_docs,
        question=RunnablePassthrough()
    )
    | format_llama_prompt
    | llm
    | StrOutputParser())


In [10]:
# --- 6. TESTING ---
query_1 = "Is tumor cell expression of B7-H3 associated with decreased survival?"
print(f"\n[QUERY 1] {query_1}")
response_1 = rag_chain.invoke(query_1)

query_2 = "Who won the last soccer world cup?"
print(f"\n[QUERY 2] {query_2}")
response_2 = rag_chain.invoke(query_2)

print("\n" + "=" * 60)
print(f"[RAG ANSWER 1 (Grounded)]:\n{response_1.strip()}")
print("=" * 60)
print(f"[RAG ANSWER 2 (Grounded Refusal)]:\n{response_2.strip()}")
print("=" * 60)

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.



[QUERY 1] Is tumor cell expression of B7-H3 associated with decreased survival?


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.



[QUERY 2] Who won the last soccer world cup?

[RAG ANSWER 1 (Grounded)]:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 06 Oct 2025

<|eot_id|><|start_header_id|>user<|end_header_id|>

You are an expert medical assistant. Use the following CONTEXT provided by the knowledge retrieval system to answer the user's QUESTION accurately. 
If the CONTEXT does not contain the answer, you MUST state clearly and politely that you cannot find the answer in the provided documents.

CONTEXT:
Only tumour size appeared to be significantly different between the 3 groups. On univariate analysis, invasion of the perirenal fat, lymph node involvement, distant metastases and VEGF expression were significantly associated with survival (p<0.01)

. On multivariate analysis, lymph node involvement, distant metastases and VEGF expression (OR 6.07) were identified as independent predictive factors of survival.

78 patients with a pT3a or pT3b tumou