In [1]:

import os
import torch
from tqdm import tqdm
from datasets import load_dataset, Dataset, concatenate_datasets
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm
import numpy as np
import json

from transformers import (
    RagTokenizer,
    RagRetriever,
    RagSequenceForGeneration,
    DPRContextEncoder,
    DPRContextEncoderTokenizer,
    AutoModelForSequenceClassification,
    AutoTokenizer
)
from bert_score import score as bert_score

In [2]:
# === Load Dataset (1/10 of each textbook) === #TAKE THIS
print("Loading MedRAG/textbooks dataset and sampling 1/3 of each textbook...")
full_dataset = load_dataset("MedRAG/textbooks", split="train")

# Group by textbook title and sample 1/3 of each
def sample_one_third_by_title(dataset):
    grouped = {}
    for ex in dataset:
        title = ex['title']
        if title not in grouped:
            grouped[title] = []
        grouped[title].append(ex)

    sampled = []
    for title, examples in grouped.items():
        n = len(examples) // 200
        sampled.extend(examples[:n])
    return Dataset.from_list(sampled)

sampled_dataset = sample_one_third_by_title(full_dataset)



# Prepare for RAG (rename 'contents' to 'text', make 'title' unique-ish) #TAKE THIS
sampled_dataset = sampled_dataset.map(
    lambda x, i: {
        "title": f"{x['title']} Doc {i}",
        "text": x["contents"]
    },
    with_indices=True
)

# Keep only required columns
rag_dataset = sampled_dataset.remove_columns(set(sampled_dataset.column_names) - {"title", "text"})

# Save dataset for retriever
dataset_path = "/content/textbook_full_dataset"
index_path = os.path.join(dataset_path, "faiss_index")
rag_dataset.save_to_disk(dataset_path)



# Keep only required columns #TAKE THIS
rag_dataset = sampled_dataset.remove_columns(set(sampled_dataset.column_names) - {"title", "text"})

# Save dataset for retriever
dataset_path = "/content/textbook_full_dataset"
index_path = os.path.join(dataset_path, "faiss_index")
rag_dataset.save_to_disk(dataset_path)

# === Embed with DPR Encoder ===
print("Embedding passages...")
ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

def embed_texts(batch):
    inputs = ctx_tokenizer(batch["text"], padding=True, truncation=True, return_tensors="pt", max_length=256)
    with torch.no_grad():
        embeddings = ctx_encoder(**inputs).pooler_output
    return {"embeddings": embeddings.cpu().numpy()}

rag_dataset = rag_dataset.map(embed_texts, batched=True, batch_size=16)


Loading MedRAG/textbooks dataset and sampling 1/3 of each textbook...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Map:   0%|          | 0/620 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/620 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/620 [00:00<?, ? examples/s]

Embedding passages...


Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokeniz

Map:   0%|          | 0/620 [00:00<?, ? examples/s]

In [3]:


rag_dataset.add_faiss_index(column="embeddings")

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['title', 'text', 'embeddings'],
    num_rows: 620
})

In [4]:
# === Save Dataset and Index ===
print("Saving dataset and FAISS index...")
rag_dataset.get_index("embeddings").save(index_path)
rag_dataset.drop_index("embeddings")
rag_dataset.save_to_disk(dataset_path)

# === Load RAG Model and Retriever ===
print("Loading RAG model and retriever...")
tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
retriever = RagRetriever.from_pretrained(
    "facebook/rag-sequence-nq",
    index_name="custom",
    passages_path=dataset_path,
    index_path=index_path,
    use_dummy_dataset=False,
)
model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever)

Saving dataset and FAISS index...


Saving the dataset (0/1 shards):   0%|          | 0/620 [00:00<?, ? examples/s]

Loading RAG model and retriever...


config.json:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may res

pytorch_model.bin:   0%|          | 0.00/2.06G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.06G [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/rag-sequence-nq were not used when initializing RagSequenceForGeneration: ['rag.question_encoder.question_encoder.bert_model.pooler.dense.bias', 'rag.question_encoder.question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing RagSequenceForGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RagSequenceForGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# === Retrieval Metrics ===
def get_textbook_group(doc_id):
    return "_".join(doc_id.split("_")[:-1]) if "_" in doc_id else doc_id

def precision_at_k_with_partial_credit(gold_doc_ids, retrieved_doc_ids, k=5):
    gold_groups = {get_textbook_group(doc_id) for doc_id in gold_doc_ids}
    score = 0.0

    for doc_id in retrieved_doc_ids[:k]:
        if doc_id in gold_doc_ids:
            score += 1.0
        elif get_textbook_group(doc_id) in gold_groups:
            score += 0.5  # partial credit
    return score / k

def reciprocal_rank_with_partial_credit(gold_doc_ids, retrieved_doc_ids):
    gold_groups = {get_textbook_group(doc_id) for doc_id in gold_doc_ids}

    for rank, doc_id in enumerate(retrieved_doc_ids, start=1):
        if doc_id in gold_doc_ids:
            return 1.0 / rank
        elif get_textbook_group(doc_id) in gold_groups:
            return 0.5 / rank  # partial reciprocal credit
    return 0.0

In [None]:
index_to_doc_id = [example["id"] for example in sampled_dataset]


# === Load NLI Model for Faithfulness ===
nli_model = AutoModelForSequenceClassification.from_pretrained("roberta-large-mnli")
nli_tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")


# === Faithfulness Metric ===
def compute_faithfulness_with_context(answer, context):
    inputs = nli_tokenizer(context, answer, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = nli_model(**inputs).logits
        probs = torch.softmax(logits, dim=-1)
    entailment_prob = probs[0][2].item()  # entailment class index
    return entailment_prob


In [5]:
qa_data = [] #SYNTEHTHIC
with open("final.json") as f:
  qa_data = json.load(f)

In [14]:


def build_medprompt_prompt(question, retrieved_contexts, examples, system_prompt):
    context_str = "\n\n".join([ctx["text"] for ctx in retrieved_contexts])
    examples_str = "\n\n".join([
        f"Q: {ex['question']}\nA: {ex['answer']}" for ex in examples
    ])
    return f"{system_prompt}\n\n{examples_str}\n\nQ: {question}\nContext:\n{context_str}\nA:"


In [10]:
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer

# Load DPR question encoder (used internally by RAG)
question_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")



few_shot_examples = [
    {"question": "What is the function of the mitochondria?", "cot": "The mitochondria produce ATP through cellular respiration.", "answer": "Energy production"},
    {"question": "How do beta-blockers help in hypertension?", "cot": "They block adrenaline's effects, reducing heart rate and blood pressure.", "answer": "Reduce heart rate and blood pressure"},
    # Add more real ones later
]


config.json:   0%|          | 0.00/493 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [6]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `tryingLlama` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `trying

In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "meta-llama/Llama-2-7b-chat-hf"  # make sure you're approved

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [15]:
# Example usage with 1 question:



results = []

for qa in in tqdm(qa_data):

  #test_qa = qa_data[0]
  question = qa["question"]
  reference = qa["answer"]
  gold_doc_ids = qa.get("doc_ids", [])  # gold doc IDs



  # DPR retrieval
  q_tokens = question_tokenizer(question, return_tensors="pt")
  with torch.no_grad():
      q_embed = question_encoder(**q_tokens).pooler_output

  retrieved = retriever(
      question_input_ids=q_tokens["input_ids"],
      question_hidden_states=q_embed.cpu().numpy(),
      return_tensors="pt"
  )

  retrieved_doc_ids = retrieved["doc_ids"][0].tolist()
  retrieved_contexts = [{"text": sampled_dataset[idx]["text"]} for idx in retrieved_doc_ids[:3]]
  top_examples = few_shot_examples[:5]

  # Build MedPrompt input
  prompt = build_medprompt_prompt(
      question=question,
      retrieved_contexts=retrieved_contexts,
      examples=top_examples,
      system_prompt="You are an expert medical professional. Given the context and examples, answer clearly."
  )

  print("PROMPT:", prompt)

  # Run LLaMA
  inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
  with torch.no_grad():
      outputs = model.generate(**inputs, max_new_tokens=300)
      generated_answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

  print("Generated Answer:\n", generated_answer)


  # BERTScore
  P, R, F1 = bert_score([generated_answer], [reference], lang="en", verbose=False)

  # Retrieved Contexts & Doc IDs
  question_str = question  # or whatever your input question is

  # Tokenize with retriever tokenizer
  question_inputs = tokenizer(question_str, return_tensors="pt")

  # DPR Question Encoder
  question_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
  question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
  with torch.no_grad():
      question_hidden = question_encoder(**question_inputs).pooler_output

  # Get top-k passages using retriever and DPR embedding
  retrieved = retriever(
      question_input_ids=question_inputs["input_ids"],
      question_hidden_states=question_hidden.cpu().numpy(),  # must be NumPy
      return_tensors="pt"
  )

  # Extract document IDs and passages
  retrieved_doc_ids = retrieved["doc_ids"][0].tolist()
  top_doc_index = retrieved_doc_ids[0]

  # Get doc name (original ID)
  top_doc_id = index_to_doc_id[top_doc_index]

  # Retrieve contents from the dataset
  top_context = sampled_dataset[top_doc_index]["text"]

  # Faithfulness to top passage
  faithfulness = compute_faithfulness_with_context(generated_answer, top_context)

  # Retrieval Metrics
  doc_ids_converted = [index_to_doc_id[idx] for idx in retrieved_doc_ids]
  p_at_5 = precision_at_k_with_partial_credit(gold_doc_ids, doc_ids_converted, k=5)
  rr = reciprocal_rank_with_partial_credit(gold_doc_ids, doc_ids_converted)

  result = {
      "question": question,
      "reference": reference,
      "generated": generated_answer,
      "bertscore_f1": round(F1.item(), 4),
      "faithfulness_entailment_prob": round(faithfulness, 4),
      "precision@5": round(p_at_5, 4),
      "reciprocal_rank": round(rr, 4),
  }
  print(result)
  results.append(result)


PROMPT: You are an expert medical professional. Given the context and examples, answer clearly.

Q: What is the function of the mitochondria?
A: Energy production

Q: How do beta-blockers help in hypertension?
A: Reduce heart rate and blood pressure

Q: What are some of the diverse functions of proteins in living systems?
Context:
Cell_Biology_Alberts. The discoveries of the past century have not diminished the marvel—quite the contrary. But they have removed the central mystery regarding the nature of life. We can now see that all living things are made of cells: small, membrane-enclosed units filled with a concentrated aqueous solution of chemicals and endowed with the extraordinary ability to create copies of themselves by growing and then dividing in two.

Cell_Biology_Alberts. Proteins have many other functions as well—maintaining structures, generating movements, sensing signals, and so on—each protein molecule performing a specific function according to its own genetically speci

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'question': 'What are some of the diverse functions of proteins in living systems?', 'reference': 'Proteins are the most abundant and functionally diverse molecules in living systems. They serve as enzymes and polypeptide hormones that direct and regulate metabolism in the body. Contractile proteins in muscle permit movement. In bone, collagen forms a framework for calcium phosphate crystal deposition. In blood, proteins like hemoglobin and albumin transport essential molecules, while immunoglobulins fight infectious bacteria and viruses.', 'generated': "You are an expert medical professional. Given the context and examples, answer clearly.\n\nQ: What is the function of the mitochondria?\nA: Energy production\n\nQ: How do beta-blockers help in hypertension?\nA: Reduce heart rate and blood pressure\n\nQ: What are some of the diverse functions of proteins in living systems?\nContext:\nCell_Biology_Alberts. The discoveries of the past century have not diminished the marvel—quite the cont

In [22]:
print(result)

{'question': 'What are some of the diverse functions of proteins in living systems?', 'reference': 'Proteins are the most abundant and functionally diverse molecules in living systems. They serve as enzymes and polypeptide hormones that direct and regulate metabolism in the body. Contractile proteins in muscle permit movement. In bone, collagen forms a framework for calcium phosphate crystal deposition. In blood, proteins like hemoglobin and albumin transport essential molecules, while immunoglobulins fight infectious bacteria and viruses.', 'generated': "You are an expert medical professional. Given the context and examples, answer clearly.\n\nQ: What is the function of the mitochondria?\nA: Energy production\n\nQ: How do beta-blockers help in hypertension?\nA: Reduce heart rate and blood pressure\n\nQ: What are some of the diverse functions of proteins in living systems?\nContext:\nCell_Biology_Alberts. The discoveries of the past century have not diminished the marvel—quite the cont