In [1]:
%pip install datasets
%pip install faiss-cpu --no-cache

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.wh

# Partial Dataset

In [None]:
import os
import torch
from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import (
    RagTokenizer,
    RagRetriever,
    RagSequenceForGeneration,
    DPRContextEncoder,
    DPRContextEncoderTokenizer,
)
from collections import defaultdict
import random

# === Config ===
dataset_path = "/content/textbook_subset_dataset"
index_path = os.path.join(dataset_path, "faiss_index")

# === Step 1: Load and Truncate Dataset ===
print("Loading textbook dataset...")
full_dataset = load_dataset("MedRAG/textbooks", split="train")

# Organize entries by title
title_to_examples = defaultdict(list)
for example in full_dataset:
    title_to_examples[example["title"]].append(example)

# Keep the first N entries per title
samples_per_title = 3
sampled_examples = []
for title, examples in title_to_examples.items():
    top_n = examples[:samples_per_title]
    sampled_examples.extend(top_n)

# Shuffle if desired
# random.shuffle(sampled_examples)

# Build new dataset
rag_dataset = Dataset.from_list(sampled_examples)

# Prepare fields for RAG (rename contents → text)
rag_dataset = rag_dataset.map(lambda x, i: {
    "title": f"{x['title']} Doc {i}",
    "text": x["contents"]  # <- rename "contents" to "text"
}, with_indices=True)

# Drop unnecessary columns
rag_dataset = rag_dataset.remove_columns(set(rag_dataset.column_names) - {"title", "text"})

# Save intermediate dataset (optional)
rag_dataset.save_to_disk(dataset_path)

# === Step 2: Embed with DPR Encoder ===
print("Embedding passages...")
ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

def embed_texts(batch):
    inputs = ctx_tokenizer(batch["text"], padding=True, truncation=True, return_tensors="pt", max_length=256)
    with torch.no_grad():
        embeddings = ctx_encoder(**inputs).pooler_output
    return {"embeddings": embeddings.cpu().numpy()}

rag_dataset = rag_dataset.map(embed_texts, batched=True, batch_size=16)
rag_dataset.add_faiss_index(column="embeddings")

# === Step 3: Save Dataset + Index ===
print("Saving dataset and FAISS index...")
rag_dataset.get_index("embeddings").save(index_path)
rag_dataset.drop_index("embeddings")
rag_dataset.save_to_disk(dataset_path)

# === Step 4: Load RAG Components ===
print("Loading RAG model and retriever...")
tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
retriever = RagRetriever.from_pretrained(
    "facebook/rag-sequence-nq",
    index_name="custom",
    passages_path=dataset_path,
    index_path=index_path,
    use_dummy_dataset=False,
)
model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever)


KeyboardInterrupt: 

In [None]:
# === Step 5: Ask a Question ===
question = "What methods are used by histologists?"
inputs = tokenizer.prepare_seq2seq_batch([question], return_tensors="pt")

with torch.no_grad():
    generated = model.generate(input_ids=inputs["input_ids"])

answer = tokenizer.batch_decode(generated, skip_special_tokens=True)
print("Answer:", answer[0])

In [None]:
# === Step 7: Precision@k Calculation ===

# Example: assume you know ground truth relevant doc ids for this question
# Replace this list with actual ground truth doc indices
ground_truth_doc_ids = [130, 205, 64208]  # Replace with real relevant IDs

# Compute Precision@k
precision_at_k = sum([1 for doc_id in retrieved_doc_ids if doc_id in ground_truth_doc_ids]) / k
print(f"\nPrecision@{k}: {precision_at_k:.2f}")

In [None]:
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer

# Load DPR question encoder (used internally by RAG)
question_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

# Encode the same question
question_inputs = question_tokenizer(question, return_tensors="pt")
with torch.no_grad():
    question_hidden_states = question_encoder(**question_inputs).pooler_output.cpu().numpy()

# Use retriever to get top-k doc indices
retrieval_output = retriever(
    question_input_ids=question_inputs["input_ids"],
    question_hidden_states=question_hidden_states
)
k = 5
retrieved_doc_ids = retrieval_output["doc_ids"][0][:k].tolist()

# Print retrieved docs
print(f"\nTop {k} Retrieved Documents for: '{question}'\n" + "="*60)
for idx in retrieved_doc_ids:
    print(f"[Doc {idx}]\n{rag_dataset[idx]['text']}\n" + "-"*60)

# Full dataset

In [2]:
pip install bert-score

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13


In [3]:
import os
import torch
from tqdm import tqdm
from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import (
    RagTokenizer,
    RagRetriever,
    RagSequenceForGeneration,
    DPRContextEncoder,
    DPRContextEncoderTokenizer,
    AutoModelForSequenceClassification,
    AutoTokenizer
)
from bert_score import score as bert_score
import json

In [6]:
# === Load Dataset (1/10 of each textbook) ===
print("Loading MedRAG/textbooks dataset and sampling 1/3 of each textbook...")
full_dataset = load_dataset("MedRAG/textbooks", split="train")

# Group by textbook title and sample 1/3 of each
def sample_one_third_by_title(dataset):
    grouped = {}
    for ex in dataset:
        title = ex['title']
        if title not in grouped:
            grouped[title] = []
        grouped[title].append(ex)

    sampled = []
    for title, examples in grouped.items():
        n = len(examples) // 200
        sampled.extend(examples[:n])
    return Dataset.from_list(sampled)

sampled_dataset = sample_one_third_by_title(full_dataset)

Loading MedRAG/textbooks dataset and sampling 1/3 of each textbook...


Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

In [7]:
sampled_dataset

Dataset({
    features: ['id', 'title', 'content', 'contents'],
    num_rows: 620
})

In [8]:
print(sampled_dataset[1004])


IndexError: Invalid key: 1004 is out of bounds for size 620

In [9]:
# Prepare for RAG (rename 'contents' to 'text', make 'title' unique-ish)
sampled_dataset = sampled_dataset.map(
    lambda x, i: {
        "title": f"{x['title']} Doc {i}",
        "text": x["contents"]
    },
    with_indices=True
)

# Keep only required columns
rag_dataset = sampled_dataset.remove_columns(set(sampled_dataset.column_names) - {"title", "text"})

# Save dataset for retriever
dataset_path = "/content/textbook_full_dataset"
index_path = os.path.join(dataset_path, "faiss_index")
rag_dataset.save_to_disk(dataset_path)

Map:   0%|          | 0/620 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/620 [00:00<?, ? examples/s]

In [10]:
# Keep only required columns
rag_dataset = sampled_dataset.remove_columns(set(sampled_dataset.column_names) - {"title", "text"})

# Save dataset for retriever
dataset_path = "/content/textbook_full_dataset"
index_path = os.path.join(dataset_path, "faiss_index")
rag_dataset.save_to_disk(dataset_path)

# === Embed with DPR Encoder ===
print("Embedding passages...")
ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

def embed_texts(batch):
    inputs = ctx_tokenizer(batch["text"], padding=True, truncation=True, return_tensors="pt", max_length=256)
    with torch.no_grad():
        embeddings = ctx_encoder(**inputs).pooler_output
    return {"embeddings": embeddings.cpu().numpy()}

rag_dataset = rag_dataset.map(embed_texts, batched=True, batch_size=16)
rag_dataset.add_faiss_index(column="embeddings")

Saving the dataset (0/1 shards):   0%|          | 0/620 [00:00<?, ? examples/s]

Embedding passages...


config.json:   0%|          | 0.00/492 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


Map:   0%|          | 0/620 [00:00<?, ? examples/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['title', 'text', 'embeddings'],
    num_rows: 620
})

In [11]:
# === Save Dataset and Index ===
print("Saving dataset and FAISS index...")
rag_dataset.get_index("embeddings").save(index_path)
rag_dataset.drop_index("embeddings")
rag_dataset.save_to_disk(dataset_path)

# === Load RAG Model and Retriever ===
print("Loading RAG model and retriever...")
tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
retriever = RagRetriever.from_pretrained(
    "facebook/rag-sequence-nq",
    index_name="custom",
    passages_path=dataset_path,
    index_path=index_path,
    use_dummy_dataset=False,
)
model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever)

Saving dataset and FAISS index...


Saving the dataset (0/1 shards):   0%|          | 0/620 [00:00<?, ? examples/s]

Loading RAG model and retriever...


config.json:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may res

pytorch_model.bin:   0%|          | 0.00/2.06G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.06G [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/rag-sequence-nq were not used when initializing RagSequenceForGeneration: ['rag.question_encoder.question_encoder.bert_model.pooler.dense.bias', 'rag.question_encoder.question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing RagSequenceForGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RagSequenceForGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
qa_data = []
with open("final.json") as f:
  qa_data = json.load(f)

In [14]:
# === Load NLI Model for Faithfulness ===
nli_model = AutoModelForSequenceClassification.from_pretrained("roberta-large-mnli")
nli_tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")

# === Faithfulness Metric ===
def compute_faithfulness_with_context(answer, context):
    inputs = nli_tokenizer(context, answer, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = nli_model(**inputs).logits
        probs = torch.softmax(logits, dim=-1)
    entailment_prob = probs[0][2].item()  # entailment class index
    return entailment_prob

# === Retrieval Metrics ===
def get_textbook_group(doc_id):
    return "_".join(doc_id.split("_")[:-1]) if "_" in doc_id else doc_id

def precision_at_k_with_partial_credit(gold_doc_ids, retrieved_doc_ids, k=5):
    gold_groups = {get_textbook_group(doc_id) for doc_id in gold_doc_ids}
    score = 0.0

    for doc_id in retrieved_doc_ids[:k]:
        if doc_id in gold_doc_ids:
            score += 1.0
        elif get_textbook_group(doc_id) in gold_groups:
            score += 0.5  # partial credit
    return score / k

def reciprocal_rank_with_partial_credit(gold_doc_ids, retrieved_doc_ids):
    gold_groups = {get_textbook_group(doc_id) for doc_id in gold_doc_ids}

    for rank, doc_id in enumerate(retrieved_doc_ids, start=1):
        if doc_id in gold_doc_ids:
            return 1.0 / rank
        elif get_textbook_group(doc_id) in gold_groups:
            return 0.5 / rank  # partial reciprocal credit
    return 0.0

config.json:   0%|          | 0.00/688 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [15]:
index_to_doc_id = [example["id"] for example in sampled_dataset]

In [None]:
# === Main Loop ===
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer

results = []
print("Evaluating QA examples...")

for qa in tqdm(qa_data):
  question = qa["question"]
  reference = qa["answer"]
  gold_doc_ids = qa.get("doc_id", [])  # gold doc IDs

  # Generate Answer
  inputs = tokenizer(question, return_tensors="pt")
  with torch.no_grad():
      generated_ids = model.generate(
          input_ids=inputs["input_ids"],
          attention_mask=inputs["attention_mask"],
          num_return_sequences=1,
          num_beams=4,
      )
  generated_answer = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

  # BERTScore
  P, R, F1 = bert_score([generated_answer], [reference], lang="en", verbose=False)

  # Retrieved Contexts & Doc IDs
  question_str = question  # or whatever your input question is

  # Tokenize with retriever tokenizer
  question_inputs = tokenizer(question_str, return_tensors="pt")

  # DPR Question Encoder
  question_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
  question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
  with torch.no_grad():
      question_hidden = question_encoder(**question_inputs).pooler_output

  # Get top-k passages using retriever and DPR embedding
  retrieved = retriever(
      question_input_ids=question_inputs["input_ids"],
      question_hidden_states=question_hidden.cpu().numpy(),  # must be NumPy
      return_tensors="pt"
  )

  # Extract document IDs and passages
  retrieved_doc_ids = retrieved["doc_ids"][0].tolist()
  top_doc_index = retrieved_doc_ids[0]

  # Get doc name (original ID)
  top_doc_id = index_to_doc_id[top_doc_index]

  # Retrieve contents from the dataset
  top_context = sampled_dataset[top_doc_index]["text"]

  # Faithfulness to top passage
  faithfulness = compute_faithfulness_with_context(generated_answer, top_context)

  # Retrieval Metrics
  doc_ids_converted = [index_to_doc_id[idx] for idx in retrieved_doc_ids]
  p_at_5 = precision_at_k_with_partial_credit(gold_doc_ids, doc_ids_converted, k=5)
  rr = reciprocal_rank_with_partial_credit(gold_doc_ids, doc_ids_converted)

  result = {
      "question": question,
      "reference": reference,
      "generated": generated_answer,
      "bertscore_f1": round(F1.item(), 4),
      "faithfulness_entailment_prob": round(faithfulness, 4),
      "precision@5": round(p_at_5, 4),
      "reciprocal_rank": round(rr, 4),
  }
  print(result)
  results.append(result)


Evaluating QA examples...


  0%|          | 0/312 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


config.json:   0%|          | 0.00/493 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

  0%|          | 1/312 [03:20<17:19:04, 200.46s/it]

{'question': 'What are some of the diverse functions of proteins in living systems?', 'reference': 'Proteins are the most abundant and functionally diverse molecules in living systems. They serve as enzymes and polypeptide hormones that direct and regulate metabolism in the body. Contractile proteins in muscle permit movement. In bone, collagen forms a framework for calcium phosphate crystal deposition. In blood, proteins like hemoglobin and albumin transport essential molecules, while immunoglobulins fight infectious bacteria and viruses.', 'generated': ' sensing signals', 'bertscore_f1': 0.7965, 'faithfulness_entailment_prob': 0.3311, 'precision@5': 0.0, 'reciprocal_rank': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  1%|          

{'question': 'How many standard amino acids are found in mammalian proteins, and what defines a standard amino acid?', 'reference': 'There are 20 standard amino acids commonly found as constituents of mammalian proteins. These standard amino acids are the only amino acids that are encoded by DNA, the genetic material in the cell. Nonstandard amino acids are produced by chemical modification of standard amino acids.', 'generated': ' 20', 'bertscore_f1': 0.7899, 'faithfulness_entailment_prob': 0.6452, 'precision@5': 0.0, 'reciprocal_rank': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  1%|          

{'question': 'Describe the basic structural components of an amino acid.', 'reference': 'Each amino acid has a carboxyl group, a primary amino group (except for proline, which has a secondary amino group), and a distinctive side chain (R group) bonded to the α carbon atom. At physiologic pH (~7.4), the carboxyl group is dissociated, forming the negatively charged carboxylate ion (−COO−), and the amino group is protonated (−NH3+).', 'generated': ' amino acids', 'bertscore_f1': 0.7816, 'faithfulness_entailment_prob': 0.2259, 'precision@5': 0.0, 'reciprocal_rank': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  1%|▏         

{'question': 'Explain the hydrophobic effect in protein structure and its importance.', 'reference': 'The hydrophobic effect occurs when the side chains of nonpolar amino acids tend to cluster together in the interior of a protein in aqueous solutions. This phenomenon happens because the nonpolar R groups act like oil droplets that coalesce in an aqueous environment. By filling up the interior of the folded protein, these nonpolar R groups help give the protein its three-dimensional shape. For proteins located in hydrophobic environments like membranes, the nonpolar R groups are found on the outside surface, interacting with the lipid environment.', 'generated': ' hydrophobicity', 'bertscore_f1': 0.7848, 'faithfulness_entailment_prob': 0.1998, 'precision@5': 0.2, 'reciprocal_rank': 0.5}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  2%|▏         

{'question': 'What makes proline unique among amino acids, and how does this affect protein structure?', 'reference': 'Proline differs from other amino acids in that its side chain and α-amino nitrogen form a rigid, five-membered ring structure. This gives proline a secondary (rather than a primary) amino group, and it is frequently referred to as an "imino acid." The unique geometry of proline contributes to the formation of the fibrous structure of collagen, but it interrupts the α-helices found in globular proteins.', 'generated': ' geometry', 'bertscore_f1': 0.7815, 'faithfulness_entailment_prob': 0.5332, 'precision@5': 0.2, 'reciprocal_rank': 1.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  2%|▏         

{'question': 'What is a disulfide bond, and what is its significance in protein structure?', 'reference': 'A disulfide bond is a covalent cross-link that forms when the sulfhydryl (thiol) groups (−SH) of two cysteine residues are oxidized, creating a −S−S− linkage. Two disulfide-linked cysteines are referred to as cystine. Disulfide bonds are important for protein stability, particularly in extracellular proteins like albumin, a blood protein that functions as a transporter for various molecules.', 'generated': ' disulfide bond', 'bertscore_f1': 0.7868, 'faithfulness_entailment_prob': 0.1252, 'precision@5': 0.0, 'reciprocal_rank': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  2%|▏         

{'question': 'Which amino acid has a side chain that can ionize within the physiologic pH range, and why is this property important?', 'reference': 'Histidine is the only amino acid with a side chain that can ionize within the physiologic pH range. When incorporated into a protein, its R group can be either positively charged (protonated) or neutral, depending on the ionic environment provided by the protein. This important property contributes to the buffering role histidine plays in the functioning of proteins such as hemoglobin.', 'generated': ' weak acids', 'bertscore_f1': 0.7941, 'faithfulness_entailment_prob': 0.102, 'precision@5': 0.0, 'reciprocal_rank': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  3%|▎         

{'question': 'Explain the system for one-letter symbols used to designate amino acids.', 'reference': 'The one-letter codes for amino acids follow several rules: (1) If only one amino acid begins with a given letter, then that letter is used as its symbol (e.g., V = valine). (2) If more than one amino acid begins with a particular letter, the most common of these amino acids receives this letter as its symbol (e.g., G = glycine). (3) Some one-letter symbols sound like the amino acid they represent (e.g., F = phenylalanine, W = tryptophan). (4) For the remaining amino acids, a one-letter symbol is assigned that is as close in the alphabet as possible to the initial letter of the amino acid (e.g., K = lysine).', 'generated': ' symbols', 'bertscore_f1': 0.7744, 'faithfulness_entailment_prob': 0.6691, 'precision@5': 0.0, 'reciprocal_rank': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  3%|▎         

{'question': 'What are D and L amino acid isomers, and which form is found in mammalian proteins?', 'reference': 'Because the α-carbon of an amino acid is attached to four different chemical groups (with glycine as an exception), it is an asymmetric (chiral) atom. Amino acids with a chiral α-carbon exist in two different isomeric forms, designated D and L, which are enantiomers or mirror images. All amino acids found in mammalian proteins are of the L configuration. D-amino acids are found in some antibiotics and in bacterial cell walls.', 'generated': ' dna replication', 'bertscore_f1': 0.7677, 'faithfulness_entailment_prob': 0.1293, 'precision@5': 0.0, 'reciprocal_rank': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  3%|▎         

{'question': 'What is the Henderson-Hasselbalch equation, and how is it applied to amino acids?', 'reference': 'The Henderson-Hasselbalch equation (pH = pKa + log[A−]/[HA]) describes the quantitative relationship between the pH of a solution and the concentration of a weak acid (HA) and its conjugate base (A−). For amino acids, this equation can be used to analyze the dissociation of their ionizable groups. For example, the dissociation constant of the carboxyl group of an amino acid is called K1, and the equation becomes: pH = pK1 + log[II]/[I], where I is the fully protonated form and II is the isoelectric form of the amino acid.', 'generated': ' henderson -- hasselbalch equation', 'bertscore_f1': 0.7983, 'faithfulness_entailment_prob': 0.0085, 'precision@5': 0.0, 'reciprocal_rank': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  4%|▎         

{'question': 'Define the isoelectric point (pI) of an amino acid and explain how it is calculated for an amino acid with two dissociable groups.', 'reference': 'The isoelectric point (pI) is the pH at which an amino acid is electrically neutral, meaning the sum of the positive charges equals the sum of the negative charges. For an amino acid such as alanine that has only two dissociable hydrogens (one from the α-carboxyl and one from the α-amino group), the pI is the average of pK1 and pK2 (pI = [pK1 + pK2]/2). The pI corresponds to the pH at which the zwitterionic form (with a net charge of zero) predominates and at which there are also equal amounts of the positively charged and negatively charged forms.', 'generated': '', 'bertscore_f1': 0.0, 'faithfulness_entailment_prob': 0.0282, 'precision@5': 0.0, 'reciprocal_rank': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  4%|▍         

{'question': 'How do buffers work, and what makes amino acids potential buffer components?', 'reference': 'A buffer is a solution that resists change in pH following the addition of an acid or base. It can be created by mixing a weak acid (HA) with its conjugate base (A−). Maximum buffering capacity occurs at a pH equal to the pKa. Amino acids can act as buffers because they contain weakly acidic α-carboxyl groups and weakly basic α-amino groups. Additionally, some amino acids contain ionizable groups in their side chains. For example, in alanine, the −COOH/−COO− pair can serve as a buffer in the pH region around pK1, and the −NH3+/−NH2 pair can buffer in the region around pK2.', 'generated': ' amino acids', 'bertscore_f1': 0.7907, 'faithfulness_entailment_prob': 0.1829, 'precision@5': 0.0, 'reciprocal_rank': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  4%|▍         

{'question': 'What is anatomy?', 'reference': 'Anatomy includes those structures that can be seen grossly (without the aid of magnification) and microscopically (with the aid of magnification). Typically, when used by itself, the term anatomy tends to mean gross or macroscopic anatomy—that is, the study of structures that can be seen without using a microscope.', 'generated': '', 'bertscore_f1': 0.0, 'faithfulness_entailment_prob': 0.0336, 'precision@5': 0.0, 'reciprocal_rank': 0.0}


In [None]:
doc_ids_converted

['Cell_Biology_Alberts_31',
 'Cell_Biology_Alberts_85',
 'Biochemistry_Lippincott_13',
 'InternalMed_Harrison_198',
 'InternalMed_Harrison_211']

In [None]:
gold_doc_ids

'Biochemistry_Lippincott_14'

In [None]:
# === Main Loop ===
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer

results = []
print("Evaluating QA examples...")

for qa in tqdm(qa_data):
    question = qa["question"]
    reference = qa["answer"]
    gold_doc_ids = qa.get("doc_ids", [])  # gold doc IDs

    # Generate Answer
    inputs = tokenizer(question, return_tensors="pt")
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            num_return_sequences=1,
            num_beams=4,
        )
    generated_answer = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # BERTScore
    P, R, F1 = bert_score([generated_answer], [reference], lang="en", verbose=False)

    # Retrieved Contexts & Doc IDs
    question_str = question  # or whatever your input question is

    # Tokenize with retriever tokenizer
    question_inputs = tokenizer(question_str, return_tensors="pt")

    # DPR Question Encoder
    question_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
    question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
    with torch.no_grad():
        question_hidden = question_encoder(**question_inputs).pooler_output

    # Get top-k passages using retriever and DPR embedding
    retrieved = retriever(
        question_input_ids=question_inputs["input_ids"],
        question_hidden_states=question_hidden.cpu().numpy(),  # must be NumPy
        return_tensors="pt"
    )

    # Extract document IDs and passages
    retrieved_doc_ids = retrieved["doc_ids"][0].tolist()
    retrieved_passage_top = full_dataset

    # Faithfulness to top passage
    top_context = retrieved_passages[0]["text"]
    faithfulness = compute_faithfulness_with_context(generated_answer, top_context)

    # Retrieval Metrics
    p_at_5 = precision_at_k_with_doc_ids(gold_doc_ids, retrieved_doc_ids, k=5)
    rr = reciprocal_rank_with_doc_ids(gold_doc_ids, retrieved_doc_ids)

    result = {
        "question": question,
        "reference": reference,
        "generated": generated_answer,
        "bertscore_f1": round(F1.item(), 4),
        "faithfulness_entailment_prob": round(faithfulness, 4),
        "precision@5": round(p_at_5, 4),
        "reciprocal_rank": round(rr, 4),
    }
    print(result)
    results.append(result)


Evaluating QA examples...


  0%|          | 0/312 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequen

AttributeError: 'CustomHFIndex' object has no attribute 'get_passages'

###above is the stuff i wrote, below is from ur file

In [None]:
import os
import torch
from datasets import load_dataset, Dataset
from transformers import (
    RagTokenizer,
    RagRetriever,
    RagSequenceForGeneration,
    DPRContextEncoder,
    DPRContextEncoderTokenizer,
)

# === Config ===
dataset_path = "/content/textbook_full_dataset"
index_path = os.path.join(dataset_path, "faiss_index")

# === Step 1: Load and Randomly Sample 1/3 ===
print("Loading and sampling MedRAG/textbooks dataset...")
full_dataset = load_dataset("MedRAG/textbooks", split="train")
#full_dataset = full_dataset.shuffle(seed=42)  # Ensures random sampling

sample_size = len(full_dataset) // 30
sampled_dataset = full_dataset.select(range(sample_size))

# Prepare for RAG (rename 'contents' to 'text', make 'title' unique-ish)
rag_dataset = sampled_dataset.map(
    lambda x, i: {
        "title": f"{x['title']} Doc {i}",
        "text": x["contents"]
    },
    with_indices=True
)

# Keep only required columns
rag_dataset = rag_dataset.remove_columns(set(rag_dataset.column_names) - {"title", "text"})

# Save dataset for retriever
rag_dataset.save_to_disk(dataset_path)

# === Step 2: Embed with DPR Encoder ===
print("Embedding passages...")
ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

def embed_texts(batch):
    inputs = ctx_tokenizer(batch["text"], padding=True, truncation=True, return_tensors="pt", max_length=256)
    with torch.no_grad():
        embeddings = ctx_encoder(**inputs).pooler_output
    return {"embeddings": embeddings.cpu().numpy()}

rag_dataset = rag_dataset.map(embed_texts, batched=True, batch_size=16)
rag_dataset.add_faiss_index(column="embeddings")

# === Step 3: Save Dataset and Index ===
print("Saving dataset and FAISS index...")
rag_dataset.get_index("embeddings").save(index_path)
rag_dataset.drop_index("embeddings")
rag_dataset.save_to_disk(dataset_path)

# === Step 4: Load RAG Model and Retriever ===
print("Loading RAG model and retriever...")
tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
retriever = RagRetriever.from_pretrained(
    "facebook/rag-sequence-nq",
    index_name="custom",
    passages_path=dataset_path,
    index_path=index_path,
    use_dummy_dataset=False,
)
model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever)

Loading and sampling MedRAG/textbooks dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/2.62k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/18 [00:00<?, ?files/s]

Anatomy_Gray.jsonl:   0%|          | 0.00/5.19M [00:00<?, ?B/s]

Physiology_Levy.jsonl:   0%|          | 0.00/6.97M [00:00<?, ?B/s]

Pathology_Robbins.jsonl:   0%|          | 0.00/8.65M [00:00<?, ?B/s]

Pediatrics_Nelson.jsonl:   0%|          | 0.00/6.84M [00:00<?, ?B/s]

Gynecology_Novak.jsonl:   0%|          | 0.00/13.3M [00:00<?, ?B/s]

First_Aid_Step2.jsonl:   0%|          | 0.00/2.50M [00:00<?, ?B/s]

Pharmacology_Katzung.jsonl:   0%|          | 0.00/11.9M [00:00<?, ?B/s]

InternalMed_Harrison.jsonl:   0%|          | 0.00/52.6M [00:00<?, ?B/s]

Biochemistry_Lippincott.jsonl:   0%|          | 0.00/3.19M [00:00<?, ?B/s]

Histology_Ross.jsonl:   0%|          | 0.00/7.05M [00:00<?, ?B/s]

First_Aid_Step1.jsonl:   0%|          | 0.00/1.60M [00:00<?, ?B/s]

Immunology_Janeway.jsonl:   0%|          | 0.00/7.89M [00:00<?, ?B/s]

Cell_Biology_Alberts.jsonl:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Neurology_Adams.jsonl:   0%|          | 0.00/19.5M [00:00<?, ?B/s]

Pathoma_Husain.jsonl:   0%|          | 0.00/983k [00:00<?, ?B/s]

Obstentrics_Williams.jsonl:   0%|          | 0.00/15.2M [00:00<?, ?B/s]

Psichiatry_DSM-5.jsonl:   0%|          | 0.00/6.73M [00:00<?, ?B/s]

Surgery_Schwartz.jsonl:   0%|          | 0.00/30.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/125847 [00:00<?, ? examples/s]

Map:   0%|          | 0/4194 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4194 [00:00<?, ? examples/s]

Embedding passages...


config.json:   0%|          | 0.00/492 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


Map:   0%|          | 0/4194 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [None]:
# === Step 5: Ask a Question ===
print("Generating answer...")
question = "What causes influenza?"
inputs = tokenizer.prepare_seq2seq_batch([question], return_tensors="pt")

with torch.no_grad():
    generated = model.generate(input_ids=inputs["input_ids"])

answer = tokenizer.batch_decode(generated, skip_special_tokens=True)
print("Answer:", answer[0])

Generating answer...


NameError: name 'tokenizer' is not defined