# **Packages**

In [4]:
%pip install transformers
%pip install torch
%pip install datasets



# **PubMedBERT**

In [10]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

# Load BioBERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")
model = BertForSequenceClassification.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract", num_labels=3)
ds = load_dataset("qiaojin/PubMedQA", "pqa_labeled")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# *100 Samples and 3 Epochs*

In [8]:
# Preprocess the dataset
def preprocess_function(examples):
    inputs = [f"Context: {context} Question: {question}" for question, context in zip(examples['question'], examples['context'])]
    targets = examples['final_decision']

    # Expanded label mapping to handle additional cases
    label_mapping = {"yes": 0, "no": 1, "uncertain": 2, "maybe": 2}

    # Handle missing or unexpected labels gracefully
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    model_inputs["labels"] = [label_mapping.get(target.lower(), 2) for target in targets]

    return model_inputs

# Select a small subset of the dataset for demonstration
small_ds = ds['train'].select(range(100))
tokenized_ds = small_ds.map(preprocess_function, batched=True)

# Split the dataset into training, validation, and test sets
train_size = 60
val_size = 20
test_size = 20

train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    tokenized_ds, [train_size, val_size, test_size]
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-6,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Evaluate the model
metrics = trainer.evaluate(eval_dataset=test_dataset)
print("Test set evaluation:", metrics)

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.917244
2,No log,0.884365
3,No log,0.880589


Test set evaluation: {'eval_loss': 1.1212488412857056, 'eval_runtime': 0.5439, 'eval_samples_per_second': 36.773, 'eval_steps_per_second': 36.773, 'epoch': 3.0}


In [9]:
# Generate predictions
def prediction_answer(question, context):
    # Combine the question and context into one input
    combined_input = f"Context: {context} Question: {question}"

    # Tokenize the input
    inputs = tokenizer(combined_input, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
    inputs = {key: val.to(model.device) for key, val in inputs.items()}

    # Get the model's output
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_label = torch.argmax(logits, dim=1).item()

    # Map numeric labels to "yes", "no", or "maybe"
    label_map = {0: "yes", 1: "no", 2: "maybe"}
    return label_map[predicted_label]

num_correct = 0

# Evaluate the Q&A performance
for example in test_dataset:
    question = example['question']
    # Include the context field in the prediction
    context = example['context']
    # Extract true answer from 'final_decision' column
    true_answer = example['final_decision']
    # Use model to predict the answer
    predicted_answer = prediction_answer(question, context)

    print(f"Question: {question}")
    print(f"Context: {context}")
    print(f"True Answer: {true_answer}")
    print(f"Predicted Answer: {predicted_answer}")
    print("="*80)

    # Calculate accuracy
    if true_answer == predicted_answer:
        num_correct += 1

# Print accuracy
accuracy = num_correct / len(test_dataset)
print(f"Accuracy: {accuracy}")

Question: Israeli hospital preparedness for terrorism-related multiple casualty incidents: can the surge capacity and injury severity distribution be better predicted?
Context: {'contexts': ["The incidence of large-scale urban attacks on civilian populations has significantly increased across the globe over the past decade. These incidents often result in Hospital Multiple Casualty Incidents (HMCI), which are very challenging to hospital teams. 15 years ago the Emergency and Disaster Medicine Division in the Israeli Ministry of Health defined a key of 20 percent of each hospital's bed capacity as its readiness for multiple casualties. Half of those casualties are expected to require immediate medical treatment. This study was performed to evaluate the efficacy of the current readiness guidelines based on the epidemiology of encountered HMCIs.", 'A retrospective study of HMCIs was recorded in the Israeli Defense Force (IDF) home front command and the Israeli National Trauma Registry (IT

# *300 Samples and 5 Epochs*

In [11]:
# Preprocess the dataset
def preprocess_function(examples):
    inputs = [f"Context: {context} Question: {question}" for question, context in zip(examples['question'], examples['context'])]
    targets = examples['final_decision']

    # Expanded label mapping to handle additional cases
    label_mapping = {"yes": 0, "no": 1, "uncertain": 2, "maybe": 2}

    # Handle missing or unexpected labels gracefully
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    model_inputs["labels"] = [label_mapping.get(target.lower(), 2) for target in targets]

    return model_inputs

# Select a small subset of the dataset for demonstration
small_ds = ds['train'].select(range(300))
tokenized_ds = small_ds.map(preprocess_function, batched=True)

train_size = 200
val_size = 50
test_size = 50

train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    tokenized_ds, [train_size, val_size, test_size]
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-6,
    num_train_epochs=5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Evaluate the model
metrics = trainer.evaluate(eval_dataset=test_dataset)
print("Test set evaluation:", metrics)

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,1.21228
2,No log,1.315039
3,1.113200,1.252248
4,1.113200,1.126076
5,0.864200,1.109741


Test set evaluation: {'eval_loss': 0.596902072429657, 'eval_runtime': 1.4571, 'eval_samples_per_second': 34.315, 'eval_steps_per_second': 34.315, 'epoch': 5.0}


In [12]:
# Generate predictions
def prediction_answer(question, context):
    # Combine the question and context into one input
    combined_input = f"Context: {context} Question: {question}"

    # Tokenize the input
    inputs = tokenizer(combined_input, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
    inputs = {key: val.to(model.device) for key, val in inputs.items()}  # Move inputs to the correct device

    # Get the model's output
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_label = torch.argmax(logits, dim=1).item()

    # Map numeric labels to "yes", "no", or "maybe"
    label_map = {0: "yes", 1: "no", 2: "maybe"}
    return label_map[predicted_label]

num_correct = 0

# Evaluate the Q&A performance
for example in test_dataset:
    question = example['question']
    # Include the context field in the prediction
    context = example['context']
    # Extract true answer from 'final_decision' column
    true_answer = example['final_decision']
    # Use model to predict the answer
    predicted_answer = prediction_answer(question, context)

    print(f"Question: {question}")
    print(f"Context: {context}")
    print(f"True Answer: {true_answer}")
    print(f"Predicted Answer: {predicted_answer}")
    print("="*80)

    # Calculate accuracy
    if true_answer == predicted_answer:
        num_correct += 1

# Print accuracy
accuracy = num_correct / len(test_dataset)
print(f"Accuracy: {accuracy}")

Question: Vertical lines in distal esophageal mucosa (VLEM): a true endoscopic manifestation of esophagitis in children?
Context: {'contexts': ['We observed an endoscopic abnormally in a group of children with histological esophagitis. We termed this finding "vertical lines in esophageal mucosa" (VLEM). We examined the relationship between the presence of VLEM and significant histologic changes in esophageal mucosal biopsies.', 'Between January 1, 1992, and August 31, 1994, the senior author (JFF) performed 255 esophageal biopsies. The procedure reports, available endoscopic photographs, and histology reports were reviewed to establish the endoscopic and histologic appearance of the esophageal mucosa. Intraepithelial cells were counted in a blind review of 42 randomly selected biopsies.', 'The esophageal mucosa had a normal appearance on 160 endoscopic studies (Group 1) and VLEM were the only mucosal abnormalities in 41 endoscopies (Group 2). Histology was normal in 92 of 160 biopsies 