In [1]:
%pip install transformers
%pip install torch
%pip install datasets

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
from datasets import load_dataset

ds = load_dataset("qiaojin/PubMedQA", "pqa_labeled")


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch

In [4]:
# Load BioBERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
model = AutoModelForSequenceClassification.from_pretrained("dmis-lab/biobert-base-cased-v1.1", num_labels=3)  # 3 for yes/no/maybe classification

# Move model to the available device (cuda or cpu)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load the dataset
ds = load_dataset("qiaojin/PubMedQA", "pqa_labeled")

# Preprocess the dataset
def preprocess_function(examples):
    inputs = examples['question']
    # Use context for more accurate predictions
    context = examples['context']
    # Extract the true answers
    targets = examples['final_decision']
    
    # Combine question and context
    combined_input = [f"Context: {c} Question: {q}" for q, c in zip(inputs, context)]
    
    # Tokenize the inputs and labels
    model_inputs = tokenizer(combined_input, max_length=512, truncation=True, padding="max_length")
    
    # Map the final_decision (yes/no/maybe) to labels
    label_map = {"yes": 0, "no": 1, "maybe": 2}
    labels = [label_map[ans] for ans in targets]
    
    model_inputs["labels"] = labels

    return model_inputs

# Select a small subset of the dataset for demonstration
small_ds = ds['train'].select(range(50))
tokenized_ds = small_ds.map(preprocess_function, batched=True)

# Split the dataset into training, validation, and test sets
train_size = 30
val_size = 10
test_size = 10

train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    tokenized_ds, [train_size, val_size, test_size]
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Evaluate the model
metrics = trainer.evaluate(eval_dataset=test_dataset)
print("Test set evaluation:", metrics)

# Generate predictions
def prediction_answer(question, context):
    # Combine the question and context into one input
    combined_input = f"Context: {context} Question: {question}"
    
    # Tokenize the input
    inputs = tokenizer(combined_input, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
    inputs = {key: val.to(device) for key, val in inputs.items()}  # Move inputs to the correct device
    
    # Get the model's output
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_label = torch.argmax(logits, dim=1).item()
    
    # Map numeric labels to "yes", "no", or "maybe"
    label_map = {0: "yes", 1: "no", 2: "maybe"}
    return label_map[predicted_label]

num_correct = 0

# Evaluate the Q&A performance
for example in test_dataset:
    question = example['question']
    # Include the context field in the prediction
    context = example['context']
    # Extract true answer from 'final_decision' column
    true_answer = example['final_decision']
    # Use model to predict the answer
    predicted_answer = prediction_answer(question, context)
    
    print(f"Question: {question}")
    print(f"Context: {context}")
    print(f"True Answer: {true_answer}")
    print(f"Predicted Answer: {predicted_answer}")
    print("="*80)
    
    # Calculate accuracy
    if true_answer == predicted_answer:
        num_correct += 1

# Print accuracy
accuracy = num_correct / len(test_dataset)
print(f"Accuracy: {accuracy}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                               
 33%|███▎      | 30/90 [00:49<01:34,  1.58s/it]

{'eval_loss': 1.1731359958648682, 'eval_runtime': 2.8042, 'eval_samples_per_second': 3.566, 'eval_steps_per_second': 3.566, 'epoch': 1.0}


                                               
 67%|██████▋   | 60/90 [01:37<00:45,  1.52s/it]

{'eval_loss': 1.2873268127441406, 'eval_runtime': 2.7697, 'eval_samples_per_second': 3.611, 'eval_steps_per_second': 3.611, 'epoch': 2.0}


                                               
100%|██████████| 90/90 [02:27<00:00,  1.64s/it]


{'eval_loss': 1.5273152589797974, 'eval_runtime': 2.9601, 'eval_samples_per_second': 3.378, 'eval_steps_per_second': 3.378, 'epoch': 3.0}
{'train_runtime': 147.9804, 'train_samples_per_second': 0.608, 'train_steps_per_second': 0.608, 'train_loss': 1.0607052273220485, 'epoch': 3.0}


100%|██████████| 10/10 [00:02<00:00,  3.85it/s]


Test set evaluation: {'eval_loss': 0.9353690147399902, 'eval_runtime': 2.8925, 'eval_samples_per_second': 3.457, 'eval_steps_per_second': 3.457, 'epoch': 3.0}
Question: Implementation of epidural analgesia for labor: is the standard of effective analgesia reachable in all women?
Context: {'contexts': ['Social and cultural factors combined with little information may prevent the diffusion of epidural analgesia for pain relief during childbirth. The present study was launched contemporarily to the implementation of analgesia for labor in our Department in order to perform a 2 years audit on its use. The goal is to evaluate the epidural acceptance and penetration into hospital practice by women and care givers and safety and efficacy during childbirth.', 'This audit cycle measured epidural analgesia performance against 4 standards: (1) Implementation of epidural analgesia for labor to all patients; (2) Acceptance and good satisfaction level reported by patients and caregivers. (3) Effecti