In [1]:
# Install the third-party packages imported by the cells below.
# NOTE(review): no versions are pinned, so a fresh run picks up whatever
# release is current at install time — consider pinning for reproducibility.
%pip install transformers
%pip install torch
%pip install datasets

Note: you may need to restart the kernel to use updated packages.




In [2]:
from datasets import load_dataset

# Load the "pqa_labeled" configuration of the qiaojin/PubMedQA dataset.
# NOTE(review): the identical call is repeated in cell 4, which re-assigns
# `ds`; one of the two loads is redundant.
ds = load_dataset("qiaojin/PubMedQA", "pqa_labeled")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch

In [4]:
# Load ClinicalBERT tokenizer and model from "medicalai/ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained("medicalai/ClinicalBERT")

# Seed torch's RNG before building the model: the sequence-classification head
# is not part of the checkpoint and is freshly initialised, so without a seed
# every fresh kernel would start training from different head weights.
torch.manual_seed(42)
model = AutoModelForSequenceClassification.from_pretrained("medicalai/ClinicalBERT", num_labels=3)  # 3 for yes/no/maybe classification


# Move model to the available device (cuda or cpu)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load the dataset (also loaded in cell 2; repeated here so this cell does not
# depend on that earlier assignment of `ds`)
ds = load_dataset("qiaojin/PubMedQA", "pqa_labeled")

# Preprocess the dataset
def preprocess_function(examples):
    """Tokenize a batch of examples and attach integer class labels.

    Args:
        examples: Batch mapping that provides the 'question', 'long_answer'
            and 'final_decision' columns as parallel sequences.

    Returns:
        The tokenizer output for the combined question / long-answer prompts,
        extended with a "labels" entry (yes -> 0, no -> 1, maybe -> 2).

    Raises:
        KeyError: If a 'final_decision' value is not "yes", "no" or "maybe".
    """
    decision_to_id = {"yes": 0, "no": 1, "maybe": 2}

    # Build one prompt per example from the question and its long answer
    prompts = []
    for question_text, conclusion_text in zip(examples['question'], examples['long_answer']):
        prompts.append(f"Question: {question_text} Long Answer: {conclusion_text}")
    decisions = examples['final_decision']

    # Every prompt is truncated / padded to exactly 512 tokens
    encoded = tokenizer(prompts, max_length=512, truncation=True, padding="max_length")

    # Translate the textual decisions into the numeric class ids
    encoded["labels"] = [decision_to_id[decision] for decision in decisions]
    return encoded

# Select a small subset of the dataset for demonstration
small_ds = ds['train'].select(range(100))
tokenized_ds = small_ds.map(preprocess_function, batched=True)

# Split the dataset into training, validation, and test sets
train_size = 60
val_size = 20
test_size = 20

# Use a dedicated, seeded generator so the same examples land in the same
# partition on every run; an unseeded split makes the reported test accuracy
# change from run to run.
split_seed = 42
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    tokenized_ds,
    [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(split_seed),
)

# Define training arguments
# NOTE(review): newer transformers releases name the `evaluation_strategy`
# argument `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # run validation at the end of every epoch
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
)

# Initialize the Trainer
# NOTE(review): newer transformers releases appear to prefer
# `processing_class=` over `tokenizer=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Evaluate the model on the held-out test split. No `compute_metrics` is
# supplied, so this reports loss and runtime statistics only.
metrics = trainer.evaluate(eval_dataset=test_dataset)
print("Test set evaluation:", metrics)

# Generate predictions
def prediction_answer(question, long_answer):
    """Predict "yes", "no" or "maybe" for one question / long-answer pair.

    Args:
        question: Question text.
        long_answer: Long-answer text placed after the question in the prompt.

    Returns:
        One of the strings "yes", "no" or "maybe".
    """
    # Map numeric labels to "yes", "no", or "maybe" (inverse of the map used
    # when the training labels are built)
    label_map = {0: "yes", 1: "no", 2: "maybe"}

    # Combine the question, and long_answer into one input
    combined_input = f"Question: {question} Long Answer: {long_answer}"

    # Tokenize the input
    inputs = tokenizer(combined_input, return_tensors="pt", max_length=512, truncation=True, padding="max_length")

    # Send the inputs to wherever the model's weights actually live instead of
    # the notebook-level `device`: the Trainer may have placed the model on a
    # device that the cuda/cpu check does not cover, and a mismatch would
    # raise at the forward pass.
    model_device = next(model.parameters()).device
    inputs = {key: val.to(model_device) for key, val in inputs.items()}

    # Force inference mode so dropout is disabled even if the model was last
    # left in training mode.
    model.eval()

    # Get the model's output
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_label = torch.argmax(logits, dim=1).item()

    return label_map[predicted_label]

num_correct = 0

# Score each held-out example individually and log the comparison
for example in test_dataset:
    # Prompt parts plus the gold label from the 'final_decision' column
    question, long_answer, true_answer = (
        example['question'],
        example['long_answer'],
        example['final_decision'],
    )
    # Ask the fine-tuned model for its answer
    predicted_answer = prediction_answer(question, long_answer)

    report_lines = [
        f"Question: {question}",
        f"True Answer: {true_answer}",
        f"Predicted Answer: {predicted_answer}",
        "="*80,
    ]
    print("\n".join(report_lines))

    # Count exact matches between gold and predicted answers
    num_correct += int(true_answer == predicted_answer)

# Fraction of test examples answered correctly
accuracy = num_correct / len(test_dataset)
print(f"Accuracy: {accuracy}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at medicalai/ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 100/100 [00:00<00:00, 2450.92 examples/s]
                                              
 33%|███▎      | 8/24 [00:54<01:19,  4.99s/it]

{'eval_loss': 1.0270214080810547, 'eval_runtime': 3.3696, 'eval_samples_per_second': 5.935, 'eval_steps_per_second': 0.89, 'epoch': 1.0}


                                               
 67%|██████▋   | 16/24 [01:40<00:39,  4.92s/it]

{'eval_loss': 1.005135178565979, 'eval_runtime': 3.5445, 'eval_samples_per_second': 5.643, 'eval_steps_per_second': 0.846, 'epoch': 2.0}


                                               
100%|██████████| 24/24 [02:27<00:00,  6.15s/it]


{'eval_loss': 1.0021679401397705, 'eval_runtime': 3.6264, 'eval_samples_per_second': 5.515, 'eval_steps_per_second': 0.827, 'epoch': 3.0}
{'train_runtime': 147.5102, 'train_samples_per_second': 1.22, 'train_steps_per_second': 0.163, 'train_loss': 0.9824772675832113, 'epoch': 3.0}


100%|██████████| 3/3 [00:02<00:00,  1.27it/s]


Test set evaluation: {'eval_loss': 1.1304094791412354, 'eval_runtime': 3.8676, 'eval_samples_per_second': 5.171, 'eval_steps_per_second': 0.776, 'epoch': 3.0}
Question: Do general practice characteristics influence uptake of an information technology (IT) innovation in primary care?
True Answer: no
Predicted Answer: yes
Question: Does a 4 diagram manual enable laypersons to operate the Laryngeal Mask Supreme®?
True Answer: yes
Predicted Answer: yes
Question: Does pretreatment with statins improve clinical outcome after stroke?
True Answer: no
Predicted Answer: yes
Question: Should circumcision be performed in childhood?
True Answer: no
Predicted Answer: yes
Question: Should direct mesocolon invasion be included in T4 for the staging of gastric cancer?
True Answer: maybe
Predicted Answer: yes
Question: Are the long-term results of the transanal pull-through equal to those of the transabdominal pull-through?
True Answer: no
Predicted Answer: yes
Question: Did Chile's traffic law reform p