In [9]:
# Load model directly
from transformers import BioGptTokenizer, BioGptForCausalLM

tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
model = BioGptForCausalLM.from_pretrained("microsoft/biogpt")

In [17]:
from datasets import load_dataset

ds = load_dataset("qiaojin/PubMedQA", "pqa_labeled")


In [23]:
from transformers import Trainer, TrainingArguments, BioGptTokenizer, BioGptForCausalLM
from datasets import load_dataset

# Load BioGPT tokenizer and model
tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
model = BioGptForCausalLM.from_pretrained("microsoft/biogpt")

# Load the dataset
ds = load_dataset("qiaojin/PubMedQA", "pqa_labeled")

# Preprocess the dataset
def preprocess_function(examples):
    inputs = examples['question']
    targets = examples['long_answer']
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Select a small subset of the dataset for demonstration
small_ds = ds['train'].select(range(8))
tokenized_ds = small_ds.map(preprocess_function, batched=True)

# Split the dataset
train_test_split = tokenized_ds.train_test_split(test_size=1, train_size=1)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Print dataset shapes and types for debugging
print("Train dataset:", train_dataset)
print("Validation dataset:", eval_dataset)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    weight_decay=0.01,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Evaluate the model
metrics = trainer.evaluate(eval_dataset=eval_dataset)
print("Test set evaluation:", metrics)

Map: 100%|██████████| 2/2 [00:00<00:00, 108.05 examples/s]


Train dataset: Dataset({
    features: ['pubid', 'question', 'context', 'long_answer', 'final_decision', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1
})
Validation dataset: Dataset({
    features: ['pubid', 'question', 'context', 'long_answer', 'final_decision', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1
})



[A

[A[A                               
                                             
  0%|          | 0/4 [00:51<?, ?it/s]
[A
100%|██████████| 1/1 [00:15<00:00, 15.13s/it]


{'eval_loss': 15.311264991760254, 'eval_runtime': 1.3648, 'eval_samples_per_second': 0.733, 'eval_steps_per_second': 0.733, 'epoch': 1.0}
{'train_runtime': 15.1295, 'train_samples_per_second': 0.066, 'train_steps_per_second': 0.066, 'train_loss': 19.031047821044922, 'epoch': 1.0}


100%|██████████| 1/1 [00:00<00:00, 1974.72it/s]

Test set evaluation: {'eval_loss': 15.311264991760254, 'eval_runtime': 1.1741, 'eval_samples_per_second': 0.852, 'eval_steps_per_second': 0.852, 'epoch': 1.0}





In [28]:
import torch

# Generate predictions
def generate_answer(question):
    inputs = tokenizer(question, return_tensors="pt", max_length=257, truncation=True, padding="max_length")
    inputs = {key: val.to(model.device) for key, val in inputs.items()}
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=258)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# Evaluate the Q&A performance
for example in eval_dataset:
    question = example['question']
    true_answer = example['long_answer']
    predicted_answer = generate_answer(question)
    print(f"Question: {question}")
    print(f"True Answer: {true_answer}")
    print(f"Predicted Answer: {predicted_answer}")
    print("="*80)

Question: Landolt C and snellen e acuity: differences in strabismus amblyopia?
True Answer: Using the charts described, there was only a slight overestimation of visual acuity by the Snellen E compared to the Landolt C, even in strabismus amblyopia. Small differences in the lower visual acuity range have to be considered.
Predicted Answer: Landolt C and snellen e acuity: differences in strabismus amblyopia? r
