In [9]:
# Load model directly
from transformers import BioGptTokenizer, BioGptForCausalLM

tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
model = BioGptForCausalLM.from_pretrained("microsoft/biogpt")

In [17]:
from datasets import load_dataset

ds = load_dataset("qiaojin/PubMedQA", "pqa_labeled")


In [41]:
from transformers import Trainer, TrainingArguments, BioGptTokenizer, BioGptForCausalLM
from datasets import load_dataset

# Load BioGPT tokenizer and model
tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
model = BioGptForCausalLM.from_pretrained("microsoft/biogpt")

# Load the dataset
ds = load_dataset("qiaojin/PubMedQA", "pqa_labeled")

# Preprocess the dataset
def preprocess_function(examples):
    inputs = examples['question']
    targets = examples['final_decision']
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Select a small subset of the dataset for demonstration
small_ds = ds['train'].select(range(100))
tokenized_ds = small_ds.map(preprocess_function, batched=True)

# Split the dataset
train_test_split = tokenized_ds.train_test_split(test_size=60, train_size=40)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Print dataset shapes and types for debugging
print("Train dataset:", train_dataset)
print("Validation dataset:", eval_dataset)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    weight_decay=0.01,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Evaluate the model
metrics = trainer.evaluate(eval_dataset=eval_dataset)
print("Test set evaluation:", metrics)



Train dataset: Dataset({
    features: ['pubid', 'question', 'context', 'long_answer', 'final_decision', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 4
})
Validation dataset: Dataset({
    features: ['pubid', 'question', 'context', 'long_answer', 'final_decision', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 4
})


100%|██████████| 4/4 [00:23<00:00,  5.67s/it]
100%|██████████| 4/4 [00:32<00:00,  8.19s/it]


{'eval_loss': 9.697182655334473, 'eval_runtime': 5.0531, 'eval_samples_per_second': 0.792, 'eval_steps_per_second': 0.792, 'epoch': 1.0}
{'train_runtime': 32.7418, 'train_samples_per_second': 0.122, 'train_steps_per_second': 0.122, 'train_loss': 15.264518737792969, 'epoch': 1.0}


100%|██████████| 4/4 [00:03<00:00,  1.11it/s]

Test set evaluation: {'eval_loss': 9.697182655334473, 'eval_runtime': 4.8731, 'eval_samples_per_second': 0.821, 'eval_steps_per_second': 0.821, 'epoch': 1.0}





In [42]:
import torch

# Generate predictions
def generate_answer(question):
    # Prepend the question with a prompt to guide the model
    input_prompt = f"Answer with 'yes' or 'no': {question}"
    
    # Tokenize the input
    inputs = tokenizer(input_prompt, return_tensors="pt", max_length=257, truncation=True, padding="max_length")
    inputs = {key: val.to(model.device) for key, val in inputs.items()}
    
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=258)
    
    # Decode the output
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Extract "yes" or "no" from the output
    if "yes" in answer.lower():
        return "yes"
    elif "no" in answer.lower():
        return "no"
    else:
        return "uncertain"  # Handle cases where the answer is not clear

# Evaluate the Q&A performance
for example in eval_dataset:
    question = example['question']
    true_answer = example['final_decision']
    predicted_answer = generate_answer(question)
    print(f"Question: {question}")
    print(f"True Answer: {true_answer}")
    print(f"Predicted Answer: {predicted_answer}")
    print("="*80)


Question: Are the long-term results of the transanal pull-through equal to those of the transabdominal pull-through?
True Answer: no
Predicted Answer: yes
Question: Can tailored interventions increase mammography use among HMO women?
True Answer: yes
Predicted Answer: yes
Question: Syncope during bathing in infants, a pediatric form of water-induced urticaria?
True Answer: yes
Predicted Answer: yes
Question: Is adjustment for reporting heterogeneity necessary in sleep disorders?
True Answer: no
Predicted Answer: yes
