In [9]:
# Load model directly
from transformers import BioGptTokenizer, BioGptForCausalLM

tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
model = BioGptForCausalLM.from_pretrained("microsoft/biogpt")

In [17]:
from datasets import load_dataset
ds = load_dataset("qiaojin/PubMedQA", "pqa_labeled")

In [None]:
import torch
from transformers import BioGptTokenizer, BioGptForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset

# Load BioGPT tokenizer and model
tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
model = BioGptForCausalLM.from_pretrained("microsoft/biogpt")

# Load the dataset
ds = load_dataset("qiaojin/PubMedQA", "pqa_labeled")

# Preprocess the dataset
def preprocess_function(examples):
    inputs = examples['question']
    context = examples['context']  # Assuming 'context' contains the relevant information
    targets = examples['final_decision']
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")

    # Tokenize the targets
    labels = tokenizer(text_target=targets, max_length=1024, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    model_inputs["context"] = context  # Add context to the model inputs
    return model_inputs

# Select a small subset of the dataset for demonstration
small_ds = ds['train'].select(range(100))
tokenized_ds = small_ds.map(preprocess_function, batched=True)

train_size = 60
val_size = 20
test_size = 20

train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    tokenized_ds, [train_size, val_size, test_size]
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Evaluate the model
metrics = trainer.evaluate(eval_dataset=test_dataset)
print("Test set evaluation:", metrics)

  0%|          | 0/180 [00:00<?, ?it/s]

In [67]:
# Generate predictions
def generate_answer(question, context):
    # Prepend the question with a prompt to guide the model
    input_prompt = f"Context: {context} Question: {question} Answer with 'yes' or 'no':"
    
    # Tokenize the input
    inputs = tokenizer(input_prompt, return_tensors="pt", max_length=1025, truncation=True, padding=True)
    inputs = {key: val.to(model.device) for key, val in inputs.items()}
    
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=1024)
    
    # Decode the output
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Extract "yes" or "no" from the output
    if "yes" in answer.lower():
        return "yes"
    elif "no" in answer.lower():
        return "no"
    else:
        return "uncertain"  # Handle cases where the answer is not clear

num_correct = 0

# Evaluate the Q&A performance
for example in test_dataset:
    question = example['question']
    context = example['context']  # Assuming 'context' contains the relevant information
    true_answer = example['final_decision']
    predicted_answer = generate_answer(question, context)
    print(f"Question: {question}")
    print(f"Context: {context}")
    print(f"True Answer: {true_answer}")
    print(f"Predicted Answer: {predicted_answer}")
    # calculate accuracy
    if true_answer == predicted_answer:
        num_correct += 1

    print("="*80)

# Print accuracy
print(f"Accuracy: {num_correct / len(test_dataset)}")

Question: Can tailored interventions increase mammography use among HMO women?
Context: {'contexts': ['Telephone counseling and tailored print communications have emerged as promising methods for promoting mammography screening. However, there has been little research testing, within the same randomized field trial, of the efficacy of these two methods compared to a high-quality usual care system for enhancing screening. This study addressed the question: Compared to usual care, is tailored telephone counseling more effective than tailored print materials for promoting mammography screening?', 'Three-year randomized field trial.', 'One thousand ninety-nine women aged 50 and older recruited from a health maintenance organization in North Carolina.', 'Women were randomized to 1 of 3 groups: (1) usual care, (2) tailored print communications, and (3) tailored telephone counseling.', 'Adherence to mammography screening based on self-reports obtained during 1995, 1996, and 1997.', 'Compared 

In [52]:
# import torch

# # Generate predictions
# def generate_answer(question):
#     # Prepend the question with a prompt to guide the model
#     input_prompt = f"Answer with 'yes' or 'no': {question}"
    
#     # Tokenize the input
#     inputs = tokenizer(input_prompt, return_tensors="pt", max_length=257, truncation=True, padding="max_length")
#     inputs = {key: val.to(model.device) for key, val in inputs.items()}
    
#     with torch.no_grad():
#         outputs = model.generate(**inputs, max_length=258)
    
#     # Decode the output
#     answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
#     # Extract "yes" or "no" from the output
#     if "yes" in answer.lower():
#         return "yes"
#     elif "no" in answer.lower():
#         return "no"
#     else:
#         return "uncertain"  # Handle cases where the answer is not clear

# num_correct = 0

# # Evaluate the Q&A performance
# for example in eval_dataset:
#     question = example['question']
#     true_answer = example['final_decision']
#     predicted_answer = generate_answer(question)
#     print(f"Question: {question}")
#     print(f"True Answer: {true_answer}")
#     print(f"Predicted Answer: {predicted_answer}")
#     # calculate accuracy
#     if true_answer == predicted_answer:
#         num_correct += 1

#     print("="*80)


Question: Can predilatation in transcatheter aortic valve implantation be omitted?
True Answer: yes
Predicted Answer: yes
Question: Acute respiratory distress syndrome in children with malignancy--can we predict outcome?
True Answer: yes
Predicted Answer: yes
Question: Does immediate breast reconstruction compromise the delivery of adjuvant chemotherapy?
True Answer: no
Predicted Answer: yes
Question: Is there a model to teach and practice retroperitoneoscopic nephrectomy?
True Answer: yes
Predicted Answer: yes
Question: Is Acupuncture Efficacious for Treating Phonotraumatic Vocal Pathologies?
True Answer: yes
Predicted Answer: yes
Question: Can a practicing surgeon detect early lymphedema reliably?
True Answer: maybe
Predicted Answer: yes
Question: Should direct mesocolon invasion be included in T4 for the staging of gastric cancer?
True Answer: maybe
Predicted Answer: yes
Question: Does HER2 immunoreactivity provide prognostic information in locally advanced urothelial carcinoma pati

In [55]:
print(f"Accuracy: {num_correct / len(eval_dataset)}")

Accuracy: 0.5833333333333334
