In [9]:
# Load model directly
from transformers import BioGptTokenizer, BioGptForCausalLM

tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
model = BioGptForCausalLM.from_pretrained("microsoft/biogpt")

In [17]:
from datasets import load_dataset
ds = load_dataset("qiaojin/PubMedQA", "pqa_labeled")

In [72]:
import torch
from transformers import BioGptTokenizer, BioGptForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset

# Load BioGPT tokenizer and model
tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
model = BioGptForCausalLM.from_pretrained("microsoft/biogpt")

# Load the dataset
ds = load_dataset("qiaojin/PubMedQA", "pqa_labeled")

# Preprocess the dataset
def preprocess_function(examples):
    inputs = examples['question']
    context = examples['context']  # Assuming 'context' contains the relevant information
    long_answer = examples['long_answer']  # Assuming 'long_answer' contains the long answer
    targets = examples['final_decision']
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")

    # Tokenize the targets
    labels = tokenizer(text_target=targets, max_length=1024, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    model_inputs["context"] = context  # Add context to the model inputs
    model_inputs["long_answer"] = long_answer  # Add long answer to the model inputs
    return model_inputs

# Select a small subset of the dataset for demonstration
small_ds = ds['train'].select(range(100))
tokenized_ds = small_ds.map(preprocess_function, batched=True)

train_size = 60
val_size = 20
test_size = 20

train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    tokenized_ds, [train_size, val_size, test_size]
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Evaluate the model
metrics = trainer.evaluate(eval_dataset=test_dataset)
print("Test set evaluation:", metrics)

  0%|          | 0/18 [53:43<?, ?it/s]
  0%|          | 0/90 [00:48<?, ?it/s]
 33%|███▎      | 30/90 [11:15<21:02, 21.04s/it]
 33%|███▎      | 30/90 [12:04<21:02, 21.04s/it]

{'eval_loss': 3.3578662872314453, 'eval_runtime': 48.8324, 'eval_samples_per_second': 0.41, 'eval_steps_per_second': 0.205, 'epoch': 1.0}


 67%|██████▋   | 60/90 [23:15<10:59, 21.99s/it]
 67%|██████▋   | 60/90 [24:09<10:59, 21.99s/it]

{'eval_loss': 1.621874213218689, 'eval_runtime': 54.58, 'eval_samples_per_second': 0.366, 'eval_steps_per_second': 0.183, 'epoch': 2.0}


100%|██████████| 90/90 [35:04<00:00, 21.55s/it]
100%|██████████| 90/90 [36:01<00:00, 24.01s/it]


{'eval_loss': 1.206207513809204, 'eval_runtime': 52.2161, 'eval_samples_per_second': 0.383, 'eval_steps_per_second': 0.192, 'epoch': 3.0}
{'train_runtime': 2161.1527, 'train_samples_per_second': 0.083, 'train_steps_per_second': 0.042, 'train_loss': 3.6603966606987846, 'epoch': 3.0}


100%|██████████| 10/10 [00:47<00:00,  4.75s/it]

Test set evaluation: {'eval_loss': 1.2089277505874634, 'eval_runtime': 53.1517, 'eval_samples_per_second': 0.376, 'eval_steps_per_second': 0.188, 'epoch': 3.0}





In [73]:
# Generate predictions
def generate_answer(question, context):
    # Prepend the question with a prompt to guide the model
    input_prompt = f"Context: {context} Question: {question} Answer with 'yes' or 'no':"
    
    # Tokenize the input
    inputs = tokenizer(input_prompt, return_tensors="pt", max_length=1025, truncation=True, padding=True)
    inputs = {key: val.to(model.device) for key, val in inputs.items()}
    
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=1024)
    
    # Decode the output
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Extract "yes" or "no" from the output
    if "yes" in answer.lower():
        return "yes"
    elif "no" in answer.lower():
        return "no"
    else:
        return "uncertain"  # Handle cases where the answer is not clear

num_correct = 0

# Evaluate the Q&A performance
for example in test_dataset:
    question = example['question']
    context = example['context']  # Assuming 'context' contains the relevant information
    true_answer = example['final_decision']
    predicted_answer = generate_answer(question, context)
    print(f"Question: {question}")
    print(f"Context: {context}")
    print(f"True Answer: {true_answer}")
    print(f"Predicted Answer: {predicted_answer}")
    # calculate accuracy
    if true_answer == predicted_answer:
        num_correct += 1

    print("="*80)

# Print accuracy
print(f"Accuracy: {num_correct / len(test_dataset)}")

Question: Is Panton-Valentine leucocidin associated with the pathogenesis of Staphylococcus aureus bacteraemia in the UK?
Context: {'contexts': ['The morbidity and mortality associated with Panton-Valentine leucocidin (PVL)-positive Staphylococcus aureus suggest that this toxin is a key marker of disease severity. Nevertheless, the importance of PVL in the pathogenesis of primary bacteraemia caused by S. aureus is uncertain. We have determined the prevalence of PVL-encoding genes among isolates of S. aureus from bacteraemic patients.', 'Consecutive bacteraemia isolates of S. aureus (n=244) from patients hospitalized in 25 centres in the UK and Ireland during 2005 were screened for PVL and mecA genes. PVL-positive isolates were characterized by toxin gene profiling, PFGE, spa-typing and MIC determinations for a range of antimicrobials.', 'Four out of 244 isolates (1.6%) were PVL-positive and susceptible to oxacillin [methicillin-susceptible S. aureus (MSSA)]. Eighty-eight out of 244 (36

In [52]:
import torch
from transformers import BioGptTokenizer, BioGptForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset

# Load BioGPT tokenizer and model
tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
model = BioGptForCausalLM.from_pretrained("microsoft/biogpt")

# Load the dataset
ds = load_dataset("qiaojin/PubMedQA", "pqa_labeled")

# Preprocess the dataset
def preprocess_function(examples):
    inputs = examples['question']
    context = examples['context']  # Assuming 'context' contains the relevant information
    targets = examples['final_decision']
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")

    # Tokenize the targets
    labels = tokenizer(text_target=targets, max_length=1024, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    model_inputs["context"] = context  # Add context to the model inputs
    return model_inputs

# Select a small subset of the dataset for demonstration
small_ds = ds['train'].select(range(100))
tokenized_ds = small_ds.map(preprocess_function, batched=True)

train_size = 60
val_size = 20
test_size = 20

train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    tokenized_ds, [train_size, val_size, test_size]
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Evaluate the model
metrics = trainer.evaluate(eval_dataset=test_dataset)
print("Test set evaluation:", metrics)

Question: Can predilatation in transcatheter aortic valve implantation be omitted?
True Answer: yes
Predicted Answer: yes
Question: Acute respiratory distress syndrome in children with malignancy--can we predict outcome?
True Answer: yes
Predicted Answer: yes
Question: Does immediate breast reconstruction compromise the delivery of adjuvant chemotherapy?
True Answer: no
Predicted Answer: yes
Question: Is there a model to teach and practice retroperitoneoscopic nephrectomy?
True Answer: yes
Predicted Answer: yes
Question: Is Acupuncture Efficacious for Treating Phonotraumatic Vocal Pathologies?
True Answer: yes
Predicted Answer: yes
Question: Can a practicing surgeon detect early lymphedema reliably?
True Answer: maybe
Predicted Answer: yes
Question: Should direct mesocolon invasion be included in T4 for the staging of gastric cancer?
True Answer: maybe
Predicted Answer: yes
Question: Does HER2 immunoreactivity provide prognostic information in locally advanced urothelial carcinoma pati

In [55]:
# Generate predictions
def generate_answer(question, context):
    # Prepend the question with a prompt to guide the model
    input_prompt = f"Context: {context} Question: {question} Answer with 'yes' or 'no':"
    
    # Tokenize the input
    inputs = tokenizer(input_prompt, return_tensors="pt", max_length=1025, truncation=True, padding=True)
    inputs = {key: val.to(model.device) for key, val in inputs.items()}
    
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=1024)
    
    # Decode the output
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Extract "yes" or "no" from the output
    if "yes" in answer.lower():
        return "yes"
    elif "no" in answer.lower():
        return "no"
    else:
        return "uncertain"  # Handle cases where the answer is not clear

num_correct = 0

# Evaluate the Q&A performance
for example in test_dataset:
    question = example['question']
    context = example['context']  # Assuming 'context' contains the relevant information
    true_answer = example['final_decision']
    predicted_answer = generate_answer(question, context)
    print(f"Question: {question}")
    print(f"Context: {context}")
    print(f"True Answer: {true_answer}")
    print(f"Predicted Answer: {predicted_answer}")
    # calculate accuracy
    if true_answer == predicted_answer:
        num_correct += 1

    print("="*80)

# Print accuracy
print(f"Accuracy: {num_correct / len(test_dataset)}")

Accuracy: 0.5833333333333334
