# **BlueBERT**

## **Initialize and set up**

In [1]:
# Load model directly
from transformers import TrainingArguments, Trainer, AutoTokenizer, DataCollatorForLanguageModeling, AutoModelForSequenceClassification, DataCollatorWithPadding
from datasets import load_dataset
import pandas as pd
import torch

model = AutoModelForSequenceClassification.from_pretrained("bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12", num_labels=3)
print(model)

# Move model to the available device (cuda or cpu)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
model.to(device)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

## **Helper functions**

In [2]:
def parse_context(context):
    result = ""
    # Combine all of the sentences of a context into one string
    for sentence in context['contexts']:
        result += sentence + " "
    
    return result

def parse_all_contexts(context_list):
    end = []
    # Go through all of the contexts in a dataset
    for context in context_list:
        result = parse_context(context)
        end.append(result)
    
    return end

def preprocess(data):
    questions = data['question']
    context = data['context']
    final_decision = data['final_decision']

    # Combine the question, context, and long answer
    text = [f"Context: {c} Question: {q}" for c, q in zip(context, questions)]
    token_input = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=512)

    # Labels
    label_mapping = {"yes": 0, "no": 1, "maybe": 2}
    labels = [label_mapping[l] for l in final_decision]

    # Add the labels to the token_input
    token_input['labels'] = torch.tensor(labels, dtype=torch.long)

    return token_input

def predict_answer(context, question):
    text = f"Context: {context} Question: {question}"
    token_input = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=512).to(device)

    # Generate model output (logits)
    with torch.no_grad():
        outputs = model(**token_input)

        # Extract the logits from the model output
        logits = outputs.logits

        # Get the predicted class (use argmax to get the index of the highest score)
        predicted_class = torch.argmax(logits, dim=-1).item()

    # Map the index to the label
    label_mapping = {0: "yes", 1: "no", 2: "maybe"}
    prediction = label_mapping[predicted_class]

    return prediction

## **Experiment 1: 200 Samples, 5 epochs, and 100/50/50 split**

### **Trial 1**

In [3]:
# Parameters
dataset_size = 200
train_size = 100
validate_size = 50
test_size = 50
num_epochs = 5
learning_rate = 2e-6
weight_decay = 0.01

# Get the dataset
dataset = load_dataset("qiaojin/PubMedQA", "pqa_labeled")
small_dataset = dataset['train'].select(range(dataset_size))

# Tokenize the dataset
tokenizer = AutoTokenizer.from_pretrained("bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12")
tokenized_input = small_dataset.map(preprocess, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Split the dataset
train_dataset, validate_dataset, test_dataset = torch.utils.data.random_split(tokenized_input, [train_size, validate_size, test_size])

# Train the model
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=num_epochs,
    eval_strategy="epoch",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validate_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

trainer.train()

# Evaluate the model
results = trainer.evaluate(test_dataset)
print("Test results", "\n\n", results, "\n\n")

Map: 100%|██████████| 200/200 [00:00<00:00, 1476.73 examples/s]


Epoch,Training Loss,Validation Loss
1,No log,0.878243
2,No log,0.873807
3,No log,0.875918
4,No log,0.879685
5,1.028900,0.881935


Test results 

 {'eval_loss': 1.225711464881897, 'eval_runtime': 1.4886, 'eval_samples_per_second': 33.589, 'eval_steps_per_second': 33.589, 'epoch': 5.0} 




In [4]:
count = 0

# Test the model
for i in range(len(test_dataset)):
    context = test_dataset[i]['context']
    question = test_dataset[i]['question']
    answer = test_dataset[i]['final_decision']
    prediction = predict_answer(context, question)

    print(f"Context: {context}")
    print()
    print(f"Question: {question}")
    print()
    print(f"Predicted answer: {prediction}")
    print()
    print(f"Actual answer: {answer}")
    print("="*120)

    if prediction == answer:
        count += 1

print(f"Accuracy: {count/len(test_dataset)}")

Context: {'contexts': ['Several studies have shown associations between hyperglycemia and risk of cardiovascular disease (CVD) and mortality, yet glucose-lowering treatment does little to mitigate this risk. We examined whether associations between hyperglycemia and CVD risk were explained by underlying insulin resistance.', 'In 60 middle-aged individuals without diabetes we studied the associations of fasting plasma glucose, 2-hour post oral glucose tolerance test plasma glucose, insulin sensitivity as well as body fat percentage with CVD risk. Insulin sensitivity was measured as the glucose infusion rate during a euglycemic hyperinsulinemic clamp, body fat percentage was measured by dual X-ray absorptiometry, and CVD risk was estimated using the Framingham risk score. Associations of fasting plasma glucose, 2-hour plasma glucose, insulin sensitivity and body fat percentage with the Framingham risk score were assessed in linear regression models.', 'Both fasting and 2-hour plasma gluc

### **Trial 2**

In [5]:
# Parameters
dataset_size = 200
train_size = 100
validate_size = 50
test_size = 50
num_epochs = 5
learning_rate = 2e-6
weight_decay = 0.01

# Get the dataset
dataset = load_dataset("qiaojin/PubMedQA", "pqa_labeled")
small_dataset = dataset['train'].select(range(dataset_size))

# Tokenize the dataset
tokenizer = AutoTokenizer.from_pretrained("bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12")
tokenized_input = small_dataset.map(preprocess, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Split the dataset
train_dataset, validate_dataset, test_dataset = torch.utils.data.random_split(tokenized_input, [train_size, validate_size, test_size])

# Train the model
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=num_epochs,
    eval_strategy="epoch",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validate_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

trainer.train()

# Evaluate the model
results = trainer.evaluate(test_dataset)
print("Test results", "\n\n", results, "\n\n")

Epoch,Training Loss,Validation Loss
1,No log,1.213558
2,No log,1.287831
3,No log,1.317917
4,No log,1.359372
5,1.243000,1.340268


Test results 

 {'eval_loss': 1.0522352457046509, 'eval_runtime': 1.5195, 'eval_samples_per_second': 32.906, 'eval_steps_per_second': 32.906, 'epoch': 5.0} 




In [6]:
count = 0

# Test the model
for i in range(len(test_dataset)):
    context = test_dataset[i]['context']
    question = test_dataset[i]['question']
    answer = test_dataset[i]['final_decision']
    prediction = predict_answer(context, question)

    print(f"Context: {context}")
    print()
    print(f"Question: {question}")
    print()
    print(f"Predicted answer: {prediction}")
    print()
    print(f"Actual answer: {answer}")
    print("="*120)

    if prediction == answer:
        count += 1

print(f"Accuracy: {count/len(test_dataset)}")

Context: {'contexts': ['Cyclical pamidronate therapy in a 2-year-old child with skeletal fragility resulted in remodelling of vertebral fractures and improvement in bone mineral density (BMD) at distal radial and spinal sites. The BMD at both sites decreased precipitously within 24 months of stopping treatment, raising the question as to whether bisphosphonates can be stopped in a growing child with skeletal fragility.', 'At age 23 months, a male toddler sustained a low trauma fracture of his right femur. Skeletal radiographs revealed generalised osteopenia with multiple vertebral body fractures. He was diagnosed with type IV osteogenesis imperfecta; however, no mutations were found in COL1A1 or COL1A2 genes.', 'This case report presents bone densitometry data before, during and after bisphosphonate treatment. Axial QCT was main outcome from 2 years of age; DXA and pQCT were taken after age 5.', 'QCT confirmed that he had low spinal trabecular volumetric BMD (Z-score -2.4). After 4 yea

### **Trial 3**

In [7]:
# Parameters
dataset_size = 200
train_size = 100
validate_size = 50
test_size = 50
num_epochs = 5
learning_rate = 2e-6
weight_decay = 0.01

# Get the dataset
dataset = load_dataset("qiaojin/PubMedQA", "pqa_labeled")
small_dataset = dataset['train'].select(range(dataset_size))

# Tokenize the dataset
tokenizer = AutoTokenizer.from_pretrained("bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12")
tokenized_input = small_dataset.map(preprocess, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Split the dataset
train_dataset, validate_dataset, test_dataset = torch.utils.data.random_split(tokenized_input, [train_size, validate_size, test_size])

# Train the model
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=num_epochs,
    eval_strategy="epoch",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validate_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

trainer.train()

# Evaluate the model
results = trainer.evaluate(test_dataset)
print("Test results", "\n\n", results, "\n\n")

Epoch,Training Loss,Validation Loss
1,No log,1.416233
2,No log,1.354824
3,No log,1.313713
4,No log,1.261153
5,1.254000,1.20351


Test results 

 {'eval_loss': 1.0247358083724976, 'eval_runtime': 1.5272, 'eval_samples_per_second': 32.74, 'eval_steps_per_second': 32.74, 'epoch': 5.0} 




In [8]:
count = 0

# Test the model
for i in range(len(test_dataset)):
    context = test_dataset[i]['context']
    question = test_dataset[i]['question']
    answer = test_dataset[i]['final_decision']
    prediction = predict_answer(context, question)

    print(f"Context: {context}")
    print()
    print(f"Question: {question}")
    print()
    print(f"Predicted answer: {prediction}")
    print()
    print(f"Actual answer: {answer}")
    print("="*120)

    if prediction == answer:
        count += 1

print(f"Accuracy: {count/len(test_dataset)}")

Context: {'contexts': ['Cyclical pamidronate therapy in a 2-year-old child with skeletal fragility resulted in remodelling of vertebral fractures and improvement in bone mineral density (BMD) at distal radial and spinal sites. The BMD at both sites decreased precipitously within 24 months of stopping treatment, raising the question as to whether bisphosphonates can be stopped in a growing child with skeletal fragility.', 'At age 23 months, a male toddler sustained a low trauma fracture of his right femur. Skeletal radiographs revealed generalised osteopenia with multiple vertebral body fractures. He was diagnosed with type IV osteogenesis imperfecta; however, no mutations were found in COL1A1 or COL1A2 genes.', 'This case report presents bone densitometry data before, during and after bisphosphonate treatment. Axial QCT was main outcome from 2 years of age; DXA and pQCT were taken after age 5.', 'QCT confirmed that he had low spinal trabecular volumetric BMD (Z-score -2.4). After 4 yea

## **Experiment 2: 300 Samples, 5 epochs, and 200/50/50 split**

### **Trial 1**

In [9]:
# Parameters
dataset_size = 300
train_size = 200
validate_size = 50
test_size = 50
num_epochs = 5
learning_rate = 2e-6
weight_decay = 0.01

# Get the dataset
dataset = load_dataset("qiaojin/PubMedQA", "pqa_labeled")
small_dataset = dataset['train'].select(range(dataset_size))

# Tokenize the dataset
tokenizer = AutoTokenizer.from_pretrained("bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12")
tokenized_input = small_dataset.map(preprocess, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Split the dataset
train_dataset, validate_dataset, test_dataset = torch.utils.data.random_split(tokenized_input, [train_size, validate_size, test_size])

# Train the model
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=num_epochs,
    eval_strategy="epoch",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validate_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

trainer.train()

# Evaluate the model
results = trainer.evaluate(test_dataset)
print("Test results", "\n\n", results, "\n\n")

Epoch,Training Loss,Validation Loss
1,No log,1.278826
2,No log,1.108839
3,1.351000,0.967518
4,1.351000,0.977706
5,1.143700,0.962204


Test results 

 {'eval_loss': 0.8572644591331482, 'eval_runtime': 1.4765, 'eval_samples_per_second': 33.863, 'eval_steps_per_second': 33.863, 'epoch': 5.0} 




In [10]:
count = 0

# Test the model
for i in range(len(test_dataset)):
    context = test_dataset[i]['context']
    question = test_dataset[i]['question']
    answer = test_dataset[i]['final_decision']
    prediction = predict_answer(context, question)

    print(f"Context: {context}")
    print()
    print(f"Question: {question}")
    print()
    print(f"Predicted answer: {prediction}")
    print()
    print(f"Actual answer: {answer}")
    print("="*120)

    if prediction == answer:
        count += 1

print(f"Accuracy: {count/len(test_dataset)}")

Context: {'contexts': ['The present study aims to compare strength, healing, and operation time of experimental intestinal anastomoses performed by polyglactin 910 (Vicryl; Ethicon, Edinburgh, United Kingdom) sutures with ethyl-2-cyanoacrylate glue (Pattex; Henkel, Dusseldorf, Germany).', "Ninety-six Sprague-Dawley rats were divided into 2 (groups E and L). Each group was further subdivided into 6 subgroups (EA1, EA2, EA3, EB1, EB2, EB3, LA1, LA2, LA3, LB1, LB2, LB3), each containing 8 rats. Intestinal anastomosis was performed by polyglactin 910 sutures in A subgroups and with ethyl-2-cyanoacrylate in B subgroups. The anastomosis was end to end in A1 and B1, side to side in A2 and B2, and end to side in A3 and B3. Time for anastomosis performance (AT) was recorded. In group E, bursting pressures and hydroxyproline levels were determined on the second postoperative day, whereas in group L, the same measurements were made on the sixth postoperative day. One-way analysis of variance was 

### **Trial 2**

In [11]:
# Parameters
dataset_size = 300
train_size = 200
validate_size = 50
test_size = 50
num_epochs = 5
learning_rate = 2e-6
weight_decay = 0.01

# Get the dataset
dataset = load_dataset("qiaojin/PubMedQA", "pqa_labeled")
small_dataset = dataset['train'].select(range(dataset_size))

# Tokenize the dataset
tokenizer = AutoTokenizer.from_pretrained("bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12")
tokenized_input = small_dataset.map(preprocess, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Split the dataset
train_dataset, validate_dataset, test_dataset = torch.utils.data.random_split(tokenized_input, [train_size, validate_size, test_size])

# Train the model
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=num_epochs,
    eval_strategy="epoch",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validate_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

trainer.train()

# Evaluate the model
results = trainer.evaluate(test_dataset)
print("Test results", "\n\n", results, "\n\n")

Epoch,Training Loss,Validation Loss
1,No log,1.29302
2,No log,0.909719
3,0.962500,0.940334
4,0.962500,0.920811
5,0.726300,0.927049


Test results 

 {'eval_loss': 0.8304675221443176, 'eval_runtime': 1.4869, 'eval_samples_per_second': 33.628, 'eval_steps_per_second': 33.628, 'epoch': 5.0} 




In [12]:
count = 0

# Test the model
for i in range(len(test_dataset)):
    context = test_dataset[i]['context']
    question = test_dataset[i]['question']
    answer = test_dataset[i]['final_decision']
    prediction = predict_answer(context, question)

    print(f"Context: {context}")
    print()
    print(f"Question: {question}")
    print()
    print(f"Predicted answer: {prediction}")
    print()
    print(f"Actual answer: {answer}")
    print("="*120)

    if prediction == answer:
        count += 1

print(f"Accuracy: {count/len(test_dataset)}")

Context: {'contexts': ['The present study aims to compare strength, healing, and operation time of experimental intestinal anastomoses performed by polyglactin 910 (Vicryl; Ethicon, Edinburgh, United Kingdom) sutures with ethyl-2-cyanoacrylate glue (Pattex; Henkel, Dusseldorf, Germany).', "Ninety-six Sprague-Dawley rats were divided into 2 (groups E and L). Each group was further subdivided into 6 subgroups (EA1, EA2, EA3, EB1, EB2, EB3, LA1, LA2, LA3, LB1, LB2, LB3), each containing 8 rats. Intestinal anastomosis was performed by polyglactin 910 sutures in A subgroups and with ethyl-2-cyanoacrylate in B subgroups. The anastomosis was end to end in A1 and B1, side to side in A2 and B2, and end to side in A3 and B3. Time for anastomosis performance (AT) was recorded. In group E, bursting pressures and hydroxyproline levels were determined on the second postoperative day, whereas in group L, the same measurements were made on the sixth postoperative day. One-way analysis of variance was 

### **Trial 3**

In [13]:
# Parameters
dataset_size = 300
train_size = 200
validate_size = 50
test_size = 50
num_epochs = 5
learning_rate = 2e-6
weight_decay = 0.01

# Get the dataset
dataset = load_dataset("qiaojin/PubMedQA", "pqa_labeled")
small_dataset = dataset['train'].select(range(dataset_size))

# Tokenize the dataset
tokenizer = AutoTokenizer.from_pretrained("bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12")
tokenized_input = small_dataset.map(preprocess, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Split the dataset
train_dataset, validate_dataset, test_dataset = torch.utils.data.random_split(tokenized_input, [train_size, validate_size, test_size])

# Train the model
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=num_epochs,
    eval_strategy="epoch",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validate_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

trainer.train()

# Evaluate the model
results = trainer.evaluate(test_dataset)
print("Test results", "\n\n", results, "\n\n")

Epoch,Training Loss,Validation Loss
1,No log,1.217747
2,No log,1.191775
3,0.663600,1.252684
4,0.663600,1.330103
5,0.511600,1.351123


Test results 

 {'eval_loss': 1.1068997383117676, 'eval_runtime': 1.4864, 'eval_samples_per_second': 33.638, 'eval_steps_per_second': 33.638, 'epoch': 5.0} 




In [14]:
count = 0

# Test the model
for i in range(len(test_dataset)):
    context = test_dataset[i]['context']
    question = test_dataset[i]['question']
    answer = test_dataset[i]['final_decision']
    prediction = predict_answer(context, question)

    print(f"Context: {context}")
    print()
    print(f"Question: {question}")
    print()
    print(f"Predicted answer: {prediction}")
    print()
    print(f"Actual answer: {answer}")
    print("="*120)

    if prediction == answer:
        count += 1

print(f"Accuracy: {count/len(test_dataset)}")

Context: {'contexts': ['The present study aims to compare strength, healing, and operation time of experimental intestinal anastomoses performed by polyglactin 910 (Vicryl; Ethicon, Edinburgh, United Kingdom) sutures with ethyl-2-cyanoacrylate glue (Pattex; Henkel, Dusseldorf, Germany).', "Ninety-six Sprague-Dawley rats were divided into 2 (groups E and L). Each group was further subdivided into 6 subgroups (EA1, EA2, EA3, EB1, EB2, EB3, LA1, LA2, LA3, LB1, LB2, LB3), each containing 8 rats. Intestinal anastomosis was performed by polyglactin 910 sutures in A subgroups and with ethyl-2-cyanoacrylate in B subgroups. The anastomosis was end to end in A1 and B1, side to side in A2 and B2, and end to side in A3 and B3. Time for anastomosis performance (AT) was recorded. In group E, bursting pressures and hydroxyproline levels were determined on the second postoperative day, whereas in group L, the same measurements were made on the sixth postoperative day. One-way analysis of variance was 

## **Experiment 3: 400 Samples, 5 epochs, and 300/50/50 split**

### **Trial 1**

In [15]:
# Parameters
dataset_size = 400
train_size = 300
validate_size = 50
test_size = 50
num_epochs = 5
learning_rate = 2e-6
weight_decay = 0.01

# Get the dataset
dataset = load_dataset("qiaojin/PubMedQA", "pqa_labeled")
small_dataset = dataset['train'].select(range(dataset_size))

# Tokenize the dataset
tokenizer = AutoTokenizer.from_pretrained("bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12")
tokenized_input = small_dataset.map(preprocess, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Split the dataset
train_dataset, validate_dataset, test_dataset = torch.utils.data.random_split(tokenized_input, [train_size, validate_size, test_size])

# Train the model
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=num_epochs,
    eval_strategy="epoch",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validate_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

trainer.train()

# Evaluate the model
results = trainer.evaluate(test_dataset)
print("Test results", "\n\n", results, "\n\n")

Map: 100%|██████████| 400/400 [00:01<00:00, 292.95 examples/s]


Epoch,Training Loss,Validation Loss
1,No log,0.585723
2,1.120000,0.626665
3,1.120000,0.656305
4,0.928300,0.708095
5,0.842700,0.712659


Test results 

 {'eval_loss': 0.698645830154419, 'eval_runtime': 1.4932, 'eval_samples_per_second': 33.485, 'eval_steps_per_second': 33.485, 'epoch': 5.0} 




In [16]:
count = 0

# Test the model
for i in range(len(test_dataset)):
    context = test_dataset[i]['context']
    question = test_dataset[i]['question']
    answer = test_dataset[i]['final_decision']
    prediction = predict_answer(context, question)

    print(f"Context: {context}")
    print()
    print(f"Question: {question}")
    print()
    print(f"Predicted answer: {prediction}")
    print()
    print(f"Actual answer: {answer}")
    print("="*120)

    if prediction == answer:
        count += 1

print(f"Accuracy: {count/len(test_dataset)}")

Context: {'contexts': ['Recent evaluations of IT innovations in primary care have highlighted variations between centres and practices in uptake and use. We evaluated whether structural characteristics of a general practice were associated with variations in use of a web-based clinical information system underpinning a Managed Clinical Network in diabetes, between the years 2001 and 2003.', 'Using a computerised audit trail, we calculated the numbers of web-based operations that occurred in each practice, stratified by staff type and year, and adjusted for the numbers of registered diabetic patients. In regression analyses, we determined whether total use was associated with structural characteristics of the practice (total list size, training status, numbers of GPs (general practitioners), mean age of the GPs, numbers of female GPs, level of deprivation of the population and whether staff had received advanced training in diabetes care).', 'Initially there were a few practices which m

### **Trial 2**

In [17]:
# Parameters
dataset_size = 400
train_size = 300
validate_size = 50
test_size = 50
num_epochs = 5
learning_rate = 2e-6
weight_decay = 0.01

# Get the dataset
dataset = load_dataset("qiaojin/PubMedQA", "pqa_labeled")
small_dataset = dataset['train'].select(range(dataset_size))

# Tokenize the dataset
tokenizer = AutoTokenizer.from_pretrained("bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12")
tokenized_input = small_dataset.map(preprocess, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Split the dataset
train_dataset, validate_dataset, test_dataset = torch.utils.data.random_split(tokenized_input, [train_size, validate_size, test_size])

# Train the model
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=num_epochs,
    eval_strategy="epoch",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validate_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

trainer.train()

# Evaluate the model
results = trainer.evaluate(test_dataset)
print("Test results", "\n\n", results, "\n\n")

Epoch,Training Loss,Validation Loss
1,No log,0.696023
2,0.792000,0.75445
3,0.792000,0.999719
4,0.555700,0.980979
5,0.498600,0.998645


Test results 

 {'eval_loss': 0.666733980178833, 'eval_runtime': 1.4861, 'eval_samples_per_second': 33.645, 'eval_steps_per_second': 33.645, 'epoch': 5.0} 




In [18]:
count = 0

# Test the model
for i in range(len(test_dataset)):
    context = test_dataset[i]['context']
    question = test_dataset[i]['question']
    answer = test_dataset[i]['final_decision']
    prediction = predict_answer(context, question)

    print(f"Context: {context}")
    print()
    print(f"Question: {question}")
    print()
    print(f"Predicted answer: {prediction}")
    print()
    print(f"Actual answer: {answer}")
    print("="*120)

    if prediction == answer:
        count += 1

print(f"Accuracy: {count/len(test_dataset)}")

Context: {'contexts': ['Recent evaluations of IT innovations in primary care have highlighted variations between centres and practices in uptake and use. We evaluated whether structural characteristics of a general practice were associated with variations in use of a web-based clinical information system underpinning a Managed Clinical Network in diabetes, between the years 2001 and 2003.', 'Using a computerised audit trail, we calculated the numbers of web-based operations that occurred in each practice, stratified by staff type and year, and adjusted for the numbers of registered diabetic patients. In regression analyses, we determined whether total use was associated with structural characteristics of the practice (total list size, training status, numbers of GPs (general practitioners), mean age of the GPs, numbers of female GPs, level of deprivation of the population and whether staff had received advanced training in diabetes care).', 'Initially there were a few practices which m

### **Trial 3**

In [19]:
# Parameters
dataset_size = 400
train_size = 300
validate_size = 50
test_size = 50
num_epochs = 5
learning_rate = 2e-6
weight_decay = 0.01

# Get the dataset
dataset = load_dataset("qiaojin/PubMedQA", "pqa_labeled")
small_dataset = dataset['train'].select(range(dataset_size))

# Tokenize the dataset
tokenizer = AutoTokenizer.from_pretrained("bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12")
tokenized_input = small_dataset.map(preprocess, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Split the dataset
train_dataset, validate_dataset, test_dataset = torch.utils.data.random_split(tokenized_input, [train_size, validate_size, test_size])

# Train the model
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=num_epochs,
    eval_strategy="epoch",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validate_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

trainer.train()

# Evaluate the model
results = trainer.evaluate(test_dataset)
print("Test results", "\n\n", results, "\n\n")

Epoch,Training Loss,Validation Loss
1,No log,1.206678
2,0.484000,1.345641
3,0.484000,1.472156
4,0.236200,1.507907
5,0.191700,1.547962


Test results 

 {'eval_loss': 0.9070796370506287, 'eval_runtime': 1.4828, 'eval_samples_per_second': 33.72, 'eval_steps_per_second': 33.72, 'epoch': 5.0} 




In [20]:
count = 0

# Test the model
for i in range(len(test_dataset)):
    context = test_dataset[i]['context']
    question = test_dataset[i]['question']
    answer = test_dataset[i]['final_decision']
    prediction = predict_answer(context, question)

    print(f"Context: {context}")
    print()
    print(f"Question: {question}")
    print()
    print(f"Predicted answer: {prediction}")
    print()
    print(f"Actual answer: {answer}")
    print("="*120)

    if prediction == answer:
        count += 1

print(f"Accuracy: {count/len(test_dataset)}")

Context: {'contexts': ['Recent evaluations of IT innovations in primary care have highlighted variations between centres and practices in uptake and use. We evaluated whether structural characteristics of a general practice were associated with variations in use of a web-based clinical information system underpinning a Managed Clinical Network in diabetes, between the years 2001 and 2003.', 'Using a computerised audit trail, we calculated the numbers of web-based operations that occurred in each practice, stratified by staff type and year, and adjusted for the numbers of registered diabetic patients. In regression analyses, we determined whether total use was associated with structural characteristics of the practice (total list size, training status, numbers of GPs (general practitioners), mean age of the GPs, numbers of female GPs, level of deprivation of the population and whether staff had received advanced training in diabetes care).', 'Initially there were a few practices which m