# **BlueBERT**

In [1]:
# Load model directly
from transformers import TrainingArguments, Trainer, AutoTokenizer, DataCollatorForLanguageModeling, AutoModelForSequenceClassification, DataCollatorWithPadding
from datasets import load_dataset
import pandas as pd
import torch

model = AutoModelForSequenceClassification.from_pretrained("bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12", num_labels=3)
print(model)

# Move model to the available device (cuda or cpu)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
model.to(device)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [2]:
def parse_context(context):
    result = ""
    # Combine all of the sentences of a context into one string
    for sentence in context['contexts']:
        result += sentence + " "
    
    return result

def parse_all_contexts(context_list):
    end = []
    # Go through all of the contexts in a dataset
    for context in context_list:
        result = parse_context(context)
        end.append(result)
    
    return end

def preprocess(data):
    questions = data['question']
    context = data['context']
    final_decision = data['final_decision']

    # Combine the question, context, and long answer
    text = [f"Context: {c} Question: {q}" for c, q in zip(context, questions)]
    token_input = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=512)

    # Labels
    label_mapping = {"yes": 0, "no": 1, "maybe": 2}
    labels = [label_mapping[l] for l in final_decision]

    # Add the labels to the token_input
    token_input['labels'] = torch.tensor(labels, dtype=torch.long)

    return token_input

def predict_answer(context, question):
    text = f"Context: {context} Question: {question}"
    token_input = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=512).to(device)

    # Generate model output (logits)
    with torch.no_grad():
        outputs = model(**token_input)

        # Extract the logits from the model output
        logits = outputs.logits

        # Get the predicted class (use argmax to get the index of the highest score)
        predicted_class = torch.argmax(logits, dim=-1).item()

    # Map the index to the label
    label_mapping = {0: "yes", 1: "no", 2: "maybe"}
    prediction = label_mapping[predicted_class]

    return prediction

## **100 Samples, 3 epochs, and 60/20/20 split**

In [3]:
# Parameters
dataset_size = 100
train_size = 60
validate_size = 20
test_size = 20
num_epochs = 3
learning_rate = 2e-6
weight_decay = 0.01

# Get the dataset
dataset = load_dataset("qiaojin/PubMedQA", "pqa_labeled")
small_dataset = dataset['train'].select(range(dataset_size))

# Tokenize the dataset
tokenizer = AutoTokenizer.from_pretrained("bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12")
tokenized_input = small_dataset.map(preprocess, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Split the dataset
train_dataset, validate_dataset, test_dataset = torch.utils.data.random_split(tokenized_input, [train_size, validate_size, test_size])

# Train the model
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=num_epochs,
    eval_strategy="epoch",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validate_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

trainer.train()

# Evaluate the model
results = trainer.evaluate(test_dataset)
print("Test results", "\n\n", results, "\n\n")

Map: 100%|██████████| 100/100 [00:00<00:00, 811.26 examples/s]


Epoch,Training Loss,Validation Loss
1,No log,1.048082
2,No log,1.008482
3,No log,0.999772


Test results 

 {'eval_loss': 1.0407730340957642, 'eval_runtime': 0.5836, 'eval_samples_per_second': 34.267, 'eval_steps_per_second': 34.267, 'epoch': 3.0} 




In [4]:
count = 0

# Test the model
for i in range(len(test_dataset)):
    context = test_dataset[i]['context']
    question = test_dataset[i]['question']
    answer = test_dataset[i]['final_decision']
    prediction = predict_answer(context, question)

    print(f"Context: {context}")
    print()
    print(f"Question: {question}")
    print()
    print(f"Predicted answer: {prediction}")
    print()
    print(f"Actual answer: {answer}")
    print("="*120)

    if prediction == answer:
        count += 1

print(f"Accuracy: {count/len(test_dataset)}")

Context: {'contexts': ['Although the retroperitoneal approach has been the preferred choice for open urological procedures, retroperitoneoscopy is not the preferred approach for laparoscopy. This study aims to develop a training model for retroperitoneoscopy and to establish an experimental learning curve.', 'Fifteen piglets were operated on to develop a standard retroperitoneoscopic nephrectomy (RPN) training model. All procedures were performed with three ports. Intraoperative data (side, operative time, blood loss, peritoneal opening) were recorded. Animals were divided into groups A, the first eight, and B, the last seven cases. Data were statistically analyzed.', 'We performed fifteen RPNs. The operative time varied from 15 to 50 minutes (median 30 minutes). Blood loss varied from 5 to 100 mL (median 20 mL). We experienced five peritoneal openings; we had two surgical vascular complications managed laparoscopically. There was statistical difference between groups A and B for perit

## **300 Samples, 5 epochs, and 200/50/50 split**

In [7]:
# Parameters
dataset_size = 300
train_size = 200
validate_size = 50
test_size = 50
num_epochs = 5
learning_rate = 2e-6
weight_decay = 0.01

# Get the dataset
dataset = load_dataset("qiaojin/PubMedQA", "pqa_labeled")
small_dataset = dataset['train'].select(range(dataset_size))

# Tokenize the dataset
tokenizer = AutoTokenizer.from_pretrained("bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12")
tokenized_input = small_dataset.map(preprocess, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Split the dataset
train_dataset, validate_dataset, test_dataset = torch.utils.data.random_split(tokenized_input, [train_size, validate_size, test_size])

# Train the model
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=num_epochs,
    eval_strategy="epoch",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validate_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

trainer.train()

# Evaluate the model
results = trainer.evaluate(test_dataset)
print("Test results", "\n\n", results, "\n\n")

Epoch,Training Loss,Validation Loss
1,No log,1.365187
2,No log,1.404623
3,1.423500,1.369481
4,1.423500,1.38319
5,1.325400,1.352766


Test results 

 {'eval_loss': 0.9576215147972107, 'eval_runtime': 1.4809, 'eval_samples_per_second': 33.764, 'eval_steps_per_second': 33.764, 'epoch': 5.0} 




In [8]:
count = 0

# Test the model
for i in range(len(test_dataset)):
    context = test_dataset[i]['context']
    question = test_dataset[i]['question']
    answer = test_dataset[i]['final_decision']
    prediction = predict_answer(context, question)

    print(f"Context: {context}")
    print()
    print(f"Question: {question}")
    print()
    print(f"Predicted answer: {prediction}")
    print()
    print(f"Actual answer: {answer}")
    print("="*120)

    if prediction == answer:
        count += 1

print(f"Accuracy: {count/len(test_dataset)}")

Context: {'contexts': ['The present study aims to compare strength, healing, and operation time of experimental intestinal anastomoses performed by polyglactin 910 (Vicryl; Ethicon, Edinburgh, United Kingdom) sutures with ethyl-2-cyanoacrylate glue (Pattex; Henkel, Dusseldorf, Germany).', "Ninety-six Sprague-Dawley rats were divided into 2 (groups E and L). Each group was further subdivided into 6 subgroups (EA1, EA2, EA3, EB1, EB2, EB3, LA1, LA2, LA3, LB1, LB2, LB3), each containing 8 rats. Intestinal anastomosis was performed by polyglactin 910 sutures in A subgroups and with ethyl-2-cyanoacrylate in B subgroups. The anastomosis was end to end in A1 and B1, side to side in A2 and B2, and end to side in A3 and B3. Time for anastomosis performance (AT) was recorded. In group E, bursting pressures and hydroxyproline levels were determined on the second postoperative day, whereas in group L, the same measurements were made on the sixth postoperative day. One-way analysis of variance was 