<a href="https://colab.research.google.com/github/abd-ur/1.fine-tune_BioBert/blob/main/Fine_Tune_BioBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

DATA PREP

In [29]:
import json
from collections import Counter
import random
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

data_var = "/content/variant.json"   # path to your data file
model = "dmis-lab/biobert-base-cased-v1.1"
length = 128
seed = 42

random.seed(seed)

# load variants
with open(data_var, "r", encoding="utf-8") as f:
    data = json.load(f)

print('Loaded',len(data),'records.')

# create training pairs
pairs = []
for item in data:
    variant = item.get("variant", "").strip()
    cancer = item.get("cancer_type", "").strip()
    interp = item.get("interpretation", "").strip()
    source = item.get("source", "").strip()

    if not variant or not cancer or not interp:
        continue  # skip incomplete entries

    input_text = f"Query: {variant} in {cancer}?"
    label_text = f"{interp} [{source}]" if source else interp

    pairs.append({
        "query": input_text,
        "label": label_text,
        "cancer_type": cancer
    })
print("Example pair:", pairs[0])

# data split with cancer_type balance
all_indices = list(range(len(pairs)))
cancer_types = [ex["cancer_type"] for ex in pairs]

train_idx, temp_idx = train_test_split(
    all_indices, test_size=0.30,
    random_state=seed,
    stratify=cancer_types)

temp_cancers = [cancer_types[i] for i in temp_idx]
val_idx, test_idx = train_test_split(
    temp_idx, test_size=0.5,
    random_state=seed,)
# not stratifying due to odd number of variants(21), val set has one extra variant(11) than test set(10)

# label encoding
unique_labels = sorted(set(ex["label"] for ex in pairs))
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

print(f"Total unique labels: {len(unique_labels)}")
print("Sample mapping:", list(label2id.items())[:5])

# encode labels as integers
for ex in pairs:
    ex["label_id"] = label2id[ex["label"]]

def select_examples(indices):
    return {
        "query": [ex["query"] for i, ex in enumerate(pairs) if i in indices],
        "label_id": [ex["label_id"] for i, ex in enumerate(pairs) if i in indices],
        "cancer_type": [ex["cancer_type"] for i, ex in enumerate(pairs) if i in indices],
    }

train_ds = Dataset.from_dict(select_examples(train_idx))
val_ds   = Dataset.from_dict(select_examples(val_idx))
test_ds  = Dataset.from_dict(select_examples(test_idx))

dataset = DatasetDict({
    "train": train_ds,
    "validation": val_ds,
    "test": test_ds
})


print(dataset)

# BioBert tokenizer
tokenizer = AutoTokenizer.from_pretrained(model)

def tokenize_fn(batch):
    return tokenizer(
        batch["query"],
        truncation=True,
        padding="max_length",
        max_length=length
    )

tokenized_ds = dataset.map(tokenize_fn, batched=True)

# Remove unused columns
tokenized_ds = tokenized_ds.remove_columns(["query", "cancer_type"])
tokenized_ds = tokenized_ds.rename_column("label_id", "labels")

# set format for PyTorch
tokenized_ds.set_format("torch")

print("\nTokenized dataset:", tokenized_ds)
print("\nSample tokenized entry:")
sample = tokenized_ds["train"][0]
print("input_ids:", sample["input_ids"][:10])
print("attention_mask:", sample["attention_mask"][:10])
print("labels:", sample["labels"])


# test
print("\nTokenized dataset:")
print(tokenized_ds)

# show random sample
sample = tokenized_ds["train"][0]
print("\nSample tokenized entry:")
print("input_ids:", sample["input_ids"][:20])
print("attention_mask:", sample["attention_mask"][:20])
print("label:", sample["labels"])

# Optional: label distribution
from collections import Counter
label_counts = Counter(dataset["train"]["label_id"])
print("\nLabel distribution (train):", label_counts)


Loaded 70 records.
Example pair: {'query': 'Query: TP53 p.R248W in breast?', 'label': 'Pathogenic; Level 1 evidence for olaparib. [OncoKB]', 'cancer_type': 'breast'}
Total unique labels: 30
Sample mapping: [('Benign; No actionable therapy. [COSMIC]', 0), ('Benign; No actionable therapy. [ClinVar]', 1), ('Benign; No actionable therapy. [OncoKB]', 2), ('Founder mutation; Genetic counseling required. [COSMIC]', 3), ('Founder mutation; Genetic counseling required. [ClinVar]', 4)]
DatasetDict({
    train: Dataset({
        features: ['query', 'label_id', 'cancer_type'],
        num_rows: 49
    })
    validation: Dataset({
        features: ['query', 'label_id', 'cancer_type'],
        num_rows: 10
    })
    test: Dataset({
        features: ['query', 'label_id', 'cancer_type'],
        num_rows: 11
    })
})


Map:   0%|          | 0/49 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]


Tokenized dataset: DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 49
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 11
    })
})

Sample tokenized entry:
input_ids: tensor([  101, 15027,  1616,   131,   189,  1643, 24239,   185,   119,   187])
attention_mask: tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
labels: tensor(20)

Tokenized dataset:
DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 49
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_

FINE-TUNING BIOBERT


In [None]:
!pip install transformers datasets accelerate bitsandbytes peft wandb


In [30]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    model,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
from transformers import TrainingArguments
import numpy as np
import evaluate

# training args
training_args = TrainingArguments(
    output_dir="./results",
    do_train=True,
    do_eval=True,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,)
#excluded 'save_strategy' and 'evaluation_strategy' due to transformer version conflict

# evaluation
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


In [32]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,)

trainer.train()


  trainer = Trainer(


Step,Training Loss


TrainOutput(global_step=12, training_loss=3.3746283849080405, metrics={'train_runtime': 12.4391, 'train_samples_per_second': 11.818, 'train_steps_per_second': 0.965, 'total_flos': 9671762161152.0, 'train_loss': 3.3746283849080405, 'epoch': 3.0})

In [33]:
results = trainer.evaluate(tokenized_ds["test"])
print(results)

trainer.save_model("./biobert_variant_classifier")
tokenizer.save_pretrained("./biobert_variant_classifier")


{'eval_loss': 3.250396728515625, 'eval_accuracy': 0.09090909090909091, 'eval_runtime': 0.1155, 'eval_samples_per_second': 95.217, 'eval_steps_per_second': 8.656, 'epoch': 3.0}


('./biobert_variant_classifier/tokenizer_config.json',
 './biobert_variant_classifier/special_tokens_map.json',
 './biobert_variant_classifier/vocab.txt',
 './biobert_variant_classifier/added_tokens.json',
 './biobert_variant_classifier/tokenizer.json')

Testing LoRA

In [34]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
import torch

# load model
base_model = "dmis-lab/biobert-base-cased-v1.1"
num_labels = len(set(tokenized_ds["train"]["labels"]))

model = AutoModelForSequenceClassification.from_pretrained(
    base_model,
    num_labels=num_labels
)

# LoRA config
lora_config = LoraConfig(
    r=8,                      # rank of the LoRA matrices (tradeoff between capacity and speed)
    lora_alpha=16,            # scaling factor
    target_modules=["query", "value"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS"       # sequence classification task
)

# apply LoRA
model = get_peft_model(model, lora_config)

print("Trainable parameters after applying LoRA:")
model.print_trainable_parameters()

# training arguments
training_args_lora = TrainingArguments(
    output_dir="./results",
    do_train=True,
    do_eval=True,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,)

# define accuracy
import evaluate
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.argmax(torch.tensor(logits), dim=-1)
    return metric.compute(predictions=preds, references=labels)

# trainer
trainer_lora = Trainer(
    model=model,
    args=training_args_lora,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    compute_metrics=compute_metrics,
)

trainer_lora.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainable parameters after applying LoRA:
trainable params: 332,593 || all params: 108,680,546 || trainable%: 0.3060


Step,Training Loss


TrainOutput(global_step=12, training_loss=4.094145774841309, metrics={'train_runtime': 2.8608, 'train_samples_per_second': 51.385, 'train_steps_per_second': 4.195, 'total_flos': 9710960103936.0, 'train_loss': 4.094145774841309, 'epoch': 3.0})

In [35]:
results = trainer_lora.evaluate(tokenized_ds["test"])
print(results)

trainer.save_model("./biobert_variant_classifier")
tokenizer.save_pretrained("./biobert_variant_classifier")


{'eval_loss': 3.808743476867676, 'eval_accuracy': 0.09090909090909091, 'eval_runtime': 0.1211, 'eval_samples_per_second': 90.856, 'eval_steps_per_second': 8.26, 'epoch': 3.0}


('./biobert_variant_classifier/tokenizer_config.json',
 './biobert_variant_classifier/special_tokens_map.json',
 './biobert_variant_classifier/vocab.txt',
 './biobert_variant_classifier/added_tokens.json',
 './biobert_variant_classifier/tokenizer.json')

HfHubHTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-690ca52f-043562a27a25ad742ae47d24;4522c847-ee39-4364-9a88-0b4d5e0098bf)

Invalid username or password.

In [41]:
from huggingface_hub import login

login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [42]:
trainer.push_to_hub("biobert-variant-lora")


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...results/model.safetensors:   0%|          |  564kB /  433MB            

  ...results/training_args.bin:   2%|1         |   103B / 5.78kB            

CommitInfo(commit_url='https://huggingface.co/abd-ur/results/commit/733f0d14a98874803e556aad01d356ddf92b5df7', commit_message='biobert-variant-lora', commit_description='', oid='733f0d14a98874803e556aad01d356ddf92b5df7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/abd-ur/results', endpoint='https://huggingface.co', repo_type='model', repo_id='abd-ur/results'), pr_revision=None, pr_num=None)