<a href="https://colab.research.google.com/github/bijayabc/mBERT-finetuning/blob/main/lg_BERT_adapters.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages this notebook needs.
# NOTE(review): torchmetrics is imported further down but is not listed here;
# presumably it is pulled in as a dependency of lightning — confirm.
! pip install watermark transformers datasets lightning

In [None]:
# Load the watermark extension and print the versions of the key packages
# (plus the conda environment) for reproducibility.
%load_ext watermark
%watermark --conda -p torch,transformers,datasets,lightning

In [None]:
from datasets import load_dataset
import lightning as L
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

import numpy as np
import pandas as pd
import torch

# Load and clean the dataset

In [None]:
# Load the Nepali sentiment-analysis dataset by its repository id and show
# which splits and columns it contains.
dataset = load_dataset(path='IRIIS-RESEARCH/sentiment-Analysis-Nepali')
print(dataset)

In [None]:
# The provided 'test' split is used as-is for the final evaluation.
test_dataset_raw = dataset['test']

# Carve a validation set (20%, fixed seed for reproducibility) out of the
# provided 'train' split; the remaining 80% is used for training.
dataset_raw = dataset['train']
split = dataset_raw.train_test_split(test_size=0.2, seed=42)
train_dataset_raw, val_dataset_raw = split['train'], split['test']

Tokenize the dataset

In [None]:
# Load the tokenizer belonging to the multilingual cased BERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-multilingual-cased',
)

def tokenize_function(examples):
    """Tokenize one batch of examples with the module-level ``tokenizer``.

    Reads the 'sentences' column, turns missing entries (``None``) into empty
    strings and coerces everything else to ``str``, then returns the
    tokenizer output padded and truncated to exactly 128 tokens.
    """
    cleaned = ["" if text is None else str(text) for text in examples['sentences']]
    return tokenizer(cleaned, truncation=True, padding='max_length', max_length=128)

# Run the batched tokenizer over the train, validation and test splits
# (in that order), keeping each split as its own dataset.
train_dataset, val_dataset, test_dataset = (
    raw_split.map(tokenize_function, batched=True)
    for raw_split in (train_dataset_raw, val_dataset_raw, test_dataset_raw)
)

In [None]:
# Expose only the model inputs and the label column, as torch tensors.
# The loop variable is deliberately not named `dataset`: that name holds the
# DatasetDict loaded earlier, and rebinding it here would break re-running
# the cell that builds the train/validation/test splits from it.
for tokenized_split in [train_dataset, val_dataset, test_dataset]:
    tokenized_split.set_format('torch', columns=['input_ids', 'attention_mask', 'sentiment'])

Set up Dataloaders

In [None]:
from torch.utils.data import DataLoader, Dataset

import os

# Use up to 12 loader workers, but never more than the machine has CPU cores:
# asking for more workers than cores (e.g. on a 2-core hosted runtime) only
# adds process overhead and triggers a DataLoader warning.
num_workers = min(12, os.cpu_count() or 1)

# Only the training loader shuffles; all three use the same batch size.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=num_workers, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=64, num_workers=num_workers)
test_loader = DataLoader(test_dataset, batch_size=64, num_workers=num_workers)

# Sanity check: pull a single batch to confirm the expected keys are present.
# (Looping over the whole loader here would read the entire training set
# without doing anything with it.)
batch = next(iter(train_loader))
input_ids = batch['input_ids']
attention_mask = batch['attention_mask']
labels = batch['sentiment']
# feed these into your model
    # feed these into your model

In [None]:
# Print the number of examples in the train, validation and test splits.
# The loop variable is not named `dataset` so the DatasetDict bound to that
# name earlier in the notebook is left intact.
for tokenized_split in [train_dataset, val_dataset, test_dataset]:
    print(len(tokenized_split))

# Initialize the Model

In [None]:
# Load multilingual cased BERT with a two-label sequence-classification head.
model_name = 'bert-base-multilingual-cased'
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

Freeze all layers

In [None]:
# Turn off gradient tracking for every parameter currently in the model, so
# only modules added after this point are trainable.
# NOTE(review): this also freezes the classification head; if that head is
# freshly initialised when loading a base checkpoint, it stays at its random
# values during training — confirm this is intended.
for frozen_param in model.parameters():
    frozen_param.requires_grad_(False)

In [None]:
# Display the module tree (notebook cell output).
model

In [None]:
def count_parameters(model):
  """Return how many scalar parameters of ``model`` require gradients."""
  trainable_total = 0
  for tensor in model.parameters():
    if tensor.requires_grad:
      trainable_total += tensor.numel()
  return trainable_total

# Report the trainable-parameter count with thousands separators.
trainable_total = count_parameters(model)
print(f"Total trainable parameters: {trainable_total:,}")

Add Adapter Layers

In [None]:
import torch.nn as nn

class Adapter(nn.Module):
  """Bottleneck adapter: down-project, GELU, up-project, then add the input back.

  The up-projection is zero-initialised, so a freshly constructed adapter is
  an exact identity map. Inserting it into a pretrained network therefore
  leaves the network's outputs unchanged until training moves the adapter
  weights, instead of injecting random noise into frozen pretrained
  activations from the first step.

  Args:
    hidden_size: Size of the last dimension of the input (and of the output).
    bottleneck_size: Width of the intermediate bottleneck.
  """

  def __init__(self, hidden_size, bottleneck_size=32):
    super().__init__()
    self.down_proj = nn.Linear(hidden_size, bottleneck_size)
    self.activation = nn.GELU()
    self.up_proj = nn.Linear(bottleneck_size, hidden_size)

    # Start as identity: with a zero up-projection the residual branch
    # contributes nothing. Gradients still reach up_proj (its input is the
    # non-zero bottleneck activation), so training can move away from zero.
    nn.init.zeros_(self.up_proj.weight)
    nn.init.zeros_(self.up_proj.bias)

  def forward(self, x):
    """Return ``x`` plus the bottleneck transformation of ``x`` (same shape as ``x``)."""
    bottleneck = self.activation(self.down_proj(x))
    return x + self.up_proj(bottleneck)  # residual connection

In [None]:
bottleneck_size = 64  # hyperparameter: width of each adapter's bottleneck

# Add two adapters to every encoder layer: one after the attention output
# projection and one after the feed-forward output projection. Each `dense`
# is replaced by Sequential(dense, Adapter), so the adapter sees the
# projection's output before whatever the parent block applies next.
#
# The loop walks the encoder's own layer list instead of a hard-coded
# range(12), so it adapts to checkpoints of any depth.
for layer in model.bert.encoder.layer:
    for output_block in (layer.attention.output, layer.output):
        dense = output_block.dense
        if isinstance(dense, nn.Sequential):
            # Already wrapped (e.g. this cell was run twice): wrapping again
            # would stack a second adapter and fail on `.out_features`.
            continue
        output_block.dense = nn.Sequential(
            dense,
            Adapter(dense.out_features, bottleneck_size),
        )

In [None]:
# Display the module tree again; each wrapped `dense` now appears as a
# Sequential of the original layer followed by an Adapter.
model

In [None]:
# Trainable parameters after inserting the adapters, formatted with thousands
# separators like the earlier count so the two numbers are easy to compare.
print(f"Total number of trainable parameters: {count_parameters(model):,}")

# Finetuning

Wrap in LightningModule for training

In [None]:
import lightning as L
import torch
import torchmetrics

class CustomLightningModule(L.LightningModule):
  """Lightning training wrapper around a sequence-classification model.

  Batches are dicts with 'input_ids', 'attention_mask' and 'sentiment'
  (the label) entries. Validation and test accuracy are tracked with
  two-class torchmetrics accuracy objects.

  Args:
    model: Module called as ``model(input_ids, attention_mask=..., labels=...)``
      whose output exposes ``loss`` and ``logits``.
    learning_rate: Learning rate passed to AdamW.
    weight_decay: Weight-decay coefficient passed to AdamW.
  """

  def __init__(self, model, learning_rate=2e-4, weight_decay=0.01):
    super().__init__()
    self.model = model
    self.learning_rate = learning_rate
    self.weight_decay = weight_decay

    # Separate metric objects so validation and test statistics never mix.
    self.val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2)
    self.test_acc = torchmetrics.Accuracy(task='multiclass', num_classes=2)

  def forward(self, input_ids, attention_mask, labels=None):
    """Run the wrapped model; ``labels`` may be omitted for inference."""
    return self.model(input_ids, attention_mask=attention_mask, labels=labels)

  def _forward_batch(self, batch):
    """Run one labelled batch through the model and return its output."""
    return self(
        input_ids=batch["input_ids"],
        attention_mask=batch["attention_mask"],
        labels=batch["sentiment"],
    )

  def training_step(self, batch, batch_idx):
    """Log the epoch-level training loss and return it for backpropagation."""
    outputs = self._forward_batch(batch)
    self.log("train_loss", outputs.loss, on_step=False, on_epoch=True, prog_bar=True)
    return outputs.loss  # Lightning backpropagates the returned loss

  def validation_step(self, batch, batch_idx):
    """Log the epoch-level validation loss and accuracy."""
    outputs = self._forward_batch(batch)
    self.log("val_loss", outputs.loss, on_step=False, on_epoch=True, prog_bar=True)

    predicted_labels = torch.argmax(outputs.logits, dim=1)
    self.val_acc(predicted_labels, batch["sentiment"])
    self.log("val_acc", self.val_acc, on_step=False, on_epoch=True, prog_bar=True)

  def test_step(self, batch, batch_idx):
    """Log the epoch-level accuracy under the key 'accuracy'."""
    outputs = self._forward_batch(batch)
    predicted_labels = torch.argmax(outputs.logits, dim=1)
    self.test_acc(predicted_labels, batch["sentiment"])
    self.log("accuracy", self.test_acc, on_step=False, on_epoch=True, prog_bar=True)

  def configure_optimizers(self):
    """Build AdamW over the parameters that are actually trainable."""
    # Frozen parameters never receive gradients, so handing them to the
    # optimizer only bloats its parameter groups.
    trainable_params = [p for p in self.parameters() if p.requires_grad]
    return torch.optim.AdamW(
        trainable_params,
        lr=self.learning_rate,
        weight_decay=self.weight_decay,
    )

lightning_model = CustomLightningModule(model)

In [None]:
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping
from lightning.pytorch.loggers import CSVLogger

callbacks = []

# Keep only the single best checkpoint, ranked by validation accuracy.
callbacks.append(ModelCheckpoint(save_top_k=1, mode="max", monitor="val_acc"))

# Stop early once val_acc has failed to improve for 3 validation checks.
callbacks.append(EarlyStopping(monitor="val_acc", mode="max", patience=3))

# Metrics are written as CSV under logs/my-model.
logger = CSVLogger(save_dir='logs/', name='my-model')

In [None]:
# Mixed-precision training on GPU 0 for at most 20 epochs.
trainer = L.Trainer(
    # hardware
    accelerator="gpu",
    devices=[0],
    precision="16-mixed",
    # schedule / optimisation
    max_epochs=20,
    gradient_clip_val=1.0,
    # bookkeeping
    callbacks=callbacks,
    logger=logger,
    log_every_n_steps=10,
)

In [None]:
import time

# Time the training run with a monotonic clock: perf_counter() is unaffected
# by system clock adjustments, unlike time.time().
start = time.perf_counter()

trainer.fit(model=lightning_model,
            train_dataloaders=train_loader,
            val_dataloaders=val_loader)

end = time.perf_counter()
elapsed = end - start
print(f'time elapsed: {elapsed/60:.2f} min')

In [None]:
# Accuracy of the best checkpoint on the training split.
trainer.test(model=lightning_model, dataloaders=train_loader, ckpt_path="best")

In [None]:

# Accuracy of the best checkpoint on the validation split.
trainer.test(model=lightning_model, dataloaders=val_loader, ckpt_path="best")

In [None]:
# Accuracy of the best checkpoint on the held-out test split. The checkpoint
# is requested explicitly (as in the train/validation evaluations) so the
# reported number does not depend on which weights happen to be in memory,
# i.e. on the order in which the cells were run.
trainer.test(model=lightning_model, dataloaders=test_loader, ckpt_path="best")