# Finetuning a DistilBERT Classifier in Lightning

![](figures/finetuning-ii.png)

In [None]:
!pip install watermark



In [None]:
!pip install transformers



In [None]:
!pip install datasets



In [None]:
!pip install lightning



In [None]:
# Record the installed versions of the key packages (and the conda
# environment, if any) so the run can be reproduced later.
%load_ext watermark
%watermark --conda -p torch,transformers,datasets,lightning

torch       : 2.0.1+cu118
transformers: 4.30.2
datasets    : 2.13.1
lightning   : 2.0.5

conda environment: n/a



In [None]:
import os
import sys

from google.colab import drive

# Colab-only setup: mount Google Drive and work from the exercise folder, so
# that `local_dataset_utilities` can be imported and the relative CSV paths
# used below ("train.csv", "val.csv", "test.csv") resolve.
DRIVE_MOUNT_POINT = "/content/gdrive/"

# NOTE(review): "lighning" [sic] is presumably the real folder name on Drive;
# do not correct the spelling here without renaming the folder as well.
EXERCISE_DIR = os.path.join(
    DRIVE_MOUNT_POINT,
    "MyDrive/lightning_ai/dl-fundamentals_lighning_ai/"
    "unit08-large-language-models/exercises/tmp",
)

drive.mount(DRIVE_MOUNT_POINT)

# Guard the append so re-running this cell does not keep growing sys.path.
if EXERCISE_DIR not in sys.path:
    sys.path.append(EXERCISE_DIR)

# Change directory with the absolute path directly instead of walking there
# relative to the notebook's start directory (IPython's private `_dh` list).
os.chdir(EXERCISE_DIR)

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


# 1 Loading the dataset into DataFrames

In [None]:
import os.path as op

from datasets import load_dataset

import lightning as L
from lightning.pytorch.loggers import CSVLogger
from lightning.pytorch.callbacks import ModelCheckpoint

import numpy as np
import pandas as pd
import torch

from sklearn.feature_extraction.text import CountVectorizer

from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
from local_dataset_utilities import IMDBDataset

In [None]:
# download_dataset()

# df = load_dataset_into_to_dataframe()
# partition_dataset(df)

In [None]:
# Read the three CSV splits from the working directory, in train/val/test
# order, into one DataFrame each.
df_train, df_val, df_test = map(pd.read_csv, ("train.csv", "val.csv", "test.csv"))

# 2 Tokenization and Numericalization

**Load the dataset via `load_dataset`**

In [None]:
# Build a DatasetDict with one split per CSV file in the working directory.
imdb_dataset = load_dataset(
    "csv",
    data_files=dict(train="train.csv", validation="val.csv", test="test.csv"),
)

print(imdb_dataset)



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 35000
    })
    validation: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 10000
    })
})


**Tokenize the dataset**

In [None]:
# Show the raw text of the training example at index 1 before tokenizing.
print(imdb_dataset['train'][1]['text'])

Steve Biko was a black activist who tried to resist the white minority governed South Africa in much the same way as Gandhi tried to resist the British empire's colonialism in India. Richard Attenborough's film Cry Freedom is not about Biko or Apartheid as much as it is about Donald Woods, the white liberal newspaper editor who risked his life trying to tell Biko's story. The film has a jarring point of view switch after Biko dies in prison from tortuous behavior at the hands of South African "police". Woods, played by Kevin Kline, must choose whether to do the right thing and flee the country to publish books about Biko or allow his wife, played by Penelope Wilton, to pressure him into forgetting about the books. In that case, Biko dies in vain. What begins as a life-changing friendship between Biko and Woods degenerates into a standard by the numbers escape over the border yarn after Biko's death. Oscar-nominated Denzel Washington is good in only his fourth film as Biko, but somethin

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer from the same checkpoint as the model that is fine-tuned
# later in this notebook ("SamLowe/roberta-base-go_emotions"), so the token ids
# and special tokens are guaranteed to match the model's embedding table.
# Other checkpoints tried earlier: "distilbert-base-uncased" (course default),
# "siebert/sentiment-roberta-large-english",
# "distilbert-base-uncased-finetuned-sst-2-english", and
# "deepset/deberta-v3-base-injection" (too big for Colab).
tokenizer = AutoTokenizer.from_pretrained("SamLowe/roberta-base-go_emotions")

print("Tokenizer input max length:", tokenizer.model_max_length)
print("Tokenizer vocabulary size:", tokenizer.vocab_size)

Tokenizer input max length: 512
Tokenizer vocabulary size: 50265


In [None]:
def tokenize_text(batch):
    """Tokenize the ``"text"`` entries of ``batch`` with the notebook-level ``tokenizer``.

    Truncation and padding are both switched on, and whatever the tokenizer
    call produces is returned unchanged.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded

In [None]:
# Tokenize every split. With batched=True and batch_size=None each split is
# handed to `tokenize_text` as one single batch.
imdb_tokenized = imdb_dataset.map(
    function=tokenize_text,
    batched=True,
    batch_size=None,
)



Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
# Drop the reference to the raw splits; only `imdb_tokenized` is used from here
# on. Re-running an earlier cell that reads `imdb_dataset` requires reloading it.
del imdb_dataset

In [None]:
# Return torch tensors when indexing, restricted to the three columns the
# model consumes.
imdb_tokenized.set_format(
    "torch",
    columns=["input_ids", "attention_mask", "label"],
)

In [None]:
import os

# Turn off the tokenizers library's own parallelism -- presumably to avoid
# fork-related warnings once the multi-worker DataLoaders below start.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

# 3 Set Up DataLoaders

In [None]:
from torch.utils.data import DataLoader, Dataset


class IMDBDataset(Dataset):
    """Map-style torch ``Dataset`` exposing a single split of a dataset dictionary.

    Note: defining this class here rebinds the name ``IMDBDataset`` that was
    imported from ``local_dataset_utilities`` earlier in the notebook.

    Args:
        dataset_dict: Mapping from split name to a split object that supports
            integer indexing and has a ``num_rows`` attribute.
        partition_key: Name of the split to expose (default ``"train"``).
    """

    def __init__(self, dataset_dict, partition_key="train"):
        # Hold on to the selected split only; the other splits are not kept.
        selected_split = dataset_dict[partition_key]
        self.partition = selected_split

    def __len__(self):
        """Return the number of rows reported by the wrapped split."""
        return self.partition.num_rows

    def __getitem__(self, index):
        """Return whatever the wrapped split yields for ``index``."""
        return self.partition[index]

In [None]:
# Wrap each tokenized split in an IMDBDataset (train / validation / test order).
train_dataset, val_dataset, test_dataset = (
    IMDBDataset(imdb_tokenized, partition_key=split_name)
    for split_name in ("train", "validation", "test")
)

# Settings shared by all three loaders; only the training loader shuffles.
_loader_kwargs = dict(batch_size=12, num_workers=4)

train_loader = DataLoader(dataset=train_dataset, shuffle=True, **_loader_kwargs)
val_loader = DataLoader(dataset=val_dataset, **_loader_kwargs)
test_loader = DataLoader(dataset=test_dataset, **_loader_kwargs)



In [None]:
# Inspect the split object wrapped by the training loader's dataset.
print(train_loader.dataset.partition)

Dataset({
    features: ['index', 'text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 35000
})


# 4 Initializing DistilBERT

In [None]:
from transformers import AutoModelForSequenceClassification

In [None]:


# model = AutoModelForSequenceClassification.from_pretrained(
#     "distilbert-base-uncased", num_labels=2)



### Initializing a different model:

In [None]:

# Reuse the pretrained RoBERTa encoder from the go_emotions checkpoint, but ask
# for a 2-class, single-label classification head for IMDB sentiment.
#
# The checkpoint ships with a 28-label, multi-label head. Loading it unchanged
# (or only swapping the final Linear layer afterwards) leaves `num_labels` and
# `problem_type` at their old values, so the loss is computed for the wrong
# task and fails with a shape error as soon as integer labels are passed.
#   * num_labels=2                 -> config and head sized for two classes
#   * problem_type=...             -> cross-entropy over class indices
#   * ignore_mismatched_sizes=True -> skip the checkpoint's 28-way head weights
#                                     and initialise the new head randomly
model = AutoModelForSequenceClassification.from_pretrained(
    "SamLowe/roberta-base-go_emotions",
    num_labels=2,
    problem_type="single_label_classification",
    ignore_mismatched_sizes=True,
)

model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [None]:
# Replace the output projection with a freshly initialised 2-class layer and
# keep the model's bookkeeping in sync with it. Swapping the layer alone is not
# enough: the loss computation reads `model.num_labels` and
# `model.config.problem_type`, which would still describe the old head.
num_classes = 2

model.classifier.out_proj = torch.nn.Linear(
    # Take the input width from the layer being replaced instead of
    # hard-coding it.
    in_features=model.classifier.out_proj.in_features,
    out_features=num_classes,
)
model.num_labels = num_classes
model.config.num_labels = num_classes
# Integer class-index labels -> cross-entropy rather than a multi-label loss.
model.config.problem_type = "single_label_classification"


In [None]:
# model = AutoModelForSequenceClassification.from_pretrained(
#     "siebert/sentiment-roberta-large-english", num_labels=2)

In [None]:
# model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english", num_labels =2)


In [None]:
# Display the model's module tree.
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [None]:
# # Load model directly. Tokenizing too big with this model
# from transformers import AutoModelForSequenceClassification

# model = AutoModelForSequenceClassification.from_pretrained("deepset/deberta-v3-base-injection")


# 5 Finetuning

**Wrap in LightningModule for Training**

In [None]:
import lightning as L
import torch
import torchmetrics


class LightningModel(L.LightningModule):
    """LightningModule wrapper around a sequence-classification model.

    The wrapped model is called with ``input_ids``, ``attention_mask`` and
    ``labels`` and must return an object that can be indexed with ``"loss"``
    and ``"logits"``.

    Args:
        model: The classifier to fine-tune.
        learning_rate: Learning rate handed to the Adam optimizer.
    """

    def __init__(self, model, learning_rate=5e-5):
        super().__init__()

        self.learning_rate = learning_rate
        self.model = model

        # Separate metric objects so validation and test accuracy accumulate
        # independently.
        self.val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2)
        self.test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model.

        ``labels`` is optional so the module can also be called for inference;
        it is forwarded to the wrapped model unchanged.
        """
        return self.model(input_ids, attention_mask=attention_mask, labels=labels)

    def _forward_batch(self, batch):
        """Unpack a batch dict and run a labelled forward pass."""
        return self(
            batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["label"],
        )

    def training_step(self, batch, batch_idx):
        """Log and return the training loss for one batch."""
        outputs = self._forward_batch(batch)
        self.log("train_loss", outputs["loss"])
        return outputs["loss"]  # this is passed to the optimizer for training

    def validation_step(self, batch, batch_idx):
        """Log validation loss and update/log validation accuracy."""
        outputs = self._forward_batch(batch)
        self.log("val_loss", outputs["loss"], prog_bar=True)

        # Predicted class = index of the largest logit along dim 1.
        predicted_labels = torch.argmax(outputs["logits"], 1)
        self.val_acc(predicted_labels, batch["label"])
        self.log("val_acc", self.val_acc, prog_bar=True)

    def test_step(self, batch, batch_idx):
        """Update and log test accuracy (logged under the key ``"accuracy"``)."""
        outputs = self._forward_batch(batch)

        predicted_labels = torch.argmax(outputs["logits"], 1)
        self.test_acc(predicted_labels, batch["label"])
        self.log("accuracy", self.test_acc, prog_bar=True)

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters of this module."""
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)


lightning_model = LightningModel(model)

In [None]:
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import CSVLogger


# Keep only the single best checkpoint, ranked by the highest logged "val_acc".
callbacks = [ModelCheckpoint(monitor="val_acc", mode="max", save_top_k=1)]

# Write the logged metrics as CSV, using "logs/" as the save directory and
# "my-model" as the experiment name.
logger = CSVLogger(name="my-model", save_dir="logs/")

In [None]:
# Choose accelerator and precision at run time.
# fp16 mixed precision ("16-mixed") is not supported on CPU: Lightning warns
# and silently falls back to bfloat16. Request "bf16-mixed" explicitly on CPU
# (same effective behaviour, no warning) and use fp16 AMP when a GPU exists.
use_gpu = torch.cuda.is_available()

trainer = L.Trainer(
    max_epochs=3,
    callbacks=callbacks,
    accelerator="gpu" if use_gpu else "cpu",
    precision="16-mixed" if use_gpu else "bf16-mixed",
    devices=1,
    logger=logger,
    log_every_n_steps=10,
)

trainer.fit(
    model=lightning_model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)

  rank_zero_warn(
INFO: Using bfloat16 Automatic Mixed Precision (AMP)
INFO:lightning.pytorch.utilities.rank_zero:Using bfloat16 Automatic Mixed Precision (AMP)
INFO: GPU available: False, used: False
INFO:lightning.pytorch.utilities.rank_zero:GPU available: False, used: False
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO:lightning.pytorch.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: 
  | Name     | Type                             | Params
--------------------------------------------------------------
0 | model    | RobertaForSequenceClassification | 124 M 
1 | val_acc  | MulticlassAccuracy               | 0     
2 | test_acc | MulticlassAccuracy               | 0     
---------------------------------------------------

Sanity Checking: 0it [00:00, ?it/s]



ValueError: ignored

In [None]:
# Evaluate the best saved checkpoint on the training split.
trainer.test(
    model=lightning_model,
    dataloaders=train_loader,
    ckpt_path="best",
)

In [None]:
# Evaluate the best saved checkpoint on the validation split.
trainer.test(
    model=lightning_model,
    dataloaders=val_loader,
    ckpt_path="best",
)

In [None]:
# Evaluate the best saved checkpoint on the held-out test split.
trainer.test(
    model=lightning_model,
    dataloaders=test_loader,
    ckpt_path="best",
)