Fine-tuning a BERT-based sequence classification model, 'distilbert-base-uncased' (a pretrained model from the Hugging Face model hub), on an IMDb dataset with LoRA (Low-Rank Adaptation) to adapt it for sentiment analysis.

In [None]:
!pip install datasets

In [None]:
!pip install peft

In [None]:
!pip install evaluate

In [None]:
!pip install torch

In [6]:
import inspect

from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'


## Loading dataset

In [7]:
# Load the truncated IMDb dataset from the Hugging Face Hub (it provides a train and a validation split)
dataset = load_dataset('shawhin/imdb-truncated')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/592 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/836k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/853k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [9]:
# Share of positive reviews (label == 1) in the training split: number of ones / number of labels
train_labels = np.array(dataset['train']['label'])
train_labels.sum() / len(train_labels)

0.5

In [10]:
# Base checkpoint that will be fine-tuned
model_checkpoint = 'distilbert-base-uncased'

# Lookups between class index and class name; both are handed to the model config
id2label = {0: "Negative", 1: "Positive"}
label2id = {name: idx for idx, name in id2label.items()}

# Load the checkpoint as a sequence classifier with one output per label
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

## Preprocessing Data

In [12]:
# Load the pretrained tokenizer that belongs to the model checkpoint, so text is converted into the token IDs the model can process.
# NOTE(review): add_prefix_space is usually associated with byte-level BPE tokenizers — confirm it is needed for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [14]:
# Batches are padded to a common length, which requires a pad token.
# If the tokenizer has none, register '[PAD]' and grow the model's embedding matrix to match the new vocabulary size.

pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    model.resize_token_embeddings(len(tokenizer))

In [15]:
# creates a function to ensure that the text (extracted from the input examples) is truncated to a maximum length of 512 tokens from the left side

def tokenize_function(examples):
    """Tokenize a batch of examples, truncating each text to at most 512 tokens.

    Truncation happens on the left, so when a text is too long its
    beginning is dropped and its end is kept.

    Args:
        examples: Batch mapping as supplied by ``Dataset.map(..., batched=True)``;
            only the ``"text"`` entry is read.

    Returns:
        The tokenizer output for the batch. Sequences are NOT padded here;
        padding is left to the data collator at batching time.
    """
    # NOTE: this sets an attribute on the shared module-level tokenizer, so every
    # later call to the tokenizer also truncates from the left.
    tokenizer.truncation_side = "left"

    # No `return_tensors` here: the sequences in a batch have different lengths,
    # and forcing them into a NumPy array would create a ragged object array.
    # Plain Python lists are what Dataset.map stores anyway.
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
    )

In [16]:
# Apply tokenize_function to every split of the dataset (training and validation), one batch at a time
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [17]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [18]:
# Collator that pads the tokenized inputs of each batch so that all sequences in the batch have the same length
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Model Evaluation

In [19]:
# Load the accuracy metric (fraction of predictions that match the reference labels)
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
accuracy

EvaluationModule(name: "accuracy", module_type: "metric", features: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)}, usage: """
Args:
    predictions (`list` of `int`): Predicted labels.
    references (`list` of `int`): Ground truth labels.
    normalize (`boolean`): If set to False, returns the number of correctly classified samples. Otherwise, returns the fraction of correctly classified samples. Defaults to True.
    sample_weight (`list` of `float`): Sample weights Defaults to None.

Returns:
    accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0, or the number of examples input, if `normalize` is set to `True`.. A higher score means higher accuracy.

Examples:

    Example 1-A simple example
        >>> accuracy_metric = evaluate.load("accuracy")
        >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0])
        >>> print(results)
    

In [21]:
# creates an evaluation function to pass into trainer later to evaluate the model's performance during training and validation

def compute_metrics(p):
    """Compute accuracy for the Trainer from raw model outputs.

    Args:
        p: Pair ``(predictions, labels)`` where ``predictions`` holds the
            per-class scores for each example and ``labels`` the true class ids.

    Returns:
        The dict produced by ``accuracy.compute`` (``{"accuracy": value}``).
    """
    logits, labels = p

    # Highest-scoring class per example
    predicted_ids = np.argmax(logits, axis=1)

    # accuracy.compute already returns {"accuracy": value}; wrapping it in another
    # {"accuracy": ...} would hand the Trainer a nested dict instead of a scalar metric.
    return accuracy.compute(predictions=predicted_ids, references=labels)

## Feeding some random input texts to the untrained model

In [22]:
# A few hand-written reviews used to sanity-check the model's predictions

text_list = ["It was good.", "Not a fan, don't recommed.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass."]

print("Untrained model predictions:")

model.eval()  # inference mode: disables dropout so predictions are deterministic
device = next(model.parameters()).device  # run the inputs on whatever device holds the model

with torch.no_grad():  # no gradients are needed for inference
    for text in text_list:

        # Tokenize the text into a (1, sequence_length) tensor of token IDs
        inputs = tokenizer.encode(text, return_tensors="pt").to(device)

        # Raw, unnormalized class scores produced by the model
        logits = model(inputs).logits

        # Index of the highest-scoring class along the class dimension
        predictions = torch.argmax(logits, dim=-1)

        print(text + " - " + id2label[predictions.item()])

Untrained model predictions:
It was good. - Positive
Not a fan, don't recommed. - Negative
Better than the first one. - Positive
This is not worth watching even once. - Positive
This one is a pass. - Positive


## Training the model by PEFT (Parameter-Efficient Fine Tuning) using LoRA (Low Rank Adaptation)

In [23]:
peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=4,  # rank of the low-rank decomposition; a lower rank means fewer trainable parameters
                        lora_alpha=32,  # scaling factor for the LoRA layers; controls the magnitude of the adaptation weights
                        lora_dropout=0.01,  # dropout rate applied to the LoRA layers to reduce overfitting
                        target_modules = ['q_lin'])  # only modules named 'q_lin' (the attention query projections in the model printout) receive LoRA adapters

In [24]:
# Apply the PEFT (LoRA) configuration to the untrained model.
# NOTE: this rebinds `model`, so re-running this cell would wrap the already-wrapped model; re-run the model-loading cell first.
model = get_peft_model(model, peft_config)

In [25]:
# Report the trainable vs. total parameter counts: with PEFT only a small fraction of the parameters is tuned instead of all of them
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


In [26]:
# Learning rate: size of each parameter update; higher values train faster but can make training unstable
lr = 1e-3
# Batch size: number of samples processed per training step; larger batches need more memory
batch_size = 4
# Epochs: number of full passes over the training set; too many passes can lead to overfitting
num_epochs = 10

In [27]:
import accelerate
import transformers

In [None]:
!pip install -U accelerate
!pip install -U transformers

In [29]:
# putting in all the training arguments

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# Pick whichever keyword the installed version accepts so this cell runs on both.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    save_strategy="epoch",  # must match the evaluation strategy for load_best_model_at_end
    load_best_model_at_end=True,
    **{_eval_strategy_key: "epoch"},  # evaluate on the validation split after every epoch
)



In [30]:
# creating a trainer to work on the training and validation datasets

# Newer transformers releases take the tokenizer as `processing_class=` and
# deprecate `tokenizer=`. Pick whichever keyword the installed version accepts.
_tokenizer_key = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,  # dynamically pad examples in each batch to be of an equal length
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)

In [None]:
# Run training with the trainer configured above
trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
# printing the trained model's predictions

print("Trained model predictions:")

model.eval()  # inference mode: disables dropout so predictions are deterministic
device = next(model.parameters()).device  # the model may no longer be on the CPU after training

with torch.no_grad():  # no gradients are needed for inference
    for text in text_list:
        # Tokenize and place the input on the same device as the model
        inputs = tokenizer.encode(text, return_tensors="pt").to(device)

        logits = model(inputs).logits
        predictions = torch.max(logits, 1).indices  # highest-scoring class per example

        print(text + " - " + id2label[predictions.tolist()[0]])