# Lightweight Fine-Tuning Project



* PEFT technique: 
* Model: BERT since its better for sentiment analysis (reasoning) than GPT 2 which is basically better for text generation
* Evaluation approach: Split data into train and test, train with train data and check accuracy with test data along with F1, recall and precision score
* Fine-tuning dataset: Stanford SST2 (https://huggingface.co/datasets/stanfordnlp/sst2)

## Loading and Evaluating a Foundation Model


In [3]:
# Install the required version of datasets in case you have an older version
# You will need to choose "Kernel > Restart Kernel" from the menu after executing this cell
! pip install -q "datasets==2.15.0"

[0m

In [4]:
# Load the Stanford SST2 dataset
# See: https://huggingface.co/datasets/stanfordnlp/sst2

from datasets import load_dataset

# The sms_spam dataset only has a train split, so we use the train_test_split method to split it into train and test
dataset = load_dataset("stanfordnlp/sst2", split="train").train_test_split(
    test_size=0.2, shuffle=True, seed=23
)

splits = ["train", "test"]

# View the dataset characteristics
dataset["train"]

Downloading readme:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 3.11M/3.11M [00:00<00:00, 10.4MB/s]
Downloading data: 100%|██████████| 72.8k/72.8k [00:00<00:00, 948kB/s]
Downloading data: 100%|██████████| 148k/148k [00:00<00:00, 1.53MB/s]


Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Dataset({
    features: ['idx', 'sentence', 'label'],
    num_rows: 53879
})

In [5]:
#label 1 is negative
#label 2 is positive
dataset["train"][0]

{'idx': 14677, 'sentence': 'good performances ', 'label': 1}

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Let's use a lambda function to tokenize all the examples
tokenized_dataset = {}
for split in splits:
    tokenized_dataset[split] = dataset[split].map(
        lambda x: tokenizer(x["sentence"], truncation=True), batched=True
    )

# Inspect the available columns in the dataset
tokenized_dataset["train"]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/53879 [00:00<?, ? examples/s]

Map:   0%|          | 0/13470 [00:00<?, ? examples/s]

Dataset({
    features: ['idx', 'sentence', 'label', 'input_ids', 'attention_mask'],
    num_rows: 53879
})

In [7]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    id2label={0: "negative", 1: "positive"},
    label2id={"negative": 0, "positive": 1},
)

# Unfreeze all the model parameters.
# Hint: Check the documentation at https://huggingface.co/transformers/v4.2.2/training.html
for param in model.parameters():
    param.requires_grad = True
    
model.classifier

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Linear(in_features=768, out_features=2, bias=True)

In [8]:
print(model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [9]:
import numpy as np
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}


# The HuggingFace Trainer class handles the training and eval loop for PyTorch for us.
# Read more about it here https://huggingface.co/docs/transformers/main_classes/trainer
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="./data/spam_not_spam",
        # Set the learning rate
        learning_rate=2e-3,
        # Set the per device train batch size and eval batch size
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        # Evaluate and save the model after each epoch
        evaluation_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=1,
        weight_decay=0.01,
        load_best_model_at_end=True,
    ),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

# evaluate before training
trainer.evaluate()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 0.6936307549476624,
 'eval_accuracy': 0.4899034892353378,
 'eval_runtime': 33.3112,
 'eval_samples_per_second': 404.369,
 'eval_steps_per_second': 101.107}

In [10]:
# Make a dataframe with the predictions and the text and the labels
import pandas as pd

items_for_manual_review = tokenized_dataset["test"].select(
    [0, 1, 22, 31, 43, 292, 448, 487]
)

results = trainer.predict(items_for_manual_review)

def calculate_precision_recall_f1(actuals, predictions):
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    
    for actual, predicted in zip(actuals, predictions):
        if actual == predicted == 1:
            true_positives += 1
        if predicted == 1 and actual != predicted:
            false_positives += 1
        if predicted == 0 and actual != predicted:
            false_negatives += 1
    
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return precision, recall, f1_score

precision, recall, f1, = calculate_precision_recall_f1(results.label_ids, results.predictions.argmax(axis=1))

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Precision: 0.75
Recall: 0.42857142857142855
F1 Score: 0.5454545454545454


In [11]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6824,0.687,0.555382


Checkpoint destination directory ./data/spam_not_spam/checkpoint-13470 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=13470, training_loss=0.6918074621301097, metrics={'train_runtime': 760.1989, 'train_samples_per_second': 70.875, 'train_steps_per_second': 17.719, 'total_flos': 333643722426996.0, 'train_loss': 0.6918074621301097, 'epoch': 1.0})

In [12]:
trainer.evaluate()

{'eval_loss': 0.6870003342628479,
 'eval_accuracy': 0.5553823311061619,
 'eval_runtime': 32.8573,
 'eval_samples_per_second': 409.954,
 'eval_steps_per_second': 102.504,
 'epoch': 1.0}

## Performing Parameter-Efficient Fine-Tuning



In [13]:
from peft import (
    LoraConfig, 
    get_peft_model, 
    TaskType,
    PeftModel
)

In [14]:
# Define the LoRA Configuration
"""
lora_dim (or lora_r in some contexts): This sets the rank of the low-rank matrices. A smaller value means fewer parameters to train.
lora_alpha: This is a scaling factor used to adjust the contribution of the LoRA matrices relative to the original model weights.
lora_dropout: This specifies the dropout rate to apply to the outputs of the LoRA transformations, which helps prevent overfitting.
apply_lora: This parameter decides which parts of the Transformer architecture (like attention or feed-forward layers) are modified using LoRA.
"""

lora_config = LoraConfig(
    r=2, # Rank Number
    lora_alpha=16, # Alpha (Scaling Factor)
    lora_dropout=0.05, # Dropout Prob for Lora
    target_modules=["q_lin", "k_lin","v_lin"], # Which layer to apply LoRA, usually only apply on MultiHead Attention Layer
    bias='none',
    task_type=TaskType.SEQ_CLS # Seqence to Classification Task
)

In [15]:
# Get our LoRA-enabled model
peft_model = get_peft_model(model, 
                            lora_config)

In [None]:
#training with peft

trainer = Trainer(
    model=peft_model,
    args=TrainingArguments(
        output_dir="./data/spam_not_spam",
        # Set the learning rate
        learning_rate=2e-3,
        # Set the per device train batch size and eval batch size
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        # Evaluate and save the model after each epoch
        evaluation_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=1,
        weight_decay=0.01,
        load_best_model_at_end=True,
    ),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
peft_model.save_pretrained('./peft_model')

In [None]:
trainer.evaluate()

In [None]:
precision, recall, f1, = calculate_precision_recall_f1(results.label_ids, results.predictions.argmax(axis=1))

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

## Performing Inference with a PEFT Model


In [None]:
loaded_peft_model = AutoModelForSequenceClassification.from_pretrained('./peft_model')

In [None]:
# Make a dataframe with the predictions and the text and the labels
import pandas as pd

items_for_manual_review = tokenized_dataset["test"].select(
    [0, 1, 22, 31, 43, 292, 448, 487]
)

results = trainer.predict(items_for_manual_review)
df = pd.DataFrame(
    {
        "sentiments": [item["sentence"] for item in items_for_manual_review],
        "predictions": results.predictions.argmax(axis=1),
        "labels": results.label_ids,
    }
)
# Show all the cell
pd.set_option("display.max_colwidth", None)
df