In [1]:
pip install --upgrade datasets huggingface_hub

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
!pip install datasets

Defaulting to user installation because normal site-packages is not writeable


In [3]:
from datasets import load_dataset
import pandas as pd
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
import numpy as np
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments
from peft import LoraConfig
from transformers import AutoModelForCausalLM
from peft import get_peft_model
from peft import AutoPeftModelForCausalLM
from peft import AutoPeftModelForSequenceClassification
import torch

In [4]:
dataset = load_dataset("sms_spam", split="train").train_test_split(
    test_size=0.2, shuffle=True, seed=23
)

splits = ["train", "test"]

# View the dataset characteristics
dataset["train"]

Dataset({
    features: ['sms', 'label'],
    num_rows: 4459
})

In [5]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

tokenized_dataset = {}
for split in splits:
    tokenized_dataset[split] = dataset[split].map(
        lambda x: tokenizer(x["sms"], truncation=True), batched=True
    )

tokenized_dataset["train"]



Dataset({
    features: ['sms', 'label', 'input_ids', 'attention_mask'],
    num_rows: 4459
})

In [6]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    id2label={0: "not spam", 1: "spam"},
    label2id={"not spam": 0, "spam": 1},
)

for param in model.parameters():
    param.requires_grad = True

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}


# The HuggingFace Trainer class handles the training and eval loop for PyTorch for us.
# Read more about it here https://huggingface.co/docs/transformers/main_classes/trainer
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="./data/spam_not_spam",
        # Set the learning rate
        learning_rate = 2e-5,
        # Set the per device train batch size and eval batch size
        per_device_train_batch_size = 32,
        per_device_eval_batch_size = 32,
        # Evaluate and save the model after each epoch
        evaluation_strategy = "epoch",
        save_strategy = "epoch",
        num_train_epochs=10,
        weight_decay=0.01,
        load_best_model_at_end=True,
    ),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.054353,0.989238
2,No log,0.045239,0.990135
3,No log,0.058476,0.990135
4,0.037300,0.060904,0.990135
5,0.037300,0.060663,0.991031
6,0.037300,0.06427,0.991031
7,0.037300,0.066205,0.991031
8,0.001400,0.067723,0.991031
9,0.001400,0.068402,0.991031
10,0.001400,0.068639,0.991031


Checkpoint destination directory ./data/spam_not_spam/checkpoint-140 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./data/spam_not_spam/checkpoint-280 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./data/spam_not_spam/checkpoint-420 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./data/spam_not_spam/checkpoint-560 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./data/spam_not_spam/checkpoint-700 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./data/spam_not_spam/checkpoint-840 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./data/spam_not_spam/checkpoint-980 a

TrainOutput(global_step=1400, training_loss=0.013904576519770282, metrics={'train_runtime': 362.1052, 'train_samples_per_second': 123.141, 'train_steps_per_second': 3.866, 'total_flos': 839113443159492.0, 'train_loss': 0.013904576519770282, 'epoch': 10.0})

In [8]:
normal_model_results = trainer.evaluate()

## LORA finetuning

In [32]:
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["transformer.layer.*.attention.q_lin", "transformer.layer.*.attention.k_lin", "transformer.layer.*.attention.v_lin"],
    lora_dropout=0.1,
    bias="none"
)

In [33]:
lora_model = get_peft_model(model, config)

In [34]:
lora_model.print_trainable_parameters()

trainable params: 1,179,648 || all params: 68,134,658 || trainable%: 1.7313479433623928


In [35]:
lora_model.save_pretrained("distilbert-base-uncased-lora")

In [36]:
lora_model = AutoPeftModelForSequenceClassification.from_pretrained("distilbert-base-uncased-lora")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
torch.set_grad_enabled(True)
trainer = Trainer(
    model=lora_model,
    args=TrainingArguments(
        output_dir="./data/spam_not_spam",
        # Set the learning rate
        learning_rate = 2e-5,
        # Set the per device train batch size and eval batch size
        per_device_train_batch_size = 32,
        per_device_eval_batch_size = 32,
        # Evaluate and save the model after each epoch
        evaluation_strategy = "epoch",
        save_strategy = "epoch",
        num_train_epochs=10,
        weight_decay=0.01,
        load_best_model_at_end=True,
    ),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.260489,0.870852
2,No log,0.152965,0.956054
3,No log,0.10634,0.9713
4,0.229300,0.086092,0.973094
5,0.229300,0.075747,0.974888
6,0.229300,0.07037,0.974888
7,0.229300,0.066818,0.975785
8,0.078900,0.064816,0.975785
9,0.078900,0.063384,0.975785
10,0.078900,0.063043,0.975785


Checkpoint destination directory ./data/spam_not_spam/checkpoint-140 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./data/spam_not_spam/checkpoint-280 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./data/spam_not_spam/checkpoint-420 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./data/spam_not_spam/checkpoint-560 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./data/spam_not_spam/checkpoint-700 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./data/spam_not_spam/checkpoint-840 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./data/spam_not_spam/checkpoint-980 a

TrainOutput(global_step=1400, training_loss=0.12888518469674246, metrics={'train_runtime': 133.3414, 'train_samples_per_second': 334.405, 'train_steps_per_second': 10.499, 'total_flos': 859244312853384.0, 'train_loss': 0.12888518469674246, 'epoch': 10.0})

In [38]:
lora_model_results = trainer.evaluate()

In [39]:
def compare_models(lora_model_results, normal_model_results):
    print("Comparison between LoRA Model and Normal Model:")
    
    lora_accuracy = lora_model_results.get("eval_accuracy", None)
    normal_accuracy = normal_model_results.get("eval_accuracy", None)
    
    print(f"Accuracy Comparison:")
    print(f"LoRA Model Accuracy: {lora_accuracy}")
    print(f"Normal Model Accuracy: {normal_accuracy}")
    print(f"Accuracy Difference: {lora_accuracy - normal_accuracy}\n")

    lora_loss = lora_model_results.get("eval_loss", None)
    normal_loss = normal_model_results.get("eval_loss", None)
    
    print(f"Loss Comparison:")
    print(f"LoRA Model Loss: {lora_loss}")
    print(f"Normal Model Loss: {normal_loss}")
    print(f"Loss Difference: {lora_loss - normal_loss}\n")

    lora_runtime = lora_model_results.get("eval_runtime", None)
    normal_runtime = normal_model_results.get("eval_runtime", None)
    
    print(f"Runtime Comparison:")
    print(f"LoRA Model Runtime: {lora_runtime}")
    print(f"Normal Model Runtime: {normal_runtime}")
    print(f"Runtime Difference: {lora_runtime - normal_runtime}\n")
    
compare_models(lora_model_results, normal_model_results)

Comparison between LoRA Model and Normal Model:
Accuracy Comparison:
LoRA Model Accuracy: 0.9757847533632287
Normal Model Accuracy: 0.9901345291479821
Accuracy Difference: -0.014349775784753382

Loss Comparison:
LoRA Model Loss: 0.06304305791854858
Normal Model Loss: 0.04523885250091553
Loss Difference: 0.017804205417633057

Runtime Comparison:
LoRA Model Runtime: 2.8763
Normal Model Runtime: 2.7443
Runtime Difference: 0.13200000000000012

