In [1]:
pip install --upgrade datasets huggingface_hub

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
!pip install datasets

Defaulting to user installation because normal site-packages is not writeable


In [3]:
from datasets import load_dataset
import pandas as pd
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
import numpy as np
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments
from peft import LoraConfig
from transformers import AutoModelForCausalLM
from peft import get_peft_model
from peft import AutoPeftModelForCausalLM
from peft import AutoPeftModelForSequenceClassification
import torch

In [4]:
dataset = load_dataset("sms_spam", split="train").train_test_split(
    test_size=0.2, shuffle=True, seed=23
)

splits = ["train", "test"]

dataset["train"]

Dataset({
    features: ['sms', 'label'],
    num_rows: 4459
})

In [5]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

tokenized_dataset = {}
for split in splits:
    tokenized_dataset[split] = dataset[split].map(
        lambda x: tokenizer(x["sms"], truncation=True, padding=True),
        batched=True,
        remove_columns=["sms"]
    )

tokenized_dataset["train"]



Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 4459
})

In [6]:
# tokenized_dataset["train"] = tokenized_dataset["train"].remove_columns(["sms"])
# tokenized_dataset["test"] = tokenized_dataset["test"].remove_columns(["sms"])
tokenized_dataset["train"] = tokenized_dataset['train'].rename_column('label', 'labels')
tokenized_dataset["test"] = tokenized_dataset['test'].rename_column('label', 'labels')

In [7]:
print(tokenized_dataset["train"].column_names)

['labels', 'input_ids', 'attention_mask']


In [8]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    id2label={0: "not spam", 1: "spam"},
    label2id={"not spam": 0, "spam": 1},
)

for param in model.parameters():
    param.requires_grad = True

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}

In [10]:
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="./data/spam_not_spam",
        # Set the learning rate
        learning_rate = 2e-5,
        # Set the per device train batch size and eval batch size
        per_device_train_batch_size = 32,
        per_device_eval_batch_size = 32,
        # Evaluate and save the model after each epoch
        evaluation_strategy = "epoch",
        save_strategy = "epoch",
        num_train_epochs=10,
        weight_decay=0.01,
        load_best_model_at_end=True,
        remove_unused_columns=False,
        label_names=["labels"],
    ),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.048603,0.990135
2,No log,0.042205,0.990135
3,No log,0.053488,0.987444
4,0.039600,0.061794,0.989238
5,0.039600,0.064366,0.989238
6,0.039600,0.060082,0.989238
7,0.039600,0.060216,0.991928
8,0.001900,0.06312,0.989238
9,0.001900,0.072807,0.989238
10,0.001900,0.071885,0.989238


Checkpoint destination directory ./data/spam_not_spam/checkpoint-140 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./data/spam_not_spam/checkpoint-280 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./data/spam_not_spam/checkpoint-420 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./data/spam_not_spam/checkpoint-560 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./data/spam_not_spam/checkpoint-700 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./data/spam_not_spam/checkpoint-840 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./data/spam_not_spam/checkpoint-980 a

TrainOutput(global_step=1400, training_loss=0.01490892652954374, metrics={'train_runtime': 1001.0338, 'train_samples_per_second': 44.544, 'train_steps_per_second': 1.399, 'total_flos': 2745406500275088.0, 'train_loss': 0.01490892652954374, 'epoch': 10.0})

In [11]:
normal_model_results = trainer.evaluate()
normal_model_results

{'eval_loss': 0.0422048382461071,
 'eval_accuracy': 0.9901345291479821,
 'eval_runtime': 7.5638,
 'eval_samples_per_second': 147.413,
 'eval_steps_per_second': 4.627,
 'epoch': 10.0}

## LORA finetuning

In [12]:
torch.set_grad_enabled(True)
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["transformer.layer.*.attention.q_lin", "transformer.layer.*.attention.k_lin", "transformer.layer.*.attention.v_lin"],
    lora_dropout=0.1,
    bias="none"
)

In [13]:
lora_model = get_peft_model(model, config)

In [14]:
lora_model.print_trainable_parameters()

trainable params: 442,368 || all params: 67,397,378 || trainable%: 0.6563578778984548


In [15]:
trainer = Trainer(
    model=lora_model,
    args=TrainingArguments(
        output_dir="./data/spam_not_spam",
        learning_rate=2e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=10,
        weight_decay=0.01,
        load_best_model_at_end=True,
        remove_unused_columns=False,
        label_names=["labels"],
    ),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.042753,0.989238
2,No log,0.042504,0.989238
3,No log,0.042415,0.989238
4,0.010400,0.042403,0.989238
5,0.010400,0.042058,0.991031
6,0.010400,0.042164,0.991031
7,0.010400,0.042085,0.991031
8,0.008500,0.042111,0.991031
9,0.008500,0.042116,0.991031
10,0.008500,0.042116,0.991031


Checkpoint destination directory ./data/spam_not_spam/checkpoint-140 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./data/spam_not_spam/checkpoint-280 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./data/spam_not_spam/checkpoint-420 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./data/spam_not_spam/checkpoint-560 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./data/spam_not_spam/checkpoint-700 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./data/spam_not_spam/checkpoint-840 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./data/spam_not_spam/checkpoint-980 a

TrainOutput(global_step=1400, training_loss=0.009201802015304565, metrics={'train_runtime': 753.5801, 'train_samples_per_second': 59.171, 'train_steps_per_second': 1.858, 'total_flos': 2773571033924496.0, 'train_loss': 0.009201802015304565, 'epoch': 10.0})

In [16]:
lora_model.save_pretrained("distilbert-base-uncased-lora")

In [17]:
lora_model = AutoPeftModelForSequenceClassification.from_pretrained("distilbert-base-uncased-lora")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
lora_model_results = trainer.evaluate()
lora_model_results

{'eval_loss': 0.0420583076775074,
 'eval_accuracy': 0.9910313901345291,
 'eval_runtime': 8.0437,
 'eval_samples_per_second': 138.617,
 'eval_steps_per_second': 4.351,
 'epoch': 10.0}

In [19]:
def compare_models(lora_model_results, normal_model_results):
    print("Comparison between LoRA Model and Normal Model:")
    
    lora_accuracy = lora_model_results.get("eval_accuracy", None)
    normal_accuracy = normal_model_results.get("eval_accuracy", None)
    
    print(f"Accuracy Comparison:")
    print(f"LoRA Model Accuracy: {lora_accuracy}")
    print(f"Normal Model Accuracy: {normal_accuracy}")
    print(f"Accuracy Difference: {lora_accuracy - normal_accuracy}\n")

    lora_loss = lora_model_results.get("eval_loss", None)
    normal_loss = normal_model_results.get("eval_loss", None)
    
    print(f"Loss Comparison:")
    print(f"LoRA Model Loss: {lora_loss}")
    print(f"Normal Model Loss: {normal_loss}")
    print(f"Loss Difference: {lora_loss - normal_loss}\n")

    lora_runtime = lora_model_results.get("eval_runtime", None)
    normal_runtime = normal_model_results.get("eval_runtime", None)
    
    print(f"Runtime Comparison:")
    print(f"LoRA Model Runtime: {lora_runtime}")
    print(f"Normal Model Runtime: {normal_runtime}")
    print(f"Runtime Difference: {lora_runtime - normal_runtime}\n")
    
compare_models(lora_model_results, normal_model_results)

Comparison between LoRA Model and Normal Model:
Accuracy Comparison:
LoRA Model Accuracy: 0.9910313901345291
Normal Model Accuracy: 0.9901345291479821
Accuracy Difference: 0.0008968609865470656

Loss Comparison:
LoRA Model Loss: 0.0420583076775074
Normal Model Loss: 0.0422048382461071
Loss Difference: -0.00014653056859970093

Runtime Comparison:
LoRA Model Runtime: 8.0437
Normal Model Runtime: 7.5638
Runtime Difference: 0.47989999999999977

