# Lightweight Fine-Tuning Project

TODO: In this cell, describe your choices for each of the following

* PEFT technique: 
* Model: 
* Evaluation approach: 
* Fine-tuning dataset: 

## Loading and Evaluating a Foundation Model

TODO: In the cells below, load your chosen pre-trained Hugging Face model and evaluate its performance prior to fine-tuning. This step includes loading an appropriate tokenizer and dataset.

### Load the datasdet dair-air/emotion and explore the data

In [1]:
from datasets import load_dataset

ds = load_dataset("dair-ai/emotion", "split")
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [2]:
import random

# print some random featues and the labels
print("Features:")
indices = random.sample(range(len(ds["train"])), 10)
for i in indices:
    print("{} : {}".format(ds["train"]['text'][i], ds["train"]['label'][i]))

print("\nLabels: {}".format(ds["train"].features["label"].names))

Features:
i started to feel super emotional all the time which was so strange : 1
i do my best but it feels uncomfortable : 4
i feel so stressed out with family problems : 0
i understand that he was feeling devastated and i sympathize : 0
i just feel so depressed and i don t know what would make me happy : 0
i compare your beauty i feel unsure where to begin to angels or nature or what : 4
i feel so sorry for you your family and friends : 0
i do my yoga i open up feel tender two hours later i m nicely swaddled up again happily wrapping layer upon layer over it out of my well meaning habit : 2
i feel like i m always stressed worried or upset about something : 0
i feel curious about the subject matter : 5

Labels: ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']


In [3]:
# create data structures for further processing

# names of the splits
splits=list(ds.keys())
# number of classes
num_classes=len(ds["train"].features["label"].names)

# Dictionairies to translate between label string and label number
id2label = dict(zip(range(num_classes), ds['train'].features['label'].names))
label2id = dict(zip(ds['train'].features['label'].names, range(num_classes)))
print(id2label)
print(label2id)

{0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}
{'sadness': 0, 'joy': 1, 'love': 2, 'anger': 3, 'fear': 4, 'surprise': 5}


In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification
import torch

# Use GPT-2 as a small base model
# Create a variant with classification head
device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
model_id = "openai-community/gpt2"
model = AutoModelForSequenceClassification.from_pretrained(
    model_id, 
    num_labels=num_classes,
    id2label=id2label,
    label2id=label2id,
    device_map=device)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Add tokens to the dataset
tokenized_ds = {}
for split in splits:
    tokenized_ds[split] = ds[split].map(
        lambda x: tokenizer(x["text"], truncation=True), batched=True
    )

for param in model.base_model.parameters():
    param.requires_grad = False

# Add the padding token which is missing in GPT-2
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))
    model.config.pad_token_id = model.config.eos_token_id
    print("Padding token: {}".format(tokenizer.pad_token))

# metric function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at openai-community/gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Padding token: [PAD]


In [5]:
import numpy as np
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments
import os

temp_path = "/tmp"
save_path = "./data"

model_name = "gpt2_classification"
checkpoint_dir = os.path.join(temp_path, model_name)
save_dir_base = os.path.join(save_path, model_name)

trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir=checkpoint_dir,
        learning_rate=2e-3,
        per_device_train_batch_size=100,
        per_device_eval_batch_size=100,
        num_train_epochs=5,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    ),
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.19557,0.544
2,No log,1.138781,0.571
3,No log,1.103355,0.5845
4,1.265600,1.120978,0.591
5,1.265600,1.089581,0.582


TrainOutput(global_step=800, training_loss=1.2034161376953125, metrics={'train_runtime': 73.9922, 'train_samples_per_second': 1081.194, 'train_steps_per_second': 10.812, 'total_flos': 2309089289011200.0, 'train_loss': 1.2034161376953125, 'epoch': 5.0})

In [6]:
# Evaluate the model
original_performance=trainer.evaluate()
print(original_performance)

model.save_pretrained(save_dir_base, save_embedding_layers=True)

{'eval_loss': 1.0895805358886719, 'eval_accuracy': 0.582, 'eval_runtime': 1.328, 'eval_samples_per_second': 1506.029, 'eval_steps_per_second': 15.06, 'epoch': 5.0}


## Performing Parameter-Efficient Fine-Tuning

TODO: In the cells below, create a PEFT model from your loaded model, run a training loop, and save the PEFT model weights.

In [7]:
from peft import LoraConfig, TaskType, get_peft_model

torch.cuda.empty_cache()

# Use Lora for PEFT
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    task_type=TaskType.TOKEN_CLS,
    fan_in_fan_out=True,
)

# adding PEFT modifies the base model in-place
# so it should be saved for restoring the PEFT model later
model_lora = get_peft_model(model, peft_config)
model_lora.print_trainable_parameters()

model_name = "gpt2_classification_lora"
checkpoint_dir = os.path.join(temp_path, model_name)
save_dir = os.path.join(save_path, model_name)

trainable params: 594,432 || all params: 125,039,616 || trainable%: 0.4754


In [8]:
trainer_lora = Trainer(
    model=model_lora,
    args=TrainingArguments(
        output_dir=checkpoint_dir,
        learning_rate=2e-3,
        per_device_train_batch_size=100,
        per_device_eval_batch_size=100,
        num_train_epochs=5,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    ),
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

trainer_lora.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.235753,0.91
2,No log,0.160215,0.9295
3,No log,0.137495,0.935
4,0.254300,0.125307,0.934
5,0.254300,0.124479,0.9325




TrainOutput(global_step=800, training_loss=0.2025426959991455, metrics={'train_runtime': 145.967, 'train_samples_per_second': 548.069, 'train_steps_per_second': 5.481, 'total_flos': 2325225977856000.0, 'train_loss': 0.2025426959991455, 'epoch': 5.0})

###  ⚠️ IMPORTANT ⚠️

Due to workspace storage constraints, you should not store the model weights in the same directory but rather use `/tmp` to avoid workspace crashes which are irrecoverable.
Ensure you save it in /tmp always.

In [9]:
# Saving the model

model_lora.save_pretrained(save_dir, save_embedding_layers=True)

## Performing Inference with a PEFT Model

TODO: In the cells below, load the saved PEFT model weights and evaluate the performance of the trained PEFT model. Be sure to compare the results to the results from prior to fine-tuning.

In [10]:
from peft import PeftModelForTokenClassification

# loading the model
model_base = AutoModelForSequenceClassification.from_pretrained(save_dir_base)
model_loaded = PeftModelForTokenClassification.from_pretrained(model_base, save_dir)

In [11]:
trainer_evaluate = Trainer(
    model=model_loaded,
    args=TrainingArguments(
        output_dir="./data/sentiment_analysis_lora_evaluate",
        per_device_train_batch_size=100,
        per_device_eval_batch_size=100,
        do_train=False,
        do_eval=True,
    ),
    eval_dataset=tokenized_ds["test"],
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

fine_tuned_performance=trainer_evaluate.evaluate()

In [12]:

print("Original Model:  ", original_performance)
print("Fine-Tuned Model:", fine_tuned_performance)

print("Original Model accurcy:   ", original_performance['eval_accuracy'])
print("Fine-Tuned Model accurcy: ", fine_tuned_performance['eval_accuracy'])

Original Model:   {'eval_loss': 1.0895805358886719, 'eval_accuracy': 0.582, 'eval_runtime': 1.328, 'eval_samples_per_second': 1506.029, 'eval_steps_per_second': 15.06, 'epoch': 5.0}
Fine-Tuned Model: {'eval_loss': 0.12447859346866608, 'eval_model_preparation_time': 0.0019, 'eval_accuracy': 0.9325, 'eval_runtime': 1.43, 'eval_samples_per_second': 1398.587, 'eval_steps_per_second': 13.986}
Original Model accurcy:    0.582
Fine-Tuned Model accurcy:  0.9325


### Use different Quantization: QLoRA

In [13]:
from transformers import BitsAndBytesConfig

torch.cuda.empty_cache()

temp_path = "/tmp"
save_path = "./data"

model_name = "gpt2_classification_4bit_lora"
checkpoint_dir = os.path.join(temp_path, model_name)
save_dir_base = os.path.join(save_path, model_name)

model_id = "openai-community/gpt2"
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

model4b = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    num_labels=num_classes,
    id2label=id2label,
    label2id=label2id,
    torch_dtype="auto")

model4b.resize_token_embeddings(len(tokenizer))
model4b.config.pad_token_id = model.config.eos_token_id

for param in model4b.base_model.parameters():
    param.requires_grad = False

# peft model
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    task_type=TaskType.TOKEN_CLS,
    fan_in_fan_out=True,
)

model4bl = get_peft_model(model4b, peft_config)
model4bl.print_trainable_parameters()

trainer = Trainer(
    model=model4bl,
    args=TrainingArguments(
        output_dir=checkpoint_dir,
        learning_rate=2e-3,
        per_device_train_batch_size=100,
        per_device_eval_batch_size=100,
        num_train_epochs=5,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        fp16=True
    ),
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()
validation_lora_q4 = trainer.evaluate()
model4bl.save_pretrained(save_dir, save_embedding_layers=True)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at openai-community/gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 594,432 || all params: 125,039,616 || trainable%: 0.4754




Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.245774,0.911
2,No log,0.163552,0.927
3,No log,0.13895,0.929
4,0.402400,0.127697,0.9325
5,0.402400,0.122325,0.9345




In [14]:
print("Original Model accurcy:         ", original_performance['eval_accuracy'])
print("Fine-Tuned Model accurcy:       ", fine_tuned_performance['eval_accuracy'])
print("Fine-Tuned Model 4 bit accurcy: ", validation_lora_q4['eval_accuracy'])

Original Model accurcy:          0.582
Fine-Tuned Model accurcy:        0.9325
Fine-Tuned Model 4 bit accurcy:  0.9345
