<a href="https://colab.research.google.com/github/adiffloth/UND-GenAI/blob/main/gpt2_qlora_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GPT-2 QLoRA Fine-Tuning on Rotten Tomatoes data set

This notebook will:

1. Load a movie review data set from Hugging Face
2. Load the pre-trained GPT-2 model
3. Perform a baseline evaluation of GPT-2's classification performance
4. Perform QLoRA fine-tuning using 4-bit quantization from the `bitsandbytes` library
5. Perform an evaluation of the fine-tuned model's classification performance

The `cornell-movie-review-data/rotten_tomatoes` data set consists of two columns: the text of a movie review and a binary label that indicates whether the review is positive or negative. There are 8,530 records in the train split, 1,066 records in the validation split and 1,066 records in the test split. The labels are evenly split between positive and negative cases.

---
## Environment setup

In [1]:
!pip install -qqq accelerate bitsandbytes datasets evaluate peft transformers

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
import torch
import numpy as np
from datasets import load_dataset
import evaluate
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    set_seed
    )
from peft import (
    LoraConfig,
    TaskType,
    get_peft_model,
    prepare_model_for_kbit_training,
    PeftModel
    )

In [3]:
# Fix the random seed so repeated runs are comparable.
set_seed(42)

# Prefer the GPU, but fall back to the CPU so that `.to(device)` calls do not
# crash on a runtime without CUDA. This mirrors the `torch.cuda.is_available()`
# guard used when choosing the quantization compute dtype.
# NOTE(review): the 4-bit / fp16 fine-tuning cells may still require a CUDA GPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

---
## Load and prep data

In [4]:
# Load the Rotten Tomatoes movie-review data set. The result is a DatasetDict
# with 'train', 'validation' and 'test' splits, each holding a 'text' column
# and a 'label' column.
dataset = load_dataset("cornell-movie-review-data/rotten_tomatoes")
dataset  # bare expression: display the split summary as the cell output

README.md: 0.00B [00:00, ?B/s]

train.parquet:   0%|          | 0.00/699k [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})

In [5]:
# Load the tokenizer that matches the pre-trained "gpt2" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token so batches can be padded
# (presumably because the stock GPT-2 tokenizer defines no pad token — confirm).
tokenizer.pad_token = tokenizer.eos_token

def preprocess_function(examples):
    """Tokenize a batch of movie reviews.

    Args:
        examples: A batch mapping that exposes the raw review strings under
            the ``"text"`` key.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
        Sequences are truncated to at most 512 tokens and are NOT padded
        here; padding is left to the data collator at batch time.
    """
    reviews = examples["text"]
    return tokenizer(reviews, truncation=True, max_length=512, padding=False)

# Tokenize every split in batches and drop the raw 'text' column, leaving only
# the label plus the tokenizer outputs.
tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=['text'])
tokenized_datasets

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/8530 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 1066
    })
})

---
## Evaluate baseline GPT-2 model

In [6]:
# Pad dynamically at batch time; padded lengths are rounded up to a multiple
# of 8.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)

# Metric objects used by `compute_metrics` (loaded in this order: accuracy, F1).
accuracy_metric, f1_metric = (
    evaluate.load(metric_name) for metric_name in ("accuracy", "f1")
)

def compute_metrics(eval_pred):
    """Turn raw evaluation outputs into accuracy and macro-averaged F1.

    Args:
        eval_pred: A pair ``(logits, labels)``. The predicted class of each
            example is the argmax of its logits along the last axis.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1"``, computed with the
        module-level ``accuracy_metric`` and ``f1_metric`` objects.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)
    shared = {"predictions": predicted, "references": references}
    return {
        "accuracy": accuracy_metric.compute(**shared)["accuracy"],
        "f1": f1_metric.compute(average="macro", **shared)["f1"],
    }

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [7]:
# Pre-trained GPT-2 with a two-label sequence-classification head. The head is
# not part of the "gpt2" checkpoint, so it is freshly initialized (see the
# "newly initialized: ['score.weight']" message in the cell output).
baseline_model = AutoModelForSequenceClassification.from_pretrained(
    "gpt2",
    num_labels=2
    )

# Give the model the pad token id chosen on the tokenizer.
baseline_model.config.pad_token_id = tokenizer.pad_token_id
baseline_model.to(device)

# Evaluation-only arguments: no training hyperparameters, no experiment
# tracker, and keep the final partial batch so every example is scored.
baseline_args = TrainingArguments(
    output_dir="gpt2-rotten-baseline",
    per_device_eval_batch_size=8,
    dataloader_drop_last=False,
    report_to="none"
    )

# `processing_class` replaces the deprecated `tokenizer` keyword of `Trainer`.
baseline_trainer = Trainer(
    model=baseline_model,
    args=baseline_args,
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
    )

# Score the untuned model on the validation split and display the metrics.
baseline_metrics = baseline_trainer.evaluate()
baseline_metrics

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  baseline_trainer = Trainer(


{'eval_loss': 7.024363040924072,
 'eval_model_preparation_time': 0.0033,
 'eval_accuracy': 0.50093808630394,
 'eval_f1': 0.33541458658529155,
 'eval_runtime': 6.7131,
 'eval_samples_per_second': 158.794,
 'eval_steps_per_second': 19.961}

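Baseline macro-F1 score on the validation split: 0.3354 (accuracy 0.5009, i.e. chance level on this balanced data set, because the classification head is still randomly initialized)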

---
## Fine-tune the baseline model using QLoRA

In [8]:
# 4-bit NF4 quantization with double quantization. The compute dtype is
# bfloat16 when CUDA is available and float32 otherwise.
compute_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)

# Quantized GPT-2 with a two-label classification head, placed on the
# available device(s) automatically.
qlora_base_model = AutoModelForSequenceClassification.from_pretrained(
    "gpt2", num_labels=2, quantization_config=bnb_config, device_map="auto"
)
qlora_base_model.config.pad_token_id = tokenizer.pad_token_id

# Enable gradient checkpointing, then apply PEFT's k-bit training preparation.
qlora_base_model.gradient_checkpointing_enable()
qlora_base_model = prepare_model_for_kbit_training(qlora_base_model)

# LoRA adapters (rank 16, alpha 32, dropout 0.1, no bias training) on the
# modules named below.
lora_targets = ["c_attn", "c_proj", "c_fc"]
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=lora_targets,
)

# Wrap the quantized model with the adapters and report how many parameters
# will actually be trained.
qlora_model = get_peft_model(qlora_base_model, peft_config)
qlora_model.print_trainable_parameters()

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 2,360,832 || all params: 126,802,176 || trainable%: 1.8618


In [12]:
# Fine-tuning hyperparameters. Evaluation and checkpointing both happen once
# per epoch so that `load_best_model_at_end` can restore the checkpoint with
# the highest validation accuracy when training finishes.
qlora_args = TrainingArguments(
    output_dir="gpt2-rotten-qlora",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-4,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    fp16=True,
    report_to="none",
    optim="paged_adamw_8bit",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    )

# `processing_class` replaces the deprecated `tokenizer` keyword of `Trainer`.
qlora_trainer = Trainer(
    model=qlora_model,
    args=qlora_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
    )

# Run the fine-tuning loop and display the resulting TrainOutput summary.
train_result = qlora_trainer.train()
train_result

  qlora_trainer = Trainer(
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4078,0.336416,0.846154,0.845665
2,0.3422,0.314381,0.858349,0.858271
3,0.3107,0.313069,0.859287,0.859275


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=1602, training_loss=0.3484532717015413, metrics={'train_runtime': 305.0915, 'train_samples_per_second': 83.876, 'train_steps_per_second': 5.251, 'total_flos': 669115574304768.0, 'train_loss': 0.3484532717015413, 'epoch': 3.0})

In [13]:
# Re-score the fine-tuned model on the trainer's evaluation (validation) split
# and keep the metrics in a named variable, as the other evaluation cells do.
qlora_validation_metrics = qlora_trainer.evaluate()
qlora_validation_metrics

{'eval_loss': 0.3130686283111572,
 'eval_accuracy': 0.8592870544090057,
 'eval_f1': 0.8592746704967895,
 'eval_runtime': 12.8589,
 'eval_samples_per_second': 82.9,
 'eval_steps_per_second': 5.21,
 'epoch': 3.0}

Best macro-F1 score on the validation split after fine-tuning: 0.8593

---
## Save the trained model

In [14]:
# Output locations: one directory for the adapter weights (plus tokenizer
# files) and one for the underlying base model.
adapter_dir = "gpt2-rotten-qlora/adapter"
base_model_dir = "gpt2-rotten-qlora/base"

for target_dir in (adapter_dir, base_model_dir):
    os.makedirs(target_dir, exist_ok=True)

trained_model = qlora_trainer.model

# Save the PEFT-wrapped model and the tokenizer side by side ...
trained_model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)

# ... and the base model it wraps into its own directory.
trained_model.get_base_model().save_pretrained(base_model_dir)

print(f"Saved adapter weights to {adapter_dir}")
print(f"Saved base model weights to {base_model_dir}")

Saved adapter weights to gpt2-rotten-qlora/adapter
Saved base model weights to gpt2-rotten-qlora/base


---
## Reload the trained model and evaluate it

In [15]:
# Reload a fresh GPT-2 classifier WITHOUT quantization: no `quantization_config`
# is passed here, so no BitsAndBytesConfig is needed in this cell.
reloaded_base_model = AutoModelForSequenceClassification.from_pretrained(
    "gpt2",
    num_labels=2,
    device_map="auto"
    )

reloaded_base_model.config.pad_token_id = tokenizer.pad_token_id

# Attach the fine-tuned adapter saved in `adapter_dir`.
# NOTE(review): the "newly initialized: ['score.weight']" message refers to the
# fresh base model loaded above. The metrics below indicate that the trained
# classification head is restored from the adapter directory (presumably PEFT
# stores it for SEQ_CLS tasks — confirm).
reloaded_peft_model = PeftModel.from_pretrained(
    reloaded_base_model,
    adapter_dir
    )

# Fold the LoRA weights into the base weights and drop the PEFT wrapper,
# leaving a plain model for evaluation.
merged_model = reloaded_peft_model.merge_and_unload()
merged_model.to(device)

# Evaluation-only arguments; dataset columns are passed through untouched.
merged_eval_args = TrainingArguments(
    output_dir="gpt2-rotten-qlora/merged-eval",
    per_device_eval_batch_size=8,
    dataloader_drop_last=False,
    report_to="none",
    remove_unused_columns=False
    )

# `processing_class` replaces the deprecated `tokenizer` keyword of `Trainer`.
merged_trainer = Trainer(
    model=merged_model,
    args=merged_eval_args,
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
    )

# Score the merged model on the validation split and display the metrics.
merged_validation_metrics = merged_trainer.evaluate()
merged_validation_metrics

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  merged_trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.


{'eval_loss': 0.30077871680259705,
 'eval_model_preparation_time': 0.0036,
 'eval_accuracy': 0.8658536585365854,
 'eval_f1': 0.8658440958394755,
 'eval_runtime': 3.2754,
 'eval_samples_per_second': 325.455,
 'eval_steps_per_second': 40.911}

Macro-F1 score on the validation split for the reloaded, merged model: 0.8658