# LoRA tuning for binary classification

Model details
- Llama 2 7B (`meta-llama/Llama-2-7b-hf`), loaded as `LlamaForSequenceClassification` via Hugging Face Transformers
- ~~with 4bit Quantization~~

Data used
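- Kaggle dataset, preprocessed and stored at `data/processed_data_25tweets`; each example carries a 4-dimension MBTI label, of which one dimension (e.g. E/I) is used as the binary target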

Training hyperparameters (tentative):
- Epochs = 5
- Batch size = 4
- gradient_accumulation_steps = 2
- Learning rate = 1e-4
- Warm-up steps = 100, followed by linear learning-rate decay

## Set up

In [1]:
#!pip install -q transformers datasets bitsandbytes accelerate evaluate
#!pip install -q git+https://github.com/huggingface/peft.git@main
#!pip install wandb # Optional: if you have an account for wandb

In [3]:
# Check the GPU status
!nvidia-smi

In [None]:
import re
import json
from tqdm.notebook import tqdm

import pandas as pd
from sklearn.model_selection import train_test_split

import torch
from datasets import Dataset, DatasetDict, load_from_disk, load_metric
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)

from peft import (
    PeftModel,
    PeftConfig,
    LoraConfig,
    get_peft_model,
)

from accelerate import Accelerator
import wandb

In [None]:
# Log in to the Hugging Face Hub before loading the model and tokenizer
# (presumably required because the meta-llama checkpoint is access-gated — confirm).
from huggingface_hub import notebook_login
notebook_login()

In [4]:
# Create the Accelerator instance.
# NOTE(review): `accelerator` is not referenced again in the visible cells — confirm it is still needed.
accelerator = Accelerator()

## Load and tokenize dataset for LoRA tuning

In [5]:
# Run parameters
model_name_or_path = "meta-llama/Llama-2-7b-hf"  # checkpoint used for both the tokenizer and the model
SEED = 42  # seed used when shuffling the dataset
MBTI_dimension = 0 # index (0-3) of the label dimension used as the binary target: 0 -> E/I, 1 -> N/S, ...

# Load the preprocessed dataset from disk and shuffle it with a fixed seed
dataset = load_from_disk("data/processed_data_25tweets").shuffle(seed=SEED)

#############################
# Added to change to binary classification

# Extract Nth dimension from 4-dimension labels
def extract_nth_dimension(example, n):
    """Reduce a multi-dimension label to its n-th component.

    Args:
        example: Mapping with a ``'label'`` entry that supports indexing
            (e.g. a 4-character MBTI string).
        n: Index of the dimension to keep.

    Returns:
        A dict ``{'label': <n-th element of the original label>}``.
    """
    return {'label': example['label'][n]}

# Replace every example's label with the single dimension selected by MBTI_dimension
dataset = dataset.map(lambda row: extract_nth_dimension(row, MBTI_dimension))
#############################

# Lookup tables between label values and integer class ids (ids follow the sorted label order)
label2id = {name: index for index, name in enumerate(sorted(set(dataset['train']['label'])))}
id2label = {index: name for name, index in label2id.items()}


# Load the tokenizer for the base checkpoint; padding is added on the right
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side="right")
# When the tokenizer defines no pad token, reuse the EOS token id for padding
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

def get_tokenized(dataset, tokenizer, label_map=None, num_proc=4):
    """Tokenize every example's text and attach its integer class id.

    Args:
        dataset: Object exposing ``map`` and a ``column_names`` mapping with a
            ``"train"`` entry. Each example must provide ``"text"`` and
            ``"label"``.
        tokenizer: Callable that returns an object with an ``input_ids``
            attribute when applied to a text.
        label_map: Optional mapping from label value to class id. When
            omitted, the module-level ``label2id`` is used, so existing
            two-argument calls keep working.
        num_proc: Number of worker processes passed to ``dataset.map``.

    Returns:
        The result of ``dataset.map``: examples holding ``input_ids`` and
        ``labels``, with the original ``"train"`` columns removed.

    Raises:
        KeyError: If an example's label is missing from the label mapping.
    """
    # Resolve the mapping once so the per-example function has no hidden global lookup.
    mapping = label2id if label_map is None else label_map

    def tokenize_add_label(example):
        # Called once per example (map is not batched here). Only the new
        # columns are returned; the source columns are dropped via remove_columns.
        return {
            "input_ids": tokenizer(example["text"]).input_ids,
            "labels": mapping[example["label"]],
        }

    return dataset.map(
        tokenize_add_label,
        remove_columns=dataset.column_names["train"],
        num_proc=num_proc,
    )

tokenized_dataset = get_tokenized(dataset, tokenizer)
# Hold out 10% of the training split for evaluation during fine-tuning
# ("train" -> training data, "test" -> evaluation data). The split is seeded
# with SEED so the same examples are held out on every run.
train_data = tokenized_dataset['train'].train_test_split(test_size=0.1, seed=SEED)

print(train_data)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 7807
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 868
    })
})


## Build model and set configuration for LoRA

In [7]:
# Load the base checkpoint as a sequence classifier with one output per label
model = AutoModelForSequenceClassification.from_pretrained(
    model_name_or_path,
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    use_flash_attention_2=True,
)
# When the model config defines no pad token id, fall back to its EOS token id
if model.config.pad_token_id is None:
    model.config.pad_token_id = model.config.eos_token_id

# LoRA settings for the sequence-classification task
config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    task_type="SEQ_CLS",
    inference_mode=False,
)

# Wrap the model with LoRA and print how many parameters are trainable
model = get_peft_model(model, config)
model.print_trainable_parameters()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 4,259,840 || all params: 6,611,668,992 || trainable%: 0.06442911774854926


## Implement training

In [8]:
"""
Saving Directories
"""
# NOTE: the "lr-1e-4" suffix mirrors LR below and must be kept in sync by hand.
OUTPUT_DIR = "./lora_result_25tweets_lr-1e-4"
# Defined with the directories because the final-state directory name depends on it.
NUM_EPOCHS = 5
# Derived from OUTPUT_DIR and NUM_EPOCHS so the path cannot drift from either value.
SAVE_DIR_FOR_LAST_STATE = f"{OUTPUT_DIR}/after-{NUM_EPOCHS}epoch"


# Training arguments / hyperparameters for training
TRAIN_BATCH_SIZE = 4
EVAL_BATCH_SIZE = 4
GRAD_ACCUM_STEPS = 2
LR = 1e-4
WARMUP_STEPS = 100
LR_SCHEDULER = "linear"


# Log in to Weights & Biases and start a run so the training log can be visualized.
# NOTE: although intended as optional, these calls run unconditionally and the
# TrainingArguments in the next cell set report_to="wandb".
wandb.login()
wandb.init(project="capstone-llama2-finetuning")

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/jupyter/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mya2488[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [9]:
# Implement LoRA tuning: configure the Trainer, then run fine-tuning

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,    # double this for every halving of the batch size to keep the effective batch size
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LR,
    bf16=True,                        # matches the torch.bfloat16 dtype the model was loaded with
    warmup_steps=WARMUP_STEPS,
    logging_steps=200,
    save_steps=200,
    evaluation_strategy="steps",      # NOTE(review): no eval_steps is given — presumably it falls back to logging_steps; confirm in the TrainingArguments docs
    #max_steps=STEPS,                 # only for testing purposes, remove this from your final run :)
    remove_unused_columns=False,      # required as the PeftModel forward doesn't have the signature of the wrapped model's forward
    label_names=["labels"],           # same reason as above; "labels" is the column created during tokenization
    group_by_length=True,             # batch sequences of similar length together; saves memory and speeds up training considerably
    lr_scheduler_type=LR_SCHEDULER,     
    ddp_find_unused_parameters=False,
    report_to="wandb",                # send training logs to the wandb run started earlier
)

# Pad every batch to the length of its longest sequence
data_collator = DataCollatorWithPadding(tokenizer, padding="longest")

# train_data["train"] is used for fine-tuning, train_data["test"] for evaluation
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_data["train"],
    eval_dataset=train_data["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    callbacks=[],
)

# Run fine-tuning
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
100,3.528,2.929694
200,2.7204,2.694715
300,2.3443,2.518287
400,1.8109,1.542087
500,1.3064,1.239815
600,1.2267,1.143519
700,1.1495,1.069764
800,1.0981,1.00972
900,0.9467,0.960054
1000,0.9811,0.962917


TrainOutput(global_step=4780, training_loss=0.809502499871673, metrics={'train_runtime': 36055.0933, 'train_samples_per_second': 1.061, 'train_steps_per_second': 0.133, 'total_flos': 1.6100942388534804e+18, 'train_loss': 0.809502499871673, 'epoch': 5.0})

In [11]:
# Save the weights from the final training state to their own directory
# (presumably only the LoRA adapter and classification head, since the model is PEFT-wrapped — verify).
trainer.save_model(SAVE_DIR_FOR_LAST_STATE)