Copyright (c) Meta Platforms, Inc. and affiliates.
This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.

<a href="https://colab.research.google.com/github/meta-llama/llama-cookbook/blob/main/getting-started/finetuning/quickstart_peft_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## PEFT Finetuning Quick Start Notebook

This notebook shows how to fine-tune a Llama 3–based model (LLaMAX3-8B) on a single GPU (e.g. A10 with 24GB) using 4-bit (NF4) quantization and LoRA finetuning.

**_Note:_** To run this notebook on a machine with less than 24GB VRAM (e.g. T4 with 16GB) the context length of the training dataset needs to be adapted.
We do this based on the available VRAM during execution.
If you run into OOM issues, try further lowering `TRAIN_MAX_SEQ_LENGTH` (the maximum training sequence length) or `TRAIN_BATCH_SIZE`.

### Step 0: Install pre-requirements and convert checkpoint

We need to have llama-cookbook and its dependencies installed for this notebook. Additionally, we need to log in with the huggingface_cli and make sure that the account is able to access the Meta Llama weights.

In [None]:
# uncomment if running from Colab T4
# ! pip install llama-cookbook ipywidgets

# import huggingface_hub
# huggingface_hub.login()

### Step 1: Load the model

Setup training configuration and load the model and tokenizer.

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os,torch
import bitsandbytes as bnb
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig
from datasets import Dataset
import pyarrow as pa
import pyarrow.dataset as ds
import pandas as pd
import re
import wandb
from huggingface_hub import login
# Authenticate with the Hugging Face Hub. The access token is read from the
# HF_TOKEN environment variable so that no credential is stored in the notebook.
# SECURITY: a token used to be hardcoded on this line; treat it as leaked and
# revoke it in the Hugging Face account settings.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    # No token in the environment: fall back to the interactive login prompt.
    login()

# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model_name = "LLaMAX/LLaMAX3-8B"

# Load the base model quantized according to bnb_config; device_map="auto"
# lets the library decide where to place the weights.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token for padding
# (presumably the checkpoint ships without a dedicated pad token — confirm).
tokenizer.pad_token = tokenizer.eos_token

### Step 2: Check base model

Run the base model on an example input:

We can see that the base model only repeats the conversation.

### Step 3: Load the preprocessed dataset

We load the training split (question/answer pairs stored in `train_split.xlsx`) and format each pair, together with a fixed system prompt, using the tokenizer's chat template:

In [None]:
# System prompt prepended to every training conversation. The text (including
# its leading newline and indentation) is passed to the chat template verbatim.
_SYSTEM_PROMPT = """
    You are a Palestinian historian and geographer specialising in the land, people, and cultural heritage of Palestine.

    General task
    Answer questions or carry out instructions accurately and neutrally while preserving the authentic Palestinian narrative.
    Use Modern Standard Arabic unless the user asks for another language.

    Narrative-integrity rules
    Give the Arabic place-name first; add other names in brackets when needed.
    Avoid terms that erase Palestinian identity (e.g. “Judea and Samaria”) unless requested for comparison.
    Present the Palestinian account first, then mention other viewpoints briefly if relevant.
    Replace every standalone occurrence of “Israel” with “the Israeli occupation” or “the occupying entity”.

    Accuracy rules
    Do not state information without a reliable source.

    Output format
    If asked to “extract a span”, return that text only.
    Otherwise:
    • Optional one-line intro, then short bullets or mini paragraphs.
    • End with “References”.
    Do not exceed 150 words unless the user asks for more.

    Uncertainty
    If no trustworthy data exist, state: “No reliable data available as of the last update.”
    """


def format_example(row: dict):
    """Render one question/answer row as a single chat-formatted string.

    Args:
        row: Mapping-like record providing the "Question" and "Answer" fields.

    Returns:
        The result of ``tokenizer.apply_chat_template`` (called with
        ``tokenize=False``) for a system / user / assistant conversation.
    """
    system_turn = {"role": "system", "content": _SYSTEM_PROMPT}
    user_turn = {"role": "user", "content": row["Question"]}
    assistant_turn = {"role": "assistant", "content": row["Answer"]}
    conversation = [system_turn, user_turn, assistant_turn]
    return tokenizer.apply_chat_template(conversation, tokenize=False)


# Load the training spreadsheet into a pandas DataFrame.
dataset = pd.read_excel("train_split.xlsx")

In [None]:
# `df` was never defined (the spreadsheet is loaded as `dataset`). Work on a
# copy so the exploratory "text"/"token_count" columns do not end up in the
# frame that is later split into training and validation sets.
df = dataset.copy()
# Render every row with the chat template and show the first result as a sanity check.
df["text"] = df.apply(format_example, axis=1)
print(df["text"].iloc[0])

In [None]:
def count_tokens(row: dict) -> int:
    """Return how many token ids the tokenizer produces for ``row["text"]``.

    Args:
        row: Mapping-like record providing the already formatted "text" field.

    Returns:
        Length of the ``input_ids`` list, special tokens included.
    """
    encoded = tokenizer(
        row["text"],
        add_special_tokens=True,
        # The keyword is `return_attention_mask` (singular); the misspelled
        # `return_attention_masks` is not a tokenizer parameter.
        return_attention_mask=False,
    )
    return len(encoded["input_ids"])


# Token count per formatted example, used for the length histogram.
df["token_count"] = df.apply(count_tokens, axis=1)


In [None]:
# These names were used without being imported anywhere in the notebook.
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import PercentFormatter

token_counts = df["token_count"]
# Weight every sample by 1/N so bar heights are fractions of the dataset.
sample_weights = np.ones(len(token_counts)) / len(token_counts)
plt.hist(token_counts, weights=sample_weights)
# xmax=1 because the bar heights are fractions in [0, 1].
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
plt.xlabel("tokens")
plt.ylabel("Percentage")
plt.show()

### Step 4: Prepare model for PEFT

Let's prepare the model for Parameter Efficient Fine Tuning (PEFT):

In [None]:
from peft import get_peft_model, prepare_model_for_kbit_training, LoraConfig
from dataclasses import asdict
# LoRA hyperparameters (labelled as the CIDAR configuration by the notebook author).
LORA_R, LORA_ALPHA, LORA_DROPOUT = 32, 64, 0.1

# Names of the modules that receive LoRA adapters.
lora_target_modules = ["q_proj", "v_proj", "k_proj", "o_proj"]

peft_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    target_modules=lora_target_modules,
    task_type="CAUSAL_LM",
    bias="none",
)

# Prepare the quantized base model for k-bit training, then wrap it with the
# LoRA adapters described by peft_config.
model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)

In [None]:

# Hyperparameter configuration for the fine-tuning run.

# Quantization / memory switches.
QUANTIZE_4BIT = True
USE_GRAD_CHECKPOINTING = True
USE_FLASH_ATTENTION = True

# Batch geometry: each optimizer step accumulates
# TRAIN_BATCH_SIZE * GRAD_ACC_STEPS = 256 sequences per device.
TRAIN_BATCH_SIZE = 4
GRAD_ACC_STEPS = 64
TRAIN_MAX_SEQ_LENGTH = 1024
PER_DEVICE_EVAL_BATCH_SIZE = 4
# Backward-compatible alias for the original, inconsistently cased name.
PER_DEVICE_EVAL_BATCH_Size = PER_DEVICE_EVAL_BATCH_SIZE
GROUP_BY_LENGTH = True

# LoRA adapter settings.
LORA_R = 32
LORA_ALPHA = 64
LORA_DROPOUT = 0.1

# Optimisation schedule.
NUM_TRAIN_EPOCHS = 1
LEARNING_RATE = 4e-4
OPTIM = "paged_adamw_8bit"
WARMUP_RATIO = 0.08
WEIGHT_DECAY = 0.001

In [None]:

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; shuffle with a fixed seed so the
# split is reproducible.
split_options = {"test_size": 0.2, "random_state": 42, "shuffle": True}
train_df, val_df = train_test_split(dataset, **split_options)

# Convert both pandas frames into `Dataset` objects without carrying over the
# pandas index.
train_dataset, eval_dataset = (
    Dataset.from_pandas(frame, preserve_index=False)
    for frame in (train_df, val_df)
)

### Step 5: Fine tune the model

Here, we fine tune the model for a single epoch.

In [None]:


# Use bf16 mixed precision when the GPU supports it, otherwise fall back to fp16.
bf16_supported = torch.cuda.is_bf16_supported()

# Training configuration, driven by the hyperparameter constants defined above.
training_arguments = SFTConfig(
    output_dir="results",
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    # The eval batch-size constant was defined but never passed to the config.
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_Size,
    gradient_accumulation_steps=GRAD_ACC_STEPS,
    gradient_checkpointing=USE_GRAD_CHECKPOINTING,
    optim=OPTIM,
    save_steps=50,
    logging_steps=10,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    fp16=not bf16_supported,
    bf16=bf16_supported,
    warmup_ratio=WARMUP_RATIO,
    # Use the constant instead of a hardcoded True.
    group_by_length=GROUP_BY_LENGTH,
    lr_scheduler_type="linear",
    report_to="none",
    max_seq_length=TRAIN_MAX_SEQ_LENGTH,
    evaluation_strategy="epoch",
)

# `model` was already wrapped with the LoRA adapters by get_peft_model in the
# PEFT preparation step, so no peft_config is passed here: handing the trainer
# both a PeftModel and a peft_config is redundant.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    formatting_func=format_example,
    train_dataset=train_dataset,
    # The validation split is named `eval_dataset`; `val_dataset` was never defined.
    eval_dataset=eval_dataset,
)

In [None]:
# Start training. Checkpoints are written to the configured output directory
# ("results") every 50 steps; nothing is pushed to the Hub because the training
# arguments do not enable push_to_hub.
trainer.train()

### Step 6:
Save model checkpoint

In [None]:
# Write the fine-tuned model to the local "LLaMAX3-qlora" directory.
# NOTE(review): `model` was wrapped by get_peft_model earlier, so this presumably
# stores only the LoRA adapter weights rather than a merged full model — confirm.
model.save_pretrained(save_directory='LLaMAX3-qlora')

### Step 7:
Try the fine tuned model on the same example again to see the learning progress: