# Requirements

In [None]:
!pip install accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 datasets torch

In [None]:
import os
import torch
# torch.cuda.empty_cache()
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [None]:
################################################################################
# QLoRA parameters
################################################################################

# # LoRA attention dimension
# lora_r = 64

# # Alpha parameter for LoRA scaling
# lora_alpha = 16

# # Dropout probability for LoRA layers
# lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# # Activate 4-bit precision base model loading
# use_4bit = True

# # Compute dtype for 4-bit base models
# bnb_4bit_compute_dtype = "float16"

# # Quantization type (fp4 or nf4)
# bnb_4bit_quant_type = "nf4"

# # Activate nested quantization for 4-bit base models (double quantization)
# use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# # Output directory where the model predictions and checkpoints will be stored
# output_dir = "./results"

# # Number of training epochs
# num_train_epochs = 100

# # Enable fp16/bf16 training (set bf16 to True with an A100)
# fp16 = False
# bf16 = False

# # Batch size per GPU for training
# per_device_train_batch_size = 4

# # Batch size per GPU for evaluation
# per_device_eval_batch_size = 4

# # Number of update steps to accumulate the gradients for
# gradient_accumulation_steps = 1

# # Enable gradient checkpointing
# gradient_checkpointing = True

# # Maximum gradient norm (gradient clipping)
# max_grad_norm = 0.3

# # Initial learning rate (AdamW optimizer)
# learning_rate = 2e-4

# # Weight decay to apply to all layers except bias/LayerNorm weights
# weight_decay = 0.001

# # Optimizer to use
# optim = "paged_adamw_32bit"

# # Learning rate schedule (constant a bit better than cosine)
# lr_scheduler_type = "constant"

# # Number of training steps (overrides num_train_epochs)
# max_steps = -1

# # Ratio of steps for a linear warmup (from 0 to learning rate)
# warmup_ratio = 0.03

# # Group sequences into batches with same length
# # Saves memory and speeds up training considerably
# group_by_length = True

# # Save a checkpoint every X update steps
# save_steps = 25

# # Log every X update steps
# logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# # Maximum sequence length to use
# max_seq_length = None

# # Pack multiple short examples in the same input sequence to increase efficiency
# packing = False

# Dataset

In [None]:
# Identifier of the dataset to fine-tune on; only its "train" split is loaded.
dataset_name = "mlabonne/guanaco-llama2-1k"
dataset = load_dataset(
    path=dataset_name,
    split="train",
)

# Quantization

In [None]:
# Quantization settings used when loading the base model.
compute_dtype = torch.float16  # dtype used for computation on top of the 4-bit weights
use_4bit = True                # load the base model in 4-bit precision

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,                # Activate 4-bit precision base model loading
    bnb_4bit_quant_type="nf4",            # Quantization type (fp4 or nf4)
    bnb_4bit_compute_dtype=compute_dtype, # Compute dtype for 4-bit base models
    bnb_4bit_use_double_quant=False,      # Activate nested quantization for 4-bit base models (double quantization)
)

# Check GPU compatibility with bfloat16.
# This is only an informational hint, so it must not crash the cell: the device
# capability is queried only when a CUDA device is actually available.
if compute_dtype == torch.float16 and use_4bit and torch.cuda.is_available():
    major, _ = torch.cuda.get_device_capability()
    # Compute capability major version >= 8 is the threshold this notebook uses for bf16 support.
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Model

In [None]:
# Checkpoint that serves as the base model for fine-tuning
model_name = "NousResearch/llama-2-7b-chat-hf"

# Instantiate the causal-LM with the quantization settings from `bnb_config`.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map={"": 0},              # place the entire model on GPU 0
    quantization_config=bnb_config,  # apply the bitsandbytes configuration at load time
)

# Adjust the loaded model's config in place
model.config.pretraining_tp = 1
model.config.use_cache = False

# Tokenizer

In [None]:
# Load the tokenizer that matches the base model.
# `trust_remote_code` is intentionally not enabled: this checkpoint resolves to the
# library's built-in Llama tokenizer, so there is no need to allow code shipped in
# the model repository to be executed.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenizer configs
tokenizer.pad_token = tokenizer.eos_token  # reuse the end-of-sequence token for padding
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

# PEFT & LoRA

In [None]:
# LoRA adapter configuration handed to the trainer
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,                # LoRA attention dimension
    lora_alpha=16,       # alpha parameter for LoRA scaling
    lora_dropout=0.1,    # dropout probability for LoRA layers
    bias="none",
)

# Fine-Tuning

In [None]:
# Training hyper-parameters, grouped by concern
training_arguments = TrainingArguments(
    # --- output, checkpointing and logging ---
    output_dir="./results",           # where model predictions and checkpoints are stored
    save_steps=25,                    # save a checkpoint every X update steps
    logging_steps=25,                 # log every X update steps
    report_to="tensorboard",
    # --- run length and batching ---
    num_train_epochs=100,             # number of training epochs
    max_steps=-1,                     # number of training steps (overrides num_train_epochs)
    per_device_train_batch_size=4,    # batch size per GPU for training
    gradient_accumulation_steps=1,    # number of update steps to accumulate the gradients for
    group_by_length=True,             # group sequences of similar length into batches (saves memory, speeds up training)
    # --- optimizer and learning-rate schedule ---
    optim="paged_adamw_32bit",        # optimizer to use
    learning_rate=2e-4,               # initial learning rate (AdamW optimizer)
    weight_decay=0.001,               # weight decay applied to all layers except bias/LayerNorm weights
    max_grad_norm=0.3,                # maximum gradient norm (gradient clipping)
    lr_scheduler_type="constant",     # learning rate schedule (constant a bit better than cosine)
    # NOTE(review): with the plain "constant" schedule, warmup_ratio may have no effect
    # ("constant_with_warmup" looks like the variant that warms up) — confirm.
    warmup_ratio=0.03,                # ratio of steps for a linear warmup (from 0 to learning rate)
    # --- mixed precision ---
    fp16=False,                       # enable fp16/bf16 training (set bf16 to True with an A100)
    bf16=False,
)

In [None]:
# Assemble the supervised fine-tuning trainer from the objects built above
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=dataset,
    dataset_text_field="text",    # dataset field that is read as the training text
    packing=False,                # pack multiple short examples in the same input sequence to increase efficiency
    max_seq_length=None,          # maximum sequence length to use
)

In [None]:
# Train model: run the fine-tuning loop configured on `trainer`.
# Kept as the cell's last expression so the notebook displays whatever `train()` returns.
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,1.3467
50,1.6114
75,1.2076
100,1.4338
125,1.178
150,1.3563
175,1.171
200,1.4529
225,1.1538
250,1.5218




In [None]:
# Write the trained model held by the trainer to a local directory
new_model = "llama-2-7b-miniguanaco"  # target directory name
trainer.model.save_pretrained(save_directory=new_model)