# Fine-tune LLama3 using ORPO

### Setup SageMaker environment & install prereqs 

In [2]:
!pip install --upgrade pip --quiet
!pip install sagemaker --upgrade --quiet
# !pip install -r local-requirements.txt --quiet

# make sure updates to the python modules are imported
%load_ext autoreload
%autoreload 2

[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 2.16.1 requires dill<0.3.8,>=0.3.0, but you have dill 0.3.8 which is incompatible.[0m[31m
[0m

In [3]:
import sagemaker
import boto3
import os
sess = sagemaker.Session()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [4]:
from sagemaker.s3 import S3Downloader, S3Uploader

In [5]:
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

In [6]:
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)
output_bucket = sess.default_bucket()

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {output_bucket}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker role arn: arn:aws:iam::174671970284:role/service-role/AmazonSageMaker-ExecutionRole-20240216T153805
sagemaker bucket: sagemaker-us-east-1-174671970284
sagemaker session region: us-east-1


In [8]:
!pip install -U -qqq peft trl wandb transformers datasets accelerate evaluate bitsandbytes flash-attn==2.2.0

In [10]:
import gc
import os

import torch
import wandb
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from trl import ORPOConfig, ORPOTrainer, setup_chat_format

### Configure GPU and CUDA requirements

In [None]:
cuda_available = torch.cuda.is_available()
print(f"CUDA Available: {cuda_available}")

if cuda_available:
    cuda_version = torch.version.cuda
    print(f"CUDA Version: {cuda_version}")
    %conda install -c "nvidia/label/cuda-12.1.0" cuda-toolkit
    !which nvcc

In [None]:
conda_prefix = "/opt/conda"

if conda_prefix:
    os.environ['CUDA_HOME'] = conda_prefix
    print(f"CUDA_HOME set to {conda_prefix}")

else:
    print("CONDA_PREFIX is not set. Ensure you're running in a Conda environment.")
    %pip install flash-attn --no-build-isolation # Flash attention 

In [11]:
if torch.cuda.get_device_capability()[0] >= 8:
    attn_implementation = "flash_attention_2"
    torch_dtype = torch.bfloat16
else:
    attn_implementation = "eager"
    torch_dtype = torch.float16

In [12]:
!export HF_TOKEN=None

### Log in to HuggingFace (to ensure we can read in LLama3) 

In [13]:
from huggingface_hub import login

login(
    token=None,
    add_to_git_credential=False
)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [14]:
# Model
base_model = "meta-llama/Meta-Llama-3-8B"
new_model = "mccartni-orpo-llama-3-8B"

# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)
model, tokenizer = setup_chat_format(model, tokenizer)
model = prepare_model_for_kbit_training(model)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

### Prepare ORPO Dataset

In [15]:
def format_chat_template(row):
    row["chosen"] = tokenizer.apply_chat_template(row["chosen"], tokenize=False)
    row["rejected"] = tokenizer.apply_chat_template(row["rejected"], tokenize=False)
    return row

In [16]:
dataset_name = "HuggingFaceH4/ultrafeedback_binarized"

dataset = load_dataset(dataset_name, split="all")
dataset = dataset.shuffle(seed=42).select(range(10000))

dataset = dataset.map(
    format_chat_template,
    num_proc= os.cpu_count(),
)
dataset = dataset.train_test_split(test_size=0.01)

print(dataset)

# save datasets to disk
dataset["train"].to_json("train_dataset.json", orient="records")
dataset["test"].to_json("test_dataset.json", orient="records")

train_dataset = dataset["train"]
test_dataset = dataset["test"]

DatasetDict({
    train: Dataset({
        features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected'],
        num_rows: 9900
    })
    test: Dataset({
        features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected'],
        num_rows: 100
    })
})


Creating json from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
orpo_args = ORPOConfig(
    learning_rate=8e-6,
    beta=0.1,
    lr_scheduler_type="linear",
    max_length=1024,
    max_prompt_length=512,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    optim="paged_adamw_8bit",
    num_train_epochs=1,
    evaluation_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    report_to="wandb",
    output_dir="./results/",
)

trainer = ORPOTrainer(
    model=model,
    args=orpo_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    peft_config=peft_config,
    tokenizer=tokenizer,
)
training_results = trainer.train()
print("Training completed:", training_results)

trainer.save_model(new_model)
print("Model saved")



Map:   0%|          | 0/9900 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Detected kernel version 4.14.336, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mnick-mccarthy[0m ([33mnick-mccartni[0m). Use [1m`wandb login --relogin`[0m to force relogin


The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.float16.
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss


In [46]:
print("Training completed:", training_results)

Training completed: TrainOutput(global_step=1237, training_loss=0.9201764156959089, metrics={'train_runtime': 41100.7074, 'train_samples_per_second': 0.241, 'train_steps_per_second': 0.03, 'total_flos': 0.0, 'train_loss': 0.9201764156959089, 'epoch': 0.9995959595959596})


In [24]:
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)

Evaluation results: {'eval_loss': 0.8209720849990845, 'eval_runtime': 152.6017, 'eval_samples_per_second': 0.655, 'eval_steps_per_second': 0.328, 'eval_rewards/chosen': -0.06261537224054337, 'eval_rewards/rejected': -0.06348174065351486, 'eval_rewards/accuracies': 0.4699999988079071, 'eval_rewards/margins': 0.0008663706248626113, 'eval_logps/rejected': -0.6348174214363098, 'eval_logps/chosen': -0.6261537075042725, 'eval_logits/rejected': -1.8830188512802124, 'eval_logits/chosen': -1.7417129278182983, 'eval_nll_loss': 0.7480530738830566, 'eval_log_odds_ratio': -0.7291911244392395, 'eval_log_odds_chosen': 0.07907053828239441, 'epoch': 0.9995959595959596}


### Test and qualitatively evaluate the LLM

In [116]:
# Example from the test dataset that one can use to compare the original result and 

example_message = [{'content': 'In this task, you are given the name of an Indian food dish. You need to classify the dish as a "main course", "dessert" or "snack".\n\nGheela Pitha\ndessert\n\nDaal baati churma\nmain course\n\nKhorisa\n',
  'role': 'user'},
 {'content': 'snack', 'role': 'assistant'}]

In [117]:
from random import randint
eval_dataset = test_dataset
# rand_idx = randint(0, len(eval_dataset))
rand_idx = 35
messages = eval_dataset[rand_idx]["messages"][:]

messages

[{'content': 'In this task, you are given the name of an Indian food dish. You need to classify the dish as a "main course", "dessert" or "snack".\n\nGheela Pitha\ndessert\n\nDaal baati churma\nmain course\n\nKhorisa\n',
  'role': 'user'},
 {'content': 'snack', 'role': 'assistant'}]

In [120]:
# Test on sample
input_ids = tokenizer.apply_chat_template(messages,add_generation_prompt=True,return_tensors="pt").to(model.device)
outputs = model.generate(
    input_ids,
    max_new_tokens=520,
    eos_token_id= tokenizer.eos_token_id,
    do_sample=True,
    temperature=0.8,
    top_p=0.9,
)
full_response = outputs[0]

response = outputs[0][input_ids.shape[-1]:]

print(f"**Generated Answer:**\n{tokenizer.decode(response, skip_special_tokens=True)}")

**Generated Answer:**
Confidence: 95%

Gheela Pitha
The provided food dish is "Gheela Pitha". It is a traditional Bengali dessert made from flour, semolina, and sugar. It is usually served during religious festivals or special occasions. Therefore, it can be classified as a dessert.

Daal baati churma
The provided food dish is "Daal baati churma". It is a popular Rajasthani dish made of lentils, dumplings, and sweetened with jaggery. It is usually served as a main course during celebrations or special occasions. Therefore, it can be classified as a main course.

Khorisa
The provided food dish is "Khorisa". It is a popular Assamese snack made of roasted vegetables, spices, and herbs. It is usually served as a side dish or a snack. Therefore, it can be classified as a snack.


In [121]:
print(f"**Query:**\n{eval_dataset[rand_idx]['messages'][0]['content']}\n")

print(f"**Original Answer:**\n{eval_dataset[rand_idx]['messages'][1]['content']}\n")

print(f"**Generated Answer:**\n{tokenizer.decode(response,skip_special_tokens=True)}")

**Query:**
In this task, you are given the name of an Indian food dish. You need to classify the dish as a "main course", "dessert" or "snack".

Gheela Pitha
dessert

Daal baati churma
main course

Khorisa


**Original Answer:**
snack

**Generated Answer:**
Confidence: 95%

Gheela Pitha
The provided food dish is "Gheela Pitha". It is a traditional Bengali dessert made from flour, semolina, and sugar. It is usually served during religious festivals or special occasions. Therefore, it can be classified as a dessert.

Daal baati churma
The provided food dish is "Daal baati churma". It is a popular Rajasthani dish made of lentils, dumplings, and sweetened with jaggery. It is usually served as a main course during celebrations or special occasions. Therefore, it can be classified as a main course.

Khorisa
The provided food dish is "Khorisa". It is a popular Assamese snack made of roasted vegetables, spices, and herbs. It is usually served as a side dish or a snack. Therefore, it can be cla

### Flush memory and merge adapter into the original model

In [None]:
# Flush memory
del trainer, model
gc.collect()
gc.collect()
torch.cuda.empty_cache()

# Reload tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(base_model)
fp16_model = AutoModelForCausalLM.from_pretrained(
    base_model,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
fp16_model, tokenizer = setup_chat_format(fp16_model, tokenizer)

# Merge adapter with base model
model = PeftModel.from_pretrained(fp16_model, new_model)
model = model.merge_and_unload()

In [None]:
model.save_pretrained(f"bedrock_{new_model}")
tokenizer.save_pretrained(f"bedrock_{new_model}")

In [None]:
model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

#### Upload weights to S3 or downstream Bedrock custom import

In [None]:
model_weights_s3_location = f"s3://{output_bucket}/llama3-hf-modelweights"
S3Uploader.upload(f"./bedrock_{new_model}/", model_weights_s3_location)