In [1]:
# install Unsloth and dependencies
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [2]:
# load Unsloth Model
from unsloth import FastLanguageModel
import torch

max_seq_length = 4096
dtype = None
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch SmolVLMForConditionalGeneration forward function.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.4.1: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

In [3]:
# add LoRA adapters
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2025.4.1 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [4]:
# load a Reward Modeling dataset
from datasets import load_dataset

dataset = load_dataset("Anthropic/hh-rlhf", split="train[:1%]")  # Small slice for fast demo

README.md:   0%|          | 0.00/5.77k [00:00<?, ?B/s]

train.jsonl.gz:   0%|          | 0.00/13.2M [00:00<?, ?B/s]

train.jsonl.gz:   0%|          | 0.00/16.2M [00:00<?, ?B/s]

train.jsonl.gz:   0%|          | 0.00/20.1M [00:00<?, ?B/s]

train.jsonl.gz:   0%|          | 0.00/25.7M [00:00<?, ?B/s]

test.jsonl.gz:   0%|          | 0.00/743k [00:00<?, ?B/s]

test.jsonl.gz:   0%|          | 0.00/875k [00:00<?, ?B/s]

test.jsonl.gz:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

test.jsonl.gz:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/160800 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8552 [00:00<?, ? examples/s]

In [5]:
# format the dataset
EOS_TOKEN = tokenizer.eos_token

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

def format_prompt(sample):
    sample["prompt"] = alpaca_prompt.format("Follow the task carefully.", "", "")
    sample["chosen"] = sample["chosen"] + EOS_TOKEN
    sample["rejected"] = sample["rejected"] + EOS_TOKEN
    return sample

dataset = dataset.map(format_prompt)

Map:   0%|          | 0/1608 [00:00<?, ? examples/s]

In [6]:
# patch trainer for unsloth
from unsloth import PatchDPOTrainer
PatchDPOTrainer()

# setup ORPOTrainer
from trl import ORPOConfig, ORPOTrainer
from unsloth import is_bfloat16_supported

trainer = ORPOTrainer(
    model = model,
    train_dataset = dataset,
    tokenizer = tokenizer,
    args = ORPOConfig(
        max_length = max_seq_length,
        max_prompt_length = max_seq_length//2,
        max_completion_length = max_seq_length//2,
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        beta = 0.1,
        logging_steps = 1,
        optim = "adamw_8bit",
        lr_scheduler_type = "linear",
        max_steps = 30,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        output_dir = "outputs",
        report_to = "none",
    ),
)

trainer.train()



Map (num_proc=12):   0%|          | 0/1608 [00:00<?, ? examples/s]

Map (num_proc=12):   0%|          | 0/1608 [00:00<?, ? examples/s]

Map (num_proc=12):   0%|          | 0/1608 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,608 | Num Epochs = 1 | Total steps = 30
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen,log_odds_ratio,log_odds_chosen,eval_logits / chosen,eval_logits / rejected,nll_loss
1,11.4257,-0.250587,-0.260891,0.625,0.010305,-2.608915,-2.505869,-0.978484,-1.050771,-0.646974,0.110838,0,0,2.791721
2,10.9003,-0.229243,-0.259738,0.875,0.030496,-2.597383,-2.292428,-1.199643,-1.261881,-0.573346,0.320377,No Log,No Log,2.667738
3,11.6259,-0.248083,-0.239316,0.75,-0.008767,-2.393162,-2.480834,-1.080697,-1.08001,-0.767003,-0.087255,No Log,No Log,2.829784
4,10.6846,-0.245826,-0.24297,0.5,-0.002857,-2.429699,-2.458264,-1.058668,-0.97786,-0.724468,-0.030883,No Log,No Log,2.598693
5,10.2186,-0.222293,-0.226232,0.625,0.003939,-2.262321,-2.222934,-1.057455,-0.990702,-0.674259,0.041686,No Log,No Log,2.487219
6,10.083,-0.224061,-0.218745,0.375,-0.005316,-2.187455,-2.240611,-1.053939,-0.878028,-0.726457,-0.060199,No Log,No Log,2.448105
7,9.0032,-0.205277,-0.202572,0.625,-0.002705,-2.02572,-2.05277,-1.025667,-0.858779,-0.709309,-0.02781,No Log,No Log,2.179859
8,9.7983,-0.22487,-0.223624,0.5,-0.001247,-2.236236,-2.248701,-1.042608,-0.760805,-0.703904,-0.013187,No Log,No Log,2.379186
9,9.7506,-0.24496,-0.252915,0.75,0.007956,-2.529153,-2.449595,-0.741007,-0.939986,-0.656764,0.083276,No Log,No Log,2.371982
10,9.0525,-0.221191,-0.214631,0.375,-0.00656,-2.146312,-2.211907,-0.819632,-0.925007,-0.731543,-0.070821,No Log,No Log,2.189982


TrainOutput(global_step=30, training_loss=8.408188962936402, metrics={'train_runtime': 69.1844, 'train_samples_per_second': 3.469, 'train_steps_per_second': 0.434, 'total_flos': 0.0, 'train_loss': 8.408188962936402, 'epoch': 0.14925373134328357})

In [7]:
# inference
FastLanguageModel.for_inference(model)

inputs = tokenizer(
    [
        alpaca_prompt.format(
            "Explain how the reward modeling works in AI systems.",
            "",
            ""
        )
    ],
    return_tensors="pt",
).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=128, use_cache=True)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Explain how the reward modeling works in AI systems.

### Input:


### Response:
AI systems use reward modeling to learn how to perform tasks by receiving feedback from their environment. The feedback is used to adjust the system's behavior and improve its performance over time. Reward modeling involves defining a reward function that describes the desired behavior of the system and the conditions under which it should receive a reward. The system then learns to maximize the reward by exploring different actions and adjusting its behavior based on the feedback it receives.


In [8]:
# save LoRA adapters
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.json')