# Install Unsloth and Required Libraries

In [1]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

# Load Unsloth Model (Llama 3.2 3B Instruct)

In [2]:
from unsloth import FastLanguageModel
import torch

max_seq_length = 2048
dtype = None
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch SmolVLMForConditionalGeneration forward function.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.4.1: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


# Add LoRA adapters to the model

In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2025.4.1 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


# Prepare Dataset (HuggingFace Ultrachat 200k)

In [6]:
from datasets import load_dataset

# load dataset
dataset = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")

def formatting_prompts_func(examples):
    all_texts = []
    for convo in examples["messages"]:
        conversation = []
        for turn in convo:
            conversation.append({"role": turn["role"], "content": turn["content"]})
        text = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=False)
        all_texts.append(text)
    return {"text": all_texts}

dataset = dataset.map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/207865 [00:00<?, ? examples/s]

# Finetune the Model Using SFTTrainer

In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer),
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/207865 [00:00<?, ? examples/s]

# Mask User Inputs During Training

In [8]:
from unsloth.chat_templates import train_on_responses_only

trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

Map (num_proc=12):   0%|          | 0/207865 [00:00<?, ? examples/s]

# Start Training

In [9]:
trainer_stats = trainer.train()

print(f"Training time (seconds): {trainer_stats.metrics['train_runtime']}")
print(f"Training time (minutes): {round(trainer_stats.metrics['train_runtime']/60, 2)}")

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 207,865 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 24,313,856/3,000,000,000 (0.81% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,1.354
2,1.2229
3,1.1721
4,1.1354
5,1.2277
6,1.3379
7,1.1328
8,1.0669
9,1.087
10,0.9911


Training time (seconds): 137.9464
Training time (minutes): 2.3


# Inference — Test the Finetuned Model!

In [10]:
FastLanguageModel.for_inference(model)

messages = [
    {"role": "user", "content": "Can you tell me a fun fact about dolphins?"},
]

inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")

outputs = model.generate(
    input_ids=inputs,
    max_new_tokens=100,
    use_cache=True,
    temperature=1.5,
    min_p=0.1,
)

print(tokenizer.batch_decode(outputs))

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCan you tell me a fun fact about dolphins?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nSure, here is a fun fact about dolphins: Dolphins have their own signature whistles that they use to identify themselves and communicate with other dolphins. They can use these whistles for all sorts of purposes, like saying hello or goodbye, or even just to give their friends a shout-out to say "Hey, what\'s up?!" Isn\'t that cool?<|eot_id|>']


# Saving the Model

In [11]:
model.save_pretrained("lora_finetuned_model")
tokenizer.save_pretrained("lora_finetuned_model")

('lora_finetuned_model/tokenizer_config.json',
 'lora_finetuned_model/special_tokens_map.json',
 'lora_finetuned_model/tokenizer.json')