In [1]:
%%capture
# Install the notebook's dependencies; %%capture hides the pip output of this cell.
# NOTE(review): no versions are pinned, so a rerun may resolve different releases.
!pip install unsloth optuna
# Replace the released Unsloth with the nightly branch and the current unsloth-zoo,
# both installed straight from GitHub. --no-deps skips dependency installation, so
# the packages pulled in by the line above are left as they are.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git@nightly git+https://github.com/unslothai/unsloth-zoo.git

* We support Llama, Mistral, Phi-3, Gemma, Yi, DeepSeek, Qwen, TinyLlama, Vicuna, Open Hermes, etc.
* We support 16-bit LoRA or 4-bit QLoRA. Both are 2x faster.
* `max_seq_length` can be set to anything, since we do automatic RoPE Scaling via [kaiokendev's](https://kaiokendev.github.io/til) method.
* [**NEW**] We make Gemma-2 9b / 27b **2x faster**! See our [Gemma-2 9b notebook](https://colab.research.google.com/drive/1vIrqH5uYDQwsJ4-OO3DErvuv4pBgVwk4?usp=sharing)
* [**NEW**] To finetune and auto export to Ollama, try our [Ollama notebook](https://colab.research.google.com/drive/1WZDi7APtQ9VsvOrQSSC5DDtxq159j8iZ?usp=sharing)

In [2]:
from unsloth import FastLanguageModel
import torch
# --- Model loading configuration (read by `from_pretrained` below) ---
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# Reference list only: in the cells visible here nothing reads `fourbit_models`;
# the model that is actually loaded is the `model_name` passed to `from_pretrained`.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 2x faster
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # 4bit for 405b!
    "unsloth/Mistral-Small-Instruct-2409",     # Mistral 22b 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!

    "unsloth/Llama-3.2-1B-bnb-4bit",           # NEW! Llama 3.2 models
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",

    "unsloth/Llama-3.3-70B-Instruct-bnb-4bit" # NEW! Llama 3.3 70B!
] # More models at https://huggingface.co/unsloth

# Load the base model together with its tokenizer; the tokenizer is what the data
# formatting functions use to render chat text.
# NOTE(review): `run_training_trial` loads its own model for every trial, so this
# module-level `base_model` presumably stays resident alongside the per-trial copy --
# confirm whether it should be freed before the search starts.
base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit", # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [3]:
from google.colab import drive
import os

# Mount Google Drive at /content/drive; the per-trial output directories are
# created underneath `checkpoint_dir`.
drive.mount('/content/drive')
# NOTE(review): the folder name says "3b", but the model loaded in this notebook is
# Llama-3.2-1B. The name is only a path, so confirm before renaming it.
checkpoint_dir = '/content/drive/MyDrive/llama-3b-lora-checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Data Preparation

In [4]:
from unsloth.chat_templates import get_chat_template

# Rebind `tokenizer` to the version returned by `get_chat_template` for the
# "llama-3" template; the formatting functions call `apply_chat_template` on it.
# NOTE(review): the mapping names ShareGPT fields ("from"/"value", "human"/"gpt"),
# while the messages rendered in this notebook use "role"/"content" keys -- confirm
# the mapping is still required.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3",
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
    map_eos_token = True,        # Maps <|im_end|> to <|eot_id|> instead
)

def general_formatting_prompts_func(examples):
    """Render every conversation in a batch to one chat-formatted string.

    Args:
        examples: batch mapping whose "conversations" entry holds one message
            list per example.

    Returns:
        A dict with a single key, "text", listing the rendered conversations
        in input order.
    """
    rendered = []
    for conversation in examples["conversations"]:
        # Plain text is wanted here, and no trailing generation prompt.
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize = False,
                add_generation_prompt = False,
            )
        )
    return {"text": rendered}

def tool_formatting_prompts_func(examples):
    """Turn a batch of tool-calling records into chat-formatted training text.

    Each record becomes a three-message conversation: a system message that
    embeds the tool definitions, the user query, and the assistant answer.

    Args:
        examples: batch mapping with parallel "query", "tools" and "answers"
            sequences.

    Returns:
        A dict with a single key, "text", holding one rendered conversation
        per record, in input order.
    """
    def _as_conversation(query, tools, answers):
        # Message order is system, then user, then assistant.
        return [
            {
                "content": f"You are a helpful assistant with access to the following tools or function calls. Your task is to produce a sequence of tools or function calls necessary to generate response to the user utterance. Use the following tools or function calls as required:\n{tools}",
                "role": "system",
            },
            {"content": f"{query}", "role": "user"},
            {"content": f"{answers}", "role": "assistant"},
        ]

    # The batch arrives column-wise; zip walks it record by record.
    records = zip(examples['query'], examples['tools'], examples['answers'])
    return {
        "text": [
            tokenizer.apply_chat_template(
                _as_conversation(*record),
                tokenize=False,
                add_generation_prompt=False,
            )
            for record in records
        ]
    }

In [5]:
from unsloth.chat_templates import standardize_sharegpt
from datasets import load_dataset, concatenate_datasets
from huggingface_hub import login
from google.colab import userdata

# Authenticate with the Hugging Face Hub using the token read via `google.colab.userdata`.
# NOTE(review): confirm `userdata.get` really returns None for a missing secret rather
# than raising; if it raises, the guard below is never reached.
hf_token = userdata.get('HF_TOKEN')
if hf_token is None:
    raise EnvironmentError("HF_TOKEN is not set in the environment variables.")
login(hf_token)

# General chat data: the first 50,000 FineTome rows, standardised, then rendered
# into a "text" column.
general_dataset = standardize_sharegpt(
    load_dataset("mlabonne/FineTome-100k", split = "train").select(range(50000))
).map(general_formatting_prompts_func, batched = True)
print(f"Using a sample size of {len(general_dataset)} for general fine-tuning.")

# Tool-calling data: the first 25,000 xLAM rows rendered into the same "text" column.
tool_dataset = (
    load_dataset("Salesforce/xlam-function-calling-60k", split="train", token=hf_token)
    .select(range(25000))
    .map(tool_formatting_prompts_func, batched = True)
)
print(f"Using a sample size of {len(tool_dataset)} for tool fine-tuning.")

# General rows come first, tool rows last, in the combined training set.
dataset = concatenate_datasets([general_dataset, tool_dataset])
print(f"Using a total dataset of {len(dataset)} for fine-tuning.")

Using a sample size of 50000 for general fine-tuning.
Using a sample size of 25000 for tool fine-tuning.
Using a total dataset of 75000 for fine-tuning.


In [6]:
# Spot-check a row from the general-chat portion (index 5 lies in the first 50,000 rows).
dataset[5]

{'conversations': [{'content': 'How do astronomers determine the original wavelength of light emitted by a celestial body at rest, which is necessary for measuring its speed using the Doppler effect?',
   'role': 'user'},
  {'content': 'Astronomers make use of the unique spectral fingerprints of elements found in stars. These elements emit and absorb light at specific, known wavelengths, forming an absorption spectrum. By analyzing the light received from distant stars and comparing it to the laboratory-measured spectra of these elements, astronomers can identify the shifts in these wavelengths due to the Doppler effect. The observed shift tells them the extent to which the light has been redshifted or blueshifted, thereby allowing them to calculate the speed of the star along the line of sight relative to Earth.',
   'role': 'assistant'}],
 'source': 'WebInstructSub_axolotl',
 'score': 5.025244235992432,
 'text': '<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHow do ast

In [7]:
# Spot-check a row from the tool-calling portion, which sits at the end of the dataset.
dataset[-5]

{'conversations': None,
 'source': None,
 'score': None,
 'text': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant with access to the following tools or function calls. Your task is to produce a sequence of tools or function calls necessary to generate response to the user utterance. Use the following tools or function calls as required:\n[{"name": "email_extractor", "description": "Extracts emails from the provided URL using the RapidAPI email scraper service.", "parameters": {"url": {"description": "The URL from which to extract emails.", "type": "str", "default": "https://en.wikipedia.org/wiki/Email"}}}]<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCan you help me find emails from a website about sustainable fashion? Also, I need emails from a tech blog that talks about AI and machine learning. And could you also extract emails from a local bakery\'s website?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n[{"name": "email

## Hyperparameter Search

### Coarse-grained Search

In [9]:
import os
import random
import gc
import torch

from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import FastLanguageModel, is_bfloat16_supported
from unsloth.chat_templates import train_on_responses_only

# =========================================================
# 1. SETUP PARAMS AND DATA
# =========================================================

# General Training Parameters
max_seq_length = 2048
load_in_4bit = True
# Replaces the earlier `dtype = None` (auto-detect) with an explicit choice:
# bfloat16 when the GPU supports it, float16 otherwise.
dtype = torch.bfloat16 if is_bfloat16_supported() else torch.float16

# Fixed Seed for overall training reproducibility
# BASE_SEED seeds the train/test split, the LoRA initialisation and the Trainer;
# trial i samples its hyperparameters with seed BASE_SEED + i.
BASE_SEED = 3407
# Search budget: number of configurations tried, and optimizer steps per trial.
N_TRIALS = 16
FIXED_STEPS = 100

# --- Dataset Loading ---
# Hold out 1% of the combined dataset for evaluation. The split is seeded, so every
# trial trains and evaluates on the same partition.
dataset_split = dataset.train_test_split(test_size=0.01, seed=BASE_SEED)
train_dataset = dataset_split["train"]
eval_dataset = dataset_split["test"]

# =========================================================
# 2. DETERMINISTIC HYPERPARAMETER SAMPLING
# =========================================================

def generate_deterministic_hparams(seed: int):
    """
    Draw one hyperparameter configuration from a private RNG seeded with `seed`.

    The same seed always produces the same configuration. The draw order is
    fixed: LoRA rank, alpha ratio, gradient-accumulation steps, learning rate.

    Returns a dict with the keys "r", "alpha_ratio", "lora_alpha"
    (r * alpha_ratio), "learning_rate" (log-uniform between 2e-5 and 2e-4)
    and "gradient_accumulation_steps".
    """
    sampler = random.Random(seed)

    # Categorical draws -- the order of these calls determines the outcome.
    rank = sampler.choice([8, 16, 32])
    ratio = sampler.choice([1, 2])
    accum_steps = sampler.choice([2, 4, 8])

    # Learning rate: uniform in log space, mapped back with exp.
    lower, upper = (torch.log(torch.tensor(bound)).item() for bound in (2e-5, 2e-4))
    lr = torch.exp(torch.tensor(sampler.uniform(lower, upper))).item()

    return dict(
        r=rank,
        alpha_ratio=ratio,
        lora_alpha=rank * ratio,
        learning_rate=lr,
        gradient_accumulation_steps=accum_steps,
    )


# =========================================================
# 3. TRAINING FUNCTION (Fixed 100-step budget)
# =========================================================

def run_training_trial(hparams, trial_number):
    """Train one LoRA configuration for FIXED_STEPS optimizer steps and score it.

    Args:
        hparams: dict providing "r", "lora_alpha", "learning_rate" and
            "gradient_accumulation_steps".
        trial_number: label used in the log lines and in the name of the
            per-trial output directory under `checkpoint_dir`.

    Returns:
        The "eval_loss" value reported by `trainer.evaluate()` after training.

    Reads the module-level `train_dataset`, `eval_dataset`, `checkpoint_dir`,
    `max_seq_length`, `dtype`, `load_in_4bit`, `BASE_SEED` and `FIXED_STEPS`.
    """
    # Release whatever an earlier trial left behind before loading a new model.
    gc.collect()
    torch.cuda.empty_cache()

    rank, alpha, lr, accum_steps = (
        hparams[field]
        for field in ("r", "lora_alpha", "learning_rate", "gradient_accumulation_steps")
    )

    print(f"\n--- Starting Trial {trial_number} ---")
    print(f"Params: LR={lr:.2e}, r={rank}, alpha={alpha}, accum={accum_steps}")

    # Every trial starts from a freshly loaded base model and tokenizer.
    backbone, trial_tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )

    # Attach LoRA adapters to the attention and MLP projection layers.
    lora_targets = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ]
    peft_model = FastLanguageModel.get_peft_model(
        backbone,
        r=rank,
        target_modules=lora_targets,
        lora_alpha=alpha,
        lora_dropout=0,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=BASE_SEED,
    )

    out_dir = os.path.join(checkpoint_dir, f"trial_{trial_number}")
    os.makedirs(out_dir, exist_ok=True)

    # NOTE(review): the step budget is fixed while gradient accumulation varies, so
    # trials with a larger accumulation value consume more training examples in the
    # same number of steps -- keep that in mind when comparing eval losses.
    training_config = dict(
        output_dir=out_dir,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=accum_steps,
        warmup_ratio=0.1,
        max_steps=FIXED_STEPS,
        learning_rate=lr,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=5,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=BASE_SEED,
        report_to="none",
        save_strategy='no',  # no checkpoints are written during the search
    )
    training_args = TrainingArguments(**training_config)

    trainer = SFTTrainer(
        model=peft_model,
        tokenizer=trial_tokenizer,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        data_collator=DataCollatorForSeq2Seq(tokenizer=trial_tokenizer),
        dataset_num_proc=10,
        packing=False,
        args=training_args,
    )

    # The markers below delimit user turns and assistant turns in the rendered text.
    trainer = train_on_responses_only(
        trainer,
        instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
        response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
    )

    trainer.train()
    eval_loss = trainer.evaluate()["eval_loss"]

    # Drop the heavy objects explicitly so the memory can be reclaimed right away.
    del peft_model, backbone, trainer, trial_tokenizer
    gc.collect()
    torch.cuda.empty_cache()

    return eval_loss


# =========================================================
# 4. DETERMINISTIC EXECUTION LOOP
# =========================================================

# One record per trial: {"trial", "loss", "hparams"}. Failed trials are kept with a
# NaN loss so the trial numbering stays complete.
results = []
best_loss = float('inf')
best_hparams = {}

print("=" * 60)
print(f"Starting Deterministic Fixed-Budget Search: {N_TRIALS} Trials at {FIXED_STEPS} Steps Each")
print("=" * 60)

for i in range(N_TRIALS):
    trial_number = i + 1
    # Reset per iteration: if sampling itself fails, the failure record below must not
    # reuse the previous trial's hyperparameters (or hit a NameError on the first trial).
    hparams = None

    try:
        # Generate parameters using a unique, deterministic seed
        hparams = generate_deterministic_hparams(BASE_SEED + i)

        # Run the fixed-budget training
        loss = run_training_trial(hparams, trial_number)

        # Store and track results
        results.append({"trial": trial_number, "loss": loss, "hparams": hparams})

        print(f"Loss for Trial {trial_number}: {loss}")

        # A NaN loss never compares as smaller, so it cannot become the best result.
        if loss < best_loss:
            best_loss = loss
            best_hparams = hparams
            print(f"🏆 NEW BEST LOSS FOUND: {best_loss:.4f} at Trial {trial_number}")

    except KeyboardInterrupt:
        # Nothing is written to disk here; completed trials remain in `results`.
        print("\nStopped manually. Saving current progress.")
        break
    except Exception as e:
        # Best-effort search: report the failure, record it, and move on.
        print(f"\nTrial {trial_number} failed with error: {e}")
        results.append({"trial": trial_number, "loss": float('nan'), "hparams": hparams})
        continue


# =========================================================
# 5. RESULTS SUMMARY
# =========================================================

# Closing report: a banner, then the winning configuration if any trial succeeded.
print("\n" + "=" * 60, "FINAL RESULTS SUMMARY (Deterministic Fixed-Budget)", "=" * 60, sep="\n")

if not best_hparams:
    print("\nNo successful trials completed.")
else:
    print(f"\nBest Eval Loss: {best_loss:.4f}")
    print("\nBEST HYPERPARAMETERS:")
    for key, value in best_hparams.items():
        # The learning rate is shown in scientific notation; everything else as-is.
        print(f"  - {key}: {value:.2e}" if key == "learning_rate" else f"  - {key}: {value}")

Starting Deterministic Fixed-Budget Search: 16 Trials at 100 Steps Each

--- Starting Trial 1 ---
Params: LR=4.28e-05, r=8, alpha=16, accum=2
==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 74,250 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 2 x 1) = 16
 "-____-"     Trainable parameters = 5,636,096 of 1,241,450,496 (0.45% trained)


Step,Training Loss
5,0.9509
10,0.9227
15,0.9014
20,0.9474
25,0.8632
30,0.8388
35,0.9469
40,1.077
45,0.8875
50,0.8868


Loss for Trial 1: 0.8473519682884216
🏆 NEW BEST LOSS FOUND: 0.8474 at Trial 1

--- Starting Trial 2 ---
Params: LR=3.40e-05, r=8, alpha=16, accum=4
==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 74,250 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 4 x 1) = 32
 "-____-"     Trainable parameters = 5,636,096 of 1,241,450,496 (0.45% trained)


Step,Training Loss
5,0.96
10,0.9366
15,0.9053
20,1.048
25,0.9187
30,0.9518
35,0.9616
40,0.9842
45,0.8416
50,0.865


Loss for Trial 2: 0.8476158976554871

--- Starting Trial 3 ---
Params: LR=2.24e-05, r=8, alpha=16, accum=2
==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 74,250 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 2 x 1) = 16
 "-____-"     Trainable parameters = 5,636,096 of 1,241,450,496 (0.45% trained)


Step,Training Loss
5,0.9514
10,0.9292
15,0.9118
20,0.9639
25,0.8814
30,0.8586
35,0.9707
40,1.0989
45,0.9147
50,0.9128


Loss for Trial 3: 0.870863676071167

--- Starting Trial 4 ---
Params: LR=6.19e-05, r=16, alpha=16, accum=4
==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 74,250 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 4 x 1) = 32
 "-____-"     Trainable parameters = 11,272,192 of 1,247,086,592 (0.90% trained)


Step,Training Loss
5,0.9595
10,0.9301
15,0.8933
20,1.0301
25,0.8969
30,0.9313
35,0.9404
40,0.964
45,0.8219
50,0.8459


Loss for Trial 4: 0.830998957157135
🏆 NEW BEST LOSS FOUND: 0.8310 at Trial 4

--- Starting Trial 5 ---
Params: LR=1.08e-04, r=8, alpha=16, accum=2
==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 74,250 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 2 x 1) = 16
 "-____-"     Trainable parameters = 5,636,096 of 1,241,450,496 (0.45% trained)


Step,Training Loss
5,0.9487
10,0.9087
15,0.8754
20,0.9143
25,0.836
30,0.8178
35,0.9161
40,1.0519
45,0.8572
50,0.8584


Loss for Trial 5: 0.8257739543914795
🏆 NEW BEST LOSS FOUND: 0.8258 at Trial 5

--- Starting Trial 6 ---
Params: LR=4.06e-05, r=16, alpha=16, accum=2
==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 74,250 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 2 x 1) = 16
 "-____-"     Trainable parameters = 11,272,192 of 1,247,086,592 (0.90% trained)


Step,Training Loss
5,0.9511
10,0.9242
15,0.9034
20,0.9504
25,0.8661
30,0.8414
35,0.9508
40,1.0799
45,0.8913
50,0.8897


Loss for Trial 6: 0.849261462688446

--- Starting Trial 7 ---
Params: LR=3.81e-05, r=16, alpha=16, accum=4
==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 74,250 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 4 x 1) = 32
 "-____-"     Trainable parameters = 11,272,192 of 1,247,086,592 (0.90% trained)


Step,Training Loss
5,0.96
10,0.9362
15,0.9043
20,1.0464
25,0.9161
30,0.9492
35,0.9586
40,0.9809
45,0.8383
50,0.8614


Loss for Trial 7: 0.8444796800613403

--- Starting Trial 8 ---
Params: LR=4.58e-05, r=16, alpha=32, accum=2
==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 74,250 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 2 x 1) = 16
 "-____-"     Trainable parameters = 11,272,192 of 1,247,086,592 (0.90% trained)


Step,Training Loss
5,0.9495
10,0.9135
15,0.8866
20,0.9272
25,0.8478
30,0.8252
35,0.928
40,1.0621
45,0.8691
50,0.8703


Loss for Trial 8: 0.8348084092140198

--- Starting Trial 9 ---
Params: LR=1.05e-04, r=16, alpha=32, accum=8
==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 74,250 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 11,272,192 of 1,247,086,592 (0.90% trained)


Step,Training Loss
5,0.9491
10,0.9692
15,0.9165
20,0.9464
25,0.8356
30,0.8976
35,0.8153
40,0.878
45,0.8582
50,0.8173


Loss for Trial 9: 0.8058328628540039
🏆 NEW BEST LOSS FOUND: 0.8058 at Trial 9

--- Starting Trial 10 ---
Params: LR=1.22e-04, r=32, alpha=32, accum=8
==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 74,250 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 22,544,384 of 1,258,358,784 (1.79% trained)


Step,Training Loss
5,0.9484
10,0.9653
15,0.9117
20,0.9437
25,0.8317
30,0.8943
35,0.8129
40,0.8753
45,0.8555
50,0.8147


Loss for Trial 10: 0.8030117750167847
🏆 NEW BEST LOSS FOUND: 0.8030 at Trial 10

--- Starting Trial 11 ---
Params: LR=1.46e-04, r=16, alpha=32, accum=8
==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 74,250 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 11,272,192 of 1,247,086,592 (0.90% trained)


Step,Training Loss
5,0.9471
10,0.9604
15,0.9061
20,0.9405
25,0.8278
30,0.8905
35,0.8098
40,0.8718
45,0.8519
50,0.8119


Loss for Trial 11: 0.7997848987579346
🏆 NEW BEST LOSS FOUND: 0.7998 at Trial 11

--- Starting Trial 12 ---
Params: LR=2.66e-05, r=8, alpha=8, accum=2
==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 74,250 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 2 x 1) = 16
 "-____-"     Trainable parameters = 5,636,096 of 1,241,450,496 (0.45% trained)


Step,Training Loss
5,0.9516
10,0.9321
15,0.9177
20,0.9712
25,0.889
30,0.8681
35,0.981
40,1.1086
45,0.926
50,0.9233


Loss for Trial 12: 0.8811953067779541

--- Starting Trial 13 ---
Params: LR=2.88e-05, r=8, alpha=16, accum=4
==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 74,250 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 4 x 1) = 32
 "-____-"     Trainable parameters = 5,636,096 of 1,241,450,496 (0.45% trained)


Step,Training Loss
5,0.9601
10,0.9383
15,0.9082
20,1.0522
25,0.9247
30,0.9574
35,0.9675
40,0.991
45,0.8479
50,0.8717


Loss for Trial 13: 0.853173017501831

--- Starting Trial 14 ---
Params: LR=3.18e-05, r=16, alpha=32, accum=8
==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 74,250 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 11,272,192 of 1,247,086,592 (0.90% trained)


Step,Training Loss
5,0.9532
10,0.9915
15,0.9538
20,0.9833
25,0.8708
30,0.935
35,0.8433
40,0.9075
45,0.889
50,0.8456


Loss for Trial 14: 0.8320459723472595

--- Starting Trial 15 ---
Params: LR=1.67e-04, r=16, alpha=32, accum=4
==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 74,250 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 4 x 1) = 32
 "-____-"     Trainable parameters = 11,272,192 of 1,247,086,592 (0.90% trained)


Step,Training Loss
5,0.9528
10,0.8969
15,0.8553
20,0.9881
25,0.853
30,0.8961
35,0.9092
40,0.9355
45,0.7911
50,0.817


Loss for Trial 15: 0.8051513433456421

--- Starting Trial 16 ---
Params: LR=3.54e-05, r=16, alpha=16, accum=2
==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 74,250 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 2 x 1) = 16
 "-____-"     Trainable parameters = 11,272,192 of 1,247,086,592 (0.90% trained)


Step,Training Loss
5,0.9512
10,0.9257
15,0.9058
20,0.9544
25,0.8703
30,0.8453
35,0.9561
40,1.0848
45,0.8975
50,0.8955


Loss for Trial 16: 0.8537077903747559

FINAL RESULTS SUMMARY (Deterministic Fixed-Budget)

Best Eval Loss: 0.7998

BEST HYPERPARAMETERS:
  - r: 16
  - alpha_ratio: 2
  - lora_alpha: 32
  - learning_rate: 1.46e-04
  - gradient_accumulation_steps: 8


In [11]:
# Pretty-print the trial records accumulated in `results`
# (one dict per trial with its `trial` number, `loss` and sampled `hparams`).
from pprint import pprint
pprint(results)

[{'hparams': {'alpha_ratio': 2,
              'gradient_accumulation_steps': 2,
              'learning_rate': 4.284925671527162e-05,
              'lora_alpha': 16,
              'r': 8},
  'loss': 0.8473519682884216,
  'trial': 1},
 {'hparams': {'alpha_ratio': 2,
              'gradient_accumulation_steps': 4,
              'learning_rate': 3.403441223781556e-05,
              'lora_alpha': 16,
              'r': 8},
  'loss': 0.8476158976554871,
  'trial': 2},
 {'hparams': {'alpha_ratio': 2,
              'gradient_accumulation_steps': 2,
              'learning_rate': 2.240711728518363e-05,
              'lora_alpha': 16,
              'r': 8},
  'loss': 0.870863676071167,
  'trial': 3},
 {'hparams': {'alpha_ratio': 1,
              'gradient_accumulation_steps': 4,
              'learning_rate': 6.185057281982154e-05,
              'lora_alpha': 16,
              'r': 16},
  'loss': 0.830998957157135,
  'trial': 4},
 {'hparams': {'alpha_ratio': 2,
              'gradient_accumulat

### Fine-grained Search

In [8]:
import os
import random
import gc
import torch

from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import FastLanguageModel, is_bfloat16_supported
from unsloth.chat_templates import train_on_responses_only

# =========================================================
# 1. SETUP PARAMS AND DATA
# =========================================================

# Model-loading settings shared by every trial
max_seq_length = 2048
load_in_4bit = True
# Prefer bfloat16 when the GPU supports it, otherwise fall back to float16
if is_bfloat16_supported():
    dtype = torch.bfloat16
else:
    dtype = torch.float16

# BASE_SEED feeds the data split, the LoRA init and the trainer;
# FIXED_STEPS is the training budget given to every trial
BASE_SEED = 3407
FIXED_STEPS = 300

# Hold out 1% of `dataset` as the evaluation set (split is seeded, hence repeatable)
dataset_split = dataset.train_test_split(test_size=0.01, seed=BASE_SEED)
train_dataset, eval_dataset = dataset_split["train"], dataset_split["test"]

# =========================================================
# 2. DETERMINISTIC HYPERPARAMETER SAMPLING
# =========================================================

def generate_deterministic_hparams(seed: int):
    """
    Draw one hyperparameter configuration from a private, seeded RNG.

    The same ``seed`` always yields the same configuration, independent of any
    global random state. The order of the draws below (rank, alpha ratio,
    gradient accumulation, learning rate) is part of that contract: changing
    it would change what a given seed maps to.

    Args:
        seed: Seed for the per-trial ``random.Random`` instance.

    Returns:
        dict with keys ``r``, ``alpha_ratio``, ``lora_alpha`` (= r * alpha_ratio),
        ``learning_rate`` (log-uniform between 2e-5 and 2e-4) and
        ``gradient_accumulation_steps``.
    """
    sampler = random.Random(seed)

    # Discrete choices
    rank = sampler.choice([8, 16, 32])
    ratio = sampler.choice([1, 2])
    accum_steps = sampler.choice([2, 4, 8])

    # Learning rate: sample uniformly in log space, then map back.
    # torch is used for log/exp, so the bounds and the result go through float32.
    lower_bound = torch.log(torch.tensor(2e-5)).item()
    upper_bound = torch.log(torch.tensor(2e-4)).item()
    sampled_log_lr = sampler.uniform(lower_bound, upper_bound)

    return {
        "r": rank,
        "alpha_ratio": ratio,
        "lora_alpha": rank * ratio,
        "learning_rate": torch.exp(torch.tensor(sampled_log_lr)).item(),
        "gradient_accumulation_steps": accum_steps,
    }


# =========================================================
# 3. TRAINING FUNCTION (Fixed 300-step budget)
# =========================================================

def run_training_trial(hparams, trial_number):
    """Train one LoRA configuration for ``FIXED_STEPS`` steps and return its eval loss.

    Args:
        hparams: Mapping providing ``"r"``, ``"lora_alpha"``, ``"learning_rate"``
            and ``"gradient_accumulation_steps"``.
        trial_number: Identifier used in the log lines and in the per-trial
            output directory ``<checkpoint_dir>/trial_<trial_number>``.

    Returns:
        The ``"eval_loss"`` value reported by ``trainer.evaluate()`` after training.
    """
    def _release_memory():
        # Drop unreachable Python objects, then hand cached CUDA blocks back.
        gc.collect()
        torch.cuda.empty_cache()

    _release_memory()

    r_value, lora_alpha = hparams["r"], hparams["lora_alpha"]
    learning_rate = hparams["learning_rate"]
    grad_accum = hparams["gradient_accumulation_steps"]

    print(f"\n--- Starting Trial {trial_number} ---")
    print(f"Params: LR={learning_rate:.2e}, r={r_value}, alpha={lora_alpha}, accum={grad_accum}")

    # A fresh base model is loaded for every trial so no adapter state carries over.
    base_model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )

    # LoRA adapters on all attention and MLP projections.
    lora_targets = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
    model = FastLanguageModel.get_peft_model(
        base_model,
        r=r_value,
        target_modules=lora_targets,
        lora_alpha=lora_alpha,
        lora_dropout=0,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=BASE_SEED,
    )

    # Each trial gets its own output folder; no checkpoints are written (save_strategy='no').
    trial_output_dir = os.path.join(checkpoint_dir, f"trial_{trial_number}")
    os.makedirs(trial_output_dir, exist_ok=True)

    # Mixed-precision mode follows hardware support: bf16 if available, else fp16.
    use_bf16 = is_bfloat16_supported()
    args = TrainingArguments(
        output_dir=trial_output_dir,
        max_steps=FIXED_STEPS,
        seed=BASE_SEED,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=grad_accum,
        learning_rate=learning_rate,
        lr_scheduler_type="linear",
        warmup_ratio=0.1,
        weight_decay=0.01,
        optim="adamw_8bit",
        fp16=not use_bf16,
        bf16=use_bf16,
        logging_steps=5,
        report_to="none",
        save_strategy='no',
    )

    # Build the SFT trainer and wrap it so only assistant turns contribute to the loss.
    trainer = train_on_responses_only(
        SFTTrainer(
            model=model,
            tokenizer=tokenizer,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            dataset_text_field="text",
            max_seq_length=max_seq_length,
            data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
            dataset_num_proc=10,
            packing=False,
            args=args,
        ),
        instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
        response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
    )

    trainer.train()

    eval_metrics = trainer.evaluate()
    final_eval_loss = eval_metrics["eval_loss"]

    # Release the model before the next trial loads its own copy.
    del model, base_model, trainer, tokenizer
    _release_memory()

    return final_eval_loss


# =========================================================
# 4. DETERMINISTIC EXECUTION LOOP
# =========================================================

# Per-trial records ({"trial", "loss", "hparams"}) plus the running best.
results_fine = []
best_loss = float('inf')
best_hparams = {}

# Trial numbers to re-run at the FIXED_STEPS budget. Each number maps to the
# sampling seed BASE_SEED + (trial_number - 1), so a trial number always
# reproduces the same hyperparameters.
trial_numbers = [9, 10, 11, 15]

print("=" * 60)
print(f"Starting Deterministic Fixed-Budget Search: {len(trial_numbers)} Trials at {FIXED_STEPS} Steps Each")
print("=" * 60)

for trial_number in trial_numbers:
    # Reset per iteration: the failure handler below must never log the
    # previous trial's hyperparameters, nor hit a NameError on the first trial.
    hparams = None

    try:
        # Generate parameters using a unique, deterministic seed
        hparams = generate_deterministic_hparams(BASE_SEED + (trial_number - 1))
        print()

        # Run the fixed-budget training
        loss = run_training_trial(hparams, trial_number)

        # Store and track results
        results_fine.append({"trial": trial_number, "loss": loss, "hparams": hparams})

        print(f"Loss for Trial {trial_number}: {loss}")

        # A NaN loss compares False here, so it can never become the best.
        if loss < best_loss:
            best_loss = loss
            best_hparams = hparams
            print(f"🏆 NEW BEST LOSS FOUND: {best_loss:.4f} at Trial {trial_number}")

    except KeyboardInterrupt:
        # Manual stop: keep whatever has been collected so far.
        print("\nStopped manually. Saving current progress.")
        break
    except Exception as e:
        # Best-effort search: record the failure as NaN and move on.
        print(f"\nTrial {trial_number} failed with error: {e}")
        results_fine.append({"trial": trial_number, "loss": float('nan'), "hparams": hparams})


# =========================================================
# 5. RESULTS SUMMARY
# =========================================================

# Report the best configuration found by the search loop (if any trial succeeded).
_rule = "=" * 60
print("\n" + _rule)
print("FINAL RESULTS SUMMARY (Deterministic Fixed-Budget)")
print(_rule)

if not best_hparams:
    # best_hparams stays empty when no trial ever improved on +inf.
    print("\nNo successful trials completed.")
else:
    print(f"\nBest Eval Loss: {best_loss:.4f}")
    print("\nBEST HYPERPARAMETERS:")
    for key, value in best_hparams.items():
        if key != "learning_rate":
            print(f"  - {key}: {value}")
            continue
        # Learning rates are shown in scientific notation.
        print(f"  - {key}: {value:.2e}")

Starting Deterministic Fixed-Budget Search: 4 Trials at 300 Steps Each


--- Starting Trial 9 ---
Params: LR=1.05e-04, r=16, alpha=32, accum=8
==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.11.4 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/74250 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/750 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/74250 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/750 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 74,250 | Num Epochs = 1 | Total steps = 300
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 11,272,192 of 1,247,086,592 (0.90% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
5,0.953
10,0.9901
15,0.9505
20,0.9732
25,0.8549
30,0.9132
35,0.8227
40,0.8838
45,0.8618
50,0.8193


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


Loss for Trial 9: 0.777212381362915
üèÜ NEW BEST LOSS FOUND: 0.7772 at Trial 9


--- Starting Trial 10 ---
Params: LR=1.22e-04, r=32, alpha=32, accum=8
==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 74,250 | Num Epochs = 1 | Total steps = 300
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 22,544,384 of 1,258,358,784 (1.79% trained)


Step,Training Loss
5,0.9527
10,0.9881
15,0.946
20,0.9681
25,0.8504
30,0.9094
35,0.82
40,0.8808
45,0.8588
50,0.8164


Loss for Trial 10: 0.7743467688560486
üèÜ NEW BEST LOSS FOUND: 0.7743 at Trial 10


--- Starting Trial 11 ---
Params: LR=1.46e-04, r=16, alpha=32, accum=8
==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 74,250 | Num Epochs = 1 | Total steps = 300
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 11,272,192 of 1,247,086,592 (0.90% trained)


Step,Training Loss
5,0.9522
10,0.985
15,0.9405
20,0.9621
25,0.8457
30,0.9045
35,0.8172
40,0.877
45,0.8552
50,0.8136


Loss for Trial 11: 0.7720085978507996
üèÜ NEW BEST LOSS FOUND: 0.7720 at Trial 11


--- Starting Trial 15 ---
Params: LR=1.67e-04, r=16, alpha=32, accum=4
==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 74,250 | Num Epochs = 1 | Total steps = 300
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 4 x 1) = 32
 "-____-"     Trainable parameters = 11,272,192 of 1,247,086,592 (0.90% trained)


Step,Training Loss
5,0.9582
10,0.9221
15,0.8802
20,1.0099
25,0.8705
30,0.9069
35,0.9164
40,0.9393
45,0.7942
50,0.8185


Loss for Trial 15: 0.7790784239768982

FINAL RESULTS SUMMARY (Deterministic Fixed-Budget)

Best Eval Loss: 0.7720

BEST HYPERPARAMETERS:
  - r: 16
  - alpha_ratio: 2
  - lora_alpha: 32
  - learning_rate: 1.46e-04
  - gradient_accumulation_steps: 8


In [10]:
# Pretty-print the records collected by the fine-grained search loop
# (one dict per trial with its `trial` number, `loss` and `hparams`).
from pprint import pprint
pprint(results_fine)

[{'hparams': {'alpha_ratio': 2,
              'gradient_accumulation_steps': 8,
              'learning_rate': 0.0001049246930051595,
              'lora_alpha': 32,
              'r': 16},
  'loss': 0.777212381362915,
  'trial': 9},
 {'hparams': {'alpha_ratio': 1,
              'gradient_accumulation_steps': 8,
              'learning_rate': 0.00012154182331869379,
              'lora_alpha': 32,
              'r': 32},
  'loss': 0.7743467688560486,
  'trial': 10},
 {'hparams': {'alpha_ratio': 2,
              'gradient_accumulation_steps': 8,
              'learning_rate': 0.00014625844778493047,
              'lora_alpha': 32,
              'r': 16},
  'loss': 0.7720085978507996,
  'trial': 11},
 {'hparams': {'alpha_ratio': 2,
              'gradient_accumulation_steps': 4,
              'learning_rate': 0.00016686950402799994,
              'lora_alpha': 32,
              'r': 16},
  'loss': 0.7790784239768982,
  'trial': 15}]


## Training

In [None]:
import os
import random
import gc
import torch

from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import FastLanguageModel, is_bfloat16_supported
from unsloth.chat_templates import train_on_responses_only

# =========================================================
# 1. SETUP PARAMS AND DATA
# =========================================================

# Model-loading settings for the final training run
max_seq_length = 2048
load_in_4bit = True
# Prefer bfloat16 when the GPU supports it, otherwise fall back to float16
if is_bfloat16_supported():
    dtype = torch.bfloat16
else:
    dtype = torch.float16

# Seed used for the data split, the LoRA init and the trainer
BASE_SEED = 3407

# Hold out 1% of `dataset` as the evaluation set (split is seeded, hence repeatable)
dataset_split = dataset.train_test_split(test_size=0.01, seed=BASE_SEED)
train_dataset, eval_dataset = dataset_split["train"], dataset_split["test"]


# =========================================================
# 2. DEFINE MODEL
# =========================================================
# Free whatever earlier cells left on the GPU before loading the final model.
gc.collect()
torch.cuda.empty_cache()

# Hyperparameters of the best fine-grained trial (learning rate rounded to 3 digits).
r_value, lora_alpha = 16, 32
learning_rate = 1.46e-04
grad_accum = 8

# Load the 4-bit base model and its tokenizer.
base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Attach LoRA adapters to all attention and MLP projections.
lora_targets = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
model = FastLanguageModel.get_peft_model(
    base_model,
    r=r_value,
    target_modules=lora_targets,
    lora_alpha=lora_alpha,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=BASE_SEED,
)

# One full epoch; a checkpoint every 100 steps, keeping only the 3 most recent.
use_bf16 = is_bfloat16_supported()
training_args = TrainingArguments(
    output_dir=checkpoint_dir,
    num_train_epochs = 1,
    seed=BASE_SEED,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=grad_accum,
    gradient_checkpointing=True,
    learning_rate=learning_rate,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    weight_decay=0.01,
    optim="adamw_8bit",
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=50,
    save_strategy='steps',
    save_steps=100,
    save_total_limit=3,
    report_to = "none",
)

# Build the SFT trainer and wrap it so only assistant turns contribute to the loss.
trainer = train_on_responses_only(
    SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
        dataset_num_proc=10,
        packing=False,
        args=training_args,
    ),
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

# Resume from the newest `checkpoint-<step>` directory under `checkpoint_dir`, if any.
# Only directories named exactly `checkpoint-<digits>` are considered, so a stray
# entry such as `checkpoints.zip` or `checkpoint-tmp` cannot crash the integer sort,
# and a missing `checkpoint_dir` simply means "start fresh" instead of raising.
_CKPT_PREFIX = 'checkpoint-'
if os.path.isdir(checkpoint_dir):
    checkpoints = [
        f for f in os.listdir(checkpoint_dir)
        if f.startswith(_CKPT_PREFIX)
        and f[len(_CKPT_PREFIX):].isdecimal()
        and os.path.isdir(os.path.join(checkpoint_dir, f))
    ]
else:
    checkpoints = []
# Sort numerically by step so that e.g. checkpoint-1000 ranks after checkpoint-900.
checkpoints.sort(key=lambda x: int(x[len(_CKPT_PREFIX):]))
if checkpoints:
    latest_checkpoint = os.path.join(checkpoint_dir, checkpoints[-1])
    print('Resuming from:', latest_checkpoint)
else:
    latest_checkpoint = None
    print('No checkpoint found. Starting fresh.')
# `resume_from_checkpoint=None` makes the trainer start from step 0.
trainer_stats = trainer.train(resume_from_checkpoint=latest_checkpoint)

==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.


No checkpoint found. Starting fresh.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 74,250 | Num Epochs = 1 | Total steps = 1,161
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 11,272,192 of 1,247,086,592 (0.90% trained)


Step,Training Loss
50,0.9255
100,0.8647


In [None]:
#@title Show current memory stats
# Baseline snapshot; the "final memory" cell subtracts `start_gpu_memory` from the peak.
gpu_stats = torch.cuda.get_device_properties(0)
# Bytes -> GiB, rounded to three decimals.
max_memory = round(gpu_stats.total_memory / 1024 ** 3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
#@title Show final memory and time stats
# Relies on `start_gpu_memory` / `max_memory` from the "current memory stats" cell
# and on `trainer_stats` returned by `trainer.train(...)`.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
# Memory attributable to training = peak reserved minus the baseline snapshot.
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

<a name="Inference"></a>
### Inference
Let's run the model! You can change the instruction and input - leave the output blank!

**[NEW] Try 2x faster inference in a free Colab for Llama-3.1 8b Instruct [here](https://colab.research.google.com/drive/1T-YBVfnphoVc8E2E854qF3jdia2Ll2W2?usp=sharing)**

We use `min_p = 0.1` and `temperature = 1.5`. Read this [Tweet](https://x.com/menhguin/status/1826132708508213629) for more information on why.

In [None]:
from unsloth.chat_templates import get_chat_template

# Attach the "llama-3.1" chat template to the tokenizer and switch the model to inference mode.
tokenizer = get_chat_template(tokenizer, chat_template = "llama-3.1")
FastLanguageModel.for_inference(model)

# Single-turn conversation to complete.
messages = [{"role": "user", "content": "Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,"}]

# Render the chat to token ids; the generation prompt opens the assistant turn.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True,
    return_tensors = "pt",
)
inputs = inputs.to("cuda")

generation_kwargs = dict(max_new_tokens = 64, use_cache = True, temperature = 1.5, min_p = 0.1)
outputs = model.generate(input_ids = inputs, **generation_kwargs)

# Last expression of the cell: the decoded sequences are shown as the cell output.
tokenizer.batch_decode(outputs)

You can also use a `TextStreamer` for continuous inference, so you can watch the generation token by token instead of waiting for the whole output!

In [None]:
from transformers import TextStreamer

# Make sure the model is in inference mode (harmless if the previous cell already did it).
FastLanguageModel.for_inference(model)

messages = [{"role": "user", "content": "Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,"}]

# Render the chat to token ids; the generation prompt opens the assistant turn.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True,
    return_tensors = "pt",
)
inputs = inputs.to("cuda")

# Stream tokens to stdout as they are produced, without echoing the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
streaming_kwargs = dict(max_new_tokens = 128, use_cache = True, temperature = 1.5, min_p = 0.1)
_ = model.generate(input_ids = inputs, streamer = text_streamer, **streaming_kwargs)

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [None]:
# Local save: adapters and tokenizer go to the same folder.
lora_dir = "lora_model"
model.save_pretrained(lora_dir)
tokenizer.save_pretrained(lora_dir)

# Online save (uncomment and fill in your repo name and token):
# model.push_to_hub("your_name/lora_model", token = "...")
# tokenizer.push_to_hub("your_name/lora_model", token = "...")

Now, if you want to load the LoRA adapters we just saved for inference, change `False` to `True`:

In [None]:
from transformers import TextStreamer

# Flip to True to reload the saved adapters instead of reusing the in-memory model.
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model",  # folder (or hub repo) the adapters were saved to
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model)

messages = [{"role": "user", "content": "Describe a tall tower in the capital of France."}]

# Render the chat to token ids; the generation prompt opens the assistant turn.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True,
    return_tensors = "pt",
)
inputs = inputs.to("cuda")

# Stream tokens to stdout as they are produced, without echoing the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
streaming_kwargs = dict(max_new_tokens = 128, use_cache = True, temperature = 1.5, min_p = 0.1)
_ = model.generate(input_ids = inputs, streamer = text_streamer, **streaming_kwargs)

You can also use Hugging Face's `AutoPeftModelForCausalLM`. Only use this if you do not have `unsloth` installed. It can be hopelessly slow, since `4bit` model downloading is not supported, and Unsloth's **inference is 2x faster**.

In [None]:
# Fallback for environments without Unsloth; flip to True to use it.
if False:
    from peft import AutoPeftModelForCausalLM
    from transformers import AutoTokenizer

    adapter_path = "lora_model"  # folder (or hub repo) the adapters were saved to
    model = AutoPeftModelForCausalLM.from_pretrained(
        adapter_path,
        load_in_4bit = load_in_4bit,
    )
    tokenizer = AutoTokenizer.from_pretrained(adapter_path)

### Saving to float16 for VLLM

We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.

In [None]:
# Every export below is disabled; flip the `if False` you need to `if True`.
# Replace "hf/model" with "<your_username>/<repo>" and pass a token for hub uploads.

# Merged float16 weights
if False:
    model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit")
if False:
    model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merged 4bit weights
if False:
    model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit")
if False:
    model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# LoRA adapters only
if False:
    model.save_pretrained_merged("model", tokenizer, save_method = "lora")
if False:
    model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")

### GGUF / llama.cpp Conversion
We now natively support saving to `GGUF` / `llama.cpp`! We clone `llama.cpp` and save to `q8_0` by default. All other quantization methods, such as `q4_k_m`, are also available. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.

Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
* `q8_0` - Fast conversion. High resource use, but generally acceptable.
* `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
* `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.

[**NEW**] To finetune and auto export to Ollama, try our [Ollama notebook](https://colab.research.google.com/drive/1WZDi7APtQ9VsvOrQSSC5DDtxq159j8iZ?usp=sharing)

In [None]:
# Every export below is disabled; flip the `if False` you need to `if True`.
# For hub uploads, replace "hf/model" with "<your_username>/<repo>" and supply a
# token from https://huggingface.co/settings/tokens

# Default quantization (8bit Q8_0)
if False:
    model.save_pretrained_gguf("model", tokenizer)
if False:
    model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# 16bit GGUF
if False:
    model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False:
    model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# q4_k_m GGUF
if False:
    model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False:
    model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")

# Several quantizations in one call - much faster than exporting them one by one
if False:
    gguf_methods = ["q4_k_m", "q8_0", "q5_k_m"]
    model.push_to_hub_gguf(
        "hf/model",
        tokenizer,
        quantization_method = gguf_methods,
        token = "",
    )

Now, use the `model-unsloth.gguf` file or `model-unsloth-Q4_K_M.gguf` file in `llama.cpp` or a UI based system like `GPT4All`. You can install GPT4All by going [here](https://gpt4all.io/index.html).

**[NEW] Try 2x faster inference in a free Colab for Llama-3.1 8b Instruct [here](https://colab.research.google.com/drive/1T-YBVfnphoVc8E2E854qF3jdia2Ll2W2?usp=sharing)**

And we're done! If you have any questions on Unsloth, we have a [Discord](https://discord.gg/u54VK8m8tk) channel! If you find any bugs or want to keep updated with the latest LLM stuff, or need help, join projects etc, feel free to join our Discord!

Some other links:
1. Zephyr DPO 2x faster [free Colab](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing)
2. Llama 7b 2x faster [free Colab](https://colab.research.google.com/drive/1lBzz5KeZJKXjvivbYvmGarix9Ao6Wxe5?usp=sharing)
3. TinyLlama 4x faster full Alpaca 52K in 1 hour [free Colab](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing)
4. CodeLlama 34b 2x faster [A100 on Colab](https://colab.research.google.com/drive/1y7A0AxE3y8gdj4AVkl2aZX47Xu3P1wJT?usp=sharing)
5. Mistral 7b [free Kaggle version](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook)
6. We also did a [blog](https://huggingface.co/blog/unsloth-trl) with 🤗 HuggingFace, and we're in the TRL [docs](https://huggingface.co/docs/trl/main/en/sft_trainer#accelerate-fine-tuning-2x-using-unsloth)!
7. `ChatML` for ShareGPT datasets, [conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing)
8. Text completions like novel writing [notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing)
9. [**NEW**] We make Phi-3 Medium / Mini **2x faster**! See our [Phi-3 Medium notebook](https://colab.research.google.com/drive/1hhdhBa1j_hsymiW9m-WzxQtgqTH_NHqi?usp=sharing)
10. [**NEW**] We make Gemma-2 9b / 27b **2x faster**! See our [Gemma-2 9b notebook](https://colab.research.google.com/drive/1vIrqH5uYDQwsJ4-OO3DErvuv4pBgVwk4?usp=sharing)
11. [**NEW**] To finetune and auto export to Ollama, try our [Ollama notebook](https://colab.research.google.com/drive/1WZDi7APtQ9VsvOrQSSC5DDtxq159j8iZ?usp=sharing)
12. [**NEW**] We make Mistral NeMo 12B 2x faster and fit in under 12GB of VRAM! [Mistral NeMo notebook](https://colab.research.google.com/drive/17d3U-CAIwzmbDRqbZ9NnpHxCkmXB6LZ0?usp=sharing)

<div class="align-center">
  <a href="https://github.com/unslothai/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/unsloth%20new%20logo.png" width="115"></a>
  <a href="https://discord.gg/u54VK8m8tk"><img src="https://github.com/unslothai/unsloth/raw/main/images/Discord.png" width="145"></a>
  <a href="https://ko-fi.com/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/Kofi button.png" width="145"></a> Support our work if you can! Thanks!
</div>