# 🚀 OpenSloth Demo Training Notebook

This notebook demonstrates how to fine-tune large language models using opensloth's multi-GPU capabilities. It's equivalent to running:

```bash
opensloth-train examples/example_sharegpt_lora_2gpus.py
```

## What This Demo Does

- **Multi-GPU Training**: Uses 2 GPUs with NCCL synchronization
- **Adaptive Batching**: Optimizes sequence sorting and padding
- **LoRA Fine-tuning**: Efficient parameter updates with Low-Rank Adaptation
- **Response-only Loss**: Calculates loss only on assistant responses

## Prerequisites

1. opensloth installed: `pip install git+https://github.com/anhvth/opensloth.git`
2. At least 2 GPUs available (adjust the `devices=[...]` list passed to `get_configs` if needed)
3. Sufficient VRAM (reduce batch size if needed)

## ⚙️ Configuration Setup

OpenSloth uses Pydantic models for type-safe configuration. We'll set up:

1. **Data Configuration**: Dataset and tokenization settings
2. **Training Configuration**: GPU allocation and loss calculation
3. **Model Configuration**: Base model and LoRA parameters
4. **Training Arguments**: Learning rate, batch size, and optimization settings

In [None]:
from opensloth.scripts.opensloth_sft_trainer import run_mp_training, setup_envs
from opensloth.opensloth_config import (
    OpenSlothConfig,
    HFDatasetConfig,
    FastModelArgs,
    LoraArgs,
    TrainingArguments,
)
from loguru import logger

# from transformers.training_args import TrainingArguments


# # Main configuration using Pydantic models
def get_configs(devices) -> tuple[OpenSlothConfig, TrainingArguments]:
    """Build the OpenSloth and trainer configurations used by this demo.

    Args:
        devices: List of integer GPU ids to train on. Its length is embedded
            in the output directory name.

    Returns:
        A ``(opensloth_config, training_config)`` tuple. ``setup_envs`` is
        called with both objects before they are returned.
    """
    gpu_count = len(devices)
    # The dataset config and the model loader use the same context length.
    seq_len = 4096

    # Dataset and chat-template settings.
    dataset_cfg = HFDatasetConfig(
        source_type="hf",
        dataset_name="mlabonne/FineTome-100k",
        split="train",
        num_samples=10000,
        tokenizer_name="Qwen/Qwen3-8B",
        chat_template="qwen3",
        instruction_part="<|im_start|>user\n",
        response_part="<|im_start|>assistant\n",
        max_seq_length=seq_len,
        nproc=52,
    )

    # Base model, loaded in 4-bit.
    model_cfg = FastModelArgs(
        model_name="model_store/unsloth/Qwen3-14B-bnb-4bit",
        max_seq_length=seq_len,
        load_in_4bit=True,
    )

    # LoRA adapters on the attention and MLP projection layers.
    attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
    mlp_projections = ["gate_proj", "up_proj", "down_proj"]
    lora_cfg = LoraArgs(
        r=8,
        lora_alpha=16,
        target_modules=attention_projections + mlp_projections,
        lora_dropout=0,
        bias="none",
        use_rslora=False,
    )

    opensloth_config = OpenSlothConfig(
        data=dataset_cfg,
        devices=devices,  # list of int representing GPU ids
        fast_model_args=model_cfg,
        lora_args=lora_cfg,
        sequence_packing=True,
    )

    # Training arguments using the Pydantic model.
    run_dir = f"outputs/exps/qwen3-14b-FineTome-{gpu_count}gpus-seql-packing"
    training_config = TrainingArguments(
        output_dir=run_dir,
        num_train_epochs=1,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=8,  # adjust based on the number of GPUs
        learning_rate=1e-5,
        lr_scheduler_type="linear",
        warmup_steps=5,
        weight_decay=0.01,
        optim="adamw_8bit",
        seed=3407,
        logging_steps=1,
        save_total_limit=1,
        report_to="wandb",  # "tensorboard" or "wandb"
    )

    setup_envs(opensloth_config, training_config)
    return opensloth_config, training_config


if __name__ == "__main__":
    # GPU ids 0 and 2 are used here; change the list to match the ids
    # available on your machine.
    opensloth_config, training_config = get_configs(devices=[0, 2])
    run_mp_training(
        opensloth_config.devices,
        opensloth_config,
        training_config,
    )

Global batch size: 64
[MP] Running on 2 GPUs


[32m16:18:36[0m | [1mINFO    [0m | [36mGPU0[0m | [36mopensloth_sft_trainer.py:41[0m | [1mTraining on GPU 0 with output_dir outputs/exps/qwen3-14b-FineTome-2gpus-seql-packing[0m
[32m16:18:36[0m | [1mINFO    [0m | [36mGPU0[0m | [36mopensloth_sft_trainer.py:44[0m | [1m🚀 Starting total training timer[0m
Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Using compiler location: .cache/unsloth_compiled_cache_0
==((====))==  Unsloth 2025.5.9: Fast Qwen3 patching. Transformers: 4.52.4.
   \\   /|    NVIDIA H100 80GB HBM3. Num GPUs = 1. Max memory: 79.189 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 9.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Using compiler location: .cache/unsloth_compiled_cache_1
==((====))==  Unsloth 2025.5.9: Fast Qwen3 patching. Transformers: 4.52.4.
   \\   /|    NVIDIA H100 80GB HBM3. Num GPUs = 1. Max memory: 79.189 GB.

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.27s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.25s/it]
[32m16:19:01[0m | [1mINFO    [0m | [36mGPU0[0m | [36mlogging_config.py:161[0m | [1m⏱️  model_loading: 11.96s[0m
[32m16:19:01[0m | [1mINFO    [0m | [36mGPU0[0m | [36mnccl_grad_sync.py:124[0m | [1m[GPU=0] NCCL env: RANK=0, WORLD_SIZE=2, MASTER_ADDR=127.0.0.1, MASTER_PORT=29501[0m
[32m16:19:01[0m | [1mINFO    [0m | [36mGPU0[0m | [36mnccl_grad_sync.py:128[0m | [1m[GPU=0] Setting current CUDA device to:0, os.environ['CUDA_VISIBLE_DEVICES']='0'[0m
[32m16:19:02[0m | [1mINFO    [0m | [36mGPU0[0m | [36minit_modules.py:50[0m | [1mModel loaded on device cuda:0, tokenizer: Qwen2TokenizerFast[0m
[32m16:19:07[0m | [1mINFO    [0m | [36mGPU0[0m | [36mlogging_config.py:161[0m | [1m⏱️  lora_setup: 5.67s[0m
[32m16:19:07[0m | [1mINFO    [0m | [36mGPU0[0m | [36minit_modules.py:74[0m | [1mApplied chat temp

Unsloth: Making `model.base_model.model.model` require gradients


Tokenizing dataset (num_proc=52): 100%|██████████| 10000/10000 [00:11<00:00, 861.23 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 10000/10000 [00:00<00:00, 81555.71 examples/s]
[32m16:19:29[0m | [1mINFO    [0m | [36mGPU0[0m | [36minit_modules.py:148[0m | [1mCreating final SFTTrainer with prepared dataset...[0m
[32m16:19:31[0m | [1mINFO    [0m | [36mGPU0[0m | [36minit_modules.py:161[0m | [1mReplacing DataCollatorForLanguageModeling with DataCollatorForSeq2Seq for better sequence handling[0m
[32m16:19:31[0m | [1mINFO    [0m | [36mGPU0[0m | [36minit_modules.py:169[0m | [1mTrainer setup completed successfully[0m
[32m16:19:31[0m | [1mINFO    [0m | [36mGPU0[0m | [36mlogging_config.py:161[0m | [1m⏱️  trainer_setup: 23.11s[0m
[32m16:19:31[0m | [1mINFO    [0m | [36mGPU0[0m | [36minit_modules.py:122[0m | [1mAdd callback ShuffleData to Trainer UnslothSFTTrainer[0m
[32m16:19:31[0m | [1mINFO    [0m | [36mGPU0[0m | [36mlogging_

[LOCAL_RANK=1] Patching log. Dir: outputs/exps/qwen3-14b-FineTome-2gpus-seql-packing, GPUs: 2
[LOCAL_RANK=1] Log patch initialization complete.
🔧 Patching Trainer to use RandomSamplerSeededByEpoch


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
wandb: Currently logged in as: anhvth to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
wandb: Tracking run with wandb version 0.19.11
wandb: Run data is saved locally in /home/anhvth5/projects/opensloth/wandb/run-20250609_161932-fu95llxn
wandb: Run `wandb offline` to turn off syncing.
wandb: Syncing run outputs/exps/qwen3-14b-FineTome-2gpus-seql-packing
wandb: ⭐️ View project at https://wandb.ai/anhvth/huggingface
wandb: 🚀 View run at https://wandb.ai/anhvth/huggingface/runs/fu95llxn


Unsloth: Making `model.base_model.model.model` require gradients
[LOCAL_RANK=0] Patching log. Dir: outputs/exps/qwen3-14b-FineTome-2gpus-seql-packing, GPUs: 2
[LOCAL_RANK=0] Log patch initialization complete.
🔧 Patching Trainer to use RandomSamplerSeededByEpoch


  0%|          | 0/157 [00:00<?, ?it/s][32m16:19:33[0m | [1mINFO    [0m | [36mGPU0[0m | [36mpatch_sampler.py:21[0m | [1m🔄 Starting epoch 1[0m
[32m16:19:33[0m | [1mINFO    [0m | [36mGPU0[0m | [36mpatch_sampler.py:52[0m | [1m🎲 Sampler epoch 0: emitting 10000 indices
First ids dataset samples: [3771, 6672, 7261, 760, 3779, 1772, 7509, 2679, 2305, 9215]
...Last ids: [9674, 1424, 8935, 1679, 2286, 3657, 4012, 4506, 409, 1824][0m
[32m16:19:34[0m | [1mINFO    [0m | [36mGPU0[0m | [36mpatch_sampler.py:28[0m | [1m📋 Dataloader examples logged to .log/dataloader_examples.html[0m
[32m16:19:34[0m | [1mINFO    [0m | [36mGPU0[0m | [36mpatch_sampler.py:52[0m | [1m🎲 Sampler epoch 0: emitting 10000 indices
First ids dataset samples: [3771, 6672, 7261, 760, 3779, 1772, 7509, 2679, 2305, 9215]
...Last ids: [9674, 1424, 8935, 1679, 2286, 3657, 4012, 4506, 409, 1824][0m
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  6%|▋     

## Unsloth default 

In [None]:
import os

from opensloth.patching.patch_sampler import patch_sampler

# Expose only GPU 0 to this process and mark it as local rank 0.
# NOTE(review): HYPERSLOTH_LOCAL_RANK is presumably read by the opensloth
# patching code - confirm the variable name is still current.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "0",
        "HYPERSLOTH_LOCAL_RANK": "0",
    }
)


def train_qwen3_model():
    """Fine-tune a 4-bit Qwen3 model with LoRA through Unsloth and TRL.

    The steps, in order: load the FineTome dataset via opensloth (with
    ``do_tokenize=False``), load the base model, attach LoRA adapters, build
    an ``SFTTrainer`` restricted to response-only loss, patch its sampler,
    train, and print GPU memory and runtime statistics.

    Returns:
        The ``(model, tokenizer)`` pair after training.
    """
    from opensloth.dataset_utils import get_tokenized_dataset, HFDatasetConfig

    # Chat-template markers, used by the dataset config and the loss masking.
    user_marker = "<|im_start|>user\n"
    assistant_marker = "<|im_start|>assistant\n"
    context_length = 4096

    dataset_config = HFDatasetConfig(
        source_type="hf",
        dataset_name="mlabonne/FineTome-100k",
        split="train",
        num_samples=10000,
        tokenizer_name="Qwen/Qwen3-8B",
        chat_template="qwen3",
        instruction_part=user_marker,
        response_part=assistant_marker,
        max_seq_length=context_length,
        nproc=52,
    )
    train_data = get_tokenized_dataset(dataset_config, do_tokenize=False)

    # Imported only after the dataset is prepared, as in the original cell.
    from unsloth import FastLanguageModel
    import torch
    from trl import SFTTrainer, SFTConfig

    def _to_gib(num_bytes):
        """Convert a byte count to GiB, rounded to three decimals."""
        return round(num_bytes / 1024 / 1024 / 1024, 3)

    # Load model
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/Qwen3-0.6B-bnb-4bit",
        max_seq_length=context_length,
        load_in_4bit=True,
        load_in_8bit=False,
        full_finetuning=False,
    )

    # Add LoRA adapters on the attention and MLP projections
    lora_targets = [
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ]
    model = FastLanguageModel.get_peft_model(
        model,
        r=8,
        lora_alpha=16,
        target_modules=lora_targets,
        lora_dropout=0,
        bias="none",
        use_gradient_checkpointing=True,
        random_state=3407,
        use_rslora=False,
        loftq_config=None,
    )

    sft_config = SFTConfig(
        output_dir="outputs/exps/qwen3-14b-FineTome-unsloth",
        dataset_text_field="text",
        num_train_epochs=1,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=8 * 2,  # adjust based on the number of GPUs
        learning_rate=1e-5,
        lr_scheduler_type="linear",
        warmup_steps=5,
        weight_decay=0.01,
        optim="adamw_8bit",
        seed=3407,
        logging_steps=1,
        report_to="wandb",
    )

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_data,
        eval_dataset=None,
        args=sft_config,
    )

    from unsloth_zoo.dataset_utils import train_on_responses_only

    # Compute the loss on assistant responses only.
    trainer = train_on_responses_only(
        trainer,
        tokenizer=tokenizer,
        instruction_part=user_marker,
        response_part=assistant_marker,
    )

    # Show memory stats before training
    device_props = torch.cuda.get_device_properties(0)
    reserved_before = _to_gib(torch.cuda.max_memory_reserved())
    total_gib = _to_gib(device_props.total_memory)
    print(f"GPU = {device_props.name}. Max memory = {total_gib} GB.")
    print(f"{reserved_before} GB of memory reserved.")

    # Train the model with the opensloth sampler patch applied
    trainer = patch_sampler(trainer)
    trainer_stats = trainer.train()

    # Show final memory and time stats
    reserved_peak = _to_gib(torch.cuda.max_memory_reserved())
    reserved_for_training = round(reserved_peak - reserved_before, 3)
    peak_percent = round(reserved_peak / total_gib * 100, 3)
    training_percent = round(reserved_for_training / total_gib * 100, 3)

    runtime_seconds = trainer_stats.metrics["train_runtime"]
    print(f"{runtime_seconds} seconds used for training.")
    print(f"{round(runtime_seconds / 60, 2)} minutes used for training.")
    print(f"Peak reserved memory = {reserved_peak} GB.")
    print(f"Peak reserved memory for training = {reserved_for_training} GB.")
    print(f"Peak reserved memory % of max memory = {peak_percent} %.")
    print(
        f"Peak reserved memory for training % of max memory = {training_percent} %."
    )

    return model, tokenizer


if __name__ == "__main__":
    # Run the full single-process training and bind the trained model and
    # tokenizer to module-level names.
    model, tokenizer = train_qwen3_model()
    print("Training completed successfully!")