In [None]:
# Import the configuration models by name (instead of a wildcard) so the
# origin of every class used in this notebook is explicit.
from HyperSloth.hypersloth_config import (
    FastModelArgs,
    HFDatasetConfig,
    HyperConfig,
    LoraArgs,
    TrainingArgsConfig,
    TrainingConfig,
)
from HyperSloth.scripts.hp_trainer import run_multiprocess_training, setup_envs

# Main configuration using Pydantic models: dataset, GPU/loss settings,
# base model loading options and LoRA adapter settings.
hyper_config_model = HyperConfig(
    data=HFDatasetConfig(
        dataset_name="llamafactory/OpenThoughts-114k",
        split="train",
        # Tokenizer taken from a different Qwen3 checkpoint than the model
        # below; per the original author this does not matter because both
        # belong to the Qwen3 family. NOTE(review): confirm the tokenizers
        # are really identical.
        tokenizer_name="Qwen/Qwen3-8B",
        num_samples=1000,  # the run log reports 1000 samples in the final dataset
        # ChatML turn markers for the user and assistant parts.
        # Presumably used to locate the response span for the
        # "response_only" loss below — verify against the dataset utilities.
        instruction_part="<|im_start|>user\n",
        response_part="<|im_start|>assistant\n",
        chat_template="chatml",
    ),
    training=TrainingConfig(
        gpus=[0, 1],  # GPU ids handed to run_multiprocess_training in the launch cell
        loss_type="response_only",
    ),
    fast_model_args=FastModelArgs(
        model_name="unsloth/Qwen3-0.6b-bnb-4bit",
        max_seq_length=32_000,
        load_in_4bit=True,
    ),
    lora_args=LoraArgs(
        r=8,
        lora_alpha=16,
        # Attention projections followed by MLP projections.
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
        lora_dropout=0,
        bias="none",
        use_rslora=False,
    ),
)

# Trainer hyperparameters, held in a Pydantic model and grouped by concern.
training_config_model = TrainingArgsConfig(
    # --- Run bookkeeping -------------------------------------------------
    # NOTE(review): the directory name says "8b" although the model
    # configured in this notebook is a 0.6B checkpoint — confirm the name
    # is intentional.
    output_dir="outputs/qwen3-8b-openthought-2gpus/",
    save_total_limit=2,
    logging_steps=3,
    report_to="none",  # alternatives: "tensorboard" or "wandb"
    seed=3407,
    # --- Batch size and schedule length ----------------------------------
    # 1 sample per device x 8 accumulation steps; with the two GPUs listed
    # in the main config this matches the "Global batch size: 16" line in
    # the run log.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    # --- Optimizer and learning-rate schedule ----------------------------
    optim="adamw_8bit",
    learning_rate=1e-5,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    warmup_steps=5,
)

In [None]:

# Prepare the environment from both config objects before any worker starts.
# (What exactly setup_envs configures is not visible here — see hp_trainer.)
setup_envs(hyper_config_model, training_config_model)

# Launch training across the GPU ids listed in the main configuration.
gpu_ids = hyper_config_model.training.gpus
run_multiprocess_training(gpu_ids, hyper_config_model, training_config_model)

Global batch size: 16
[MP] Running on 2 GPUs


[32m03:05:41[0m | [1mINFO    [0m | [36mGPU1[0m | [36mhp_trainer.py:44[0m | [1m🔧 GPU 1 (Rank 1/1) | Model: unsloth/Qwen3-0.6b-bnb-4bit[0m
[32m03:05:41[0m | [1mINFO    [0m | [36mGPU1[0m | [36mhp_trainer.py:50[0m | [1mTraining on GPU 1 with output_dir outputs/qwen3-8b-openthought-2gpus/[0m
[32m03:05:41[0m | [1mINFO    [0m | [36mGPU1[0m | [36mhp_trainer.py:53[0m | [1m🚀 Starting total training timer[0m
[32m03:05:41[0m | [1mINFO    [0m | [36mGPU0[0m | [36mhp_trainer.py:44[0m | [1m🔧 GPU 0 (Rank 0/1) | Model: unsloth/Qwen3-0.6b-bnb-4bit[0m
[32m03:05:41[0m | [1mINFO    [0m | [36mGPU0[0m | [36mhp_trainer.py:50[0m | [1mTraining on GPU 0 with output_dir outputs/qwen3-8b-openthought-2gpus/[0m
[32m03:05:41[0m | [1mINFO    [0m | [36mGPU0[0m | [36mhp_trainer.py:53[0m | [1m🚀 Starting total training timer[0m


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.5.9: Fast Qwen3 patching. Transformers: 4.52.4.
   \\   /|    NVIDIA H100 80GB HBM3. Num GPUs = 1. Max memory: 79.189 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 9.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
==((====))==  Unsloth 2025.5.9: Fast Qwen3 patching. Transformers: 4.52.4.
   \\   /|    NVIDIA H100 80GB HBM3. Num GPUs = 1. Max memory: 79.189 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 9.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfl

[32m03:06:24[0m | [1mINFO    [0m | [36mGPU1[0m | [36mlogging_config.py:407[0m | [1m⏱️  model_loading: 15.26s[0m
[32m03:06:24[0m | [1mINFO    [0m | [36mGPU1[0m | [36mnccl_grad_sync.py:127[0m | [1m[GPU=1] NCCL env: RANK=1, WORLD_SIZE=2, MASTER_ADDR=127.0.0.1, MASTER_PORT=29500[0m
[32m03:06:24[0m | [1mINFO    [0m | [36mGPU1[0m | [36mnccl_grad_sync.py:146[0m | [1m[GPU=1] Setting current CUDA device to:0[0m
[32m03:06:25[0m | [1mINFO    [0m | [36mGPU0[0m | [36mlogging_config.py:407[0m | [1m⏱️  model_loading: 14.46s[0m
[32m03:06:25[0m | [1mINFO    [0m | [36mGPU0[0m | [36mnccl_grad_sync.py:127[0m | [1m[GPU=0] NCCL env: RANK=0, WORLD_SIZE=2, MASTER_ADDR=127.0.0.1, MASTER_PORT=29500[0m
[32m03:06:25[0m | [1mINFO    [0m | [36mGPU0[0m | [36mnccl_grad_sync.py:146[0m | [1m[GPU=0] Setting current CUDA device to:0[0m
[32m03:06:26[0m | [1mINFO    [0m | [36mGPU1[0m | [36mnccl_grad_sync.py:160[0m | [1m[GPU=1] NCCL setup complete: rank=1,

Unsloth: Making `model.base_model.model.model` require gradients
Unsloth: Making `model.base_model.model.model` require gradients


[32m03:06:29[0m | [1mINFO    [0m | [36mGPU0[0m | [36mlogging_config.py:407[0m | [1m⏱️  model_init: 18.04s[0m
[32m03:06:29[0m | [1mINFO    [0m | [36mGPU0[0m | [36mdataset_utils.py:283[0m | [1mLoading dataset... and tokenize[0m
[32m03:06:29[0m | [1mINFO    [0m | [36mGPU0[0m | [36mdataset_utils.py:299[0m | [1mFinal dataset loaded with 1000 samples (cache_id: abb36c5638db...)[0m
[32m03:06:29[0m | [1mINFO    [0m | [36mGPU0[0m | [36minit_modules.py:131[0m | [1mCreating final SFTTrainer with prepared dataset...[0m
[32m03:06:29[0m | [1mINFO    [0m | [36mGPU1[0m | [36mlogging_config.py:407[0m | [1m⏱️  model_init: 19.57s[0m
[32m03:06:29[0m | [1mINFO    [0m | [36mGPU1[0m | [36mdataset_utils.py:283[0m | [1mLoading dataset... and tokenize[0m
[32m03:06:29[0m | [1mINFO    [0m | [36mGPU1[0m | [36mdataset_utils.py:299[0m | [1mFinal dataset loaded with 1000 samples (cache_id: abb36c5638db...)[0m
[32m03:06:29[0m | [1mINFO    [0m | 


=== EXAMPLE #1 ===
[92m<|im_start|>system
You are an assistant that thoroughly explores questions through a systematic long thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarization, exploration, reassessment, reflection, backtracing, and iteration to develop a well-considered thinking process. Detail your reasoning process using the specified format: <think>thought with steps separated by '

'</think> Each step should include detailed considerations such as analyzing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. Based on various attempts, explorations, and reflections from the thoughts, you should systematically present the final solution that you deem correct. The solution should remain a logical, accurate, concise expression style and detail necessary steps needed to reach 

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


More training debug examples written to .log/dataloader_examples.html


[32m03:06:37[0m | [1mINFO    [0m | [36mGPU0[0m | [36mpatch_sampler.py:28[0m | [1m📋 Dataloader examples logged to .log/dataloader_examples.html[0m
[32m03:06:37[0m | [1mINFO    [0m | [36mGPU0[0m | [36mpatch_sampler.py:52[0m | [1m🎲 Sampler epoch 0: emitting 1000 indices
First ids: [776, 507, 895, 922, 33, 483, 85, 750, 354, 523]
Last ids: [104, 754, 142, 228, 250, 281, 759, 25, 114, 654][0m
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Unsloth: Will smartly offload gradients to save VRAM!
Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
3,0.6852
6,0.7039
9,0.6951
12,0.713
15,0.7363


Step,Training Loss


[32m03:07:31[0m | [1mINFO    [0m | [36mGPU0[0m | [36minner_training_loop.py:100[0m | [1m
🚀 HyperSloth Token Efficiency Report (Rank 0)
+---------------------+----------------+-----------------------+
| Stage               |   Total Tokens |   Token Utilization % |
| Before Optimization |      1,283,882 |                67.39% |
+---------------------+----------------+-----------------------+
| After Optimization  |        865,203 |               100.00% |
+---------------------+----------------+-----------------------+
[0m
[32m03:07:31[0m | [1mINFO    [0m | [36mGPU1[0m | [36minner_training_loop.py:100[0m | [1m
🚀 HyperSloth Token Efficiency Report (Rank 1)
+---------------------+----------------+-----------------------+
| Stage               |   Total Tokens |   Token Utilization % |
| Before Optimization |      1,283,882 |                67.39% |
+---------------------+----------------+-----------------------+
| After Optimization  |        865,203 |               10