In [1]:
# Standard library
import datetime as dt
import json
import os
import shutil

# Third-party (original relative order kept: unsloth is imported before trl)
import torch
import mlflow
from torchinfo import summary
from unsloth import FastModel
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig
from unsloth import is_bfloat16_supported
from unsloth.chat_templates import get_chat_template, standardize_data_formats, train_on_responses_only

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-05-28 16:07:38.485958: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748416058.509916 2529908 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748416058.517213 2529908 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748416058.536627 2529908 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748416058.536659 2529908 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748416058.536661 2529908 computation_placer.cc:177] computation placer alr

🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
# Load the base model + tokenizer, switch the tokenizer to the ChatML template,
# then wrap the model with LoRA adapters.
BASE_MODEL_NAME = 'unsloth/Qwen3-1.7B'
MAX_SEQ_LENGTH = 2048
LORA_TARGET_MODULES = [
    'q_proj', 'k_proj', 'v_proj', 'o_proj',
    'gate_proj', 'up_proj', 'down_proj',
]

# Base model: float32 dtype requested, no 4-bit/8-bit loading, no full fine-tuning.
load_options = dict(
    model_name=BASE_MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=torch.float32,
    load_in_4bit=False,
    load_in_8bit=False,
    full_finetuning=False,
    use_gradient_checkpointing='unsloth',
    fullgraph=True,
    unsloth_force_compile=False,
)
model, tokenizer = FastModel.from_pretrained(**load_options)

# ChatML chat template, with no system message configured.
tokenizer = get_chat_template(tokenizer=tokenizer, chat_template='chatml', system_message=None)

# LoRA adapters: rank 8, alpha 8, no dropout, no bias terms.
lora_options = dict(
    r=8,
    target_modules=LORA_TARGET_MODULES,
    lora_alpha=8,
    lora_dropout=0,
    bias='none',
    finetune_vision_layers=False,
    finetune_language_layers=True,
    finetune_attention_modules=True,
    finetune_mlp_modules=True,
    use_gradient_checkpointing=True,
    random_state=3407,
    use_rslora=False,
    task_type='CAUSAL_LM',
)
model = FastModel.get_peft_model(model=model, **lora_options)

# Last expression of the cell: display the layer/parameter summary.
summary(model, depth=5)

==((====))==  Unsloth 2025.5.7: Fast Qwen3 patching. Transformers: 4.51.3.
   \\   /|    Quadro RTX 6000. Num GPUs = 1. Max memory: 23.638 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


Unsloth: Will map <|im_end|> to EOS = <|im_end|>.


Unsloth: Making `model.base_model.model.model` require gradients


Layer (type:depth-idx)                                            Param #
PeftModelForCausalLM                                              --
├─LoraModel: 1-1                                                  --
│    └─Qwen3ForCausalLM: 2-1                                      --
│    │    └─Qwen3Model: 3-1                                       --
│    │    │    └─Embedding: 4-1                                   (311,164,928)
│    │    │    └─ModuleList: 4-2                                  --
│    │    │    │    └─Qwen3DecoderLayer: 5-1                      50,647,296
│    │    │    │    └─Qwen3DecoderLayer: 5-2                      50,647,296
│    │    │    │    └─Qwen3DecoderLayer: 5-3                      50,647,296
│    │    │    │    └─Qwen3DecoderLayer: 5-4                      50,647,296
│    │    │    │    └─Qwen3DecoderLayer: 5-5                      50,647,296
│    │    │    │    └─Qwen3DecoderLayer: 5-6                      50,647,296
│    │    │    │    └─Qwen3DecoderLayer

In [3]:
# Prepare dataset for SFT
def format_prompts(examples):
    """Render a batch of conversations to chat-template strings.

    Reads ``examples['conversations']`` and passes it to the notebook-level
    ``tokenizer``'s ``apply_chat_template`` with tokenization disabled, no
    generation prompt and ``enable_thinking=False``.

    Returns:
        dict: ``{'text': <rendered output>}``, the shape expected by a batched
        ``dataset.map`` call.
    """
    template_options = dict(tokenize=False, add_generation_prompt=False, enable_thinking=False)
    rendered = tokenizer.apply_chat_template(examples['conversations'], **template_options)
    return {'text': rendered}

# Take the first 1000 rows of the seed-42 shuffle of the FineTome-100k train split.
finetome_train = load_dataset('mlabonne/FineTome-100k', split='train')
finetome_subset = finetome_train.shuffle(seed=42).select(range(1000))

# Standardize the conversation format, then add the rendered 'text' column.
finetome_standardized = standardize_data_formats(dataset=finetome_subset, tokenizer=tokenizer)
dataset = finetome_standardized.map(format_prompts, batched=True)

# Show one rendered example as a sanity check.
print(dataset[0]['text'])

<|im_start|>user
Give three types of computer graphics.<|im_end|>
<|im_start|>assistant
1. Raster Graphics: These are also called bitmap graphics and are composed of pixels arranged in a grid. Each pixel can have a different color and shade. Raster graphics excel at representing photographic images and digital painting.

2. Vector Graphics: These graphics are constructed using mathematical formulas representing geometric shapes like lines, curves, and polygons. They are resolution-independent, meaning they can be scaled up or down in size without losing quality. Vector graphics are commonly used for logos, icons, typography and illustrations.

3. 3D Graphics: These graphics are used to create three-dimensional digital representations of objects. 3D graphics use techniques like modeling, rendering, and shading to simulate depth and surface properties. These graphics are used in animation, video games, architecture, engineering, and virtual reality.<|im_end|>



In [4]:
# Names used by the training run below.
run_name = 'SFT_Run'            # MLflow run name
output_dir = 'lora_model'       # directory the finetuned model is saved to
sft_model_name = 'sft_model'    # MLflow artifact path and name in the printed `ollama create` hint

# Each execution creates its own timestamped experiment.
# The timestamp is formatted outside the f-string: reusing the same quote
# character inside an f-string expression is a SyntaxError before Python 3.12.
experiment_timestamp = dt.datetime.now().strftime('%Y-%m-%d %H-%M-%S')
mlflow.set_experiment(f'Model Training @ {experiment_timestamp}')
with mlflow.start_run(run_name=run_name):
    # MLFlow logging
    mlflow.set_tag('Training Info', 'Unsloth SFT')

    # Log the model summary directly as a text artifact instead of going
    # through a scratch file in the working directory: nothing is left behind
    # if logging fails, and the text is not written with the platform's
    # default encoding (the summary contains box-drawing characters).
    mlflow.log_text(str(summary(model, depth=10)), 'model_summary.txt')
    
    
    # Setup SFT trainer
    # The training arguments are built first so every hyperparameter can be
    # read in one place. Evaluation-strategy, epoch-count and bf16/fp16
    # options are not passed here.
    sft_args = SFTConfig(
        # Bookkeeping / reporting
        output_dir='./logs',
        run_name=mlflow.active_run().info.run_name,
        report_to='mlflow',
        logging_steps=1,
        seed=3407,
        # Data
        dataset_text_field='text',
        dataset_num_proc=2,
        # Batching
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=4,
        # Optimisation schedule
        optim='adamw_8bit',
        learning_rate=2e-4,
        weight_decay=0.01,
        max_grad_norm=1.0,
        lr_scheduler_type='linear',
        warmup_steps=5,
        max_steps=30,
        torch_compile=False,
    )
    trainer = SFTTrainer(
        model=model,
        train_dataset=dataset,
        eval_dataset=None,
        peft_config=None,
        args=sft_args,
    )

    # Restrict the trainer to the assistant turns, using the ChatML turn
    # markers as the instruction/response delimiters.
    trainer = train_on_responses_only(
        trainer=trainer,
        tokenizer=tokenizer,
        instruction_part='<|im_start|>user\n',
        response_part='<|im_start|>assistant\n',
    )

    # Sanity check: decode the first example's labels, swapping every -100
    # entry for the pad token and then stripping the pad token text.
    first_labels = trainer.train_dataset[0]['labels']
    printable_ids = [
        tokenizer.pad_token_id if token_id == -100 else token_id
        for token_id in first_labels
    ]
    decoded_labels = tokenizer.decode(printable_ids)
    print(decoded_labels.replace(tokenizer.pad_token, ''))
    
    
    # Perform SFT
    def _to_gib(num_bytes):
        """Convert a byte count to GiB, rounded to three decimals."""
        return round(num_bytes / 1024 ** 3, 3)

    # Baseline: device capacity and the peak reserved memory before training.
    gpu_stats = torch.cuda.get_device_properties(0)
    max_memory = _to_gib(gpu_stats.total_memory)
    start_gpu_memory = _to_gib(torch.cuda.max_memory_reserved())
    print(f'GPU = {gpu_stats.name}. Max memory = {max_memory} GB.')
    print(f'{start_gpu_memory} GB of memory reserved.')

    trainer_stats = trainer.train()

    # Peak reserved memory after training, in absolute terms and relative to
    # the pre-training baseline and the device capacity.
    used_memory = _to_gib(torch.cuda.max_memory_reserved())
    used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
    used_percentage = round(used_memory / max_memory * 100, 3)
    lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
    for report_line in (
        f'Peak reserved memory = {used_memory} GB.',
        f'Peak reserved memory for training = {used_memory_for_lora} GB.',
        f'Peak reserved memory % of max memory = {used_percentage} %.',
        f'Peak reserved memory for training % of max memory = {lora_percentage} %.',
    ):
        print(report_line)
    
    
    # Save finetuned model
    def patch_config(output_dir, architecture=None):
        """Rewrite the ``architectures`` entry of ``<output_dir>/config.json``.

        Args:
            output_dir: Directory containing the ``config.json`` to patch.
            architecture: Class name to record as the only ``architectures``
                entry. When omitted, the class name of
                ``trainer.model.base_model.model`` is used, which is the
                original behaviour.

        Raises:
            FileNotFoundError: If ``config.json`` does not exist in
                ``output_dir``.

        All other keys in the file are preserved; the file is rewritten
        in place with two-space indentation.
        """
        config_path = os.path.join(output_dir, 'config.json')

        if architecture is None:
            # Fall back to the model wrapped by the notebook-level trainer.
            architecture = trainer.model.base_model.model.__class__.__name__

        with open(config_path, 'r', encoding='utf-8') as f:
            config_data = json.load(f)

        config_data['architectures'] = [architecture]

        with open(config_path, 'w', encoding='utf-8') as f:
            json.dump(config_data, f, indent=2)

    ## Save to VLLM
    trainer.model.save_pretrained_merged(output_dir, tokenizer, save_method='lora')
    # Alternative: trainer.model.save_pretrained_merged(output_dir, tokenizer, save_method='merged_16bit')
    patch_config(output_dir)

    ## Save to GGUF
    gguf_files = trainer.model.save_pretrained_gguf(output_dir, quantization_type='f16')

    # Move the first reported GGUF file into output_dir with shutil instead of
    # a shell `mv`: paths with spaces or shell metacharacters are handled, and
    # a failed move raises instead of being silently ignored.
    gguf_source = gguf_files[0]
    gguf_target = os.path.join(output_dir, os.path.basename(gguf_source))
    if os.path.abspath(gguf_source) != os.path.abspath(gguf_target):
        shutil.move(gguf_source, gguf_target)

    # Record the file's post-move name in the Modelfile, which lives in the
    # same directory as the GGUF file.
    # NOTE(review): assumes Ollama resolves a relative FROM path against the
    # Modelfile's directory - confirm against the Ollama Modelfile docs.
    modelfile_path = os.path.join(output_dir, 'Modelfile')
    modelfile_text = tokenizer._ollama_modelfile.replace(
        '{__FILE_LOCATION__}', os.path.basename(gguf_target)
    )
    with open(modelfile_path, 'w', encoding='utf-8') as f:
        f.write(modelfile_text)
    print(f'run `ollama create {sft_model_name} -f {output_dir}/Modelfile`')

    # Upload everything in output_dir to the active MLflow run.
    mlflow.log_artifacts(output_dir, artifact_path=sft_model_name)

2025/05/28 16:08:04 INFO mlflow.tracking.fluent: Experiment with name 'Model Training @ 2025-05-28 16-08-04' does not exist. Creating a new experiment.


1. Raster Graphics: These are also called bitmap graphics and are composed of pixels arranged in a grid. Each pixel can have a different color and shade. Raster graphics excel at representing photographic images and digital painting.

2. Vector Graphics: These graphics are constructed using mathematical formulas representing geometric shapes like lines, curves, and polygons. They are resolution-independent, meaning they can be scaled up or down in size without losing quality. Vector graphics are commonly used for logos, icons, typography and illustrations.

3. 3D Graphics: These graphics are used to create three-dimensional digital representations of objects. 3D graphics use techniques like modeling, rendering, and shading to simulate depth and surface properties. These graphics are used in animation, video games, architecture, engineering, and virtual reality.<|im_end|>

GPU = Quadro RTX 6000. Max memory = 23.638 GB.
7.639 GB of memory reserved.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,000 | Num Epochs = 1 | Total steps = 30
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 8,716,288/1,729,291,264 (0.50% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,1.5456
2,1.9669
3,1.2782
4,1.4838
5,1.4725
6,1.1277
7,1.5232
8,1.1058
9,1.0103
10,1.1097


Peak reserved memory = 11.518 GB.
Peak reserved memory for training = 3.879 GB.
Peak reserved memory % of max memory = 48.727 %.
Peak reserved memory for training % of max memory = 16.41 %.
Found HuggingFace hub cache directory: /home/blim/.cache/huggingface/hub
Checking cache directory for required files...
Successfully copied all 1 files from cache to lora_model.


Unsloth: Merging weights into 16bit: 100%|██████████| 1/1 [00:22<00:00, 22.79s/it]


Unsloth: Updating system package directories
Unsloth: All commands will now use admin permissions (sudo)
Unsloth: Install GGUF and other packages
Unsloth GGUF:hf-to-gguf:Loading model: lora_model
Unsloth GGUF:hf-to-gguf:Model architecture: Qwen3ForCausalLM
Unsloth GGUF:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
Unsloth GGUF:hf-to-gguf:Exporting model...
Unsloth GGUF:hf-to-gguf:gguf: loading model part 'model.safetensors'
Unsloth GGUF:hf-to-gguf:token_embd.weight,         torch.bfloat16 --> F16, shape = {2048, 151936}
Unsloth GGUF:hf-to-gguf:output_norm.weight,        torch.bfloat16 --> F32, shape = {2048}
Unsloth GGUF:hf-to-gguf:Set meta model
Unsloth GGUF:hf-to-gguf:Set model parameters
Unsloth GGUF:hf-to-gguf:gguf: context length = 40960
Unsloth GGUF:hf-to-gguf:gguf: embedding length = 2048
Unsloth GGUF:hf-to-gguf:gguf: feed forward length = 6144
Unsloth GGUF:hf-to-gguf:gguf: head count = 16
Unsloth GGUF:hf-to-gguf:gguf: key-value head count = 8
Unsloth GGUF:hf-t

Unsloth: GGUF conversion:   0%|          | 0/100 [00:00<?, ?it/s]

Unsloth GGUF:hf-to-gguf:Model successfully exported to ./
Unsloth: Converted to lora_model.F16.gguf with size = 3.4G
Unsloth: Successfully saved GGUF to:
lora_model.F16.gguf
run `ollama create sft_model -f lora_model/Modelfile`


In [5]:
# Load finetuned model
loaded_model, loaded_tokenizer = FastModel.from_pretrained(
    model_name=output_dir,
    max_seq_length=2048,
    dtype=torch.float32,
    load_in_4bit=False,
    load_in_8bit=False,
    full_finetuning=False,
    use_gradient_checkpointing='unsloth',
    fullgraph=True,
    unsloth_force_compile=False,
)

# Spot-check one projection: the weight loaded from disk should equal the
# trained base weight plus the LoRA update, W + scaling * (B @ A).
# The scaling factor is read from the layer instead of being assumed to be 1,
# so the check stays meaningful if lora_alpha and r are changed to differ.
# NOTE(review): the comparison is exact; with a non-unit scaling it may need a
# tolerance depending on how the merge rounds - confirm if that case arises.
check_layer = 3
adapter_name = 'default'
trained_q_proj = trainer.model.base_model.model.model.layers[check_layer].self_attn.q_proj
loaded_q_proj = loaded_model.model.layers[check_layer].self_attn.q_proj

with torch.no_grad():  # pure comparison; no autograd graph needed
    lora_delta = trained_q_proj.lora_B[adapter_name].weight @ trained_q_proj.lora_A[adapter_name].weight
    expected_weight = trained_q_proj.base_layer.weight + trained_q_proj.scaling[adapter_name] * lora_delta
    weights_match = torch.eq(expected_weight, loaded_q_proj.weight).all()

weights_match

==((====))==  Unsloth 2025.5.7: Fast Qwen3 patching. Transformers: 4.51.3.
   \\   /|    Quadro RTX 6000. Num GPUs = 1. Max memory: 23.638 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


tensor(True, device='cuda:0')