In [None]:
# Kill all processes on the GPU.
# `fuser -v` lists every process that has an NVIDIA device file open and
# `-k` sends each of them a kill signal.
# WARNING: this also terminates any other job using the GPU on this machine.
!fuser -v /dev/nvidia* -k

# Libraries

In [4]:
%%capture
# Install the training stack; the %%capture magic above hides pip's output.
import os
# Environment detection: look for the substring 'COLAB_' in the concatenated
# names of all environment variables.
if 'COLAB_' not in ''.join(os.environ.keys()):
    # No COLAB_* variable found: plain install, pip resolves unsloth's dependencies.
    %pip install unsloth
else:
    # Do this only in Colab notebooks and Kaggle notebooks!
    # NOTE(review): the condition above only matches 'COLAB_' variables --
    # confirm that Kaggle sessions actually reach this branch.
    # --no-deps installs the listed packages without their declared
    # dependencies (presumably to leave the preinstalled torch stack
    # untouched -- confirm). xformers is the only pinned version.
    %pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
    %pip install --no-deps cut_cross_entropy unsloth_zoo
    %pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    %pip install --no-deps unsloth

In [None]:
import torch
from datetime import datetime
from datasets import load_dataset
from unsloth import FastLanguageModel, UnslothTrainer, UnslothTrainingArguments, is_bf16_supported
from trl import SFTTrainer
from transformers import TrainingArguments

# Config

In [12]:
# Project configs
seed = 69
lang = 'en' # 'en' | 'id'
# NOTE(review): `task` is baked into the project name of a fresh run, yet the
# data configs below point at Wikipedia -- confirm both agree before starting
# a new (non-resumed) run.
task = 'gsm8k' # 'wikipedia' | 'gsm8k'

# Data Configs
hf_data_id = 'wikimedia/wikipedia'
hf_data_dir = '20231101.en'
hf_data_split = 'train[:2500]'
max_seq_length = 1024

# Model configs
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# LoRA configs
lora_target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']
lora_r = 8
lora_alpha = 16

# Run mode: True continues an earlier run stored on the Hub, False starts a new one.
resume_from_checkpoint = True
if resume_from_checkpoint:
    # Hub repo of the run to resume, in the form '<user>/<project_name>'.
    hub_model_id = ''
    # Fail early with an actionable message: an empty id would otherwise become
    # an empty project/model name and only surface as a repo-id validation
    # error inside snapshot_download.
    if not hub_model_id.strip():
        raise ValueError(
            "resume_from_checkpoint is True but hub_model_id is empty; "
            "set hub_model_id to the Hub repo ('<user>/<project_name>') of the run to resume."
        )
    project_name = hub_model_id.split('/')[-1]
    model_name = project_name

    # Imported here because it is only needed when resuming.
    from huggingface_hub import snapshot_download
    # Mirror the Hub repo into ./<project_name>. That directory is then used
    # both as the model path and as the trainer's output_dir.
    snapshot_download(repo_id=hub_model_id, local_dir=model_name)
else:
    # Fresh run: start from the base model and derive a timestamped project name.
    model_name = 'unsloth/Meta-Llama-3.1-8B'
    project_name = f'L3.1-8B-{task}-{lang}-v{datetime.now().strftime("%Y%m%d%H%M%S")}'
    hub_model_id = f'alxxtexxr/{project_name}'
print("Resume from checkpoint:", resume_from_checkpoint)
print("Project name:", project_name)
print("Hub model ID:", hub_model_id)

Fetching 239 files:   0%|          | 0/239 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/43.1M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.12k [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/812 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/812 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/2.33k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.12k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/43.1M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/812 [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/16.7k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.12k [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/43.1M [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/812 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/18.3k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/43.1M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.12k [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/812 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/19.9k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.12k [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/812 [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/43.1M [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/21.5k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.12k [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/812 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/43.1M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/23.1k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/43.1M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.12k [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/812 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/24.7k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.12k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/43.1M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/812 [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/26.3k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/43.1M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/812 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.12k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/812 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.12k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/43.1M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/29.5k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/43.1M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.12k [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/812 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/31.1k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/43.1M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.12k [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/812 [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/3.91k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.12k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/43.1M [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/812 [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/32.7k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.12k [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/812 [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/43.1M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/34.3k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/43.1M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.12k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/812 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/5.51k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/43.1M [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.12k [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/812 [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/7.08k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.12k [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/812 [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/43.1M [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/8.67k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/43.1M [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/812 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.12k [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.12k [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/812 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/43.1M [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/11.9k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/43.1M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.12k [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/812 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.12k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/43.1M [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/812 [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

(…).tfevents.1741182714.5172c9540b89.8159.0:   0%|          | 0.00/50.1k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/15.1k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

Resume from checkpoint: True
Project name: L3.1-8B-wikipedia-en-v20250305134947
Hub model ID: alxxtexxr/L3.1-8B-wikipedia-en-v20250305134947


# Model

In [13]:
# Load the model and its tokenizer. `model_name` is either the base model id
# or, when resuming, the local directory the checkpoint repo was downloaded to.
model_load_options = dict(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**model_load_options)

==((====))==  Unsloth 2025.3.4: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

Unsloth 2025.3.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [14]:
# LoRA settings: rank, alpha and target modules come from the config cell.
lora_options = {
    'r': lora_r,
    'lora_alpha': lora_alpha,
    'target_modules': lora_target_modules,
    'lora_dropout': 0,  # Supports any, but = 0 is optimized
    'bias': 'none',     # Supports any, but = 'none' is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    # True or 'unsloth' for very long context
    'use_gradient_checkpointing': False,
    'use_rslora': False,
    'loftq_config': None,
    'random_state': seed,
}
model = FastLanguageModel.get_peft_model(model, **lora_options)

Unsloth: Already have LoRA adapters! We shall skip this step.


# Data

In [15]:
eos_token = tokenizer.eos_token


def format_prompts(examples):
    """Return the batch's 'text' column with the EOS token appended to every entry."""
    terminated_texts = []
    for article in examples['text']:
        terminated_texts.append(article + eos_token)
    return {'text': terminated_texts}


# Load the configured dataset slice, then append the EOS token batch by batch.
dataset = load_dataset(
    hf_data_id,
    data_dir=hf_data_dir,
    split=hf_data_split,
).map(format_prompts, batched=True)

README.md:   0%|          | 0.00/131k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/41 [00:00<?, ?files/s]

train-00000-of-00041.parquet:   0%|          | 0.00/420M [00:00<?, ?B/s]

train-00001-of-00041.parquet:   0%|          | 0.00/351M [00:00<?, ?B/s]

train-00002-of-00041.parquet:   0%|          | 0.00/329M [00:00<?, ?B/s]

train-00003-of-00041.parquet:   0%|          | 0.00/331M [00:00<?, ?B/s]

train-00004-of-00041.parquet:   0%|          | 0.00/307M [00:00<?, ?B/s]

train-00005-of-00041.parquet:   0%|          | 0.00/244M [00:00<?, ?B/s]

train-00006-of-00041.parquet:   0%|          | 0.00/266M [00:00<?, ?B/s]

train-00007-of-00041.parquet:   0%|          | 0.00/228M [00:00<?, ?B/s]

train-00008-of-00041.parquet:   0%|          | 0.00/248M [00:00<?, ?B/s]

train-00009-of-00041.parquet:   0%|          | 0.00/227M [00:00<?, ?B/s]

train-00010-of-00041.parquet:   0%|          | 0.00/234M [00:00<?, ?B/s]

train-00011-of-00041.parquet:   0%|          | 0.00/232M [00:00<?, ?B/s]

train-00012-of-00041.parquet:   0%|          | 0.00/239M [00:00<?, ?B/s]

train-00013-of-00041.parquet:   0%|          | 0.00/241M [00:00<?, ?B/s]

train-00014-of-00041.parquet:   0%|          | 0.00/223M [00:00<?, ?B/s]

train-00015-of-00041.parquet:   0%|          | 0.00/235M [00:00<?, ?B/s]

train-00016-of-00041.parquet:   0%|          | 0.00/503M [00:00<?, ?B/s]

train-00017-of-00041.parquet:   0%|          | 0.00/231M [00:00<?, ?B/s]

train-00018-of-00041.parquet:   0%|          | 0.00/231M [00:00<?, ?B/s]

train-00019-of-00041.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00020-of-00041.parquet:   0%|          | 0.00/225M [00:00<?, ?B/s]

train-00021-of-00041.parquet:   0%|          | 0.00/216M [00:00<?, ?B/s]

train-00022-of-00041.parquet:   0%|          | 0.00/202M [00:00<?, ?B/s]

train-00023-of-00041.parquet:   0%|          | 0.00/213M [00:00<?, ?B/s]

train-00024-of-00041.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

train-00025-of-00041.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

train-00026-of-00041.parquet:   0%|          | 0.00/208M [00:00<?, ?B/s]

train-00027-of-00041.parquet:   0%|          | 0.00/214M [00:00<?, ?B/s]

train-00028-of-00041.parquet:   0%|          | 0.00/188M [00:00<?, ?B/s]

train-00029-of-00041.parquet:   0%|          | 0.00/218M [00:00<?, ?B/s]

train-00030-of-00041.parquet:   0%|          | 0.00/204M [00:00<?, ?B/s]

train-00031-of-00041.parquet:   0%|          | 0.00/215M [00:00<?, ?B/s]

train-00032-of-00041.parquet:   0%|          | 0.00/214M [00:00<?, ?B/s]

train-00033-of-00041.parquet:   0%|          | 0.00/203M [00:00<?, ?B/s]

train-00034-of-00041.parquet:   0%|          | 0.00/219M [00:00<?, ?B/s]

train-00035-of-00041.parquet:   0%|          | 0.00/224M [00:00<?, ?B/s]

train-00036-of-00041.parquet:   0%|          | 0.00/610M [00:00<?, ?B/s]

train-00037-of-00041.parquet:   0%|          | 0.00/674M [00:00<?, ?B/s]

train-00038-of-00041.parquet:   0%|          | 0.00/538M [00:00<?, ?B/s]

train-00039-of-00041.parquet:   0%|          | 0.00/465M [00:00<?, ?B/s]

train-00040-of-00041.parquet:   0%|          | 0.00/422M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

In [16]:
# Hold out 20% of the rows as a test split. The split is seeded so that
# re-running the notebook (for example to resume training from a checkpoint)
# rebuilds the same train/test partition instead of reshuffling the rows,
# which would let former test rows leak into the training set.
dataset_split = dataset.train_test_split(test_size=0.2, seed=seed)
print(dataset_split)

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 500
    })
})


In [17]:
# Sanity check: print the text of the first three training rows, each one
# preceded by a separator line.
preview_batch = dataset_split['train'][:3]
for sample_text in preview_batch["text"]:
    print("=========================", sample_text, sep="\n")

Capricorn (pl. capricorni or capricorns) may refer to:

Places
Capricorn and Bunker Group, islands of the southern Great Barrier Reef, Australia
Capricorn District Municipality, Limpopo province, South Africa

Animals
Capricorn, an animal from the ibex family, particularly the Alpine ibex
Capricornis, a genus of goat-like or antelope-like animals

Astronomy and astrology 
 Capricornus, one of the constellations of the zodiac
 Capricorn (astrology)

Arts, entertainment, and media

Fictional characters
Capricorn (comics), several Marvel Comics characters
Capricorn (Inkworld), Inkheart character

Music

Groups and labels
 Capricorn Records, an American record label active 1969–1979
 Capricorn (ensemble), a British chamber ensemble active 1973–2000

Albums
Capricorn (Jay Chou album), 2008
Capricorn (Trevor Powers album), 2020
Capricorn (Mike Tramp album), 1997
"Capricorn (A Brand New Name)", a 2002 single by 30 Seconds to Mars from their self-titled album

Songs
"Capricorn", a song by IQ f

# Training

In [18]:
# Mixed precision: use bf16 when the GPU supports it, otherwise fall back to fp16.
use_bf16 = is_bf16_supported()

# Optimisation, logging, checkpointing and Hub settings for the run.
training_args = TrainingArguments(
    seed=seed,

    # Batching and schedule (4 sequences x 2 accumulation steps per optimizer step)
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    # max_steps=3, # For debugging

    # Optimizer
    learning_rate=2e-4,
    lr_scheduler_type='cosine',
    warmup_ratio=0.05,
    optim='paged_adamw_8bit', # 'paged_adamw_8bit' | 'adamw_8bit'
    weight_decay=0.00,
    max_grad_norm=0.3,
    bf16=use_bf16,
    fp16=(not use_bf16),

    # Evaluation (currently disabled)
    # eval_strategy='steps',
    # eval_steps=10,

    # Logging: one entry per step, sent to TensorBoard and Weights & Biases
    logging_strategy='steps',
    logging_steps=1,
    # logging_first_step=True,
    report_to=['tensorboard', 'wandb'],

    # Checkpointing: save every 50 steps
    save_strategy='steps',
    save_steps=50,
    # save_steps=1, # For debugging
    save_total_limit=5, # 1 best + 4 recent checkpoints. Warning: It doesn't work

    # With load_best_model_at_end=True, your save_strategy will be ignored and default to eval_strategy.
    # So you will find one checkpoint at the end of each epoch.
    # https://discuss.huggingface.co/t/trainer-not-saving-after-save-steps/5464
    # load_best_model_at_end=True,

    # Output location and Hub upload
    output_dir=project_name,
    hub_model_id=hub_model_id,
    push_to_hub=True,
    hub_strategy='all_checkpoints',
    hub_always_push=True,
)

# Trainer over the training split only; the eval dataset is left commented out.
trainer = UnslothTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset_split['train'],
    # eval_dataset=dataset_split['test'],
    dataset_text_field='text',
    max_seq_length=max_seq_length,
    dataset_num_proc=8,
    args=training_args,
)

Tokenizing to ["text"] (num_proc=8):   0%|          | 0/2000 [00:00<?, ? examples/s]

In [19]:
#@title Show current memory stats
# Number of bytes in one GiB; dividing by it converts byte counts for display.
BYTES_PER_GIB = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
# Peak memory reserved so far, i.e. the footprint before training starts.
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GIB, 3)
# Total memory of device 0.
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
5.746 GB of memory reserved.


In [None]:
# Run training and keep the return value in `trainer_stats`.
# `resume_from_checkpoint` is the flag set in the config cell. Per the
# transformers Trainer documentation, passing True resumes from the most
# recent checkpoint found in the trainer's output_dir.
trainer_stats = trainer.train(resume_from_checkpoint=resume_from_checkpoint)