In [1]:
# Kill every process that currently has an NVIDIA device file open, so the GPU
# is released before this notebook loads the model.
# `fuser -k` kills the processes it lists; `-v` prints them.
!fuser -v /dev/nvidia* -k

# Libraries

In [2]:
%%capture
import os
if 'COLAB_' not in ''.join(os.environ.keys()):
    %pip install unsloth
else:
    # Do this only in Colab notebooks and Kaggle notebooks!
    %pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
    %pip install --no-deps cut_cross_entropy unsloth_zoo
    %pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    %pip install --no-deps unsloth

In [3]:
import torch
from datetime import datetime
from datasets import load_dataset
from unsloth import FastLanguageModel, UnslothTrainer, UnslothTrainingArguments, is_bf16_supported
from trl import SFTTrainer
from transformers import TrainingArguments

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


    PyTorch 2.5.1+cu121 with CUDA 1201 (you have 2.6.0+cu124)
    Python  3.11.11 (you have 3.11.11)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


🦥 Unsloth Zoo will now patch everything to make training faster!


# Config

In [4]:
# Project configs
seed = 69
lang = 'id' # 'en' | 'id'
task = 'wikipedia' # 'wikipedia' | 'gsm8k'

# Data configs
max_data_length = 2500
max_seq_length = 1024
test_size = 0.2 # Fraction held out for testing: 2500 * 0.2 = 500 rows

# The Hub dataset ID and data directory are derived from `task` / `lang` so the
# three settings cannot silently disagree.
if task == 'wikipedia':
    hf_data_id = 'wikimedia/wikipedia'
    hf_data_dir = f'20231101.{lang}' # '20231101.en' | '20231101.id'
elif task == 'gsm8k':
    hf_data_id = 'openai/gsm8k'
    hf_data_dir = 'main'
else:
    raise ValueError(f"Unsupported task {task!r}; expected 'wikipedia' or 'gsm8k'.")
hf_data_split = f'train[:{max_data_length}]'

# Model configs
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# LoRA configs
lora_target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']
lora_r = 8
lora_alpha = 16

# Run identity: either resume an existing Hub run or start a fresh, timestamped one.
resume_from_checkpoint = False
if resume_from_checkpoint:
    hub_model_id = '' # Hub repo ID of the run to resume
    if not hub_model_id:
        # Fail here with a clear message instead of inside snapshot_download.
        raise ValueError(
            "resume_from_checkpoint is True but hub_model_id is empty; "
            "set it to the Hub repo ID of the run to resume."
        )
    project_name = hub_model_id.split('/')[-1]
    model_name = project_name

    # Download the Hub repo into a local directory named after the project.
    # The same name is later used as both `model_name` and the trainer's `output_dir`.
    from huggingface_hub import snapshot_download
    snapshot_download(repo_id=hub_model_id, local_dir=model_name)
else:
    model_name = 'unsloth/Meta-Llama-3.1-8B'
    # The timestamp suffix gives each fresh run its own project name.
    project_name = f'L3.1-8B-{task}-{lang}-LoRA-v{datetime.now().strftime("%Y%m%d%H%M%S")}'
    hub_model_id = f'alxxtexxr/{project_name}'
print("Resume from checkpoint:", resume_from_checkpoint)
print("Project name:", project_name)
print("Hub model ID:", hub_model_id)

Resume from checkpoint: False
Project name: L3.1-8B-wikipedia-id-LoRA-v20250401105723
Hub model ID: alxxtexxr/L3.1-8B-wikipedia-id-LoRA-v20250401105723


# Model

In [5]:
# Load the base model and its tokenizer with the settings from the config cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name, max_seq_length=max_seq_length,
    load_in_4bit=load_in_4bit, dtype=dtype,
)

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.50.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [6]:
# Attach LoRA adapters to the loaded model. `model` is rebound to the wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    # Adapter shape and placement (values come from the config cell)
    r=lora_r,
    lora_alpha=lora_alpha,
    target_modules=lora_target_modules,
    # Per the original notes: any value is supported, but these are the optimized ones
    lora_dropout=0,
    bias='none',
    # Gradient checkpointing is off; True or 'unsloth' is suggested for very long context
    use_gradient_checkpointing=False,
    use_rslora=False,
    loftq_config=None,
    random_state=seed,
)

Unsloth 2025.3.19 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


# Data

In [7]:
# EOS string taken from the tokenizer; the formatter functions read it as a global.
eos_token = tokenizer.eos_token
# Load the configured slice of the Hub dataset.
dataset = load_dataset(
    hf_data_id,
    data_dir=hf_data_dir,
    split=hf_data_split,
)

def format_gsm8k_prompts(examples):
    """Build the training text for a batch of GSM8K rows.

    Args:
        examples: Batch mapping with parallel 'question' and 'answer' lists.

    Returns:
        dict with one key, 'text': one prompt string per row, each ending
        with the module-level `eos_token`.
    """
    # The trailing space after "Question:" and "Answer:" is part of the prompt
    # format. It is written with explicit "\n" escapes so the spaces stay
    # visible and survive editors that strip trailing whitespace.
    prompt_template = (
        "### Instruction:\n"
        "Solve the following math problem step by step.\n"
        "\n"
        "### Question: \n"
        "{question}\n"
        "\n"
        "### Answer: \n"
        "{answer}"
    )
    # Format first and append EOS afterwards, so the EOS string is never parsed
    # as part of the format spec (an EOS containing '{' or '}' would break it).
    return {
        'text': [
            prompt_template.format(question=question, answer=answer) + eos_token
            for question, answer in zip(examples['question'], examples['answer'])
        ]
    }

def format_prompts(examples):
    """Append the module-level `eos_token` to every entry of a batch's 'text' list.

    Args:
        examples: Batch mapping that contains a 'text' list of strings.

    Returns:
        dict with one key, 'text', holding the EOS-terminated strings in order.
    """
    texts_with_eos = []
    for text in examples['text']:
        texts_with_eos.append(text + eos_token)
    return {'text': texts_with_eos}

# Apply the formatter for the configured task: GSM8K gets the instruction
# prompt, anything else gets raw text plus EOS. Both run in batched mode.
dataset = dataset.map(
    format_gsm8k_prompts if task == 'gsm8k' else format_prompts,
    batched=True,
)

README.md:   0%|          | 0.00/131k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/267M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/146M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/170M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

In [8]:
# Split into train/test. Passing the project `seed` makes the split identical
# on every run; without it a re-run (for example when resuming from a
# checkpoint) would train on a different subset and could mix former test
# rows into training.
dataset_split = dataset.train_test_split(test_size=test_size, seed=seed)
print(dataset_split)

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 500
    })
})


In [9]:
# Sanity check: print the first three formatted training texts, each preceded
# by a separator line.
preview_texts = dataset_split['train'][:3]["text"]
for text in preview_texts:
    print("================================================================", text, sep="\n")

Martha Christina Tiahahu () adalah seorang gadis dari desa Abubu,Nusalaut, Maluku Tengah. Pada usia 17 tahun, ia ikut mengangkat senjata melawan tentara Belanda. Ayahnya adalah Kapitan Paulus Tiahahu, seorang kapitan dari negeri Abubu yang membantu Thomas Matulessy dalam Perang Pattimura pada 1817.

M.C. Tiahahu merupakan seorang pejuang kemerdekaan. Ketika ikut dalam pertempuran melawan tentara Belanda saat Perang Pattimura (1817), ia masih remaja. Keberaniannya terkenal di kalangan pejuang, masyarakat luas, dan bahkan musuh-musuhnya.

Sejak awal perjuangan, ia selalu ikut mengambil bagian dan pantang mundur. Dengan rambut panjangnya yang terurai ke belakang serta berikat kepala sehelai kain berang (merah), ia setia mendampingi ayahnya dalam setiap pertempuran, baik di Pulau Nusalaut maupun di Pulau Saparua. Siang dan malam ia selalu hadir dan ikut dalam pembuatan kubu-kubu pertahanan. Ia juga membangkitkan semangat kaum wanita di sekitarnya agar ikut membantu kaum pria di setiap meda

# Training

In [10]:
# Pick the mixed-precision mode once: bf16 when supported, otherwise fp16.
use_bf16 = is_bf16_supported()

training_args = TrainingArguments(
    seed=seed,

    # Optimization
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    # max_steps=3, # For debugging
    learning_rate=2e-4,
    lr_scheduler_type='cosine',
    warmup_ratio=0.05,
    optim='paged_adamw_8bit', # 'paged_adamw_8bit' | 'adamw_8bit'
    weight_decay=0.0,
    max_grad_norm=0.3,
    bf16=use_bf16,
    fp16=(not use_bf16),

    # Evaluation (disabled)
    # eval_strategy='steps',
    # eval_steps=10,

    # Logging: every step, to both TensorBoard and Weights & Biases
    logging_strategy='steps',
    logging_steps=1,
    # logging_first_step=True,
    report_to=['tensorboard', 'wandb'],

    # Checkpointing
    save_strategy='steps',
    save_steps=100,
    # save_steps=1, # For debugging
    # Intended as 1 best + 4 recent checkpoints; the author's note warns that
    # this limit was observed not to work.
    save_total_limit=5,

    # With load_best_model_at_end=True, save_strategy is ignored and defaults
    # to eval_strategy, so one checkpoint appears at the end of each epoch.
    # https://discuss.huggingface.co/t/trainer-not-saving-after-save-steps/5464
    # load_best_model_at_end=True,

    # Output directory and Hugging Face Hub upload
    output_dir=project_name,
    push_to_hub=True,
    hub_model_id=hub_model_id,
    hub_strategy='all_checkpoints',
    hub_always_push=True,
)

trainer = UnslothTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset_split['train'],
    # eval_dataset=dataset_split['test'],
    dataset_text_field='text',
    max_seq_length=max_seq_length,
    dataset_num_proc=8,
    args=training_args,
)

Unsloth: Tokenizing ["text"] (num_proc=8):   0%|          | 0/2000 [00:00<?, ? examples/s]

In [11]:
#@title Show current memory stats
# Report the GPU model, its total memory, and the peak memory reserved so far.
# Sizes are converted from bytes to GiB (1024 ** 3 bytes) and rounded to three
# decimals; the printed label says "GB".
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024 ** 3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
print(
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
    sep="\n",
)

GPU = Tesla T4. Max memory = 14.741 GB.
5.67 GB of memory reserved.


In [12]:
# Start training
# `resume_from_checkpoint` is the flag set in the config cell; it is forwarded
# unchanged to the trainer. The returned value is kept in `trainer_stats`.
trainer_stats = trainer.train(resume_from_checkpoint=resume_from_checkpoint)

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,000 | Num Epochs = 3 | Total steps = 750
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 2 x 1) = 8
 "-____-"     Trainable parameters = 20,971,520/8,000,000,000 (0.26% trained)
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33malimtegar[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,0.7247
2,0.8245
3,0.901
4,0.8658
5,0.6851
6,1.043
7,1.0726
8,0.9088
9,0.5635
10,0.3265


: 