# 🇲🇾 Malaya LLM Pro: FULL Training (Phase 3)

**This is the FULL overnight training run!**

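- **Dataset**: 99,602 Malaysian instruction pairs (with the 3,000-step cap and an effective batch size of 8 used below, one run trains on roughly 24,000 of them)
- **Training Time**: ~5-8 hours in total (training is capped at 3,000 steps, about 4-5 hours; the GGUF export adds roughly 30-45 minutes)
- **Expected Improvement**: 82% → 95%+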

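⚠️ Keep your laptop open with the screen dimmed. Do NOT close the lid: if the laptop goes to sleep, the browser can lose its connection to Colab and the run may be interrupted.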

In [None]:
%%capture
# The %%capture cell magic above (it must stay on the first line) hides this cell's output.
# Install Unsloth with its "colab-new" extra directly from the GitHub repository.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# Install the companion libraries without resolving their own dependencies (--no-deps);
# xformers and trl are held below the versions listed in the specifiers.
!pip install --no-deps "xformers<0.0.26" "trl<0.8.0" peft accelerate bitsandbytes

In [None]:
from unsloth import FastLanguageModel
import torch

# Loader settings. max_seq_length is reused later when the trainer is built.
# NOTE(review): dtype=None presumably lets Unsloth choose the dtype itself - confirm in its docs.
max_seq_length, dtype, load_in_4bit = 2048, None, True

# Load the pre-quantized base model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    # Qwen 2.5 7B was chosen so that training is feasible on Free Colab.
    model_name="unsloth/Qwen2.5-7B-Instruct-bnb-4bit",
    # model_name="unsloth/Qwen2.5-14B-Instruct-bnb-4bit",  # <-- Requires A100 or paid tier
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
print("✅ Model loaded! Using Qwen 2.5 7B (Lightweight Version)")

In [None]:
from datasets import load_dataset, concatenate_datasets

# Fetch the three Mesolitica instruction files one after another, reporting
# each file's row count as soon as it has been loaded.
print("📥 Loading Malaysian datasets...")

source_repo = "mesolitica/chatgpt-malay-instructions"
source_files = (
    "synthetic-malaysian-general-qa.jsonl",
    "synthetic-malaysian-general-qa-v2.jsonl",
    "synthetic-alpaca_data_cleaned.jsonl",
)

parts = []
for file_number, file_name in enumerate(source_files, start=1):
    part = load_dataset(source_repo, data_files=file_name, split="train")
    print(f"   File {file_number}: {len(part):,} rows")
    parts.append(part)

# Keep the individual splits available under their original names.
ds1, ds2, ds3 = parts

# Stack the three splits into the single training set used by later cells.
dataset = concatenate_datasets(parts)
print(f"\n✅ Total: {len(dataset):,} Malaysian examples loaded!")

In [None]:
# Format into Alpaca-style prompts
# The template has two positional "{}" slots: the first receives the
# instruction, the second the expected response.
alpaca_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Response:
{}"""

# End-of-sequence marker taken from the loaded tokenizer; the formatting
# function appends it to every rendered training example.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Render a batch of instruction/output pairs into training strings.

    Args:
        examples: Batch mapping with parallel "instruction" and "output"
            sequences.

    Returns:
        A dict with one key, "text", holding one string per pair: the
        filled-in ``alpaca_prompt`` followed by ``EOS_TOKEN``.
    """
    pairs = zip(examples["instruction"], examples["output"])
    rendered = [alpaca_prompt.format(question, answer) + EOS_TOKEN
                for question, answer in pairs]
    return {"text": rendered}


# Apply the formatter batch-wise; the result gains a "text" column.
dataset = dataset.map(formatting_prompts_func, batched=True)
print("✅ Dataset formatted!")

In [None]:
# Mount Google Drive at /content/drive; the checkpoint folder below lives inside it.
from google.colab import drive
drive.mount('/content/drive')

# Checkpoint directory in Drive
# The same path is later used as the trainer's output_dir and scanned for
# existing "checkpoint-*" folders to resume from.
checkpoint_dir = "/content/drive/MyDrive/Malaya_LLM_Checkpoints"
import os
# exist_ok=True makes re-running this cell harmless when the folder already exists.
os.makedirs(checkpoint_dir, exist_ok=True)

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Attach LoRA adapters to the loaded base model through Unsloth; `model` is
# rebound to the value returned by get_peft_model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,  # presumably the LoRA rank - confirm against the Unsloth/PEFT docs
    # Module names that receive adapters.
    # NOTE(review): these look like the attention and MLP projections - confirm for this architecture.
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,  # same value as the TrainingArguments seed used below
    use_rslora = False,
    loftq_config = None,
)

# Check for existing checkpoints to resume
import os
import re


def _find_last_checkpoint(root):
    """Return the newest resumable checkpoint folder under *root*, or None.

    Only entries named exactly ``checkpoint-<digits>`` are considered, so
    stray names such as ``checkpoint-tmp`` or ``checkpoint-100-backup`` can
    no longer crash the step-number parsing. An entry must also contain
    ``trainer_state.json``; a folder left half-written by an interrupted
    session is skipped in favour of the previous complete one.

    Args:
        root: Directory that holds the ``checkpoint-*`` folders.

    Returns:
        Path of the complete checkpoint with the highest step number, or
        None when *root* is missing or holds no usable checkpoint.
    """
    if not os.path.isdir(root):
        return None

    step_pattern = re.compile(r"checkpoint-(\d+)")
    candidates = []
    for entry in os.listdir(root):
        match = step_pattern.fullmatch(entry)
        if match is None:
            continue
        path = os.path.join(root, entry)
        # The state file only exists inside a real checkpoint directory, so
        # this check also filters out plain files with a matching name.
        if os.path.isfile(os.path.join(path, "trainer_state.json")):
            candidates.append((int(match.group(1)), path))

    if not candidates:
        return None
    # Highest step number wins.
    return max(candidates, key=lambda item: item[0])[1]


last_checkpoint = _find_last_checkpoint(checkpoint_dir)
if last_checkpoint:
    print(f"🔄 Found checkpoint: {last_checkpoint}. Will resume training!")

# FULL TRAINING - With Steps Limit for Feasibility
# Builds the supervised fine-tuning trainer over the "text" column of the
# formatted dataset. Checkpoints go to checkpoint_dir so an interrupted run
# can be resumed.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,  # 2 x 4 = 8 sequences per optimizer step (per device)
        warmup_steps = 100,
        num_train_epochs = 1,  # has no effect while max_steps is positive (max_steps takes precedence)
        max_steps = 3000,      # <-- CAPPED AT 3000 STEPS (Approx 4-5 hours)
        learning_rate = 2e-4,
        # Use bf16 when the GPU supports it, otherwise fall back to fp16.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 50,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 3407,
        output_dir = checkpoint_dir,
        save_steps = 100,      # Save often to avoid data loss
        save_total_limit = 2,  # Keep only last 2 checkpoints
        # Turn off every logging integration (wandb included) right here.
        # Setting WANDB_DISABLED only after the trainer has been created
        # may be too late to avoid the interactive wandb prompt.
        report_to = "none",
    ),
)

print("✅ Trainer ready! Optimized for Free Colab (3000 Steps Limit).")

In [None]:
# Ask the Hugging Face integration to leave Weights & Biases switched off so
# the run does not stop at a login prompt.
# NOTE(review): the trainer already exists when this runs, so confirm the
# variable is set early enough. If wandb still asks, type 3 and press Enter.
import os
os.environ["WANDB_DISABLED"] = "true"

print("🚀 Starting training...")
# last_checkpoint comes from the trainer-setup cell: a checkpoint path when
# one was found, otherwise None.
if last_checkpoint:
    print(f"▶️ Resuming from {last_checkpoint}...")
# Passing None is the same as calling train() with no arguments (fresh run).
trainer_stats = trainer.train(resume_from_checkpoint=last_checkpoint or None)

# Completion banner, one item per output line.
print("\n" + "=" * 50, "🎉 TRAINING COMPLETE!", "=" * 50, sep="\n")

In [None]:
# Write the trained model out as a GGUF file (q4_k_m quantization) into the
# "malaya_pro_full" folder, for use with Ollama.
print("📦 Exporting to GGUF (this takes ~30-45 mins)...")
model.save_pretrained_gguf(
    "malaya_pro_full",
    tokenizer,
    quantization_method="q4_k_m",
)

# Completion banner, one item per output line.
print(
    "\n" + "=" * 50,
    "✅ MODEL EXPORTED!",
    "📁 Download the 'malaya_pro_full' folder from the Files panel",
    "=" * 50,
    sep="\n",
)

In [None]:
# [OPTIONAL] Push to Hugging Face Hub (Run this after training)
# 1. Get your token from https://huggingface.co/settings/tokens
# 2. Un-comment the lines below and run

# from huggingface_hub import login
# login()
# model.push_to_hub_gguf(
#     "YOUR_USERNAME/malaya-llm-7b-instruct-v1", # <--- Change this to your repo name
#     tokenizer,
#     quantization_method = "q4_k_m"
# )
# print("✅ Model Pushed to Hugging Face!")