In [4]:
%%capture
# Enable IPython's autoreload extension in mode 2 so edited modules are
# re-imported before each cell executes (no kernel restart needed).
%load_ext autoreload
%autoreload 2
# NOTE(review): absolute, machine-specific working directory; every relative
# path used later in this notebook resolves against it. Adjust when running elsewhere.
%cd /home/ubuntu/projects/hyper-sloth

In [2]:
%%capture
# Install Unsloth and its dependencies; %%capture hides the pip output.
import os
# Environment detection: look for "COLAB_" anywhere in the concatenated names
# of all environment variables. If it is absent, do a plain install.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # The first and last installs use --no-deps, so pip does not resolve or
    # replace their dependencies (presumably to keep Colab's preinstalled
    # packages intact -- TODO confirm).
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth
# Install latest Hugging Face for Gemma-3!
# NOTE(review): this runs in both branches and pins transformers to a Gemma-3
# tag, while the model loaded later in this notebook is Qwen2.5 -- confirm the
# pin is still wanted.
!pip install --no-deps git+https://github.com/huggingface/transformers@v4.49.0-Gemma-3

In [2]:
import os
# Set before torch is imported so that CUDA only ever sees the selected device.
os.environ['CUDA_VISIBLE_DEVICES'] = '0' # Choose any GPU you want to use
import torch
from unsloth import FastModel


# Reference list of alternative checkpoints. It is informational only: the
# load call below does not read from it.
fourbit_models = [
    # 4bit dynamic quants for superior accuracy and low memory use
    "unsloth/gemma-3-1b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-4b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-27b-it-unsloth-bnb-4bit",

    # Other popular models!
    "unsloth/Llama-3.1-8B",
    "unsloth/Llama-3.2-3B",
    "unsloth/Llama-3.3-70B",
    "unsloth/mistral-7b-instruct-v0.3",
    "unsloth/Phi-4",
] # More models at https://huggingface.co/unsloth

# Load the base model and its tokenizer; both names are reused by later cells.
model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/Qwen2.5-1.5B-Instruct",
    max_seq_length = 16_000, # Choose any for long context!
    load_in_4bit = True,  # 4 bit quantization to reduce memory
    load_in_8bit = False, # [NEW!] A bit more accurate, uses 2x memory
    full_finetuning = False, # [NEW!] We have full finetuning now!
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 03-15 09:47:58 [__init__.py:256] Automatically detected platform cuda.
==((====))==  Unsloth 2025.3.14: Fast Qwen2 patching. Transformers: 4.50.0.dev0. vLLM: 0.7.4.dev473+g9ed6ee92.precompiled.
   \\   /|    NVIDIA GeForce RTX 4090. Num GPUs = 1. Max memory: 23.643 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [34]:
# Disabled: attaching fresh LoRA adapters to the base model (kept for reference).
# model = FastModel.get_peft_model(
#     model,
#     finetune_vision_layers     = False, # Turn off for just text!
#     finetune_language_layers   = True,  # Should leave on!
#     finetune_attention_modules = True,  # Attention good for GRPO
#     finetune_mlp_modules       = True,  # Should leave on always!

#     r = 8,           # Larger = higher accuracy, but might overfit
#     lora_alpha = 8,  # Recommended alpha == r at least
#     lora_dropout = 0,
#     bias = "none",
#     random_state = 3407,
# )
# Load a previously saved LoRA checkpoint from a local directory instead.
# This rebinds `model` and `tokenizer`, replacing the base model loaded in the
# earlier cell, and uses a shorter max_seq_length (2048 rather than 16_000).
# NOTE(review): the checkpoint directory must already exist, i.e. it comes from
# an earlier training run -- confirm this is the intended starting point.
from unsloth import FastModel
model, tokenizer = FastModel.from_pretrained(
    model_name = "outputs/lora/Qwen2.5-1.5B-Instruct-LORA-MATH", # YOUR MODEL YOU USED FOR TRAINING
    max_seq_length = 2048,
    load_in_4bit = True,
)

==((====))==  Unsloth 2025.3.14: Fast Qwen2 patching. Transformers: 4.50.0.dev0. vLLM: 0.7.4.dev473+g9ed6ee92.precompiled.
   \\   /|    NVIDIA GeForce RTX 4090. Num GPUs = 1. Max memory: 23.643 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
def merge_and_save_lora(
    base_model_name: str, lora_path: str, output_path: str = None
) -> None:
    """
    Merge a LoRA adapter into its base model and save the standalone result.

    The adapter is loaded with ``AutoPeftModelForCausalLM``, folded into the
    underlying weights via ``merge_and_unload()`` and written to
    ``output_path`` as safetensors shards of at most 2048MB each. The
    tokenizer of ``base_model_name`` is saved into the same directory so the
    output can be loaded on its own.

    Args:
        base_model_name: Name of the base model on HuggingFace Hub. Only the
            tokenizer is loaded from this name.
        lora_path: Local path to the LoRA adapter weights.
        output_path: Where to save the merged model. When ``None`` (the
            default) it is ``lora_path`` with trailing path separators
            removed, followed by "-merged".

    Returns:
        None. The merged model and tokenizer are written to disk and the
        destination is printed.
    """
    # Imported lazily so that defining the function does not require peft /
    # transformers to be importable.
    from peft import AutoPeftModelForCausalLM
    from transformers import AutoTokenizer

    if output_path is None:
        # Strip trailing separators first: "adapter/" must map to
        # "adapter-merged", not to "adapter/-merged" (a directory nested
        # inside the adapter folder itself).
        adapter_dir = str(lora_path).rstrip("/\\")
        output_path = f"{adapter_dir}-merged"

    # Load the LoRA model
    model = AutoPeftModelForCausalLM.from_pretrained(
        lora_path, device_map="auto", trust_remote_code=True
    ).eval()

    # Merge the LoRA weights with the base model
    merged_model = model.merge_and_unload()

    # Save the merged model
    merged_model.save_pretrained(
        output_path, max_shard_size="2048MB", safe_serialization=True
    )

    # Save the tokenizer from the base model
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    tokenizer.save_pretrained(output_path)

    print(f"Merged model saved to {output_path}")


# Example usage
# output_path is omitted, so the merged model is written to the adapter path
# with "-merged" appended.
merge_and_save_lora(
    base_model_name="Qwen/Qwen2.5-1.5B-Instruct",
    lora_path="outputs/lora/Qwen2.5-1.5B-Instruct-LORA-MATH",
)

('outputs/lora/Qwen2.5-1.5B-Instruct-LORA-MATH-merge/tokenizer_config.json',
 'outputs/lora/Qwen2.5-1.5B-Instruct-LORA-MATH-merge/special_tokens_map.json',
 'outputs/lora/Qwen2.5-1.5B-Instruct-LORA-MATH-merge/vocab.json',
 'outputs/lora/Qwen2.5-1.5B-Instruct-LORA-MATH-merge/merges.txt',
 'outputs/lora/Qwen2.5-1.5B-Instruct-LORA-MATH-merge/added_tokens.json',
 'outputs/lora/Qwen2.5-1.5B-Instruct-LORA-MATH-merge/tokenizer.json')

NameError: name 'AutoTokenizer' is not defined

In [4]:
from pyexpat.errors import messages
from typing import Any
from speedy_utils.all import *
from datasets import load_dataset
from unsloth.chat_templates import standardize_data_formats
from datasets import Dataset


def get_chat_dataset(
    dataset_name: str, split: str = None, num_samples: int=None, tokenizer: Any = None
) -> Any:
    """
    Build a chat dataset from a local JSON file or a named dataset.

    Args:
        dataset_name (str): A path that exists on disk (read with
            ``Dataset.from_json``) or a dataset name passed to ``load_dataset``.
        split (str): Split forwarded to ``load_dataset``; not used for local files.
        num_samples (int): If truthy, shuffle with seed 42 and keep at most
            this many rows.
        tokenizer (Any): If truthy, its ``apply_chat_template`` renders each
            conversation into a new ``"text"`` column.

    Returns:
        Any: The standardized dataset, optionally subsampled and templated.
    """
    # Anything that is not an existing path is treated as a dataset name.
    if not os.path.exists(dataset_name):
        raw = load_dataset(dataset_name, split=split)
    else:
        raw = Dataset.from_json(dataset_name)
    chat_data = standardize_data_formats(raw)

    if num_samples:
        # Never ask for more rows than the dataset holds.
        keep = min(num_samples, len(chat_data))
        chat_data = chat_data.shuffle(seed=42).select(range(keep))

    if not tokenizer:
        return chat_data

    def render_batch(batch):
        # Conversations live under "messages" when that column is present,
        # otherwise under "conversations".
        column = "messages" if "messages" in batch else "conversations"
        rendered = tokenizer.apply_chat_template(batch[column], tokenize=False)
        return {"text": rendered}

    return chat_data.map(render_batch, batched=True)


# Example usage
# If ./data/cod_6k5.json exists it is read as a local JSON file; because a
# tokenizer is passed, a chat-templated "text" column is added.
# Relies on `tokenizer` bound by an earlier model-loading cell.
dataset = get_chat_dataset("./data/cod_6k5.json", tokenizer=tokenizer)

In [5]:



from trl import SFTTrainer, SFTConfig
# Supervised fine-tuning trainer over the "text" column of `dataset`.
# `model`, `tokenizer` and `dataset` come from earlier cells.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    eval_dataset = None, # Can set up evaluation!
    args = SFTConfig(
        dataset_num_proc=4,
        dataset_text_field = "text",
        # 4 sequences per device x 8 accumulation steps = 32 sequences per
        # optimizer step on a single device.
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 8, # Use GA to mimic batch size!
        warmup_steps = 5,
        num_train_epochs = 1, # Set this for 1 full training run.
        # max_steps = 1000,
        learning_rate = 2e-4, # Reduce to 2e-5 for long training runs
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "none", # Use this for WandB etc
    ),
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
# Sanity check: decode the first training example's token ids back to text.
# Assumes the trainer's dataset already has an 'input_ids' column -- TODO confirm.
print(tokenizer.decode(trainer.train_dataset[0]['input_ids']))

In [None]:
from unsloth.chat_templates import train_on_responses_only
# Rebind `trainer` to the wrapped trainer returned by Unsloth's helper, which
# (per its name and docs) restricts the training loss to assistant responses.
# The two strings mark where a user turn and an assistant turn begin.
# NOTE(review): presumably these match the chat template of the loaded Qwen2.5
# tokenizer -- verify against the decoded sample from the previous cell.
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|im_start|>user",
    response_part = "<|im_start|>assistant",
)

In [None]:
# Run training; the object returned by train() is kept in `trainer_stats`.
trainer_stats = trainer.train()

In [11]:
# @title Show current memory stats
# Properties of CUDA device 0 as seen by this process.
gpu_stats = torch.cuda.get_device_properties(0)
# Peak memory reserved by PyTorch's allocator so far, converted bytes -> GiB.
# NOTE(review): the name suggests a pre-training baseline, but this cell sits
# after the training cell; run in order, the figure would include training.
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
# Total device memory, converted bytes -> GiB.
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 4090. Max memory = 23.643 GB.
2.645 GB of memory reserved.


In [None]:
# model.save_pretrained('./outputs/lora/Qwen2.5-1.5B-Instruct-LORA-MATH')
# model.save_pretrained_merged('./outputs/lora/Qwen2.5-1.5B-Instruct-LORA-MATH-merged', tokenizer=tokenizer)

Unsloth: Merging weights into 16bit:   0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

In [None]:
# from llm_utils import get_conversation_one_turn


# item = dataset[0]
# # messages = item["messages"][:-1]
# messages = get_conversation_one_turn(None, 'hoàng sa của nước nào, hãy trả lời bằng tiếng việt')

# text = tokenizer.apply_chat_template(
#     messages,
#     add_generation_prompt = True, # Must add for generation
#     tokenize=False,
# )
# text += '<think>\n'


### vLLM inference