In [4]:
%%capture
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell executes, so edits to project code are picked up without a restart.
%load_ext autoreload
%autoreload 2
# NOTE(review): machine-specific absolute path — adjust to your own checkout.
%cd /home/ubuntu/projects/hyper-sloth

In [2]:
%%capture
import os
# Colab is detected by searching for the substring "COLAB_" in the concatenated
# names of all environment variables.
if "COLAB_" not in "".join(os.environ.keys()):
    # Not on Colab: a plain install of unsloth (pip resolves its dependencies).
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth
# Install latest Hugging Face for Gemma-3!
# NOTE(review): this runs on every environment (it is outside the if/else), and
# the notebook loads a Qwen2.5 checkpoint below — confirm this pin is still wanted.
!pip install --no-deps git+https://github.com/huggingface/transformers@v4.49.0-Gemma-3

In [2]:
import os
# Restrict this process to a single GPU. This is set before torch is imported
# so the restriction is already in place when CUDA gets initialised.
os.environ['CUDA_VISIBLE_DEVICES'] = '0' # Choose any GPU you want to use
import torch
from unsloth import FastModel


# Reference list of checkpoint names only: it is not passed to the
# from_pretrained call below, which uses its own hard-coded model_name.
fourbit_models = [
    # 4bit dynamic quants for superior accuracy and low memory use
    "unsloth/gemma-3-1b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-4b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-27b-it-unsloth-bnb-4bit",

    # Other popular models!
    "unsloth/Llama-3.1-8B",
    "unsloth/Llama-3.2-3B",
    "unsloth/Llama-3.3-70B",
    "unsloth/mistral-7b-instruct-v0.3",
    "unsloth/Phi-4",
] # More models at https://huggingface.co/unsloth

# Load the base model together with its tokenizer; both names are reused by the
# LoRA, dataset, trainer and save cells further down.
model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/Qwen2.5-1.5B-Instruct",
    max_seq_length = 16_000, # Choose any for long context!
    load_in_4bit = True,  # 4 bit quantization to reduce memory
    load_in_8bit = False, # [NEW!] A bit more accurate, uses 2x memory
    full_finetuning = False, # [NEW!] We have full finetuning now!
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 03-15 09:10:39 [__init__.py:256] Automatically detected platform cuda.
==((====))==  Unsloth 2025.3.14: Fast Qwen2 patching. Transformers: 4.50.0.dev0. vLLM: 0.7.4.dev473+g9ed6ee92.precompiled.
   \\   /|    NVIDIA GeForce RTX 4090. Num GPUs = 1. Max memory: 23.643 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
# Wrap the loaded model for LoRA fine-tuning. The name `model` is rebound to
# the value returned by get_peft_model, so re-running this cell without
# re-running the loading cell passes the already wrapped model in again.
model = FastModel.get_peft_model(
    model,
    finetune_vision_layers     = False, # Turn off for just text!
    finetune_language_layers   = True,  # Should leave on!
    finetune_attention_modules = True,  # Also adapt the attention modules
    finetune_mlp_modules       = True,  # Should leave on always!

    r = 8,           # Larger = higher accuracy, but might overfit
    lora_alpha = 8,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
)

In [6]:
from pyexpat.errors import messages
from typing import Any
from speedy_utils.all import *
from datasets import load_dataset
from unsloth.chat_templates import standardize_data_formats
from datasets import Dataset


def get_chat_dataset(
    dataset_name: str, split: str = None, num_samples: int=None, tokenizer: Any = None
) -> Any:
    """
    Build a chat dataset, optionally subsampled and rendered to plain text.

    Args:
        dataset_name (str): Path to a local JSON file, or a dataset identifier
            handed to ``load_dataset`` when no such path exists.
        split (str): Split forwarded to ``load_dataset``; not used when
            ``dataset_name`` is an existing local path.
        num_samples (int): When truthy, shuffle with seed 42 and keep at most
            this many rows.
        tokenizer (Any): When truthy, its ``apply_chat_template`` is used to
            add a ``"text"`` field built from each conversation.

    Returns:
        Any: The standardized (and possibly subsampled / templated) dataset.
    """
    # An existing filesystem path wins over a dataset identifier.
    raw = (
        Dataset.from_json(dataset_name)
        if os.path.exists(dataset_name)
        else load_dataset(dataset_name, split=split)
    )
    chat_data = standardize_data_formats(raw)

    if num_samples:
        # Never request more rows than the dataset holds.
        keep = min(num_samples, len(chat_data))
        chat_data = chat_data.shuffle(seed=42).select(range(keep))

    if not tokenizer:
        return chat_data

    def _render_batch(batch):
        # Conversations live under "messages" if present, else "conversations".
        column = "messages" if "messages" in batch else "conversations"
        rendered = tokenizer.apply_chat_template(batch[column], tokenize=False)
        return {"text": rendered}

    return chat_data.map(_render_batch, batched=True)


# Example usage: build the training set from a JSON file path and render each
# conversation to a "text" field with the tokenizer's chat template. No
# num_samples is given, so the rows are neither shuffled nor truncated.
dataset = get_chat_dataset("./data/cod_6k5.json", tokenizer=tokenizer)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/6197 [00:00<?, ? examples/s]

In [7]:



from trl import SFTTrainer, SFTConfig
# Supervised fine-tuning trainer over the "text" field produced by
# get_chat_dataset. With a per-device batch of 4 and 8 accumulation steps,
# each optimizer step covers 4 * 8 = 32 sequences per device.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    eval_dataset = None, # Can set up evaluation!
    args = SFTConfig(
        dataset_num_proc=4,
        dataset_text_field = "text",
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 8, # Use GA to mimic batch size!
        warmup_steps = 5,
        num_train_epochs = 1, # Set this for 1 full training run.
        # max_steps = 1000,
        learning_rate = 2e-4, # Reduce to 2e-5 for long training runs
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "none", # Use this for WandB etc
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/6197 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [8]:
# Sanity check: decode the first tokenized training example back to text to
# confirm the chat template was applied as expected.
print(tokenizer.decode(trainer.train_dataset[0]['input_ids']))

<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
For any positive integer $a$, let $\tau(a)$ be the number of positive divisors of $a$. Find, with proof, the largest possible value of $4\tau(n)-n$ over all positive integers $n$.<|im_end|>
<|im_start|>assistant
<think>
Find max(4τ(n) - n). τ(n) divisors of n.

Explore small n: pattern or intuition.

n=1 to 32: compute 4τ(n) - n.

Max observed: 12 at n=12.

n=24: 4τ(24) - 24 = 8, which < 12.

n=36: τ(36)=9, so 4*9-36=0.

n=48: τ(48)=10, 4*10 - 48 = -8.

n=60: τ(60)=12, 4*12 - 60 = -12.

n=12 gives 12 (max).

Check numbers between 12 and 60.

Check n=24, 36, 48, etc., or revisit.

n=16: 4τ(16) - 16 = 4, worse.

Check larger numbers.

n=60: 4τ(60) - 60 = -12.

n=120: 4τ(120) - 120= -56.

n=28: 4τ(28)-28=-4.

Numbers with many prime factors?

n=24=2³*3, τ(n) = 8, result is 8.

n=2ᵏ: decreasing results.

n=3*2ᵏ: similar pattern.

n=30=2*3*5: τ(n)=8, result is 2.

n=60=2²*3*5: 

In [9]:
from unsloth.chat_templates import train_on_responses_only
# Rebind `trainer` to the version returned by train_on_responses_only. The two
# markers are the user / assistant turn headers of the chat format shown in the
# decoded example above.
# NOTE(review): presumably this limits the loss to assistant turns — confirm
# against the Unsloth documentation.
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|im_start|>user",
    response_part = "<|im_start|>assistant",
)

Map (num_proc=96):   0%|          | 0/6197 [00:00<?, ? examples/s]

In [10]:
# Run the fine-tuning job; the value returned by train() is kept in
# `trainer_stats` for later inspection.
trainer_stats = trainer.train()

In [11]:
# @title Show current memory stats
# Report the properties of device 0 and the peak memory reserved so far by
# torch's CUDA allocator, both converted from bytes to GB (2**30 bytes).
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 2**30, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 2**30, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 4090. Max memory = 23.643 GB.
2.645 GB of memory reserved.


In [12]:
# Persist the fine-tuned model under ./outputs/lora.
# NOTE(review): if `model` is the PEFT-wrapped model from get_peft_model, this
# presumably writes only the LoRA adapter weights — confirm.
model.save_pretrained('./outputs/lora/Qwen2.5-1.5B-Instruct-LORA-MATH')

In [23]:
# from llm_utils import get_conversation_one_turn


# item = dataset[0]
# # messages = item["messages"][:-1]
# messages = get_conversation_one_turn(None, 'hoàng sa của nước nào, hãy trả lời bằng tiếng việt')

# text = tokenizer.apply_chat_template(
#     messages,
#     add_generation_prompt = True, # Must add for generation
#     tokenize=False,
# )
# text += '<think>\n'


In [24]:


# from transformers import TextStreamer
# with torch.inference_mode():
#     _ = model.generate(
#         **tokenizer([text], return_tensors = "pt").to("cuda"),
#         max_new_tokens = 6400, # Increase for longer outputs!
#         # Recommended Gemma-3 settings!
#         temperature = 1.0, top_p = 0.95, top_k = 64,
#         streamer = TextStreamer(tokenizer, skip_prompt = False),
#     )

In [25]:
# Export a merged checkpoint to "mymodel", passing the tokenizer along. The
# captured output below reports the weights being merged into 16bit.
model.save_pretrained_merged("mymodel", tokenizer)

Downloading safetensors index for unsloth/qwen2.5-14b-instruct...


model.safetensors.index.json:   0%|          | 0.00/47.5k [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:   0%|          | 0/6 [00:00<?, ?it/s]

model-00001-of-00006.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  17%|█▋        | 1/6 [00:57<04:48, 57.71s/it]

model-00002-of-00006.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  33%|███▎      | 2/6 [01:57<03:55, 58.76s/it]

model-00003-of-00006.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  50%|█████     | 3/6 [02:51<02:50, 56.77s/it]

model-00004-of-00006.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  67%|██████▋   | 4/6 [03:47<01:52, 56.47s/it]

model-00005-of-00006.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  83%|████████▎ | 5/6 [04:44<00:56, 56.69s/it]

model-00006-of-00006.safetensors:   0%|          | 0.00/4.73G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit: 100%|██████████| 6/6 [05:38<00:00, 56.36s/it]


### vLLM Inference