In [None]:
from unsloth import FastLanguageModel
# import torch
from datasets import load_dataset
# from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import GRPOConfig, GRPOTrainer
from vllm import SamplingParams

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 03-21 17:16:56 __init__.py:207] Automatically detected platform cuda.


In [2]:
from IPython.display import Markdown
import html

def escape_markdown_text(text: str) -> str:
    """Escapes special HTML characters and ensures proper line breaks for Markdown rendering."""
    escaped_text = html.escape(text)  # Escape HTML characters
    # return escaped_text
    return escaped_text.replace("\n", "\n\n")  # Ensure Markdown respects newlines


## Load model and data

In [5]:
ds = load_dataset("mlabonne/smoltldr")
print(ds)

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 2000
    })
    validation: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 200
    })
    test: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 200
    })
})


In [3]:
# model_id = "/mnt/d/Eng/MLearning/LLMs_folder/SmolLM-135M-Instruct"
# model_id = "/mnt/d/Eng/MLearning/LLMs_folder/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-1.5B"
model_id = "/mnt/d/Eng/MLearning/LLMs_folder/gemma-3-1b-it"

max_seq_length = 1024  # Can increase for longer reasoning traces
lora_rank = 32  # Larger rank = smarter, but slower

In [None]:
# model = AutoModelForCausalLM.from_pretrained(
#     model_id,
#     torch_dtype="auto",
#     device_map="auto",
#     attn_implementation="flash_attention_2",
# )
# tokenizer = AutoTokenizer.from_pretrained(model_id)

In [14]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_id,
    max_seq_length=max_seq_length,
    load_in_4bit=True,  # False for LoRA 16bit
    fast_inference=True,  # Enable vLLM fast inference
    max_lora_rank=lora_rank,
    gpu_memory_utilization=0.6,  # Reduce if out of memory
)
# model = FastLanguageModel.for_inference(model)

==((====))==  Unsloth 2025.3.17: Fast Gemma3 patching. Transformers: 4.50.0.dev0. vLLM: 0.7.3.
   \\   /|    NVIDIA GeForce RTX 3050 Ti Laptop GPU. Num GPUs = 1. Max memory: 4.0 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = True]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


## Test model before fine-tuning

In [8]:
SYSTEM_PROMPT = """
Answer the next question in the following format:
<reason>
some reasoning here
</reason>
<answer>
some answer here
</answer>
"""
SYSTEM_PROMPT = ""
question = "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?"

prompt = [{"role": "system", "content": SYSTEM_PROMPT},
          {"role": "user", "content": question}
          ]

# prompt = [{"role": "user", "content": f"\n{SYSTEM_PROMPT}\n\n### Question\n{question}"}]

In [15]:
text = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
# model_inputs = tokenizer(text, return_tensors="pt").to(model.device)

sampling_params = SamplingParams(
    temperature=0.8,
    top_p=0.95,
    max_tokens=1024,
)

output_ids = model.fast_generate(text, sampling_params=sampling_params)
# output_text = tokenizer.decode(output_ids[0])

AttributeError: 'Gemma3ForCausalLM' object has no attribute 'fast_generate'

In [None]:
Markdown(escape_markdown_text(msg))

&lt;bos&gt;&lt;start_of_turn&gt;user





Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?&lt;end_of_turn&gt;

&lt;start_of_turn&gt;model



In [21]:
Markdown(escape_markdown_text(output_text))

&lt;bos&gt;&lt;bos&gt;&lt;start_of_turn&gt;user





Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?&lt;end_of_turn&gt;

&lt;start_of_turn&gt;model

Let $x$ be the number of clips Natalia sold in April.

Natalia sold $x$ clips in April.

In May, she sold half as many clips as she sold in April, so she sold $\frac{1}{2}x$ clips in May.

The total number of clips sold in April and May is $x + \frac{1}{2}x = \frac{2}{2}x + \frac{1}{2}x = \frac{3}{2}x$.

We are given that Natalia sold clips to 48 of her friends in April.

So, $x = 48$.

Then the number of clips sold in May is $\frac{1}{2}x = \frac{1}{2}(48) = 24$.

The total number of clips sold in April and May is $48 + 24 = 72$.

Alternatively, let $C$ be the number of clips Natalia sold in

## Load LoRA model

In [None]:


model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],  # Remove QKVO if out of memory
    lora_alpha=lora_rank,
    use_gradient_checkpointing="unsloth",  # Enable long context finetuning
    random_state=3407,
)

print(model.print_trainable_parameters())

trainable params: 4,884,480 || all params: 139,399,488 || trainable%: 3.5039
None


In [7]:
# Reward function
ideal_length = 50


def reward_len(completions, **kwargs):
    return [-abs(ideal_length - len(completion)) for completion in completions]

In [16]:
# Training arguments
training_args = GRPOConfig(
    output_dir="GRPO",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    max_prompt_length=512,
    max_completion_length=96,
    num_generations=8,
    optim="adamw_8bit",
    num_train_epochs=1,
    bf16=True,
    report_to=[],
    remove_unused_columns=False,
    logging_steps=1,
    push_to_hub=False
)

In [22]:
# Trainer
trainer = GRPOTrainer(
    model=model,
    reward_funcs=[reward_len],
    args=training_args,
    train_dataset=ds["train"],
)

# Train model
trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


KeyboardInterrupt: 

In [23]:
msg = tokenizer.apply_chat_template([{'role': 'user', 'content': 'Hello, how are you?'}], tokenize=False)
model_inputs = tokenizer(msg, return_tensors="pt").to(model.device)

output_ids = model.generate(**model_inputs, num_beams=5, num_return_sequences=2)
tokenizer.decode(output_ids[0])

'<|im_start|>user\nHello, how are you?<|im_end|>\n<|im_start|>assistant\nHello! How can I help you today?<|im_end|>'

In [24]:
output_ids

tensor([[    1,  4093,   198, 19556,    28,   638,   359,   346,    47,     2,
           198,     1,   520,  9531,   198, 19556,    17,  1073,   416,   339,
           724,   346,  1834,    47,     2],
        [    1,  4093,   198, 19556,    28,   638,   359,   346,    47,     2,
           198,     1,   520,  9531,   198, 19556,    17,  1073,   359,   346,
          1834,    47,     2,     2,     2]], device='cuda:0')

"<|im_start|>user\nHello, how are you?<|im_end|>\n<|im_start|>assistant\nI'm [Your Name], a software engineer at [Your Company]. It's been a pleasure to meet you. I'm excited to share my insights on the latest trends in AI"