In [1]:
## Install vllm for fast inference and unsloth for optimized models
!pip install -qqq unsloth vllm
!pip install -qqq --upgrade pillow

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.2/46.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m192.5/192.5 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.3/265.3 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.9/97.9 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.6/71.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.0/111.0 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.6/87.6 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m766.7/766.7 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("HuggingFace")

!huggingface-cli login --token {secret_value_0}

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
The token `kaggle` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `kaggle`


In [3]:
from unsloth import FastLanguageModel
from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# from vllm import SamplingParams
# from peft import LoraConfig, get_peft_model

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 03-24 17:07:10 __init__.py:207] Automatically detected platform cuda.


In [4]:
# model_id = "/mnt/d/Eng/MLearning/LLMs_folder/SmolLM-135M-Instruct"
# model_id = "/mnt/d/Eng/MLearning/LLMs_folder/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-1.5B"
model_id = "/mnt/d/Eng/MLearning/LLMs_folder/gemma-3-1b-it"
# model_id = "meta-llama/Llama-3.2-1B-Instruct"
# model_id = "google/gemma-3-1b-it"

max_seq_length = 1024  # Can increase for longer reasoning traces
# use high ranks (16, 32, 64) for small models (<1B) and small ranks (8, 16) for large models (>1B).
# Also take memory resource into your considrations.
lora_r = 8
lora_dropout = 0
seed = 3407

## Data Preperation

### Prompt Format

In [5]:
# Define the system prompt that instructs the model to use a specific format
reasoning_start = "<reasoning>"
reasoning_end   = "</reasoning>"
solution_start = "<answer>"
solution_end = "</answer>"

SYSTEM_PROMPT = \
f"""You are given a problem.
Think about the problem and provide your working out.
Place it between {reasoning_start} and {reasoning_end}.
Then, provide your solution between {solution_start}{solution_end}"""

### Helpers

In [6]:
import html
import re
from datasets import load_dataset, Dataset

# For GSM8K dataset, We notice all answers like about have a ####, so we extract it
def extract_hash_answer(text: str) -> str | None:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()


# Function to prepare the GSM8K dataset
def get_gsm8k_questions(system_prompt:str) -> Dataset:
    dataset = load_dataset("openai/gsm8k", "main", split="train")
    dataset = dataset.map(
        lambda x: {
            "prompt": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": x["question"]},
            ],
            "answer": extract_hash_answer(x["answer"]),
        }
    )
    return dataset

In [7]:
dataset = get_gsm8k_questions(system_prompt=SYSTEM_PROMPT)
dataset

Dataset({
    features: ['question', 'answer', 'prompt'],
    num_rows: 7473
})

## Load LLM

In [8]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_id,
    max_seq_length=max_seq_length,
    load_in_4bit=True,  # Enables memory-efficient training
    fast_inference=True,  # Enable vLLM fast inference
    # max_lora_rank=lora_rank,
    # gpu_memory_utilization=0.6,  # Reduce if out of memory
)

# model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to('cuda:0')
# tokenizer = AutoTokenizer.from_pretrained(model_id)

==((====))==  Unsloth 2025.3.17: Fast Gemma3 patching. Transformers: 4.50.0.dev0. vLLM: 0.7.3.
   \\   /|    NVIDIA GeForce RTX 3050 Ti Laptop GPU. Num GPUs = 1. Max memory: 4.0 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = True]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


## Test model before fine-tuning

In [10]:
question = dataset[2]["question"]

prompt = [{"role": "system", "content": SYSTEM_PROMPT},
          {"role": "user", "content": question}
          ]

# prompt = [{"role": "user", "content": f"\n{SYSTEM_PROMPT}\n\n### Question\n{question}"}]

text = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
print(text)

<bos><start_of_turn>user
You are given a problem.
Think about the problem and provide your working out.
Place it between <reasoning> and </reasoning>.
Then, provide your solution between <answer></answer>

Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?<end_of_turn>
<start_of_turn>model



In [None]:
model_inputs = tokenizer(text, return_tensors="pt").to(model.device)
output_ids = model.generate(**model_inputs)
output_text = tokenizer.decode(output_ids[0])
print(output_text)

## Setup LoRA model

In [11]:
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_r,  # Adjustable, 16 is a good starting point
    lora_alpha=lora_r,  # Should be at least r
    lora_dropout=lora_dropout,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],  # Covers attention & MLP layers
    use_gradient_checkpointing="unsloth",  # Memory-efficient fine-tuning
    bias="none",  # No additional trainable bias
    random_state=seed,
)

print(model.print_trainable_parameters())

# from peft import LoraConfig, get_peft_model

# # Define LoRA configuration
# lora_config = LoraConfig(
#     r=lora_r,  # Adjustable, 16 is a good starting point
#     lora_alpha=lora_r,  # Should be at least r
#     lora_dropout=lora_dropout,
#     target_modules=[
#         "q_proj", "k_proj", "v_proj", "o_proj",
#         "gate_proj", "up_proj", "down_proj",
#     ],  # Covers attention & MLP layers
#     bias="none",  # No additional trainable bias
#     task_type="CAUSAL_LM",  # Define the task type
# )

# # Apply LoRA to the model
# model = get_peft_model(model, lora_config)

# # Print trainable parameters
# model.print_trainable_parameters()


Unsloth: Making `model.base_model.model.model` require gradients
trainable params: 6,522,880 || all params: 1,006,408,832 || trainable%: 0.6481
None


## GRPO fine-tuning and Reward functions

In [12]:
# Helper functions to extract answers from different formats
def extract_xml_answer(text: str) -> str:
    answer = text.split(solution_start)[-1]
    answer = answer.split(solution_end)[0]
    return answer.strip()


# Reward function that checks if the answer is correct
def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    responses = [completion[0]["content"] for completion in completions]
    q = prompts[0][-1]["content"]
    extracted_responses = [extract_xml_answer(r) for r in responses]
    print(
        "-" * 50,
        "\n",
        f"Question:\n{q}",
        f"\nAnswer:\n{answer[0]}",
        f"\nResponse:\n{responses[0]}",
        f"\nExtracted:\n{extracted_responses[0]}",
    )
    return [2.0 if r == a else 0.0 for r, a in zip(extracted_responses, answer)]


# Reward function that checks if the answer is an integer
def int_reward_func(completions, **kwargs) -> list[float]:
    responses = [completion[0]["content"] for completion in completions]
    extracted_responses = [extract_xml_answer(r) for r in responses]
    return [0.5 if r.isdigit() else 0.0 for r in extracted_responses]


# Reward function that checks if the completion follows the strict format
def strict_format_reward_func(completions, **kwargs) -> list[float]:
    pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n$"
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, r) for r in responses]
    return [0.5 if match else 0.0 for match in matches]


# Reward function that checks if the completion follows a more relaxed format
def soft_format_reward_func(completions, **kwargs) -> list[float]:
    pattern = r"^{}\n.*?\n{}\n{}\n.*?\n{}\n$".format(reasoning_start, reasoning_end, solution_start, solution_end)
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, r) for r in responses]
    return [0.5 if match else 0.0 for match in matches]


# Reward function that counts XML tags and penalizes extra content
def count_xml(text) -> float:
    count = 0.0
    if text.count(f"{reasoning_start}\n") == 1:
        count += 0.125
    if text.count(f"\n{reasoning_end}\n") == 1:
        count += 0.125
    if text.count(f"\n{solution_start}\n") == 1:
        count += 0.125
        count -= len(text.split(f"\n{solution_end}\n")[-1]) * 0.001
    if text.count(f"\n{solution_end}") == 1:
        count += 0.125
        count -= (len(text.split(f"\n{solution_end}")[-1]) - 1) * 0.001
    return count


def xmlcount_reward_func(completions, **kwargs) -> list[float]:
    contents = [completion[0]["content"] for completion in completions]
    return [count_xml(c) for c in contents]

In [13]:
max_prompt_length = 256

training_args = GRPOConfig(
    learning_rate=1e-5,
    adam_beta1=0.9,
    adam_beta2=0.99,
    weight_decay=0.01,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    optim="paged_adamw_8bit",
    logging_steps=10,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,  # Increase to 4 for smoother training
    num_generations=4,  # Decrease if out of memory
    max_prompt_length=max_prompt_length,
    max_completion_length=max_seq_length - max_prompt_length,
    max_steps=250,
    save_steps=250,
    max_grad_norm=0.1,
    report_to="none",  # Can use Weights & Biases
    output_dir="outputs",
    fp16=True,
    # bf16=True,
    # use_vllm=True,
    # vllm_dtype='float16',
    # vllm_device='cuda:1'
)

trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=[
        xmlcount_reward_func,
        soft_format_reward_func,
        strict_format_reward_func,
        int_reward_func,
        correctness_reward_func,
    ],
    args=training_args,
    train_dataset=dataset,
)

Unsloth: Switching to float32 training since model cannot work with float16


In [14]:
# Train model
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 7,473 | Num Epochs = 1 | Total steps = 250
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 1 x 1) = 8
 "-____-"     Trainable parameters = 6,522,880/1,000,000,000 (0.65% trained)
`generation_config` default values have been modified to match model-specific defaults: {'max_length': 32768, 'bos_token_id': 2, 'eos_token_id': [1, 106]}. If this is not desired, please set these values explicitly.


-------------------------------------------------- 
 Question:
A concert ticket costs $40. Mr. Benson bought 12 tickets and received a 5% discount for every ticket bought that exceeds 10. How much did Mr. Benson pay in all? 
Answer:
476 
Response:
<reasoning>
Mr. Benson bought 12 tickets, and the cost of each ticket is $40. So the total cost without the discount is 12 * $40 = $480.
He received a 5% discount for every ticket he bought that exceeds 10. This means he gets a 5% discount on the $10 threshold ($10 * 0.05 = $0.50).
The number of tickets exceeding the $10 threshold is 12 - 10 = 2 tickets.
The discount amount for these 2 tickets is 2 * $0.50 = $1.00.
Therefore, the final cost is $480 - $1.00 = $479.
Alternatively, we can calculate the discount amount as 5% of the ticket price exceeding $10, which is 5% of $10 = $0.50.
Then the total cost is $480 - 0.50 = $479.

</reasoning>
<answer>479</answer> 
Extracted:
479


Step,Training Loss,reward,reward_std,completion_length,kl,rewards / xmlcount_reward_func,rewards / soft_format_reward_func,rewards / strict_format_reward_func,rewards / int_reward_func,rewards / correctness_reward_func
10,0.0,0.426975,0.382187,542.6875,7e-06,0.076975,0.0,0.0,0.125,0.225
20,0.0,0.600713,0.598822,429.8,9e-05,0.075712,0.0,0.0,0.175,0.35
30,0.0,0.541762,0.626154,479.825,0.000811,0.079262,0.0,0.0,0.1375,0.325
40,0.0,0.659462,0.613508,478.825,0.001019,0.128213,0.0,0.0,0.15625,0.375
50,0.0001,0.780537,0.630474,328.8625,0.00273,0.093037,0.0,0.0,0.2375,0.45
60,0.0001,1.157813,0.503472,384.0,0.00372,0.132813,0.0,0.0,0.275,0.75
70,0.0003,0.556975,0.45301,458.1875,0.008483,0.138225,0.0,0.0,0.19375,0.225
80,0.0002,0.9677,0.407926,489.6,0.0056,0.1802,0.0,0.0,0.2375,0.55
90,0.0004,1.080588,0.605391,394.525,0.009522,0.193088,0.0,0.0,0.2625,0.625
100,0.0003,1.19625,0.440797,447.5,0.006381,0.19,0.0,0.0,0.28125,0.725


Unsloth: Will smartly offload gradients to save VRAM!
-------------------------------------------------- 
 Question:
Janet pays $40/hour for 3 hours per week of clarinet lessons and $28/hour for 5 hours a week of piano lessons. How much more does she spend on piano lessons than clarinet lessons in a year? 
Answer:
1040 
Response:
<reasoning>
Let’s calculate Janet’s weekly spending on each activity.
Clarinet lessons: 3 hours/week * $40/hour = $120/week
Piano lessons: 5 hours/week * $28/hour = $140/week
Total weekly spending on clarinet and piano: $120 + $140 = $260/week
Now, we need to calculate her annual spending on each activity.
Annual clarinet spending: $260/week * 52 weeks = $13,200
Annual piano spending: $140/week * 52 weeks = $7040
The difference between annual piano spending and annual clarinet spending is: $7040 - $13200 = -$6160
However, we want to find how much more she spends on piano lessons than clarinet lessons in a year.
So, we want to find the difference between the an

TrainOutput(global_step=250, training_loss=0.0005469313734211028, metrics={'train_runtime': 29571.4783, 'train_samples_per_second': 0.068, 'train_steps_per_second': 0.008, 'total_flos': 0.0, 'train_loss': 0.0005469313734211028})

## Save LoRA adapters

In [15]:
model.save_pretrained("saved_lora_adapter")
!zip -r ./saved_lora_adapter.zip ./saved_lora_adapter
# model.save_lora("grpo_saved_lora")

  adding: saved_lora_adapter/ (stored 0%)
  adding: saved_lora_adapter/README.md (deflated 66%)
  adding: saved_lora_adapter/adapter_config.json (deflated 56%)
  adding: saved_lora_adapter/adapter_model.safetensors (deflated 7%)


## Test LLM after fine-tuning

In [16]:
model_test, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_id,
    max_seq_length=max_seq_length,
    load_in_4bit=True,  # Enables memory-efficient training
    fast_inference=True,  # Enable vLLM fast inference
    # gpu_memory_utilization=0.6,  # Reduce if out of memory
)

# model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to('cuda:0')
# tokenizer = AutoTokenizer.from_pretrained(model_id)

==((====))==  Unsloth 2025.3.18: Fast Gemma3 patching. Transformers: 4.50.0. vLLM: 0.8.1.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.


In [None]:
from peft import PeftModel

# Load the saved LoRA adapter into the base model
model_test = PeftModel.from_pretrained(model_test, "./outputs/checkpoint-250")

In [18]:
question = dataset[0]["question"]

prompt = [{"role": "system", "content": SYSTEM_PROMPT},
          {"role": "user", "content": question}
          ]

# prompt = [{"role": "user", "content": f"\n{SYSTEM_PROMPT}\n\n### Question\n{question}"}]

text = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
print(text)

<bos><start_of_turn>user
You are given a problem.
Think about the problem and provide your working out.
Place it between <reasoning> and </reasoning>.
Then, provide your solution between <answer></answer>

Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?<end_of_turn>
<start_of_turn>model



In [19]:
model_inputs = tokenizer(text, return_tensors="pt").to(model.device)
output_ids = model_test.generate(**model_inputs)
output_text = tokenizer.decode(output_ids[0])
print(output_text)

<bos><bos><start_of_turn>user
You are given a problem.
Think about the problem and provide your working out.
Place it between <reasoning> and </reasoning>.
Then, provide your solution between <answer></answer>

Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?<end_of_turn>
<start_of_turn>model
<answer>576</answer>
<end_of_turn>
