# RL GRPO Finetuning with SmolLM2-135M for Math Reasoning

In [1]:
%%capture
!pip install -q "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

from torch import __version__ as torch_version
from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(torch_version) < V("2.4.0") else "xformers"

!pip install -q --no-deps {xformers} trl peft accelerate bitsandbytes datasets wandb huggingface_hub sentencepiece

In [2]:
import os, re, random, numpy as np, torch
from getpass import getpass
from huggingface_hub import login
import wandb

# reproducibility
random.seed(42); np.random.seed(42); torch.manual_seed(42)
if torch.cuda.is_available(): torch.cuda.manual_seed_all(42)

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

Device: cuda


# Authenticate to Hugging Face and Weights & Biases (W&B)

In [3]:
hf_token = getpass("üîë HF token (press Enter to skip): ").strip()
if hf_token:
    login(hf_token)
else:
    print("HF login skipped.")

wb_token = getpass("üîë W&B token (press Enter to skip): ").strip()
if wb_token:
    wandb.login(key=wb_token)
    run = wandb.init(project="GRPO_SmolLM2_GSM8K", job_type="training", anonymous="allow")
else:
    os.environ["WANDB_DISABLED"] = "true"
    print("W&B disabled.")

üîë HF token (press Enter to skip): ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑
üîë W&B token (press Enter to skip): ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33maditya_rajpurohit[0m ([33maditya_rajpurohit-san-jose-state-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


# Load GSM8K (problems + gold answers) and build prompts

In [4]:
from datasets import load_dataset

raw = load_dataset("gsm8k", "main")

train_src = raw["train"].select(range(2000))
eval_src  = raw["test"].select(range(200))

def extract_gold(answer_text: str):
    m = re.search(r"####\s*([^\n]+)", answer_text)
    return m.group(1).strip() if m else ""

def to_grpo(batch):
    prompts, golds = [], []
    for q, a in zip(batch["question"], batch["answer"]):
        prompt = (
            "You are a careful math assistant. Solve the problem step by step, "
            "then clearly state the final answer on a separate line as:\n"
            "'Final Answer: <number or expression>'.\n\n"
            f"Problem:\n{q}\n\nSolution:\n"
        )
        gold = extract_gold(a)
        if gold:
            prompts.append(prompt)
            golds.append(gold)
    return {"prompt": prompts, "gold": golds}

train = train_src.map(to_grpo, batched=True, remove_columns=train_src.column_names)
valid = eval_src.map(to_grpo,  batched=True, remove_columns=eval_src.column_names)

print("Train size:", len(train), "| Valid size:", len(valid))
print("Sample prompt:\n", train[0]["prompt"][:400])
print("Gold:", train[0]["gold"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

main/train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

main/test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Train size: 2000 | Valid size: 200
Sample prompt:
 You are a careful math assistant. Solve the problem step by step, then clearly state the final answer on a separate line as:
'Final Answer: <number or expression>'.

Problem:
Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?

Solution:

Gold: 72


# Load model (4-bit) + patch GRPO + chat template

In [5]:
from unsloth import FastLanguageModel, is_bfloat16_supported, PatchFastRL
from unsloth.chat_templates import get_chat_template

PatchFastRL("GRPO", FastLanguageModel)

max_seq_length = 1024
dtype, load_in_4bit = None, True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name     = "unsloth/SmolLM2-135M-Instruct",
    max_seq_length = max_seq_length,
    dtype          = dtype,
    load_in_4bit   = load_in_4bit,
    token          = hf_token if hf_token else None,
)

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "chatml",
    mapping = {"role":"from", "content":"value", "user":"human", "assistant":"gpt"},
    map_eos_token = True,
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("‚úÖ Model & tokenizer ready.")

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.11.3: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/423 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Unsloth: Will map <|im_end|> to EOS = <|im_end|>.


‚úÖ Model & tokenizer ready.


# Add LoRA adapters

In [6]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    lora_alpha = 16,
    lora_dropout = 0.05,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
)
print("‚úÖ LoRA adapters attached.")

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.11.3 patched 30 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


‚úÖ LoRA adapters attached.


# Reward function

In [7]:
def extract_final_answer_from_text(text: str):
    m = re.search(r"(?i)final\s*answer\s*:\s*([^\n]+)", text)
    if m:
        return m.group(1).strip()
    nums = re.findall(r"[-+]?\d+(?:/\d+)?(?:\.\d+)?", text)
    return nums[-1].strip() if nums else text.strip()[-50:]

def reward_fn(prompts, completions, completion_ids=None, rewards=None, gold=None, golds=None, **kwargs):
    gold_list = None
    for cand in (rewards, gold, golds, kwargs.get("labels"), kwargs.get("targets")):
        if cand is not None:
            gold_list = cand
            break
    if gold_list is None:
        gold_list = [""] * len(completions)

    out = []
    for comp, g in zip(completions, gold_list):
        pred = extract_final_answer_from_text(comp)
        norm_pred = pred.replace(",", "").strip()
        norm_gold = str(g).replace(",", "").strip()
        r = 1.0 if norm_pred == norm_gold else 0.0
        penalty = 0.1 if len(comp) > 1500 else 0.0
        out.append(max(0.0, r - penalty))
    return out

print("‚úÖ Reward function ready.")

‚úÖ Reward function ready.


# GRPOConfig

In [8]:
from trl import GRPOConfig

report_to = "wandb" if os.environ.get("WANDB_DISABLED","false") != "true" else "none"

grpo_cfg = GRPOConfig(
    output_dir                  = "smollm2-grpo-gsm8k",
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 8,
    learning_rate               = 1e-5,
    num_train_epochs            = 1,
    max_steps                   = 50,
    warmup_ratio                = 0.05,
    logging_steps               = 10,
    save_strategy               = "epoch",
    bf16                        = (torch.cuda.is_available() and torch.cuda.is_bf16_supported()),
    fp16                        = (torch.cuda.is_available() and not torch.cuda.is_bf16_supported()),
    report_to                   = report_to,

    # GRPO sampling
    num_generations             = 6,
    max_prompt_length           = 512,
    max_completion_length       = 384,
    temperature                 = 0.7,
    top_p                       = 0.9,
)
print("‚úÖ GRPOConfig ready.")

Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 2 to the `num_generations` of 6
‚úÖ GRPOConfig ready.


# Initialize GRPOTrainer

In [9]:
from trl import GRPOTrainer

def make_prompt(ex):
    return ex["prompt"]

generate_kwargs = {"temperature": 0.7, "top_p": 0.9, "do_sample": True}

try:
    trainer = GRPOTrainer(
        model            = model,
        processing_class = tokenizer,
        args             = grpo_cfg,
        train_dataset    = train,
        eval_dataset     = valid,
        reward_funcs     = [reward_fn],
        reward_func_names= ["em_exact"],
        reward_weights   = [1.0],
        reward_column    = "gold",
        prompt_func      = make_prompt,
        generate_kwargs  = generate_kwargs,
    )
except TypeError:
    trainer = GRPOTrainer(
        model            = model,
        processing_class = tokenizer,
        args             = grpo_cfg,
        train_dataset    = train,
        eval_dataset     = valid,
        reward_func      = reward_fn,
        reward_column    = "gold",
        prompt_func      = make_prompt,
        generate_kwargs  = generate_kwargs,
    )

    print("‚úÖ GRPOTrainer initialized.")

# Train the Model

In [10]:
if torch.cuda.is_available():
    gpu = torch.cuda.get_device_properties(0)
    print(f"GPU: {gpu.name} | VRAM: {round(gpu.total_memory/1e9,2)} GB")

train_stats = trainer.train()

The model is already on multiple devices. Skipping the move to device specified in `args`.


GPU: Tesla T4 | VRAM: 15.83 GB


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,000 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 6 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (6 x 8 x 1) = 48
 "-____-"     Trainable parameters = 4,884,480 of 139,400,064 (3.50% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,reward,reward_std,completions / mean_length,completions / min_length,completions / max_length,completions / clipped_ratio,completions / mean_terminated_length,completions / min_terminated_length,completions / max_terminated_length,sampling / sampling_logp_difference / mean,sampling / sampling_logp_difference / max,sampling / importance_sampling_ratio / min,sampling / importance_sampling_ratio / mean,sampling / importance_sampling_ratio / max,kl,rewards / reward_fn / mean,rewards / reward_fn / std
10,0.0,0.00625,0.015309,317.947925,54.8,384.0,0.595833,220.705414,54.8,362.2,0,0,0,0,0,1.4e-05,0.00625,0.043301
20,0.0,0.00625,0.015309,331.604172,64.9,384.0,0.654167,232.782462,64.9,365.5,No Log,No Log,No Log,No Log,No Log,1.5e-05,0.00625,0.043301
30,0.0,0.005625,0.006162,332.043759,57.5,384.0,0.691667,214.397693,57.5,357.6,No Log,No Log,No Log,No Log,No Log,1.5e-05,0.005625,0.022016
40,0.0,0.010417,0.025516,325.727094,62.7,384.0,0.622917,230.26069,62.7,372.7,No Log,No Log,No Log,No Log,No Log,1.6e-05,0.010417,0.063495
50,0.0,0.0,0.0,320.31459,60.9,384.0,0.635417,209.579317,60.9,354.0,No Log,No Log,No Log,No Log,No Log,1.6e-05,0.0,0.0


# Runtime Statistics

In [11]:
mins = round(train_stats.metrics.get("train_runtime", 0)/60, 2)
peak = f"{round(torch.cuda.max_memory_reserved()/1e9,3)} GB" if torch.cuda.is_available() else "CPU"
print(f"‚è± Runtime: {mins} minutes | üíæ Peak reserved GPU memory: {peak}")

‚è± Runtime: 36.9 minutes | üíæ Peak reserved GPU memory: 2.194 GB


# Inference Test

In [12]:
from unsloth import FastLanguageModel

FastLanguageModel.for_inference(model)
gen_dtype = torch.bfloat16 if is_bfloat16_supported() else (torch.float16 if torch.cuda.is_available() else torch.float32)
_ = model.to(device=device, dtype=gen_dtype)

def ask(problem, max_new_tokens=256):
    prompt = (
        "You are a careful math assistant. Solve the problem step by step, "
        "then clearly state the final answer on a separate line as:\n"
        "'Final Answer: <number or expression>'.\n\n"
        f"Problem:\n{problem}\n\nSolution:\n"
    )
    tok = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.inference_mode():
        out = model.generate(
            **tok, max_new_tokens=max_new_tokens,
            do_sample=True, temperature=0.7, top_p=0.9, use_cache=True,
        )
    print(tokenizer.decode(out[0], skip_special_tokens=True))

# tests
ask("Solve for x: 2x + 5 = 19.")
ask("A rectangle has length 12 and width 7. What is its area?")

You are a careful math assistant. Solve the problem step by step, then clearly state the final answer on a separate line as:
'Final Answer: <number or expression>'.

Problem:
Solve for x: 2x + 5 = 19.

Solution:
First, let's solve the equation.
2x + 5 = 19
2(x) + 5 = 19
2(x) + 5 = 29
2(x) + 5 = 29
2(x) + 5 = 29
2(x) + 5 = 29
2(x) + 5 = 29
2(x) + 5 = 29
2(x) + 5 = 29
2(x) + 5 = 29
2(x) + 5 = 29
2(x) + 5 = 29
2(x) + 5 = 29
2(x) + 5 = 29
2(x) + 5 = 29
2(x) + 5 = 29
2(x) + 5 = 29
2(x) + 5 = 29
2(x) + 5 = 29
2(x) + 5 = 29
2(x) + 5 = 
You are a careful math assistant. Solve the problem step by step, then clearly state the final answer on a separate line as:
'Final Answer: <number or expression>'.

Problem:
A rectangle has length 12 and width 7. What is its area?

Solution:
The area of a rectangle is given by the formula: area = length * width.
The length of the rectangle is 12 and the width is 7.
The area is given by the formula: area = length * width = 12 * 7 = 80.

Final Answer: 80 square 

# Save the Model

In [None]:
save_dir = "SmolLM2-135M-GRPO-GSM8K"
repo_id  = "username/SmolLM2-135M-GRPO-GSM8K"

model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
print("‚úÖ Saved to:", save_dir)

model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)
print("‚úÖ Uploaded:", repo_id)

‚úÖ Saved to: SmolLM2-135M-GRPO-GSM8K


README.md:   0%|          | 0.00/588 [00:00<?, ?B/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...adapter_model.safetensors:   5%|5         |  527kB / 9.82MB            

Saved model to https://huggingface.co/aditya-rajpurohit/SmolLM2-135M-GRPO-GSM8K
‚úÖ Uploaded: aditya-rajpurohit/SmolLM2-135M-GRPO-GSM8K
