In [1]:
## Install vllm for fast inference and unsloth for optimized models
!pip install -qqq unsloth vllm
!pip install -qqq --upgrade pillow

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.2/46.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m192.5/192.5 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.3/265.3 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.9/97.9 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.6/71.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.0/111.0 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.6/87.6 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m766.7/766.7 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K  

In [2]:
from unsloth import FastLanguageModel
from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 03-24 13:11:21 [__init__.py:256] Automatically detected platform cuda.


In [3]:
model_id = "google/gemma-3-1b-it"

max_seq_length = 1024  # Can increase for longer reasoning traces
# use high ranks (16, 32, 64) for small models (<1B) and small ranks (8, 16) for large models (>1B).
# Also take memory resource into your considrations.
lora_r = 8
lora_dropout = 0
seed = 3407

In [4]:
# Define the system prompt that instructs the model to use a specific format
reasoning_start = "<reasoning>"
reasoning_end   = "</reasoning>"
solution_start = "<answer>"
solution_end = "</answer>"

SYSTEM_PROMPT = \
f"""You are given a problem.
Think about the problem and provide your working out.
Place it between {reasoning_start} and {reasoning_end}.
Then, provide your solution between {solution_start}{solution_end}"""

In [5]:
import html
import re
from datasets import load_dataset, Dataset

# For GSM8K dataset, We notice all answers like about have a ####, so we extract it
def extract_hash_answer(text: str) -> str | None:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()


# Function to prepare the GSM8K dataset
def get_gsm8k_questions(system_prompt:str) -> Dataset:
    dataset = load_dataset("openai/gsm8k", "main", split="train")
    dataset = dataset.map(
        lambda x: {
            "prompt": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": x["question"]},
            ],
            "answer": extract_hash_answer(x["answer"]),
        }
    )
    return dataset

In [6]:
dataset = get_gsm8k_questions(system_prompt=SYSTEM_PROMPT)
dataset

README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'answer', 'prompt'],
    num_rows: 7473
})

In [29]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_id,
    max_seq_length=max_seq_length,
    load_in_4bit=True,  # Enables memory-efficient training
    fast_inference=True,  # Enable vLLM fast inference
    # gpu_memory_utilization=0.6,  # Reduce if out of memory
)

==((====))==  Unsloth 2025.3.18: Fast Gemma3 patching. Transformers: 4.50.0. vLLM: 0.8.1.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.


In [30]:
i = 2
question = dataset[i]["question"]
# question = "Janet pays $40/hour for 3 hours per week of clarinet lessons and $28/hour for 5 hours a week of piano lessons. How much more does she spend on piano lessons than clarinet lessons in a year?"

prompt = [{"role": "system", "content": SYSTEM_PROMPT},
          {"role": "user", "content": question}
          ]

# prompt = [{"role": "user", "content": f"\n{SYSTEM_PROMPT}\n\n### Question\n{question}"}]

text = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
print(text)
print(f"Answer: {dataset[i]['answer']}")

<bos><start_of_turn>user
You are given a problem.
Think about the problem and provide your working out.
Place it between <reasoning> and </reasoning>.
Then, provide your solution between <answer></answer>

Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?<end_of_turn>
<start_of_turn>model

Answer: 5


In [31]:
model_inputs = tokenizer(text, return_tensors="pt").to(model.device)
output_ids = model.generate(**model_inputs)
output_text = tokenizer.decode(output_ids[0])
print(output_text)

<bos><bos><start_of_turn>user
You are given a problem.
Think about the problem and provide your working out.
Place it between <reasoning> and </reasoning>.
Then, provide your solution between <answer></answer>

Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?<end_of_turn>
<start_of_turn>model
<answer>100.0</answer><end_of_turn>


In [12]:
from peft import PeftModel

# Load the saved LoRA adapter into the base model
model = PeftModel.from_pretrained(model, "/kaggle/input/finetuning-any-llm/saved_lora_adapter")

In [28]:
i = 2
question = dataset[i]["question"]
# question = "Janet pays $40/hour for 3 hours per week of clarinet lessons and $28/hour for 5 hours a week of piano lessons. How much more does she spend on piano lessons than clarinet lessons in a year?"

prompt = [{"role": "system", "content": SYSTEM_PROMPT},
          {"role": "user", "content": question}
          ]

# prompt = [{"role": "user", "content": f"\n{SYSTEM_PROMPT}\n\n### Question\n{question}"}]

text = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
print(text)
print(f"Answer: {dataset[i]['answer']}")

<bos><start_of_turn>user
You are given a problem.
Think about the problem and provide your working out.
Place it between <reasoning> and </reasoning>.
Then, provide your solution between <answer></answer>

Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?<end_of_turn>
<start_of_turn>model

Answer: 5


In [23]:
model_inputs = tokenizer(text, return_tensors="pt").to(model.device)
output_ids = model.generate(**model_inputs)
output_text = tokenizer.decode(output_ids[0])
print(output_text)

<bos><bos><start_of_turn>user
You are given a problem.
Think about the problem and provide your working out.
Place it between <reasoning> and </reasoning>.
Then, provide your solution between <answer></answer>

Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?<end_of_turn>
<start_of_turn>model
<reasoning>
Let the cost of the wallet be $100.
Betty has half of the money she needs, so she has $\frac{1}{2} \times 100 = 50$.
Her parents gave her $15.
Her grandparents gave her twice as much as her parents, so grandparents gave her $2 \times 15 = 30$.
The total amount of money her grandparents gave her is $30$.
So, the total amount of money Betty has is $50 + 15 + 30 = 95$.
The amount of money she needs is $100 - 95 = 5$.
Therefore, Betty needs $5 more to buy the wallet.
</reasoning>
<