In [1]:
# Configuration
MODEL = "unsloth/Qwen2.5-1.5B"
MAX_SEQ_LENGTH = 2048
LORA_RANK = 8
EVAL_N = 512  # Number of examples to evaluate on
SFT_MODEL_TRAIN_SAMPLES = 1024  # Full GSM8K train split
RL_MODEL_TRAIN_SAMPLES = 256  # Subset for RL training
BATCH_SIZE = 32  # Batch size for evaluation

# Reasoning and solution tokens
reasoning_start = "<start_working_out>"
reasoning_end = "<end_working_out>"
solution_start = "<SOLUTION>"
solution_end = "</SOLUTION>"

system_prompt = f"""You are given a problem.
Think about the problem and provide your working out.
Place it between {reasoning_start} and {reasoning_end}.
Then, provide your solution between {solution_start}{solution_end}"""

print(f"Configuration:")
print(f"Model: {MODEL}")
print(f"Max sequence length: {MAX_SEQ_LENGTH}")
print(f"LoRA rank: {LORA_RANK}")
print(f"Evaluation samples: {EVAL_N}")
print(f"SFT training samples: {SFT_MODEL_TRAIN_SAMPLES}")
print(f"RL training samples: {RL_MODEL_TRAIN_SAMPLES}")
print(f"Batch size: {BATCH_SIZE}")


Configuration:
Model: unsloth/Qwen2.5-1.5B
Max sequence length: 2048
LoRA rank: 8
Evaluation samples: 512
SFT training samples: 1024
RL training samples: 256
Batch size: 32


In [2]:
# Imports
from unsloth import FastLanguageModel
import torch
from transformers import TextStreamer
from datasets import load_dataset, Dataset
import pandas as pd
import numpy as np
import re
from trl import SFTTrainer, SFTConfig, GRPOConfig, GRPOTrainer
from vllm import SamplingParams
import gc
import os


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
INFO 08-19 21:16:10 [__init__.py:235] Automatically detected platform cuda.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
def load_model(model_name, max_seq_length, lora_rank, load_in_4bit=False):
    """Load and prepare model for training/inference"""
    print(f"Loading model: {model_name}")
    
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = model_name,
        max_seq_length = max_seq_length,
        load_in_4bit = load_in_4bit,
        fast_inference = True,
        max_lora_rank = lora_rank,
        gpu_memory_utilization = 0.7,
    )

    model = FastLanguageModel.get_peft_model(
        model,
        r = lora_rank,
        target_modules = [
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        lora_alpha = lora_rank*2,
        use_gradient_checkpointing = "unsloth",
        random_state = 3407,
    )
    
    # Set up chat template
    chat_template = \
        "{% if messages[0]['role'] == 'system' %}"\
            "{{ messages[0]['content'] + eos_token }}"\
            "{% set loop_messages = messages[1:] %}"\
        "{% else %}"\
            "{{ '{system_prompt}' + eos_token }}"\
            "{% set loop_messages = messages %}"\
        "{% endif %}"\
        "{% for message in loop_messages %}"\
            "{% if message['role'] == 'user' %}"\
                "{{ message['content'] }}"\
            "{% elif message['role'] == 'assistant' %}"\
                "{{ message['content'] + eos_token }}"\
            "{% endif %}"\
        "{% endfor %}"\
        "{% if add_generation_prompt %}{{ '{reasoning_start}' }}"\
        "{% endif %}"

    chat_template = chat_template.replace("'{system_prompt}'", f"'{system_prompt}'").replace("'{reasoning_start}'", f"'{reasoning_start}'")
    tokenizer.chat_template = chat_template
    
    print("Model loaded successfully!")
    return model, tokenizer


In [4]:
def extract_hash_answer(text):
    """Extract answer from #### format"""
    if "####" not in text: 
        return None
    return text.split("####")[1].strip()

def extract_thinking(text):
    """Extract thinking between reasoning tokens"""
    pattern = rf"{re.escape(reasoning_start)}(.*?){re.escape(reasoning_end)}"
    match = re.search(pattern, text, re.DOTALL)
    return match.group(1).strip() if match else None

def extract_solution(text):
    """Extract solution between solution tokens"""
    pattern = rf"{re.escape(solution_start)}(.*?){re.escape(solution_end)}"
    match = re.search(pattern, text, re.DOTALL)
    return match.group(1).strip() if match else None


In [5]:
def evaluate_model(model, tokenizer, dataset, eval_n, output_file, model_path=None):
    """Evaluate model on GSM8K test set using fast_generate with batching"""
    print(f"Evaluating model on {eval_n} examples with batch size {BATCH_SIZE}...")
    
    # Load LoRA adapter if model_path is provided
    lora_adapter = None
    if model_path:
        print(f"Loading LoRA adapter from {model_path}")
        lora_adapter = model.load_lora(model_path)
    
    # Take subset for evaluation
    eval_dataset = dataset.select(range(min(eval_n, len(dataset))))
    
    results = []
    correct_count = 0
    thinking_count = 0
    extracted_answer_count = 0
    
    # Process in batches
    for batch_start in range(0, len(eval_dataset), BATCH_SIZE):
        batch_end = min(batch_start + BATCH_SIZE, len(eval_dataset))
        batch = eval_dataset.select(range(batch_start, batch_end))
        
        print(f"Processing batch {batch_start//BATCH_SIZE + 1}/{(len(eval_dataset)-1)//BATCH_SIZE + 1} (examples {batch_start}-{batch_end-1})")
        
        # Prepare batch prompts
        batch_prompts = []
        batch_questions = []
        batch_labeled_cots = []
        batch_labeled_answers = []
        
        for example in batch:
            question = example["question"]
            labeled_cot = example["answer"]
            labeled_answer = extract_hash_answer(labeled_cot)
            
            messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": question}
            ]
            
            text = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
            )
            
            batch_prompts.append(text)
            batch_questions.append(question)
            batch_labeled_cots.append(labeled_cot)
            batch_labeled_answers.append(labeled_answer)
        
        # Generate responses using fast_generate
        sampling_params = SamplingParams(
            temperature=0.7,
            top_k=50,
            max_tokens=512,
            stop=[tokenizer.eos_token],
        )
        
        outputs = model.fast_generate(
            batch_prompts,
            sampling_params=sampling_params,
            lora_request=lora_adapter,  # Use the LoRA adapter
        )
        
        # Process batch results
        for i, output in enumerate(outputs):
            generated_text = output.outputs[0].text
            
            question = batch_questions[i]
            labeled_cot = batch_labeled_cots[i]
            labeled_answer = batch_labeled_answers[i]
            
            # Extract components
            extracted_thinking = extract_thinking(reasoning_start + generated_text)
            extracted_answer = extract_solution(generated_text)
            
            # Check correctness
            correct = False
            if extracted_answer and labeled_answer:
                try:
                    # Try numerical comparison
                    extracted_num = float(extracted_answer.replace(",", "").strip())
                    labeled_num = float(labeled_answer.replace(",", "").strip())
                    correct = abs(extracted_num - labeled_num) < 1e-6
                except:
                    # Fall back to string comparison
                    correct = extracted_answer.strip() == labeled_answer.strip()
            
            # Update counts
            if correct:
                correct_count += 1
            if extracted_thinking:
                thinking_count += 1
            if extracted_answer:
                extracted_answer_count += 1
                
            results.append({
                "question": question,
                "labeled_cot": labeled_cot,
                "labeled_answer": labeled_answer,
                "generated_text": generated_text,
                "extracted_thinking": extracted_thinking,
                "extracted_answer": extracted_answer,
                "correct": correct
            })
    
    # Save results
    df = pd.DataFrame(results)
    df.to_csv(output_file, index=False)
    
    # Print metrics
    total = len(results)
    accuracy = correct_count / total
    thinking_prop = thinking_count / total
    answer_prop = extracted_answer_count / total
    
    print(f"\nResults saved to {output_file}")
    print(f"Accuracy: {accuracy:.3f} ({correct_count}/{total})")
    print(f"Proportion with thinking: {thinking_prop:.3f} ({thinking_count}/{total})")
    print(f"Proportion with extracted answer: {answer_prop:.3f} ({extracted_answer_count}/{total})")
    
    return accuracy, thinking_prop, answer_prop

In [6]:
def prepare_sft_dataset(dataset, n_samples, tokenizer):
    """Prepare dataset for SFT training"""
    print(f"Preparing SFT dataset with {n_samples} samples...")
    
    # Take subset
    train_data = dataset.select(range(min(n_samples, len(dataset))))
    
    def format_example(example):
        question = example["question"]
        answer_text = example["answer"]
        
        # Extract the numerical answer
        numerical_answer = extract_hash_answer(answer_text)
        if not numerical_answer:
            return None
            
        # Create formatted response with thinking and solution
        # Use the step-by-step solution as thinking
        thinking = answer_text.split("####")[0].strip()
        
        response = f"{reasoning_start}well...{thinking}{reasoning_end}{solution_start}{numerical_answer}{solution_end}"
        
        return {
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": question},
                {"role": "assistant", "content": response}
            ]
        }
    
    # Format examples
    formatted_data = []
    for example in train_data:
        formatted = format_example(example)
        if formatted:
            formatted_data.append(formatted)
    
    # Convert to text format
    for item in formatted_data:
        item["text"] = tokenizer.apply_chat_template(item["messages"], tokenize=False)
    
    dataset_dict = {
        "messages": [item["messages"] for item in formatted_data],
        "text": [item["text"] for item in formatted_data]
    }
    
    return Dataset.from_dict(dataset_dict)


In [7]:
def train_sft_model(model, tokenizer, train_dataset, output_dir="sft_model", learning_rate=2e-4):
    """Train model using SFT"""
    print("Starting SFT training...")
    
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        args=SFTConfig(
            dataset_text_field="text",
            per_device_train_batch_size=4,
            gradient_accumulation_steps=4,
            warmup_steps=10,
            num_train_epochs=1,
            learning_rate=2e-4,
            logging_steps=10,
            optim="adamw_8bit",
            weight_decay=0.01,
            lr_scheduler_type="linear",
            seed=3407,
            report_to="none",
            output_dir=output_dir,
            save_steps=500,
        ),
    )
    
    trainer.train()
    trainer.save_model(output_dir)
    print(f"SFT model saved to {output_dir}")
    
    return model


In [8]:
def prepare_grpo_dataset(dataset, n_samples):
    """Prepare dataset for GRPO training"""
    print(f"Preparing GRPO dataset with {n_samples} samples...")
    
    train_data = dataset.select(range(min(n_samples, len(dataset))))
    
    def format_for_grpo(example):
        question = example["question"]
        answer = extract_hash_answer(example["answer"])
        
        return {
            "prompt": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": question},
            ],
            "answer": answer
        }
    
    formatted_data = []
    for example in train_data:
        formatted = format_for_grpo(example)
        if formatted["answer"]:
            formatted_data.append(formatted)
    
    return Dataset.from_list(formatted_data)


In [9]:
# Reward functions for GRPO
def match_format_exactly(completions, **kwargs):
    solution_end_regex = r"</SOLUTION>[\s]{0,}"
    match_format = re.compile(
        rf"{re.escape(reasoning_end)}.*?{re.escape(solution_start)}(.+?){solution_end_regex}[\s]{{0,}}$",
        flags=re.MULTILINE | re.DOTALL
    )
    
    scores = []
    for completion in completions:
        response = completion[0]["content"]
        score = 3.0 if match_format.search(response) else 0.0
        scores.append(score)
    return scores

def match_format_approximately(completions, **kwargs):
    scores = []
    for completion in completions:
        response = completion[0]["content"]
        score = 0
        score += 0.5 if response.count(reasoning_end) == 1 else -1.0
        score += 0.5 if response.count(solution_start) == 1 else -1.0
        score += 0.5 if response.count(solution_end) == 1 else -1.0
        scores.append(score)
    return scores

def check_answer(prompts, completions, answer, **kwargs):
    solution_end_regex = r"</SOLUTION>[\s]{0,}"
    match_format = re.compile(
        rf"{re.escape(reasoning_end)}.*?{re.escape(solution_start)}(.+?){solution_end_regex}",
        flags=re.MULTILINE | re.DOTALL
    )
    
    responses = [completion[0]["content"] for completion in completions]
    extracted_responses = [
        match.group(1).strip() if (match := match_format.search(r)) else None
        for r in responses
    ]
    
    scores = []
    for guess, true_answer in zip(extracted_responses, answer):
        if guess is None:
            scores.append(-2.0)
            continue
            
        if guess == true_answer:
            score = 5.0
        elif guess.strip() == true_answer.strip():
            score = 3.5
        else:
            try:
                ratio = float(guess) / float(true_answer)
                if 0.9 <= ratio <= 1.1:
                    score = 2.0
                elif 0.8 <= ratio <= 1.2:
                    score = 1.5
                else:
                    score = -2.5
            except:
                score = -4.5
        scores.append(score)
    return scores


In [19]:
def train_grpo_model(model, tokenizer, train_dataset, output_dir="grpo_model"):
    """Train model using GRPO"""
    print("Starting GRPO training...")
    
    # Calculate max prompt length
    tokenized = train_dataset.map(
        lambda x: {"tokens": tokenizer.apply_chat_template(x["prompt"], add_generation_prompt=True, tokenize=True)},
        batched=True,
    )
    max_prompt_length = int(np.quantile([len(tokens) for tokens in tokenized["tokens"]], 0.9)) + 1
    max_completion_length = MAX_SEQ_LENGTH - max_prompt_length
    
    print(f"Max prompt length: {max_prompt_length}")
    print(f"Max completion length: {max_completion_length}")
    
    vllm_sampling_params = SamplingParams(
        min_p=0.1,
        top_p=1.0,
        top_k=-1,
        seed=3407,
        stop=[tokenizer.eos_token],
        include_stop_str_in_output=True,
    )
    
    training_args = GRPOConfig(
        vllm_sampling_params=vllm_sampling_params,
        temperature=1.0,
        learning_rate=5e-6,
        weight_decay=0.01,
        warmup_ratio=0.1,
        lr_scheduler_type="linear",
        optim="adamw_8bit",
        logging_steps=5,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=2,
        num_generations=2,
        max_prompt_length=max_prompt_length,
        max_completion_length=max_completion_length,
        max_steps=200,
        save_steps=100,
        report_to="none",
        output_dir=output_dir,
    )
    
    trainer = GRPOTrainer(
        model=model,
        processing_class=tokenizer,
        reward_funcs=[
            match_format_exactly,
            match_format_approximately,
            check_answer,
        ],
        args=training_args,
        train_dataset=train_dataset,
    )
    
    trainer.train()
    trainer.save_model(output_dir)
    print(f"GRPO model saved to {output_dir}")
    
    return model


In [11]:
# Load datasets
print("Loading GSM8K dataset...")
gsm8k_train = load_dataset("openai/gsm8k", "main", split="train")
gsm8k_test = load_dataset("openai/gsm8k", "main", split="test")

print(f"GSM8K train size: {len(gsm8k_train)}")
print(f"GSM8K test size: {len(gsm8k_test)}")


Loading GSM8K dataset...
GSM8K train size: 7473
GSM8K test size: 1319


In [12]:
print("="*50)
print("STEP 1: EVALUATING BASE MODEL")
print("="*50)

base_model, tokenizer = load_model(MODEL, MAX_SEQ_LENGTH, LORA_RANK)


STEP 1: EVALUATING BASE MODEL
Loading model: unsloth/Qwen2.5-1.5B
Unsloth: Patching vLLM v1 graph capture
Unsloth: Patching vLLM v0 graph capture
==((====))==  Unsloth 2025.8.6: Fast Qwen2 patching. Transformers: 4.55.2. vLLM: 0.10.0.
   \\   /|    NVIDIA RTX 4000 Ada Generation. Num GPUs = 1. Max memory: 19.674 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/Qwen2.5-1.5B with actual GPU utilization = 69.29%
Unsloth: Your GPU has CUDA compute capability 8.9 with VRAM = 19.67 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 2048. Num Sequences = 224.
Unsloth: vLLM's KV Cache can use up to 10.74 GB. Also swap space = 6 GB.
Unsloth: Not an error, but `device` is not supported 

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 08-19 16:33:48 [default_loader.py:262] Loading weights took 0.56 seconds
INFO 08-19 16:33:48 [punica_selector.py:19] Using PunicaWrapperGPU.
INFO 08-19 16:33:49 [gpu_model_runner.py:1892] Model loading took 2.9034 GiB and 1.887167 seconds
INFO 08-19 16:34:06 [backends.py:530] Using cache directory: /root/.cache/vllm/torch_compile_cache/7d34cb9709/rank_0_0/backbone for vLLM's torch.compile
INFO 08-19 16:34:06 [backends.py:541] Dynamo bytecode transform time: 16.37 s
INFO 08-19 16:34:14 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 6.260 s
INFO 08-19 16:34:18 [monitor.py:34] torch.compile takes 16.37 s in total
INFO 08-19 16:34:19 [gpu_worker.py:255] Available KV cache memory: 9.48 GiB
INFO 08-19 16:34:20 [kv_cache_utils.py:833] GPU KV cache size: 355,040 tokens
INFO 08-19 16:34:20 [kv_cache_utils.py:837] Maximum concurrency for 2,048 tokens per request: 173.36x
INFO 08-19 16:34:20 [vllm_utils.py:643] Unsloth: Running patched vLLM v1 `c

Capturing CUDA graph shapes: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 59/59 [00:20<00:00,  2.87it/s]

INFO 08-19 16:34:40 [gpu_model_runner.py:2485] Graph capturing finished in 21 secs, took 0.55 GiB
INFO 08-19 16:34:40 [vllm_utils.py:650] Unsloth: Patched vLLM v1 graph capture finished in 21 secs.





INFO 08-19 16:34:41 [core.py:193] init engine (profile, create kv cache, warmup model) took 52.28 seconds
Unsloth: Just some info: will skip parsing ['post_feedforward_layernorm', 'k_norm', 'pre_feedforward_layernorm', 'q_norm']
Unsloth: Just some info: will skip parsing ['post_feedforward_layernorm', 'k_norm', 'pre_feedforward_layernorm', 'q_norm']


Unsloth 2025.8.6 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


Model loaded successfully!


In [13]:

# Evaluate base model
base_accuracy, base_thinking_prop, base_answer_prop = evaluate_model(
    base_model, tokenizer, gsm8k_test, EVAL_N, "base_model_eval.csv"
)

print(f"\nBase model results:")
print(f"Accuracy: {base_accuracy:.3f}")
print(f"Thinking proportion: {base_thinking_prop:.3f}")
print(f"Answer proportion: {base_answer_prop:.3f}")


Evaluating model on 50 examples with batch size 32...
Processing batch 1/2 (examples 0-31)


Adding requests:   0%|          | 0/32 [00:00<?, ?it/s]

Processed prompts:   0%|                                                        | 0/32 [00:00<?, ?it/s, est. s…

Processing batch 2/2 (examples 32-49)


Adding requests:   0%|          | 0/18 [00:00<?, ?it/s]

Processed prompts:   0%|                                                        | 0/18 [00:00<?, ?it/s, est. s…


Results saved to base_model_eval.csv
Accuracy: 0.020 (1/50)
Proportion with thinking: 0.500 (25/50)
Proportion with extracted answer: 0.180 (9/50)

Base model results:
Accuracy: 0.020
Thinking proportion: 0.500
Answer proportion: 0.180


In [14]:
# Step 2: SFT training
print("\n" + "="*50)
print("STEP 2: SFT TRAINING")
print("="*50)

# Prepare SFT dataset
sft_dataset = prepare_sft_dataset(gsm8k_train, SFT_MODEL_TRAIN_SAMPLES, tokenizer)
print(f"SFT dataset size: {len(sft_dataset)}")
sft_dataset[0]['messages'][-1]['content']


STEP 2: SFT TRAINING
Preparing SFT dataset with 1024 samples...
SFT dataset size: 1024


'<start_working_out>well...Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.<end_working_out><SOLUTION>72</SOLUTION>'

In [15]:
# Train SFT model
sft_model = train_sft_model(base_model, tokenizer, sft_dataset, "sft_model", learning_rate=2e-5)


Starting SFT training...


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/1024 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,024 | Num Epochs = 1 | Total steps = 64
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 9,232,384 of 1,552,946,688 (0.59% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,1.0598
20,0.4218
30,0.2559
40,0.2375
50,0.2533
60,0.227


SFT model saved to sft_model


In [16]:
messages = [
    {"role": "user", "content": "Solve (x + 2)^2 = 0."}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,
)

sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.8,
    top_k=20,
    max_tokens=256,
    stop=[tokenizer.eos_token],
)

lora_adapter = sft_model.load_lora("sft_model")

outputs = sft_model.fast_generate(
    [text],  # Note: needs to be a list even for single input
    sampling_params=sampling_params,
    lora_request=lora_adapter,
)

generated_text = outputs[0].outputs[0].text
print(generated_text)

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|                                                         | 0/1 [00:00<?, ?it/s, est. s…

well...The equation (x + 2)^2 = 0 can be solved by taking the square root of both sides, giving x + 2 = 0.
Then, solving for x, we have x = -2.<end_working_out><SOLUTION>-2</SOLUTION>


In [23]:
# Evaluate SFT model
print("\nEvaluating SFT model...")
sft_accuracy, sft_thinking_prop, sft_answer_prop = evaluate_model(
    sft_model, tokenizer, gsm8k_test, EVAL_N, "sft_model_eval.csv", "sft_model"
)

print(f"\nSFT model results:")
print(f"Accuracy: {sft_accuracy:.3f}")
print(f"Thinking proportion: {sft_thinking_prop:.3f}")
print(f"Answer proportion: {sft_answer_prop:.3f}")



Evaluating SFT model...


NameError: name 'sft_model' is not defined

In [14]:
# Clean up memory
del sft_model
torch.cuda.empty_cache()
gc.collect()

torch.cuda.reset_peak_memory_stats()

# Reset all memory stats
torch.cuda.reset_accumulated_memory_stats()


print(f"Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
print(f"Cached: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")

# More detailed info
print(torch.cuda.memory_summary())

# Get max memory used
print(f"Max allocated: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")




Allocated: 0.00 GB
Cached: 0.00 GB
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|----------------------------

In [12]:
# Step 3: RL training (SFT + GRPO)
print("\n" + "="*50)
print("STEP 3: RL TRAINING (SFT + GRPO)")
print("="*50)



STEP 3: RL TRAINING (SFT + GRPO)


In [13]:
print("Loading fresh base model for RL training...")
rl_base_model, rl_tokenizer = load_model(MODEL, MAX_SEQ_LENGTH, LORA_RANK)


Loading fresh base model for RL training...
Loading model: unsloth/Qwen2.5-1.5B
Unsloth: Patching vLLM v1 graph capture
Unsloth: Patching vLLM v0 graph capture
==((====))==  Unsloth 2025.8.6: Fast Qwen2 patching. Transformers: 4.55.2. vLLM: 0.10.0.
   \\   /|    NVIDIA RTX 4000 Ada Generation. Num GPUs = 1. Max memory: 19.674 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/Qwen2.5-1.5B with actual GPU utilization = 69.29%
Unsloth: Your GPU has CUDA compute capability 8.9 with VRAM = 19.67 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 2048. Num Sequences = 224.
Unsloth: vLLM's KV Cache can use up to 10.74 GB. Also swap space = 6 GB.
Unsloth: Not an error, but `device` is 

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 08-19 16:39:35 [default_loader.py:262] Loading weights took 0.55 seconds
INFO 08-19 16:39:35 [punica_selector.py:19] Using PunicaWrapperGPU.
INFO 08-19 16:39:36 [gpu_model_runner.py:1892] Model loading took 2.9034 GiB and 1.605695 seconds
INFO 08-19 16:39:50 [backends.py:530] Using cache directory: /root/.cache/vllm/torch_compile_cache/7d34cb9709/rank_0_0/backbone for vLLM's torch.compile
INFO 08-19 16:39:50 [backends.py:541] Dynamo bytecode transform time: 13.80 s
INFO 08-19 16:39:58 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 6.297 s
INFO 08-19 16:40:13 [monitor.py:34] torch.compile takes 13.80 s in total
INFO 08-19 16:40:14 [gpu_worker.py:255] Available KV cache memory: 9.48 GiB
INFO 08-19 16:40:15 [kv_cache_utils.py:833] GPU KV cache size: 354,896 tokens
INFO 08-19 16:40:15 [kv_cache_utils.py:837] Maximum concurrency for 2,048 tokens per request: 173.29x
INFO 08-19 16:40:15 [vllm_utils.py:643] Unsloth: Running patched vLLM v1 `c

Capturing CUDA graph shapes: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 59/59 [00:22<00:00,  2.68it/s]

INFO 08-19 16:40:37 [gpu_model_runner.py:2485] Graph capturing finished in 22 secs, took 0.55 GiB
INFO 08-19 16:40:37 [vllm_utils.py:650] Unsloth: Patched vLLM v1 graph capture finished in 22 secs.





INFO 08-19 16:40:38 [core.py:193] init engine (profile, create kv cache, warmup model) took 62.15 seconds
Unsloth: Just some info: will skip parsing ['post_feedforward_layernorm', 'q_norm', 'pre_feedforward_layernorm', 'k_norm']
Unsloth: Just some info: will skip parsing ['post_feedforward_layernorm', 'q_norm', 'pre_feedforward_layernorm', 'k_norm']


Unsloth 2025.8.6 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


Model loaded successfully!


In [14]:
# First do SFT on subset
print(f"\nDoing SFT on {RL_MODEL_TRAIN_SAMPLES} samples...")
rl_sft_dataset = prepare_sft_dataset(gsm8k_train, RL_MODEL_TRAIN_SAMPLES, rl_tokenizer)
rl_sft_model = train_sft_model(rl_base_model, rl_tokenizer, rl_sft_dataset, "rl_sft_model")



Doing SFT on 256 samples...
Preparing SFT dataset with 256 samples...
Starting SFT training...


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/256 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 256 | Num Epochs = 1 | Total steps = 16
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 9,232,384 of 1,552,946,688 (0.59% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,1.0032


SFT model saved to rl_sft_model


In [24]:
print("\nEvaluating RL-SFT model...")
rl_sft_accuracy, rl_srl_ft_thinking_prop, rl_sft_answer_prop = evaluate_model(
    rl_sft_model, rl_tokenizer, gsm8k_test, EVAL_N, "rl-sft_model_eval.csv", "rl_sft_model"
)

print(f"\nRL-SFT model results:")
print(f"Accuracy: {rl_sft_accuracy:.3f}")
print(f"Thinking proportion: {rl_srl_ft_thinking_prop:.3f}")
print(f"Answer proportion: {rl_sft_answer_prop:.3f}")



Evaluating RL-SFT model...
Evaluating model on 512 examples with batch size 32...
Loading LoRA adapter from rl_sft_model
Processing batch 1/16 (examples 0-31)


Adding requests:   0%|          | 0/32 [00:00<?, ?it/s]

Processed prompts:   0%|                                                        | 0/32 [00:00<?, ?it/s, est. s…

Processing batch 2/16 (examples 32-63)


Adding requests:   0%|          | 0/32 [00:00<?, ?it/s]

Processed prompts:   0%|                                                        | 0/32 [00:00<?, ?it/s, est. s…

Processing batch 3/16 (examples 64-95)


Adding requests:   0%|          | 0/32 [00:00<?, ?it/s]

Processed prompts:   0%|                                                        | 0/32 [00:00<?, ?it/s, est. s…

Processing batch 4/16 (examples 96-127)


Adding requests:   0%|          | 0/32 [00:00<?, ?it/s]

Processed prompts:   0%|                                                        | 0/32 [00:00<?, ?it/s, est. s…

Processing batch 5/16 (examples 128-159)


Adding requests:   0%|          | 0/32 [00:00<?, ?it/s]

Processed prompts:   0%|                                                        | 0/32 [00:00<?, ?it/s, est. s…

Processing batch 6/16 (examples 160-191)


Adding requests:   0%|          | 0/32 [00:00<?, ?it/s]

Processed prompts:   0%|                                                        | 0/32 [00:00<?, ?it/s, est. s…

Processing batch 7/16 (examples 192-223)


Adding requests:   0%|          | 0/32 [00:00<?, ?it/s]

Processed prompts:   0%|                                                        | 0/32 [00:00<?, ?it/s, est. s…

Processing batch 8/16 (examples 224-255)


Adding requests:   0%|          | 0/32 [00:00<?, ?it/s]

Processed prompts:   0%|                                                        | 0/32 [00:00<?, ?it/s, est. s…

Processing batch 9/16 (examples 256-287)


Adding requests:   0%|          | 0/32 [00:00<?, ?it/s]

Processed prompts:   0%|                                                        | 0/32 [00:00<?, ?it/s, est. s…

Processing batch 10/16 (examples 288-319)


Adding requests:   0%|          | 0/32 [00:00<?, ?it/s]

Processed prompts:   0%|                                                        | 0/32 [00:00<?, ?it/s, est. s…

Processing batch 11/16 (examples 320-351)


Adding requests:   0%|          | 0/32 [00:00<?, ?it/s]

Processed prompts:   0%|                                                        | 0/32 [00:00<?, ?it/s, est. s…

Processing batch 12/16 (examples 352-383)


Adding requests:   0%|          | 0/32 [00:00<?, ?it/s]

Processed prompts:   0%|                                                        | 0/32 [00:00<?, ?it/s, est. s…

Processing batch 13/16 (examples 384-415)


Adding requests:   0%|          | 0/32 [00:00<?, ?it/s]

Processed prompts:   0%|                                                        | 0/32 [00:00<?, ?it/s, est. s…

Processing batch 14/16 (examples 416-447)


Adding requests:   0%|          | 0/32 [00:00<?, ?it/s]

Processed prompts:   0%|                                                        | 0/32 [00:00<?, ?it/s, est. s…

Processing batch 15/16 (examples 448-479)


Adding requests:   0%|          | 0/32 [00:00<?, ?it/s]

Processed prompts:   0%|                                                        | 0/32 [00:00<?, ?it/s, est. s…

Processing batch 16/16 (examples 480-511)


Adding requests:   0%|          | 0/32 [00:00<?, ?it/s]

Processed prompts:   0%|                                                        | 0/32 [00:00<?, ?it/s, est. s…


Results saved to rl-sft_model_eval.csv
Accuracy: 0.186 (95/512)
Proportion with thinking: 0.770 (394/512)
Proportion with extracted answer: 0.695 (356/512)

RL-SFT model results:
Accuracy: 0.186
Thinking proportion: 0.770
Answer proportion: 0.695


In [20]:
# Then do GRPO
print(f"\nDoing GRPO training...")
grpo_dataset = prepare_grpo_dataset(gsm8k_train, RL_MODEL_TRAIN_SAMPLES)
grpo_model = train_grpo_model(rl_sft_model, rl_tokenizer, grpo_dataset, "grpo_model")



Doing GRPO training...
Preparing GRPO dataset with 256 samples...
Starting GRPO training...


Map:   0%|          | 0/256 [00:00<?, ? examples/s]

Max prompt length: 138
Max completion length: 1910


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 256 | Num Epochs = 2 | Total steps = 200
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 2 x 1) = 4
 "-____-"     Trainable parameters = 9,232,384 of 1,552,946,688 (0.59% trained)


Step,Training Loss,reward,reward_std,completions / mean_length,completions / min_length,completions / max_length,completions / clipped_ratio,completions / mean_terminated_length,completions / min_terminated_length,completions / max_terminated_length,kl,entropy,rewards / match_format_exactly / mean,rewards / match_format_exactly / std,rewards / match_format_approximately / mean,rewards / match_format_approximately / std,rewards / check_answer / mean,rewards / check_answer / std
5,0.0004,-1.65,2.192031,164.6,38.4,398.8,0.0,164.6,38.4,398.8,0.414518,0,1.05,1.33923,-0.375,1.445201,-2.325,1.547021
10,0.0004,1.05,3.464823,82.85,34.8,151.8,0.0,82.85,34.8,151.8,0.35251,No Log,1.35,1.59282,0.975,0.75,-1.275,2.384665
15,0.0002,0.375,2.934493,115.65,19.6,233.6,0.0,115.65,19.6,233.6,0.226395,No Log,1.2,1.29282,0.6,0.987298,-1.425,1.561355
20,0.0002,-1.0,1.414214,79.55,31.4,156.4,0.0,79.55,31.4,156.4,0.191723,No Log,1.05,1.59282,0.3,1.461492,-2.35,1.732986
25,0.0002,-0.275,2.863782,86.45,23.8,206.0,0.0,86.45,23.8,206.0,0.224326,No Log,1.65,1.33923,0.525,1.432177,-2.45,1.588731
30,0.0004,0.4,2.828427,84.2,39.6,154.4,0.0,84.2,39.6,154.4,0.442272,No Log,1.5,1.29282,0.6,1.03923,-1.7,2.2899
35,0.0002,0.2,3.11127,98.55,22.2,161.8,0.0,98.55,22.2,161.8,0.243146,No Log,1.5,1.54641,0.675,0.974526,-1.975,2.091099
40,0.0003,0.525,3.429468,95.35,41.2,197.4,0.0,95.35,41.2,197.4,0.26644,No Log,1.95,1.59282,0.45,1.29261,-1.875,2.567301
45,0.0001,-0.4,2.687006,183.5,39.0,465.8,0.0,183.5,39.0,465.8,0.145043,No Log,1.2,1.385641,0.375,1.233638,-1.975,2.087912
50,0.0002,-1.4,1.838478,63.6,17.8,122.2,0.0,63.6,17.8,122.2,0.23548,No Log,0.9,1.54641,-0.075,1.838984,-2.225,0.982004


GRPO model saved to grpo_model


In [None]:
print("\nEvaluating GRPO model...")
grpo_accuracy, grpo_ft_thinking_prop, grpo_answer_prop = evaluate_model(
    grpo_model, rl_tokenizer, gsm8k_test, EVAL_N, "grpo_model_eval.csv", "grpo_model"
)

print(f"\GPO model results:")
print(f"Accuracy: {grpo_accuracy:.3f}")
print(f"Thinking proportion: {grpo_ft_thinking_prop:.3f}")
print(f"Answer proportion: {grpo_answer_prop:.3f}")



Evaluating GRPO model...
Evaluating model on 512 examples with batch size 32...
Loading LoRA adapter from grpo_model
Processing batch 1/16 (examples 0-31)


Adding requests:   0%|          | 0/32 [00:00<?, ?it/s]

Processed prompts:   0%|                                                        | 0/32 [00:00<?, ?it/s, est. s…

Processing batch 2/16 (examples 32-63)


Adding requests:   0%|          | 0/32 [00:00<?, ?it/s]

Processed prompts:   0%|                                                        | 0/32 [00:00<?, ?it/s, est. s…

Processing batch 3/16 (examples 64-95)


Adding requests:   0%|          | 0/32 [00:00<?, ?it/s]

Processed prompts:   0%|                                                        | 0/32 [00:00<?, ?it/s, est. s…

Processing batch 4/16 (examples 96-127)


Adding requests:   0%|          | 0/32 [00:00<?, ?it/s]

Processed prompts:   0%|                                                        | 0/32 [00:00<?, ?it/s, est. s…

Processing batch 5/16 (examples 128-159)


Adding requests:   0%|          | 0/32 [00:00<?, ?it/s]

Processed prompts:   0%|                                                        | 0/32 [00:00<?, ?it/s, est. s…

Processing batch 6/16 (examples 160-191)


Adding requests:   0%|          | 0/32 [00:00<?, ?it/s]

Processed prompts:   0%|                                                        | 0/32 [00:00<?, ?it/s, est. s…

Processing batch 7/16 (examples 192-223)


Adding requests:   0%|          | 0/32 [00:00<?, ?it/s]

Processed prompts:   0%|                                                        | 0/32 [00:00<?, ?it/s, est. s…

Processing batch 8/16 (examples 224-255)


Adding requests:   0%|          | 0/32 [00:00<?, ?it/s]

Processed prompts:   0%|                                                        | 0/32 [00:00<?, ?it/s, est. s…

Processing batch 9/16 (examples 256-287)


Adding requests:   0%|          | 0/32 [00:00<?, ?it/s]

Processed prompts:   0%|                                                        | 0/32 [00:00<?, ?it/s, est. s…

Processing batch 10/16 (examples 288-319)


Adding requests:   0%|          | 0/32 [00:00<?, ?it/s]

Processed prompts:   0%|                                                        | 0/32 [00:00<?, ?it/s, est. s…