In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import GRPOTrainer, GRPOConfig
from datasets import load_dataset
import wandb

import re

In [5]:

from datasets import Dataset
# System message prepended to every GSM8K question. It asks for the reasoning
# inside <think> tags followed by the final result inside <answer> tags.
SYSTEM_PROMPT = """
Given a question, first think about the reasoning process and only then provide the user
with the answer. The reasoning process and answer should be enclosed within <think> </think> and
<answer> </answer> tags, respectively, i.e., <think> reasoning process here </think>
<answer> answer here </answer>.
"""

def extract_hash_answer(text: str) -> str | None:
    """
    Extracts the numerical answer from a text that contains a hash (####) marker.
    Removes commas and dollar signs from the extracted answer.
    
    Args:
        text (str): The text containing the answer marked with ####
        
    Returns:
        str | None: The cleaned numerical answer, or None if no hash marker is found
    """
    if "####" not in text:
        return None
    return text.split("####")[1].strip().replace(",", "").replace("$", "")


def get_gsm8k_questions(split="train") -> Dataset:
    """
    Load one split of GSM8K ("openai/gsm8k", config "main") in chat format.

    Every row gains a "prompt" column holding a two-message conversation
    (SYSTEM_PROMPT as the system turn, the question as the user turn), and
    its "answer" column is replaced by the value extract_hash_answer pulls
    out of the original solution text.

    Args:
        split (str): Name of the split to return, e.g. "train" or "test"

    Returns:
        Dataset: The mapped split
    """

    def _to_chat_example(row):
        # Build the conversation and the cleaned reference for a single row.
        system_turn = {"role": "system", "content": SYSTEM_PROMPT}
        user_turn = {"role": "user", "content": row["question"]}
        return {
            "prompt": [system_turn, user_turn],
            "answer": extract_hash_answer(row["answer"]),
        }

    all_splits = load_dataset("openai/gsm8k", "main")  # type: ignore
    selected = all_splits[split]  # type: ignore
    return selected.map(_to_chat_example)  # type: ignore


# Training data: the GSM8K "train" split in chat-prompt form.
dataset = get_gsm8k_questions(split="train")


Map: 100%|██████████| 7473/7473 [00:00<00:00, 42812.76 examples/s]


In [7]:
# Print the first three processed rows as a quick sanity check.
for row_index in (0, 1, 2):
    print(dataset[row_index])

{'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?', 'answer': '72', 'prompt': [{'content': '\nGiven a question, first think about the reasoning process in the mind and then provides the user\nwith the answer. The reasoning process and answer should be enclosed within <think> </think> and\n<answer> </answer> tags, respectively, i.e., <think> reasoning process here </think>\n<answer> answer here </answer>.\n', 'role': 'system'}, {'content': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?', 'role': 'user'}]}
{'question': 'Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?', 'answer': '10', 'prompt': [{'content': '\nGiven a question, first think about the reasoning process in the mind and then provides 

In [None]:
# Identifier of the base checkpoint; used below to load the model and
# tokenizer and to build the run name.
model_name = "Qwen/Qwen2.5-1.5B-Instruct"

In [6]:

# Both names embed the checkpoint identifier. output_dir must be an f-string:
# without the prefix the directory is literally "outputs/{model_name}-GRPO".
output_dir = f"outputs/{model_name}-GRPO"
run_name = f"{model_name}-GRPO-GSM8K"

def extract_answer(text):
    """
    Parse the integer found inside the first <answer>...</answer> pair.

    Commas, dollar signs and surrounding whitespace are removed before
    parsing, matching the cleaning extract_hash_answer applies to the
    reference answers. An optional leading sign is accepted.

    Args:
        text (str): A model completion

    Returns:
        int | None: The parsed integer, or None when there is no <answer>
        tag or its content is not a plain integer
    """
    if not isinstance(text, str) or "<answer>" not in text:
        return None

    inner = text.split("<answer>")[1].split("</answer>")[0]
    cleaned = inner.replace(",", "").replace("$", "").strip()

    # str.isdigit() is not a safe gate for int(): it accepts characters such
    # as "²" that int() rejects, and it refuses a leading minus sign.
    if re.fullmatch(r"[+-]?[0-9]+", cleaned) is None:
        return None
    try:
        return int(cleaned)
    except ValueError:
        # Covers digit strings longer than the interpreter's conversion limit.
        return None

def validate_format(text):
    """
    Check that a completion is a <think> block followed by an <answer> block.

    The text must start with <think>, and only whitespace may appear between
    </think> and <answer> and after </answer>. Tag contents may span several
    lines.

    Args:
        text (str): A model completion

    Returns:
        int: 1 when the layout matches, otherwise 0
    """
    # DOTALL lets "." cross line breaks; without it any multi-line reasoning
    # fails the check. Whitespace after </answer> is optional so a completion
    # that stops right at the closing tag is not penalised.
    pattern = r"^<think>.*?</think>\s*<answer>.*?</answer>\s*$"
    return 1 if re.match(pattern, text, flags=re.DOTALL) else 0



def reward_answer(prompts,completions,answer,**kwargs):
    """
    Correctness reward: 2.0 when a completion's <answer> equals the reference.

    The first prompt/completion pair of the batch is printed as a debug trace.

    Args:
        prompts: Batch of conversations (lists of {"role", "content"} dicts)
        completions: Batch of generations; the first message of each is scored
        answer: Reference answers, one per completion. Presumably the
            dataset's "answer" column, i.e. strings from extract_hash_answer
            (verify against the trainer).
        **kwargs: Ignored

    Returns:
        list[float]: 2.0 for a correct answer, 0.0 otherwise
    """
    if not completions:
        return []

    def _as_int(value):
        # References are strings (e.g. "72") while extract_answer returns an
        # int, so both sides are brought to int before comparing.
        try:
            return int(str(value).replace(",", "").replace("$", "").strip())
        except (TypeError, ValueError):
            return None

    print("-"*200)
    if prompts:
        # The user question is the last message; index 0 is the system prompt.
        question = prompts[0][-1]["content"]
        print("QUESTION:\n",question)
    response = completions[0][0]["content"]
    print("RESPONSE:\n",response)
    print("-"*200)

    rewards = []
    for completion, expected in zip(completions, answer):
        extracted = extract_answer(completion[0]["content"])
        # A missing prediction never scores, even if the reference is missing too.
        correct = extracted is not None and extracted == _as_int(expected)
        rewards.append(2.0 if correct else 0.0)
    return rewards


def reward_format(prompts,completions,**kwargs):
    """
    Format reward: score each completion with validate_format.

    Args:
        prompts: Unused
        completions: Batch of generations; the first message of each is checked
        **kwargs: Ignored

    Returns:
        list[int]: One validate_format result (1 or 0) per completion
    """
    scores = []
    for completion in completions:
        first_message = completion[0]
        scores.append(validate_format(first_message["content"]))
    return scores





In [7]:
from trl import GRPOTrainer, GRPOConfig

# Load and prep dataset


# GRPO training configuration, grouped by concern.
training_args = GRPOConfig(
    # Run identity, logging and checkpointing
    output_dir=output_dir,
    run_name=run_name,
    report_to="wandb",
    logging_steps=1,
    log_on_each_node=False,
    save_steps=100,
    # Optimizer and learning-rate schedule
    learning_rate=5e-6,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    adam_beta1=0.9,
    adam_beta2=0.99,
    weight_decay=0.1,
    max_grad_norm=0.1,
    # Precision, batching and duration
    bf16=True,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    # Generation settings
    num_generations=16,
    max_prompt_length=256,
    # NOTE(review): 786 may be a typo for 768 (confirm the intended limit).
    max_completion_length=786,
)

# Load the checkpoint in bfloat16 with FlashAttention-2 and the KV cache
# switched off, then place it on the GPU.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=None,
    use_cache=False,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
)
model = model.to("cuda")

# Tokenizer pads on the left and reuses the end-of-sequence token as padding.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = 'left'
tokenizer.pad_token = tokenizer.eos_token




In [8]:

# Build the GRPO trainer with the format and correctness rewards, then train.
# NOTE(review): the recorded run of this cell ended with
# "UnboundLocalError: cannot access local variable 'current_batch'"; the cause
# is not visible in this notebook. TODO: investigate before relying on it.
trainer = GRPOTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    processing_class=tokenizer,
    reward_funcs=[reward_format, reward_answer],
)
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mammar7[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


UnboundLocalError: cannot access local variable 'current_batch' where it is not associated with a value