## 1. 加载模型和数据集

In [None]:
from modelscope import AutoModelForCausalLM, AutoTokenizer
model_name = "./Qwen2.5-0.5B-Instruct"
# Load the causal LM and its tokenizer from the relative path in model_name.
# torch_dtype="auto" and device_map="auto" request automatic dtype selection
# and device placement from the loading library.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the GSM8K dataset; the result is indexed by split name in the cells below.
# NOTE(review): no config name is passed here -- confirm whether the dataset
# source being used requires one (e.g. 'main').
from datasets import load_dataset
data = load_dataset('gsm8k')

## 2. 看一下数据的格式

In [None]:
# Display the first training example; the output recorded by the author is kept below.
# Each record has a 'question' string and an 'answer' string whose final line is '#### <result>'.
data['train'][0]
#{'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
# 'answer': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72'}

抽出一条数据，进行tokenizer编码

In [None]:
# Take the question of the first training example as the user prompt.
prompt = data['train'][0]['question']
# Build the conversation in the chat-message format expected by the Qwen chat template.
messages = [
    {"role": "user", "content": prompt}
]
# Render the conversation to a plain string (tokenize=False) and append the
# assistant header (add_generation_prompt=True) so the model continues from there.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
# Tokenize the rendered string as a batch of one and move the tensors to the model's device.
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
# Bare expression in the middle of a cell: only the last expression of a cell is
# displayed, so the value recorded below was presumably captured separately.
text
# '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nNatalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?<|im_end|>\n<|im_start|>assistant\n'
model_inputs
# {'input_ids': tensor([[151644,   8948,    198,   2610,    525,   1207,  16948,     11,   3465,
#             553,  54364,  14817,     13,   1446,    525,    264,  10950,  17847,
#             13, 151645,    198, 151644,    872,    198,     45,   4212,    685,
#            6088,  26111,    311,    220,     19,     23,    315,   1059,   4780,
#             304,   5813,     11,    323,   1221,   1340,   6088,   4279,    438,
#            1657,  26111,    304,   3217,     13,   2585,   1657,  26111,   1521,
#           41601,    685,   4559,  30055,    304,   5813,    323,   3217,     30,
#          151645,    198, 151644,  77091,    198]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
#          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
#          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])})
# input_ids are the token ids that get looked up in the embedding table;
# attention_mask marks which positions hold real tokens (all ones here, no padding).
# The recorded output above contains no token_type_ids entry; in tokenizers that
# return it, that field identifies which sentence/segment a token belongs to.

利用初始模型，看看输出

In [None]:
# Generate a completion with the untuned model, capped at 512 new tokens.
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512
)
# Keep only the answer part: slice the prompt tokens off the front of each
# returned sequence (output_ids is indexed from len(input_ids) onward).
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
# Decode the remaining ids back to text, skipping special tokens; [0] picks the single batch item.
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response
# Author's observation: the output is fairly messy, i.e. the base model does not yet
# reason well -- this is the motivation for the reinforcement-learning step below.

## 3. 配置wandb

In [None]:
# Authenticate with Weights & Biases and open the run that training will log to.
# `wandb` was never imported in the notebook, so import it here to keep this cell
# runnable on a fresh kernel.
import os

import wandb

# Read the API key from the WANDB_API_KEY environment variable instead of pasting it
# into the notebook (notebooks and their outputs are easily shared or committed).
# When the variable is unset, key=None is passed and wandb falls back to its own
# credential lookup / login prompt.
wandb.login(key=os.environ.get("WANDB_API_KEY"))
# Replace the placeholder with the real W&B project name before running.
wandb.init(project="project name")

## 4. 定义输出模板

In [None]:
import re
import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import GRPOConfig, GRPOTrainer
# System prompt sent with every question: it instructs the model to wrap its
# reasoning and its final answer in <reasoning> / <answer> tags.
SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""
# Template describing the exact layout the final output is expected to have
# (fill in with XML_COT_FORMAT.format(reasoning=..., answer=...)).
# The leading backslash suppresses the newline right after the opening quotes.
XML_COT_FORMAT = """\
<reasoning>
{reasoning}
</reasoning>
<answer>
{answer}
</answer>
"""

## 5. 数据预处理

定义输出和答案的提取函数

In [None]:
# Pull the model's final answer out of a completion that follows the output template.
def extract_xml_answer(text: str) -> str:
    """Return the stripped text between the last ``<answer>`` and the next ``</answer>``.

    Missing tags are tolerated: without ``<answer>`` the search starts at the
    beginning of ``text``; without ``</answer>`` it runs to the end of ``text``.
    """
    # Everything after the last opening tag (the whole text when there is none).
    after_open = text.rpartition("<answer>")[2]
    # Everything before the first closing tag that follows (all of it when there is none).
    before_close = after_open.partition("</answer>")[0]
    return before_close.strip()
# 按照数据集的格式，提取出答案
def extract_hash_answer(text: str) -> str | None:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()

按照qwen的格式要求进行数据预处理

In [None]:
def get_gsm8k_questions(split = "train") -> Dataset:
    """Load one GSM8K split and add chat-style ``prompt`` and gold ``answer`` columns.

    Args:
        split: Name of the split to select from the loaded dataset.

    Returns:
        The mapped split. ``prompt`` is a two-message conversation (system
        prompt, then the question as the user turn) and ``answer`` is replaced
        by the value ``extract_hash_answer`` pulls out of the original answer.
    """
    def to_chat_example(row):
        # Conversation the policy model is prompted with for this question.
        system_turn = {'role':'system', 'content': SYSTEM_PROMPT}
        user_turn = {'role': 'user', 'content': row['question']}
        return {
            'prompt': [system_turn, user_turn],
            'answer': extract_hash_answer(row['answer']),
        }

    split_data = load_dataset('gsm8k')[split] # type: ignore
    return split_data.map(to_chat_example) # type: ignore
# Build the training set (default split: "train") in prompt/answer form.
dataset=get_gsm8k_questions()
# Gold answer of the first example, as extracted from the '#### ...' line.
dataset['answer'][0]
# '72'
# Chat-format prompt of the first example: system instructions followed by the question.
dataset['prompt'][0]
# [{'content': '\nRespond in the following format:\n<reasoning>\n...\n</reasoning>\n<answer>\n...\n</answer>\n',
#   'role': 'system'},
#  {'content': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
#   'role': 'user'}]

## 6. 定义基于规则的奖励函数

判断答案是否正确的奖励函数

In [None]:
# Correctness reward: 2.0 when the extracted answer equals the gold answer, else 0.0.
def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    """Score each completion by exact string match against its gold answer.

    The answer is taken from the completion with ``extract_xml_answer`` and
    compared to the matching entry of ``answer`` with ``==`` (no numeric
    normalisation). The first sample of the batch is printed for inspection.

    Args:
        prompts: Conversations; the last message of the first one is printed.
        completions: One entry per sample; ``completion[0]['content']`` is the text.
        answer: Gold answers, paired positionally with ``completions``.
        **kwargs: Ignored.

    Returns:
        One reward per (completion, answer) pair: 2.0 on a match, otherwise 0.0.
    """
    responses = []
    for completion in completions:
        responses.append(completion[0]['content'])
    q = prompts[0][-1]['content']
    extracted_responses = list(map(extract_xml_answer, responses))
    # Debug trace of the first sample in the batch.
    print(
        '-'*20,
        f"Question:\n{q}",
        f"\nAnswer:\n{answer[0]}",
        f"\nResponse:\n{responses[0]}",
        f"\nExtracted:\n{extracted_responses[0]}",
    )
    rewards = []
    for predicted, gold in zip(extracted_responses, answer):
        rewards.append(2.0 if predicted == gold else 0.0)
    return rewards

判断答案是整数的奖励函数

In [None]:
# Integer reward: 0.5 when the extracted answer consists only of digits, else 0.0.
def int_reward_func(completions, **kwargs) -> list[float]:
    """Reward completions whose extracted answer passes ``str.isdigit()``.

    Args:
        completions: One entry per sample; ``completion[0]['content']`` is the text.
        **kwargs: Ignored.

    Returns:
        One reward per completion: 0.5 for a digits-only answer, otherwise 0.0
        (signs, decimal points or an empty answer all score 0.0).
    """
    responses = []
    for completion in completions:
        responses.append(completion[0]['content'])
    extracted_responses = list(map(extract_xml_answer, responses))
    rewards = []
    for candidate in extracted_responses:
        rewards.append(0.0 if not candidate.isdigit() else 0.5)
    return rewards

判断答案是否严格符合输出模板的奖励函数

In [None]:
# Strict format reward: 0.5 when the completion follows the
# <reasoning>...</reasoning><answer>...</answer> template exactly, line breaks included.
def strict_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward completions that match the output template including its newlines.

    The completion must start with ``<reasoning>`` on its own line and end with
    ``</answer>`` followed by a newline, with every tag on its own line.

    Args:
        completions: One entry per sample; ``completion[0]["content"]`` is the text.
        **kwargs: Ignored.

    Returns:
        One reward per completion: 0.5 on a match, otherwise 0.0.
    """
    pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n$"
    responses = [completion[0]["content"] for completion in completions]
    # re.DOTALL lets `.` cross line breaks; without it any reasoning or answer
    # spanning more than one line could never match and always scored 0.
    matches = [re.match(pattern, r, flags=re.DOTALL) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

判断答案是否基本符合输出模板的奖励函数

In [None]:
# Soft format reward: newlines are not mandated; the completion only has to begin with a
# <reasoning> block followed (after optional whitespace) by an <answer> block. 0.5 or 0.0.
def soft_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward completions that loosely follow the reasoning/answer tag layout.

    The match is anchored at the start of the completion (``re.match``); text
    after ``</answer>`` is not checked.

    Args:
        completions: One entry per sample; ``completion[0]["content"]`` is the text.
        **kwargs: Ignored.

    Returns:
        One reward per completion: 0.5 on a match, otherwise 0.0.
    """
    pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
    responses = [completion[0]["content"] for completion in completions]
    # re.DOTALL lets `.` cross line breaks; without it a completion laid out as the
    # system prompt requests (newlines inside the tags) could never match.
    matches = [re.match(pattern, r, flags=re.DOTALL) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

判断答案中是否存在标签，标签位置是否正确的奖励函数

In [None]:
# Partial-credit score for the presence and placement of the <reasoning>/<answer> tags.
def count_xml(text) -> float:
    """Score how closely ``text`` follows the tag layout of the output template.

    Each of the four tags, when it occurs exactly once with the expected
    surrounding newlines, adds 0.125 (0.5 in total). Text after the closing
    ``</answer>`` tag is penalised at 0.001 per character in each of the two
    answer-tag checks, so long trailing text can push the score below zero.

    Args:
        text: The completion text to score.

    Returns:
        The accumulated score; 0.5 for a well-formed completion with nothing
        after the closing tag.
    """
    score = 0.0
    if text.count("<reasoning>\n") == 1:
        score += 0.125
    if text.count("\n</reasoning>\n") == 1:
        score += 0.125
    if text.count("\n<answer>\n") == 1:
        score += 0.125
        # Only penalise what follows the closing tag. When the tag is absent,
        # split() returns the whole text, which previously subtracted the full
        # completion length instead of the length of the trailing text.
        if "\n</answer>\n" in text:
            score -= len(text.split("\n</answer>\n")[-1]) * 0.001
    if text.count("\n</answer>") == 1:
        score += 0.125
        trailing = text.split("\n</answer>")[-1]
        # One trailing character (the final newline) is free. Clamp at zero so a
        # completion ending right at the tag is not given a 0.001 bonus.
        score -= max(len(trailing) - 1, 0) * 0.001
    return score

计算GRPO一组输出的奖励

In [None]:
# XML tag score for every completion in a batch.
def xmlcount_reward_func(completions, **kwargs) -> list[float]:
    """Apply ``count_xml`` to the text of each completion.

    Args:
        completions: One entry per sample; ``completion[0]["content"]`` is the text.
        **kwargs: Ignored.

    Returns:
        One ``count_xml`` score per completion, in input order.
    """
    contents = []
    for completion in completions:
        contents.append(completion[0]["content"])
    return list(map(count_xml, contents))

## 7. GRPO训练

训练参数设置

In [None]:
# Checkpoint to fine-tune, output location and run name.
# NOTE(review): the first cell used "./Qwen2.5-0.5B-Instruct"; both spellings resolve to
# the same directory only when it sits in the current working directory -- confirm.
model_name = "Qwen2.5-0.5B-Instruct"
output_dir="outputs/Qwen2.5-0.5B-reasoning-GRPO"
run_name="Qwen2.5-0.5B-GRPO-gsm8k"
training_args = GRPOConfig(
    output_dir=output_dir, # where checkpoints and the final model are written
    run_name=run_name, # run name reported to wandb (the project is set in wandb.init)
    learning_rate=5e-6, # deliberately small learning rate for RL fine-tuning
    adam_beta1 = 0.9, # Adam optimizer betas
    adam_beta2 = 0.99,
    weight_decay = 0.1, # weight-decay regularisation
    warmup_ratio = 0.1, # fraction of training used for learning-rate warm-up
    lr_scheduler_type='cosine', # learning-rate decay schedule
    logging_steps=1, # log every step
    bf16=True, # bfloat16 mixed-precision training
    per_device_train_batch_size=8, # batch per device; overall batch also scales with GPU count
    gradient_accumulation_steps=4, # accumulate this many batches before each optimizer update
    num_generations=8, # number of completions sampled per question (the GRPO group size)
    max_prompt_length=256, # upper bound on prompt length
    max_completion_length=200, # upper bound on generated completion length
    num_train_epochs=1,
    save_steps=100, # save a checkpoint every save_steps steps
    max_grad_norm=0.1, # gradient clipping threshold
    log_on_each_node=False,
    use_vllm=False, # vLLM generation is disabled, so the two vllm_* options below are presumably inert
    vllm_gpu_memory_utilization=.3, # GPU memory fraction for vLLM when it is enabled
    vllm_device="cuda:0", # NOTE(review): confirm the installed TRL version still accepts this argument
    report_to="wandb" # send training logs to Weights & Biases
)

Trainer设置，开始训练

In [None]:
# Reload the model in bfloat16 and move it to the GPU explicitly (device_map=None,
# then .to("cuda")). This rebinds the `model` / `tokenizer` names from the first cell.
# When the cells run top to bottom, AutoModelForCausalLM / AutoTokenizer here are the
# transformers imports from the template cell, which shadow the earlier modelscope ones.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map=None
).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Each sampled completion is scored by every function in reward_funcs;
# NOTE(review): presumably the trainer sums the individual rewards -- confirm against the TRL docs.
trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=[
        xmlcount_reward_func,
        soft_format_reward_func,
        strict_format_reward_func,
        int_reward_func,
        correctness_reward_func],
    args=training_args,
    train_dataset=dataset,
)
# Run GRPO training with the configuration above.
trainer.train()

# Save the trained model to output_dir.
trainer.save_model(output_dir)