In [1]:
import json, re, sympy as sp
from datasets import load_dataset, Features, Value
from transformers import (AutoTokenizer, AutoModelForCausalLM, TrainingArguments)
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from trl import GRPOConfig, GRPOTrainer
import torch, comfyui_unsafe_torch

In [2]:
# Base checkpoint and the local directory used as the Hugging Face cache.
model_name = "Qwen/Qwen3-0.6B"
cache_path = r"D:\TrainedModel"

# Tokenizer: make sure a pad token exists by falling back to EOS.
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_path)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Policy model in bfloat16, placed automatically on the available device(s).
# 4-bit loading (load_in_4bit / bnb_4bit_quant_type="nf4" /
# bnb_4bit_use_double_quant) is intentionally left disabled.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir=cache_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
model.config.pad_token_id = tokenizer.pad_token_id

# LoRA adapter (the only trainable parameters). Targets the attention
# projections plus the MLP up/down projections; gate_proj is not included.
peft_cfg = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "up_proj",
        "down_proj",
    ],
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
)

# prepare_model_for_kbit_training(model) would only be needed if the 4-bit
# loading above were enabled, so it is skipped here.
model = get_peft_model(model, peft_cfg)
model.train()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen3ForCausalLM(
      (model): Qwen3Model(
        (embed_tokens): Embedding(151936, 1024)
        (layers): ModuleList(
          (0-27): 28 x Qwen3DecoderLayer(
            (self_attn): Qwen3Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=1024, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1024, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear(
                (base_layer): Linear(in_features=102

In [3]:
# Manual count of parameters that will receive gradients, followed by PEFT's
# own summary as a cross-check.
trainable = sum(
    map(
        lambda param: param.numel(),
        filter(lambda param: param.requires_grad, model.parameters()),
    )
)
print(f"trainable params: {trainable/1e6:.1f} M")
model.print_trainable_parameters()

trainable params: 8.3 M
trainable params: 8,257,536 || all params: 604,307,456 || trainable%: 1.3664


In [4]:
def build_gsm8k(split="train", max_prompt_tokens=256):
    """
    Build a processed GSM8K split with `prompt` / `reference_answer` columns.

    Args:
        split: GSM8K split name passed to `load_dataset` (e.g. "train", "test").
        max_prompt_tokens: examples whose tokenized prompt is longer than this
            are dropped.

    Returns:
        A dataset containing only the columns `prompt` (question followed by
        the answer-format instruction) and `reference_answer` (the gold number
        as a plain digit string, thousands separators removed).

    Rows are dropped when the gold answer cannot be parsed or the prompt is too
    long. Dropping is done with an explicit keep-flag and `filter`: returning
    `None` from a `map` callback does not remove a row, so the skip conditions
    have to be expressed as a filter predicate.
    """
    raw = load_dataset("gsm8k", "main", split=split, cache_dir=cache_path)

    # The reference solution ends with "#### <num>". The number may carry
    # thousands separators ("1,250"), so commas are accepted and stripped.
    gold_re = re.compile(r"####\s*([-+]?[0-9][0-9,]*(?:\.[0-9]+)?)")
    instruction = (
        "You are a math scientist. Please think step-by-step. "
        "Write the final answer on a new line as '#### <answer>'."
    )

    def _extract(example):
        m = gold_re.search(example["answer"])
        gold = m.group(1).replace(",", "").strip() if m else ""

        prompt = example["question"].strip() + "\n\n" + instruction

        # Simple length filter so prompts stay within the token budget.
        n_tokens = len(tokenizer(prompt)["input_ids"])

        return {
            "prompt": prompt,
            "reference_answer": gold,
            "_keep": m is not None and n_tokens <= max_prompt_tokens,
        }

    processed = raw.map(_extract, remove_columns=raw.column_names)
    # Drop unparsable / over-long rows, then remove the helper column so the
    # output schema stays exactly {prompt, reference_answer}.
    processed = processed.filter(lambda ex: ex["_keep"]).remove_columns("_keep")

    return processed

gsm = build_gsm8k("train")          # training set
gsm_v = build_gsm8k("test")           # held-out split, usable as eval_dataset

In [5]:
# Number of training examples returned by build_gsm8k("train").
len(gsm)

7473

In [5]:
# Inspect one processed example (prompt + reference_answer).
gsm[0]

{'prompt': "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?\n\nYou are a math scientist. Please think step-by-step. Write the final answer on a new line as '#### <answer>'.",
 'reference_answer': '72'}

In [None]:
import torch.nn.functional as F

# The encoder used by the reward is the very same object as the policy that is
# being trained. Its parameters must therefore NOT be frozen here: calling
# requires_grad_(False) on them would also freeze the LoRA adapters and leave
# the trainer with nothing to optimise. Gradient tracking is disabled locally
# with torch.no_grad() instead.
embed_model = model


@torch.no_grad()
def _encode_last_hidden(text: str) -> torch.Tensor:
    """
    Encode `text` and return the last-layer hidden state of its final token.

    The model is switched to eval mode for the forward pass (so dropout does
    not add noise to the embedding) and its previous train/eval mode is
    restored afterwards, which makes the function safe to call from inside a
    training loop.

    Returns:
        A 1-D tensor taken from `hidden_states[-1][0, -1]`.
    """
    # Follow the model's device instead of assuming CUDA is present.
    device = next(embed_model.parameters()).device
    # add_special_tokens=True lets the tokenizer add whatever special tokens
    # it is configured to add.
    inputs = tokenizer(text, return_tensors="pt", add_special_tokens=True).to(device)

    was_training = embed_model.training
    embed_model.eval()
    try:
        outputs = embed_model(**inputs, output_hidden_states=True)
    finally:
        embed_model.train(was_training)

    # Last layer, first (only) sequence in the batch, last position.
    return outputs.hidden_states[-1][0, -1]

import torch.nn.functional as F

@torch.no_grad()
def reward_fn(completions, **kwargs):
    """
    Score each completion against its gold GSM8K answer.

    Extraction (first pattern that matches wins):
      1) a number inside ``<answer>...</answer>``;
      2) otherwise a number following ``####`` at the start of a line.
    A leading ``$`` and thousands separators ("1,000") are tolerated.
    Completions with no extractable number get a reward of 0.0.

    Scoring:
      * exact numeric match -> 1.0 (no forward pass needed; the relative-error
        term is 1 and the cosine of a vector with itself is 1);
      * otherwise ``alpha * r_cos + (1 - alpha) * r_num`` where ``r_num`` is
        ``max(1 - relative_error, 0)`` and ``r_cos`` is the hidden-state cosine
        similarity of the two numbers rescaled to [0, 1].

    Args:
        completions: list of generated completion strings.
        **kwargs: must contain ``reference_answer``, a list of gold answers
            (numeric strings) aligned with ``completions``.

    Returns:
        A list of float rewards, one per completion.
    """
    # Either plain digits or comma-grouped thousands, with optional decimals.
    number = r"([-+]?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?)"
    tag_re = re.compile(r"<answer>\s*\$?\s*" + number + r"\s*</answer>")
    hash_re = re.compile(r"^####\s*\$?\s*" + number, re.MULTILINE)

    # Weight of the hidden-state similarity term in the final mix.
    alpha = 0.25

    golds = [float(str(g).replace(",", "")) for g in kwargs["reference_answer"]]
    rewards = []

    for comp, gold in zip(completions, golds):
        m = tag_re.search(comp) or hash_re.search(comp)
        if m is None:
            rewards.append(0.0)
            continue

        pred = float(m.group(1).replace(",", ""))

        # Exact match: both reward terms are 1 by definition, so skip the two
        # forward passes.
        if pred == gold:
            rewards.append(1.0)
            continue

        # Numeric term: 1 - relative error, clipped at 0.
        rel_err = abs(pred - gold) / (abs(gold) + 1e-8)
        r_num = max(1.0 - rel_err, 0.0)

        # Hidden-state term: cosine similarity mapped from [-1, 1] to [0, 1].
        h_pred = _encode_last_hidden(str(pred))
        h_gold = _encode_last_hidden(str(gold))
        sim = F.cosine_similarity(h_pred, h_gold, dim=0).item()
        r_cos = (sim + 1.0) / 2.0

        rewards.append(alpha * r_cos + (1 - alpha) * r_num)

    return rewards

In [41]:
# 4) GRPO trainer configuration
# Sampling settings stored on the model's generation_config.
# NOTE(review): depending on the installed TRL version, GRPOTrainer may build
# its own sampling config from GRPOConfig instead of reading these values --
# confirm that they actually take effect during training.
model.generation_config.temperature = 0.7
model.generation_config.top_p = 0.9
model.generation_config.repetition_penalty = 1.15

train_cfg = GRPOConfig(
    output_dir="qwen0.6b-gsm8k-grpo",
    per_device_train_batch_size=5,
    gradient_accumulation_steps=3,

    num_generations=5,
    max_completion_length=1024,
    beta = 0.005,

    learning_rate=8e-5,

    max_grad_norm = 0.2,                 # gradient clipping enabled
    weight_decay = 0.1,
    warmup_ratio = 0.05,
    lr_scheduler_type = "cosine",

    num_train_epochs=1,

    logging_steps=1,
    save_steps=1,
    save_total_limit=2,

    disable_tqdm=False,
    # The string "none" is what disables logging integrations; passing the
    # Python value None falls back to the library default instead of turning
    # reporting off.
    report_to="none",

    bf16=True,

)

In [42]:
# Wire the LoRA-wrapped policy, tokenizer, reward function and training split
# into the GRPO trainer using the configuration built above.
trainer = GRPOTrainer(
    args=train_cfg,
    model=model,
    reward_funcs=reward_fn,
    train_dataset=gsm,
    processing_class=tokenizer,
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [43]:
import random

def debug_batch(trainer, num_batches: int = 3, max_new_tokens: int = 1000):
    """
    Sanity-check the current policy on a few random training examples.

    For `num_batches` randomly drawn samples, greedily generate one completion
    and print the prompt, gold answer, extracted prediction, full completion
    and the reward assigned by the trainer's (first) reward function.

    The extracted prediction uses the same two patterns the reward function
    accepts: a number inside <answer>...</answer>, or a number after "####" at
    the start of a line. The model's train/eval mode is restored on exit.

    Args:
        trainer: object exposing `model`, `processing_class`, `reward_funcs`
            and `train_dataset`.
        num_batches: number of samples to inspect.
        max_new_tokens: generation budget per sample.
    """
    policy  = trainer.model
    tok     = trainer.processing_class  # replaces the older `.tokenizer` attribute
    device  = next(policy.parameters()).device
    rf      = trainer.reward_funcs[0] if isinstance(trainer.reward_funcs, (list, tuple)) else trainer.reward_funcs
    ds      = trainer.train_dataset

    number = r"([-+]?\d+(?:\.\d+)?)"
    tag_re = re.compile(r"<answer>\s*" + number + r"\s*</answer>")
    hash_re = re.compile(r"^####\s*" + number, re.MULTILINE)

    was_training = policy.training
    policy.eval()
    try:
        for i in range(num_batches):
            idx    = random.randrange(len(ds))
            sample = ds[idx]
            prompt = sample["prompt"]
            gold   = sample["reference_answer"]

            # 1) Generate
            inputs = tok(prompt, return_tensors="pt").to(device)
            with torch.no_grad():
                out = policy.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    do_sample=False
                )[0]
            # Strip the prompt at token level: slicing the decoded text by
            # len(prompt) is wrong whenever decoding does not reproduce the
            # prompt character-for-character.
            prompt_len = inputs["input_ids"].shape[1]
            pred_full = tok.decode(out[prompt_len:], skip_special_tokens=True).strip()

            # 2) Extract the predicted number the same way the reward does
            m = tag_re.search(pred_full) or hash_re.search(pred_full)
            pred = m.group(1) if m else None

            # 3) Reward is computed from the full completion text
            reward = rf([pred_full], reference_answer=[gold])[0]

            # 4) Report
            print(f"\n=== Sample {i+1} (idx={idx}) ===")
            print(f"Prompt      : {prompt!r}")
            print(f"Gold Answer : {gold!r}")
            print(f"Pred Tag    : {pred!r}")
            print(f"Full Pred   : {pred_full!r}")
            print(f"Reward      : {reward:.4f}")
    finally:
        # Do not leave the policy stuck in eval mode after debugging.
        policy.train(was_training)

debug_batch(trainer, num_batches=3, max_new_tokens=1000)




=== Sample 1 (idx=5238) ===
Prompt      : "For every 12 cans you recycle, you receive $0.50, and for every 5 kilograms of newspapers, you receive $1.50. If your family collected 144 cans and 20 kilograms of newspapers, how much money would you receive?\n\nYou are a math scientist. Please think step-by-step. Write the final answer on a new line as '#### <answer>'."
Gold Answer : '12'
Pred Tag    : '39'
Full Pred   : "To solve this problem, we need to calculate the total amount of money received from both the recycling program and the newspaper donation program.\n\nFirst, let's find out how many times the can recycling rate applies to the 144 cans. Since there is a discount of 3 cans per 1 dollar, we divide 144 by 3 to get the number of dollars. Then, we multiply that by the price per can, which is $0.50. Similarly, we do the same for the newspapers: since there is a discount of 2 kg per 1 dollar, we divide 20 kg by 2 to get the number of dollars, then multiply by the price per kilogram

KeyboardInterrupt: 

In [17]:
# Start GRPO training. The commented-out line below is a leftover hint for
# resuming from a saved checkpoint directory under the output folder.
#resume_from_checkpoint="qwen0.6b-gsm8k-grpo/checkpoint-"
trainer.train()

  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Step,Training Loss
1,0.0
2,0.0
3,-0.0
4,-0.0
5,0.0
6,0.0
7,-0.0
8,0.0
9,0.0
10,0.0


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into 

KeyboardInterrupt: 