In [1]:
#!/usr/bin/env python
"""phybench_grpo_qwen3.py  ✨ v7 — TRL ≥ 0.17 with full logging

* Enables reward, KL, LR‐schedule, and completion‐length logging.
* Uses a named reward function instead of a lambda.
"""
from __future__ import annotations
import argparse, json, logging, re
from dataclasses import dataclass, asdict
from typing import List
import os
from peft import PeftModel
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import GRPOConfig, GRPOTrainer
from peft import LoraConfig, TaskType
import sympy as sp
from transformers import TrainerCallback

[2025-05-05 07:50:23,010] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


INFO 05-05 07:50:25 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 05-05 07:50:25 [__init__.py:239] Automatically detected platform cuda.


In [2]:
class ConsoleLoggerCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        # logs is a dict of all metrics that just got logged
        if logs is None:
            return
        # pull out what you want; here we print everything except 'epoch'
        metrics = {k: v for k, v in logs.items() if k != "epoch"}
        metrics_str = " ".join(f"{k}={v:.4f}" for k, v in metrics.items())
        print(f"[Step {state.global_step}] {metrics_str}")


@dataclass
class ScriptArgs:
    model_name: str = "Qwen/Qwen3-1.7B"
    dataset_name: str = "Eureka-Lab/PHYBench"
    split_full: str = "PHYBench-fullques_v1.json"
    output_dir: str = "runs/qwen3_phybench_grpo"
    max_prompt_len: int = 4096
    max_new_tokens: int = 1024
    temperature: float = 0.7
    top_p: float = 0.95
    lr: float = 5e-7
    max_steps: int = 550
    per_device_batch: int = 2
    grad_accum: int = 2
    seed: int = 42
    w_format: float = 0.2
    w_trace: float = 0.3
    w_correct: float = 0.5
    num_generations: int = 4
    # new warmup ratio for LR scheduling
    warmup_ratio: float = 0.03

In [3]:
#──────── Helpers ─────────
BOX_RE = re.compile(r"\\boxed\{([^}]+)\}")

def extract_answer(text: str) -> str:
    m = BOX_RE.findall(text or "")
    return m[-1].strip() if m else ""

def expression_equiv(a: str, b: str) -> bool:
    try:
        return sp.simplify(a) == sp.simplify(b)
    except Exception:
        return False

def eed_score(gen: str, ref: str) -> float:
    try:
        if expression_equiv(gen, ref):
            return 1.0
        g, r = sp.sympify(gen), sp.sympify(ref)
        size = lambda e: len(e.free_symbols) + len(list(e.preorder_traversal()))
        rel = abs(size(g) - size(r)) / (size(r) + 1e-6)
        return max(0.0, (0.6 - rel) / 0.6)
    except Exception:
        return 0.0

def build_prompt(problem: str, tok):
    sys = {
        "role": "system",
        "content": "You are Qwen, a principled physics tutor. Think step-by-step, then box the answer."
    }
    usr = {"role": "user", "content": problem}
    return tok.apply_chat_template([sys, usr], tokenize=False, add_generation_prompt=True)

def load_phybench(tok, cfg):
    ds = load_dataset(cfg.dataset_name,
                      data_files=cfg.split_full,
                      split="train")
    return ds.map(
        lambda e: {
            "prompt": build_prompt(e["content"], tok),
            "answer": extract_answer(e.get("solution", ""))
        },
        remove_columns=[c for c in ds.column_names if c not in {"prompt", "answer"}]
    )

In [4]:
# ─── Named reward function so logs show up under reward/phybench_reward/…
def phybench_reward(prompts: List[str], completions: List[str], **kwargs) -> List[float]:
    # grab the reference answers from the dataset mapping
    answers = kwargs["answer"]
    # compute exactly as before
    return [
        cfg.w_format * (1.0 if "\\boxed{" in comp[len(p):] else 0.0)
      + cfg.w_trace  * (1.0 if comp[len(p):].count("\n") >= 6 else 0.0)
      + cfg.w_correct * eed_score(extract_answer(comp[len(p):]), ref)
        for p, comp, ref in zip(prompts, completions, answers)
    ]

In [5]:
#──────── Main ─────────

parser = argparse.ArgumentParser()
for k, v in ScriptArgs.__dataclass_fields__.items():
    parser.add_argument(f"--{k}", type=type(v.default), default=v.default)
args, _ = parser.parse_known_args()
global cfg
cfg = ScriptArgs(**vars(args))

logging.basicConfig(level=logging.INFO)
logging.info("CONFIG\n%s", json.dumps(asdict(cfg), indent=2))

torch.manual_seed(cfg.seed)

tok = AutoTokenizer.from_pretrained(cfg.model_name, trust_remote_code=True)
tok.pad_token = tok.eos_token

bnb = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
model = AutoModelForCausalLM.from_pretrained(
        cfg.model_name,
        quantization_config=bnb,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        use_cache=True,
    )

lora = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=64,
        lora_alpha=16,
        lora_dropout=0.05,
        target_modules=["q_proj", "v_proj"],
    )

train_ds = load_phybench(tok, cfg)

grpo = GRPOConfig(
        # generation settings
        max_prompt_length=cfg.max_prompt_len,
        max_completion_length=cfg.max_new_tokens,
        num_generations=cfg.num_generations,
        temperature=cfg.temperature,
        top_p=cfg.top_p,

        # RL settings
        beta=0.1,                        # enable KL logging
        loss_type="grpo",                # default; you can also try "dr_grpo"
        learning_rate=cfg.lr,
        lr_scheduler_type="linear",      # to get learning_rate in logs
        warmup_ratio=cfg.warmup_ratio,

        # training schedule
        per_device_train_batch_size=cfg.per_device_batch,
        gradient_accumulation_steps=cfg.grad_accum,
        max_steps=cfg.max_steps,
        bf16=True,

        # logging
        logging_strategy="steps",
        logging_steps=1,
        log_level="info",
        report_to=["tensorboard"],       # or add "wandb"
        output_dir=cfg.output_dir,
        save_steps=100,
        save_total_limit=2,
    )

trainer = GRPOTrainer(
        model=model,
        reward_funcs=phybench_reward,
        args=grpo,
        train_dataset=train_ds,
        processing_class=tok,
        peft_config=lora,
        callbacks=[ConsoleLoggerCallback],
    )

trainer.train()
trainer.save_model(cfg.output_dir)

INFO:root:CONFIG
{
  "model_name": "Qwen/Qwen3-1.7B",
  "dataset_name": "Eureka-Lab/PHYBench",
  "split_full": "PHYBench-fullques_v1.json",
  "output_dir": "runs/qwen3_phybench_grpo",
  "max_prompt_len": 4096,
  "max_new_tokens": 1024,
  "temperature": 0.7,
  "top_p": 0.95,
  "lr": 5e-07,
  "max_steps": 550,
  "per_device_batch": 2,
  "grad_accum": 2,
  "seed": 42,
  "w_format": 0.2,
  "w_trace": 0.3,
  "w_correct": 0.5,
  "num_generations": 4,
  "warmup_ratio": 0.03
}


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

max_steps is given, it will override any value given in num_train_epochs
Using auto half precision backend
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
***** Running training *****
  Num examples = 100
  Num Epochs = 6
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 2
  Total optimization steps = 550
  Number of trainable parameters = 12,845,056
`generation_config` default values have been modified to match model-specific defaults: {'top_k': 20, 'bos_token_id': 151643}. If this is not desired, please set these values explicitly.


Step,Training Loss
1,0.0
2,0.0
3,0.0
4,0.0
5,0.0
6,0.0
7,0.0
8,0.0
9,0.0
10,0.0


[Step 1] loss=0.0000 grad_norm=0.0000 learning_rate=0.0000 num_tokens=5536.0000 completions/mean_length=1024.0000 completions/min_length=1024.0000 completions/max_length=1024.0000 completions/clipped_ratio=1.0000 completions/mean_terminated_length=0.0000 completions/min_terminated_length=0.0000 completions/max_terminated_length=0.0000 rewards/phybench_reward/mean=0.3000 rewards/phybench_reward/std=0.0000 reward=0.3000 reward_std=0.0000 kl=0.0000 clip_ratio/low_mean=0.0000 clip_ratio/low_min=0.0000 clip_ratio/high_mean=0.0000 clip_ratio/high_max=0.0000 clip_ratio/region_mean=0.0000
[Step 2] loss=0.0000 grad_norm=0.0000 learning_rate=0.0000 num_tokens=10224.0000 completions/mean_length=1024.0000 completions/min_length=1024.0000 completions/max_length=1024.0000 completions/clipped_ratio=1.0000 completions/mean_terminated_length=0.0000 completions/min_terminated_length=0.0000 completions/max_terminated_length=0.0000 rewards/phybench_reward/mean=0.3000 rewards/phybench_reward/std=0.0000 rew

Saving model checkpoint to runs/qwen3_phybench_grpo/checkpoint-100


[Step 100] loss=0.0002 grad_norm=0.0014 learning_rate=0.0000 num_tokens=513788.0000 completions/mean_length=1024.0000 completions/min_length=1024.0000 completions/max_length=1024.0000 completions/clipped_ratio=1.0000 completions/mean_terminated_length=0.0000 completions/min_terminated_length=0.0000 completions/max_terminated_length=0.0000 rewards/phybench_reward/mean=0.3000 rewards/phybench_reward/std=0.0000 reward=0.3000 reward_std=0.0000 kl=0.0018 clip_ratio/low_mean=0.0000 clip_ratio/low_min=0.0000 clip_ratio/high_mean=0.0000 clip_ratio/high_max=0.0000 clip_ratio/region_mean=0.0000


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-1.7B/snapshots/d3e258980a49b060055ea9038dad99d75923f7c4/config.json
Model config Qwen3Config {
  "architectures": [
    "Qwen3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 6144,
  "max_position_embeddings": 40960,
  "max_window_layers": 28,
  "model_type": "qwen3",
  "num_attention_heads": 16,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.51.3",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}

tokenizer config file saved in runs/qwen3_phybench_grpo/checkpoint-100/tokeni

[Step 101] loss=0.0001 grad_norm=0.0012 learning_rate=0.0000 num_tokens=520016.0000 completions/mean_length=1024.0000 completions/min_length=1024.0000 completions/max_length=1024.0000 completions/clipped_ratio=1.0000 completions/mean_terminated_length=0.0000 completions/min_terminated_length=0.0000 completions/max_terminated_length=0.0000 rewards/phybench_reward/mean=0.3000 rewards/phybench_reward/std=0.0000 reward=0.3000 reward_std=0.0000 kl=0.0012 clip_ratio/low_mean=0.0000 clip_ratio/low_min=0.0000 clip_ratio/high_mean=0.0000 clip_ratio/high_max=0.0000 clip_ratio/region_mean=0.0000
[Step 102] loss=0.0001 grad_norm=0.0011 learning_rate=0.0000 num_tokens=525116.0000 completions/mean_length=1024.0000 completions/min_length=1024.0000 completions/max_length=1024.0000 completions/clipped_ratio=1.0000 completions/mean_terminated_length=0.0000 completions/min_terminated_length=0.0000 completions/max_terminated_length=0.0000 rewards/phybench_reward/mean=0.3000 rewards/phybench_reward/std=0.0

Saving model checkpoint to runs/qwen3_phybench_grpo/checkpoint-200


[Step 200] loss=0.0002 grad_norm=0.0020 learning_rate=0.0000 num_tokens=1027576.0000 completions/mean_length=1024.0000 completions/min_length=1024.0000 completions/max_length=1024.0000 completions/clipped_ratio=1.0000 completions/mean_terminated_length=0.0000 completions/min_terminated_length=0.0000 completions/max_terminated_length=0.0000 rewards/phybench_reward/mean=0.3000 rewards/phybench_reward/std=0.0000 reward=0.3000 reward_std=0.0000 kl=0.0017 clip_ratio/low_mean=0.0000 clip_ratio/low_min=0.0000 clip_ratio/high_mean=0.0000 clip_ratio/high_max=0.0000 clip_ratio/region_mean=0.0000


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-1.7B/snapshots/d3e258980a49b060055ea9038dad99d75923f7c4/config.json
Model config Qwen3Config {
  "architectures": [
    "Qwen3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 6144,
  "max_position_embeddings": 40960,
  "max_window_layers": 28,
  "model_type": "qwen3",
  "num_attention_heads": 16,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.51.3",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}

tokenizer config file saved in runs/qwen3_phybench_grpo/checkpoint-200/tokeni

[Step 201] loss=0.0002 grad_norm=0.0019 learning_rate=0.0000 num_tokens=1033000.0000 completions/mean_length=1024.0000 completions/min_length=1024.0000 completions/max_length=1024.0000 completions/clipped_ratio=1.0000 completions/mean_terminated_length=0.0000 completions/min_terminated_length=0.0000 completions/max_terminated_length=0.0000 rewards/phybench_reward/mean=0.3000 rewards/phybench_reward/std=0.0000 reward=0.3000 reward_std=0.0000 kl=0.0017 clip_ratio/low_mean=0.0000 clip_ratio/low_min=0.0000 clip_ratio/high_mean=0.0000 clip_ratio/high_max=0.0000 clip_ratio/region_mean=0.0000
[Step 202] loss=0.0002 grad_norm=0.0036 learning_rate=0.0000 num_tokens=1038760.0000 completions/mean_length=1024.0000 completions/min_length=1024.0000 completions/max_length=1024.0000 completions/clipped_ratio=1.0000 completions/mean_terminated_length=0.0000 completions/min_terminated_length=0.0000 completions/max_terminated_length=0.0000 rewards/phybench_reward/mean=0.3000 rewards/phybench_reward/std=0

Saving model checkpoint to runs/qwen3_phybench_grpo/checkpoint-300


[Step 300] loss=0.0001 grad_norm=0.0562 learning_rate=0.0000 num_tokens=1541364.0000 completions/mean_length=1024.0000 completions/min_length=1024.0000 completions/max_length=1024.0000 completions/clipped_ratio=1.0000 completions/mean_terminated_length=0.0000 completions/min_terminated_length=0.0000 completions/max_terminated_length=0.0000 rewards/phybench_reward/mean=0.2250 rewards/phybench_reward/std=0.1500 reward=0.2250 reward_std=0.1500 kl=0.0014 clip_ratio/low_mean=0.0000 clip_ratio/low_min=0.0000 clip_ratio/high_mean=0.0000 clip_ratio/high_max=0.0000 clip_ratio/region_mean=0.0000


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-1.7B/snapshots/d3e258980a49b060055ea9038dad99d75923f7c4/config.json
Model config Qwen3Config {
  "architectures": [
    "Qwen3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 6144,
  "max_position_embeddings": 40960,
  "max_window_layers": 28,
  "model_type": "qwen3",
  "num_attention_heads": 16,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.51.3",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}

tokenizer config file saved in runs/qwen3_phybench_grpo/checkpoint-300/tokeni

[Step 301] loss=0.0002 grad_norm=0.0019 learning_rate=0.0000 num_tokens=1546744.0000 completions/mean_length=1024.0000 completions/min_length=1024.0000 completions/max_length=1024.0000 completions/clipped_ratio=1.0000 completions/mean_terminated_length=0.0000 completions/min_terminated_length=0.0000 completions/max_terminated_length=0.0000 rewards/phybench_reward/mean=0.3000 rewards/phybench_reward/std=0.0000 reward=0.3000 reward_std=0.0000 kl=0.0018 clip_ratio/low_mean=0.0000 clip_ratio/low_min=0.0000 clip_ratio/high_mean=0.0000 clip_ratio/high_max=0.0000 clip_ratio/region_mean=0.0000
[Step 302] loss=0.0002 grad_norm=0.0014 learning_rate=0.0000 num_tokens=1551444.0000 completions/mean_length=1024.0000 completions/min_length=1024.0000 completions/max_length=1024.0000 completions/clipped_ratio=1.0000 completions/mean_terminated_length=0.0000 completions/min_terminated_length=0.0000 completions/max_terminated_length=0.0000 rewards/phybench_reward/mean=0.3000 rewards/phybench_reward/std=0

Saving model checkpoint to runs/qwen3_phybench_grpo/checkpoint-400


[Step 400] loss=0.0002 grad_norm=0.0015 learning_rate=0.0000 num_tokens=2055152.0000 completions/mean_length=1024.0000 completions/min_length=1024.0000 completions/max_length=1024.0000 completions/clipped_ratio=1.0000 completions/mean_terminated_length=0.0000 completions/min_terminated_length=0.0000 completions/max_terminated_length=0.0000 rewards/phybench_reward/mean=0.3000 rewards/phybench_reward/std=0.0000 reward=0.3000 reward_std=0.0000 kl=0.0018 clip_ratio/low_mean=0.0000 clip_ratio/low_min=0.0000 clip_ratio/high_mean=0.0000 clip_ratio/high_max=0.0000 clip_ratio/region_mean=0.0000


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-1.7B/snapshots/d3e258980a49b060055ea9038dad99d75923f7c4/config.json
Model config Qwen3Config {
  "architectures": [
    "Qwen3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 6144,
  "max_position_embeddings": 40960,
  "max_window_layers": 28,
  "model_type": "qwen3",
  "num_attention_heads": 16,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.51.3",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}

tokenizer config file saved in runs/qwen3_phybench_grpo/checkpoint-400/tokeni

[Step 401] loss=0.0001 grad_norm=0.0012 learning_rate=0.0000 num_tokens=2060312.0000 completions/mean_length=1024.0000 completions/min_length=1024.0000 completions/max_length=1024.0000 completions/clipped_ratio=1.0000 completions/mean_terminated_length=0.0000 completions/min_terminated_length=0.0000 completions/max_terminated_length=0.0000 rewards/phybench_reward/mean=0.3000 rewards/phybench_reward/std=0.0000 reward=0.3000 reward_std=0.0000 kl=0.0014 clip_ratio/low_mean=0.0000 clip_ratio/low_min=0.0000 clip_ratio/high_mean=0.0000 clip_ratio/high_max=0.0000 clip_ratio/region_mean=0.0000
[Step 402] loss=0.0001 grad_norm=0.0013 learning_rate=0.0000 num_tokens=2065204.0000 completions/mean_length=1024.0000 completions/min_length=1024.0000 completions/max_length=1024.0000 completions/clipped_ratio=1.0000 completions/mean_terminated_length=0.0000 completions/min_terminated_length=0.0000 completions/max_terminated_length=0.0000 rewards/phybench_reward/mean=0.3000 rewards/phybench_reward/std=0

Saving model checkpoint to runs/qwen3_phybench_grpo/checkpoint-500


[Step 500] loss=0.0001 grad_norm=0.0517 learning_rate=0.0000 num_tokens=2568940.0000 completions/mean_length=1024.0000 completions/min_length=1024.0000 completions/max_length=1024.0000 completions/clipped_ratio=1.0000 completions/mean_terminated_length=0.0000 completions/min_terminated_length=0.0000 completions/max_terminated_length=0.0000 rewards/phybench_reward/mean=0.2250 rewards/phybench_reward/std=0.1500 reward=0.2250 reward_std=0.1500 kl=0.0013 clip_ratio/low_mean=0.0000 clip_ratio/low_min=0.0000 clip_ratio/high_mean=0.0000 clip_ratio/high_max=0.0000 clip_ratio/region_mean=0.0000


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-1.7B/snapshots/d3e258980a49b060055ea9038dad99d75923f7c4/config.json
Model config Qwen3Config {
  "architectures": [
    "Qwen3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 6144,
  "max_position_embeddings": 40960,
  "max_window_layers": 28,
  "model_type": "qwen3",
  "num_attention_heads": 16,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.51.3",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}

tokenizer config file saved in runs/qwen3_phybench_grpo/checkpoint-500/tokeni

[Step 501] loss=0.0002 grad_norm=0.0013 learning_rate=0.0000 num_tokens=2573860.0000 completions/mean_length=1024.0000 completions/min_length=1024.0000 completions/max_length=1024.0000 completions/clipped_ratio=1.0000 completions/mean_terminated_length=0.0000 completions/min_terminated_length=0.0000 completions/max_terminated_length=0.0000 rewards/phybench_reward/mean=0.3000 rewards/phybench_reward/std=0.0000 reward=0.3000 reward_std=0.0000 kl=0.0019 clip_ratio/low_mean=0.0000 clip_ratio/low_min=0.0000 clip_ratio/high_mean=0.0000 clip_ratio/high_max=0.0000 clip_ratio/region_mean=0.0000
[Step 502] loss=0.0002 grad_norm=0.0015 learning_rate=0.0000 num_tokens=2578592.0000 completions/mean_length=1024.0000 completions/min_length=1024.0000 completions/max_length=1024.0000 completions/clipped_ratio=1.0000 completions/mean_terminated_length=0.0000 completions/min_terminated_length=0.0000 completions/max_terminated_length=0.0000 rewards/phybench_reward/mean=0.3000 rewards/phybench_reward/std=0

Saving model checkpoint to runs/qwen3_phybench_grpo/checkpoint-550


[Step 550] loss=0.0002 grad_norm=0.0012 learning_rate=0.0000 num_tokens=2826644.0000 completions/mean_length=1024.0000 completions/min_length=1024.0000 completions/max_length=1024.0000 completions/clipped_ratio=1.0000 completions/mean_terminated_length=0.0000 completions/min_terminated_length=0.0000 completions/max_terminated_length=0.0000 rewards/phybench_reward/mean=0.3000 rewards/phybench_reward/std=0.0000 reward=0.3000 reward_std=0.0000 kl=0.0015 clip_ratio/low_mean=0.0000 clip_ratio/low_min=0.0000 clip_ratio/high_mean=0.0000 clip_ratio/high_max=0.0000 clip_ratio/region_mean=0.0000


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-1.7B/snapshots/d3e258980a49b060055ea9038dad99d75923f7c4/config.json
Model config Qwen3Config {
  "architectures": [
    "Qwen3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 6144,
  "max_position_embeddings": 40960,
  "max_window_layers": 28,
  "model_type": "qwen3",
  "num_attention_heads": 16,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.51.3",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}

tokenizer config file saved in runs/qwen3_phybench_grpo/checkpoint-550/tokeni

[Step 550] train_runtime=37000.2504 train_samples_per_second=0.0590 train_steps_per_second=0.0150 total_flos=0.0000 train_loss=0.0002


Saving model checkpoint to runs/qwen3_phybench_grpo
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-1.7B/snapshots/d3e258980a49b060055ea9038dad99d75923f7c4/config.json
Model config Qwen3Config {
  "architectures": [
    "Qwen3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 6144,
  "max_position_embeddings": 40960,
  "max_window_layers": 28,
  "model_type": "qwen3",
  "num_attention_heads": 16,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.51.3",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}

tokenizer config file sav

In [6]:
# ─── Produce a true FP32 snapshot ───
import os
from transformers      import AutoModelForCausalLM
from peft              import PeftModel

# 1️⃣ Re-load the base model in FP32 (no BitsAndBytesConfig)
base_fp32 = AutoModelForCausalLM.from_pretrained(
    cfg.model_name,
    torch_dtype=torch.float32,
    trust_remote_code=True,
    use_cache=True,
)

# 2️⃣ Apply & merge your just-trained LoRA adapters
merged = PeftModel.from_pretrained(base_fp32, cfg.output_dir).merge_and_unload()

# 3️⃣ Save that merged FP32 model + tokenizer
fp32_dir = os.path.join(cfg.output_dir, "fp32")
os.makedirs(fp32_dir, exist_ok=True)
merged.save_pretrained(fp32_dir)
tok   .save_pretrained(fp32_dir)

print(f"✅ Full FP32 model & tokenizer saved to '{fp32_dir}'")

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-1.7B/snapshots/d3e258980a49b060055ea9038dad99d75923f7c4/config.json
Model config Qwen3Config {
  "architectures": [
    "Qwen3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 6144,
  "max_position_embeddings": 40960,
  "max_window_layers": 28,
  "model_type": "qwen3",
  "num_attention_heads": 16,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}

loading weights file model.safetensors from cache at /root/.cache/huggingface/

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing Qwen3ForCausalLM.

All the weights of Qwen3ForCausalLM were initialized from the model checkpoint at Qwen/Qwen3-1.7B.
If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen3ForCausalLM for predictions without further training.
loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-1.7B/snapshots/d3e258980a49b060055ea9038dad99d75923f7c4/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 151643,
  "do_sample": true,
  "eos_token_id": [
    151645,
    151643
  ],
  "pad_token_id": 151643,
  "temperature": 0.6,
  "top_k": 20,
  "top_p": 0.95
}

Configuration saved in runs/qwen3_phybench_grpo/fp32/config.json
Configuration saved in runs/qwen3_phybench_grpo/fp32/generation_config.json
The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 2 checkpoint shards. You can f

✅ Full FP32 model & tokenizer saved to 'runs/qwen3_phybench_grpo/fp32'


In [7]:
# ─── Produce a true FP32 snapshot ───
import os
from transformers      import AutoModelForCausalLM
from peft              import PeftModel

# 1️⃣ Re-load the base model in FP32 (no BitsAndBytesConfig)
base_fp32 = AutoModelForCausalLM.from_pretrained(
    cfg.model_name,
    torch_dtype=torch.float32,
    trust_remote_code=True,
    use_cache=True,
)

# 2️⃣ Apply & merge your just-trained LoRA adapters
merged = PeftModel.from_pretrained(base_fp32, cfg.output_dir).merge_and_unload()

# 3️⃣ Save that merged FP32 model + tokenizer
fp32_dir = os.path.join(cfg.output_dir, "fp32")
os.makedirs(fp32_dir, exist_ok=True)
merged.save_pretrained(fp32_dir)
tok   .save_pretrained(fp32_dir)

print(f"✅ Full FP32 model & tokenizer saved to '{fp32_dir}'")

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-1.7B/snapshots/d3e258980a49b060055ea9038dad99d75923f7c4/config.json
Model config Qwen3Config {
  "architectures": [
    "Qwen3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 6144,
  "max_position_embeddings": 40960,
  "max_window_layers": 28,
  "model_type": "qwen3",
  "num_attention_heads": 16,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}

loading weights file model.safetensors from cache at /root/.cache/huggingface/

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing Qwen3ForCausalLM.

All the weights of Qwen3ForCausalLM were initialized from the model checkpoint at Qwen/Qwen3-1.7B.
If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen3ForCausalLM for predictions without further training.
loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-1.7B/snapshots/d3e258980a49b060055ea9038dad99d75923f7c4/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 151643,
  "do_sample": true,
  "eos_token_id": [
    151645,
    151643
  ],
  "pad_token_id": 151643,
  "temperature": 0.6,
  "top_k": 20,
  "top_p": 0.95
}

Configuration saved in runs/qwen3_phybench_grpo/fp32/config.json
Configuration saved in runs/qwen3_phybench_grpo/fp32/generation_config.json
The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 2 checkpoint shards. You can f

✅ Full FP32 model & tokenizer saved to 'runs/qwen3_phybench_grpo/fp32'
