In [1]:
from datasets import load_dataset

# Read the JSON file as a single "train" split and expose its "source"
# column under the name "query".
dataset = load_dataset(
    "json",
    split="train",
    data_files=r"/remote-home/yinzhitao/MaskedThought/MaskedThought-main/eval_output/llama/math/mft_rank16.json",
).rename_column("source", "query")
def flatten_query(example):
    """Collapse a list-valued ``query`` field to a single value.

    Args:
        example: A mapping with a ``'query'`` entry. It is modified in place.

    Returns:
        The same ``example`` object. If ``example['query']`` was a list it is
        replaced by its first element, or by ``""`` when the list is empty;
        any non-list value is left untouched.
    """
    query = example['query']
    if isinstance(query, list):
        # An empty list becomes "" rather than staying a list, so the mapped
        # column never mixes list and string values.
        example['query'] = query[0] if query else ""
    return example
# Flatten each record's "query", then drop the kd_data / judge / target columns.
dataset = dataset.map(flatten_query).remove_columns(["kd_data", "judge", "target"])


In [None]:
from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from datasets import load_dataset
import torch
from tqdm import tqdm

# Load the dataset.
# NOTE(review): this re-reads the same JSON file as the first cell and rebinds
# `dataset`, so the flatten_query / remove_columns steps from that cell are not
# applied to the data used below.
dataset = load_dataset(
    "json",
    data_files=r"/remote-home/yinzhitao/MaskedThought/MaskedThought-main/eval_output/llama/math/mft_rank16.json",
    split='train'
)
dataset = dataset.rename_column("source", "query")

# PPO configuration. `model_name` is a local checkpoint directory; it is reused
# below as the path for both the policy model and the tokenizer.
config = PPOConfig(
    model_name=r"/remote-home/yinzhitao/MaskedThought/MaskedThought-main/dpo_models_res/dpo_mft_lora32_mr0.4_srcNoMask_tgtMask/final_merged_mft_plus_dpo_lora",
    learning_rate=1.41e-5,
    batch_size=16,
    mini_batch_size=4,
)

# Configure 4-bit quantization (NF4 with double quantization, fp16 compute).
# The same config object is shared by the policy model and the reward model.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Option 1: build the value-head model directly from the checkpoint path,
# then add LoRA on top of it.
model = AutoModelForCausalLMWithValueHead.from_pretrained(
    config.model_name,
    quantization_config=bnb_config,
    device_map={"": 0},
    trust_remote_code=True,
    torch_dtype=torch.float16,
)

# Prepare the wrapped base model for k-bit (here 4-bit) training.
model.pretrained_model = prepare_model_for_kbit_training(model.pretrained_model)

# LoRA configuration.
lora_config = LoraConfig(
    r=32,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"],  # adjust to the model architecture
    bias="none",
)

# Apply LoRA to the pretrained-model part of the value-head wrapper.
# NOTE(review): the adapter is attached after the wrapper was constructed;
# confirm that the wrapper / PPOTrainer still recognise it as a PEFT model
# (this may affect how the reference model is created) -- TODO verify.
model.pretrained_model = get_peft_model(model.pretrained_model, lora_config)

# Load the tokenizer and reuse the EOS token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(config.model_name)
tokenizer.pad_token = tokenizer.eos_token

# Load the reward model (4-bit quantized) as a text-classification pipeline.
# NOTE(review): the recorded run output reports that `score.weight` was newly
# initialized for this checkpoint, i.e. the classification head looks
# untrained -- confirm the reward model path before trusting its scores.
reward_model = pipeline(
    "text-classification",
    model=r"/remote-home/yinzhitao/MaskedThought/MaskedThought-main/reward_model",
    model_kwargs={"quantization_config": bnb_config, "device_map": {"": 0}},
)

def tokenize(sample):
    """Tokenize one record's ``query`` into fixed-length model inputs.

    A list-valued ``query`` is first replaced by its first element (or ``""``
    when the list is empty). The text is then encoded with the module-level
    ``tokenizer``, truncated and padded to exactly 128 tokens.

    Args:
        sample: A mapping with a ``"query"`` entry. It is modified in place.

    Returns:
        The same ``sample`` with ``"input_ids"`` and ``"attention_mask"``
        added as squeezed PyTorch tensors.
    """
    text = sample["query"]
    if isinstance(text, list):
        text = text[0] if text else ""
        sample["query"] = text

    # Pad and truncate so every example has the same length.
    features = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors="pt",
    )
    for field in ("input_ids", "attention_mask"):
        # Drop the leading batch axis added by return_tensors="pt".
        sample[field] = features[field].squeeze()
    return sample
dataset = dataset.map(tokenize, batched=False)


def collate_as_lists(examples):
    """Collate dataset rows into a dict of per-column Python lists.

    Each column stays a list with one entry per sample. The DataLoader's
    default collate would instead merge the per-sample ``input_ids`` lists
    position-wise, which is not the per-sample layout that
    ``PPOTrainer.generate`` / ``PPOTrainer.step`` expect.

    Args:
        examples: Non-empty list of row dicts that share the same keys.

    Returns:
        ``{column: [row[column] for row in examples]}``.
    """
    return {key: [example[key] for example in examples] for key in examples[0]}


# Initialise the PPOTrainer with the list-preserving collator.
ppo_trainer = PPOTrainer(
    model=model,
    config=config,
    dataset=dataset,
    tokenizer=tokenizer,
    data_collator=collate_as_lists,
)

# Sampling parameters used for response generation.
generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
    "max_length": 256,  # cap on total sequence length (prompt + response)
}

# Training loop: one PPO optimisation step per dataloader batch.
# tqdm wraps the dataloader itself so the progress bar knows the total.
for batch in tqdm(ppo_trainer.dataloader, desc="PPO step"):
    # Build one 1-D tensor per sample and strip the padding added by
    # tokenize(): the pad token is the EOS token, so leaving it in would make
    # padding look like prompt content and cut responses at the first EOS.
    query_tensors = [
        torch.as_tensor(ids)[torch.as_tensor(mask).bool()]
        for ids, mask in zip(batch["input_ids"], batch["attention_mask"])
    ]

    # Generate responses. For a list of queries the trainer pads and masks
    # each mini-batch itself, so no attention_mask is forwarded.
    # return_prompt=False keeps only the newly generated tokens; otherwise the
    # prompt would appear twice in step() and in the reward-model input.
    response_tensors = ppo_trainer.generate(
        query_tensors,
        return_prompt=False,
        **generation_kwargs,
    )
    batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]

    # Score each "query + response" text with the reward model.
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    pipe_outputs = reward_model(texts)

    # Assumes the reward model emits outputs such as
    # [{'label': 'positive', 'score': 0.9}, ...].
    # NOTE(review): if the checkpoint's label names differ (for example the
    # default "LABEL_0"/"LABEL_1"), every reward becomes -1.0 -- confirm the
    # reward model's id2label mapping.
    rewards = [
        torch.tensor(1.0 if output["label"] == "positive" else -1.0)
        for output in pipe_outputs
    ]

    # Run one PPO optimisation step and log its statistics.
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

# Save the trained model and the tokenizer.
model.save_pretrained("my_ppo_model_lora")
tokenizer.save_pretrained("my_ppo_model_lora")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /remote-home/yinzhitao/MaskedThought/MaskedThought-main/reward_model and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


Map:   0%|          | 0/1319 [00:00<?, ? examples/s]

0it [00:00, ?it/s]You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
0it [00:49, ?it/s]


KeyboardInterrupt: 