In [1]:
import torch
torch.cuda.empty_cache()

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, GenerationConfig

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset

df = pd.read_json('./dataset/merged_data.json')

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)

print(f"训练集: {len(train_ds)}条 ({len(train_ds)/len(df):.1%})")
print(f"测试集: {len(test_ds)}条 ({len(test_ds)/len(df):.1%})")

训练集: 8003条 (80.0%)
测试集: 2001条 (20.0%)


In [None]:
tokenizer = AutoTokenizer.from_pretrained('./model/Qwen/Qwen2___5-7B-Instruct', use_fast=False, trust_remote_code=True)
tokenizer.padding_side = "left"  
tokenizer

Qwen2Tokenizer(name_or_path='/home/fz/finetune/model/Qwen/Qwen2___5-7B-Instruct', vocab_size=151643, model_max_length=131072, is_fast=False, padding_side='left', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False

In [None]:
def process_func(example):
    MAX_LENGTH = 1024   
    input_ids, attention_mask, labels = [], [], []
    instruction = tokenizer(f"""<|im_start|>system
                            你是一位老年服务机构的文书编辑，擅长将老人的口头叙述准确、清晰地转化为日常书面风格的文本，并判断语句中存在的错误类型。
                            句子中可能存在的错误类型：1. 句子成分缺失。2. 句子结构混乱。3. 句子成分错误。4. 句子成分冗余。<|im_end|>
                            <|im_start|>user
                            原文：{example['spoken_text']}
                            上下文(仅协助理解，不翻译): {example['context']}
                            仅输出原文那一句话的翻译结果和错误类型序号，不要输出思考过程，不要输出解释。输出格式：
                                翻译结果：
                                错误类型：<|im_end|>
                            <|im_start|>assistant
                            """, add_special_tokens=False)  
    # 构建response
    response = tokenizer(
        f"翻译结果：{example['written_text']}\n错误类型：{','.join(map(str, example['error_type']))}",
        add_special_tokens=False
    )
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]  
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]  
    if len(input_ids) > MAX_LENGTH: 
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

In [None]:
tokenized_train = train_ds.map(process_func, remove_columns=train_ds.column_names)

Map: 100%|██████████| 8008/8008 [00:29<00:00, 275.48 examples/s]


In [8]:
tokenizer.decode(tokenized_train[0]['input_ids'])

'<|im_start|>system\n                            你是一位老年服务机构的文书编辑，擅长将老人的口头叙述准确、清晰地转化为日常书面风格的文本，并判断语句中存在的错误类型。\n                            句子中可能存在的错误类型：1. 句子成分缺失。2. 句子结构混乱。3. 句子成分错误。4. 句子成分冗余。<|im_end|>\n                            <|im_start|>user\n                            原文：过去反正洗不了，上那个那边去，218（厂）人都拆了，上厕所就得上后边，这样前面得上厕所，你看这公共厕所。\n                            上下文(仅协助理解，不翻译): 那时候每家都有很多孩子，像我们这一代人家里通常有四五个孩子，你想想看，如果不扩建的话，十多平方米的房子怎么住得下。过去那种小平房的面积普遍很小，20多平方米的房间已经很少见了，大多数只有十几平方米，这样的面积在当时已经算是比较大的了。没错，八九平米到十多平米的情况很常见。您现在40平米的房子住了几口人？我们家有五口人，但孩子不在这里住，我和孙女他们住在这里。他们的儿子住在公租房里，房间里没有洗澡和上厕所的设施，洗澡只能在自己家里解决。现在是冬天，只能自己想办法了。过去反正洗不了，上那个那边去，218（厂）人都拆了，上厕所就得上后边，这样前面得上厕所，你看这公共厕所。岁数大了下个雪下个雨的，反正也不方便，尤其是岁数大，我们那边人80岁90岁的人大有人在。老百姓嘛也就这样，没办法，要有经济实力的，人家早就自己买房子了，好多人都就把希望寄托于拆迁吧。因为工薪阶层那几十块钱只够买馒头，没说买砖头的钱，哪有买砖头的钱，我们那时候住房公积金都特别少。也没赶上什么国家什么红利，事业单位的人没分过房的人，还有什么房屋补助，这企业哪有，什么都没有。\n                            仅输出原文那一句话的翻译结果和错误类型序号，不要输出思考过程，不要输出解释。输出格式：\n                                翻译结果：\n                                错误类型

In [9]:
tokenizer.decode(list(filter(lambda x: x != -100, tokenized_train[0]["labels"])))

'翻译结果：过去无法洗澡，只能去那边，218厂已经拆除了，上厕所需要到后面去，前面也有公共厕所。\n错误类型：1,2,4<|endoftext|>'

In [10]:
def print_processed_samples(dataset, num_samples=2):
    print("=== 开始检查处理后的数据 ===")
    for i in range(num_samples):
        sample = dataset[i]
        print(f"\n📌 样本 {i+1}:")
        
        # 解码input_ids
        input_text = tokenizer.decode(sample["input_ids"], skip_special_tokens=False)
        print("【完整输入文本】:")
        print(input_text.replace("<|im_start|>", "\n<|im_start|>"))  # 格式化换行
        
        # 解码labels（忽略-100）
        labels = [x if x != -100 else tokenizer.pad_token_id for x in sample["labels"]]
        print("\n【期望输出】:")
        print(tokenizer.decode(labels, skip_special_tokens=True))
        
        print("\n【元信息】:")
        print(f"input_ids长度: {len(sample['input_ids'])}")
        print(f"attention_mask: {sample['attention_mask'][:10]}...")  # 只显示前10个
        print("-"*50)

# 检查训练集前2个样本
print_processed_samples(tokenized_train, 2)


=== 开始检查处理后的数据 ===

📌 样本 1:
【完整输入文本】:

<|im_start|>system
                            你是一位老年服务机构的文书编辑，擅长将老人的口头叙述准确、清晰地转化为日常书面风格的文本，并判断语句中存在的错误类型。
                            句子中可能存在的错误类型：1. 句子成分缺失。2. 句子结构混乱。3. 句子成分错误。4. 句子成分冗余。<|im_end|>
                            
<|im_start|>user
                            原文：过去反正洗不了，上那个那边去，218（厂）人都拆了，上厕所就得上后边，这样前面得上厕所，你看这公共厕所。
                            上下文(仅协助理解，不翻译): 那时候每家都有很多孩子，像我们这一代人家里通常有四五个孩子，你想想看，如果不扩建的话，十多平方米的房子怎么住得下。过去那种小平房的面积普遍很小，20多平方米的房间已经很少见了，大多数只有十几平方米，这样的面积在当时已经算是比较大的了。没错，八九平米到十多平米的情况很常见。您现在40平米的房子住了几口人？我们家有五口人，但孩子不在这里住，我和孙女他们住在这里。他们的儿子住在公租房里，房间里没有洗澡和上厕所的设施，洗澡只能在自己家里解决。现在是冬天，只能自己想办法了。过去反正洗不了，上那个那边去，218（厂）人都拆了，上厕所就得上后边，这样前面得上厕所，你看这公共厕所。岁数大了下个雪下个雨的，反正也不方便，尤其是岁数大，我们那边人80岁90岁的人大有人在。老百姓嘛也就这样，没办法，要有经济实力的，人家早就自己买房子了，好多人都就把希望寄托于拆迁吧。因为工薪阶层那几十块钱只够买馒头，没说买砖头的钱，哪有买砖头的钱，我们那时候住房公积金都特别少。也没赶上什么国家什么红利，事业单位的人没分过房的人，还有什么房屋补助，这企业哪有，什么都没有。
                            仅输出原文那一句话的翻译结果和错误类型序号，不要输出思考过程，不要输出解释。输出格式：
                                翻译结果：
     

In [None]:
import torch

model = AutoModelForCausalLM.from_pretrained('./model/Qwen/Qwen2___5-7B-Instruct/', device_map="auto",torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2",use_cache=False)

model

Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.22it/s]


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 3584)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
          (k_proj): Linear(in_features=3584, out_features=512, bias=True)
          (v_proj): Linear(in_features=3584, out_features=512, bias=True)
          (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((3584,), eps=1e-06)
    (rotary_emb):

In [12]:
print(model.config._attn_implementation)

flash_attention_2


In [13]:
model.enable_input_require_grads() # 开启梯度检查点时，要执行该方法

In [14]:
model.dtype

torch.bfloat16

# lora

In [15]:
from peft import LoraConfig, TaskType, get_peft_model

config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, 
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=False, # 训练模式
    r=8, # Lora 秩
    lora_alpha=32, # Lora alaph，具体作用参见 Lora 原理
    lora_dropout=0.1 # Dropout 比例 0.1？
)
config

LoraConfig(task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=8, target_modules={'v_proj', 'q_proj', 'up_proj', 'k_proj', 'o_proj', 'gate_proj', 'down_proj'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)

In [16]:
model = get_peft_model(model, config)
config

LoraConfig(task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='/home/fz/finetune/model/Qwen/Qwen2___5-7B-Instruct/', revision=None, inference_mode=False, r=8, target_modules={'v_proj', 'q_proj', 'up_proj', 'k_proj', 'o_proj', 'gate_proj', 'down_proj'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)

In [17]:
model.print_trainable_parameters()

trainable params: 20,185,088 || all params: 7,635,801,600 || trainable%: 0.2643


# 配置训练参数

In [None]:
args = TrainingArguments(
    output_dir="./output/Qwen2.5-7B-Instruct/train2",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    logging_steps=10,
    num_train_epochs=3,
    save_steps=100,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    save_on_each_node=True,
    save_strategy = "steps",
    gradient_checkpointing=True
)

In [None]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding="longest",
    pad_to_multiple_of=8,  
    return_tensors="pt"     
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    data_collator=data_collator,
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [20]:
trainer.train()

Step,Training Loss
10,1.5926
20,0.9069
30,0.8329
40,0.8504
50,0.7745
60,0.7499
70,0.7612
80,0.7632
90,0.7968
100,0.7571


TrainOutput(global_step=1500, training_loss=0.5623032140731812, metrics={'train_runtime': 6450.9965, 'train_samples_per_second': 3.724, 'train_steps_per_second': 0.233, 'total_flos': 5.009711045218468e+17, 'train_loss': 0.5623032140731812, 'epoch': 2.995004995004995})

In [21]:
tokenized_test = test_ds.map(process_func, remove_columns=test_ds.column_names)

Map: 100%|██████████| 2002/2002 [00:11<00:00, 171.60 examples/s]


In [22]:
import gc
gc.collect()
torch.cuda.empty_cache()

model.config.use_cache = False
trainer.args.per_device_eval_batch_size = 1
trainer.evaluate(tokenized_test.select(range(5)))  # 先试 5 条


{'eval_loss': 0.7335529327392578,
 'eval_runtime': 0.4538,
 'eval_samples_per_second': 11.017,
 'eval_steps_per_second': 11.017,
 'epoch': 2.995004995004995}

In [23]:
trainer.evaluate(tokenized_test)

{'eval_loss': 0.6889927983283997,
 'eval_runtime': 169.3178,
 'eval_samples_per_second': 11.824,
 'eval_steps_per_second': 11.824,
 'epoch': 2.995004995004995}

# 合并加载模型

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from peft import PeftModel

mode_path = './model/Qwen/Qwen2___5-7B-Instruct/'
lora_path = './output/Qwen2.5-7B-Instruct/train2/checkpoint-1500/' # 这里改称你的 lora 输出对应 checkpoint 地址

# 加载tokenizer
tokenizer = AutoTokenizer.from_pretrained(mode_path, trust_remote_code=True)
tokenizer.padding_side = "left"

# 加载模型
model = AutoModelForCausalLM.from_pretrained(mode_path, device_map="auto",torch_dtype=torch.bfloat16, trust_remote_code=True).eval()

# 加载lora权重
model = PeftModel.from_pretrained(model, model_id=lora_path)


  from .autonotebook import tqdm as notebook_tqdm
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.96it/s]


In [None]:
torch.cuda.empty_cache()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # 将模型移动到 GPU 上
print(model.device)

cuda:0


In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
from datasets import Dataset

df = pd.read_json('./dataset/test_dataset.json')

test_ds = Dataset.from_pandas(df)
test_ds

Dataset({
    features: ['id', 'file_id', 'spoken_text', 'context', 'written_text', 'error_type', '__index_level_0__'],
    num_rows: 2001
})

In [4]:
model.device 

device(type='cuda', index=0)

In [None]:
import torch
import random
import json
import re
from tqdm import tqdm

# ✅ 解析预测结果中的“翻译结果”和“错误类型”
def extract_translation_and_error_type(pred_text):
    trans_match = re.search(r"翻译结果：\s*(.*?)\n", pred_text, re.DOTALL)
    translation = trans_match.group(1).strip() if trans_match else ""

    error_match = re.search(r"错误类型：\s*(.+)", pred_text)
    if error_match:
        error_str = error_match.group(1).strip()
        error_type = [int(x) for x in re.findall(r"\d+", error_str)]
    else:
        error_type = []

    return translation, error_type

tokenizer.padding_side = 'left'
test_samples =list(test_ds)

batch_size = 8
all_results = []

for i in tqdm(range(0, len(test_samples), batch_size), desc="Processing samples"):
    batch = test_samples[i:i+batch_size]

    # ✅ 构造输入
    prompts = [
        f"""<|im_start|>system
你是一位老年服务机构的文书编辑，擅长将老人的口头叙述准确、清晰地转化为日常书面风格的文本，并判断语句中存在的错误类型。
句子中可能存在的错误类型：1. 句子成分缺失。2. 句子结构混乱。3. 句子成分错误。4. 句子成分冗余。<|im_end|>
<|im_start|>user
原文：{example['spoken_text']}
上下文(仅协助理解，不翻译): {example['context']}
仅输出原文那一句话的翻译结果和错误类型序号，不要输出思考过程，不要输出解释。输出格式：
    翻译结果：
    错误类型：<|im_end|>
<|im_start|>assistant""" for example in batch
    ]

    inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding="longest",
        truncation=True,
    ).to(model.device)

    # ✅ 推理
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            num_beams=1,
            do_sample=False,
            temperature=0.2,
            top_p=0.95,
        )

    outputs = outputs[:, inputs['input_ids'].shape[1]:]
    preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # ✅ 处理每条预测结果
    for pred_text, example in zip(preds, batch):
        translation, predicted_error_type = extract_translation_and_error_type(pred_text)

        # 转换原始标签为数组
        true_error = example.get("error_type", [])
        if isinstance(true_error, int):
            true_error = [true_error]
        elif isinstance(true_error, str):
            true_error = [int(x) for x in re.findall(r"\d+", true_error)]

        result = {
            "spoken_text": example["spoken_text"],
            "context": example["context"],
            "reference": example["written_text"],
            "ref_error_type": true_error,
            "prediction": translation,
            "pred_error_type": predicted_error_type
        }
        all_results.append(result)

# ✅ 保存结果到 JSON 文件
output_file = "./result/lora/lora_predictions_qwen.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(all_results, f, ensure_ascii=False, indent=2)

print(f"✅ 预测结果已保存到 {output_file}")



Processing samples: 100%|██████████| 251/251 [11:49<00:00,  2.83s/it]

✅ 预测结果已保存到 model_predictions/lora_predictions_qwen.json





In [None]:
import json
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from tqdm import tqdm
import numpy as np

# 加载模型和 tokenizer
tokenizer = AutoTokenizer.from_pretrained("./bleurt-base-128", local_files_only=True)
model = AutoModelForSequenceClassification.from_pretrained("./bleurt-base-128", local_files_only=True)
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 加载数据
with open("./result/lora/lora_predictions_qwen.json", "r", encoding="utf-8") as f:
    data = json.load(f)

import importlib
import eval.evaluate as evaluate
importlib.reload(evaluate)  # 强制重新加载  
from eval.evaluate import calculate_all_metrics

# 存储所有指标
all_scores = []

for item in tqdm(data):
    reference = item["reference"]
    generated = item["prediction"]
    if not reference or not generated:
        continue  # 跳过空文本样本
    ref_error_types = item.get("ref_error_type", [])
    pred_error_types = item.get("pred_error_type", [])

    metrics = calculate_all_metrics(reference, generated, tokenizer, model, ref_error_types, pred_error_types, device)
    all_scores.append(metrics)

# 计算每个指标的平均值
average_scores = {}
for key in all_scores[0].keys():
    average_scores[key] = np.mean([score[key] for score in all_scores])

# 打印平均结果
print("各项指标的平均值：")
for key, value in average_scores.items():
    print(f"{key}: {value:.4f}")


  0%|          | 0/2001 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
Loading model cost 0.398 seconds.
DEBUG:jieba:Loading model cost 0.398 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.
  1%|          | 13/2001 [00:00<01:02, 31.58it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (132 > 128). Running this sequence through the model will result in indexing errors
100%|██████████| 2001/2001 [00:18<00:00, 109.73it/s]

各项指标的平均值：
BLEU-1: 0.6986
BLEU-2: 0.4935
BLEU-3: 0.3641
BLEU-4: 0.2789
ROUGE-1: 0.6483
ROUGE-2: 0.3754
ROUGE-L: 0.5943
BLEURT: 0.4148
Joint Accuracy: 0.3888
Acc-1: 0.8873



