In [None]:
# 加载原始模型
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

mode_path = './model/Qwen/Qwen2___5-7B-Instruct'  # path to the original (base) model

# Load the tokenizer. Left padding keeps the prompt adjacent to the generated
# tokens, which is what a decoder-only model needs for batched generation.
tokenizer = AutoTokenizer.from_pretrained(mode_path, trust_remote_code=True)
tokenizer.padding_side = "left"
if tokenizer.pad_token is None:
    # Fall back to EOS so that batched padding is possible at all.
    tokenizer.pad_token = tokenizer.eos_token

# Load the original CausalLM (no LoRA weights are attached here).
# device_map="auto" already places the weights on the available device(s),
# so the model must NOT be moved again with model.to(device): that call is
# redundant on a single GPU and warns/fails when the weights are sharded or
# offloaded across several devices.
model = AutoModelForCausalLM.from_pretrained(
    mode_path,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
).eval()

# Kept in scope for any cell that references `device`; generation code should
# prefer `model.device`, which reflects where the weights actually live.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Display the module tree as the cell output.
model


  from .autonotebook import tqdm as notebook_tqdm
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00,  1.59s/it]


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 3584)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
          (k_proj): Linear(in_features=3584, out_features=512, bias=True)
          (v_proj): Linear(in_features=3584, out_features=512, bias=True)
          (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((3584,), eps=1e-06)
    (rotary_emb):

In [None]:
# 加载训练集和测试集
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Read the merged corpus and carve out a fixed 80/20 train/test split.
df = pd.read_json('./dataset/merged_data.json')
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Wrap each pandas split as a Hugging Face Dataset.
train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)

# Report the size of each split and its share of the full corpus.
for split_label, split_ds in (("训练集", train_ds), ("测试集", test_ds)):
    print(f"{split_label}: {len(split_ds)}条 ({len(split_ds)/len(df):.1%})")

训练集: 8003条 (80.0%)
测试集: 2001条 (20.0%)


In [3]:
# Sanity check: number of test samples, then the content of the first record.
num_test_rows = len(test_ds)
print(num_test_rows)
first_example = test_ds[0]
print(first_example)

2001
{'id': 2819, 'file_id': 163, 'spoken_text': '我没有退休费，老头有退休费，我们俩都是年龄大要的孩子，孩子才40多岁你想想。', 'context': '他已经83岁了，我82岁，我们都已年过八旬。我没有退休费，老头有退休费，我们俩都是年龄大要的孩子，孩子才40多岁你想想。这位82岁的奶奶，为什么没有退休金呢，40岁了才有孩子，听听奶奶是怎么说的。蒲扇挺好的，蒲扇这多少年了这，这我记得我小时候我奶奶就扇着这个哄我睡觉。哈哈哈哈，今天我们还发呢，我们这80岁以上的这给我们一人一把。您80岁了，可不你看这一脸这这老人斑，看您这个身体挺好挺健康的哈。', 'written_text': '我没有退休金，丈夫有退休金，我们俩都是年纪较大时才要的孩子，孩子现在才40多岁。', 'error_type': [1, 2, 4], '__index_level_0__': 2818}


In [None]:
# 测试集
from sklearn.model_selection import train_test_split
import pandas as pd
from datasets import Dataset

# Read the previously saved test split and wrap it as a Hugging Face Dataset.
df = pd.read_json(
    './dataset/test_dataset.json'
)
test_ds = Dataset.from_pandas(df)

# Show the dataset summary (features and row count) as the cell output.
test_ds

Dataset({
    features: ['id', 'file_id', 'spoken_text', 'context', 'written_text', 'error_type', '__index_level_0__'],
    num_rows: 2001
})

In [None]:
# 保存模型预测结果
import torch
import random
import json
import re
from tqdm import tqdm

import re

def extract_translation_and_error_type(pred_text):
    """Parse a model reply into its translation and its error-type numbers.

    The reply is expected to contain a "翻译结果：" section and a "错误类型："
    section, either on separate lines or on the same line.

    Args:
        pred_text: Decoded model output for a single sample.

    Returns:
        A ``(translation, error_type)`` tuple. ``translation`` is the stripped
        text following "翻译结果" (an empty string when that label is absent).
        ``error_type`` is the sorted list of distinct digits 1-4 found on the
        "错误类型" line (an empty list when that label is absent).
    """
    # The translation ends at the first newline, at the "错误类型" label, or at
    # the end of the text. The end-of-text alternative matters when the model
    # emits only the translation with no trailing newline; without it the
    # whole match fails and the translation is lost.
    trans_match = re.search(
        r"翻译结果[:：]?\s*(.*?)(?:\n|错误类型[:：]|\Z)", pred_text, re.DOTALL
    )
    translation = trans_match.group(1).strip() if trans_match else ""

    # Grab the remainder of the line that follows the "错误类型" label.
    error_type_block_match = re.search(r"错误类型[:：]([^\n]*)", pred_text)
    error_type_line = error_type_block_match.group(1).strip() if error_type_block_match else ""

    # Collect every digit 1-4 regardless of the separator used
    # (e.g. "1,3,4", "1，2，3" or "1. 句子成分缺失：..."), then de-duplicate and sort.
    nums = re.findall(r"[1-4]", error_type_line)
    error_type = sorted(set(int(n) for n in nums))

    return translation, error_type


import os  # local import: only needed to create the output directory below

# Use left padding (appropriate for decoder-only architectures) so that the
# generated tokens directly follow each prompt in a padded batch.
tokenizer.padding_side = 'left'

test_samples = list(test_ds)

batch_size = 8
all_results = []

for i in tqdm(range(0, len(test_samples), batch_size), desc="Processing samples"):
    batch = test_samples[i:i+batch_size]

    # Build one ChatML-style prompt per sample.
    # NOTE(review): the continuation lines of this f-string are indented in the
    # source, so every prompt line after the first starts with 12 spaces. The
    # text is intentionally left byte-identical here so results stay comparable
    # with earlier runs; confirm whether that whitespace matches the format
    # used elsewhere (e.g. during fine-tuning).
    prompts = [
        f"""<|im_start|>system
            你是一位老年服务机构的文书编辑，擅长将老人的口头叙述准确、清晰地转化为日常书面风格的文本，并判断语句中存在的错误类型。
            句子中可能存在的错误类型：1. 句子成分缺失。2. 句子结构混乱。3. 句子成分错误。4. 句子成分冗余。<|im_end|>
            <|im_start|>user
            原文：{example['spoken_text']}
            上下文(仅协助理解，不翻译): {example['context']}
            仅输出原文那一句话的翻译结果和错误类型序号，不要输出思考过程，不要输出解释。请在你预测的翻译结果前写“翻译结果：”，错误类型前写“错误类型：”。输出格式：
            翻译结果：
            错误类型：<|im_end|>
            <|im_start|>assistant""" for example in batch
    ]

    inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding="longest",
        truncation=True,
    ).to(model.device)

    # Greedy decoding. temperature/top_p are deliberately not passed: they are
    # ignored when do_sample=False and only produce warnings.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            num_beams=1,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
        )

    # With left padding every prompt occupies the same leading columns, so
    # slicing at the padded prompt length keeps only the newly generated tokens.
    outputs = outputs[:, inputs['input_ids'].shape[1]:]
    preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # Post-process each prediction together with its source sample.
    for pred_text, example in zip(preds, batch):
        translation, predicted_error_type = extract_translation_and_error_type(pred_text)

        # Normalise the gold label to a list of ints (it may be an int, a
        # string containing digits, or already a list).
        true_error = example.get("error_type", [])
        if isinstance(true_error, int):
            true_error = [true_error]
        elif isinstance(true_error, str):
            true_error = [int(x) for x in re.findall(r"\d+", true_error)]

        result = {
            "spoken_text": example["spoken_text"],
            "context": example["context"],
            "reference": example["written_text"],
            "ref_error_type": true_error,
            "prediction": translation,
            "pred_error_type": predicted_error_type
        }
        all_results.append(result)

# Save the results as JSON. Create the target directory first so that a
# missing folder cannot discard the whole inference run at the final step.
output_file = "./lora/base_qwen.json"
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(all_results, f, ensure_ascii=False, indent=2)

print(f"✅ 预测结果已保存到 {output_file}")

Processing samples: 100%|██████████| 251/251 [07:21<00:00,  1.76s/it]

✅ 预测结果已保存到 /home/fz/finetune/model_predictions/test_predictions_qwen.json





In [None]:
# 评估模型预测结果
import json
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from tqdm import tqdm
import numpy as np

# Load the BLEURT scorer under its own names. Reusing the global names
# `tokenizer` / `model` would overwrite the generation model loaded earlier,
# so re-running the inference cell afterwards would silently use the wrong
# model and tokenizer.
bleurt_tokenizer = AutoTokenizer.from_pretrained("./bleurt-base-128", local_files_only=True)
bleurt_model = AutoModelForSequenceClassification.from_pretrained("./bleurt-base-128", local_files_only=True)
bleurt_model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bleurt_model.to(device)

# Load the saved predictions.
with open("./lora/base_qwen.json", "r", encoding="utf-8") as f:
    data = json.load(f)

import importlib
import evaluate
importlib.reload(evaluate)  # force a reload so edits to the metric functions are picked up
from evaluate import calculate_all_metrics

# Per-sample metric dictionaries.
all_scores = []
# Samples excluded because the reference or the prediction is empty; these
# are not part of the averages, so the count is reported alongside them.
skipped_count = 0

for item in tqdm(data):
    reference = item["reference"]
    generated = item["prediction"]
    if not reference or not generated:
        skipped_count += 1
        continue  # skip samples with empty text
    ref_error_types = item.get("ref_error_type", [])
    pred_error_types = item.get("pred_error_type", [])

    metrics = calculate_all_metrics(
        reference, generated, bleurt_tokenizer, bleurt_model,
        ref_error_types, pred_error_types, device,
    )
    all_scores.append(metrics)

# Fail with a clear message instead of an IndexError on all_scores[0].
if not all_scores:
    raise ValueError("没有可评估的样本：所有样本的参考文本或预测文本均为空")

# Average each metric over the evaluated samples.
average_scores = {}
for key in all_scores[0].keys():
    average_scores[key] = np.mean([score[key] for score in all_scores])

# Print the averaged results.
print(f"已跳过 {skipped_count}/{len(data)} 条空文本样本（未计入平均值）")
print("各项指标的平均值：")
for key, value in average_scores.items():
    print(f"{key}: {value:.4f}")

  0%|          | 0/2001 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
Loading model cost 0.425 seconds.
DEBUG:jieba:Loading model cost 0.425 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.
  0%|          | 1/2001 [00:00<15:07,  2.20it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (150 > 128). Running this sequence through the model will result in indexing errors
100%|██████████| 2001/2001 [00:18<00:00, 108.97it/s]

各项指标的平均值：
BLEU-1: 0.6423
BLEU-2: 0.3974
BLEU-3: 0.2573
BLEU-4: 0.1727
ROUGE-1: 0.5718
ROUGE-2: 0.2647
ROUGE-L: 0.5094
BLEURT: 0.3244
Joint Accuracy: 0.1299
Acc-1: 0.8056



