In [None]:
"""
功能：评估翻译任务模型输出
"""
import json
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from tqdm import tqdm
import numpy as np

# Select the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and the sequence-classification model from the local
# ./bleurt-base-128 directory (local_files_only=True restricts loading to
# files already on disk).
tokenizer = AutoTokenizer.from_pretrained("./bleurt-base-128", local_files_only=True)
model = AutoModelForSequenceClassification.from_pretrained(
    "./bleurt-base-128",
    local_files_only=True,
)
model.to(device)
model.eval()  # switch the model to evaluation mode

# Read the JSON file that holds the samples to be scored.
with open("./result/translate/translated_output.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Import `evaluate` and force a reload each time this code runs, so that edits
# to that module are picked up without restarting the interpreter.
# NOTE(review): the name `evaluate` coincides with the Hugging Face `evaluate`
# package; presumably this resolves to a project-local module that defines
# calculate_all_metrics_en — confirm the import path.
import importlib
import evaluate
importlib.reload(evaluate)  # force the module to be reloaded
from evaluate import calculate_all_metrics_en

# Per-sample metric dicts returned by calculate_all_metrics_en.
all_scores = []

for item in tqdm(data):
    reference = item.get("written_text_en")
    # Field holding the text under evaluation; other fields used in other runs:
    # cs2w_en claude_5shot_en baichuan_en gpt5shot_en mistral_en
    generated = item.get("spoken_text_en")
    if not reference or not generated:
        continue  # skip samples whose reference or generated text is missing/empty

    metrics = calculate_all_metrics_en(reference, generated, tokenizer, model, device)
    all_scores.append(metrics)

# Average every metric over the scored samples. The metric names are taken
# from the first scored sample, so every sample is expected to carry the same
# keys.
average_scores = {}
if not all_scores:
    # Every sample was skipped (or the data was empty): report this instead of
    # failing with an IndexError on all_scores[0].
    print("No valid samples were scored; nothing to average.")
else:
    average_scores = {
        key: np.mean([score[key] for score in all_scores])
        for key in all_scores[0]
    }

    # Print the averages, one value per line, in the key order of the metric
    # dict (metric names themselves are not printed).
    print("各项指标的平均值：")
    for value in average_scores.values():
        print(f"{value:.4f}")


100%|██████████| 100/100 [00:00<00:00, 116.57it/s]

各项指标的平均值：
0.5038
0.2983
0.2044
0.1495
0.5356
0.3249
0.4854
-0.3195



