In [None]:
"""
功能：评估gpt 0-shot和5-shot的输出
"""
# Scores the GPT predictions stored in RESULT_PATH against their references
# and prints the mean of every metric returned by calculate_all_metrics.
import importlib
import json

import numpy as np
import torch
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer

import eval.evaluate as evaluate

# Force a reload so the current contents of eval/evaluate.py are used,
# then re-bind the name from the freshly reloaded module.
importlib.reload(evaluate)
from eval.evaluate import calculate_all_metrics

# Local BLEURT checkpoint directory and the result file scored by this cell.
# Only the 5-shot file is loaded here; change RESULT_PATH to score another run.
BLEURT_DIR = "./bleurt-base-128"
RESULT_PATH = "./result/gpt/gpt_5shot.json"

# Load the model and tokenizer (local files only, no hub download).
tokenizer = AutoTokenizer.from_pretrained(BLEURT_DIR, local_files_only=True)
model = AutoModelForSequenceClassification.from_pretrained(BLEURT_DIR, local_files_only=True)
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load the records to evaluate.
with open(RESULT_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

# One metrics dict per scored sample.
all_scores = []

# NOTE(review): the recorded run logs a tokenizer warning that some inputs
# exceed the 128-token limit. Presumably the tokenizer is invoked inside
# calculate_all_metrics, so truncation has to be configured there - confirm.

# Pure evaluation: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    for item in tqdm(data):
        reference = item["reference"]
        generated = item["prediction"]
        if not reference or not generated:
            continue  # skip samples with an empty reference or prediction
        ref_error_types = item.get("ref_error_type", [])
        pred_error_types = item.get("pred_error_type", [])

        metrics = calculate_all_metrics(
            reference, generated, tokenizer, model,
            ref_error_types, pred_error_types, device,
        )
        all_scores.append(metrics)

# Without this guard an empty file (or one where every sample was skipped)
# would fail below with an opaque IndexError on all_scores[0].
if not all_scores:
    raise ValueError(
        f"No scorable samples in {RESULT_PATH}: no record has both a "
        "non-empty reference and a non-empty prediction."
    )

# Mean of every metric; the metric names are taken from the first sample.
average_scores = {
    key: np.mean([score[key] for score in all_scores])
    for key in all_scores[0]
}

# Print the averaged results.
print("各项指标的平均值：")
for key, value in average_scores.items():
    print(f"{key}: {value:.4f}")

 12%|█▏        | 247/2002 [00:02<00:15, 111.89it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (137 > 128). Running this sequence through the model will result in indexing errors
100%|██████████| 2002/2002 [00:17<00:00, 112.68it/s]

各项指标的平均值：
BLEU-1: 0.7221
BLEU-2: 0.5170
BLEU-3: 0.3851
BLEU-4: 0.2956
ROUGE-1: 0.6639
ROUGE-2: 0.3940
ROUGE-L: 0.6091
BLEURT: 0.4063
Joint Accuracy: 0.1084
Acc-1: 0.8401





In [None]:
"""
功能：评估cs2w的输出
"""
# Scores the cs2w outputs stored in RESULT_PATH against their annotations
# and prints the mean of every metric returned by calculate_all_metrics.
import importlib
import json

import numpy as np
import torch
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer

import eval.evaluate as evaluate

# Force a reload so the current contents of eval/evaluate.py are used,
# then re-bind the name from the freshly reloaded module.
importlib.reload(evaluate)
from eval.evaluate import calculate_all_metrics

# Local BLEURT checkpoint directory and the result file scored by this cell.
BLEURT_DIR = "./bleurt-base-128"
RESULT_PATH = "./result/cs2w/cs2w.json"

# Load the model and tokenizer (local files only, no hub download).
tokenizer = AutoTokenizer.from_pretrained(BLEURT_DIR, local_files_only=True)
model = AutoModelForSequenceClassification.from_pretrained(BLEURT_DIR, local_files_only=True)
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load the records to evaluate.
with open(RESULT_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

# One metrics dict per scored sample.
all_scores = []

# NOTE(review): the recorded run logs a tokenizer warning that some inputs
# exceed the 128-token limit. Presumably the tokenizer is invoked inside
# calculate_all_metrics, so truncation has to be configured there - confirm.

# Pure evaluation: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    for item in tqdm(data):
        reference = item["annotation"]
        generated = item["cs2w"]
        if not reference or not generated:
            continue  # skip samples with an empty reference or prediction
        # NOTE(review): if the cs2w records carry neither "ref_error_type"
        # nor "pred_error_type", both lists default to [] and compare equal,
        # which would explain Joint Accuracy / Acc-1 of exactly 1.0000 in the
        # recorded run. Verify against the data before reporting those two.
        ref_error_types = item.get("ref_error_type", [])
        pred_error_types = item.get("pred_error_type", [])

        metrics = calculate_all_metrics(
            reference, generated, tokenizer, model,
            ref_error_types, pred_error_types, device,
        )
        all_scores.append(metrics)

# Without this guard an empty file (or one where every sample was skipped)
# would fail below with an opaque IndexError on all_scores[0].
if not all_scores:
    raise ValueError(
        f"No scorable samples in {RESULT_PATH}: no record has both a "
        "non-empty reference and a non-empty prediction."
    )

# Mean of every metric; the metric names are taken from the first sample.
average_scores = {
    key: np.mean([score[key] for score in all_scores])
    for key in all_scores[0]
}

# Print the averaged results.
print("各项指标的平均值：")
for key, value in average_scores.items():
    print(f"{key}: {value:.4f}")

  1%|          | 24/2001 [00:00<00:17, 114.52it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (134 > 128). Running this sequence through the model will result in indexing errors
100%|██████████| 2001/2001 [00:17<00:00, 114.12it/s]

各项指标的平均值：
BLEU-1: 0.6342
BLEU-2: 0.3483
BLEU-3: 0.2003
BLEU-4: 0.1201
ROUGE-1: 0.5216
ROUGE-2: 0.1952
ROUGE-L: 0.4599
BLEURT: 0.2834
Joint Accuracy: 1.0000
Acc-1: 1.0000





In [None]:
"""
功能：评估claude的输出
"""
# Scores the Claude 5-shot outputs stored in RESULT_PATH against the written
# reference text and prints the mean of every metric returned by
# calculate_all_metrics.
import importlib
import json

import numpy as np
import torch
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer

import eval.evaluate as evaluate

# Force a reload so the current contents of eval/evaluate.py are used,
# then re-bind the name from the freshly reloaded module.
importlib.reload(evaluate)
from eval.evaluate import calculate_all_metrics

# Local BLEURT checkpoint directory, the result file scored by this cell, and
# the per-record key that holds the model response being evaluated.
BLEURT_DIR = "./bleurt-base-128"
RESULT_PATH = "./result/claude/claude_output.json"
RESPONSE_KEY = "claude_5shot"

# Load the model and tokenizer (local files only, no hub download).
tokenizer = AutoTokenizer.from_pretrained(BLEURT_DIR, local_files_only=True)
model = AutoModelForSequenceClassification.from_pretrained(BLEURT_DIR, local_files_only=True)
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load the records to evaluate.
with open(RESULT_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

# One metrics dict per scored sample.
all_scores = []

# NOTE(review): the recorded run logs a tokenizer warning that some inputs
# exceed the 128-token limit. Presumably the tokenizer is invoked inside
# calculate_all_metrics, so truncation has to be configured there - confirm.

# Pure evaluation: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    for item in tqdm(data):
        # Look the response up once; it is required, so a missing key is an
        # error (KeyError) rather than a silently skipped sample.
        response = item[RESPONSE_KEY]
        reference = item["written_text"]
        generated = response["translation"]
        if not reference or not generated:
            continue  # skip samples with an empty reference or prediction
        ref_error_types = item.get("error_type", [])
        pred_error_types = response.get("error_type", [])

        metrics = calculate_all_metrics(
            reference, generated, tokenizer, model,
            ref_error_types, pred_error_types, device,
        )
        all_scores.append(metrics)

# Without this guard an empty file (or one where every sample was skipped)
# would fail below with an opaque IndexError on all_scores[0].
if not all_scores:
    raise ValueError(
        f"No scorable samples in {RESULT_PATH}: no record has both a "
        "non-empty reference and a non-empty prediction."
    )

# Mean of every metric; the metric names are taken from the first sample.
average_scores = {
    key: np.mean([score[key] for score in all_scores])
    for key in all_scores[0]
}

# Print the averaged results.
print("各项指标的平均值：")
for key, value in average_scores.items():
    print(f"{key}: {value:.4f}")

  1%|          | 24/2001 [00:00<00:17, 114.61it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (141 > 128). Running this sequence through the model will result in indexing errors
100%|██████████| 2001/2001 [00:17<00:00, 111.30it/s]

各项指标的平均值：
BLEU-1: 0.6988
BLEU-2: 0.4816
BLEU-3: 0.3455
BLEU-4: 0.2577
ROUGE-1: 0.6412
ROUGE-2: 0.3558
ROUGE-L: 0.5846
BLEURT: 0.4052
Joint Accuracy: 0.1713
Acc-1: 0.8116





In [None]:
"""
功能：评估消融实验下各种模型的输出
"""
# Scores one model response from the context-ablation result file against the
# written reference text and prints the mean of every metric returned by
# calculate_all_metrics.
import importlib
import json

import numpy as np
import torch
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer

import eval.evaluate as evaluate

# Force a reload so the current contents of eval/evaluate.py are used,
# then re-bind the name from the freshly reloaded module.
importlib.reload(evaluate)
from eval.evaluate import calculate_all_metrics

# Local BLEURT checkpoint directory, the result file scored by this cell, and
# the per-record key that selects which response is evaluated.
BLEURT_DIR = "./bleurt-base-128"
RESULT_PATH = "./result/context/context_exp_claude_gpt.json"
RESPONSE_KEY = "gpt_full_context_response"

# Load the model and tokenizer (local files only, no hub download).
tokenizer = AutoTokenizer.from_pretrained(BLEURT_DIR, local_files_only=True)
model = AutoModelForSequenceClassification.from_pretrained(BLEURT_DIR, local_files_only=True)
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load the records to evaluate.
with open(RESULT_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

# One metrics dict per scored sample.
all_scores = []

# NOTE(review): the recorded run logs a tokenizer warning that some inputs
# exceed the 128-token limit. Presumably the tokenizer is invoked inside
# calculate_all_metrics, so truncation has to be configured there - confirm.

# Pure evaluation: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    for item in tqdm(data):
        # Look the response up once; it is required, so a missing key is an
        # error (KeyError) rather than a silently skipped sample.
        response = item[RESPONSE_KEY]
        reference = item["written_text"]
        generated = response["translation"]
        if not reference or not generated:
            continue  # skip samples with an empty reference or prediction
        ref_error_types = item.get("error_type", [])
        pred_error_types = response.get("error_type", [])

        metrics = calculate_all_metrics(
            reference, generated, tokenizer, model,
            ref_error_types, pred_error_types, device,
        )
        all_scores.append(metrics)

# Without this guard an empty file (or one where every sample was skipped)
# would fail below with an opaque IndexError on all_scores[0].
if not all_scores:
    raise ValueError(
        f"No scorable samples in {RESULT_PATH}: no record has both a "
        "non-empty reference and a non-empty prediction."
    )

# Mean of every metric; the metric names are taken from the first sample.
average_scores = {
    key: np.mean([score[key] for score in all_scores])
    for key in all_scores[0]
}

# Print the averaged results: values only, one per line, in the order the
# metrics appear in the first sample's dict (no metric names on this cell).
print("各项指标的平均值：")
for value in average_scores.values():
    print(f"{value:.4f}")

 12%|█▏        | 118/1000 [00:01<00:07, 110.41it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (132 > 128). Running this sequence through the model will result in indexing errors
100%|██████████| 1000/1000 [00:09<00:00, 110.40it/s]

各项指标的平均值：
0.6617
0.4320
0.2939
0.2078
0.6109
0.3130
0.5463
0.3539
0.1250
0.5860



