In [22]:
import json
import jieba
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

In [23]:
def read_data(file_path):
    """Load and return the JSON document stored at ``file_path``.

    The file is opened as UTF-8 text and parsed with ``json.load``; the
    parsed object (list, dict, ...) is returned unchanged.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

In [24]:
def chinese_tokenize(text):
    """Segment ``text`` with jieba and return the segments as a list of strings."""
    # jieba.lcut is the list-returning counterpart of the jieba.cut generator.
    segments = jieba.lcut(text)
    return segments

In [25]:
def calculate_bleu(reference, hypothesis):
    """Compute smoothed sentence-level BLEU-1 through BLEU-4 for one text pair.

    Both strings are segmented with ``chinese_tokenize`` and scored with
    NLTK's ``sentence_bleu`` against a single reference, using
    ``SmoothingFunction().method1``.

    Args:
        reference: Reference (gold) text.
        hypothesis: Generated text to evaluate.

    Returns:
        dict with keys ``"bleu1"``, ``"bleu2"``, ``"bleu3"``, ``"bleu4"``
        mapping to the corresponding float scores.
    """
    smooth_fn = SmoothingFunction().method1

    reference_tokens = chinese_tokenize(reference)
    hypothesis_tokens = chinese_tokenize(hypothesis)

    max_order = 4
    scores = {}
    for order in range(1, max_order + 1):
        # BLEU-n is the geometric mean of the first n n-gram precisions, so
        # the non-zero weights must be exactly 1/n and sum to 1. A rounded
        # literal such as 0.33 sums to 0.99 and biases BLEU-3. Weights are
        # zero-padded to length 4 so the same n-gram orders are evaluated
        # for every score.
        weights = tuple(
            1.0 / order if position < order else 0.0
            for position in range(max_order)
        )
        scores[f"bleu{order}"] = sentence_bleu(
            [reference_tokens],
            hypothesis_tokens,
            weights=weights,
            smoothing_function=smooth_fn,
        )

    return scores

In [26]:
class _JiebaRougeTokenizer:
    """Tokenizer adapter for ``RougeScorer`` that segments text with jieba.

    ``RougeScorer`` only requires an object exposing ``tokenize(text)``.
    """

    def tokenize(self, text):
        """Return lower-cased jieba segments of ``text``, without blank segments."""
        # Whitespace-only segments carry no content and would otherwise be
        # counted as matching n-grams; lower-casing mirrors what the
        # library's default tokenizer does for embedded Latin text.
        return [
            segment.lower()
            for segment in chinese_tokenize(text)
            if segment.strip()
        ]


def calculate_rouge(reference, hypothesis):
    """Compute ROUGE-1, ROUGE-2 and ROUGE-L for one Chinese text pair.

    The default ``rouge_score`` tokenizer keeps only ``[a-z0-9]`` characters,
    which discards Chinese text entirely and yields scores that are zero or
    based solely on embedded Latin letters/digits. A jieba-backed tokenizer
    is therefore supplied explicitly. Stemming is disabled because the Porter
    stemmer only applies to English and is bypassed by a custom tokenizer.

    Note: the ``tokenizer`` argument requires a ``rouge_score`` release that
    supports it; older releases raise ``TypeError`` on construction.

    Args:
        reference: Reference (gold) text.
        hypothesis: Generated text to evaluate.

    Returns:
        dict with keys ``"rouge-1"``, ``"rouge-2"``, ``"rouge-l"`` mapping to
        the score objects returned by ``RougeScorer.score`` (each exposing
        ``precision``, ``recall`` and ``fmeasure``).
    """
    scorer = rouge_scorer.RougeScorer(
        ['rouge1', 'rouge2', 'rougeL'],
        use_stemmer=False,
        tokenizer=_JiebaRougeTokenizer(),
    )

    # Raw strings are passed through; segmentation happens in the tokenizer.
    scores = scorer.score(reference, hypothesis)

    return {
        "rouge-1": scores['rouge1'],
        "rouge-2": scores['rouge2'],
        "rouge-l": scores['rougeL']
    }

In [27]:
def evaluate_outputs(input_file, output_file):
    """Score generated outputs against references and print mean BLEU/ROUGE.

    Both files are loaded with ``read_data`` and paired positionally. For
    each pair, the ``"output"`` field of the reference and generated records
    is compared with ``calculate_bleu`` and ``calculate_rouge``. Pairs where
    either output is missing or empty are excluded from the averages.

    Args:
        input_file: Path to the JSON file holding the reference records.
        output_file: Path to the JSON file holding the generated records.

    Raises:
        ValueError: If the two files contain a different number of records,
            or if no pair has both a reference and a generated output.
    """
    reference_data = read_data(input_file)
    generated_data = read_data(output_file)

    if len(reference_data) != len(generated_data):
        raise ValueError("参考数据与生成数据长度不一致！")

    bleu_scores = []
    rouge_scores = []

    for ref_instance, gen_instance in zip(reference_data, generated_data):
        reference_output = ref_instance.get("output")
        generated_output = gen_instance.get("output")

        if not (reference_output and generated_output):
            continue

        bleu_scores.append(calculate_bleu(reference_output, generated_output))
        rouge_scores.append(calculate_rouge(reference_output, generated_output))

    scored = len(bleu_scores)
    if scored == 0:
        # Without this guard the averages below fail with an opaque
        # ZeroDivisionError.
        raise ValueError("没有可评估的样本：参考输出或生成输出均为空！")

    skipped = len(reference_data) - scored
    if skipped:
        # Skipped pairs are left out of the averages, so make that visible
        # instead of silently reporting scores over a smaller sample.
        print(f"Skipped {skipped} of {len(reference_data)} pairs with a missing or empty output.")

    bleus = {
        key: sum(entry[key] for entry in bleu_scores) / scored
        for key in ("bleu1", "bleu2", "bleu3", "bleu4")
    }

    # ROUGE is summarised by its F-measure.
    rouges = {
        key: sum(entry[key].fmeasure for entry in rouge_scores) / scored
        for key in ("rouge-1", "rouge-2", "rouge-l")
    }

    print("\nBLEU Scores:")
    print(f"BLEU-1: {bleus['bleu1']*100:.2f}, BLEU-2: {bleus['bleu2']*100:.2f}, BLEU-3: {bleus['bleu3']*100:.2f}, BLEU-4: {bleus['bleu4']*100:.2f}")

    print("\nROUGE Scores:")
    print(f"ROUGE-1: {rouges['rouge-1']*100:.2f}, ROUGE-2: {rouges['rouge-2']*100:.2f}, ROUGE-L: {rouges['rouge-l']*100:.2f}")


In [None]:
# Reference records and generated records, in the order evaluate_outputs expects.
input_file_path = 'dataset/cMKGQA/evaluation_data.json'
output_file_path = 'output/cMKGQA/test_medka-8b.json'

evaluate_outputs(
    input_file=input_file_path,
    output_file=output_file_path,
)