In [3]:
import os
import json
from collections import defaultdict

def calculate_statistics(result_path, data_dir="../../data/process_data"):
    """Print average ROUGE / BERTScore values and a "Missing" tally.

    Each sub-directory of ``data_dir`` is expected to contain a JSON file at
    ``result_path`` holding a list of report dicts. For every report:

    * each top-level string value containing the substring ``"Missing"``
      increments the missing counter;
    * the nine ``"ROUGE-{1,2,L} {Precision,Recall,F1}"`` values are read from
      ``report["rouge"]`` and the three ``"BERTScore ..."`` values from
      ``report["bertscore"]``. A metric that is absent counts as 0 and still
      contributes to the denominator of the average.

    The averages and the counter are printed; nothing is returned.

    Args:
        result_path: Path of the result file relative to each sample
            directory, e.g. ``"AgenticIR_result/eval.json"``.
        data_dir: Directory whose sub-directories are scanned. Defaults to
            the location this notebook reads from.

    Raises:
        FileNotFoundError: If ``data_dir`` does not exist, or a sample
            directory has no file at ``result_path``.
    """
    rouge_keys = [
        f"{rouge_metric} {metric_type}"
        for rouge_metric in ("ROUGE-1", "ROUGE-2", "ROUGE-L")
        for metric_type in ("Precision", "Recall", "F1")
    ]
    bert_keys = ["BERTScore Precision", "BERTScore Recall", "BERTScore F1"]

    total_rouge_scores = defaultdict(float)
    count_rouge_scores = defaultdict(int)

    total_bert_scores = defaultdict(float)
    count_bert_scores = defaultdict(int)

    missing_count = 0

    # os.listdir returns entries in arbitrary order; sorting keeps the float
    # accumulation order (and therefore the printed digits) reproducible.
    for sample_name in sorted(os.listdir(data_dir)):
        sample_dir = os.path.join(data_dir, sample_name)

        # os.listdir also yields plain files (e.g. .DS_Store); they cannot
        # contain a result file, so skip them instead of crashing in open().
        if not os.path.isdir(sample_dir):
            continue

        with open(os.path.join(sample_dir, result_path), 'r', encoding='utf-8') as f:
            report_list = json.load(f)

        for report in report_list:
            # count "Missing" in the report (top-level string values only)
            for content in report.values():
                if isinstance(content, str) and "Missing" in content:
                    missing_count += 1

            # `or {}` treats an explicit JSON null like an absent section.
            rouge = report.get("rouge") or {}
            for key in rouge_keys:
                total_rouge_scores[key] += float(rouge.get(key, 0))
                count_rouge_scores[key] += 1

            bert = report.get("bertscore") or {}
            for key in bert_keys:
                total_bert_scores[key] += float(bert.get(key, 0))
                count_bert_scores[key] += 1

    # A key only exists in the totals after at least one report was counted,
    # so the divisions below can never hit a zero denominator.
    average_rouge_scores = {
        key: total / count_rouge_scores[key]
        for key, total in total_rouge_scores.items()
    }
    average_bert_scores = {
        key: total / count_bert_scores[key]
        for key, total in total_bert_scores.items()
    }

    # output results
    print("\n=== Average ROUGE Scores ===")
    for key, avg in average_rouge_scores.items():
        print(f"{key}: {avg:.4f}")

    print("\n=== Average BERT Scores ===")
    for key, avg in average_bert_scores.items():
        print(f"{key}: {avg:.4f}")

    print(f"\nTotal 'Missing' count: {missing_count}")

In [4]:
# Average the scores found in each sample's AgenticIR_result/eval_self_reflection.json.
calculate_statistics(result_path="AgenticIR_result/eval_self_reflection.json")


=== Average ROUGE Scores ===
ROUGE-1 Precision: 0.3556
ROUGE-1 Recall: 0.1621
ROUGE-1 F1: 0.2041
ROUGE-2 Precision: 0.0637
ROUGE-2 Recall: 0.0297
ROUGE-2 F1: 0.0368
ROUGE-L Precision: 0.2222
ROUGE-L Recall: 0.0980
ROUGE-L F1: 0.1241

=== Average BERT Scores ===
BERTScore Precision: 0.6422
BERTScore Recall: 0.5668
BERTScore F1: 0.6000

Total 'Missing' count: 0


In [5]:
# Average the scores found in each sample's AgenticIR_result/eval.json.
calculate_statistics(result_path="AgenticIR_result/eval.json")


=== Average ROUGE Scores ===
ROUGE-1 Precision: 0.4349
ROUGE-1 Recall: 0.1771
ROUGE-1 F1: 0.2353
ROUGE-2 Precision: 0.0987
ROUGE-2 Recall: 0.0392
ROUGE-2 F1: 0.0522
ROUGE-L Precision: 0.2824
ROUGE-L Recall: 0.1123
ROUGE-L F1: 0.1503

=== Average BERT Scores ===
BERTScore Precision: 0.6879
BERTScore Recall: 0.5963
BERTScore F1: 0.6376

Total 'Missing' count: 0
