In [2]:
import os
import json

def calculate_statistics(result_path, company):
    # Initialize accumulators and counters
    total_scores1 = {
        "Financial takeaways": 0,
        "Financial context": 0,
        "Reasoning correctness": 0,
        "Management expectation": 0
    }
    count_scores1 = {
        "Financial takeaways": 0,
        "Financial context": 0,
        "Reasoning correctness": 0,
        "Management expectation": 0
    }
    total_scores2 = {
        "Financial takeaways": 0,
        "Financial context": 0,
        "Reasoning correctness": 0,
        "Management expectation": 0
    }
    count_scores2 = {
        "Financial takeaways": 0,
        "Financial context": 0,
        "Reasoning correctness": 0,
        "Management expectation": 0
    }
    missing_count = 0

    # List result files
    result_files = os.listdir(f"../{result_path}/{company}")
    
    for result_file in result_files:
        with open(f"../{result_path}/{company}/{result_file}", 'r', encoding='utf-8') as f:
            report_list = json.load(f)
            
            for report in report_list:
                # Extract and check sections with possible "Missing"
                for key, section in report.items():
                    if isinstance(section, dict):  # Process the nested dictionary
                        # Check for "summary" or "report" for "Missing"
                        if "summary" in section and "Missing" in section["summary"]:
                            missing_count += 1
                        if "report" in section:
                            for item in section["report"]:
                                if any("Missing" in retrieved for retrieved in item.get("retrieved", [])):
                                    missing_count += 1
                                
                        # Process the scores in "decomp_score"
                        decomp_scores = section.get("decomp_score", [])
                        for score in decomp_scores:
                            for key, value in score.items():
                                if key in total_scores1:
                                    total_scores1[key] += float(value)
                                    count_scores1[key] += 1

                        chat_scores = section.get("chat_eval", [])
                        for score in chat_scores:
                            for key, value in score.items():
                                if key in total_scores2:
                                    total_scores2[key] += float(value)
                                    count_scores2[key] += 1

    # Calculate averages
    average_scores1 = {key: (total_scores1[key] / count_scores1[key]) if count_scores1[key] > 0 else 0
                      for key in total_scores1}
    
    average_scores2 = {key: (total_scores2[key] / count_scores2[key]) if count_scores2[key] > 0 else 0
                      for key in total_scores2}
    
    # Print results
    print("DecompEval Average Scores:")
    for key, avg in average_scores1.items():
        print(f"{key}: {avg:.2f}")

    print()

    print("ChatEval Average Scores:")
    for key, avg in average_scores2.items():
        print(f"{key}: {avg:.2f}")
    
    print(f"\nTotal 'Missing' count: {missing_count}")


In [3]:
calculate_statistics("Result_Eval_decomposed_self_reflection", "INTC")

DecompEval Average Scores:
Financial takeaways: 0.28
Financial context: 0.97
Reasoning correctness: 0.99
Management expectation: 0.63

ChatEval Average Scores:
Financial takeaways: 3.71
Financial context: 3.99
Reasoning correctness: 4.00
Management expectation: 3.96

Total 'Missing' count: 0


In [5]:
import re
def post_process_markdown(text):
    # Remove heading symbols (# and ##, etc.)
    text = re.sub(r'#+\s', '', text)

    # Remove bullet points numbers (1., 2., etc.)
    text = re.sub(r'\d+\.\s', '', text)
    
    # Remove bold asterisks (**)
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
    
    # Remove currency symbols, percentage signs, and similar formatting
    text = re.sub(r'[\$%]', '', text)
    
    # Remove extra line breaks and blank lines, merge paragraphs
    text = re.sub(r'\n+', '\n', text).strip()
    
    return text


In [7]:
import os
import json
from readability import Readability

import os
import json
from nltk.tokenize import sent_tokenize, word_tokenize

def tradition_eval(text):
    """
    Calculate readability scores for the given text.

    Args:
        text (str): The text to evaluate.

    Returns:
        dict: Readability scores including Flesch-Kincaid, Coleman-Liau, ARI, and SMOG.
    """
    read = Readability(text)
    scores = {
        "Flesch-Kincaid": read.flesch_kincaid().score,
        "Coleman-Liau": read.coleman_liau().score,
        "ARI": read.ari().score
    }
    return scores

def calculate_average_readability(result_path, company):
    """
    Calculate the average readability scores, average sentence length, and sentence count per file 
    for all JSON files of a company.

    Args:
        company (str): The company name used to locate result files.

    Returns:
        None: Prints average readability scores, average sentence length, and average sentences per file.
    """
    # Initialize readability score accumulators and counters
    readability_sums = {
        "Flesch-Kincaid": 0,
        "Coleman-Liau": 0,
        "ARI": 0,
        "SMOG": 0
    }
    total_sentence_count = 0
    total_sentence_length = 0
    file_count = 0

    # List result files
    result_files = os.listdir(f"../{result_path}/{company}")
    
    for result_file in result_files:
        with open(f"../../data/transcript/{result_file}", 'r', encoding='utf-8') as f:
            transcript = json.load(f)
        with open(f"../{result_path}/{company}/{result_file}", 'r', encoding='utf-8') as f:
            report_list = json.load(f)
            merged_summary = ""

            for report in report_list:
                for key, section in report.items():
                    if isinstance(section, dict):  # Process the nested dictionary
                        # Merge summaries
                        if "summary" in section:
                            summary_text = section["summary"]
                            merged_summary += " " + summary_text.strip()

            # Calculate readability score and sentence statistics for the merged summary
            if merged_summary.strip():  # Ensure there is text to evaluate
                merged_summary = post_process_markdown(merged_summary)
                sentences = sent_tokenize(merged_summary)
                sentence_count = len(sentences)
                sentence_length = sum(len(word_tokenize(sentence)) for sentence in sentences)

                total_sentence_count += sentence_count
                total_sentence_length += sentence_length

                readability_result = tradition_eval(merged_summary)
                for key, value in readability_result.items():
                    readability_sums[key] += value
                file_count += 1

    # Calculate and print average readability scores, sentence length, and sentences per file
    if file_count > 0:
        average_readability = {key: (value / file_count) for key, value in readability_sums.items()}
        average_sentences_per_file = total_sentence_count / file_count

        print(f"\nAverage Readability scores for {company}:")
        for key, value in average_readability.items():
            print(f"{key}: {value:.2f}")
        
        print(f"\nAverage sentences per file for {company}: {average_sentences_per_file:.2f}")
    else:
        print(f"\nNo valid summaries for readability in {company}.")


In [9]:
calculate_average_readability("Result_Eval_decomposed_self_reflection", "INTC")


Average Readability scores for INTC:
Flesch-Kincaid: 16.79
Coleman-Liau: 17.19
ARI: 18.37
SMOG: 0.00

Average sentences per file for INTC: 98.13
