In [6]:
import os
import json

def calculate_statistics(result_path, company):
    # Initialize accumulators and counters, scores1 is for decomp_score, scores2 is for chat_eval
    total_scores1 = {
        "Financial takeaways": 0,
        "Financial context": 0,
        "Reasoning correctness": 0,
        "Management expectation": 0
    }
    count_scores1 = {
        "Financial takeaways": 0,
        "Financial context": 0,
        "Reasoning correctness": 0,
        "Management expectation": 0
    }
    total_scores2 = {
        "Financial takeaways": 0,
        "Financial context": 0,
        "Reasoning correctness": 0,
        "Management expectation": 0
    }
    count_scores2 = {
        "Financial takeaways": 0,
        "Financial context": 0,
        "Reasoning correctness": 0,
        "Management expectation": 0
    }
    missing_count = 0

    # List result files
    result_files = os.listdir(f"{result_path}/{company}")
    
    for result_file in result_files:
        with open(f"{result_path}/{company}/{result_file}", 'r', encoding='utf-8') as f:
            report_list = json.load(f)
            
            miss = False
            for report in report_list:
                # Check for "Missing" in the sub-section content
                for section, content in report.items():
                    if isinstance(content, str) and "Missing" in content:
                        missing_count += 1
                        miss = True
                if miss == True:
                    continue
                
                # Process scores in decomp_score
                decomp_score = report.get("decomp_score", [])
                for score in decomp_score:
                    for key, value in score.items():
                        if key in total_scores1:
                            total_scores1[key] += float(value)
                            count_scores1[key] += 1

                chat_score = report.get("chat_eval", [])
                for score in chat_score:
                    for key, value in score.items():
                        if key in total_scores2:
                            total_scores2[key] += float(value)
                            count_scores2[key] += 1

    # Calculate averages
    average_scores1 = {key: (total_scores1[key] / count_scores1[key]) if count_scores1[key] > 0 else 0
                      for key in total_scores1}
    
    average_scores2 = {key: (total_scores2[key] / count_scores2[key]) if count_scores2[key] > 0 else 0
                      for key in total_scores2}
    
    # Print results
    print("DecompEval Average Scores:")
    for key, avg in average_scores1.items():
        print(f"{key}: {avg:.2f}")
    print()
    print("ChatEval Average Scores:")
    for key, avg in average_scores2.items():
        print(f"{key}: {avg:.2f}")
    
    print(f"\nTotal 'Missing' count: {missing_count}")

    return average_scores1

                        
            

In [7]:
calculate_statistics("../Result/Result_Eval_self_reflection", "INTC")

DecompEval Average Scores:
Financial takeaways: 0.39
Financial context: 0.89
Reasoning correctness: 0.95
Management expectation: 0.34

ChatEval Average Scores:
Financial takeaways: 3.36
Financial context: 3.71
Reasoning correctness: 3.86
Management expectation: 3.19

Total 'Missing' count: 2


{'Financial takeaways': 0.390344827586207,
 'Financial context': 0.8910344827586208,
 'Reasoning correctness': 0.9463793103448276,
 'Management expectation': 0.3401724137931035}

In [12]:
from readability import Readability

def tradition_eval(summary_text):
    """
    Calculate readability scores for the given summary text.

    Args:
        summary_text (str): The summary text to evaluate.

    Returns:
        dict: Readability scores including Flesch-Kincaid, Coleman-Liau, ARI, and SMOG.
    """
    read = Readability(summary_text)
    scores = {
        "Flesch-Kincaid": read.flesch_kincaid().score,
        "Coleman-Liau": read.coleman_liau().score,
        "ARI": read.ari().score,
    }
    return scores


In [13]:
import re
def post_process_markdown(text):
    # Remove heading symbols (# and ##, etc.)
    text = re.sub(r'#+\s', '', text)

    # Remove bullet points numbers (1., 2., etc.)
    text = re.sub(r'\d+\.\s', '', text)
    
    # Remove bold asterisks (**)
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
    
    # Remove currency symbols, percentage signs, and similar formatting
    text = re.sub(r'[\$%]', '', text)
    
    # Remove extra line breaks and blank lines, merge paragraphs
    text = re.sub(r'\n+', '\n', text).strip()
    
    return text


In [16]:
from nltk.tokenize import sent_tokenize

def calculate_average_readability_and_sentences_per_file(result_path, company):
    """
    Calculate the average readability scores and average sentence count per file for all JSON files of a company.

    Args:
        company (str): The company name used to locate result files.

    Returns:
        None: Prints average readability scores and average sentence count per file for the company.
    """
    # Initialize readability score accumulators and file count
    readability_sums = {
        "Flesch-Kincaid": 0,
        "Coleman-Liau": 0,
        "ARI": 0,
    }
    total_sentence_count = 0
    file_count = 0

    # List result files
    result_files = os.listdir(f"../{result_path}/{company}")
    
    for result_file in result_files:
        with open(f"../../data/transcript/{result_file}", 'r', encoding='utf-8') as f:
            transcript = json.load(f)
        with open(f"../{result_path}/{company}/{result_file}", 'r', encoding='utf-8') as f:
            report_list = json.load(f)
            merged_summary = ""

            for report in report_list:
                for key, value in report.items():
                    if isinstance(value, str):  # Extract text from subsection (e.g., "sub section 1.1")
                        merged_summary += " " + value.strip()

            # Calculate readability score and sentence count for the merged summary
            if merged_summary.strip():  # Ensure there is text to evaluate
                merged_summary = post_process_markdown(merged_summary)
                sentences = sent_tokenize(merged_summary)
                total_sentence_count += len(sentences)

                try:
                    readability_result = tradition_eval(merged_summary)
                except:
                    readability_result = {
                        "Flesch-Kincaid": 0,
                        "Coleman-Liau": 0,
                        "ARI": 0
                    }
                    file_count -= 1
                for key, value in readability_result.items():
                    readability_sums[key] += value
                file_count += 1

    # Calculate and print average readability scores and average sentence count per file
    if file_count > 0:
        average_readability = {key: (value / file_count) for key, value in readability_sums.items()}
        average_sentences_per_file = total_sentence_count / file_count

        print(f"\nAverage Readability scores for {company}:")
        for key, value in average_readability.items():
            print(f"{key}: {value:.2f}")
        
        print(f"\nAverage sentences per file for {company}: {average_sentences_per_file:.2f}")
    else:
        print(f"\nNo valid summaries for readability in {company}.")


In [17]:
calculate_average_readability_and_sentences_per_file("Result/Result_Eval_self_reflection", "INTC")


Average Readability scores for INTC:
Flesch-Kincaid: 15.70
Coleman-Liau: 15.63
ARI: 17.06

Average sentences per file for INTC: 9.33
