In [1]:
import os
import xml.etree.ElementTree as ET
import pandas as pd

# Function to parse TEI files and calculate summary statistics
def parse_tei_files(folder_path):
    """Summarise structural counts across every TEI ``.xml`` file in a folder.

    For each file whose name ends in ``.xml`` the function counts the words
    inside ``<l>`` elements and the number of act/scene ``<div>``, ``<head>``,
    ``<l>``, ``<sp>``, ``<stage>``, ``<person>`` and ``<personGrp>`` elements
    (all in the TEI namespace), then aggregates the per-file counts.

    Args:
        folder_path: Directory that directly contains the TEI files
            (sub-directories are not searched).

    Returns:
        A dict keyed by statistic name (``'word_count'``, ``'act_count'``,
        ``'scene_count'``, ``'head_count'``, ``'l_count'``, ``'sp_count'``,
        ``'stage_count'``, ``'person_count'``, ``'person_grp_count'``).
        Each value is a dict with ``'total'``, ``'mean'`` and ``'range'``
        (max minus min) over the files. All three are 0 when the folder
        holds no ``.xml`` files.

    Raises:
        OSError: If ``folder_path`` cannot be listed or a file cannot be read.
        xml.etree.ElementTree.ParseError: If a file is not well-formed XML.
    """
    # Namespace dictionary
    ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

    # XPath query per element statistic; 'word_count' is handled separately
    # because it needs the text of the lines rather than a simple tally.
    element_paths = {
        'act_count': './/tei:div[@type="act"]',
        'scene_count': './/tei:div[@type="scene"]',
        'head_count': './/tei:head',
        'l_count': './/tei:l',
        'sp_count': './/tei:sp',
        'stage_count': './/tei:stage',
        'person_count': './/tei:person',
        'person_grp_count': './/tei:personGrp',
    }

    # Per-file counts; insertion order here fixes the key order of the result.
    per_file = {'word_count': []}
    per_file.update((key, []) for key in element_paths)

    # Iterate over files in the folder
    for filename in os.listdir(folder_path):
        if not filename.endswith('.xml'):
            continue
        root = ET.parse(os.path.join(folder_path, filename)).getroot()

        # Count words over the full text of each line. ``line.text`` alone
        # stops at the first child element, so lines with inline markup
        # (e.g. <hi>, <lb/>) would be undercounted.
        per_file['word_count'].append(sum(
            len(''.join(line.itertext()).split())
            for line in root.findall('.//tei:l', namespaces=ns)
        ))

        # Count elements
        for key, path in element_paths.items():
            per_file[key].append(len(root.findall(path, namespaces=ns)))

    # Return summary statistics
    return {key: _summarize_counts(values) for key, values in per_file.items()}


def _summarize_counts(values):
    """Return the total, mean and range (max - min) of a list of counts."""
    if not values:
        return {'total': 0, 'mean': 0, 'range': 0}
    return {
        'total': sum(values),
        'mean': sum(values) / len(values),
        'range': max(values) - min(values),
    }

# Location of the TEI corpus to summarise
folder_path = '../results'

# Collect total / mean / range for every statistic in the corpus
summary_stats = parse_tei_files(folder_path)

# Lay the results out with one row per statistic and one column per aggregate
summary_df = pd.DataFrame(summary_stats).transpose().rename_axis('Statistic')

# Make sure the destination directory exists, then write the table as CSV
output_folder = 'output'
os.makedirs(output_folder, exist_ok=True)
output_file = os.path.join(output_folder, 'summary_statistics_results.csv')
summary_df.to_csv(output_file)

# Show the table
print(summary_df)


                      total          mean    range
Statistic                                         
word_count        2576721.0  12569.370732  22275.0
act_count             426.0      2.078049      3.0
scene_count         10203.0     49.770732    107.0
head_count            525.0      2.560976      5.0
l_count            562657.0   2744.668293   4862.0
sp_count           121382.0    592.107317   1183.0
stage_count         23758.0    115.892683    314.0
person_count         3119.0     15.214634     33.0
person_grp_count      287.0      1.400000      8.0
