# In [11]:
import os
import xml.etree.ElementTree as ET
import pandas as pd

# Function to parse TEI files and calculate summary statistics
def parse_tei_files(folder_path):
    """Collect per-file element counts from the TEI XML files in a folder.

    Every regular file in ``folder_path`` whose name ends in ``.xml`` is
    parsed, and the number of acts, scenes, ``<l>``, ``<sp>`` and ``<stage>``
    elements in the TEI namespace is counted. Files are visited in sorted
    filename order so the result is reproducible across runs and platforms.

    Args:
        folder_path: Directory containing the TEI XML files.

    Returns:
        A dict of equally long lists, one entry per parsed file, keyed by
        ``'titles'``, ``'act_counts'``, ``'scene_counts'``, ``'l_counts'``,
        ``'sp_counts'`` and ``'stage_counts'``. A title is ``None`` when the
        file has no TEI ``<title>`` element (or the first one has no text).

    Raises:
        OSError: If ``folder_path`` cannot be listed or a file cannot be read.
        xml.etree.ElementTree.ParseError: If a file is not well-formed XML.
    """
    # Namespace dictionary
    ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

    # Output column -> XPath of the elements counted for that column
    count_queries = {
        'act_counts': './/tei:div[@type="act"]',
        'scene_counts': './/tei:div[@type="scene"]',
        'l_counts': './/tei:l',
        'sp_counts': './/tei:sp',
        'stage_counts': './/tei:stage',
    }

    stats = {'titles': []}
    stats.update({column: [] for column in count_queries})

    # os.listdir returns entries in arbitrary order; sort for stable output
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.xml'):
            continue
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            # A directory that happens to be named *.xml is not a TEI file
            continue

        root = ET.parse(file_path).getroot()

        # find() returns None when there is no <title>; record a missing
        # title instead of failing the whole batch with AttributeError
        title_element = root.find('.//tei:title', namespaces=ns)
        stats['titles'].append(
            title_element.text if title_element is not None else None
        )

        for column, xpath in count_queries.items():
            stats[column].append(len(root.findall(xpath, namespaces=ns)))

    return stats

# Directory that holds the TEI XML files to summarise
folder_path = '../results'

# Gather the per-file counts (one list per output column)
summary_stats = parse_tei_files(folder_path)

# Tabulate the counts and order the rows by number of acts, largest first
summary_df = pd.DataFrame(summary_stats).sort_values(by='act_counts', ascending=False)

# Write the table as CSV, creating the output folder if it does not exist
output_folder = 'output'
output_file = os.path.join(output_folder, 'summary_statistics_sorted.csv')
os.makedirs(output_folder, exist_ok=True)
summary_df.to_csv(output_file, index=False)

print("Sorted summary statistics saved to:", output_file)



# Output: Sorted summary statistics saved to: output/summary_statistics_sorted.csv
