In [None]:
import os
import xml.etree.ElementTree as ET
import pandas as pd

def parse_tei_files(folder_path):
    """Compute word-count summary statistics over the TEI files in a folder.

    Every entry in ``folder_path`` whose name ends in ``.xml`` is parsed and
    the whitespace-separated words inside all of its TEI ``<l>`` elements
    are counted. Text nested in child elements of a line (inline markup
    such as ``<hi>``) is part of the count.

    Args:
        folder_path: Directory to scan; sub-directories are not descended
            into.

    Returns:
        A dict of the form
        ``{'word_count': {'total': ..., 'mean': ..., 'range': ...}}``.
        The statistics are taken over the per-file word counts; all three
        values are 0 when no ``.xml`` file is found.

    Raises:
        OSError: If ``folder_path`` cannot be listed or a file cannot be
            opened.
        xml.etree.ElementTree.ParseError: If a file is not well-formed XML.
    """
    # TEI elements live in this namespace; findall() needs the prefix map.
    ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

    # One entry per parsed file.
    word_counts = []
    for filename in os.listdir(folder_path):
        if not filename.endswith('.xml'):
            continue

        root = ET.parse(os.path.join(folder_path, filename)).getroot()

        # Element.text stops at the first child element, so a line with
        # inline markup would lose everything from that child onwards.
        # itertext() walks the whole subtree of the line instead.
        word_counts.append(sum(
            len(''.join(line.itertext()).split())
            for line in root.findall('.//tei:l', namespaces=ns)
        ))

    # Guard the empty case: max()/min() and the division would fail on [].
    if word_counts:
        total_word_count = sum(word_counts)
        mean_word_count = total_word_count / len(word_counts)
        range_word_count = max(word_counts) - min(word_counts)
    else:
        total_word_count = mean_word_count = range_word_count = 0

    return {
        'word_count': {
            'total': total_word_count,
            'mean': mean_word_count,
            'range': range_word_count,
        }
    }

# Path to the folder containing TEI files
folder_path = '../results'

# Parse TEI files and calculate summary statistics for word count
word_count_stats = parse_tei_files(folder_path)

# Print word count statistics
print("Total Word Count:", word_count_stats['word_count']['total'])
print("Mean Word Count:", word_count_stats['word_count']['mean'])
print("Range Word Count:", word_count_stats['word_count']['range'])

# Save statistics to a CSV file
output_folder = 'output'
os.makedirs(output_folder, exist_ok=True)
output_file = os.path.join(output_folder, 'word_count_statistics.csv')

# from_dict() on the nested dict yields a single 'word_count' column whose
# index holds the statistic names ('total', 'mean', 'range').
word_count_df = pd.DataFrame.from_dict(word_count_stats)

# The index must be written out: without it the CSV would contain three
# unlabeled numbers and nothing to say which one is the total, mean or range.
word_count_df.to_csv(output_file, index_label='statistic')
print("Word count statistics saved to", output_file)
