# In [1]:
import os
import csv
from xml.etree import ElementTree as ET

def count_words_by_character(tei_file):
    """Count the words spoken by each character in a TEI document.

    Every ``<sp>`` element in the TEI namespace that carries a ``who``
    attribute is examined. The text of each ``<l>`` element that is a direct
    child of the ``<sp>`` is split on whitespace and the resulting words are
    added to the speaker's total. Text inside inline markup nested in an
    ``<l>`` is included in the count.

    Args:
        tei_file: A file name or file object accepted by
            ``ElementTree.parse``.

    Returns:
        A dict mapping each speaker id (without a leading ``#``) to the
        total number of words in that speaker's ``<l>`` elements. A speaker
        whose speeches contain no ``<l>`` children is present with a total
        of 0. ``<sp>`` elements without a ``who`` attribute are ignored.
    """
    tei_ns = '{http://www.tei-c.org/ns/1.0}'
    root = ET.parse(tei_file).getroot()

    word_counts = {}

    for sp in root.findall('.//' + tei_ns + 'sp'):
        who = sp.get('who')
        if who is None:
            continue

        # Count words, not <l> elements: join all text fragments of a line
        # (including text of nested inline elements) and split on whitespace.
        spoken_words = sum(
            len(''.join(line.itertext()).split())
            for line in sp.findall(tei_ns + 'l')
        )

        # ``who`` may list several whitespace-separated pointers; credit the
        # speech to each of them instead of building one combined key.
        for pointer in who.split():
            # Strip the '#' only when it is actually there, so an id written
            # without the prefix does not lose its first character.
            speaker_id = pointer[1:] if pointer.startswith('#') else pointer
            word_counts[speaker_id] = word_counts.get(speaker_id, 0) + spoken_words

    return word_counts

def process_tei_files(input_folder, output_folder):
    """Write one per-character count CSV for every ``.xml`` file in a folder.

    Each regular file in ``input_folder`` whose name ends in ``.xml`` is
    passed to ``count_words_by_character``. The resulting character/count
    pairs are sorted by count in descending order and written to
    ``<name>_count_words_output.csv`` in ``output_folder``, where ``<name>``
    is the input file name without its trailing ``.xml``.

    Args:
        input_folder: Directory that is scanned (non-recursively) for
            ``.xml`` files.
        output_folder: Directory that receives the CSV files; it is created
            if it does not exist yet.
    """
    xml_suffix = '.xml'

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    for file_name in os.listdir(input_folder):
        if not file_name.endswith(xml_suffix):
            continue

        tei_file = os.path.join(input_folder, file_name)
        # A directory may also be named "*.xml"; only parse regular files.
        if not os.path.isfile(tei_file):
            continue

        character_counts = count_words_by_character(tei_file)

        # Highest count first; sorted() is stable, so ties keep the order in
        # which the characters were returned.
        sorted_characters = sorted(
            character_counts.items(), key=lambda item: item[1], reverse=True
        )

        # Cut off only the trailing extension. str.replace would rewrite
        # every ".xml" occurring anywhere in the file name.
        base_name = file_name[:-len(xml_suffix)]
        output_file = os.path.join(
            output_folder, base_name + '_count_words_output.csv'
        )

        with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=['Character', 'WordCount'])
            writer.writeheader()
            writer.writerows(
                {'Character': character, 'WordCount': word_count}
                for character, word_count in sorted_characters
            )

if __name__ == "__main__":
    # Script entry point: read the TEI files from "results" and write the
    # generated CSV files to "output/csv".
    process_tei_files(input_folder="results", output_folder="output/csv")
