# In [8]:
import os
import csv
from bs4 import BeautifulSoup

def process_tei_file(file_path):
    """Count the ``<l>`` lines spoken by each speaker in one TEI XML file.

    Each ``<sp>`` element is attributed to the value of its ``who``
    attribute with every ``#`` removed; all ``<l>`` elements nested inside
    that ``<sp>`` are counted for that speaker.

    Args:
        file_path: Path to a UTF-8 encoded TEI XML file.

    Returns:
        A dict mapping each speaker id (in order of first appearance) to
        ``{"Total_lines": int, "Proportion": float}``. ``Proportion`` is the
        speaker's share of all attributed lines as a percentage (0-100), and
        is ``0.0`` for every speaker when no ``<l>`` lines were counted.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "xml")

    line_counts = {}
    for speech in soup.find_all("sp"):
        who = speech.get("who")
        if who is None:
            # A speech without a ``who`` attribute cannot be attributed to
            # any character; skip it instead of raising KeyError.
            continue
        speaker_id = who.replace("#", "")
        num_lines = len(speech.find_all("l"))
        line_counts[speaker_id] = line_counts.get(speaker_id, 0) + num_lines

    total_lines = sum(line_counts.values())

    data = {}
    for character, lines in line_counts.items():
        # Guard against files whose speeches contain no <l> elements at all
        # (e.g. prose plays), which would otherwise divide by zero.
        proportion = (lines / total_lines) * 100 if total_lines else 0.0
        data[character] = {"Total_lines": lines, "Proportion": proportion}

    return data

def save_results_to_csv(data, file_name, *, output_folder="output/csv"):
    """Write per-character line statistics to ``<file_name>-count-l.csv``.

    Args:
        data: Mapping of character id to a dict holding the keys
            ``"Total_lines"`` and ``"Proportion"``.
        file_name: Base name of the output file; ``"-count-l.csv"`` is
            appended to it.
        output_folder: Directory that receives the CSV file. It is created
            (including parents) when missing. Defaults to ``"output/csv"``.
    """
    os.makedirs(output_folder, exist_ok=True)
    output_path = os.path.join(output_folder, file_name + "-count-l.csv")

    # newline="" lets the csv module control line endings itself.
    with open(output_path, "w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(
            file, fieldnames=["Character", "Total_lines", "Proportion"]
        )
        writer.writeheader()
        writer.writerows(
            {
                "Character": character,
                "Total_lines": values["Total_lines"],
                "Proportion": values["Proportion"],
            }
            for character, values in data.items()
        )

def process_tei_files_in_directory(directory):
    """Process every ``.xml`` file directly inside *directory*.

    Each file is parsed with ``process_tei_file`` and its statistics are
    written with ``save_results_to_csv``, using the file name without its
    final extension as the CSV base name.

    Args:
        directory: Path of the directory to scan (not recursive).
    """
    # sorted() makes the processing order deterministic across platforms.
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith(".xml"):
            continue
        file_path = os.path.join(directory, file_name)
        if not os.path.isfile(file_path):
            # A sub-directory whose name ends in ".xml" cannot be opened.
            continue
        data = process_tei_file(file_path)
        # splitext drops only the final extension, so "play.v2.xml" yields
        # "play.v2" rather than colliding with "play.xml".
        base_name = os.path.splitext(file_name)[0]
        save_results_to_csv(data, base_name)

# Run the per-speaker line count over every TEI file found in "results".
process_tei_files_in_directory(directory="results")
