# In [2]:  (Jupyter cell prompt left over from the notebook export)
import os
import csv
from bs4 import BeautifulSoup
from collections import defaultdict

def process_tei_file(file_path):
    """Count verse lines (<l>) per speaker in a single TEI file.

    For every <sp> element, the ``who`` attribute is split on whitespace into
    speaker references, and the number of <l> elements found inside that <sp>
    is credited to each referenced speaker ("#" characters are removed from
    the reference to obtain the speaker id).

    Args:
        file_path: Path to the TEI XML file; it is read as UTF-8.

    Returns:
        A list of dicts, one per speaker in order of first appearance, with
        the keys "Character", "Total_lines" and "Proportion". "Proportion" is
        the speaker's line count as a percentage of all <l> elements found
        inside <sp> elements. A speech shared by several speakers is credited
        in full to each of them, so the percentages may add up to more than
        100. When no <l> element was found at all, "Proportion" is 0.0.
    """
    # The parser consumes the whole file while building the tree, so the
    # handle can be released before the counting starts.
    with open(file_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "xml")

    line_counts = defaultdict(int)
    total_lines = 0
    for speech in soup.find_all("sp"):
        num_lines = len(speech.find_all("l"))
        total_lines += num_lines

        # A <sp> without @who has nobody to credit: .get() keeps such a
        # speech from raising KeyError, while its lines still count toward
        # the file-wide total.
        for speaker in speech.get("who", "").split():
            line_counts[speaker.replace("#", "")] += num_lines

    return [
        {
            "Character": speaker_id,
            "Total_lines": count,
            # Speakers can exist while no <l> was found (e.g. prose-only
            # speeches); avoid dividing by zero in that case.
            "Proportion": (count / total_lines) * 100 if total_lines else 0.0,
        }
        for speaker_id, count in line_counts.items()
    ]

def save_results_to_csv(data, file_name, output_folder="output/csv"):
    """Write per-character line counts to ``<output_folder>/<file_name>_count_l.csv``.

    Args:
        data: Iterable of dicts with the keys "Character", "Total_lines" and
            "Proportion"; each dict becomes one CSV row.
        file_name: Base name of the output file; "_count_l.csv" is appended.
        output_folder: Directory that receives the CSV. It is created
            (including missing parents) when it does not exist. Defaults to
            "output/csv".

    Raises:
        ValueError: Raised by csv.DictWriter if a row contains a key other
            than the three expected columns.
    """
    os.makedirs(output_folder, exist_ok=True)
    output_path = os.path.join(output_folder, file_name + "_count_l.csv")

    # newline="" lets the csv module control line endings itself.
    with open(output_path, "w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=["Character", "Total_lines", "Proportion"])
        writer.writeheader()
        writer.writerows(data)

def process_tei_files_in_directory(directory):
    """Process every ``.xml`` file located directly inside *directory*.

    Each matching file is passed to process_tei_file, and the resulting rows
    are handed to save_results_to_csv under the file's name with its ".xml"
    extension removed.

    Args:
        directory: Folder to scan. Only its immediate entries are examined;
            sub-folders are not traversed.
    """
    # os.listdir returns entries in arbitrary order; sorting makes runs
    # reproducible.
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith(".xml"):
            continue
        file_path = os.path.join(directory, file_name)
        data = process_tei_file(file_path)
        # splitext removes only the final extension, so "play.v2.xml" keeps
        # its full stem "play.v2" instead of colliding with "play.v1.xml".
        stem = os.path.splitext(file_name)[0]
        save_results_to_csv(data, stem)

# Script entry: tally lines for each TEI file found in the "results" folder.
process_tei_files_in_directory(directory="results")
