In [5]:
import os
import pandas as pd
from lxml import etree

# Function to process a TEI XML file and extract desired information
def process_tei_file(file_path):
    """Extract one row per character (or character group) from a TEI XML file.

    The file is parsed with lxml and queried in the TEI namespace
    (``http://www.tei-c.org/ns/1.0``).

    Args:
        file_path: Path of the TEI XML file to parse.

    Returns:
        A list of dicts, one for every ``listPerson/person`` and every
        ``personGrp`` element found, with the keys ``'title'``,
        ``'persName'``, ``'trait-1'``, ``'trait-2'`` and ``'genre'``.
        Any value whose element is missing, or present but without leading
        text, is the empty string. ``'trait-2'`` is always the empty string.

    Raises:
        OSError, lxml.etree.XMLSyntaxError: Propagated from
            ``etree.parse`` when the file cannot be read or is not
            well-formed XML.
    """
    tree = etree.parse(file_path)

    # Prefix -> URI mapping used by every path expression below.
    namespaces = {
        'tei': 'http://www.tei-c.org/ns/1.0'
    }

    # Title of the play: the <title type="main"> inside <titleStmt>.
    title = _element_text(
        tree.find('.//tei:titleStmt/tei:title[@type="main"]', namespaces=namespaces)
    )

    # Genre: the <term source="kroll"> inside <textClass>/<keywords>.
    genre = _element_text(
        tree.find('.//tei:textClass/tei:keywords/tei:term[@source="kroll"]', namespaces=namespaces)
    )

    # Individual persons listed in a <listPerson>, plus every <personGrp>.
    persons = tree.xpath('.//tei:listPerson/tei:person | .//tei:personGrp', namespaces=namespaces)

    rows = []
    for person in persons:
        # A person/personGrp without a direct <persName> child used to raise
        # AttributeError here; it now yields an empty name instead so that a
        # single incomplete entry does not abort the whole extraction.
        pers_name = _element_text(person.find('tei:persName', namespaces=namespaces))

        # find() already returns None when <trait> (or its <desc>) is absent,
        # so no separate existence check for <trait> is needed.
        trait_desc = _element_text(person.find('tei:trait/tei:desc', namespaces=namespaces))

        rows.append({
            'title': title,
            'persName': pers_name,
            'trait-1': trait_desc,
            'trait-2': '',  # Never populated by this function; always blank
            'genre': genre,  # Same genre value repeated on every row of the play
        })

    return rows


def _element_text(element):
    """Return the leading text of *element*, or '' if it is None or has no text."""
    if element is None:
        return ''
    return element.text or ''

# Process all TEI files in the results folder and collect one row per character.
output_rows = []

results_folder = '../results'  # Assuming the relative path to the results folder
# os.listdir() returns names in arbitrary order; sorting makes the row order of
# the resulting CSV reproducible from one run (or machine) to the next.
for filename in sorted(os.listdir(results_folder)):
    if filename.endswith('.xml'):
        file_path = os.path.join(results_folder, filename)
        output_rows.extend(process_tei_file(file_path))

# Create a DataFrame from the extracted data.
# The columns are listed explicitly (same names and order as the keys of the
# row dicts) so that the CSV still gets its header line when no rows were found.
csv_columns = ['title', 'persName', 'trait-1', 'trait-2', 'genre']
df = pd.DataFrame(output_rows, columns=csv_columns)

# Define the output directory path
output_dir = 'output'  # Output directory relative to the notebook location

# Create the output directory if it does not exist
os.makedirs(output_dir, exist_ok=True)

# Define the output file path
output_file_path = os.path.join(output_dir, 'list-characters-names-trait.csv')

# Write the DataFrame to a CSV file (without the numeric index column)
df.to_csv(output_file_path, index=False)

print(f"CSV output saved successfully to: {output_file_path}")


CSV output saved successfully to: output/list-characters-names-trait.csv
