# In [None]:
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET

# Prefix-to-URI map that resolves the "tei:" prefix in ElementTree path queries
tei_namespace = dict(tei='http://www.tei-c.org/ns/1.0')

# Function to parse XML files and extract required information
def parse_xml(file_path):
    """Extract the main title and the act/scene counts from a TEI XML file.

    Args:
        file_path: Path of the XML file to parse (passed to ``ET.parse``).

    Returns:
        A ``(title, acts, scenes)`` tuple where

        * ``title`` is the stripped text of the first
          ``<title type='main'>`` element, or the file's base name (without
          extension) when that element is missing or has no text;
        * ``acts`` is the number of ``<div type='act'>`` elements;
        * ``scenes`` is the number of ``<div type='scene'>`` elements found
          inside those acts (scenes outside any act are not counted).
    """
    root = ET.parse(file_path).getroot()

    title_element = root.find(".//tei:title[@type='main']", namespaces=tei_namespace)
    # Both a missing element and an element without text would otherwise
    # surface later as an AttributeError or a literal "None" in file names.
    title = ""
    if title_element is not None and title_element.text:
        title = title_element.text.strip()
    if not title:
        source_name = str(getattr(file_path, "name", file_path))
        title = os.path.splitext(os.path.basename(source_name))[0]

    # Look the acts up once and reuse the list for both counts.
    act_elements = root.findall(".//tei:div[@type='act']", namespaces=tei_namespace)
    scenes = sum(
        len(act.findall(".//tei:div[@type='scene']", namespaces=tei_namespace))
        for act in act_elements
    )

    return title, len(act_elements), scenes

# Function to count words in a scene
def count_words(scene):
    """Count the whitespace-separated words in the verse lines of a scene.

    Only text inside ``<l>`` elements that are nested in ``<sp>`` elements of
    ``scene`` is counted. The full text of each ``<l>`` is used, including
    text that sits inside or after inline child elements.

    Args:
        scene: The ElementTree element of the scene to measure.

    Returns:
        The total number of words found, as an ``int``. A warning is printed
        for every ``<l>`` element that contains no words.
    """
    scene_label = scene.attrib.get('n', 'unknown')
    word_count = 0
    for sp in scene.findall(".//tei:sp", namespaces=tei_namespace):
        for line in sp.findall(".//tei:l", namespaces=tei_namespace):
            # ``line.text`` stops at the first child element, so words that
            # follow inline markup would be lost; itertext() walks the whole
            # subtree in document order.
            words = "".join(line.itertext()).split()
            if words:
                word_count += len(words)
            else:
                print(f"Warning: Found a scene with no words in <l> element: {scene_label}")
    return word_count

# Function to plot scene lengths
def plot_scene_lengths(title, scene_lengths):
    """Draw, save and show a bar chart of the word count of each scene.

    The chart is written to ``output/img/<title>_scene_lengths.png``; the
    directory is created when it does not exist, and path separators in
    ``title`` are replaced by ``_`` so the title cannot change the target
    directory.

    Args:
        title: Play title, used in the chart heading and the file name.
        scene_lengths: Mapping of scene label to word count; bars are drawn
            in the mapping's iteration order.

    Returns:
        None. When the word counts sum to zero a notice is printed and no
        chart is produced.
    """
    if sum(scene_lengths.values()) == 0:
        print(f"No words found in '{title}' scenes.")
        return

    scene_labels = list(scene_lengths)
    scene_sizes = list(scene_lengths.values())
    positions = range(len(scene_sizes))

    figure = plt.figure(figsize=(10, 6))
    plt.bar(positions, scene_sizes, align='center', alpha=0.7)
    plt.xticks(positions, scene_labels, rotation=90)

    # Bars are centred on integer positions, so the scene boundaries sit at
    # every half step from the left edge of the first bar to the right edge
    # of the last one.
    for boundary in range(len(scene_sizes) + 1):
        plt.axvline(x=boundary - 0.5, color='black', linestyle='--', linewidth=0.5)

    plt.xlabel('Scene')
    plt.ylabel('Length (words)')
    plt.title(f'Scene Lengths for "{title}"')
    plt.tight_layout()

    os.makedirs('output/img', exist_ok=True)
    safe_title = str(title).replace('/', '_').replace(os.sep, '_')
    plt.savefig(f'output/img/{safe_title}_scene_lengths.png')
    plt.show()
    # Release the figure; otherwise one figure per play stays in memory.
    plt.close(figure)

# Iterate over XML files (sorted so the CSV row order is reproducible)
xml_files = sorted(glob.glob("results/*.xml"))

results = []

for file in xml_files:
    title, acts, scenes = parse_xml(file)

    # Word count per scene, keyed by scene label in document order
    scene_lengths = {}

    root = ET.parse(file).getroot()

    scene_elements = root.findall(".//tei:div[@type='scene']", namespaces=tei_namespace)
    for index, scene in enumerate(scene_elements, start=1):
        # Fall back to the position in the document when the scene has no
        # 'n' attribute.
        label = scene.attrib.get('n', f'scene {index}')
        # The same 'n' value can occur more than once in a file (e.g. scene
        # numbering restarting in each act); keep every scene instead of
        # letting a later one overwrite an earlier one.
        if label in scene_lengths:
            label = f'{label} (#{index})'
        scene_lengths[label] = count_words(scene)

    plot_scene_lengths(title, scene_lengths)

    results.append({'Title': title, 'Acts': acts, 'Scenes': scenes})

# Create DataFrame and save to CSV
os.makedirs('output/csv', exist_ok=True)
df = pd.DataFrame(results)
df.to_csv('output/csv/play_summary.csv', index=False)
