# Faust TEI Full Processing
Structural extraction, TEI enhancement, word frequency statistics, character co-occurrence network analysis, visualization, and Neo4j export.

In [None]:
from lxml import etree
from collections import Counter, defaultdict
import pandas as pd
import itertools
import os
import json
import matplotlib.pyplot as plt
import networkx as nx

# Namespace prefix map handed to every XPath / find call in this notebook.
ns = {"tei": "http://www.tei-c.org/ns/1.0"}

# Play title -> path of the TEI source file for that play.
xml_files = {
    "Faust_I": "/Users/lisiqi/Documents/Text_Tech/faust_project/data/Faust._Der_Tragoedie_erster_Teil.11g9p.0.xml",
    "Faust_II": "/Users/lisiqi/Documents/Text_Tech/faust_project/data/Faust._Der_Tragoedie_zweiter_Teil.11d12.0.xml",
}


## TEI Analysis Functions

In [None]:
# Load an XML document from disk
def parse_xml(file_path):
    """Parse the XML file at ``file_path`` with lxml and return the parsed tree."""
    document = etree.parse(file_path)
    return document

# Collect scene headings (acts and front matter are skipped)
def extract_scenes(tree):
    """Return the non-empty, stripped heading texts of all scene-level divisions.

    A division qualifies when its ``type`` attribute is neither ``'act'`` nor
    ``'front'`` and it has a ``tei:head`` child. Only text nodes that are
    direct children of the head are collected.
    """
    query = "//tei:div[not(@type='act') and not(@type='front') and tei:head]/tei:head/text()"
    headings = []
    for raw in tree.xpath(query, namespaces=ns):
        heading = raw.strip()
        if heading:
            headings.append(heading)
    return headings

# Collect the distinct speaker labels in alphabetical order
def extract_speakers(tree):
    """Return a sorted list of the unique, stripped ``tei:speaker`` texts.

    Only speakers nested directly inside a ``tei:sp`` element are considered;
    labels that are empty after stripping are dropped.
    """
    names = set()
    for raw in tree.xpath("//tei:sp/tei:speaker/text()", namespaces=ns):
        name = raw.strip()
        if name:
            names.add(name)
    ordered = list(names)
    ordered.sort()
    return ordered

# Collect the text of every verse line
def extract_lines(tree):
    """Return the stripped full text of each ``tei:l`` element, skipping empty ones.

    The text of a line includes the text of any nested child elements.
    """
    collected = []
    for element in tree.xpath("//tei:l", namespaces=ns):
        content = "".join(element.itertext()).strip()
        if content:
            collected.append(content)
    return collected

# Pick the rhyme word (last real word) of each verse line
def extract_rhyme_words(lines_text):
    """Return the last non-empty word of every line, with edge punctuation removed.

    Tokens are examined from the end of the line backwards, so a trailing
    token made only of punctuation (e.g. a lone dash) is skipped in favour of
    the word before it. Lines that contain no word at all contribute nothing
    to the result.
    """
    punctuation = '.,;:!?…–—-"\'()[]'
    rhyme_words = []
    for line in lines_text:
        for token in reversed(line.split()):
            candidate = token.strip(punctuation)
            if candidate:
                rhyme_words.append(candidate)
                break
    return rhyme_words

# Get the last few characters from a word for rhyme comparison
def get_rhyme_part(word, length=3):
    """Return the lower-cased last ``length`` characters of ``word``.

    Words shorter than ``length`` are returned whole (lower-cased). A
    non-positive ``length`` yields an empty string; without this guard
    ``word[-0:]`` would return the entire word and a negative value would
    drop a prefix instead of selecting a suffix.
    """
    if length <= 0:
        return ""
    return word[-length:].lower()

# Enhance the TEI XML by adding attributes to each scene
def enhance_tei(tree, xml_path):
    """Annotate every ``tei:div[@type='h4']`` in ``tree`` and write the result to disk.

    The tree is modified in place. Each matching division receives four
    attributes:

    * ``rhyme_ends``      - comma-separated rhyme words of its first 20 verse
                            lines that contain a word
    * ``speakers_unique`` - comma-separated, sorted unique speaker labels
    * ``speakers_count``  - number of unique speaker labels
    * ``speaker_freq``    - ``name:count`` pairs joined by ``;``

    The enhanced document is written next to ``xml_path`` with the suffix
    ``_enhanced.xml`` and that path is returned.
    """
    for scene in tree.xpath("//tei:div[@type='h4']", namespaces=ns):
        # Always use the full text of the line: relying on ``l.text`` alone
        # would cut the line off at its first inline child element and pick
        # the wrong final word.
        line_texts = [
            "".join(line.itertext()).strip()
            for line in scene.xpath(".//tei:l", namespaces=ns)
        ]
        # Same rhyme-word rule as the rest of the notebook: trailing
        # punctuation-only tokens are skipped, word-less lines are ignored.
        rhyme_ends = extract_rhyme_words(line_texts)
        scene.attrib['rhyme_ends'] = ",".join(rhyme_ends[:20])  # Save first 20 rhyme endings

        # Speaker labels; whitespace-only labels are dropped so they cannot
        # show up as an empty name in the attributes.
        speakers = [
            sp.text.strip()
            for sp in scene.xpath(".//tei:sp/tei:speaker", namespaces=ns)
            if sp.text and sp.text.strip()
        ]
        unique_speakers = sorted(set(speakers))
        scene.attrib['speakers_unique'] = ",".join(unique_speakers)
        scene.attrib['speakers_count'] = str(len(unique_speakers))
        freq = Counter(speakers)
        scene.attrib['speaker_freq'] = ";".join(f"{name}:{freq[name]}" for name in unique_speakers)

    # Save the enhanced XML file
    enhanced_path = os.path.splitext(xml_path)[0] + "_enhanced.xml"
    tree.write(enhanced_path, encoding="utf-8", xml_declaration=True, pretty_print=True)
    print(f"Enhanced XML saved: {enhanced_path}")
    return enhanced_path

# Normalize and clean speaker names
def clean_name(name):
    """Return ``name`` normalized for use as a character identifier.

    Runs of whitespace (including line breaks) are collapsed to single
    spaces, punctuation and spaces are stripped from both ends, and the
    result is title-cased. Stripping punctuation and spaces together keeps
    variants such as ``"FAUST."`` and ``"FAUST ."`` from ending up as two
    different names.
    """
    edge_chars = '.,;:!?…–—-"\'()[]' + " "
    collapsed = " ".join(name.split())
    return collapsed.strip(edge_chars).title()

# Analyze word frequencies and character co-occurrences, generate plots and Neo4j CSVs
def analyze_network(xml_path, title):
    """Build word-frequency and character co-occurrence data for one play.

    The document at ``xml_path`` is parsed and every ``tei:div[@type='h4']``
    is treated as a scene. Two speakers co-occur when both have a
    ``tei:speaker`` label inside the same scene; the edge weight is the
    number of scenes they share.

    Files written to the current working directory (prefix is
    ``title.lower()``):

    * ``<prefix>_total_wordfreq.csv``          - word, freq
    * ``<prefix>_scene_character.csv``         - play, scene, character
    * ``<prefix>_character_cooccurrence.csv``  - source, target, weight
    * ``<prefix>_characters_nodes.csv``        - id, label
    * ``<prefix>_network_plot.png``            - drawing of the network
    * ``<prefix>_neo4j_nodes.csv`` / ``<prefix>_neo4j_edges.csv``

    Every CSV is given explicit columns so that a header row is written even
    when there is no data, and characters are emitted in sorted order so the
    output is reproducible between runs.
    """
    punctuation = '.,;:!?…–—-"\'()[]'
    prefix = title.lower()

    tree = etree.parse(xml_path)
    total_wordfreq = Counter()
    scene_character_links = []
    cooccur_counts = defaultdict(int)

    # Iterate through all scenes
    for scene in tree.xpath("//tei:div[@type='h4']", namespaces=ns):
        head = scene.find("tei:head", namespaces=ns)
        # itertext() also covers heads whose text sits inside child elements,
        # where ``head.text`` would be None.
        heading = "".join(head.itertext()).strip() if head is not None else ""
        scene_name = heading or "Unnamed"

        # Count the lower-cased words of all poetic lines
        for line in scene.xpath(".//tei:l", namespaces=ns):
            for token in "".join(line.itertext()).split():
                word = token.strip(punctuation)
                if word:
                    total_wordfreq[word.lower()] += 1

        # Collect the distinct speakers of this scene; names that are empty
        # after cleaning are not valid characters and are skipped.
        scene_full_name = f"{title}::{scene_name}"
        names = {
            clean_name(sp.text)
            for sp in scene.xpath(".//tei:sp/tei:speaker", namespaces=ns)
            if sp.text
        }
        unique_speakers = sorted(names - {""})
        for speaker in unique_speakers:
            scene_character_links.append({"play": title, "scene": scene_full_name, "character": speaker})

        # Count co-occurrences between every pair of speakers; the input is
        # sorted, so each pair already has a canonical (a < b) order.
        for pair in itertools.combinations(unique_speakers, 2):
            cooccur_counts[pair] += 1

    # Save word frequencies to CSV
    wordfreq_frame = pd.DataFrame(list(total_wordfreq.items()), columns=["word", "freq"])
    wordfreq_frame.to_csv(f"{prefix}_total_wordfreq.csv", index=False)
    print(f"{title} word frequency saved: {title.lower()}_total_wordfreq.csv")

    # Save scene-character table
    links_frame = pd.DataFrame(scene_character_links, columns=["play", "scene", "character"])
    links_frame.to_csv(f"{prefix}_scene_character.csv", index=False)
    print(f"{title} scene-character table saved: {title.lower()}_scene_character.csv")

    # Save character co-occurrence edge list
    edges = [{"source": a, "target": b, "weight": count} for (a, b), count in cooccur_counts.items()]
    edges_frame = pd.DataFrame(edges, columns=["source", "target", "weight"])
    edges_frame.to_csv(f"{prefix}_character_cooccurrence.csv", index=False)
    print(f"{title} character co-occurrence edge list saved: {title.lower()}_character_cooccurrence.csv")

    # Save character node list
    all_characters = sorted({link["character"] for link in scene_character_links})
    labels = ["Character"] * len(all_characters)
    pd.DataFrame({"id": all_characters, "label": labels}).to_csv(f"{prefix}_characters_nodes.csv", index=False)
    print(f"{title} node list saved: {title.lower()}_characters_nodes.csv")

    # Draw the weighted co-occurrence graph; edge width encodes the weight
    G = nx.Graph()
    for edge in edges:
        G.add_edge(edge["source"], edge["target"], weight=edge["weight"])
    fig = plt.figure(figsize=(10, 8))
    pos = nx.spring_layout(G, seed=42)  # fixed seed keeps the layout stable
    weights = [G[u][v]['weight'] for u, v in G.edges()]
    nx.draw_networkx(G, pos, with_labels=True, width=weights, node_size=800, font_size=10)
    plt.title(f"{title} Character Co-occurrence Network")
    plt.axis('off')
    plt.savefig(f"{prefix}_network_plot.png")
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures
    plt.close(fig)

    # Export Neo4j-compatible CSV files
    pd.DataFrame({"name": all_characters, "label": labels}).to_csv(f"{prefix}_neo4j_nodes.csv", index=False)
    edges_frame.to_csv(f"{prefix}_neo4j_edges.csv", index=False)
    print(f"{title} Neo4j CSV files saved")


## Character Networks and Rhyme Patterns

In [None]:
# Per-play extraction results, keyed by play title; dumped to JSON at the end
analysis_results = {}

# Process each XML file listed in xml_files
for title, path in xml_files.items():
    print(f"\nProcessing: {title}")
    tree = parse_xml(path)

    # Extract core elements from the XML
    scenes = extract_scenes(tree)
    speakers = extract_speakers(tree)
    lines_text = extract_lines(tree)
    rhyme_words = extract_rhyme_words(lines_text)
    # Reduce each rhyme word to its lower-cased ending (default: last 3 characters)
    rhyme_parts = [get_rhyme_part(word) for word in rhyme_words]

    # Save extracted data into a dictionary
    # (the full rhyme words are not stored, only their endings)
    analysis_results[title] = {
        "scenes": scenes,
        "speakers": speakers,
        "lines_text": lines_text,
        "rhyme_parts": rhyme_parts
    }

    # Short console summary: counts plus the first five items of each list
    print(f"- Scenes ({len(scenes)}): {scenes[:5]}")
    print(f"- Unique Speakers ({len(speakers)}): {speakers[:5]}")
    print(f"- Total Lines: {len(lines_text)}")
    print(f"- Sample Rhyme parts: {rhyme_parts[:5]}")

    # Enhance the XML and analyze network
    # enhance_tei modifies `tree` in place and writes it to a new file;
    # analyze_network then re-parses that enhanced file from disk.
    enhanced_xml_path = enhance_tei(tree, path)
    analyze_network(enhanced_xml_path, title)

# Save the structured data to JSON
# (ensure_ascii=False keeps non-ASCII characters readable in the output file)
with open("faust_struct_data.json", "w", encoding="utf-8") as f:
    json.dump(analysis_results, f, ensure_ascii=False, indent=2)
print("Structured data saved: faust_struct_data.json")
