In [4]:
from lxml import etree
import os

# Correct file path – make sure this file really lives in your notebook's CWD
file_path = "data\Faust._Der_Tragoedie_erster_Teil.11g9p.0.xml"

# Parse XML
tree = etree.parse(file_path)
root = tree.getroot()

# Check what tags exist
tags = set(elem.tag for elem in root.iter())
for tag in tags:
    print(tag)

{http://www.tei-c.org/ns/1.0}notesStmt
{http://www.tei-c.org/ns/1.0}pubPlace
{http://www.tei-c.org/ns/1.0}note
{http://www.tei-c.org/ns/1.0}author
{http://www.tei-c.org/ns/1.0}sourceDesc
{http://www.tei-c.org/ns/1.0}langUsage
{http://www.tei-c.org/ns/1.0}teiCorpus
{http://www.tei-c.org/ns/1.0}teiHeader
{http://www.tei-c.org/ns/1.0}ab
{http://www.tei-c.org/ns/1.0}titleStmt
{http://www.tei-c.org/ns/1.0}extent
{http://www.tei-c.org/ns/1.0}speaker
{http://www.tei-c.org/ns/1.0}TEI
{http://www.tei-c.org/ns/1.0}creation
{http://www.tei-c.org/ns/1.0}lg
{http://www.tei-c.org/ns/1.0}head
{http://www.tei-c.org/ns/1.0}text
{http://www.tei-c.org/ns/1.0}textClass
{http://www.tei-c.org/ns/1.0}l
{http://www.tei-c.org/ns/1.0}date
{http://www.tei-c.org/ns/1.0}resp
{http://www.tei-c.org/ns/1.0}language
{http://www.tei-c.org/ns/1.0}fileDesc
{http://www.tei-c.org/ns/1.0}lb
{http://www.tei-c.org/ns/1.0}keywords
{http://www.tei-c.org/ns/1.0}respStmt
{http://www.tei-c.org/ns/1.0}div
{http://www.tei-c.org/ns/1

In [5]:

# Register TEI namespace
ns = {'tei': "http://www.tei-c.org/ns/1.0"}# Check all div types
divs = tree.xpath("//tei:div", namespaces=ns)
types = set(div.get("type") for div in divs if div.get("type") is not None)

print("All div types found:", types)


All div types found: {'h4', 'front', 'text'}


# Extract Scene

In [6]:
# build a list of only those <div>s you want to call “Scene”
scene_divs = [d for d in divs if d.find("tei:head", namespaces=ns) is not None]
scenes = []
for i, div in enumerate(scene_divs, start=1):
    head = div.find("tei:head", namespaces=ns)
    scenes.append(head)
for scene in scenes:
    print("Scene:", scene.text)


Scene: Der Tragödie erster Teil.
Scene: Nacht.
Scene: Vor dem Tor.
Scene: Studierzimmer.
Scene: Studierzimmer.
Scene: Auerbachs Keller in Leipzig.
Scene: Hexenküche.
Scene: Straße.
Scene: Abend.
Scene: Spaziergang.
Scene: Der Nachbarin Haus.
Scene: Straße.
Scene: Garten.
Scene: Ein Gartenhäuschen.
Scene: Wald und Höhle.
Scene: Gretchens Stube.
Scene: Marthens Garten.
Scene: Am Brunnen.
Scene: Zwinger.
Scene: Nacht.
Scene: Dom.
Scene: Walpurgisnacht.
Scene: Walpurgisnachtstraum
Scene: Trüber Tag. Feld.
Scene: Nacht. Offen Feld.
Scene: Kerker.


In [7]:
import spacy
# 2. 预加载德语分词模型和停止词（只做一次）
nlp = spacy.load("de_core_news_sm")
stopwords = set(nlp.Defaults.stop_words)

In [12]:
from collections import Counter

# collect speaker stats per scene (only divs that have a tei:head)
scene_divs = [d for d in divs 
              if d.find('tei:head', namespaces=ns) is not None]
scene_stats = []

for idx, div in enumerate(scene_divs, start=1):
    head = div.find('tei:head', namespaces=ns)
    title = head.text.strip() if head is not None else f"Scene {idx}"
    scene_info = {
        "scene_num": idx,
        "scene_title": title,
        "speakers": {}
    }

    for sp in div.findall('.//tei:sp', namespaces=ns):
        sp_el = sp.find('tei:speaker', namespaces=ns)
        if sp_el is None:
            continue
        speaker = sp_el.text.strip()
        stats = scene_info["speakers"].setdefault(
            speaker,
            {"lines": 0, "word_freq": Counter()}
        )

        for l in sp.findall('.//tei:l', namespaces=ns):
            text = (l.text or '').strip()
            if not text:
                continue
            stats["lines"] += 1
            doc = nlp(text)
            for tok in doc:
                if tok.is_alpha:
                    lemma = tok.lemma_.lower()
                    if lemma not in stopwords:
                        stats["word_freq"][lemma] += 1

    # convert Counter to regular dict for JSON serialization
    for s, info in scene_info["speakers"].items():
        # sort word_freq by descending count:
        info["word_freq"] = dict(
            sorted(
                info["word_freq"].items(),
                key=lambda item: item[1],
                reverse=True
            )
        )

    scene_stats.append(scene_info)

scene_stats



[{'scene_num': 1, 'scene_title': 'Der Tragödie erster Teil.', 'speakers': {}},
 {'scene_num': 2,
  'scene_title': 'Nacht.',
  'speakers': {'FAUST.': {'lines': 207,
    'word_freq': {'herz': 11,
     'geist': 9,
     'welt': 7,
     'erde': 6,
     'leben': 5,
     'sinn': 5,
     'heiß': 4,
     'natur': 4,
     'fühlen': 4,
     'herab': 3,
     'wissen': 3,
     'mensch': 3,
     'kraft': 3,
     'mund': 3,
     'erkennen': 3,
     'buch': 3,
     'freund': 3,
     'licht': 3,
     'weh': 3,
     'heißen': 3,
     'gott': 3,
     'eign': 3,
     'zeichen': 3,
     'seele': 3,
     'brust': 3,
     'all': 3,
     'neu': 3,
     'gefühl': 3,
     'kind': 3,
     'ernst': 3,
     'arm': 2,
     'tor': 2,
     'magister': 2,
     'doktor': 2,
     'schüler': 2,
     'verbrennen': 2,
     'weder': 2,
     'bild': 2,
     'hab': 2,
     'sagen': 2,
     'wort': 2,
     'papier': 2,
     'lieb': 2,
     'schweben': 2,
     'wiese': 2,
     'weben': 2,
     'dumpf': 2,
     'gewölb': 2,
    

In [13]:
import json

# If you haven't yet, rename the existing stats for book 1:
scene_stats_book1 = scene_stats

# After you load & process book 2 the same way, rename its stats:
# scene_stats_book2 = scene_stats

# Export both to JSON files
for stats, fname in [
    (scene_stats_book1, "book1_scene_stats.json"),
    # (scene_stats_book2, "book2_scene_stats.json"
]:
    with open(fname, "w", encoding="utf-8") as f:
        json.dump(stats, f, ensure_ascii=False, indent=2)

In [9]:
records = []
for scene in scenes:
    scene_id = scene.get('xml:id')
    for sp in scene.xpath('.//sp'):
        speaker = sp.xpath('./speaker/text()')[0]
        for line in sp.xpath('./l'):
            text = line.text.strip()
            records.append({'scene': scene_id, 'speaker': speaker, 'line': text})
df = pd.DataFrame(records)

NameError: name 'pd' is not defined

In [None]:
speakers = tree.xpath("//tei:sp/tei:speaker", namespaces=ns)
print(f"Found {len(speakers)} speakers.")
for speaker in speakers:
    print("Speaker:", speaker.text)

Found 902 speakers.
Speaker: FAUST.
Speaker: GEIST.
Speaker: FAUST
Speaker: GEIST.
Speaker: FAUST.
Speaker: GEIST.
Speaker: FAUST.
Speaker: GEIST.
Speaker: FAUST.
Speaker: GEIST.
Speaker: FAUST
Speaker: WAGNER.
Speaker: FAUST.
Speaker: WAGNER.
Speaker: FAUST.
Speaker: WAGNER.
Speaker: FAUST.
Speaker: WAGNER.
Speaker: FAUST.
Speaker: WAGNER.
Speaker: FAUST.
Speaker: WAGNER.
Speaker: FAUST.
Speaker: WAGNER.
Speaker: FAUST
Speaker: CHOR DER ENGEL.
Speaker: FAUST.
Speaker: CHOR DER WEIBER.
Speaker: CHOR DER ENGEL.
Speaker: FAUST.
Speaker: CHOR DER JÜNGER.
Speaker: CHOR DER ENGEL.
Speaker: EINIGE HANDWERKSBURSCHEN.
Speaker: ANDRE.
Speaker: DIE ERSTEN.
Speaker: EIN HANDWERKSBURSCH.
Speaker: ZWEITER.
Speaker: DIE ZWEITEN.
Speaker: EIN DRITTER.
Speaker: VIERTER.
Speaker: FÜNFTER.
Speaker: DIENSTMÄDCHEN.
Speaker: ANDRE.
Speaker: ERSTE.
Speaker: ANDRE.
Speaker: SCHÜLER.
Speaker: BÜRGERMÄDCHEN.
Speaker: ZWEITER SCHÜLER
Speaker: ERSTER.
Speaker: BÜRGER.
Speaker: BETTLER
Speaker: ANDRER BÜRGER.
Spe

In [None]:
lines = tree.xpath("//tei:l", namespaces=ns)
for line in lines[:10]:
    print("Line:", line.text)


Line: Habe nun, ach! Philosophie,
Line: Juristerei und Medizin,
Line: Und leider auch Theologie
Line: Durchaus studiert, mit heißem Bemühn.
Line: Da steh' ich nun, ich armer Tor,
Line: Und bin so klug als wie zuvor!
Line: Heiße Magister, heiße Doktor gar,
Line: Und ziehe schon an die zehen Jahr'
Line: Herauf, herab und quer und krumm
Line: Meine Schüler an der Nase herum –


In [None]:
lines_text = [line.text for line in lines if line.text is not None]
print(f"Total lines with text: {len(lines_text)}")

Total lines with text: 4334


# Rhyme Scheme Detection

In [None]:
lines_text = [line.text for line in lines]
# Remove None or empty lines
lines_text = [line for line in lines_text if line and line.strip()]

last_words = []
for line in lines_text:
    tokens = line.split()
    # look from the end for the first non-punctuation word
    while True:
    # pop tokens until a non-punctuation word is found
        tok = tokens.pop() 
        word = tok.strip('.,;:!?…–—-"\'()[]')
        if word:
            last_words.append(word)
            break


def get_rhyme_part(word):
    return word[-3:].lower()

rhyme_parts = [get_rhyme_part(word) for word in last_words]
print("Rhyme parts:", rhyme_parts)



Rhyme parts: ['hie', 'zin', 'gie', 'ühn', 'tor', 'vor', 'gar', 'ahr', 'umm', 'rum', 'nen', 'nen', 'fen', 'fen', 'fel', 'fel', 'sen', 'sen', 'ren', 'ren', 'eld', 'elt', 'ben', 'ben', 'und', 'und', 'eiß', 'eiß', 'elt', 'ält', 'men', 'men', 'ein', 'cht', 'cht', 'ier', 'mir', 'öhn', 'ehn', 'ben', 'ben', 'den', 'den', 'och', 'och', 'cht', 'cht', 'auf', 'ckt', 'auf', 'ckt', 'llt', 'pft', 'pft', 'elt', 'erz', 'mmt', 'erz', 'mmt', 'tur', 'ein', 'nur', 'ein', 'and', 'uch', 'and', 'nug', 'auf', 'auf', 'ist', 'ier', 'ärt', 'mir', 'ört', 'ick', 'nen', 'ück', 'nen', 'ieb', 'len', 'len', 'ieb', 'len', 'cht', 'gen', 'gen', 'cht', 'sen', 'tot', 'sen', 't!‹', 'ebt', 'ebt', 'gen', 'hen', 'gen', 'gen', 'gen', 'nur', 'tur', 'ens', 'ngt', 'ngt', 'ein', 'her', 'her', 'ein', 'gen', 'gen', 'gen', 'gen', 'mir', 'cht', 'det', 'len', 'eht', 'rab', 'an', 'ist', 'ich', 'ißt', 'len', 'len', 'ben', 'ben', 'mir', 'cht', 'gen', 'gen', 'nun', 'cht', 'uen', 'ehn', 'ehn', 'uen', 'ruf', 'huf', 'ben', 'ang', 'ang', 'ert', 

In [None]:
%pip install lxml pandas networkx matplotlib regex


Note: you may need to restart the kernel to use updated packages.


In [None]:
freq = df['speaker'].value_counts().reset_index()
freq.columns = ['speaker', 'count']

plt.figure(figsize=(8,6))
plt.bar(freq['speaker'], freq['count'])
plt.xticks(rotation=45, ha='right')
plt.title('Lines per Speaker')
plt.tight_layout()
plt.show()


NameError: name 'df' is not defined