In [1]:
%pip install spacy
%python -m spacy download de_core_news_sm


Collecting spacy
  Downloading spacy-3.8.7-cp311-cp311-win_amd64.whl.metadata (28 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.13-cp311-cp311-win_amd64.whl.metadata (2.2 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.11-cp311-cp311-win_amd64.whl.metadata (8.8 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.10-cp311-cp311-win_amd64.whl.metadata (2.5 kB)
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Downloading thinc-8.3.6-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Downloading srsly-2.5.1-cp311-cp311-win_amd6

UsageError: Line magic function `%python` not found (But cell magic `%%python` exists, did you mean that instead?).


In [3]:
!python -m spacy download de_core_news_sm


Collecting de-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.8.0/de_core_news_sm-3.8.0-py3-none-any.whl (14.6 MB)
     ---------------------------------------- 0.0/14.6 MB ? eta -:--:--
     ---- ----------------------------------- 1.6/14.6 MB 10.5 MB/s eta 0:00:02
     ----------- ---------------------------- 4.2/14.6 MB 12.0 MB/s eta 0:00:01
     ------------------- -------------------- 7.1/14.6 MB 12.1 MB/s eta 0:00:01
     -------------------------- ------------- 9.7/14.6 MB 12.1 MB/s eta 0:00:01
     -------------------------------- ------ 12.3/14.6 MB 12.2 MB/s eta 0:00:01
     --------------------------------------  14.4/14.6 MB 12.2 MB/s eta 0:00:01
     --------------------------------------- 14.6/14.6 MB 11.5 MB/s eta 0:00:00
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('

In [4]:
from lxml import etree
import spacy
from collections import Counter, defaultdict

# Files to process (part one and part two of the play).
file_paths = [
    "data/Faust._Der_Tragoedie_erster_Teil.11g9p.0.xml",
    "data/Faust._Der_Tragoedie_zweiter_Teil.11d12.0.xml"
]

# Load the German pipeline and its stop-word list once, up front.
nlp = spacy.load("de_core_news_sm")
stopwords = set(nlp.Defaults.stop_words)

# Loop-invariant XML constants: the TEI namespace map used by every XPath
# query and the fully qualified name of the xml:id attribute.
ns = {"tei": "http://www.tei-c.org/ns/1.0"}
XML_ID = "{http://www.w3.org/XML/1998/namespace}id"

# Label used for a <sp> element that has no usable <speaker> text.
UNKNOWN_SPEAKER = "UNKNOWN"


def _first_text(node, path, default):
    """Return the first text node matched by ``path`` below ``node``.

    Args:
        node: lxml element the XPath expression is evaluated against.
        path: XPath expression selecting text nodes (uses the ``tei`` prefix).
        default: value returned when the expression matches nothing.

    Returns:
        The first matching text node, or ``default`` if there is none.
    """
    matches = node.xpath(path, namespaces=ns)
    return matches[0] if matches else default


def _collect_scene_stats(scene):
    """Count verse lines and lemma frequencies per speaker within one scene.

    Args:
        scene: lxml element of a ``<div type="scene">``.

    Returns:
        A defaultdict mapping speaker name to
        ``{"lines": int, "words": Counter}``. Only lowercased, purely
        alphabetic lemmas that are not stop words are counted.
    """
    speaker_stats = defaultdict(lambda: {"lines": 0, "words": Counter()})

    for sp in scene.xpath(".//tei:sp", namespaces=ns):
        speaker = _first_text(sp, "./tei:speaker/text()", UNKNOWN_SPEAKER).strip()

        # Iterate over <l> elements rather than their text nodes: a verse that
        # contains inline markup has several text nodes and would otherwise be
        # counted as several lines, with the nested text dropped.
        # NOTE(review): only <l> directly below <sp> is considered; verses
        # wrapped in another element (e.g. <lg>) are not counted - confirm
        # against the corpus.
        for verse in sp.xpath("./tei:l", namespaces=ns):
            text = verse.xpath("string()")
            if not text.strip():
                continue

            # Look the entry up only once a real line exists, so speakers
            # without any verse do not get an empty record.
            stats = speaker_stats[speaker]
            stats["lines"] += 1

            for token in nlp(text):
                w = token.lemma_.lower()
                if w.isalpha() and w not in stopwords:
                    stats["words"][w] += 1

    return speaker_stats


all_results = []

for xml_file in file_paths:
    tree = etree.parse(xml_file)

    # Per-scene records for the current file only.
    data = []
    for scene in tree.xpath("//tei:div[@type='scene']", namespaces=ns):
        data.append({
            "scene_id": scene.get(XML_ID),
            # None when the scene has no <head> text (instead of IndexError).
            "scene_title": _first_text(scene, "./tei:head/text()", None),
            "speaker_stats": _collect_scene_stats(scene)
        })

    # Append once per file, after all of its scenes have been collected.
    # Appending inside the scene loop would add one duplicate entry per scene.
    all_results.append({
        "file": xml_file,
        "scene_data": data
    })

# all_results now holds one entry per input file (both parts of the play).


In [5]:
import os
import json


# Write the scene statistics of every processed book to its own JSON file so
# they can be reloaded later without parsing the XML again.
for book in all_results:
    # The output name is the source XML file name without its extension.
    xml_name = os.path.basename(book["file"])
    stem = os.path.splitext(xml_name)[0]
    out_path = f"data/{stem}_scene_data.json"

    scenes = book["scene_data"]
    with open(out_path, "w", encoding="utf-8") as fp:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(scenes, fp, ensure_ascii=False, indent=2)

    print(f"Exported {len(scenes)} scenes to {out_path}")