In [1]:
file = "DRVIT_Ro_Roht.xml"

In [2]:
import dhlab as dh
from dhlab.nbtokenizer import tokenize

In [7]:
with open(file, encoding="utf-8") as fp:
    txt = fp.read()

In [4]:
len(tokenize(txt))

117666

In [None]:
from lxml import etree

In [None]:
# Load the XML data without the encoding declaration
xml_data = txt.replace('<?xml version="1.0" encoding="UTF-8"?>', '')
tree = etree.fromstring(xml_data)  # tree is actually the root element here

In [88]:
# Define both namespaces explicitly
namespaces = {
    'tei': 'http://www.tei-c.org/ns/1.0',
    'HIS': 'http://www.example.org/ns/HIS'
}

# Example: Extracting characters from <tei:castItem> and dialogues from <HIS:hisSp>
character_ids = [
     item.xpath('string()').strip()
    for item in tree.xpath('//tei:castItem', namespaces=namespaces)
]

# Extract dialogue or other elements in the HIS namespace
dialogues = [
    element.xpath('string()').strip()
    for element in tree.xpath('//HIS:hisSp', namespaces=namespaces)
]

#print("Characters:", character_ids)
#print("Dialogues:", dialogues)


In [89]:
characters = [x.split(",")[0] for x in character_ids]

In [90]:
characters

['JOHANNES ROSMER',
 'REBEKKA WEST',
 'REKTOR KROLL',
 'ULRIK BRENDEL',
 'PEDER MORTENSGÅRD',
 'MADAM HELSETH']

In [100]:
def clean_dialogue(d):
    # Split by newlines and strip extra whitespace
    parts = [part.strip() for part in d.split('\n') if part.strip()]
    character = parts[0]  # First line is likely the character
    text = " ".join(parts[1:])  # Join the rest as dialogue text
    return character, text

def clean_dialogue_no_stage(d):
    # Remove all <HIS:hisStage> elements from the dialogue element
    for stage in d.xpath('.//HIS:hisStage', namespaces=namespaces):
        stage.getparent().remove(stage)
    
    # Now get the dialogue text as before, ignoring removed parts
    text_content = d.xpath('string()').strip()
    
    # Split and clean as before
    parts = [part.strip() for part in text_content.split('\n') if part.strip()]
    character = parts[0]  # First line as character name
    text = " ".join(parts[1:])  # Remaining lines as dialogue
    return character, text
# Example usage

In [101]:
processed_dialogues = [clean_dialogue_no_stage(d) for d in tree.xpath('//HIS:hisSp', namespaces=namespaces)]

In [18]:
# Dictionary to store character collocations
collocations = {}
# Step 1: Build a dictionary of characters from <castItem>
character_ids = {item.get('xml:id'): item.text for item in tree.xpath('//castItem')}

# Step 2: Iterate over character speech blocks
for character_block in tree.xpath('//HIS:hisSp', namespaces=namespaces):
    character_id = character_block.get('who')
    character = character_ids.get(character_id, 'Unknown')  # Map ID to character name
    
    # Get all dialogue in <p> tags
    dialogue = " ".join(character_block.xpath('.//p/text()'))
    
    # Get all stage directions in HIS:hisStage tags
    actions = " ".join(character_block.xpath('.//HIS:hisStage/text()', namespaces=namespaces))
    
    # Append to collocations dictionary
    if character not in collocations:
        collocations[character] = {'dialogue': [], 'actions': []}
    collocations[character]['dialogue'].append(dialogue)
    collocations[character]['actions'].append(actions)

# Result: collocations with character text and actions
print(collocations)

{}


In [103]:
# Extract acts and scenes from the TEI XML
# Extract acts with their respective scenes based on 'n' attribute for act numbering
# Segment acts based on 'n' attribute and gather dialogues in each
acts = []
for act in tree.xpath('//tei:div[@type="act"]', namespaces=namespaces):
    act_number = act.get('n', 'Unknown')  # Get the act number
    act_name = f"Act {act_number}"
    
    # Gather all dialogues in this act, excluding <HIS:hisStage> elements
    act_text = []
    for d in act.xpath('.//HIS:hisSp', namespaces=namespaces):
        character, text = clean_dialogue_no_stage(d)  # Use the function to exclude stage directions
        act_text.append(f"{character}: {text}\n")
    
    # Append the entire act's text to the list
    acts.append({'act_name': act_name, 'text': " ".join(act_text)})

In [104]:
#acts

In [105]:
for act in acts:
    print(act['act_name'], len(act['text']))

Act 1 34910
Act 2 35356
Act 3 29199
Act 4 22668


In [110]:
print(acts[0]['text'])

MADAM HELSETH: Det er vel bedst, jeg begynder så småt at dække kveldsbordet, frøken?
 REBEKKA WEST: Ja, gør De det. Pastoren må vel snart komme.
 MADAM HELSETH: Trækker det ikke svært, dér frøkenen sidder?
 REBEKKA: Jo, lidt. Vil De kanske lukke.
 MADAM HELSETH: Men er det ikke pastoren, som går der borte?
 REBEKKA: Hvor?
 MADAM HELSETH: Nej tænk, frøken, – han begynder at gå møllevejen igen.
 REBEKKA: Han gik møllevejen i forgårs også.
 MADAM HELSETH: Våger han sig over kloppen?
 REBEKKA: Det er det, jeg vil se. ovenom idag også.
 MADAM HELSETH: Herregud, ja. Det må vel falde tungt for pastoren at træ’ over den kloppen. Dér, hvor sligt noget er sket, dér –
 REBEKKA: De hænger længe ved sine døde her på Rosmersholm.
 MADAM HELSETH: Jeg tror nu det, jeg, frøken, at det er de døde, som hænger så længe ved Rosmersholm.
 REBEKKA: De døde?
 MADAM HELSETH: Ja, det er næsten at sige, som om de ikke kunde komme sig helt bort ifra dem, som sidder igen.
 REBEKKA: Hvorledes falder De på det?
 MAD

In [95]:
import json


In [106]:
with open('Rosmersholm.json', "w") as fp:
    json.dump(acts, fp)