In [12]:
import json
from lxml import etree
import re
import string

In [13]:
# Load all the "speaker" elements from the XML file.
# The relative path uses forward slashes so that it resolves on every operating
# system (a backslash-separated path is only understood on Windows).
tree = etree.parse("../Texts/the_old_bachelor.xml")
root = tree.getroot()
# Prefix-to-URI mapping required by every XPath query on this TEI document
tei_namespace = {"tei": "http://www.tei-c.org/ns/1.0"}
elements = root.xpath("//tei:speaker", namespaces=tei_namespace)

In [14]:
# Number of "speaker" elements found in the whole document
len(elements)

931

In [15]:
# Gather the distinct labels that occur inside the speaker tags
# (surrounding whitespace removed) and keep them in sorted order
speakers_list = sorted({element.text.strip() for element in elements})

In [16]:
# Number of distinct speaker labels
len(speakers_list)

33

In [17]:
# Display the sorted, distinct speaker labels
speakers_list

['Aram.',
 'Bar.',
 'Belin.',
 'Bell',
 'Bell.',
 'Betty.',
 'Bluff.',
 'Bluffe.',
 'Boy.',
 'Fond.',
 'Fondl.',
 'Foot.',
 'Heart.',
 'Laet.',
 'Lcuy.',
 'Lucy,',
 'Lucy.',
 'Salv.',
 'Serv.',
 'Set.',
 'Setter.',
 'Sharp.',
 'Sharper.',
 'Silo.',
 'Silv.',
 'Silvia.',
 'Sir Io.',
 'Sir Ios.',
 'Sir Jo.',
 'Sir. Io.',
 'Sr. Jo.',
 'Sylv.',
 'Vain.']

In [18]:
# Lower-case names of all characters, one per line, in alphabetical order.
# Splitting on line breaks (not on spaces) keeps multi-word names intact.
all_characters = """\
araminta
barnaby
belinda
bellmour
betty
boy
captain bluffe
fondlewife
footman
heartwell
laetitia
lucy
servant
setter
sharper
silvia
sir joseph
vainlove
""".splitlines()

In [19]:
# Speaker labels ("aliases") exactly as they appear in the XML "speaker" tags,
# keyed by the character they belong to. Every abbreviation and variant
# spelling found in the text is listed (e.g. "Lcuy." for Lucy).
character_aliases = {
    "araminta": ["Aram."],
    "barnaby": ["Bar."],
    "belinda": ["Belin."],
    "bellmour": ["Bell", "Bell."],
    "betty": ["Betty."],
    "boy": ["Boy."],
    "captain bluffe": ["Bluff.", "Bluffe."],
    "fondlewife": ["Fond.", "Fondl."],
    "footman": ["Foot."],
    "heartwell": ["Heart."],
    "laetitia": ["Laet."],
    "lucy": ["Lcuy.", "Lucy,", "Lucy."],
    "servant": ["Serv."],
    "setter": ["Set.", "Setter."],
    "sharper": ["Sharp.", "Sharper."],
    "silvia": ["Salv.", "Silo.", "Silv.", "Silvia.", "Sylv."],
    "sir joseph": ["Sir Io.", "Sir Ios.", "Sir Jo.", "Sir. Io.", "Sr. Jo."],
    "vainlove": ["Vain."],
}

# Make a dictionary for storing the speakers' names based on how they appear in the XML file
# Each entry starts with its "alias" list; "act_<n>" lists of speeches are added below
character_data = {}
for character in all_characters:
    if character not in character_aliases:
        print(f"Unrecognised character: {character}")
    character_data[character] = {"alias": list(character_aliases.get(character, []))}

# Reverse lookup (speaker label -> character) so that each speech is resolved
# with one dictionary access instead of scanning every alias list
alias_to_character = {
    alias: character
    for character, data in character_data.items()
    for alias in data["alias"]
}


def _speech_text(sp_tag):
    """Return the spoken text of one "sp" element as a single cleaned-up string.

    The lines are stored in one or multiple child tags; text whose parent is a
    "speaker" or "stage" tag is ignored. Stage directions in square brackets
    are removed, runs of whitespace are collapsed into one space and
    leading/trailing whitespace is stripped.
    """
    text_nodes = sp_tag.xpath(
        ".//text()[not(parent::tei:speaker or parent::tei:stage)]",
        namespaces=tei_namespace,
    )
    parts = []
    for text_node in text_nodes:
        fragment = text_node.strip()
        if not fragment:
            continue
        # Add a whitespace before the fragment to ensure proper formatting
        # Do not add a whitespace if the first character is a punctuation mark except specific ones
        if fragment[0] in string.punctuation and fragment[0] not in ("&", "("):
            parts.append(fragment)
        else:
            parts.append(" " + fragment)

    text = "".join(parts)
    # Remove the stage directions in square brackets
    # and then get rid of the extra whitespaces in between words
    text = re.sub(r"\[.*?\]", "", text)
    text = re.sub(r"\s+", " ", text)
    # Strip last: removing a bracketed direction at either end of the speech
    # can leave a space behind
    return text.strip()


# Count elements with no "speaker" tag
no_speaker_counter = 0

# Label of the most recent speaker. Initialised explicitly so that the fallback
# below can never pick up a stale value left over from another cell.
speaker = None

# Add lines spoken by each character in each act in character_data
for act in root.xpath("//tei:div[@type='act']", namespaces=tei_namespace):

    # Store the number of the act and the key under which its speeches are saved
    act_num = act.get("n")
    act_key = f"act_{act_num}"

    # Search inside the current act element itself instead of looking the act
    # up again by its number in the whole document
    for sp_tag in act.xpath(".//tei:sp", namespaces=tei_namespace):
        lines = _speech_text(sp_tag)

        # Find the speaker and assign the lines accordingly in character_data
        # If no "speaker" tag is found, it suggests continuation of speech from the last speaker
        # So the previous speaker will be assigned to the text
        speaker_tag = sp_tag.find(".//tei:speaker", namespaces=tei_namespace)
        if speaker_tag is not None and speaker_tag.text is not None:
            speaker = speaker_tag.text.strip()
        else:
            no_speaker_counter += 1
            previous_character = alias_to_character.get(speaker)
            if previous_character is None:
                # Nothing to fall back on: report the section instead of failing or guessing
                print(f"No speaker found and there is no known previous speaker. The text will be skipped. \nCurrent sections of text without a speaker: {no_speaker_counter}\n")
                continue
            print(f"No speaker found. The text will automatically be assigned to {previous_character.title()}. \nCurrent sections of text without a speaker: {no_speaker_counter}\n")

        character = alias_to_character.get(speaker)
        if character is None:
            # A label missing from the alias table would otherwise lose the speech silently
            print(f"Unrecognised speaker: {speaker!r}. The text will be skipped.")
            continue

        character_data[character].setdefault(act_key, []).append(lines)

print(f"Total sections of text without a speaker: {no_speaker_counter}")

Total sections of text without a speaker: 0


In [20]:
# Display the collected data: the aliases and the per-act lists of speeches of every character
character_data

{'araminta': {'alias': ['Aram.'],
  'act_2': ['Bless me! what have I said to move you thus?',
   "If Love be the Feeer which you mean; kind Heav'n avert the cure: Let me have Oil to feed that Flame and never let it be extinct, till I my self am Ashes.",
   "Fie, this is gross Affectation— A little of Bellmour's Company would change the Scene.",
   "I wonder Cousin you should imagine, I don't perceive you love him.",
   'Love a Man! yes, you would not love a Beast.',
   "Yes, yes, I can see something near it when you and Bellmour meet. You don't know that you dreamt of Bellmour last Night, and call'd him aloud in your sleep.",
   "But that's not all; you caught me in your Arms when you▪ named him, and press'd me to your Bosom— Sure if I had not pinch'd you till you wak'd, you had stisled me with Kisses.",
   'No Aspersion, Cousin, we are alone— Nay, I can tell you more.',
   'What, before you hear it?',
   'Ha, ha, ha, this is pleasant.',
   'Ha, ha, ha.',
   "Oh is it come out— Now you

In [21]:
# Check whether text has been collected in character_data from every single "sp" tag:
# the number of "speaker" elements plus the number of sections without a speaker
# has to equal the number of collected speeches
# Prints "True" if everything is working as intended
all_lines = [
    speech
    for data in character_data.values()
    for key, speeches in data.items()
    if key != "alias"
    for speech in speeches
]
print(len(elements) + no_speaker_counter == len(all_lines))

True


In [22]:
# Serialise character_data and save it as a JSON file
json_text = json.dumps(character_data)
with open(r"../JSON/the_old_bachelor.json", "w") as json_file:
    json_file.write(json_text)