In [1]:
import json
from lxml import etree
import re
import string

In [2]:
# Load all the "speaker" elements from the xml file
# The path is written with forward slashes, like the JSON output path at the end of
# the notebook, so that it resolves on every operating system
# (backslashes are only treated as path separators on Windows)
tree = etree.parse("../Texts/love_for_love.xml")
root = tree.getroot()

# Prefix-to-URI mapping needed by every XPath query on this TEI document
tei_namespace = {"tei": "http://www.tei-c.org/ns/1.0"}
elements = root.xpath("//tei:speaker", namespaces=tei_namespace)

In [3]:
# Number of "speaker" elements found in the XML file
len(elements)

1162

In [4]:
# Collect every distinct name that appears in the speaker tags, in sorted order
# (surrounding whitespace is stripped before names are compared)
speakers_list = sorted({element.text.strip() for element in elements})

In [5]:
# Number of distinct speaker names found in the speaker tags
len(speakers_list)

30

In [6]:
# Show the distinct speaker names so that spelling variants of the same character can be spotted
speakers_list

['Ang.',
 'Angl.',
 'Ben.',
 'Buck.',
 'Buckr.',
 'Fore.',
 'Frail.',
 'Ienny.',
 'Ier.',
 'Iere',
 'Iere.',
 'Iore.',
 'Miss Pru.',
 'Miss.',
 'Mrs Fore.',
 'Mrs. Fore.',
 'Mrs. Frail.',
 'Nurse.',
 'Off.',
 'Scan.',
 'Scan. aside.',
 'Serv.',
 'Sir Sam.',
 'Sir Samp',
 'Sir Samp.',
 'Taff.',
 'Tat.',
 'Tatt.',
 'Trap.',
 'Val.']

In [7]:
# Lower-case names of the characters of the play
# Each name becomes a key of character_data, where its aliases and lines are stored
all_characters = [
    "angelica",
    "ben",
    "buckram",
    "foresight",
    "jenny",
    "jeremy",
    "miss prue",
    "mrs. foresight",
    "mrs. frail",
    "nurse",
    "officer",
    "scandal",
    "servant",
    "sir sampson legend",
    "tattle",
    "trapland",
    "valentine",
]

In [8]:
# Spellings under which each character appears in the "speaker" tags of the XML file
known_aliases = {
    "angelica": ["Ang.", "Angl."],
    "ben": ["Ben."],
    "buckram": ["Buck.", "Buckr."],
    "foresight": ["Fore."],
    "jenny": ["Ienny."],
    "jeremy": ["Ier.", "Iere", "Iere.", "Iore."],
    "miss prue": ["Miss Pru.", "Miss."],
    "mrs. foresight": ["Mrs Fore.", "Mrs. Fore."],
    "mrs. frail": ["Frail.", "Mrs. Frail."],
    "nurse": ["Nurse."],
    "officer": ["Off."],
    "scandal": ["Scan.", "Scan. aside."],
    "servant": ["Serv."],
    "sir sampson legend": ["Sir Sam.", "Sir Samp", "Sir Samp."],
    "tattle": ["Taff.", "Tat.", "Tatt."],
    "trapland": ["Trap."],
    "valentine": ["Val."],
}

# One record per character, keyed by the character's name
# Every record starts out with an empty alias list
character_data = {name: {"alias": []} for name in all_characters}

# Fill in the aliases; a character without an entry in the table keeps the
# empty list and is reported
for name, record in character_data.items():
    aliases = known_aliases.get(name)
    if aliases is None:
        print(f"Unrecognised character: {name}")
    else:
        record["alias"] = list(aliases)

def _speech_text(sp_tag):
    """Return the words spoken inside one "sp" element as a single string.

    Text nodes whose direct parent is a "speaker" or "stage" tag are left out,
    stage directions written in square brackets are removed, and runs of
    whitespace are collapsed to single spaces.
    """
    fragments = sp_tag.xpath(
        ".//text()[not(parent::tei:speaker or parent::tei:stage)]",
        namespaces=tei_namespace,
    )

    text = ""
    for fragment in fragments:
        fragment = fragment.strip()
        if not fragment:
            continue
        # Join the fragments with a whitespace, except before a punctuation mark
        # (the marks "&" and "(" are exceptions and still get a whitespace)
        if fragment[0] in string.punctuation and fragment[0] not in ("&", "("):
            text += fragment
        else:
            text += " " + fragment

    # Remove the stage directions in square brackets
    # and then get rid of the extra whitespaces in between words
    text = re.sub(r"\[.*?\]", "", text)
    text = re.sub(r"\s+", " ", text)

    # Strip only at the very end: removing a bracketed direction at the start or
    # the end of a speech would otherwise leave a stray whitespace behind
    return text.strip()


# Map every alias to the character(s) that list it, so that a speaker is resolved
# with a single lookup instead of a scan over all characters for every speech
alias_to_characters = {}
for character, data in character_data.items():
    for alias in data["alias"]:
        alias_to_characters.setdefault(alias, []).append(character)

# Count elements with no "speaker" tag
no_speaker_counter = 0

# Alias of the most recent speaker
# It is reused when an "sp" tag has no usable "speaker" tag and stays None until
# the first speaker has been read, so no value from an earlier cell can leak in
speaker = None

# Add lines spoken by each character in each act in character_data
for act in root.xpath("//tei:div[@type='act']", namespaces=tei_namespace):

    # Store the number of the act and the key under which its lines are saved
    act_num = act.get("n")
    act_key = f"act_{act_num}"

    # Search inside the current act element instead of querying the whole
    # document again by act number
    for sp_tag in act.xpath(".//tei:sp", namespaces=tei_namespace):
        lines = _speech_text(sp_tag)

        # Find the speaker of the lines
        # If no "speaker" tag (or no text in it) is found, it suggests continuation
        # of speech from the last speaker, so the previous speaker is kept
        speaker_tag = sp_tag.find(".//tei:speaker", namespaces=tei_namespace)
        speaker_missing = speaker_tag is None or speaker_tag.text is None
        if speaker_missing:
            no_speaker_counter += 1
        else:
            speaker = speaker_tag.text.strip()

        owners = alias_to_characters.get(speaker, [])

        # Report text that cannot be attributed instead of dropping it silently
        if not owners:
            print(f"Could not attribute a section of text in act {act_num} (speaker: {speaker!r}). The text was skipped.\n")
            continue

        if speaker_missing:
            print(f"No speaker found. The text will automatically be assigned to {owners[-1].title()}. \nCurrent sections of text without a speaker: {no_speaker_counter}\n")

        # Assign the lines to every character that lists the alias
        for character in owners:
            character_data[character].setdefault(act_key, []).append(lines)

print(f"Total sections of text without a speaker: {no_speaker_counter}")

No speaker found. The text will automatically be assigned to Valentine. 
Current sections of text without a speaker: 1

Total sections of text without a speaker: 1


In [9]:
# Show the collected data: the aliases of each character and the lines stored per act
character_data

{'angelica': {'alias': ['Ang.', 'Angl.'],
  'act_2': ["Is not it a good hour for Pleasure too? Uncle, pray lend me your Coach, mine's out of Order.",
   'Well, but I can neither make you a Cuckold, Uncle, by going abroad; nor secure you from being one, by staying at home.',
   "But my Inclinations are in force, I have a mind to go abroad; and if you won't lend me your Coach, I'll take a Hackney, or a Chair, and leave you to erect a Scheme, and find who's in Conjunction with your Wife. Why don't you keep her at Home, if you're Jealous when she's abroad? You know my Aunt is a little Retrograde (as you call it) in her Nature. Uncle, I'm afraid you are not Lord of the Ascendant, ha, ha, ha.",
   "Nay Uncle, don't be angry— If you are, I'll reap up all your false Prophecies, ridiculous Dreams, and idle Divinations. I'll swear you are a Nusance to the Neighbourhood— What a Bustle did you keep against the last Invisible Eclipse, laying in Provision as 'twere for a Siege? What a World of Fire 

In [10]:
# Check whether text has been collected in character_data from every single "sp" tag
# Every value stored under a key other than "alias" is a list of speeches;
# all of them are gathered into one flat list
# Prints "True" if everything is working as intended
all_lines = [
    speech
    for record in character_data.values()
    for field, speeches in record.items()
    if field != "alias"
    for speech in speeches
]
expected_total = len(elements) + no_speaker_counter
print(expected_total == len(all_lines))

True


In [11]:
# Save the collected character data as a JSON file
with open(r"../JSON/love_for_love.json", "w") as json_file:
    json_file.write(json.dumps(character_data))