In [1]:
import json
from lxml import etree
import re
import string

In [2]:
# Load all the "speaker" elements from the XML file.
# The path is written with forward slashes so that it resolves on every operating
# system; a backslash is a path separator on Windows only.
tree = etree.parse("../Texts/the_double_dealer.xml")
root = tree.getroot()

# Prefix-to-URI mapping needed by every XPath query against the TEI document
tei_namespace = {"tei": "http://www.tei-c.org/ns/1.0"}
elements = root.xpath("//tei:speaker", namespaces=tei_namespace)

In [3]:
# Number of "speaker" elements found in the whole document
len(elements)

884

In [4]:
# Gather every distinct label that occurs in a "speaker" tag (surrounding
# whitespace removed) and keep the labels in sorted order
speakers_list = sorted({element.text.strip() for element in elements})

In [5]:
# Number of distinct speaker labels (after stripping whitespace)
len(speakers_list)

50

In [6]:
# Show the distinct labels in sorted order
speakers_list

['All.',
 'Boy.',
 'Brisk',
 'Brisk.',
 'Care.',
 'Cyn.',
 'Cynt.',
 'Cynth.',
 'Cynthia.',
 'Foot.',
 'L. Froth.',
 'L. P.',
 'L. Touch.',
 'Lady F.',
 'Lady Fr,',
 'Lady Fr.',
 'Lady P.',
 'Lady Pl.',
 'Lady T.',
 'Lady. T.',
 'Ld F.',
 'Ld Fr.',
 'Ld T.',
 'Ld.',
 'Ld. F.',
 'Ld. Froth',
 'Ld. Froth.',
 'Ld. T.',
 'Ld. Touch.',
 'Ldy P.',
 'Ldy T.',
 'Ldy. F.',
 'Ldy. Froth.',
 'Ldy. Ply.',
 'Ldy. T.',
 'Lord F.',
 'Lord Fr.',
 'Lord T.',
 'Mal.',
 'Mas.',
 'Mask',
 'Mask.',
 'Mel.',
 'Mell.',
 'Melle.',
 'Mr. Saygrace,',
 'Sayg.',
 'Saygrace',
 'Sir P.',
 'Sir Paul.']

In [7]:
# Canonical, lower-case name of every speaker.
# Each name becomes a key of character_data, so the order of this list is also
# the order of the keys in that dictionary.
all_characters = [
    "all",
    "boy",
    "brisk",
    "careless",
    "cynthia",
    "footman",
    "lady froth",
    "lady plyant",
    "lady touchwood",
    "lord froth",
    "lord touchwood",
    "mellefont",
    "maskwell",
    "mr. saygrace",
    "sir paul plyant",
]

In [8]:
# Every spelling under which a character appears in the "speaker" tags of the
# XML file, keyed by the character's canonical name
aliases_by_character = {
    "all": ["All."],
    "boy": ["Boy."],
    "brisk": ["Brisk", "Brisk."],
    "careless": ["Care."],
    "cynthia": ["Cyn.", "Cynt.", "Cynth.", "Cynthia."],
    "footman": ["Foot."],
    "lady froth": ["L. Froth.", "Lady F.", "Lady Fr,", "Lady Fr.", "Ldy. F.", "Ldy. Froth."],
    "lady plyant": ["L. P.", "Lady P.", "Lady Pl.", "Ldy P.", "Ldy. Ply."],
    "lady touchwood": ["L. Touch.", "Lady T.", "Lady. T.", "Ldy T.", "Ldy. T."],
    "lord froth": ["Ld F.", "Ld Fr.", "Ld. F.", "Ld. Froth", "Ld. Froth.", "Ld.", "Lord F.", "Lord Fr."],
    "lord touchwood": ["Ld T.", "Ld. T.", "Ld. Touch.", "Lord T."],
    "mellefont": ["Mal.", "Mel.", "Mell.", "Melle."],
    "maskwell": ["Mas.", "Mask", "Mask."],
    "mr. saygrace": ["Mr. Saygrace,", "Sayg.", "Saygrace"],
    "sir paul plyant": ["Sir P.", "Sir Paul."],
}

# Dictionary that stores, per character, the spellings of the name as they
# appear in the XML file; every character starts out with no aliases
character_data = {name: {"alias": []} for name in all_characters}

# Fill in the aliases and report any character for which no spellings are listed
for name, entry in character_data.items():
    if name in aliases_by_character:
        entry["alias"] = list(aliases_by_character[name])
    else:
        print(f"Unrecognised character: {name}")

# Count "sp" elements that have no usable "speaker" tag
no_speaker_counter = 0

# Count, per label, the "sp" elements whose speaker label matches no alias;
# their text cannot be filed under any character
unmatched_speakers = {}

# Label of the most recent speaker. It is initialised explicitly so that the
# fallback below never depends on a value left over from an earlier cell
speaker = None

# Add lines spoken by each character in each act in character_data
for act in root.xpath("//tei:div[@type='act']", namespaces=tei_namespace):

    # Store the number of the act and the key under which its speeches are filed
    act_num = act.get("n")
    act_key = f"act_{act_num}"

    # Search below the current act element itself rather than looking the act up
    # again by its number, so the query does not depend on "n" being present,
    # numeric or unique
    for sp_tag in act.xpath(".//tei:sp", namespaces=tei_namespace):

        # The lines are stored in one or multiple "l" or "p" tags
        # Collect their text nodes, ignoring "speaker" and "stage" tags
        fragments = []
        text_nodes = sp_tag.xpath(
            ".//text()[not(parent::tei:speaker or parent::tei:stage)]",
            namespaces=tei_namespace,
        )
        for text_node in text_nodes:
            fragment = text_node.strip()
            if not fragment:
                continue
            # Put a whitespace before each fragment to keep words apart, unless
            # the fragment starts with a punctuation mark other than "&" or "("
            if fragment[0] in string.punctuation and fragment[0] not in ("&", "("):
                fragments.append(fragment)
            else:
                fragments.append(" " + fragment)
        lines = "".join(fragments)

        # Remove the stage directions in square brackets
        # and then get rid of the extra whitespaces in between words
        lines = re.sub(r"\[.*?\]", "", lines)
        lines = re.sub(r"\s+", " ", lines)

        # Strip last: both the concatenation above and the removal of a bracketed
        # direction at either end of the speech can leave whitespace at the edges
        lines = lines.strip()

        # Find the speaker of this speech
        # If no "speaker" tag (or no text inside it) is found, it suggests
        # continuation of speech from the last speaker, so the previous speaker
        # will be assigned to the text
        speaker_tag = sp_tag.find(".//tei:speaker", namespaces=tei_namespace)
        has_label = speaker_tag is not None and speaker_tag.text is not None
        if has_label:
            speaker = speaker_tag.text.strip()
        else:
            no_speaker_counter += 1
            if speaker is None:
                # There is no earlier speech to continue, so the text has no owner
                print(f"No speaker found and no previous speaker to fall back on (act {act_num}). The text was not stored.\n")
                continue

        # Every character whose aliases contain the label receives the lines
        owners = [
            character
            for character in character_data
            if speaker in character_data[character]["alias"]
        ]
        if not owners:
            # Report the label instead of silently dropping the speech
            unmatched_speakers[speaker] = unmatched_speakers.get(speaker, 0) + 1
            continue

        if not has_label:
            print(f"No speaker found. The text will automatically be assigned to {owners[-1].title()}. \nCurrent sections of text without a speaker: {no_speaker_counter}\n")

        for character in owners:
            character_data[character].setdefault(act_key, []).append(lines)

print(f"Total sections of text without a speaker: {no_speaker_counter}")
if unmatched_speakers:
    print(f"Unrecognised speaker labels (text not stored), with number of speeches: {unmatched_speakers}")

Total sections of text without a speaker: 0


In [9]:
# Show the collected data: for every character, the aliases plus one list of
# speeches per act key ("act_<n>")
character_data

{'all': {'alias': ['All.'], 'act_5': ["What's the matter?"]},
 'boy': {'alias': ['Boy.'],
  'act_3': ["'Tis directed to your Worship.", 'No, an please you.']},
 'brisk': {'alias': ['Brisk', 'Brisk.'],
  'act_1': ["Boys, Boys, Lads, where are you? What do you give ground? Mortgage for a Bottle, ha? Careless, this is your trick; you're always spoiling Company by leaving it.",
   "Pooh, ha, ha, ha, I know you envy me. Spite, proud spite, by the Gods! and burning envy. — I'le be judged by Mellefont here, who gives and takes Raillery better, you or I. Pox, Man, when I say you spoil Company by leaving it, I mean you leave no body for the Company to Laugh at. I think there I was with you 〈◊〉 Mellefont.",
   "Oh, my dear Mellefont, let me perish, if thou art not the Soul of Conversation, the very Essence of Wit, and Spirit of Wine, — the Deuce take me if there were three good things said; or one, understood, since thy Amputation from the body of our Society. — He, I think that's pretty and Met

In [10]:
# Check whether text has been collected in character_data from every single "sp" tag
# Prints "True" if everything is working as intended

# Flatten the speeches of every character and act into one list; the "alias"
# entry holds name spellings rather than speeches, so it is left out
all_lines = [
    speech
    for record in character_data.values()
    for field, speeches in record.items()
    if field != "alias"
    for speech in speeches
]
print(len(elements) + no_speaker_counter == len(all_lines))

True


In [11]:
# Serialise the collected data first, then write the resulting JSON text to disk
serialised = json.dumps(character_data)
with open(r"../JSON/the_double_dealer.json", "w") as json_file:
    json_file.write(serialised)