In [1]:
import re
import pandas as pd
import numpy as np
from IPython.display import clear_output
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import requests
from bs4 import BeautifulSoup
from lxml import etree

In [6]:
#function to get all proper nouns from a file
#takes xml doc and returns dataframe
def get_nps(file):
    """Collect every word tagged ``#n1-nn`` in a play into a DataFrame.

    ``#n1-nn`` is the ``ana`` value this notebook treats as marking a
    proper noun.

    Parameters
    ----------
    file : str or file-like
        Path (or open file) of the XML document, passed straight to
        ``lxml.etree.parse``.

    Returns
    -------
    pandas.DataFrame
        One row per matching ``<w>`` element, with the columns

        * ``Speaker`` - name of the speaker of the enclosing ``<sp>``;
          ``None`` when the word is not inside any ``<sp>``.
        * ``Line``    - the ``n`` attribute of the ``<w>`` element itself.
        * ``Term``    - the text of the ``<w>`` element.
        * ``Play``    - text of the first ``<title>`` in the document.

    Raises
    ------
    IndexError
        If the document contains no ``<title>`` element.
    """
    tree = etree.parse(file)
    # The first <title> in document order is used as the play's name.
    title = tree.xpath('//title')[0].text

    def speaker_label(speech):
        """Return a display name for the speaker of an ``<sp>`` element."""
        if speech is None:
            # Word sits outside every <sp>: no speaker can be attributed.
            return None
        speaker_el = speech.find('speaker')
        if speaker_el is None:
            # No <speaker> child: fall back to the who="" attribute.
            return speech.get('who')
        # Join the capitalised <w> children, e.g. "KING HENRY" -> "King Henry".
        # Children without text are skipped instead of raising.
        return ' '.join(
            w.text.capitalize()
            for w in speaker_el.findall('w')
            if w.text is not None
        )

    rows = {'Speaker': [], 'Line': [], 'Term': []}
    for word in tree.xpath('//w[@ana="#n1-nn"]'):
        # lxml elements know their ancestors, so the nearest enclosing <sp>
        # can be found directly; None means there is no such ancestor.
        speech = next(word.iterancestors('sp'), None)
        rows['Speaker'].append(speaker_label(speech))
        rows['Line'].append(word.get('n'))
        rows['Term'].append(word.text)

    df = pd.DataFrame.from_dict(rows)
    df['Play'] = title
    return df



def get_chars(file):
    """List the characters declared in a play's cast list.

    Parameters
    ----------
    file : str or file-like
        Path (or open file) of the XML document, passed straight to
        ``lxml.etree.parse``.

    Returns
    -------
    list of (str, str)
        ``(xml:id, role text)`` for every ``<castItem>`` under a
        ``<castList>`` that has at least one child element and whose
        ``xml:id`` has a lowercase second character. The role text is the
        whitespace-stripped result of ``get_text`` on the ``<role>`` child,
        or ``''`` when the item has no ``<role>``.
    """
    xml_id_attr = '{http://www.w3.org/XML/1998/namespace}id'
    tree = etree.parse(file)

    char_list = []
    for cast_item in tree.xpath('//castList//castItem'):
        # Items without child elements carry no <role> to read.
        if len(cast_item) == 0:
            continue
        c_id = cast_item.get(xml_id_attr)
        # Missing or one-character ids cannot pass the second-character
        # test; skip them instead of raising TypeError / IndexError.
        if c_id is None or len(c_id) < 2:
            continue
        # NOTE(review): the lowercase-second-letter rule presumably separates
        # individual characters from other cast entries - confirm against
        # the id scheme of the source texts.
        if not c_id[1].islower():
            continue
        role = cast_item.find('role')
        # Keep the id even when no <role> is present so the entry is not
        # silently lost; its name is then the empty string.
        name = get_text(role).strip() if role is not None else ''
        char_list.append((c_id, name))
    return char_list
        
#     x.text.replace('â\x80\x99', '\'')
#     x.replace('Ã©', 'e')
def get_text(role):
    """Flatten an element's own text and its children's text into one string.

    The pieces are taken in document order: the element's leading text,
    then for each direct child its text followed by its tail. Every piece
    is whitespace-stripped; empty pieces are dropped. Text nested more than
    one level below ``role`` is not included.

    Returns the non-empty pieces each followed by a single space, so a
    non-empty result always ends with a trailing space (callers strip it).
    Returns ``''`` when there is no text at all.
    """
    pieces = []
    if role.text is not None:
        pieces.append(role.text.strip())
    for child in role:
        # A child contributes its own text, then whatever follows its
        # closing tag (the tail) before the next sibling.
        for fragment in (child.text, child.tail):
            if fragment is not None:
                pieces.append(fragment.strip())
    # Blank fragments are filtered here, once, for all sources.
    return ''.join(piece + ' ' for piece in pieces if piece != '')

    

In [4]:
# Extract the tagged terms of every play, in a fixed order, into all_dfs.
all_dfs = [
    get_nps(xml_path)
    for xml_path in (
        'Initial_Texts/HenryIV(1).xml',
        'Initial_Texts/HenryIV(2).xml',
        'Initial_Texts/HenryV.xml',
        'Initial_Texts/HenryVI(1).xml',
        'Initial_Texts/HenryVI(2).xml',
        'Initial_Texts/HenryVI(3).xml',
        'Initial_Texts/John.xml',
        'Initial_Texts/RichardII.xml',
        'Initial_Texts/RichardIII.xml',
    )
]
# Give each play's frame its own name as well, matching the order above.
(
    HenryIV_1df, HenryIV_2df, HenryV_df,
    HenryVI_1df, HenryVI_2df, HenryVI_3df,
    John_df, RichardII_df, RichardIII_df,
) = all_dfs

In [7]:
# Read the cast list of every play, in a fixed order, into all_chars.
# The two Henry IV entries point at the HenryIV(...) files; they previously
# loaded the HenryVI(...) files, which duplicated the Henry VI casts and
# left the Henry IV characters out entirely.
all_chars = [
    get_chars(xml_path)
    for xml_path in (
        'Initial_Texts/HenryIV(1).xml',
        'Initial_Texts/HenryIV(2).xml',
        'Initial_Texts/HenryV.xml',
        'Initial_Texts/HenryVI(1).xml',
        'Initial_Texts/HenryVI(2).xml',
        'Initial_Texts/HenryVI(3).xml',
        'Initial_Texts/John.xml',
        'Initial_Texts/RichardII.xml',
        'Initial_Texts/RichardIII.xml',
    )
]
# Give each play's cast list its own name as well, matching the order above.
(
    HenryIV_1chars, HenryIV_2chars, HenryV_chars,
    HenryVI_1chars, HenryVI_2chars, HenryVI_3chars,
    John_chars, RichardII_chars, RichardIII_chars,
) = all_chars

In [14]:
# Flatten the per-play cast lists into one list of (xml:id, role text) tuples.
all_char_list = [inner for outer in all_chars for inner in outer]
character_df = pd.DataFrame(all_char_list)
# The context manager writes and closes the workbook on exit, even if
# to_excel raises. ExcelWriter.save() no longer exists in pandas >= 2.0,
# and sheet_name is passed by keyword because positional use is deprecated.
with pd.ExcelWriter('characters.xlsx') as writer:
    character_df.to_excel(writer, sheet_name='Sheet1')