In [1]:
import re
import pandas as pd
import numpy as np
from IPython.display import clear_output
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import requests
from bs4 import BeautifulSoup
from lxml import etree

In [2]:
#function to get all proper nouns from a file
#takes xml doc and returns dataframe
def _speaker_label(speech):
    """Return the speaker name for an <sp> element, or None if there is no speech.

    When the speech has a <speaker> child, the text of its <w> children is
    capitalized and joined with single spaces; otherwise the speech's `who`
    attribute is returned as-is.
    """
    if speech is None:
        return None
    speaker_el = speech.find('speaker')
    if speaker_el is None:
        return speech.get('who')
    # Skip <w> elements without text instead of failing on None.capitalize().
    return ' '.join(w.text.capitalize() for w in speaker_el.findall('w') if w.text)


def get_nps(file):
    """Collect every word tagged ana="#n1-nn" in an XML play.

    Parameters
    ----------
    file : anything accepted by ``lxml.etree.parse`` (the notebook passes paths).

    Returns
    -------
    pandas.DataFrame
        One row per matching <w> element with columns
        ``Speaker`` (see ``_speaker_label``; None if the word is not inside an <sp>),
        ``Line`` (the ``n`` attribute of the <w> element itself),
        ``Term`` (the element text) and
        ``Play`` (text of the first <title> element, or None if there is none).
    """
    tree = etree.parse(file)

    #get title
    title_nodes = tree.xpath('//title')
    title = title_nodes[0].text if title_nodes else None

    #start with list of all proper nouns, then add line # and speaker info to each
    records = []
    for word in tree.xpath('//w[@ana="#n1-nn"]'):
        # Nearest enclosing <sp>; lxml walks the ancestors from the parent upwards,
        # so no separate child->parent map is needed.
        speech = next(word.iterancestors('sp'), None)
        records.append({
            'Speaker': _speaker_label(speech),
            'Line': word.get('n'),
            'Term': word.text,
        })

    # Explicit columns keep the frame well-formed even when nothing matched.
    df = pd.DataFrame(records, columns=['Speaker', 'Line', 'Term'])
    df['Play'] = title
    return df



def get_chars(file):
    """Return ``(xml_id, role_text)`` tuples for the cast items of an XML play.

    Only <castItem> elements (anywhere under a <castList>) that have at least
    one child element and whose ``xml:id`` has a lowercase second character
    are kept.
    NOTE(review): the lowercase test presumably filters out all-caps group ids;
    confirm against the source files.

    Parameters
    ----------
    file : anything accepted by ``lxml.etree.parse``.

    Returns
    -------
    list of (str, str)
        The item's ``xml:id`` and the stripped text of its <role> child
        (empty string when the item has no <role>).
    """
    xml_id_attr = '{http://www.w3.org/XML/1998/namespace}id'
    char_list = []
    tree = etree.parse(file)
    #get list of characters
    for ci in tree.xpath('//castList//castItem'):
        if len(ci) == 0:
            continue
        c_id = ci.get(xml_id_attr)
        # Items without an id (or with a one-character id) cannot pass the
        # second-character test; skip them instead of raising.
        if c_id is None or len(c_id) < 2 or not c_id[1].islower():
            continue
        r = ci.find('role')
        role_text = get_text(r).strip() if r is not None else ''
        char_list.append((c_id, role_text))
    return char_list
        
#     x.text.replace('â\x80\x99', '\'')
#     x.replace('Ã©', 'e')
def get_text(role):
    """Flatten the text directly inside ``role`` into one string.

    Gathers the element's own leading text, then for each direct child its
    text and its tail, all whitespace-stripped. Empty fragments are dropped
    and every remaining fragment is followed by a single space, so a
    non-empty result always ends with a trailing space (callers strip it).
    Text nested deeper than the direct children is not visited.
    """
    pieces = []
    if role.text is not None:
        pieces.append(role.text.strip())
    for child in role:
        for fragment in (child.text, child.tail):
            if fragment is not None:
                pieces.append(fragment.strip())
    #combine the non-empty strings, each followed by a space
    return ''.join(piece + ' ' for piece in pieces if piece != '')

    

In [3]:
# Parse each history play into its own proper-noun frame (see get_nps),
# in the same order the individual variables are unpacked below.
all_dfs = [
    get_nps(xml_path)
    for xml_path in (
        'Initial_Texts/HenryIV(1).xml',
        'Initial_Texts/HenryIV(2).xml',
        'Initial_Texts/HenryV.xml',
        'Initial_Texts/HenryVI(1).xml',
        'Initial_Texts/HenryVI(2).xml',
        'Initial_Texts/HenryVI(3).xml',
        'Initial_Texts/HenryVIII.xml',
        'Initial_Texts/John.xml',
        'Initial_Texts/RichardII.xml',
        'Initial_Texts/RichardIII.xml',
    )
]
# Keep the per-play names available for later cells.
(HenryIV_1df, HenryIV_2df, HenryV_df,
 HenryVI_1df, HenryVI_2df, HenryVI_3df,
 HenryVIII_df, John_df, RichardII_df, RichardIII_df) = all_dfs

In [4]:
# Cast list for each play (see get_chars). Every *_chars variable reads the
# file of the play it is named after: the Henry IV lists come from
# HenryIV(n).xml, not from the similarly named HenryVI(n).xml files.
HenryIV_1chars = get_chars('Initial_Texts/HenryIV(1).xml')
HenryIV_2chars = get_chars('Initial_Texts/HenryIV(2).xml')
HenryV_chars = get_chars('Initial_Texts/HenryV.xml')
HenryVI_1chars = get_chars('Initial_Texts/HenryVI(1).xml')
HenryVI_2chars = get_chars('Initial_Texts/HenryVI(2).xml')
HenryVI_3chars = get_chars('Initial_Texts/HenryVI(3).xml')
HenryVIII_chars = get_chars('Initial_Texts/HenryVIII.xml')
John_chars = get_chars('Initial_Texts/John.xml')
RichardII_chars = get_chars('Initial_Texts/RichardII.xml')
RichardIII_chars = get_chars('Initial_Texts/RichardIII.xml')
all_chars = [HenryIV_1chars, HenryIV_2chars, HenryV_chars, HenryVI_1chars, HenryVI_2chars, HenryVI_3chars, HenryVIII_chars, John_chars, RichardII_chars, RichardIII_chars]

In [None]:
# all_char_list = [inner for outer in all_chars for inner in outer]
# character_df = pd.DataFrame(all_char_list)
# writer = pd.ExcelWriter('characters1.xlsx')
# character_df.to_excel(writer, 'Sheet1')
# writer.save()

In [7]:
# Characters whose 'Place' value is not the literal string 'None'.
# NOTE: `character_df` is loaded by the read_excel cell that sits *below* this
# one (execution count 5 vs. 7 here); on a fresh kernel run that cell first.
has_place = character_df['Place'] != 'None'
p_character_df = character_df.loc[has_place]

In [5]:
# Load the character table from disk; later cells expect a 'Place' column.
character_df = pd.read_excel(io='characters.xlsx')

In [12]:
# Distinct 'Place' values among the characters that have one.
character_p_ns = p_character_df.loc[:, 'Place'].unique()

In [22]:
#join all plays into one df
# pd.concat stacks every per-play frame in one pass and renumbers the rows
# 0..n-1; DataFrame.append (removed in pandas 2.0) re-copied the growing
# frame on every iteration.
big_df = pd.concat(all_dfs, ignore_index=True)
big_df.head()

Unnamed: 0,Speaker,Line,Term,Play
0,King,1.1.19,Christ,"Henry IV, Part I"
1,King,1.1.31,Westmoreland,"Henry IV, Part I"
2,Westmoreland,1.1.37,Wales,"Henry IV, Part I"
3,Westmoreland,1.1.38,Mortimer,"Henry IV, Part I"
4,Westmoreland,1.1.39,Herefordshire,"Henry IV, Part I"


In [53]:
# Terms to exclude from big_df before the remaining terms are used as
# candidate place names (see the filter cell that tests
# `Term not in more_stop_words`). The entries visible here are mostly personal
# names, deities/mythological figures, demonyms, weekdays and months.
# Notes for maintainers:
#   * Matching is exact and case-sensitive, hence pairs such as
#     'Jack'/'jack', 'Scot'/'scot' and 'Christendom'/'christendom'.
#   * A few entries are repeated near the end ('Harry', 'John', 'Jack', 'Hal',
#     'Scot', 'Walter', 'Douglas'); this is harmless for a membership test.
#   * 'Godâ\x80\x99s' and 'Turkâ\x80\x99s' are mis-decoded forms of a
#     curly apostrophe; presumably they mirror how the terms appear in the
#     parsed data -- verify before "fixing" them.
more_stop_words = ['God', 'John', 'Henry', 'Richard', 'Edward', 'Harry', 'Clarence', 'Talbot', 'Bardolph', 'Margaret', 'Thomas',
                  'Jack', 'Percy', 'Falstaff', 'Kate', 'Mortimer', 'Hal', 'Charles', 'Plantagenet', 'Humphrey', 'George', 'Hubert',
                  'Pistol', 'Gaunt', 'Westmoreland', 'Arthur', 'Douglas', 'Francis', 'Cade', 'William', 'Davy', 'Catesby', 'Lewis',
                  'Stanley', 'Mowbray', 'Aumerle', 'Glendower', 'Montague', 'Ned', 'Robert', 'Doll', 'Poins', 'Cromwell', 'Grey',
                  'Jesu', 'Katherine', 'Joan', 'Scot', 'Dorset', 'Edmund', 'Rivers', 'Anne', 'Monmouth', 'Philip', 'Rutland',
                  'Bona', 'Christendom', 'Pole', 'Pucelle', 'Reignier', 'Walter', 'Faulconbridge', 'Gower', 'Lovell', 'Ratcliffe',
                  'Blunt', 'Nell', 'Nym', 'Roan', 'Scroop', 'Beaufort', 'Caesar', 'Eleanor', 'Griffith', 'Hotspur', 'Peter', 
                  'Blanche', 'Colevile', 'Louis', 'Owen', 'Alexander', 'Bushy', 'Christ', 'Constance', 'Peto', 'Tyrrel', 'Bullcalf',
                  'Cranmer', 'Englishman', 'Mars', 'Michael', 'Tearsheet', 'Vaughan', 'Bagot', 'Elizabeth', 'Hume', 'James', 
                  'Macmorris', 'Nicholas', 'Snare', 'Wart', 'Wednesday', 'Wolsey', 'jack', 'Chrish', 'Coeur', 'Day', 'Iden', 'Jove',
                  'Julius', 'Montjoy', 'Paul', 'Roger', 'Shore', 'Vernon', 'Welshman', 'de', 'Ascension', 'Barbary', 'Bourbon',
                  'Cain', 'Crispian', 'Dick', 'Gadshill', 'Gardiner', 'Jesus', 'Lionel', 'Lucy', 'May', 'Melun', 'Mercury', 'Mordake',
                  'Morton', 'Phoebus', 'Robin', 'Sands', 'Saye', 'Alban', 'Brandon', 'Bullen', 'Campeius', 'Cheshu', 'Dighton', 
                  'Elysium', 'Fang', 'Fastolf', 'Fitzwater', 'Forrest', 'Frenchman', 'Geoffrey', 'Hector', 'Helen', 'Hercules',
                  'Hugh', 'Icarus', 'Judas', 'Langley', 'Lion', 'Montgomery', 'Nightwork', 'Scales', 'Thursday', 'Tom', 'Travers',
                  'Whitmore', 'Abergavenny', 'Aeneas', 'Agamemnon', 'Alice', 'Amazon', 'Amurath', 'Anthony', 'Arc', 'Bar', 'Bigot',
                  'Butler', 'Capet', 'Cleitus', 'Colbrand', 'Crispin', 'Delabreth', 'Denis', 'Dennis', 'Dorothy', 'Falconbridge',
                  'Friday', 'Gargrave', 'Gawsey', 'Godâ\x80\x99s', 'Goodwin', 'Gregory', 'Gualtier', 'Guilford', 'Henton', 'Herbert',
                  'Hiren', 'Hopkins', 'Horner', 'Hungerford', 'Innocent', 'Jacques', 'Jamy', 'Jew', 'Jewry', 'Jourdain', 'June', 
                  'Kendal', 'Lent', 'Lethe', 'Lorraine', 'Machiavel', 'Mary', 'Nero', 'Pandulph', 'Peck', 'Pegasus', 'Pepin', 'Philippa',
                  'Picardy', 'Pilate', 'Place', 'Pompey', 'Rambures', 'Roussi', 'Sander', 'Simon', 'Spaniard', 'Stephen', 'Thump', 'Titan',
                  'Tuesday', 'Ulysses', 'Vaudemont', 'Vaux', 'Venus', 'Walloon', 'Will', 'Abel', 'Abraham', 'Absyrtis', 'Achitophel', 
                  'Aeolus', 'Aesop', 'Ajax',  'Alcides', 'Althaea', 'Althea', 'Amamon', 'Anchises', 'Andren', 'Angus', 'Antony', 'April',
                  'Archibald', 'Ascanius', 'Asher', 'Asmath', 'Ate', 'Atlas', 'Atropos', 'Aubrey', 'August', 'Banister', 'Barbason', 
                  'Bargulus', 'Barnes', 'Barson', 'Bartholomew', 'Bartholomew-tide', 'Basimecu', 'Basingstoke', 'Bates', 'Beelzebub', 'Bennet',
                  'Bess', 'Bevis', 'Blumer', 'Bohun', 'Bracy', 'Brocas', 'Brutus', 'Butts', 'Cadwallader', 'Calipolis', 'Capuchius', 'Car',
                  'Cassado', 'Cerberus', 'Charlemagne', 'Childeric', 'Christopher', 'Circe', 'Clement', 'Clothair', 'Clotharius', 'Coint', 
                  'Constantine', 'Cophetua', 'Courtney', 'Crispianus', 'Cromer', 'Daedalus', 'Dagonet', 'Darius', 'Deborah', 'December', 'Denny',
                  'Dian', 'Dickie', 'Dickon', 'Dido', 'Diomed', 'Dives', 'Dommelton', 'Dumb', 'Eden', 'Ellen', 'Emmanuel', 'End', 'Englishwoman',
                  'Erebus', 'Ermengare', 'Eve', 'Ferdinand', 'Ferrara', 'Ferrers', 'Froissart', 'Furnival', 'Galen', 'Gilliams', 'Gisors', 'Glansdale',
                  'Goodman', 'Gough', 'Gurney', 'Guy', 'Guynes', 'Hannibal', 'Hecate', 'Hermes', 'Holy-rood', 'Hood', 'Humfrey',
                  'Hydra', 'Hyperion', 'Iris', 'Irishman', 'Isabel', 'Jane', 'Japheth', 'Jephthah', 'Jeshu', 'July', 'Kentishman', 
                  'Laurence', 'Leviathan', 'Limoges', 'Lingare', 'Madeira', 'Mahomet', 'Margery', 'Mark', 'Marle', 'Martlemas',
                  'Matthew', 'Medea', 'Meg', 'Menelaus', 'Merlin', 'Michaelmas', 'Minos', 'Monday', 'More', 'Mountacute', 'Nemesis',
                  'Neptune', 'Nestor', 'Nevil', 'Neville', 'Norbery', 'Oldcastle', 'Orpheus', 'Pace', 'Partlet', 'Paunch', 'Peesell',
                  'Pendragon', 'Penker', 'Perkes', 'Perseus', 'Pickbone', 'Pierce', 'Ponton', 'Poultney', 'Proteus', 'Rafe', 'Rainold',
                  'Ralph', 'Ramston', 'Rivo', 'Rochester', 'Rochford', 'Roscius', 'Saba', 'Samingo', 'Sampson', 'Samson', 'Sandal', 
                  'Santrailles', 'Satan', 'Saturn', 'Savoy', 'Scarlet', 'Seely', 'Seymour', 'Shaa', 'Shallow', 'Shirley', 'Shrovetide',
                  'Simpcox', 'Sinon', 'Smith', 'Sneak', 'Southwell', 'Squele', 'Stockfish', 'Stokeley', 'Strange', 'Sure-card', 'Sylla',
                  'Tamyris', 'Tartar', 'Telamonius', 'Tressel', 'Tully', 'Turkâ\x80\x99s', 'Umfrevile', 'Ursula', 'Verdon', 'Vere', 
                  'Waterton', 'Wheeson', 'Whitsun', 'Yedward', 'christendom', 'ostler', 'scot', 'utis', 'Ketly', 'Gam', 'Lazarus',
                  'Harry', 'John', 'Jack', 'Hal', 'Scot', 'Walter', 'Douglas', 'Marian', 'Turk', 'Priam', 'Job', 'Lucifer', 'March',
                  'Europe']

In [54]:
#filter out stop words from big_df
# Vectorised membership test instead of iterating row by row; rows whose Term
# is not in more_stop_words (including rows with a missing Term) are kept.
keep_mask = ~big_df['Term'].isin(more_stop_words)
# Index labels of the surviving rows (kept for any later cell that uses them).
place_df1 = big_df.index[keep_mask].tolist()
# Select by mask so the result does not rely on labels equalling positions.
place_df2 = big_df.loc[keep_mask]
len(place_df2)

2676

In [9]:
#find (play, line) pairs that still hold more than one term after filtering;
#these are the candidates for two-word terms that need combining
lines = []
for play in place_df2['Play'].unique():
    temp_df = place_df2[place_df2['Play'] == play]
    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    for line, count in temp_df.Line.value_counts().items():
        if count > 1:
            lines.append((play, line))

In [10]:
# Display every (play, line) pair in `lines`, i.e. the lines where more than
# one term remains in place_df2.
lines

[('Henry IV, Part I', '3.1.47'),
 ('Henry IV, Part I', '3.1.77'),
 ('Henry IV, Part I', '4.2.3'),
 ('Henry IV, Part I', '5.4.46'),
 ('Henry IV, Part I', '3.2.98'),
 ('Henry IV, Part I', '4.4.29'),
 ('Henry IV, Part I', '1.1.73'),
 ('Henry IV, Part I', '1.3.290'),
 ('Henry IV, Part I', '3.1.79'),
 ('Henry IV, Part II', '4.3.196'),
 ('Henry IV, Part II', '3.1.1'),
 ('Henry IV, Part II', '2.2.168'),
 ('Henry V', '3.5.46'),
 ('Henry V', '3.5.43'),
 ('Henry V', '4.3.56'),
 ('Henry V', '2.2.202'),
 ('Henry V', '1.2.58'),
 ('Henry V', '2.4.5'),
 ('Henry V', '5.2.87'),
 ('Henry V', '4.8.104'),
 ('Henry V', '1.2.57'),
 ('Henry V', '2.4.4'),
 ('Henry V', '4.8.107'),
 ('Henry V', '4.6.11'),
 ('Henry V', '1.2.45'),
 ('Henry V', '5.2.367'),
 ('Henry V', '2.4.141'),
 ('Henry V', '5.2.24'),
 ('Henry V', '4.8.130'),
 ('Henry V', '4.7.178'),
 ('Henry V', '5.EPI.12'),
 ('Henry V', '4.3.55'),
 ('Henry V', '5.EPI.10'),
 ('Henry V', '5.2.362'),
 ('Henry V', '1.2.49'),
 ('Henry V', '2.CHO.30'),
 ('Henry V',

In [55]:
#write place_df2 (big_df minus stop words) to an excel file so I don't have to
#write a 'combine terms' function
# The context manager saves and closes the workbook on exit; ExcelWriter.save()
# no longer exists in pandas 2.0+.
with pd.ExcelWriter('singleton_terms.xlsx') as writer:
    place_df2.to_excel(writer, sheet_name='Sheet1')
def show_terms(play, line, df=None):
    """Return the rows for one line of one play.

    Parameters
    ----------
    play : value compared against the 'Play' column.
    line : value compared against the 'Line' column.
    df : DataFrame with 'Play' and 'Line' columns, optional.
        Defaults to the notebook-level ``place_df2``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` where both columns match, original index preserved.
    """
    if df is None:
        df = place_df2
    # One combined mask: chaining two boolean selections applies the second
    # mask to an already-filtered frame and makes pandas reindex it.
    return df[(df['Line'] == line) & (df['Play'] == play)]

In [182]:
# Inspect the terms found on a single line of one play.
show_terms(play='Richard III', line='1.3.21')

  


Unnamed: 0,Speaker,Line,Term,Play
5715,Queen Elizabeth,1.3.21,Richmond,Richard III
5716,Queen Elizabeth,1.3.21,Derby,Richard III


In [None]:
#stop words cycle begins
# NLTK's English stop-word list. This cell has no execution count and
# `stop_words` is not referenced by any other cell visible in this notebook.
# NOTE(review): requires the NLTK 'stopwords' corpus to be available locally.
stop_words = stopwords.words('english')

In [26]:
# Frequency table of every term in big_df as (term, count) pairs, most
# frequent first; ties keep the alphabetical order produced by np.unique.
just_words = big_df['Term'].values
ns, cs = np.unique(just_words, return_counts=True)
combo = [(term, count) for term, count in zip(ns, cs)]
sorted_freq_list = list(combo)
sorted_freq_list.sort(key=lambda pair: pair[1], reverse=True)
sorted_freq_list

[('God', 438),
 ('France', 311),
 ('York', 216),
 ('England', 205),
 ('John', 194),
 ('Henry', 166),
 ('Warwick', 165),
 ('Richard', 155),
 ('Edward', 135),
 ('Harry', 108),
 ('Clarence', 89),
 ('Gloucester', 87),
 ('Buckingham', 74),
 ('Talbot', 74),
 ('Suffolk', 72),
 ('Somerset', 65),
 ('Lancaster', 64),
 ('London', 64),
 ('Bolingbroke', 63),
 ('Clifford', 59),
 ('Bardolph', 54),
 ('Margaret', 54),
 ('Thomas', 51),
 ('Jack', 49),
 ('Northumberland', 49),
 ('Percy', 49),
 ('Hastings', 47),
 ('Salisbury', 47),
 ('Norfolk', 45),
 ('Falstaff', 43),
 ('Kate', 43),
 ('Mortimer', 43),
 ('Hal', 42),
 ('Charles', 41),
 ('Plantagenet', 41),
 ('Humphrey', 37),
 ('George', 35),
 ('Richmond', 35),
 ('Wales', 33),
 ('Hubert', 32),
 ('Pistol', 31),
 ('Gaunt', 30),
 ('Westmoreland', 30),
 ('Burgundy', 29),
 ('Ireland', 29),
 ('Arthur', 28),
 ('Douglas', 28),
 ('Hereford', 28),
 ('Oxford', 28),
 ('Francis', 27),
 ('Cade', 26),
 ('Exeter', 26),
 ('William', 25),
 ('Winchester', 25),
 ('Davy', 24),
 (

In [20]:
# Spot-check one ambiguous term: every row of big_df whose Term is 'March'.
big_df.loc[big_df['Term'].eq('March')]

Unnamed: 0,Speaker,Line,Term,Play
91,King,1.3.86,March,"Henry IV, Part I"
491,Hotspur,4.1.117,March,"Henry IV, Part I"
529,Hotspur,4.3.100,March,"Henry IV, Part I"
689,King,5.5.42,March,"Henry IV, Part I"
3023,York,2.2.39,March,"Henry VI, Part 2"
3026,York,2.2.40,March,"Henry VI, Part 2"
3042,York,2.2.52,March,"Henry VI, Part 2"
3335,Cade,4.2.133,March,"Henry VI, Part 2"
3595,King Henry,1.1.109,March,"Henry VI, Part 3"
3764,Warwick,2.1.181,March,"Henry VI, Part 3"


In [217]:
# Rows of place_df_nocoords whose Term is 'Bosworth'.
# NOTE: `place_df_nocoords` is loaded by the read_excel cell that sits *below*
# this one; on a fresh kernel run that cell first.
place_df_nocoords.loc[place_df_nocoords['Term'].eq('Bosworth')]

Unnamed: 0,Speaker,Line,Term,Play
6274,Richard,5.3.1,Bosworth,Richard III


In [56]:
# Read the term table back from the workbook named 'singleton_terms.xlsx'.
place_df_nocoords = pd.read_excel(io='singleton_terms.xlsx')

In [57]:
# Distinct terms in the table, in order of first appearance, and how many there are.
place_names = place_df_nocoords.loc[:, 'Term'].unique()
len(place_names)

279

In [59]:
# Export the unique terms as a one-column sheet in 'GeoJSON_info.xlsx'.
names_onlydf = pd.DataFrame(place_names)
# The context manager saves and closes the workbook on exit; ExcelWriter.save()
# no longer exists in pandas 2.0+.
with pd.ExcelWriter('GeoJSON_info.xlsx') as writer:
    names_onlydf.to_excel(writer, sheet_name='Sheet1')