In [2]:
import re
import pandas as pd
import numpy as np
from IPython.display import clear_output
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import requests
from bs4 import BeautifulSoup
from lxml import etree

In [34]:
#function to get all proper nouns from a file
#takes xml doc and returns dataframe
def get_nps(file):
    acc_d = {'Speaker':[], 'Line':[], 'Term':[]}
    with open(file, encoding="UTF-8") as f:
        tree = etree.parse(f)
        #one-liner to create parent_map to traverse the tree backwards
        parent_map = dict((c, p) for p in tree.getiterator() for c in p)
        #get title
        title = tree.xpath('//title')[0].text
        #start with list of all proper nouns, then add line # and speaker info to each
        nps = tree.xpath('//w[@ana="#n1-nn"]')
        for np in nps:
            line = parent_map[np]
            line_num = line.get('n')
            parent = parent_map[line]
            if parent.tag == 'p':
                speech = parent_map[parent]
            else:
                speech = parent
            if speech.find('speaker') == None:
                speaker = speech.get('who')
            else:
                speaker = speech.find('speaker')[0].text
            acc_d['Speaker'].append(speaker) 
            acc_d['Line'].append(line_num)
            acc_d['Term'].append(np.text)
    df = pd.DataFrame.from_dict(acc_d)
    df['Play'] = title
    return df

In [35]:
HenryIV_1df = get_nps('Initial_Texts/HenryIV(1).xml')
HenryIV_2df = get_nps('Initial_Texts/HenryIV(2).xml')
HenryVdf = get_nps('Initial_Texts/HenryV.xml')
HenryVI_1df = get_nps('Initial_Texts/HenryVI(1).xml')
HenryVI_2df = get_nps('Initial_Texts/HenryVI(2).xml')
HenryVI_3df = get_nps('Initial_Texts/HenryVI(3).xml')
John_df = get_nps('Initial_Texts/John.xml')
RichardII_df = get_nps('Initial_Texts/RichardII.xml')
RichardIII_df = get_nps('Initial_Texts/RichardIII.xml')
all_dfs = [HenryIV_1df, HenryIV_2df, HenryVdf, HenryVI_1df, HenryVI_2df, HenryVI_3df, John_df, RichardII_df, RichardIII_df]

In [36]:
HenryIV_1df = get_nps('Initial_Texts/HenryIV(1).xml')
HenryIV_1df.head()

Unnamed: 0,Speaker,Line,Term,Play
0,KING,1.1.19,Christ,"Henry IV, Part I"
1,KING,1.1.31,Westmoreland,"Henry IV, Part I"
2,WESTMORELAND,1.1.37,Wales,"Henry IV, Part I"
3,WESTMORELAND,1.1.38,Mortimer,"Henry IV, Part I"
4,WESTMORELAND,1.1.39,Herefordshire,"Henry IV, Part I"


## Remove Non-Place Names

In [15]:
#join all plays into one df
joint_df = HenryIV_1df.copy()
for df in all_dfs[1:]:
    joint_df = joint_df.append(df, ignore_index=True)
joint_df.groupby('Play')
joint_df['Speaker'].apply(lambda x: x.replace('Ã\x89', 'E'))
joint_df.head()

AttributeError: 'NoneType' object has no attribute 'replace'

In [16]:
for i, x in enumerate(joint_df['Speaker']):
    if x == None:
        print(joint_df.iloc[[i]])

  Speaker    Line    Term              Play
0    None  1.1.19  Christ  Henry IV, Part I
  Speaker    Line          Term              Play
1    None  1.1.31  Westmoreland  Henry IV, Part I
  Speaker    Line   Term              Play
2    None  1.1.37  Wales  Henry IV, Part I
  Speaker    Line      Term              Play
3    None  1.1.38  Mortimer  Henry IV, Part I
  Speaker    Line           Term              Play
4    None  1.1.39  Herefordshire  Henry IV, Part I
  Speaker    Line       Term              Play
5    None  1.1.40  Glendower  Henry IV, Part I
  Speaker    Line      Term              Play
6    None  1.1.41  Welshman  Henry IV, Part I
  Speaker    Line       Term              Play
7    None  1.1.52  Holy-rood  Henry IV, Part I
  Speaker    Line     Term              Play
8    None  1.1.52  Hotspur  Henry IV, Part I
  Speaker    Line   Term              Play
9    None  1.1.53  Harry  Henry IV, Part I
   Speaker    Line   Term              Play
10    None  1.1.53  Percy  Henry

    Speaker     Line         Term              Play
113    None  1.3.180  Bolingbroke  Henry IV, Part I
    Speaker     Line Term              Play
114    None  1.3.221  God  Henry IV, Part I
    Speaker     Line  Term              Play
115    None  1.3.221  Scot  Henry IV, Part I
    Speaker     Line  Term              Play
116    None  1.3.222  Scot  Henry IV, Part I
    Speaker     Line      Term              Play
117    None  1.3.228  Mortimer  Henry IV, Part I
    Speaker     Line      Term              Play
118    None  1.3.229  Mortimer  Henry IV, Part I
    Speaker     Line      Term              Play
119    None  1.3.231  Mortimer  Henry IV, Part I
    Speaker     Line      Term              Play
120    None  1.3.233  Mortimer  Henry IV, Part I
    Speaker     Line         Term              Play
121    None  1.3.237  Bolingbroke  Henry IV, Part I
    Speaker     Line   Term              Play
122    None  1.3.238  Wales  Henry IV, Part I
    Speaker     Line         Term       

215    None  None  Ned  Henry IV, Part I
    Speaker  Line Term              Play
216    None  None  Ned  Henry IV, Part I
    Speaker  Line      Term              Play
217    None  None  Falstaff  Henry IV, Part I
    Speaker  Line     Term              Play
218    None  None  Francis  Henry IV, Part I
    Speaker  Line     Term              Play
219    None  None  Francis  Henry IV, Part I
    Speaker  Line     Term              Play
220    None  None  Francis  Henry IV, Part I
    Speaker  Line   Term              Play
221    None  None  Ralph  Henry IV, Part I
    Speaker  Line     Term              Play
222    None  None  Francis  Henry IV, Part I
    Speaker  Line     Term              Play
223    None  None  Francis  Henry IV, Part I
    Speaker  Line     Term              Play
224    None  None  Francis  Henry IV, Part I
    Speaker  Line     Term              Play
225    None  None  Francis  Henry IV, Part I
    Speaker  Line     Term              Play
226    None  None  Engla

    Speaker  Line      Term              Play
329    None  None  Falstaff  Henry IV, Part I
    Speaker  Line  Term              Play
330    None  None  Jack  Henry IV, Part I
    Speaker  Line      Term              Play
331    None  None  Falstaff  Henry IV, Part I
    Speaker  Line  Term              Play
332    None  None  Jack  Henry IV, Part I
    Speaker  Line      Term              Play
333    None  None  Falstaff  Henry IV, Part I
    Speaker  Line  Term              Play
334    None  None  Jack  Henry IV, Part I
    Speaker  Line      Term              Play
335    None  None  Falstaff  Henry IV, Part I
    Speaker  Line  Term              Play
336    None  None  Jack  Henry IV, Part I
    Speaker  Line      Term              Play
337    None  None  Falstaff  Henry IV, Part I
    Speaker  Line  Term              Play
338    None  None  Jack  Henry IV, Part I
    Speaker  Line      Term              Play
339    None  None  Falstaff  Henry IV, Part I
    Speaker  Line  Term     

420    None  3.2.175  Westmoreland  Henry IV, Part I
    Speaker     Line  Term              Play
421    None  3.2.176  John  Henry IV, Part I
    Speaker     Line       Term              Play
422    None  3.2.176  Lancaster  Henry IV, Part I
    Speaker     Line       Term              Play
423    None  3.2.178  Wednesday  Henry IV, Part I
    Speaker     Line   Term              Play
424    None  3.2.178  Harry  Henry IV, Part I
    Speaker     Line      Term              Play
425    None  3.2.179  Thursday  Henry IV, Part I
    Speaker     Line         Term              Play
426    None  3.2.180  Bridgenorth  Henry IV, Part I
    Speaker     Line   Term              Play
427    None  3.2.180  Harry  Henry IV, Part I
    Speaker     Line             Term              Play
428    None  3.2.181  Gloucestershire  Henry IV, Part I
    Speaker     Line         Term              Play
429    None  3.2.183  Bridgenorth  Henry IV, Part I
    Speaker  Line      Term              Play
430    No

522    None  4.3.38  Blunt  Henry IV, Part I
    Speaker    Line Term              Play
523    None  4.3.38  God  Henry IV, Part I
    Speaker    Line Term              Play
524    None  4.3.44  God  Henry IV, Part I
    Speaker    Line Term              Play
525    None  4.3.66  God  Henry IV, Part I
    Speaker    Line       Term              Play
526    None  4.3.67  Lancaster  Henry IV, Part I
    Speaker    Line            Term              Play
527    None  4.3.73  Northumberland  Henry IV, Part I
    Speaker    Line         Term              Play
528    None  4.3.83  Ravenspurgh  Henry IV, Part I
    Speaker     Line   Term              Play
529    None  4.3.100  March  Henry IV, Part I
    Speaker     Line   Term              Play
530    None  4.3.102  Wales  Henry IV, Part I
    Speaker     Line    Term              Play
531    None  4.3.114  Walter  Henry IV, Part I
    Speaker     Line Term              Play
532    None  4.3.121  God  Henry IV, Part I
    Speaker   Line     

    Speaker    Line   Term              Play
629    None  5.4.42  Wales  Henry IV, Part I
    Speaker    Line      Term              Play
630    None  5.4.45  Nicholas  Henry IV, Part I
    Speaker    Line    Term              Play
631    None  5.4.45  Gawsey  Henry IV, Part I
    Speaker    Line     Term              Play
632    None  5.4.46  Clifton  Henry IV, Part I
    Speaker    Line     Term              Play
633    None  5.4.46  Clifton  Henry IV, Part I
    Speaker    Line Term              Play
634    None  5.4.51  God  Henry IV, Part I
    Speaker    Line     Term              Play
635    None  5.4.54  Douglas  Henry IV, Part I
    Speaker    Line     Term              Play
636    None  5.4.58  Clifton  Henry IV, Part I
    Speaker    Line      Term              Play
637    None  5.4.58  Nicholas  Henry IV, Part I
    Speaker    Line    Term              Play
638    None  5.4.58  Gawsey  Henry IV, Part I
    Speaker    Line   Term              Play
639    None  5.4.59  Harry 

    Speaker     Line     Term               Play
734    None  1.1.140  Douglas  Henry IV, Part II
    Speaker     Line       Term               Play
735    None  1.1.147  Lancaster  Henry IV, Part II
    Speaker     Line          Term               Play
736    None  1.1.148  Westmoreland  Henry IV, Part II
    Speaker     Line            Term               Play
737    None  1.1.168  Northumberland  Henry IV, Part II
    Speaker     Line  Term               Play
738    None  1.1.173  Cain  Henry IV, Part II
    Speaker     Line  Term               Play
739    None  1.1.207  York  Henry IV, Part II
    Speaker     Line     Term               Play
740    None  1.1.224  Richard  Henry IV, Part II
    Speaker     Line     Term               Play
741    None  1.1.224  Pomfret  Henry IV, Part II
    Speaker     Line         Term               Play
742    None  1.1.228  Bolingbroke  Henry IV, Part II
    Speaker  Line Term               Play
743    None  None  God  Henry IV, Part II
    Speake

830    None  None  John  Henry IV, Part II
    Speaker  Line Term               Play
831    None  None  God  Henry IV, Part II
    Speaker  Line  Term               Play
832    None  None  Doll  Henry IV, Part II
    Speaker  Line       Term               Play
833    None  None  Tearsheet  Henry IV, Part II
    Speaker  Line         Term               Play
834    None  None  Basingstoke  Henry IV, Part II
    Speaker     Line       Term               Play
835    None  2.1.182  Lancaster  Henry IV, Part II
    Speaker     Line            Term               Play
836    None  2.1.183  Northumberland  Henry IV, Part II
    Speaker     Line   Term               Play
837    None  2.1.184  Wales  Henry IV, Part II
    Speaker     Line   Term               Play
838    None  2.1.186  Gower  Henry IV, Part II
    Speaker  Line   Term               Play
839    None  None  Gower  Henry IV, Part II
    Speaker  Line  Term               Play
840    None  None  John  Henry IV, Part II
    Speaker  Li

954    None  None  London  Henry IV, Part II
    Speaker  Line  Term               Play
955    None  None  Jesu  Henry IV, Part II
    Speaker  Line   Term               Play
956    None  None  Wales  Henry IV, Part II
    Speaker  Line  Term               Play
957    None  None  Hill  Henry IV, Part II
    Speaker  Line Term               Play
958    None  None  Hal  Henry IV, Part II
    Speaker  Line Term               Play
959    None  None  Hal  Henry IV, Part II
    Speaker  Line Term               Play
960    None  None  Ned  Henry IV, Part II
    Speaker  Line Term               Play
961    None  None  Ned  Henry IV, Part II
    Speaker  Line Term               Play
962    None  None  Hal  Henry IV, Part II
    Speaker  Line Term               Play
963    None  None  Ned  Henry IV, Part II
    Speaker  Line      Term               Play
964    None  None  Bardolph  Henry IV, Part II
    Speaker  Line      Term               Play
965    None  None  Bardolph  Henry IV, Part II
   

     Speaker  Line  Term               Play
1082    None  None  John  Henry IV, Part II
     Speaker  Line Term               Play
1083    None  None  God  Henry IV, Part II
     Speaker  Line Term               Play
1084    None  None  God  Henry IV, Part II
     Speaker  Line Term               Play
1085    None  None  God  Henry IV, Part II
     Speaker  Line     Term               Play
1086    None  None  Shallow  Henry IV, Part II
     Speaker  Line Term               Play
1087    None  None  God  Henry IV, Part II
     Speaker  Line      Term               Play
1088    None  None  Bardolph  Henry IV, Part II
     Speaker  Line      Term               Play
1089    None  None  Turnbull  Henry IV, Part II
     Speaker  Line      Term               Play
1090    None  None  Turkâs  Henry IV, Part II
     Speaker  Line  Term               Play
1091    None  None  John  Henry IV, Part II
     Speaker  Line   Term               Play
1092    None  None  Gaunt  Henry IV, Part II
     Spe

     Speaker     Line   Term               Play
1203    None  4.3.338  Harry  Henry IV, Part II
     Speaker     Line Term               Play
1204    None  4.3.341  God  Henry IV, Part II
     Speaker     Line   Term               Play
1205    None  4.3.371  Harry  Henry IV, Part II
     Speaker     Line Term               Play
1206    None  4.3.378  God  Henry IV, Part II
     Speaker     Line  Term               Play
1207    None  4.3.385  John  Henry IV, Part II
     Speaker     Line       Term               Play
1208    None  4.3.385  Lancaster  Henry IV, Part II
     Speaker     Line  Term               Play
1209    None  4.3.387  John  Henry IV, Part II
     Speaker     Line     Term               Play
1210    None  4.3.391  Warwick  Henry IV, Part II
     Speaker     Line     Term               Play
1211    None  4.3.392  Warwick  Henry IV, Part II
     Speaker     Line       Term               Play
1212    None  4.3.395  Jerusalem  Henry IV, Part II
     Speaker     Line Term  

     Speaker     Line    Term               Play
1303    None  5.3.106  Pistol  Henry IV, Part II
     Speaker     Line   Term               Play
1304    None  5.3.115  Harry  Henry IV, Part II
     Speaker     Line   Term               Play
1305    None  5.3.116  Harry  Henry IV, Part II
     Speaker     Line   Term               Play
1306    None  5.3.117  Harry  Henry IV, Part II
     Speaker     Line  Term               Play
1307    None  5.3.119  John  Henry IV, Part II
     Speaker     Line   Term               Play
1308    None  5.3.120  Harry  Henry IV, Part II
     Speaker     Line    Term               Play
1309    None  5.3.121  Pistol  Henry IV, Part II
     Speaker     Line      Term               Play
1310    None  5.3.122  Spaniard  Henry IV, Part II
     Speaker  Line      Term               Play
1311    None  None  Bardolph  Henry IV, Part II
     Speaker  Line    Term               Play
1312    None  None  Robert  Henry IV, Part II
     Speaker  Line    Term          

1482    None  None  Bardolph  Henry V
     Speaker  Line    Term     Play
1483    None  None  France  Henry V
     Speaker  Line Term     Play
1484    None  None  Nym  Henry V
     Speaker  Line Term     Play
1485    None  None  Nym  Henry V
     Speaker  Line Term     Play
1486    None  None  Nym  Henry V
     Speaker  Line  Term     Play
1487    None  None  John  Henry V
     Speaker  Line Term     Play
1488    None  None  Nym  Henry V
     Speaker   Line Term     Play
1489    None  2.2.1  God  Henry V
     Speaker    Line       Term     Play
1490    None  2.2.14  Cambridge  Henry V
     Speaker    Line    Term     Play
1491    None  2.2.14  Masham  Henry V
     Speaker    Line    Term     Play
1492    None  2.2.18  France  Henry V
     Speaker    Line    Term     Play
1493    None  2.2.41  Exeter  Henry V
     Speaker    Line       Term     Play
1494    None  2.2.60  Cambridge  Henry V
     Speaker    Line    Term     Play
1495    None  2.2.60  Scroop  Henry V
     Speaker    Line  

1619    None  3.3.27  Harfleur  Henry V
     Speaker    Line   Term     Play
1620    None  3.3.40  Jewry  Henry V
     Speaker    Line    Term     Play
1621    None  3.3.52  Exeter  Henry V
     Speaker    Line      Term     Play
1622    None  3.3.53  Harfleur  Henry V
     Speaker    Line    Term     Play
1623    None  3.3.57  Calais  Henry V
     Speaker    Line      Term     Play
1624    None  3.3.58  Harfleur  Henry V
     Speaker  Line   Term     Play
1625    None  None  Alice  Henry V
     Speaker  Line   Term     Play
1626    None  None  Alice  Henry V
     Speaker  Line    Term     Play
1627    None  None  France  Henry V
     Speaker   Line   Term     Play
1628    None  3.5.1  Somme  Henry V
     Speaker   Line    Term     Play
1629    None  3.5.3  France  Henry V
     Speaker    Line    Term     Play
1630    None  3.5.14  Albion  Henry V
     Speaker    Line    Term     Play
1631    None  3.5.32  France  Henry V
     Speaker    Line     Term     Play
1632    None  3.5.37  Mon

1779    None  None  Bardolph  Henry V
     Speaker  Line Term     Play
1780    None  None  Nym  Henry V
     Speaker    Line     Term     Play
1781    None  4.5.14  Bourbon  Henry V
     Speaker   Line  Term     Play
1782    None  4.6.3  York  Henry V
     Speaker    Line     Term     Play
1783    None  4.6.10  Suffolk  Henry V
     Speaker    Line     Term     Play
1784    None  4.6.11  Suffolk  Henry V
     Speaker    Line  Term     Play
1785    None  4.6.11  York  Henry V
     Speaker    Line     Term     Play
1786    None  4.6.15  Suffolk  Henry V
     Speaker  Line      Term     Play
1787    None  None  Monmouth  Henry V
     Speaker  Line   Term     Play
1788    None  None  Gower  Henry V
     Speaker  Line       Term     Play
1789    None  None  Alexander  Henry V
     Speaker  Line       Term     Play
1790    None  None  Alexander  Henry V
     Speaker  Line       Term     Play
1791    None  None  Alexander  Henry V
     Speaker  Line     Term     Play
1792    None  None  Maced

     Speaker    Line        Term     Play
1935    None  5.2.86  Gloucester  Henry V
     Speaker    Line     Term     Play
1936    None  5.2.87  Warwick  Henry V
     Speaker    Line        Term     Play
1937    None  5.2.87  Huntington  Henry V
     Speaker    Line       Term     Play
1938    None  5.2.97  Katherine  Henry V
     Speaker     Line       Term     Play
1939    None  5.2.101  Katherine  Henry V
     Speaker  Line     Term     Play
1940    None  None  England  Henry V
     Speaker  Line       Term     Play
1941    None  None  Katherine  Henry V
     Speaker  Line  Term     Play
1942    None  None  Kate  Henry V
     Speaker  Line  Term     Play
1943    None  None  Kate  Henry V
     Speaker  Line       Term     Play
1944    None  None  Katherine  Henry V
     Speaker  Line          Term     Play
1945    None  None  Englishwoman  Henry V
     Speaker  Line  Term     Play
1946    None  None  Kate  Henry V
     Speaker  Line  Term     Play
1947    None  None  Kate  Henry V
  

2089    None  1.2.56  Rome  Henry VI, Part 1
     Speaker    Line      Term              Play
2090    None  1.2.62  Reignier  Henry VI, Part 1
     Speaker    Line      Term              Play
2091    None  1.2.66  Reignier  Henry VI, Part 1
     Speaker     Line      Term              Play
2092    None  1.2.101  Touraine  Henry VI, Part 1
     Speaker     Line    Term              Play
2093    None  1.2.106  Amazon  Henry VI, Part 1
     Speaker     Line     Term              Play
2094    None  1.2.107  Deborah  Henry VI, Part 1
     Speaker     Line     Term              Play
2095    None  1.2.112  Pucelle  Henry VI, Part 1
     Speaker     Line      Term              Play
2096    None  1.2.128  Orleance  Henry VI, Part 1
     Speaker     Line    Term              Play
2097    None  1.2.142  Caesar  Henry VI, Part 1
     Speaker     Line     Term              Play
2098    None  1.2.143  Mahomet  Henry VI, Part 1
     Speaker     Line   Term              Play
2099    None  1.2.145  Hel

2273    None  2.5.84  Cambridge  Henry VI, Part 1
     Speaker    Line    Term              Play
2274    None  2.5.85  Edmund  Henry VI, Part 1
     Speaker    Line     Term              Play
2275    None  2.5.85  Langley  Henry VI, Part 1
     Speaker    Line  Term              Play
2276    None  2.5.85  York  Henry VI, Part 1
     Speaker     Line       Term              Play
2277    None  2.5.102  Lancaster  Henry VI, Part 1
     Speaker     Line      Term              Play
2278    None  2.5.122  Mortimer  Henry VI, Part 1
     Speaker     Line      Term              Play
2279    None  2.5.125  Somerset  Henry VI, Part 1
     Speaker   Line      Term              Play
2280    None  3.1.3  Humphrey  Henry VI, Part 1
     Speaker   Line        Term              Play
2281    None  3.1.3  Gloucester  Henry VI, Part 1
     Speaker    Line    Term              Play
2282    None  3.1.24  London  Henry VI, Part 1
     Speaker    Line        Term              Play
2283    None  3.1.28  Glouc

2374    None  3.2.130  Paris  Henry VI, Part 1
     Speaker     Line   Term              Play
2375    None  3.2.131  Henry  Henry VI, Part 1
     Speaker     Line    Term              Play
2376    None  3.2.132  Talbot  Henry VI, Part 1
     Speaker     Line      Term              Play
2377    None  3.2.132  Burgundy  Henry VI, Part 1
     Speaker     Line     Term              Play
2378    None  3.2.134  Bedford  Henry VI, Part 1
     Speaker     Line  Term              Play
2379    None  3.2.135  Roan  Henry VI, Part 1
     Speaker   Line  Term              Play
2380    None  3.3.2  Roan  Henry VI, Part 1
     Speaker   Line    Term              Play
2381    None  3.3.5  Talbot  Henry VI, Part 1
     Speaker    Line  Term              Play
2382    None  3.3.17  Joan  Henry VI, Part 1
     Speaker    Line      Term              Play
2383    None  3.3.19  Burgundy  Henry VI, Part 1
     Speaker    Line    Term              Play
2384    None  3.3.20  Talbot  Henry VI, Part 1
     Speake

     Speaker    Line  Term              Play
2485    None  4.3.44  Lucy  Henry VI, Part 1
     Speaker    Line   Term              Play
2486    None  4.3.46  Maine  Henry VI, Part 1
     Speaker    Line   Term              Play
2487    None  4.3.46  Blois  Henry VI, Part 1
     Speaker    Line       Term              Play
2488    None  4.3.46  Poictiers  Henry VI, Part 1
     Speaker    Line   Term              Play
2489    None  4.3.46  Tours  Henry VI, Part 1
     Speaker    Line      Term              Play
2490    None  4.3.47  Somerset  Henry VI, Part 1
     Speaker    Line   Term              Play
2491    None  4.3.53  Henry  Henry VI, Part 1
     Speaker   Line  Term              Play
2492    None  4.4.2  York  Henry VI, Part 1
     Speaker   Line    Term              Play
2493    None  4.4.2  Talbot  Henry VI, Part 1
     Speaker   Line    Term              Play
2494    None  4.4.5  Talbot  Henry VI, Part 1
     Speaker   Line  Term              Play
2495    None  4.4.8  York  H

2649    None  5.4.74  Alanson  Henry VI, Part 1
     Speaker    Line     Term              Play
2650    None  5.4.75  Alanson  Henry VI, Part 1
     Speaker    Line       Term              Play
2651    None  5.4.75  Machiavel  Henry VI, Part 1
     Speaker    Line     Term              Play
2652    None  5.4.78  Charles  Henry VI, Part 1
     Speaker    Line      Term              Play
2653    None  5.4.79  Reignier  Henry VI, Part 1
     Speaker    Line    Term              Play
2654    None  5.4.79  Naples  Henry VI, Part 1
     Speaker    Line         Term              Play
2655    None  5.4.97  Christendom  Henry VI, Part 1
     Speaker     Line     Term              Play
2656    None  5.4.112  Warwick  Henry VI, Part 1
     Speaker     Line     Term              Play
2657    None  5.4.112  Warwick  Henry VI, Part 1
     Speaker     Line    Term              Play
2658    None  5.4.113  France  Henry VI, Part 1
     Speaker     Line  Term              Play
2659    None  5.4.114  Yor

KeyboardInterrupt: 

In [42]:
#get all unique character names, add those to stop words
#many place names take the place of character names (e.g. "Harry of Hereford" == "Harry")
#for now I am just keeping all of the place names without addressing this issue
stop_words = stopwords.words('english')
c_list = [x for x in joint_df.Speaker.unique() if x != None]
characters = list(map(lambda x:x.capitalize(), c_list))

In [44]:
c_list

['KING',
 'WESTMORELAND',
 'FALSTAFF',
 'PRINCE',
 'POINS',
 'NORTHUMBERLAND',
 'HOTSPUR',
 'BLUNT',
 'WORCESTER',
 'FIRST',
 'SECOND',
 'GADSHILL',
 'CHAMBERLAIN',
 'TRAVELERS',
 'LADY',
 'FRANCIS',
 'VINTNER',
 'HOSTESS',
 'PETO',
 'GLENDOWER',
 'MORTIMER',
 'BARDOLPH',
 'DOUGLAS',
 'VERNON',
 'ARCHBISHOP',
 'SIR',
 'LANCASTER',
 'RUMOR',
 'LORD',
 'TRAVERS',
 'MORTON',
 'PAGE',
 'SERVANT',
 'CHIEF',
 'HASTINGS',
 'FANG',
 'GOWER',
 'WILL',
 'DOLL',
 'PISTOL',
 'WARWICK',
 'SHALLOW',
 'SILENCE',
 'FEEBLE',
 'BULLCALF',
 'MOWBRAY',
 'JOHN',
 'COLEVILE',
 'HUMPHREY',
 'THOMAS',
 'HARCOURT',
 'DAVY',
 'BEADLE',
 '#Epilogue_2H4',
 'CHORUS',
 'BISHOP',
 'AMBASSADOR',
 'NYM',
 'BOY',
 'BEDFORD',
 'EXETER',
 'SCROOP',
 'CAMBRIDGE',
 'DAUPHIN',
 'CONSTABLE',
 'MESSENGER',
 'FLUELLEN',
 'JAMY',
 'MACMORRIS',
 'KATHERINE',
 'BRITTANY',
 'MONTJOY',
 'ORLÃ\x89ANS',
 'RAMBURES',
 'ERPINGHAM',
 'COURT',
 'BATES',
 'GRANDPRÃ\x89',
 'SALISBURY',
 'BOURBON',
 'WILLIAMS',
 'QUEEN',
 'BURGUNDY',
 'ALIC

In [None]:

stop_words = stop_words.append(characters)

In [None]:
extra_stop_words = ['God', 'John', 'Henry', 'Richard', 'Edward', 'Harry', 'Clarence', 'Talbot', 'Bardolph', 'Margaret', 'Percy',
                   'Jack', 'Mortimer', 'Falstaff', 'Hal', 'Kate', 'Plantagenet', 'Humphrey', 'Charles', 'George', 'Thomas', 
                   'Hubert', 'Pistol', 'Gaunt', 'Douglas', 'Francis', 'Arthur', 'Cade', 'William', 'Davy', 'Lewis', 'Stanley',
                   'Catesby', 'Mowbray', 'Glendower', 'Ned', 'Robert', 'Montague']

In [9]:
terms = joint_df['Term']

In [16]:
no_chars = [x for x in terms if x not in stop_words]
freq_list = []
acc = []
for w in no_chars:
    if w not in acc:
        freq = no_chars.count(w)
        freq_list.append((freq, w))
        acc.append(w)
sorted_freq_list = sorted(freq_list, key=lambda x: x[0], reverse = True)

In [17]:
sorted_freq_list

[(412, 'God'),
 (303, 'France'),
 (209, 'York'),
 (193, 'England'),
 (191, 'John'),
 (165, 'Warwick'),
 (160, 'Henry'),
 (153, 'Richard'),
 (133, 'Edward'),
 (106, 'Harry'),
 (89, 'Clarence'),
 (87, 'Gloucester'),
 (74, 'Talbot'),
 (69, 'Suffolk'),
 (65, 'Somerset'),
 (64, 'Lancaster'),
 (63, 'Bolingbroke'),
 (62, 'London'),
 (61, 'Buckingham'),
 (59, 'Clifford'),
 (54, 'Bardolph'),
 (54, 'Margaret'),
 (49, 'Percy'),
 (49, 'Jack'),
 (48, 'Northumberland'),
 (47, 'Hastings'),
 (46, 'Salisbury'),
 (43, 'Mortimer'),
 (43, 'Falstaff'),
 (42, 'Hal'),
 (42, 'Kate'),
 (41, 'Plantagenet'),
 (40, 'Norfolk'),
 (37, 'Humphrey'),
 (36, 'Charles'),
 (35, 'George'),
 (35, 'Richmond'),
 (33, 'Wales'),
 (32, 'Thomas'),
 (32, 'Hubert'),
 (31, 'Pistol'),
 (30, 'Westmoreland'),
 (30, 'Gaunt'),
 (29, 'Burgundy'),
 (28, 'Douglas'),
 (28, 'Hereford'),
 (27, 'Francis'),
 (27, 'Arthur'),
 (27, 'Oxford'),
 (27, 'Ireland'),
 (26, 'Exeter'),
 (26, 'Cade'),
 (24, 'William'),
 (24, 'Davy'),
 (23, 'Lewis'),
 (23, '

In [45]:
tst = 'GRANDPRÃ\x89'


#function to replace this weirdness: Ã\x89 with E
tst.replace('Ã\x89', 'E')
    

'GRANDPRE'