In [None]:
import requests
from bs4 import BeautifulSoup as bs
import numpy as np
#numpy helps us with null values

import pandas as pd

In [None]:
url = 'https://emed.folger.edu/sites/default/files/folger_encodings/xml/EMED-Prize-3.xml'
response = requests.get(url)
text = response.content.decode('utf-8')
print(bs(text, 'xml').prettify()[:300]) # print first 300 characters of text

<?xml version="1.0" encoding="utf-8"?>
<?xml-stylesheet type="text/xsl" href="emed.xsl"?>
<TEI xml:id="A27177_24" xmlns:="http://www.tei-c.org/ns/1.0">
 <teiHeader>
  <fileDesc>
   <titleStmt>
    <title>
     The Woman’s Prize, or The Tamer Tamed
    </title>
    <author>
     John Fletcher
    </a


In [None]:
# we want a function that, for any tag, it will go further down the XML tree,
# starting with that tag, in order to find the "choice" tag. then, from there,
# it will return the text of the <reg> tag with the "resp" attribute == to 
# "#EMED" *only* . But if not, then the resp attribute that matches "#SHC #EMED"
# But if not that, then it should return the <orig> tag text 
def elem_to_reg_text(elem, default=''):
    choice_elem = elem.find('choice')
    try:
        for choice in choice_elem.find('reg', {'resp':'#EMED'}):
            return(choice)
    except:
        try:
            for choice in choice_elem.find('reg', {'resp':'#SHC #EMED'}):
                return(choice)
        except:
            try:
                for choice in choice_elem.find('orig'):
                    return(choice)
            except:
                return(default)

In [None]:
# Define a function that takes an element and returns the act and scene numbers
# # of its parent <div> elements.
# # Assumes element belongs to only one act and one scene. 
# def elem_parent_data(elem):
#     parents = elem.parents
#     data = {}
#     for parent in parents:
#         if parent.name == 'div':
#             if parent['type'] == 'act':
#                 data['act'] = parent['n']
#             if parent['type'] == 'scene':
#                 data['scene'] = parent['n']
#     return(data)

In [None]:
# def elem_parent_data(elem):
#     parents = elem.parents
#     data = {}
#     for parent in parents:
#         if parent.name == 'div':
#           if parent['type'] == 'act':
#             data['act'] = parent['n']
#           if parent['type'] == 'scene':
#             data['scene'] = parent['n']
#         # elif parent.name == 'pc':
#         #   if parent['unit'] == 'sentence':
#         #     data['sentence'] = parent['xml:id']
#         #     print(data['sentence'])
#         return(data)

# Define a function that takes an element and returns the act and scene numbers
# of its parent <div> elements.
# Assumes element belongs to only one act and one scene. 
def elem_parent_data(elem):
    parents = elem.parents
    data = {}
    for parent in parents:
        if parent.name == 'div':
            if parent['type'] == 'act':
                data['act'] = parent['n']
            if parent['type'] == 'scene':
                data['scene'] = parent['n']
    return(data)

In [None]:
# Populate a dictionary of stage directions and speeches using the elements's id 
# as key and a dictionary of properties as value. 
# Each value dictionary should include the following properties:
# 1) speaker (if <sp>)
# 2) direction_type (if <stage>)
# 3) position
# 4) act
# 5) scene
# 6) text
data_dict = {} 
soup = bs(text, 'xml')
elems = soup.find_all(['stage','sp'])
for elem in elems:
    id = elem['xml:id']
    position = elems.index(elem)
    act = elem_parent_data(elem)['act']
    scene = elem_parent_data(elem)['scene']
   # sentence = elem_parent_data(elem)['sentence']
    if elem.name == 'sp':
        speaker = elem['who'].lstrip('#').replace('_Prize', '')
    else:
        speaker = np.nan
    if elem.name == 'stage':
        direction_type = elem['type']
    else:
        direction_type = np.nan
    elem_words = []
    for word in elem.find_all(['w','pc']):
      #find all in sequence, we added the p
      if word in elem.find_all('w'):
        reg = elem_to_reg_text(word)
        elem_words.append(reg)
      else:
        elem_words.append(word.text)
        # punctuation = elem.find('pc')
        # if punctuation is not None:
        # #for punctuation in punctuations:
          #elem_words.append(punctuation.text)
        #do I have to append the punctuation somehow?
    elem_words = [x for x in elem_words if x is not None] # omit empty elements
    if elem.name == 'sp':
        elem_text = ' '.join(elem_words[1:]) # omit character's name from speech
    else:
        elem_text = ' '.join(elem_words)
    elem_text = ' '.join(elem_text.split()) # normalize whitespace
    elem_data = {'speaker':speaker,
                 'direction_type':direction_type,
                 'position':position,
                 'act':act,
                 'scene':scene,
                 'text':elem_text}
    data_dict[id] = elem_data
print(list(data_dict.items())[:4]) # print first item in dictionary

[('stg-0006', {'speaker': nan, 'direction_type': 'entrance', 'position': 0, 'act': '1', 'scene': '1', 'text': 'Enter Moroso , Sophocles , and Tranio , with Rosemary , as from a wedding .'}), ('sp-0008', {'speaker': 'Moroso', 'direction_type': nan, 'position': 1, 'act': '1', 'scene': '1', 'text': '. GOd give ’em joy .'}), ('sp-0010', {'speaker': 'Tranio', 'direction_type': nan, 'position': 2, 'act': '1', 'scene': '1', 'text': 'Amen .'}), ('sp-0011', {'speaker': 'Sophocles', 'direction_type': nan, 'position': 3, 'act': '1', 'scene': '1', 'text': 'Amen , say I too : The Puddings now i’ th’ proof ; alas poor wench , Through what a mine of patience must thou work , Ere thou know’st good hour more ?'})]


In [None]:

data_dict = {} 
soup = bs(text, 'xml')
elems = soup.find_all(['stage','sp'])
for elem in elems:
    id = elem['xml:id']
    position = elems.index(elem)
    act = elem_parent_data(elem)['act']
    scene = elem_parent_data(elem)['scene']
   # sentence = elem_parent_data(elem)['sentence']
    if elem.name == 'sp':
        speaker = elem['who'].lstrip('#').replace('_Prize', '')
    else:
        speaker = np.nan
    if elem.name == 'stage':
        direction_type = elem['type']
    else:
        direction_type = np.nan
    elem_words = []
    for word in elem.find_all(['w','pc']):
      #find all in sequence, we added the p
        reg = elem_to_reg_text(word)
        elem_words.append(reg)
        #new part I tried adding today
        #maybe need to add a line here?
        #like
        #line = elem_text split by punctuation so that way it only prints the punctuation once?
        punctuations = elem.find_all('pc')
        for punctuation in punctuations:
          elem_words.append(punctuation.text)
        #do I have to append the punctuation somehow?
    elem_words = [x for x in elem_words if x is not None] # omit empty elements
    if elem.name == 'sp':
        elem_text = ' '.join(elem_words[1:]) # omit character's name from speech
    else:
        elem_text = ' '.join(elem_words)
    elem_text = ' '.join(elem_text.split()) # normalize whitespace
    elem_data = {'speaker':speaker,
                 'direction_type':direction_type,
                 'position':position,
                 'act':act,
                 'scene':scene,
                 'text':elem_text}
    data_dict[id] = elem_data
print(list(data_dict.items())[:2]) # print first item in dictionary

NameError: ignored

In [None]:
# Convert dictionary to dataframe
df = pd.DataFrame(data_dict).T.rename_axis('id').reset_index().sort_values('position')
print(df.head)

<bound method NDFrame.head of             id    speaker direction_type position act scene  \
0     stg-0006        NaN       entrance        0   1     1   
1      sp-0008     Moroso            NaN        1   1     1   
2      sp-0010     Tranio            NaN        2   1     1   
3      sp-0011  Sophocles            NaN        3   1     1   
4      sp-0016     Tranio            NaN        4   1     1   
...        ...        ...            ...      ...  ..   ...   
1417   sp-3396    Rowland            NaN     1417   5     4   
1418   sp-3398     Tranio            NaN     1418   5     4   
1419   sp-3400    Rowland            NaN     1419   5     4   
1420   sp-3401  Petruchio            NaN     1420   5     4   
1421  stg-3404        NaN           exit     1421   5     4   

                                                   text  
0     Enter , , , , . Moroso , , , , . , , , , . Sop...  
1          . . . . GOd . . give . . ’em . . joy . . . .  
2                                      

In [None]:
type(df['text'])

pandas.core.series.Series

In [None]:
(df['text'])[1]

'. . . . GOd . . give . . ’em . . joy . . . .'

In [None]:
def character_list(string):
  characters = ['Moroso', 'Tranio', 'Bianca', 'CityWife', 'CountryWife','Doctor', 'Jacques', 'Livia', 'Maids','Maria', 'Pedro', 'Petruchio', 'Rowland','Servant','Sophocles', 'Tranio', 'Watchmen'] # add full list here
  characters_list = []

  for word in string.split(" "):
    if word in characters:
      characters_list.append(word)
  return(characters_list)
#will need to run this function over the column


In [None]:
practice = character_list("Enter Moroso")

print(practice)

#so the function works for something simple like this, but isn't working for the dataframe below...

['Moroso']


In [None]:
# Use str.split and str.join and astype
df['text'] = df['text'].astype(str)

In [None]:
#df['character_list'] = df['text'].apply(character_list)

for index, row in df.iterrows():
  row['character_list'] = character_list(row['text']) 
 
df.head(20)

Unnamed: 0,id,speaker,direction_type,position,act,scene,text
0,stg-0006,,entrance,0,1,1,"Enter , , , , . Moroso , , , , . , , , , . Sop..."
1,sp-0008,Moroso,,1,1,1,. . . . GOd . . give . . ’em . . joy . . . .
2,sp-0010,Tranio,,2,1,1,. . Amen . .
3,sp-0011,Sophocles,,3,1,1,", : ; , , ? , : ; , , ? Amen , : ; , , ? , : ;..."
4,sp-0016,Tranio,,4,1,1,": , , , , ; . : , , , , ; . ’Tis : , , , , ; ...."
5,sp-0021,Moroso,,5,1,1,", . , . Methinks , . now , . , . He’s , . not ..."
6,sp-0023,Sophocles,,6,1,1,", , . , , . This , , . old , , . thief , , . f..."
7,sp-0025,Tranio,,7,1,1,? ? But ? shall ? he ? have ? her ? ?
8,sp-0026,Sophocles,,8,1,1,", . . , . . Yes , . . , . . when , . . I , . ...."
9,sp-0028,Moroso,,9,1,1,", . , . I’ll , . assure , . ye , . , . I , . h..."


In [None]:
gender_dict = {}

male_characters = ['Moroso','Sophocles', 'Tranio', 'Jacques', 'Doctor', 'Pedro', 'Petruchio', 'Rowland', 'Servant', 'Sophocles', 'Watchmen'] 
female_characters = ['Bianca', "CityWife", "CountryWife", "Livia", "Maids", "Maria"] 

for female_character in female_characters:
  gender_dict[female_character] = "F"

for male_character in male_characters:
  gender_dict[male_character] = "M"

# repeat for each character

#somehow add this to the dataframe..

print(list(gender_dict.items())[:1]) # print first item in dictionary.

[('Bianca', 'F')]


In [None]:
#convert dictionary to dataframe

df_gender = pd.DataFrame([gender_dict])

In [None]:
#transpose dataframe
df_gender_transposed = df_gender.T # or df1.transpose()

df_gender_transposed

Unnamed: 0,0
Bianca,F
CityWife,F
CountryWife,F
Livia,F
Maids,F
Maria,F
Moroso,M
Sophocles,M
Tranio,M
Jacques,M


In [None]:
#new version with fixed characters is called file2.json, original version was file.json

# storing the data in JSON format
df.to_json('file2.json', orient = 'split', compression = 'infer', index = 'true')
 
# reading the JSON file
df = pd.read_json('file2.json', orient ='split', compression = 'infer')

#I can see them when I access my google drive on the left side now as "file.json"

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# save this as a JSON, then use the data frames I created in a new google colab notebook

In [None]:
# convert powerframes lexicon into dataframe w/ three columns
# iterrate through that and lemmatize each verb 
# add verb lemma as additional column in powerframes dataframe

# iterrate through each line (row) of main dataframe -- use iterrows()
  # for each line, pull out the ['text'] element
  # convert that element to a spacy object using nlp()
  # iterrate through list of words in nlp object
    # if the word is a verb, 
      # 1. lemmatize 
      # 2. add to temporary array of verbs 

  # initialize your 6 power counters at 0
    # e.g. pos_agency_counter = 0

  # iterrate through list of verbs
    # for each verb lemma, check to see if there is a match in the lemma column of the powerframes lexicon;
    # this involves looping through each row in the powerframes lexicon and looking for a match <-- might be better method than iterrows but maybe not

    # if there is a match, then:
      # look at value of power column in that row
      # increment appropriate power counter (one of 3 possible choices)
      # look at value of agency column in that row
      # increment appropriate agency counter (one of 3 possible choices)
      # break! 

  # now that you're done with all of the verbs in that dialogue chunk, add your six counter counts to the main dataframe for that row 
    
### SAVE DATAFRAME AS JSON!!!

# 



https://colab.research.google.com/drive/1KBRYPxsTJ2-VhiBKzUTE-98ayqMqjkVK#scrollTo=ESic8t4X2RPY

Lemmatizing Notebok

https://maartensap.com/connotation-frames/

Link to downloading the frames