In [1]:
import os
import stanza.pipeline
# Fetch the default English resources for Stanza.
stanza.download(lang='en')

# Build the pipeline once, up front, and reuse it for every text: it is
# restricted to the tokenizer and the NER model because Stanza takes a
# while to run.
nlp_stanza = stanza.Pipeline(lang='en', processors='tokenize,ner')

  from .autonotebook import tqdm as notebook_tqdm
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 379kB [00:00, 119MB/s]                     
2024-06-13 12:58:43 INFO: Downloaded file to /Users/tunji/stanza_resources/resources.json
2024-06-13 12:58:43 INFO: Downloading default packages for language: en (English) ...
2024-06-13 12:58:44 INFO: File exists: /Users/tunji/stanza_resources/en/default.zip
2024-06-13 12:58:46 INFO: Finished downloading models and saved to /Users/tunji/stanza_resources
2024-06-13 12:58:46 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 379kB [00:00, 23.1MB/s]                    
2024-06-13 12:58:46 INFO: Downloaded file to /Users/tunji/stanza_resources/resources.js

In [2]:
import pandas as pd

In [3]:

def extract_entities_stanza(text):
    """Run the Stanza pipeline on ``text`` and tabulate the named entities.

    Parameters
    ----------
    text : str
        Text to annotate. Any value that is not a string (for example a
        missing value) is treated as "no text".

    Returns
    -------
    pandas.DataFrame
        One row per entity, with the columns ``text`` (entity text),
        ``n_word`` (number of whitespace-separated words in the entity),
        ``start_pos`` / ``end_pos`` (the entity's ``start_char`` / ``end_char``,
        kept so the spans can be checked for overlap with spaCy entities)
        and ``label`` (the entity type). The frame is empty and has no
        columns when ``text`` is not a string or no entity is found.
    """
    # isinstance, unlike an exact type comparison, also accepts str
    # subclasses (e.g. numpy.str_), which would otherwise be dropped silently.
    if not isinstance(text, str):
        return pd.DataFrame()

    # first, get the doc
    doc = nlp_stanza(text)

    records = [
        {
            "text": ent.text,
            # split() with no argument splits on any run of whitespace, so an
            # entity spanning a newline or a double space is counted correctly.
            "n_word": len(ent.text.split()),
            "start_pos": ent.start_char,
            "end_pos": ent.end_char,
            "label": ent.type,
        }
        for sentence in doc.sentences
        for ent in sentence.ents
    ]
    # this information is saved in a df
    return pd.DataFrame(records)

# Locate the input CSV: go two directory levels up from the current working
# directory, then into the "Data Directory" folder.
path = os.getcwd()
parent = os.path.dirname(os.path.dirname(path))
# NOTE: despite its name, data_dir holds the path of the CSV file itself.
data_dir = os.path.join(parent,"Data Directory", "physics_and_chemistry_nobel_laureate.csv")
df = pd.read_csv(data_dir)

# Please be patient, this will take a while
# One entity frame is produced per row of df; the frames are collected in a
# list and concatenated once at the end, because calling pd.concat inside the
# loop re-copies the whole accumulated frame on every iteration.
entity_frames = []
for index, row in df.iterrows():
    # contextual information for this row (text, category, subject)
    text = row['biography']
    category = row['category']
    subject = row["name"]
    # entities found by Stanza (an empty frame when there are none)
    df_ents = extract_entities_stanza(text)
    # adding the contextual info
    df_ents['subject'] = subject
    df_ents['biography'] = text
    df_ents['source_index'] = index
    df_ents["category"] = category
    entity_frames.append(df_ents)

# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame when df has no rows.
if entity_frames:
    df_allents = pd.concat(entity_frames, ignore_index=True)
else:
    df_allents = pd.DataFrame()

In [4]:
# Preview the first five rows of the combined entity table.
df_allents.head()

Unnamed: 0,text,n_word,start_pos,end_pos,label,subject,biography,source_index,category
0,Wilhelm Conrad Röntgen,3.0,0.0,22.0,PERSON,Wilhelm Röntgen,Wilhelm Conrad Röntgen (; German pronunciation...,0,Physics
1,German,1.0,26.0,32.0,NORP,Wilhelm Röntgen,Wilhelm Conrad Röntgen (; German pronunciation...,0,Physics
2,27 March 1845,3.0,70.0,83.0,DATE,Wilhelm Röntgen,Wilhelm Conrad Röntgen (; German pronunciation...,0,Physics
3,10 February 1923,3.0,86.0,102.0,DATE,Wilhelm Röntgen,Wilhelm Conrad Röntgen (; German pronunciation...,0,Physics
4,German,1.0,110.0,116.0,NORP,Wilhelm Röntgen,Wilhelm Conrad Röntgen (; German pronunciation...,0,Physics


TODO: add summary statistics for the extracted entities.

In [3]:
# saving the df to make the comparison between SpaCy and Stanza in the next notebook
# NOTE: this cell relies on `os` (imported in the first cell) and on
# `df_allents` (built by the extraction loop). On a fresh kernel, run those
# cells first; otherwise this cell fails with a NameError.
df_path = os.path.join(os.getcwd(), "df_entities_stanza.csv")

df_allents.to_csv(df_path, index=False)

NameError: name 'os' is not defined