In [2]:
import os
import spacy
import pandas as pd

# Load the spaCy pipeline named "en_core_web_sm" once, at module level;
# extract_ent() calls this shared `nlp` object for every text it processes.
nlp = spacy.load("en_core_web_sm")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def extract_ent(text):
  """Run the spaCy pipeline on ``text`` and return its named entities as a DataFrame.

  Parameters
  ----------
  text : str
    Raw text to analyse. Any non-string value (for example a NaN coming from
    a missing CSV cell) yields an empty result instead of being sent to spaCy.

  Returns
  -------
  pandas.DataFrame
    One row per entity in ``doc.ents`` with the columns ``text``, ``n_word``,
    ``start_pos``, ``end_pos`` and ``label``. The columns are present even
    when the frame has no rows, so callers always get the same schema.
  """
  columns = ["text", "n_word", "start_pos", "end_pos", "label"]
  if not isinstance(text, str):
    return pd.DataFrame(columns=columns)

  # first, get the doc
  doc = nlp(text)

  # For each entity we keep the text, its number of words, the start and end
  # character positions (used later to check whether the spans found by SpaCy
  # and Stanza overlap) and the NER label.
  records = [
    {
      "text": ent.text,
      # split() without an argument splits on any run of whitespace, so
      # repeated spaces or line breaks inside an entity do not inflate the count
      "n_word": len(ent.text.split()),
      "start_pos": ent.start_char,
      "end_pos": ent.end_char,
      "label": ent.label_,
    }
    for ent in doc.ents
  ]
  return pd.DataFrame(records, columns=columns)


# Locate the laureate CSV relative to the current working directory:
# climb two directory levels, then descend into "Data Directory".
path = os.getcwd()
parent = os.path.dirname(path)
parent = os.path.dirname(parent)
data_dir = os.path.join(
  parent,
  "Data Directory",
  "physics_and_chemistry_nobel_laureate.csv",
)
df = pd.read_csv(data_dir)

# Preview the first rows
df.head()

Unnamed: 0,year,name,country,category,biography,clean_biography
0,1901,Wilhelm Röntgen,German Empire,Physics,Wilhelm Conrad Röntgen (; German pronunciation...,Wilhelm Conrad Röntgen german pronunciation ˈv...
1,1902,Hendrik Lorentz,Netherlands,Physics,Hendrik Antoon Lorentz (; 18 July 1853 – 4 Feb...,Hendrik Antoon Lorentz July February dutch phy...
2,1902,Pieter Zeeman,Netherlands,Physics,Pieter Zeeman (Dutch: [ˈzeːmɑn]; 25 May 1865 –...,Pieter Zeeman Dutch ˈzeːmɑn October dutch phys...
3,1903,Henri Becquerel,France,Physics,Antoine Henri Becquerel (; French pronunciatio...,Antoine Henri Becquerel french pronunciation b...
4,1903,Pierre Curie,France,Physics,"Pierre Curie ( KURE-ee, French: [pjɛʁ kyʁi]; 1...",Pierre Curie KURE ee French pjɛʁ kyʁi April fr...


In [4]:


# Run entity extraction on every biography. Each per-biography entity frame is
# enriched with contextual information (text, category, subject, source row)
# alongside the information we get from SpaCy.
entity_frames = []
for index, row in df.iterrows():
  text = row['biography']
  category = row['category']
  subject = row["name"]
  df_ents = extract_ent(text)
  # scalar assignment broadcasts the value to every entity row of this biography
  df_ents['biography'] = text
  df_ents['subject'] = subject
  df_ents['source_index'] = index
  df_ents["category"] = category
  entity_frames.append(df_ents)

# Build the master frame with a single concat. Re-concatenating the growing
# master frame inside the loop copies all previously collected rows on every
# iteration. pd.concat raises on an empty list, so fall back to an empty frame
# when df has no rows.
df_entities_spacy = pd.concat(entity_frames, ignore_index=True) if entity_frames else pd.DataFrame()

In [9]:
# Spot-check the entities extracted for subjects whose name contains "Aage".
# na=False makes rows with a missing subject count as non-matching; without it
# the mask would contain NaN and boolean indexing would raise.
# regex=False matches "Aage" as a literal substring.
df_entities_spacy[df_entities_spacy["subject"].str.contains("Aage", na=False, regex=False)]

Unnamed: 0,text,n_word,start_pos,end_pos,label,biography,subject,source_index,category
37867,Danish,1,17,23,NORP,Aage Niels Bohr (Danish: [ˈɔːwə ˈne̝ls ˈpoɐ̯ˀ]...,Aage Bohr,98,Physics
37868,19 June 1922,3,49,61,DATE,Aage Niels Bohr (Danish: [ˈɔːwə ˈne̝ls ˈpoɐ̯ˀ]...,Aage Bohr,98,Physics
37869,8 September 2009,3,64,80,DATE,Aage Niels Bohr (Danish: [ˈɔːwə ˈne̝ls ˈpoɐ̯ˀ]...,Aage Bohr,98,Physics
37870,Danish,1,88,94,NORP,Aage Niels Bohr (Danish: [ˈɔːwə ˈne̝ls ˈpoɐ̯ˀ]...,Aage Bohr,98,Physics
37871,the Nobel Prize in Physics,5,124,150,WORK_OF_ART,Aage Niels Bohr (Danish: [ˈɔːwə ˈne̝ls ˈpoɐ̯ˀ]...,Aage Bohr,98,Physics
...,...,...,...,...,...,...,...,...,...
38145,11 December 1975,3,10188,10204,DATE,Aage Niels Bohr (Danish: [ˈɔːwə ˈne̝ls ˈpoɐ̯ˀ]...,Aage Bohr,98,Physics
38146,30 January 1963,3,10287,10302,DATE,Aage Niels Bohr (Danish: [ˈɔːwə ˈne̝ls ˈpoɐ̯ˀ]...,Aage Bohr,98,Physics
38147,American Institute of Physics,4,10304,10333,ORG,Aage Niels Bohr (Danish: [ˈɔːwə ˈne̝ls ˈpoɐ̯ˀ]...,Aage Bohr,98,Physics
38148,Niels Bohr Library,3,10335,10353,ORG,Aage Niels Bohr (Danish: [ˈɔːwə ˈne̝ls ˈpoɐ̯ˀ]...,Aage Bohr,98,Physics


## Statistics for spaCy

In [6]:
# Persist the entity table in the current working directory; the next notebook
# reads it to compare SpaCy against Stanza.
df_path = os.path.join(
  os.getcwd(),
  "df_entities_spacy.csv",
)

# index=False: the row index is not written to the file
df_entities_spacy.to_csv(path_or_buf=df_path, index=False)