In [32]:
# !pip install spacy pandas
# !python -m spacy download en_core_web_sm

In [33]:
import spacy
import pandas as pd


Load models

In [34]:
# Load the en_core_web_sm spaCy pipeline once; extract_ent reuses this object for every text.
nlp = spacy.load("en_core_web_sm")

Named Entity Recognition

In [35]:
def extract_ent(text):
  """Run the module-level ``nlp`` pipeline on ``text`` and tabulate its entities.

  Parameters
  ----------
  text : str
      Raw text to analyse.

  Returns
  -------
  pandas.DataFrame
      One row per entity in ``doc.ents`` with the columns:

      - ``text``: the entity's surface string (``ent.text``)
      - ``n_word``: number of whitespace-separated words in ``ent.text``
      - ``start_pos`` / ``end_pos``: ``ent.start_char`` / ``ent.end_char``
      - ``label``: the entity label string (``ent.label_``)

      The five columns are always present, even when no entity is found,
      so callers can index them without special-casing empty results.
  """
  columns = ["text", "n_word", "start_pos", "end_pos", "label"]
  doc = nlp(text)
  records = [
    {
      "text": ent.text,
      # split() without an argument collapses runs of whitespace (double
      # spaces, newlines), which split(" ") would count as extra words.
      "n_word": len(ent.text.split()),
      "start_pos": ent.start_char,
      "end_pos": ent.end_char,
      "label": ent.label_
    }
    for ent in doc.ents
  ]
  frame = pd.DataFrame(records, columns=columns)
  # Pin the integer dtypes so an empty result has the same schema as a
  # non-empty one and concatenates with it without changing dtypes.
  return frame.astype({"n_word": "int64", "start_pos": "int64", "end_pos": "int64"})


# Load the laureate table; the loop below reads its 'biography', 'category'
# and 'name' columns.
df = pd.read_csv('physics_and_chemistry_nobel_laureate.csv')

# Collect one entity table per biography and concatenate once at the end.
# Calling pd.concat inside the loop copies every previously accumulated row
# on each iteration, which is quadratic in the number of biographies.
entity_frames = []
for index, row in df.iterrows():
  text = row['biography']
  category = row['category']
  subject = row["name"]
  df_ents = extract_ent(text)
  # Tag every entity with the laureate it came from.
  df_ents['subject'] = subject
  df_ents['source_index'] = index
  df_ents["category"] = category
  entity_frames.append(df_ents)

# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame when the CSV has no rows.
df_entities_spacy = (
  pd.concat(entity_frames, ignore_index=True) if entity_frames else pd.DataFrame()
)

In [36]:
# Preview the first rows of the combined spaCy entity table.
df_entities_spacy.head()

Unnamed: 0,text,n_word,start_pos,end_pos,label,subject,source_index,category
0,Wilhelm Conrad Röntgen,3,0,22,PERSON,Wilhelm Röntgen,0,Physics
1,German,1,26,32,NORP,Wilhelm Röntgen,0,Physics
2,27 March 1845,3,70,83,DATE,Wilhelm Röntgen,0,Physics
3,10 February 1923,3,86,102,DATE,Wilhelm Röntgen,0,Physics
4,German,1,110,116,NORP,Wilhelm Röntgen,0,Physics


## Statistics for spaCy

In [37]:
# def computeStats(df_ents):

#     stats_count = df_ents['label'].value_counts().describe()

#     df_ents['length'] = df_ents['text'].apply(len)
#     stats_length = df_ents.groupby('label')['length'].describe()

#     return stats_count, stats_length



## Function to compare 2 entity dataframes

In [38]:
import numpy as np

In [39]:
# np.unique returns the distinct subject names in sorted order.
list_people = np.unique(df_entities_spacy["subject"])

In [44]:
def _ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


# One dict of agreement metrics per subject, in the order of list_people.
aggregate_results = []

for people in list_people:

    results = {}
    results["subject"] = people

    subject_mask = df_entities_spacy["subject"] == people
    filter_df_spacy = df_entities_spacy[subject_mask][["text", "label"]]
    # NOTE(review): the "stanza" side is also read from df_entities_spacy, so
    # this loop compares the spaCy entities with themselves and every metric
    # comes out as full agreement. Presumably a separate Stanza entity table
    # is meant here -- confirm and swap it in once it exists.
    filter_df_stanza = df_entities_spacy[subject_mask][["text", "label"]]

    entities_spacy = filter_df_spacy["text"].to_list()
    entities_stanza = filter_df_stanza["text"].to_list()

    labels_spacy = filter_df_spacy["label"].to_list()
    labels_stanza = filter_df_stanza["label"].to_list()

    # Pair each entity with its own label. The previous nested comprehension
    # iterated over all labels for every entity, so each entity ended up
    # mapped to the last label in the list. When an entity text occurs more
    # than once, the label of its last occurrence is kept.
    dict_label_spacy = dict(zip(entities_spacy, labels_spacy))
    dict_label_stanza = dict(zip(entities_stanza, labels_stanza))

    # Sets give constant-time membership tests; the lists below still keep
    # duplicates and original order, exactly as before.
    entity_set_spacy = set(entities_spacy)
    entity_set_stanza = set(entities_stanza)

    common_entities = [ent for ent in entities_spacy if ent in entity_set_stanza]
    spacy_diff = [ent for ent in entities_spacy if ent not in entity_set_stanza]
    stanza_diff = [ent for ent in entities_stanza if ent not in entity_set_spacy]

    # Shares of entities found by both tools, only by spaCy, and only by
    # Stanza. _ratio yields NaN instead of raising when there are no entities.
    total_num_entities = len(common_entities) + len(spacy_diff) + len(stanza_diff)
    diff_spacy = _ratio(len(spacy_diff), total_num_entities)
    diff_stanza = _ratio(len(stanza_diff), total_num_entities)

    agreement_entities = _ratio(len(common_entities), total_num_entities)

    results["agreement_entities"] = agreement_entities
    results["diff_spacy_entities"] = diff_spacy
    results["diff_stanza_entities"] = diff_stanza

    # Among the shared entities, split by whether both sides assign the same label.
    common_labels = [(ent, dict_label_spacy[ent]) for ent in common_entities if dict_label_spacy[ent] == dict_label_stanza[ent]]
    spacy_label_diff = [(ent, dict_label_spacy[ent]) for ent in common_entities if dict_label_spacy[ent] != dict_label_stanza[ent]]
    stanza_label_diff = [(ent, dict_label_stanza[ent]) for ent in common_entities if dict_label_spacy[ent] != dict_label_stanza[ent]]

    # NaN when the two sides share no entity, since label agreement is undefined then.
    agreement_labels = _ratio(len(common_labels), len(common_entities))

    results["agreement_labels"] = agreement_labels

    # NOTE(review): `agreement` is computed but not stored in `results`;
    # it is kept here so the result schema stays unchanged.
    if len(common_entities) == len(entities_stanza):
        agreement = "Total Agreement"
    else:
        agreement = "Partial Agreement"

    aggregate_results.append(results)

    ############# LEFT TO DO: COMPUTE STATS OF ACCURACY ETC


In [45]:
# Show the per-subject agreement metrics collected in aggregate_results.
aggregate_results

[{'subject': 'Aage Bohr',
  'agreement_entities': 1.0,
  'diff_spacy_entities': 0.0,
  'diff_stanza_entities': 0.0,
  'agreement_labels': 1.0},
 {'subject': 'Aaron Klug',
  'agreement_entities': 1.0,
  'diff_spacy_entities': 0.0,
  'diff_stanza_entities': 0.0,
  'agreement_labels': 1.0},
 {'subject': 'Adolf Friedrich Johann Butenandt',
  'agreement_entities': 1.0,
  'diff_spacy_entities': 0.0,
  'diff_stanza_entities': 0.0,
  'agreement_labels': 1.0},
 {'subject': 'Adolf Otto Reinhold Windaus',
  'agreement_entities': 1.0,
  'diff_spacy_entities': 0.0,
  'diff_stanza_entities': 0.0,
  'agreement_labels': 1.0},
 {'subject': 'Adolf von Baeyer',
  'agreement_entities': 1.0,
  'diff_spacy_entities': 0.0,
  'diff_stanza_entities': 0.0,
  'agreement_labels': 1.0},
 {'subject': 'Albert A. Michelson',
  'agreement_entities': 1.0,
  'diff_spacy_entities': 0.0,
  'diff_stanza_entities': 0.0,
  'agreement_labels': 1.0},
 {'subject': 'Albert Einstein',
  'agreement_entities': 1.0,
  'diff_spacy_en