*Note: to do the comparisons of entities, we will be working with the CSV files we created with the entities per bio since running the entity recognizer for each package would take a long time, specially for Stanza.

Checking that the list of entities possible for both packages are the same

In [8]:
import stanza

# get the list of tags from Stanza to double-check that they are the same as the SpaCy ones
nlp_stanza = stanza.Pipeline('en',processors= 'tokenize,ner')
tags_stanza = list(nlp_stanza.processors['ner'].get_known_tags())

2024-06-15 20:35:34 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 379kB [00:00, 5.56MB/s]                    
2024-06-15 20:35:34 INFO: Downloaded file to C:\Users\alber\stanza_resources\resources.json
2024-06-15 20:35:35 INFO: Loading these models for language: en (English):
| Processor | Package                   |
-----------------------------------------
| tokenize  | combined                  |
| mwt       | combined                  |
| ner       | ontonotes-ww-multi_charlm |

2024-06-15 20:35:35 INFO: Using device: cuda
2024-06-15 20:35:35 INFO: Loading: tokenize
2024-06-15 20:35:35 INFO: Loading: mwt
2024-06-15 20:35:35 INFO: Loading: ner
2024-06-15 20:35:36 INFO: Done loading processors!


In [9]:
import spacy

# get the list of tags from SpaCy to double-check that they are the same as the Stanza ones
nlp = spacy.load("en_core_web_sm")
tags_spacy = list(nlp.get_pipe('ner').labels)

In [10]:
# they are indeed the same!
tags_spacy == tags_stanza

True

## SpaCy and Stanza agreement

In [11]:
import numpy as np
import pandas as pd
import os

In [12]:
df_entities_spacy = pd.read_csv(os.path.join(os.getcwd(), "df_entities_spacy_processed.csv"))
df_entities_stanza = pd.read_csv(os.path.join(os.getcwd(), "df_entities_stanza_processed.csv"))

In [13]:
# saving a list of people we will use to iterate over the dataframes
list_people = np.unique(df_entities_spacy["subject"])

In [14]:
df_entities_spacy.columns

Index(['text', 'n_word', 'start_pos', 'end_pos', 'label', 'subject',
       'source_index', 'category', 'range_span'],
      dtype='object')

### Function to check individual biographies

In [15]:
def comparison_bio_entities(bio_subject, df_spacy, df_stanza):
    
    filter_df_spacy = df_spacy[df_entities_spacy["subject"] == bio_subject][["subject", "text", "label", "start_pos", "range_span", "category"]]
    filter_df_stanza = df_stanza[df_entities_stanza["subject"] == bio_subject][["subject", "text", "label", "start_pos", "range_span", "category"]]
    category = filter_df_stanza["category"].iloc[0]

    # initiating indexes for the spacy and stanza filter dfs
    i = 0
    j = 0
    
    # agreement counts
    partial_agreement = 0
    total_agreement = 0
    label_agreement = 0
    
    dict_ners = {}
    for ner in tags_spacy:
        dict_ners[ner] = {"Total": 0, "Agreed": 0}
        
    # iterate over the rows of the filter dfs as long as there are rows left to check in BOTH of them
    # the purpose of this while loop is to check the agreements so it is fine if it ends once it has checked either df
    while i+1 < len(filter_df_spacy) and j+1 < len(filter_df_stanza): # adjusting for mismatch between indexing and length

        # print(i, j)
        token_text_spacy = filter_df_spacy.iloc[i]["text"]
        token_text_stanza = filter_df_stanza.iloc[j]["text"]
        
        # using list objects in pandas is a bit complicated since they are not saved in list type
        # these two lines are manually reconstructing the list object
        token_span_spacy = filter_df_spacy.iloc[i]["range_span"].replace("[", "").replace("]", "").replace("\n", "").replace("  ", " ").strip().split(" ")
        token_span_stanza = filter_df_stanza.iloc[j]["range_span"].replace("[", "").replace("]", "").replace("\n", "").replace("  ", " ").strip().split(" ")
        
        
        pos_token_spacy = filter_df_spacy.iloc[i]["label"]
        pos_token_stanza = filter_df_stanza.iloc[j]["label"]
        
        # checking if the entities have overlapping indexes, that is, if they are partial matches the list will be of length > 1
        list_agreement_span = [index for index in token_span_spacy if index in token_span_spacy and index in token_span_stanza]
            
        # if the spans are the same, the entities will be the same and we will have a total agreement
        if token_span_spacy == token_span_stanza:
        # print(token_text_spacy, token_text_stanza)
            total_agreement = total_agreement + 1
            dict_ners[pos_token_spacy]["Total"] = dict_ners[pos_token_spacy]["Total"] + 1
            dict_ners[pos_token_stanza]["Total"] = dict_ners[pos_token_stanza]["Total"] + 1
                
                # checking the labels for the entities are the same
            if pos_token_spacy == pos_token_stanza:
                label_agreement = label_agreement + 1
                    
                dict_ners[pos_token_spacy]["Agreed"] = dict_ners[pos_token_spacy]["Agreed"] + 1
                dict_ners[pos_token_stanza]["Agreed"] = dict_ners[pos_token_stanza]["Agreed"] + 1
                
                # we advance the indexes of both filter dfs
            i = i + 1
            j = j + 1
            
        elif len(list_agreement_span) > 1:
                # print(token_text_spacy, token_text_stanza)
                partial_agreement = partial_agreement + 1 
                
                dict_ners[pos_token_spacy]["Total"] = dict_ners[pos_token_spacy]["Total"] + 1
                dict_ners[pos_token_stanza]["Total"] = dict_ners[pos_token_stanza]["Total"] + 1
                
                # checking the labels for the entities are the same
                if pos_token_spacy == pos_token_stanza:
                    label_agreement = label_agreement + 1
                    
                    dict_ners[pos_token_spacy]["Agreed"] = dict_ners[pos_token_spacy]["Agreed"] + 1
                    dict_ners[pos_token_stanza]["Agreed"] = dict_ners[pos_token_stanza]["Agreed"] + 1
                
                # we advance the indexes of both filter dfs    
                i = i + 1
                j = j + 1
            
            # if there is no agreement we will advance the indexes differently; 
        else:
            # if the ending spam of the entity of spacy is smaller than the one in stanza, 
            # that means that stanza is "further" along the text so even if these indexes did not have agreement in entities,
            # the following index in the spacy df could be a match with the current index of the stanza df hence we have to check them too
            if token_span_spacy[-1] < token_span_stanza[-1]:
                i = i + 1
                j = j
            # for the reverse case
            elif token_span_spacy[-1] > token_span_stanza[-1]:
                i = i
                j = j + 1
                
    total = len(filter_df_spacy) + len(filter_df_stanza) - partial_agreement - total_agreement
    per_agreement = round(((total_agreement+partial_agreement)/total), 2)
    
    # saving the results in a dictionary which will later be turned into a df for analysis
    results_person = {"subject":bio_subject,
                      "category": category,
                    "total_entities": total,
                    "percent_partial_or_total_agreement_span":per_agreement,  # over the total amount of entities
                    "total_agreement_per": round((total_agreement/total), 2),  # over the total amount of entities
                    "partial_agreement_per": round((partial_agreement/total), 2),  # over the total amount of entities
                    "ner_breakdown": dict_ners}
        # getting aggregate results for the entity types
    for key in results_person["ner_breakdown"].keys():
        results_person["total_ner_aggregate"] = round((sum([results_person["ner_breakdown"][key]["Agreed"] for key in results_person["ner_breakdown"].keys()])  / sum([results_person["ner_breakdown"][key]["Total"] for key in results_person["ner_breakdown"].keys()])), 2)
        if results_person["ner_breakdown"][key]["Total"] != 0: # avoiding zero division
            agreement_label_perc = round((results_person["ner_breakdown"][key]["Agreed"] / results_person["ner_breakdown"][key]["Total"]), 2)
            results_person["ner_breakdown"][key]["agreement_per"] = agreement_label_perc
            
        else:
            results_person["ner_breakdown"][key]["agreement_per"] = "Not applicable"
            
    return results_person
    

In [16]:
results_scientist_1 = comparison_bio_entities(list_people[0], df_entities_spacy, df_entities_stanza)

In [17]:
results_scientist_1

{'subject': 'Aage Bohr',
 'category': 'Physics',
 'total_entities': 315,
 'percent_partial_or_total_agreement_span': 0.83,
 'total_agreement_per': 0.74,
 'partial_agreement_per': 0.09,
 'ner_breakdown': {'CARDINAL': {'Total': 28,
   'Agreed': 24,
   'agreement_per': 0.86},
  'DATE': {'Total': 129, 'Agreed': 126, 'agreement_per': 0.98},
  'EVENT': {'Total': 0, 'Agreed': 0, 'agreement_per': 'Not applicable'},
  'FAC': {'Total': 3, 'Agreed': 2, 'agreement_per': 0.67},
  'GPE': {'Total': 73, 'Agreed': 46, 'agreement_per': 0.63},
  'LANGUAGE': {'Total': 0, 'Agreed': 0, 'agreement_per': 'Not applicable'},
  'LAW': {'Total': 0, 'Agreed': 0, 'agreement_per': 'Not applicable'},
  'LOC': {'Total': 1, 'Agreed': 0, 'agreement_per': 0.0},
  'MONEY': {'Total': 2, 'Agreed': 2, 'agreement_per': 1.0},
  'NORP': {'Total': 26, 'Agreed': 26, 'agreement_per': 1.0},
  'ORDINAL': {'Total': 10, 'Agreed': 10, 'agreement_per': 1.0},
  'ORG': {'Total': 92, 'Agreed': 70, 'agreement_per': 0.76},
  'PERCENT': {'Tot

### Total results

In [18]:
total_results = [comparison_bio_entities(person, df_entities_spacy, df_entities_stanza) for person in list_people]

In [19]:
# getting the relevant info for our analysis
keys_aggregate = ["subject", "category", "percent_partial_or_total_agreement_span", "total_agreement_per", "partial_agreement_per", "total_ner_aggregate"]
results_aggregate_clean = []
for result in total_results:
    dict_result = {k:v for k,v in result.items() if k in keys_aggregate}
    for k, val in result["ner_breakdown"].items():
        dict_result[k] = result["ner_breakdown"][k]["agreement_per"]
    results_aggregate_clean.append(dict_result)

In [20]:
df_results = pd.DataFrame(results_aggregate_clean)

In [21]:
df_results

Unnamed: 0,subject,category,percent_partial_or_total_agreement_span,total_agreement_per,partial_agreement_per,total_ner_aggregate,CARDINAL,DATE,EVENT,FAC,...,MONEY,NORP,ORDINAL,ORG,PERCENT,PERSON,PRODUCT,QUANTITY,TIME,WORK_OF_ART
0,Aage Bohr,Physics,0.83,0.74,0.09,0.81,0.86,0.98,Not applicable,0.67,...,1.0,1.0,1.0,0.76,Not applicable,0.79,0.0,Not applicable,1.0,0.58
1,Aaron Klug,Chemistry,0.81,0.69,0.12,0.78,1.0,1.00,Not applicable,0.0,...,Not applicable,0.71,Not applicable,0.7,Not applicable,0.76,Not applicable,Not applicable,Not applicable,0.43
2,Adolf Friedrich Johann Butenandt,Chemistry,0.06,0.06,0.01,1.00,Not applicable,1.00,1.0,Not applicable,...,Not applicable,1.0,1.0,1.0,Not applicable,1.0,Not applicable,Not applicable,Not applicable,1.0
3,Adolf Otto Reinhold Windaus,Chemistry,0.20,0.18,0.02,0.86,1.0,1.00,0.0,Not applicable,...,Not applicable,1.0,Not applicable,0.86,Not applicable,0.8,Not applicable,Not applicable,Not applicable,0.86
4,Adolf von Baeyer,Chemistry,0.10,0.08,0.01,0.83,0.67,0.92,Not applicable,Not applicable,...,Not applicable,0.92,Not applicable,0.57,Not applicable,0.8,Not applicable,Not applicable,Not applicable,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,William N. Lipscomb,Chemistry,0.05,0.04,0.01,0.96,1.0,1.00,Not applicable,Not applicable,...,Not applicable,1.0,Not applicable,0.91,Not applicable,1.0,Not applicable,Not applicable,Not applicable,0.67
196,William Shockley,Physics,0.78,0.63,0.15,0.81,0.93,0.96,0.5,0.0,...,1.0,0.76,1.0,0.68,0.67,0.79,0.17,Not applicable,Not applicable,0.73
197,Willis Lamb,Physics,0.88,0.84,0.04,0.89,0.8,0.97,Not applicable,0.0,...,Not applicable,0.8,1.0,0.81,Not applicable,0.95,0.0,Not applicable,Not applicable,0.73
198,Wolfgang Pauli,Physics,0.79,0.64,0.15,0.63,0.73,0.98,0.33,0.0,...,Not applicable,0.9,0.92,0.55,Not applicable,0.62,0.0,Not applicable,1.0,0.22


In [22]:
# saving the df to analyse it in the next notebook
df_results.to_csv("results_ner_comparison.csv", index=False)