In [21]:
import os
import stanza

data_dir = os.getcwd()
data_dir = os.path.join(data_dir, "..", "..", "Data Directory")

# get the list of tags from Stanza to double-check that they are the same as the SpaCy ones
nlp_stanza = stanza.Pipeline('en',processors= 'tokenize,ner')
tags_stanza = list(nlp_stanza.processors['ner'].get_known_tags())

2024-06-13 11:55:49 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 379kB [00:00, 24.6MB/s]                    
2024-06-13 11:55:49 INFO: Downloaded file to /Users/tunji/stanza_resources/resources.json
2024-06-13 11:55:50 INFO: Loading these models for language: en (English):
| Processor | Package                   |
-----------------------------------------
| tokenize  | combined                  |
| mwt       | combined                  |
| ner       | ontonotes-ww-multi_charlm |

2024-06-13 11:55:50 INFO: Using device: cpu
2024-06-13 11:55:50 INFO: Loading: tokenize
2024-06-13 11:55:50 INFO: Loading: mwt
2024-06-13 11:55:50 INFO: Loading: ner
2024-06-13 11:55:50 INFO: Done loading processors!


In [22]:
import spacy

# get the list of tags from SpaCy to double-check that they are the same as the Stanza ones
nlp = spacy.load("en_core_web_sm")
tags_spacy = list(nlp.get_pipe('ner').labels)

In [23]:
# they are indeed the same!
tags_spacy == tags_stanza

True

## Comparison

In [24]:
import numpy as np
import pandas as pd

In [25]:
df_entities_spacy = pd.read_csv(os.path.join(data_dir, "df_entities_spacy_processed.csv"))

In [26]:
df_entities_stanza = pd.read_csv(os.path.join(data_dir, "df_entities_stanza_processed.csv"))

In [27]:
# saving a list of people we will use to iterate over the dataframes
list_people = np.unique(df_entities_spacy["subject"])

In [28]:
df_entities_spacy.columns

Index(['text', 'n_word', 'start_pos', 'end_pos', 'label', 'biography',
       'subject', 'source_index', 'category', 'range_span'],
      dtype='object')

### FX

In [29]:
results = []

for person in list_people:
    
    # filter the dataframes by the person to get granular results per biography
    filter_df_spacy = df_entities_spacy[df_entities_spacy["subject"] == person][["subject", "text", "label", "start_pos", "range_span"]]
    filter_df_stanza = df_entities_stanza[df_entities_stanza["subject"] == person][["subject", "text", "label", "start_pos", "range_span"]]
    
    # initiating indexes for the spacy and stanza filter dfs
    i = 0
    j = 0

    partial_agreement = 0
    total_agreement = 0
    label_agreement = 0

    dict_ners = {}
    for ner in list_ners:
        dict_ners[ner] = {"Total": 0, "Agreed": 0}
    
    # print(dict_ners)
    # print(dict_ners.keys())

    # iterate over the rows of the filter dfs as long as there are rows left to check in BOTH of them
    while i+1 < len(filter_df_spacy) and j+1 < len(filter_df_stanza): # adjusting for mismatch between indexing and length

        # print(i, j)
        token_text_spacy = filter_df_spacy.iloc[i]["text"]
        token_text_stanza = filter_df_stanza.iloc[j]["text"]
        
        # print(token_text_spacy, token_text_stanza)
        # print(len(token_text_spacy), len(token_text_stanza))
        
        # using list objects in pandas is a bit complicated since they are not saved in list type
        # these two lines are manually reconstructing the list object
        token_span_spacy = filter_df_spacy.iloc[i]["range_span"].replace("[", "").replace("]", "").replace("\n", "").replace("  ", " ").strip().split(" ")
        token_span_stanza = filter_df_stanza.iloc[j]["range_span"].replace("[", "").replace("]", "").replace("\n", "").replace("  ", " ").strip().split(" ")

        # print(token_span_spacy, token_span_stanza)
        # print(len(token_span_spacy), len(token_span_stanza))

        pos_token_spacy = filter_df_spacy.iloc[i]["label"]
        pos_token_stanza = filter_df_stanza.iloc[j]["label"]

        # checking if the entities have overlapping indexes, that is, if they are partial matches the list will be of length > 1
        list_agreement_span = [index for index in token_span_spacy if index in token_span_spacy and index in token_span_stanza]
        
        # if the spans are the same, the entities will be the same and we will have a total agreement
        if token_span_spacy == token_span_stanza:
            # print(token_text_spacy, token_text_stanza)
            total_agreement = total_agreement + 1
            
            # print(pos_token_spacy)
            # print(pos_token_stanza)
            # print(dict_ners[pos_token_spacy])
            # print(dict_ners[pos_token_stanza])
            # print(dict_ners)
            
            
            dict_ners[pos_token_spacy]["Total"] = dict_ners[pos_token_spacy]["Total"] + 1
            dict_ners[pos_token_stanza]["Total"] = dict_ners[pos_token_stanza]["Total"] + 1
            
            # checking the labels for the entities are the same
            if pos_token_spacy == pos_token_stanza:
                label_agreement = label_agreement + 1
                
                dict_ners[pos_token_spacy]["Agreed"] = dict_ners[pos_token_spacy]["Agreed"] + 1
                dict_ners[pos_token_stanza]["Agreed"] = dict_ners[pos_token_stanza]["Agreed"] + 1
            
            # we advance the indexes of both filter dfs
            i = i + 1
            j = j + 1
            
        # if there are common indexes in the span
        elif len(list_agreement_span) > 1:
            # print(token_text_spacy, token_text_stanza)
            partial_agreement = partial_agreement + 1 
            
            dict_ners[pos_token_spacy]["Total"] = dict_ners[pos_token_spacy]["Total"] + 1
            dict_ners[pos_token_stanza]["Total"] = dict_ners[pos_token_stanza]["Total"] + 1
            
            # checking the labels for the entities are the same
            if pos_token_spacy == pos_token_stanza:
                label_agreement = label_agreement + 1
                
                dict_ners[pos_token_spacy]["Agreed"] = dict_ners[pos_token_spacy]["Agreed"] + 1
                dict_ners[pos_token_stanza]["Agreed"] = dict_ners[pos_token_stanza]["Agreed"] + 1
            
            # we advance the indexes of both filter dfs    
            i = i + 1
            j = j + 1
        
        # if there is no agreement we will advance the indexes differently; 
        else:
            # if the ending spam of the entity of spacy is smaller than the one in stanza, 
            # that means that stanza is "further" along the text so even if these indexes did not have agreement in entities,
            # the following index in the spacy df could be a match with the current index of the stanza df hence we have to check them too
            if token_span_spacy[-1] < token_span_stanza[-1]:
                i = i + 1
                j = j
            # for the reverse case
            elif token_span_spacy[-1] > token_span_stanza[-1]:
                i = i
                j = j + 1
                
        # print(dict_ners)
        # print(dict_ners.keys())
        # print("DONE WITH TOKENS", token_text_spacy, token_text_stanza)
        
    total = len(filter_df_spacy) + len(filter_df_stanza) - partial_agreement - total_agreement
    per_agreement = round(((total_agreement+partial_agreement)/total), 2)
    
    # saving the results in a dictionary which will later be turned into a df for analysis
    results_person = {"subject":person,
                      "total_entities": total,
                      "percent_partial_or_total_agreement_span":per_agreement,  # over the total amount of entities
                      "total_agreement_per": round((total_agreement/total), 2),  # over the total amount of entities
                      "partial_agreement_per": round((partial_agreement/total), 2),  # over the total amount of entities
                      "ner_breakdown": dict_ners}
    
    # getting aggregate results for the entity types
    for key in results_person["ner_breakdown"].keys():
        results_person["total_ner_aggregate"] = round((sum([results_person["ner_breakdown"][key]["Agreed"] for key in results_person["ner_breakdown"].keys()])  / sum([results_person["ner_breakdown"][key]["Total"] for key in results_person["ner_breakdown"].keys()])), 2)
        if results_person["ner_breakdown"][key]["Total"] != 0: # avoiding zero division
            agreement_label_perc = round((results_person["ner_breakdown"][key]["Agreed"] / results_person["ner_breakdown"][key]["Total"]), 2)
            results_person["ner_breakdown"][key]["agreement_per"] = agreement_label_perc
            
        else:
            results_person["ner_breakdown"][key]["agreement_per"] = "Not applicable"
            
    results.append(results_person)
    
    # print(person)
    # print(" Same annotations:", total_agreement)
    # print(" Partial annotations:", partial_agreement)
    # print(" Total:", total)
    # print(" Perc. total or partial annotations:", per_agreement)
    # print()

KeyError: 'NORP'

In [None]:
results[0]

{'subject': 'Aage Bohr',
 'total_entities': 328,
 'percent_partial_or_total_agreement_span': 0.76,
 'total_agreement_per': 0.68,
 'partial_agreement_per': 0.08,
 'ner_breakdown': {'CARDINAL': {'Total': 26,
   'Agreed': 22,
   'agreement_per': 0.85},
  'DATE': {'Total': 125, 'Agreed': 122, 'agreement_per': 0.98},
  'EVENT': {'Total': 0, 'Agreed': 0, 'agreement_per': 'Not applicable'},
  'FAC': {'Total': 2, 'Agreed': 2, 'agreement_per': 1.0},
  'GPE': {'Total': 66, 'Agreed': 42, 'agreement_per': 0.64},
  'LANGUAGE': {'Total': 0, 'Agreed': 0, 'agreement_per': 'Not applicable'},
  'LAW': {'Total': 0, 'Agreed': 0, 'agreement_per': 'Not applicable'},
  'LOC': {'Total': 1, 'Agreed': 0, 'agreement_per': 0.0},
  'MONEY': {'Total': 2, 'Agreed': 2, 'agreement_per': 1.0},
  'NORP': {'Total': 26, 'Agreed': 26, 'agreement_per': 1.0},
  'ORDINAL': {'Total': 10, 'Agreed': 10, 'agreement_per': 1.0},
  'ORG': {'Total': 84, 'Agreed': 62, 'agreement_per': 0.74},
  'PERCENT': {'Total': 0, 'Agreed': 0, 'agr

In [None]:
# getting the revelant info we will analyse
keys_aggregate = ["percent_partial_or_total_agreement_span", "total_agreement_per", "partial_agreement_per", "total_ner_aggregate"]
results_aggregate_clean = []
for result in results:
    dict_result = {k:v for k,v in result.items() if k in keys_aggregate}
    for k, val in result["ner_breakdown"].items():
        dict_result[k] = result["ner_breakdown"][k]["agreement_per"]
    results_aggregate_clean.append(dict_result)

In [None]:
# results_aggregate_clean

In [None]:
df_results = pd.DataFrame(results_aggregate_clean)

In [None]:
# saving the df to analyse it in the next notebook
df_results.to_csv(os.path.join(data_dir, "results_ner_comparison.csv"), index=False)