In [134]:
# !pip install spacy pandas
# !python -m spacy download en_core_web_sm

In [135]:
import spacy
import pandas as pd


Load models

In [136]:
# Load spaCy's small English pipeline once; extract_ent() reuses this global for every text.
nlp = spacy.load("en_core_web_sm")

Named Entity Recognition

In [137]:
def extract_ent(text):
  """Run the spaCy pipeline on ``text`` and return its named entities.

  Parameters
  ----------
  text : str
    Raw text to analyse with the module-level ``nlp`` pipeline.

  Returns
  -------
  pandas.DataFrame
    One row per entity with the columns ``text``, ``n_word``, ``start_pos``,
    ``end_pos`` and ``label``. The columns are present even when no entity
    is found, so callers can always select them.
  """
  columns = ["text", "n_word", "start_pos", "end_pos", "label"]
  doc = nlp(text)
  rows = [
    {
      "text": ent.text,
      # split() without arguments collapses runs of whitespace (double spaces,
      # newlines), whereas split(" ") would count the empty strings between them.
      "n_word": len(ent.text.split()),
      "start_pos": ent.start_char,
      "end_pos": ent.end_char,
      "label": ent.label_,
    }
    for ent in doc.ents
  ]
  return pd.DataFrame(rows, columns=columns)


# Load the laureate table; each row provides a biography plus the name and
# prize category that get attached to every entity found in that biography.
df = pd.read_csv('physics_and_chemistry_nobel_laureate.csv')

# Collect one entity frame per biography and concatenate once at the end:
# calling pd.concat inside the loop re-copies all accumulated rows on every
# iteration, which is quadratic in the number of entities.
entity_frames = []
for index, row in df.iterrows():
  df_ents = extract_ent(row['biography'])
  df_ents['subject'] = row["name"]
  df_ents['source_index'] = index  # row label in df; later cells use it to map entities back to a subject
  df_ents["category"] = row['category']
  entity_frames.append(df_ents)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when the CSV has no rows.
df_entities_spacy = (
  pd.concat(entity_frames, ignore_index=True) if entity_frames else pd.DataFrame()
)

In [None]:
df_entities_spacy.head()

Unnamed: 0,text,n_word,start_pos,end_pos,label,subject,source_index,category
0,Wilhelm Conrad Röntgen,3,0,22,PERSON,Wilhelm Röntgen,0,Physics
1,German,1,26,32,NORP,Wilhelm Röntgen,0,Physics
2,27 March 1845,3,70,83,DATE,Wilhelm Röntgen,0,Physics
3,10 February 1923,3,86,102,DATE,Wilhelm Röntgen,0,Physics
4,German,1,110,116,NORP,Wilhelm Röntgen,0,Physics


## Statistics for Spacy

In [None]:
# def computeStats(df_ents):

#     stats_count = df_ents['label'].value_counts().describe()

#     df_ents['length'] = df_ents['text'].apply(len)
#     stats_length = df_ents.groupby('label')['length'].describe()

#     return stats_count, stats_length



## Function to compare 2 entity dataframes

In [None]:
import numpy as np

In [None]:
# Entity table loaded from CSV; the file name suggests it holds Stanza's NER output — TODO confirm.
df_entities_stanza = pd.read_csv("df_entities_stanza.csv")

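HACK: rebuild the `subject` column of the Stanza entities from `source_index`, using the subject names found in the spaCy entities.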


In [None]:
# Placeholder: copy source_index into subject; a later cell overwrites this
# column with the subject names looked up in translation_dict.
df_entities_stanza["subject"] = df_entities_stanza["source_index"]

In [None]:
# Keep only the two columns that link a subject name to its source_index.
df_no_duplicates = df_entities_spacy[["subject", "source_index"]]

In [None]:
# Keep a single (subject, source_index) row per distinct pair and renumber the rows from 0.
df_no_duplicates = df_no_duplicates.drop_duplicates().reset_index(drop=True)

In [None]:
df_no_duplicates

Unnamed: 0,subject,source_index
0,Wilhelm Röntgen,0
1,Hendrik Lorentz,1
2,Pieter Zeeman,2
3,Henri Becquerel,3
4,Pierre Curie,4
...,...,...
195,Aaron Klug,195
196,Henry Taube,196
197,Robert Bruce Merrifield,197
198,Herbert A. Hauptman,198


In [None]:
# Parallel lists built from the de-duplicated frame: list_subjects[i] is the
# subject whose source_index is list_indexes[i].
list_subjects = df_no_duplicates["subject"].to_list()
list_indexes = df_no_duplicates["source_index"].to_list()

In [None]:
# Build the lookup lists from the de-duplicated frame (one row per document),
# not from df_entities_spacy (one row per entity). With the per-entity lists,
# list_subjects[index] returns the subject of the index-th *entity row*, which
# is the wrong subject for almost every source_index.
list_subjects = df_no_duplicates["subject"].to_list()
list_indexes = df_no_duplicates["source_index"].to_list()

In [None]:
# Map each source_index to its subject name. The pairs come straight from
# df_no_duplicates (one row per document) and are zipped together, so the
# lookup does not rely on a list position happening to equal the source_index.
translation_dict = dict(
    zip(
        df_no_duplicates["source_index"].to_list(),
        df_no_duplicates["subject"].to_list(),
    )
)

In [None]:
# Replace each source_index with its subject name from translation_dict;
# values without an entry in the dict are left unchanged by Series.replace.
df_entities_stanza["subject"] = df_entities_stanza["source_index"].replace(translation_dict)

In [None]:
# df_entities_stanza

## Comparing the results

In [None]:
# Distinct subject names present in the spaCy results (np.unique returns them sorted).
list_people = np.unique(df_entities_spacy["subject"])

In [None]:
# Compare, subject by subject, the entities found by spaCy and by Stanza.
# Every subject contributes one dict to aggregate_results with the keys
# "subject", "agreement_entities", "diff_spacy_entities",
# "diff_stanza_entities" and "agreement_labels".
aggregate_results = []

for people in list_people:

    results = {}
    results["subject"] = people

    filter_df_spacy = df_entities_spacy[df_entities_spacy["subject"] == people][["text", "label", "start_pos", "end_pos"]]
    filter_df_stanza = df_entities_stanza[df_entities_stanza["subject"] == people][["text", "label", "start_pos", "end_pos"]]

    entities_spacy = filter_df_spacy["text"].to_list()
    entities_stanza = filter_df_stanza["text"].to_list()

    labels_spacy = filter_df_spacy["label"].to_list()
    labels_stanza = filter_df_stanza["label"].to_list()

    # Pair each entity text with the label found on the same row. A nested
    # comprehension over both lists is a cartesian product and would give
    # every entity the last label of the list. When a text occurs several
    # times, the label of its last occurrence is kept.
    dict_label_spacy = dict(zip(entities_spacy, labels_spacy))
    dict_label_stanza = dict(zip(entities_stanza, labels_stanza))

    # Entity agreements: exact text match, every mention counted separately.
    # Membership is tested against the dict keys (hashed) instead of scanning
    # the other list for each entity.
    common_entities = [ent for ent in entities_spacy if ent in dict_label_stanza]

    spacy_diff = [ent for ent in entities_spacy if ent not in dict_label_stanza]
    stanza_diff = [ent for ent in entities_stanza if ent not in dict_label_spacy]

    # Non-zero as long as the subject has at least one spaCy entity, which holds
    # because list_people is derived from the subjects in df_entities_spacy.
    total_num_entities = len(common_entities) + len(spacy_diff) + len(stanza_diff)
    diff_spacy = len(spacy_diff)/total_num_entities
    diff_stanza = len(stanza_diff)/total_num_entities

    # Only used by the (currently disabled) partial span agreement below.
    total_num_disagreements = len(spacy_diff) + len(stanza_diff)

    agreement_entities = len(common_entities)/total_num_entities

    results["agreement_entities"] = agreement_entities
    results["diff_spacy_entities"] = diff_spacy
    results["diff_stanza_entities"] = diff_stanza

    #Partial Span agreement

    if agreement_entities != 1.0: # if the agreement is not total - avoid zero division

        print("skip for now")

        # Issue with getting the spans for the entities:
        # sometimes the entity appears more than once so we can't get the index by filtering like filter[filter["text"] = ent] 
        # since it will return more than one value
        # ideally, a way to get each entity with its index to do filter[index][start] and filter[index][stop] instead

        # spacy_spans = []
        # for entity in spacy_diff:
        #     start = filter_df_spacy["start_pos"]
        #     stop = filter_df_spacy["end_pos"]
        #     print(start, stop)
        #     entity_spans = np.arange(start, stop + 1, 1) # CHECK IF THE STOP SPAN IN SPACY IS INCLUDING OR EXCLUDING
        #     spacy_spans.append(entity_spans)

        # stanza_spans = []
        # for entity in stanza_diff:
        #     start = filter_df_stanza["start_pos"]
        #     stop = filter_df_stanza["end_pos"]
        #     print(start, stop)
        #     entity_spans = np.arange(start, stop + 1, 1) # CHECK IF THE STOP SPAN IN STANZA IS INCLUDING OR EXCLUDING
        #     stanza_spans.append(entity_spans)

        # common_entity_spans = []
        # for entity_span_spacy in spacy_spans:
        #     for entity_span_stanza in stanza_spans:
        #         common_span = [index for index in entity_span_spacy if index in entity_span_stanza]
        #         if len(common_span) > 1:
        #             n_entity_spacy = spacy_spans.index(entity_span_spacy)
        #             n_entity_stanza = stanza_spans.index(entity_span_stanza)
        #             tuple_indexes = (n_entity_spacy, n_entity_stanza)
        #             common_entity_spans.append(tuple_indexes)

        # common_entity_spans = set(common_entity_spans)
        # partial_agreement_ratio = len(common_entity_spans)/total_num_disagreements
        # results["partial_agreements_over_total_disagreements"] = partial_agreement_ratio

        # total_partial_agreements = (len(common_entity_spans) + len(common_entities))/total_num_entities
        # results["partial_agreements"] = total_partial_agreements

    # Label agreements: among the entities both tools found, how often the labels match.
    common_labels = [(ent, dict_label_spacy[ent]) for ent in common_entities if dict_label_spacy[ent] == dict_label_stanza[ent]]
    spacy_label_diff = [(ent, dict_label_spacy[ent]) for ent in common_entities if dict_label_spacy[ent] != dict_label_stanza[ent]]
    stanza_label_diff =  [(ent, dict_label_stanza[ent]) for ent in common_entities if dict_label_spacy[ent] != dict_label_stanza[ent]]

    # A non-empty common_labels implies a non-empty common_entities, so the
    # division is safe; with no matching labels the ratio is reported as 0.
    if len(common_labels) > 0:
        agreement_labels = len(common_labels)/len(common_entities)
    else:
        agreement_labels = 0

    results["agreement_labels"] = agreement_labels

    # NOTE: this verdict is computed but not stored in results yet.
    if len(common_entities) == len(entities_stanza):
        agreement = "Total Agreement"
    else:
        agreement = "Partial Agreement"

    aggregate_results.append(results)

    ############# LEFT TO DO: COMPUTE STATS OF ACCURACY ETC


IndentationError: expected an indented block after 'if' statement on line 40 (188539401.py, line 82)

In [None]:
aggregate_results

[]

INTER ANNOTATOR AGREEMENT BETWEEN THE TWO PACKAGES