In [1]:
import os 
import pandas as pd 
import numpy as np
import json
import re

In [2]:
par_dir = os.path.dirname(os.getcwd())
data_dir = os.path.join(par_dir, '2.2 NER - analysis by entity type')

stanza_dir = os.path.join(data_dir, "df_entities_spacy_processed.csv")
spacy_dir = os.path.join(data_dir, "df_entities_stanza_processed.csv")

entities_stanza = pd.read_csv(stanza_dir)
entities_spacy = pd.read_csv(spacy_dir)

In [3]:
list_people = list(set(list(np.unique(entities_spacy["subject"])) + list(np.unique(entities_stanza["subject"]))))
list_dfs = [entities_spacy, entities_stanza]

### Functions

In [4]:
def make_regex(entity):
    entity_extra = entity.split(" ")
    entity_new = "_?".join(entity_extra) # add optional _ since the entities usually appear that way in the graph
    
    return entity_new

In [5]:
def verify_kg_person_package(person, df): 
    
    # filtering by the subject of the bio
    df_filter = df[df["subject"] == person]
    
    # getting the category to know which Data Directory subfolder we have to access
    category = df_filter["category"].iloc[0]

    # getting the entities found in the bios p
    set_entities_df = set(df_filter["text"].to_list())

    
    # total amount of unique entities
    total_entities = len(set_entities_df)

    # loading the json graphs to query them
    par_dir = os.path.dirname(os.path.dirname(os.getcwd()))
    name_file_graph = person.replace(" ", "__") + ".json"
    data_dir = os.path.join(par_dir, "Data Directory")
    
    if category == "Chemistry":
        graph_dir = os.path.join(data_dir, "chemistry_nobel_laureate")
    else:
        graph_dir = os.path.join(data_dir, "physics_nobel_laureate")  
    person_graph_path = os.path.join(graph_dir, name_file_graph)
    with open(person_graph_path, "r") as f:
        personal_graph = json.load(f)

    list_subjects = [dict_graph["subject"]["value"].strip("http://dbpedia.org/resource/").rstrip() for dict_graph in personal_graph]
    list_objects = [dict_graph["object"]["value"].strip("http://dbpedia.org/resource/").rstrip() for dict_graph in personal_graph]
    
    list_entities_graph = list(set(list_subjects + list_objects))
    
    regex_entities = [ent for ent in set_entities_df if ent.count("(") == ent.count(")")]
    
    found_matches = 0
    
    # looking for the regexed entities in the graph
    for ent in list_entities_graph:
        match_search = []
        for regex in regex_entities:
            reg = make_regex(regex)
            try:
                # print(regex, ent)
                match = re.search(reg, ent, flags=re.IGNORECASE)
                match_search.append(match)
            except:
                try:
                    ent_clean = str(ent).replace("[", "").replace("]", "").replace(":", "").replace("+", "").replace("\(", "").replace("\)", "").replace("\\", "").replace("\(", "").replace("\)", "")
                    regex_clean = str(reg).replace("[", "").replace("]", "").replace(":", "").replace("+", "").replace("\(", "").replace("\)", "").replace("\\", "").replace("\(", "").replace("\)", "")
                    match = re.search(regex_clean, ent_clean, flags=re.IGNORECASE)
                    match_search.append(match)
                except:
                    continue
            
        match = [(search.group(0), search) for search in match_search if search != None]
        if match:
            found_matches = found_matches + 1

    prop_matches = round((found_matches/total_entities), 2)
    return prop_matches

  ent_clean = str(ent).replace("[", "").replace("]", "").replace(":", "").replace("+", "").replace("\(", "").replace("\)", "").replace("\\", "").replace("\(", "").replace("\)", "")
  ent_clean = str(ent).replace("[", "").replace("]", "").replace(":", "").replace("+", "").replace("\(", "").replace("\)", "").replace("\\", "").replace("\(", "").replace("\)", "")
  ent_clean = str(ent).replace("[", "").replace("]", "").replace(":", "").replace("+", "").replace("\(", "").replace("\)", "").replace("\\", "").replace("\(", "").replace("\)", "")
  ent_clean = str(ent).replace("[", "").replace("]", "").replace(":", "").replace("+", "").replace("\(", "").replace("\)", "").replace("\\", "").replace("\(", "").replace("\)", "")
  regex_clean = str(reg).replace("[", "").replace("]", "").replace(":", "").replace("+", "").replace("\(", "").replace("\)", "").replace("\\", "").replace("\(", "").replace("\)", "")
  regex_clean = str(reg).replace("[", "").replace("]", "").replace(":", "").replace("+", "").

### Results for SpaCy

In [6]:
results_spacy = [verify_kg_person_package(person, entities_spacy) for person in list_people]

In [7]:
# we can confidently say that around 9% of the entities detected by spaCy where present in the KG
proportion_spacy = np.sum(results_spacy)/len(results_spacy)
proportion_spacy

0.09375

### Results for Stanza

In [8]:
results_stanza = [verify_kg_person_package(person, entities_stanza) for person in list_people]

In [9]:
# we can confidently say that around 9% of the entities detected by Stanza where present in the KG
proportion_stanza = np.sum(results_stanza)/len(results_stanza)
proportion_stanza

0.1052

In [10]:
# for person in list_people:
    
#     # filtering each df by the subject of the bio
#     filter_df_spacy = entities_spacy[entities_spacy["subject"] == person]
#     filter_df_stanza = entities_stanza[entities_stanza["subject"] == person]
    
#     # getting the category to know which Data Directory subfolder we have to access
#     category = filter_df_spacy["category"].iloc[0]
    
#     # print(category)
#     # getting the entities found in the bios per package
#     set_entities_spacy = set(filter_df_spacy["text"].to_list())
#     set_entities_stanza = set(filter_df_stanza["text"].to_list())
    
#     # total amount of unique entities
#     total_entities_spacy = len(set_entities_spacy)
#     total_entities_stanza = len(set_entities_stanza)
    
#     found_entities_spacy = 0
#     found_entities_stanza = 0   
    
#     # loading the json graphs to query them
#     par_dir = os.path.dirname(os.path.dirname(os.getcwd()))
#     name_file_graph = person.replace(" ", "__") + ".json"
#     data_dir = os.path.join(par_dir, "Data Directory")
    
#     if category == "Chemistry":
#         graph_dir = os.path.join(data_dir, "chemistry_nobel_laureate")
    
#     else:
#         graph_dir = os.path.join(data_dir, "physics_nobel_laureate")
        
#     person_graph_path = os.path.join(graph_dir, name_file_graph)
#     # print(person_graph_path)
    
#     with open(person_graph_path, "r") as f:
#         personal_graph = json.load(f)
#     # if f:
#     #     print("Graph load successfully!")
    
#     list_subjects = [dict_graph["subject"]["value"].strip("http://dbpedia.org/resource/").rstrip() for dict_graph in personal_graph]
#     list_objects = [dict_graph["object"]["value"].strip("http://dbpedia.org/resource/").rstrip() for dict_graph in personal_graph]
    
#     list_entities_graph = list(set(list_subjects + list_objects))
    
#     # print(list_entities_graph)
    
#     regex_entities_stanza = [ent for ent in set_entities_stanza if ent.count("(") == ent.count(")")]
#     regex_entities_spacy = [ent for ent in set_entities_spacy if ent.count("(") == ent.count(")")]
    
#     matches_spacy = 0
    
#     for ent in list_entities_graph:
#         # print(ent)
#         # print(regex_entities_spacy)
        
#         match_search = []
#         for regex in regex_entities_spacy:
#             try:
#                 # print(regex, ent)
#                 match = re.search(regex, ent, flags=re.IGNORECASE)
#                 match_search.append(match)
#             except:
#                 ent_clean = str(ent).replace("[", "").replace("]", "").replace(":", "").replace("+", "")
#                 regex_clean = str(regex).replace("[", "").replace("]", "").replace(":", "").replace("+", "")
#                 match = re.search(regex_clean, ent_clean, flags=re.IGNORECASE)
#                 match_search.append(match)
        
#         match = [(search.group(0), search) for search in match_search if search != None]
#         # print(ent)
#         # print(match)
#         if match:
#             matches_spacy = matches_spacy + 1
        
#         # print(matches_spacy)
#     prop_matches = round((matches_spacy/total_entities_spacy), 2)
#     print(prop_matches)