In [1]:
# file reading/writing 
import storage 
import csv

# analysis 
import networkx as nx
from haversine import haversine

# standard plotting 
import seaborn 
import matplotlib.pyplot as plt

# mapping 
import folium
from IPython.display import display, IFrame
from folium.plugins import HeatMap
from folium.plugins import MarkerCluster
from shapely.geometry import shape, Point


# standard utility
import numpy as np
import random
import json
import pandas as pd

# text processing
import spacy


In [2]:
departments = storage.retrieve_all_departments()
article_dept_links = storage.get_article_department_links()
print(departments[0:5])


[(1, 'Department of Medicine, David Geffen School of Medicine, University of California, Los Angeles, CA, USA. ', 34.054077, -118.24168), (2, 'Department of Pediatrics, Division of Endocrinology & Diabetes, Department of Genetics, Stanford Diabetes Research Center, Stanford University School of Medicine, Stanford, CA, USA.', 57.45865, -6.30317), (3, 'Department of Pediatrics, Indiana University School of Medicine, Indianapolis, IN, USA.', 39.77458, -86.176796), (4, 'Division of Endocrinology, Diabetes and Metabolism, The Ohio State University College of Medicine, Columbus, OH, USA.', 39.999941, -83.008032), (5, 'Department of Metabolism, Digestion and Reproduction, Imperial College London, and Imperial College NHS Trust, London, UK.', 51.497978, -0.176781)]


In [3]:
print(len(departments))

1816


Save this to a JSONL for annotation

In [4]:
all_dep_list = []
for i in departments:
    all_dep_list.append({"listed": i[1], "annotated_name":""})
with open('dep_for_annotation.jsonl', 'w') as file:
    json.dump(all_dep_list, file)
    

Load annotated university names

In [7]:
from io import StringIO
with open('annotated_university_names.txt') as f:
    annotated_names = json.load(f)
    annotated_names_dataframe = pd.json_normalize(annotated_names)
with open('annotated_university_2.txt') as f:
    annotated_names = json.load(f)
    annotated_names_dataframe_2 = pd.json_normalize(annotated_names)
with open('annotated_university_3.txt') as f:
    annotated_names = json.load(f)
    annotated_names_dataframe_3 = pd.json_normalize(annotated_names)
annotated_names_dataframe.head()
annotated_names_dataframe = pd.concat([annotated_names_dataframe, annotated_names_dataframe_2, annotated_names_dataframe_3])

annotated_names_dataframe.replace({'annotated_name': ''}, np.nan, inplace=True)
annotated_names_dataframe = annotated_names_dataframe.dropna()
annotated_names_dataframe.head()


Unnamed: 0,listed,annotated_name
0,"Department of Neurosurgery, Stanford Universit...",Stanford University
1,"Department of Chemistry, Stanford University, ...",Stanford University
2,"Department of Chemistry, Stanford University, ...",Stanford University
3,"Department of Neurosurgery, Stanford Universit...",Stanford University
4,"Department of Neurosurgery, Stanford Universit...",Stanford University


In [8]:
print(annotated_names_dataframe.describe())

                                                   listed  \
count                                                1018   
unique                                                617   
top     Department of Immunology, University of Pittsb...   
freq                                                   34   

                  annotated_name  
count                       1018  
unique                       401  
top     University of Pittsburgh  
freq                          50  


## Loading list of universities 

Utilizing [this public domain Kaggle dataset by *The Devastator*](https://www.kaggle.com/datasets/thedevastator/all-universities-in-the-world?resource=download) the institutions available in our scraped dataset can be resolved to a known and fixed university.

In [7]:
countries_df = pd.read_csv("world-universities.csv", header=None)
countries_df.columns = ['countrycode', 'name', 'website']  
countries_df.head()

Unnamed: 0,countrycode,name,website
0,AD,University of Andorra,http://www.uda.ad/
1,AE,Abu Dhabi University,http://www.adu.ac.ae/
2,AE,Ajman University of Science & Technology,http://www.ajman.ac.ae/
3,AE,Alain University of Science and Technology,http://www.alainuniversity.ac.ae/
4,AE,Al Ghurair University,http://www.agu.ae/


It's best to rename our columns to something more descriptive.

In [8]:
countries_df.describe()

Unnamed: 0,countrycode,name,website
count,9358,9363,9363
unique,203,9276,9286
top,US,Arab Open University,http://www.aku.edu/
freq,2074,6,5


We would like to disambiguate the mentions of universities to a unique reference point to ensure our graph has the correct number of nodes. 
As such, we will make use of the functionalities of the [spAcy EntityLinker](https://spacy.io/api/entitylinker).

First, load a spacy model

In [11]:
nlp = spacy.load("en_core_web_sm")

Next, we create a Spacy knowledge base. This will serve as a repository of entities which may be resolved to.

In [25]:
from spacy.kb import InMemoryLookupKB
kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=96)


In [26]:
def clean_annotated_tag(tag):
    tag = tag.replace(".", "")
    return(tag)

In [37]:
#addToKB args: pipeline, knowledgebase, ID, name
dict_strict_name = {}
def addToKB(nlp, kb, real_name, fuzzed):
    
    real_name = clean_annotated_tag(real_name)
    print(real_name)
    if(real_name is not pd.NA):
        document_name = nlp(real_name)
        name_vector = document_name.vector
        if(len(name_vector)==96):
            if(str(real_name) not in kb.get_entity_strings()):
                #add entity
                kb.add_entity(entity=real_name, entity_vector=name_vector, freq=666)
                if(fuzzed not in kb.get_alias_strings()):
                    #add alias which ensures a resolution to our entity
                    kb.add_alias(alias=fuzzed, entities=[real_name], probabilities=[1.0])
                    kb.add_alias(alias=real_name, entities=[real_name], probabilities=[1.0])
    return(kb)

In [38]:
# this could be rewritten using df.apply 
for row, col in annotated_names_dataframe.iterrows():
    kb = addToKB(nlp, kb, col['annotated_name'],col['listed'])

Stanford University
Stanford University
Stanford University
Stanford University
Stanford University
Stanford University
Stanford University
Stanford University
Chan Zuckerberg Biohub
Stanford University
Stanford University
University Hospital, Bern, Switzerland
Emory University
Stanford University
Stanford University
Stanford University
Stanford University
Stanford University
Chan Zuckerberg Biohub
Chan Zuckerberg Biohub
Chan Zuckerberg Biohub
Oregon Health and Science University
Stanford University
Stanford University
Stanford University
Stanford University
Stanford University
University of Melbourne
St Jude Children's Research Hospital
University of Melbourne
La Trobe University
St Jude Children's Research Hospital
Fudan University
Fudan University
Harvard Medical School
Harvard Medical School
University of Melbourne
University of Melbourne
University of Melbourne
University of Melbourne
University of Melbourne
University of Melbourne
University of Melbourne
University of Melbourne
U

In [39]:
#print(f"Entities in the KB: {kb.get_entity_strings()}")
print(f"Aliases in the KB: {kb.get_alias_strings()}")

Aliases in the KB: ['Department of Anatomy and Cell Biology, Kyoto University, Kyoto 6068501, Japan.', 'International Association of Providers of AIDS Care, Washington, DC, USA.', 'Institute of Nutrition Josué de Castro, Federal University of Rio de Janeiro, Rio de Janeiro, Brazil.', 'Zoological Institute of Russian Academy of Sciences, SaintPetersburg, Russia.', 'Department of Rheumatology and Immunology, The First Medical Center, Chinese PLA General Hospital, Beijing 100853, China.', 'Lady Davis Institute, McGill University, Montreal, QC, Canada.', 'Department of Nutrition and Public Health, Faculty of Health and Sport Science, University of Agder, Universitetsveien 25, Kristiansand, Agder4630, Norway.', 'Institute of Medical Biochemistry, Center for Molecular Biology of Inflammation (ZMBE), University of Münster, Münster 48149, Germany.', 'Department of Prevention, Care and Treatment, Infectious Diseases Institute, Kampala, Uganda', 'National AIDS, STIs and Hepatitis Control Program

In [40]:

print(f"Entities in KB: {kb.get_entity_strings()}")

Entities in KB: ['University of the West Indies', 'University of Extremadura', 'Sciensano, Ixelles, Belgium', 'Zhejiang University School', 'Federal University of Ouro Preto', 'University of Sciences Techniques and Technologies of Bamako', 'Royal Sussex County Hospital', 'East China Normal University', 'University of Illinois at Urbana-Champaign', 'Vir Biotechnology, San Francisco, CA 94158, USA', 'University of South Florida', 'Johns Hopkins Bloomberg School of Public Health', 'Medical University of Lublin', 'Ruane Clinical Research, Los Angeles', 'University of Bern', 'Université Laval', 'Deakin University', 'Universidad Complutense Madrid', 'Bordeaux University Hospital ', 'Chulalongkorn University', 'Ministry of Public Health, Antananarivo, Madagascar', 'Tampere University', 'University of Glasgow', 'University of KwaZuluNatal', 'MRC Laboratory of Medical Sciences', 'SACEMA, Geneva, Switzerland', 'University of Minnesota', 'National Cancer Center Research Institute, Tokyo, Japan', 

In [45]:
print(kb.get_alias_candidates("UM School of Medicine and Health, Department of Clinical Medicine  Clinical Department for Internal Medicine II, University Medical Centre, Technical University of Munich, Munich, Germany."))

[]


In [46]:
candidates = kb.get_candidates(nlp("TUM School of Medicine and Health, Department of Clinical Medicine  Clinical Department for Internal Medicine II, University Medical Centre, Technical University of Munich, Munich, Germany."))
print(candidates)
for c in candidates:
    print(" ", c.entity_)


[<spacy.kb.candidate.Candidate object at 0x79c07fffef20>]
  Technical University of Munich
