In [4]:
# file reading/writing 
import storage 
import csv

# analysis 
import networkx as nx
from haversine import haversine

# standard plotting 
import seaborn 
import matplotlib.pyplot as plt

# mapping 
import folium
from IPython.display import display, IFrame
from folium.plugins import HeatMap
from folium.plugins import MarkerCluster
from shapely.geometry import shape, Point


# standard utility
import numpy as np
import random
import json
import pandas as pd

# text processing
import spacy


In [5]:
departments = storage.retrieve_all_departments()
article_dept_links = storage.get_article_department_links()
print(departments[0:5])


[(1, 'Department of Medicine, David Geffen School of Medicine, University of California, Los Angeles, CA, USA. ', 34.054077, -118.24168), (2, 'Department of Pediatrics, Division of Endocrinology & Diabetes, Department of Genetics, Stanford Diabetes Research Center, Stanford University School of Medicine, Stanford, CA, USA.', 57.45865, -6.30317), (3, 'Department of Pediatrics, Indiana University School of Medicine, Indianapolis, IN, USA.', 39.77458, -86.176796), (4, 'Division of Endocrinology, Diabetes and Metabolism, The Ohio State University College of Medicine, Columbus, OH, USA.', 39.999941, -83.008032), (5, 'Department of Metabolism, Digestion and Reproduction, Imperial College London, and Imperial College NHS Trust, London, UK.', 51.497978, -0.176781)]


In [6]:
print(len(departments))

1816


Save this to a JSONL for annotation

In [7]:
all_dep_list = []
for i in departments:
    all_dep_list.append({"listed": i[1], "annotated_name":""})
with open('dep_for_annotation.jsonl', 'w') as file:
    json.dump(all_dep_list, file)
    

Load annotated university names

In [19]:
from io import StringIO
with open('annotated_university_names.txt') as f:
    annotated_names = json.load(f)
    annotated_names_dataframe = pd.json_normalize(annotated_names)
with open('annotated_university_2.txt') as f:
    annotated_names = json.load(f)
    annotated_names_dataframe_2 = pd.json_normalize(annotated_names)
with open('annotated_university_3.txt') as f:
    annotated_names = json.load(f)
    annotated_names_dataframe_3 = pd.json_normalize(annotated_names)
annotated_names_dataframe.head()
annotated_names_dataframe = pd.concat([annotated_names_dataframe, annotated_names_dataframe_2, annotated_names_dataframe_3])

annotated_names_dataframe.replace({'annotated_name': ''}, np.nan, inplace=True)
annotated_names_dataframe = annotated_names_dataframe.dropna()
annotated_names_dataframe.head()


Unnamed: 0,listed,annotated_name
0,"Department of Neurosurgery, Stanford Universit...",Stanford University
1,"Department of Chemistry, Stanford University, ...",Stanford University
2,"Department of Chemistry, Stanford University, ...",Stanford University
3,"Department of Neurosurgery, Stanford Universit...",Stanford University
4,"Department of Neurosurgery, Stanford Universit...",Stanford University


In [20]:
print(annotated_names_dataframe.describe())

                                                   listed  \
count                                                1018   
unique                                                617   
top     Department of Immunology, University of Pittsb...   
freq                                                   34   

                  annotated_name  
count                       1018  
unique                       401  
top     University of Pittsburgh  
freq                          50  


## Loading list of universities 

Utilizing [this public domain Kaggle dataset by *The Devastator*](https://www.kaggle.com/datasets/thedevastator/all-universities-in-the-world?resource=download) the institutions available in our scraped dataset can be resolved to a known and fixed university.

In [21]:
countries_df = pd.read_csv("world-universities.csv", header=None)
countries_df.columns = ['countrycode', 'name', 'website']  
countries_df.head()

Unnamed: 0,countrycode,name,website
0,AD,University of Andorra,http://www.uda.ad/
1,AE,Abu Dhabi University,http://www.adu.ac.ae/
2,AE,Ajman University of Science & Technology,http://www.ajman.ac.ae/
3,AE,Alain University of Science and Technology,http://www.alainuniversity.ac.ae/
4,AE,Al Ghurair University,http://www.agu.ae/


It's best to rename our columns to something more descriptive.

In [22]:
countries_df.describe()

Unnamed: 0,countrycode,name,website
count,9358,9363,9363
unique,203,9276,9286
top,US,Arab Open University,http://www.aku.edu/
freq,2074,6,5


We would like to disambiguate the mentions of universities to a unique reference point to ensure our graph has the correct number of nodes. 
As such, we will make use of the functionalities of the [spAcy EntityLinker](https://spacy.io/api/entitylinker).

First, load a spacy model

In [23]:
nlp = spacy.load("en_core_web_sm")

Next, we create a Spacy knowledge base. This will serve as a repository of entities which may be resolved to.

In [24]:
from spacy.kb import InMemoryLookupKB
kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=96)


In [25]:
def clean_annotated_tag(tag):
    tag = tag.replace(".", "")
    return(tag)

In [26]:
#addToKB args: pipeline, knowledgebase, ID, name
dict_strict_name = {}
def addToKB(nlp, kb, real_name, fuzzed):
    
    real_name = clean_annotated_tag(real_name)
    print(real_name)
    if(real_name is not pd.NA):
        document_name = nlp(real_name)
        name_vector = document_name.vector
        if(len(name_vector)==96):
            if(str(real_name) not in kb.get_entity_strings()):
                #add entity
                kb.add_entity(entity=real_name, entity_vector=name_vector, freq=666)
                if(fuzzed not in kb.get_alias_strings()):
                    #add alias which ensures a resolution to our entity
                    kb.add_alias(alias=fuzzed, entities=[real_name], probabilities=[1.0])
                    kb.add_alias(alias=real_name, entities=[real_name], probabilities=[1.0])
    return(kb)

In [27]:
# this could be rewritten using df.apply 
for row, col in annotated_names_dataframe.iterrows():
    kb = addToKB(nlp, kb, col['annotated_name'],col['listed'])

Stanford University
Stanford University
Stanford University
Stanford University
Stanford University
Stanford University
Stanford University
Stanford University
Chan Zuckerberg Biohub
Stanford University
Stanford University
University Hospital, Bern, Switzerland
Emory University
Stanford University
Stanford University
Stanford University
Stanford University
Stanford University
Chan Zuckerberg Biohub
Chan Zuckerberg Biohub
Chan Zuckerberg Biohub
Oregon Health and Science University
Stanford University
Stanford University
Stanford University
Stanford University
Stanford University
University of Melbourne
St Jude Children's Research Hospital
University of Melbourne
La Trobe University
St Jude Children's Research Hospital
Fudan University
Fudan University
Harvard Medical School
Harvard Medical School
University of Melbourne
University of Melbourne
University of Melbourne
University of Melbourne
University of Melbourne
University of Melbourne
University of Melbourne
University of Melbourne
U

  kb.add_alias(alias=real_name, entities=[real_name], probabilities=[1.0])
  kb.add_alias(alias=real_name, entities=[real_name], probabilities=[1.0])
  kb.add_alias(alias=real_name, entities=[real_name], probabilities=[1.0])


DanaFarber Cancer Institute
Harvard Medical School
DanaFarber Cancer Institute
Sloan Kettering Institute
Sloan Kettering Institute
National Institute of Chemistry, Hajdrihova 19, 1001 Ljubljana, Slovenia
National Institute of Chemistry, Hajdrihova 19, 1001 Ljubljana, Slovenia
Sloan Kettering Institute
Max Planck Institute
Max Planck Institute
Max Planck Institute
Max Planck Institute
Universitätsklinikum Münster
Fraunhofer Institute
Max Planck Institute
European Molecular Biology Laboratory
Kanazawa University
University of Münster
Max Planck Institute
University of Hohenheim
Leuven Cancer Institute
Juntendo University Graduate School of Medicine
University of Freiburg
Max Planck Institute
European Molecular Biology Laboratory
Max Planck Institute
Max Planck Institute
Fraunhofer Institute
Fraunhofer Institute
Fraunhofer Institute
Universitätsklinikum Münster
Rostock University
Max Planck Institute
Max Planck Institute
Harvard Medical School
Peking University
Columbia University
Harvard

  kb.add_alias(alias=real_name, entities=[real_name], probabilities=[1.0])
  kb.add_alias(alias=real_name, entities=[real_name], probabilities=[1.0])
  kb.add_alias(alias=real_name, entities=[real_name], probabilities=[1.0])


Action for Health Initiatives, Quezon City, Philippines
Superhumans Center, Kyiv, Ukraine
Thorne Harbour Health, Melbourne, VIC, Australia
AIDS Healthcare Foundation, Miami, FL, USA
San Francisco Community Health Center, San Francisco, CA, USA
Young Positive Women Voices, Nairobi, Kenya
SACEMA, Geneva, Switzerland
University of the Witwatersrand
Massachusetts General Hospital
Medicines Patent Pool, Geneva, Switzerland
Geneva University
University of the Witwatersrand
Fred Hutchinson Cancer Center, Seattle, WA 98109, USA
French Embassy, French Ministry of Foreign Affairs, Antananarivo 101, Madagascar
University of Antananarivo
University of Antananarivo
National AIDS Committee, Antananarivo, Madagascar
Ministry of Public Health, Antananarivo, Madagascar
French Ministry of Foreign Affairs
University of North Carolina at Chapel Hill
Technical University of Munich
Kaiser Permanente, San Francisco
Kaiser Permanente, San Francisco
University of Miami
Ruane Clinical Research, Los Angeles
Univ

  kb.add_alias(alias=real_name, entities=[real_name], probabilities=[1.0])
  kb.add_alias(alias=real_name, entities=[real_name], probabilities=[1.0])


Huazhong University of Science and Technology
Huazhong University of Science and Technology
Huazhong University of Science and Technology
Precision Scientific (Beijing) Co, Ltd, Beijing 100085, China
Huazhong University of Science and Technology
University of Michigan
University of Michigan
University of Michigan
Medical University of Lublin
University of Michigan
University of Michigan
University of Michigan
University of Michigan
Yale University
Guangzhou Laboratory
University of Washington
Vir Biotechnology, San Francisco, CA 94158, USA
Charité Universitätsmedizin Berlin
University Hospital Bonn
Charité  Universitätsmedizin Berlin
Charité  Universitätsmedizin Berlin
Goethe Universität Frankfurt am Main
Université de Paris Cité
Université de Paris Cité
Université de Paris Cité
Université de Paris Cité
University of California, Berkeley
University of California, San Francisco
Baylor College
Academy of Sciences of Sakha Republic
Russian Academy of Sciences
Boston Children's Hospital
Na

In [28]:
#print(f"Entities in the KB: {kb.get_entity_strings()}")
print(f"Aliases in the KB: {kb.get_alias_strings()}")

Aliases in the KB: ['Department of Anatomy and Cell Biology, Kyoto University, Kyoto 6068501, Japan.', 'Federal University of Ouro Preto', 'University of Sciences Techniques and Technologies of Bamako', 'Royal Sussex County Hospital', 'East China Normal University', 'Vir Biotechnology, San Francisco, CA 94158, USA', 'University of South Florida', 'Department of Rheumatology and Immunology, The First Medical Center, Chinese PLA General Hospital, Beijing 100853, China.', 'Medical University of Lublin', 'University of Bern', 'Universidad Complutense Madrid', 'Department of Nutrition and Public Health, Faculty of Health and Sport Science, University of Agder, Universitetsveien 25, Kristiansand, Agder4630, Norway.', 'Chulalongkorn University', 'Perinatal HIV Research Unit, Chris Hani Baragwanath Hospital, Soweto, South Africa.', 'Department of Prevention, Care and Treatment, Infectious Diseases Institute, Kampala, Uganda', "British Columbia Children's Hospital, Vancouver, BC, Canada.", 'Pop

In [29]:

print(f"Entities in KB: {kb.get_entity_strings()}")

Entities in KB: ['University of the West Indies', 'University of Extremadura', 'Sciensano, Ixelles, Belgium', 'Zhejiang University School', 'Federal University of Ouro Preto', 'University of Sciences Techniques and Technologies of Bamako', 'Royal Sussex County Hospital', 'East China Normal University', 'University of Illinois at Urbana-Champaign', 'Vir Biotechnology, San Francisco, CA 94158, USA', 'University of South Florida', 'Johns Hopkins Bloomberg School of Public Health', 'Medical University of Lublin', 'Ruane Clinical Research, Los Angeles', 'University of Bern', 'Université Laval', 'Deakin University', 'Universidad Complutense Madrid', 'Bordeaux University Hospital ', 'Chulalongkorn University', 'Ministry of Public Health, Antananarivo, Madagascar', 'Tampere University', 'University of Glasgow', 'University of KwaZuluNatal', 'MRC Laboratory of Medical Sciences', 'SACEMA, Geneva, Switzerland', 'University of Minnesota', 'National Cancer Center Research Institute, Tokyo, Japan', 

In [30]:
print(kb.get_alias_candidates("UM School of Medicine and Health, Department of Clinical Medicine  Clinical Department for Internal Medicine II, University Medical Centre, Technical University of Munich, Munich, Germany."))

[]


In [31]:
candidates = kb.get_candidates(nlp("TUM School of Medicine and Health, Department of Clinical Medicine  Clinical Department for Internal Medicine II, University Medical Centre, Technical University of Munich, Munich, Germany."))
print(candidates)
for c in candidates:
    print(" ", c.entity_)


[<spacy.kb.candidate.Candidate object at 0x71cb50502b00>]
  Technical University of Munich


In [32]:
print(article_dept_links[0:5])

[(1, 2, 1, 'Cell'), (1, 3, 1, 'Cell'), (1, 4, 1, 'Cell'), (1, 5, 1, 'Cell'), (1, 6, 1, 'Cell')]
