In [1]:
# file reading/writing 
import storage 
import csv

# analysis 
import networkx as nx
from haversine import haversine

# standard plotting 
import seaborn 
import matplotlib.pyplot as plt

# mapping 
import folium
from IPython.display import display, IFrame
from folium.plugins import HeatMap
from folium.plugins import MarkerCluster
from shapely.geometry import shape, Point


# standard utility
import numpy as np
import random
import json
import pandas as pd

# text processing
import spacy


In [36]:

departments = storage.retrieve_all_departments()
article_dept_links = storage.get_article_department_links()
print(departments[0:5])

[(1, 'Department of Neurosurgery, Stanford University School of Medicine, Stanford, CA 94305, USA.', 37.424669, -122.170275), (2, 'Department of Chemistry, Stanford University, Stanford, CA 94305, USA.', 37.424669, -122.170275), (3, 'Department of Chemistry, Stanford University, Stanford, CA 94305, USA.', 37.424669, -122.170275), (4, 'Department of Neurosurgery, Stanford University School of Medicine, Stanford, CA 94305, USA.', 37.424669, -122.170275), (5, 'Department of Neurosurgery, Stanford University School of Medicine, Stanford, CA 94305, USA.', 37.424669, -122.170275)]


Save this to a JSONL for Prodigy annotation

In [39]:
all_dep_list = []
for i in departments:
    all_dep_list.append({"listed": i[1], "annotated_name":""})
with open('dep_for_annotation.jsonl', 'w') as file:
    json.dump(all_dep_list, file)
    

## Loading list of universities 

Utilizing [this public domain Kaggle dataset by *The Devastator*](https://www.kaggle.com/datasets/thedevastator/all-universities-in-the-world?resource=download) the institutions available in our scraped dataset can be resolved to a known and fixed university.

In [4]:
countries_df = pd.read_csv("world-universities.csv", header=None)
countries_df.head()

Unnamed: 0,0,1,2
0,AD,University of Andorra,http://www.uda.ad/
1,AE,Abu Dhabi University,http://www.adu.ac.ae/
2,AE,Ajman University of Science & Technology,http://www.ajman.ac.ae/
3,AE,Alain University of Science and Technology,http://www.alainuniversity.ac.ae/
4,AE,Al Ghurair University,http://www.agu.ae/


It's best to rename our columns to something more descriptive.

In [5]:
countries_df.columns = ['countrycode', 'name', 'website']  
countries_df.head()

Unnamed: 0,countrycode,name,website
0,AD,University of Andorra,http://www.uda.ad/
1,AE,Abu Dhabi University,http://www.adu.ac.ae/
2,AE,Ajman University of Science & Technology,http://www.ajman.ac.ae/
3,AE,Alain University of Science and Technology,http://www.alainuniversity.ac.ae/
4,AE,Al Ghurair University,http://www.agu.ae/


We would like to disambiguate the mentions of universities to a unique reference point to ensure our graph has the correct number of nodes. 
As such, we will make use of the functionalities of the [spAcy EntityLinker](https://spacy.io/api/entitylinker).

First, load a spacy model

In [6]:
nlp = spacy.load("en_core_web_sm")

Next, we create a Spacy knowledge base. This will serve as a repository of entities which may be resolved to.

In [17]:
from spacy.kb import InMemoryLookupKB
kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=96)

In [18]:
#addToKB args: pipeline, knowledgebase, ID, name
def addToKB(nlp, kb, ID, name):
    document_name = nlp(name)
    name_vector = document_name.vector
    #add entity
    kb.add_entity(entity=str(ID), entity_vector=name_vector, freq=666)
    #add alias which ensures a resolution to our entity
    kb.add_alias(alias=name, entities=[str(ID)], probabilities=[1])
    return(kb)

In [19]:
# this could be rewritten using df.apply 
for row, col in countries_df.iterrows():
    kb = addToKB(nlp, kb, row, col['name'])

  kb.add_alias(alias=name, entities=[str(ID)], probabilities=[1])
  kb.add_alias(alias=name, entities=[str(ID)], probabilities=[1])
  kb.add_alias(alias=name, entities=[str(ID)], probabilities=[1])
  kb.add_alias(alias=name, entities=[str(ID)], probabilities=[1])
  kb.add_alias(alias=name, entities=[str(ID)], probabilities=[1])
  kb.add_alias(alias=name, entities=[str(ID)], probabilities=[1])
  kb.add_alias(alias=name, entities=[str(ID)], probabilities=[1])
  kb.add_alias(alias=name, entities=[str(ID)], probabilities=[1])
  kb.add_alias(alias=name, entities=[str(ID)], probabilities=[1])
  kb.add_alias(alias=name, entities=[str(ID)], probabilities=[1])
  kb.add_alias(alias=name, entities=[str(ID)], probabilities=[1])
  kb.add_alias(alias=name, entities=[str(ID)], probabilities=[1])
  kb.add_alias(alias=name, entities=[str(ID)], probabilities=[1])
  kb.add_alias(alias=name, entities=[str(ID)], probabilities=[1])
  kb.add_alias(alias=name, entities=[str(ID)], probabilities=[1])
  kb.add_a

In [20]:
#print(f"Entities in the KB: {kb.get_entity_strings()}")
print(f"Aliases in the KB: {kb.get_alias_strings()}")

Entities in the KB: ['2224', '4466', '8579', '8707', '4968', '3003', '4171', '4993', '1254', '4506', '7590', '1900', '5415', '6766', '1767', '5245', '8905', '6500', '8010', '7219', '860', '5291', '6423', '7653', '5052', '3244', '6380', '3015', '2859', '1883', '2922', '5706', '2206', '1428', '5927', '4699', '7084', '7666', '1985', '9035', '4787', '1937', '3374', '4298', '6787', '7623', '7407', '9046', '9255', '5185', '529', '3602', '6683', '7032', '9041', '9360', '2698', '7263', '5248', '326', '2960', '3022', '6746', '7335', '8728', '2435', '1348', '4982', '7612', '6068', '5461', '3230', '3699', '6182', '7378', '3401', '7157', '5988', '6276', '4727', '1062', '546', '2538', '4282', '5322', '5414', '4478', '5898', '6273', '2999', '7681', '9144', '8298', '9337', '290', '3879', '1816', '7860', '6445', '7905', '1851', '8997', '7685', '6957', '8797', '4587', '3525', '8008', '8180', '2355', '6147', '7130', '9359', '7832', '799', '6984', '1928', '5669', '4885', '2761', '1417', '7398', '1664', '

In [35]:
print(f"Candidates for Boston University': {[c.alias_ for c in kb.get_alias_candidates('boston university')]}")


Candidates for Boston University': []
