In [1]:
########## CONFIGURATION ###########
GRAPH_PATH = "../out/graph.ttl"

EXTRA_PATH = "../out/extra/"  # temporary data output

In [2]:
import json
from glob import glob

data_works = []
for filename in glob("..\\data_collection\\2_get_works_output\\*.json"):
    with open(filename, encoding="utf8") as f:
        data_works.extend(json.load(f))

data_works[0]

{'id': 'https://openalex.org/W3176618563',
 'doi': 'https://doi.org/10.18653/v1/2021.acl-demo.17',
 'title': 'REM: Efficient Semi-Automated Real-Time Moderation of Online Forums',
 'display_name': 'REM: Efficient Semi-Automated Real-Time Moderation of Online Forums',
 'publication_year': 2021,
 'publication_date': '2021-08-01',
 'ids': {'openalex': 'https://openalex.org/W3176618563',
  'doi': 'https://doi.org/10.18653/v1/2021.acl-demo.17',
  'mag': '3176618563'},
 'host_venue': {'id': None,
  'issn_l': None,
  'issn': None,
  'display_name': 'meeting of the association for computational linguistics',
  'publisher': 'Association for Computational Linguistics',
  'type': 'publisher',
  'url': 'https://doi.org/10.18653/v1/2021.acl-demo.17',
  'is_oa': True,
  'version': 'publishedVersion',
  'license': 'cc-by'},
 'type': 'proceedings-article',
 'open_access': {'is_oa': True,
  'oa_status': 'hybrid',
  'oa_url': 'https://aclanthology.org/2021.acl-demo.17.pdf'},
 'authorships': [{'author_po

In [3]:
from glob import glob

import pandas as pd

locations = []
venues = []

ven_files = glob("../data_collection/venues/*.json")
loc_files = glob("../data_collection/locations/*.json")

ven_dfs = []
loc_dfs = []
for fn_ven, fn_loc in zip(ven_files, loc_files):
    with open(fn_ven) as f_ven, open(fn_loc) as f_loc:
        loc_dfs.append(pd.read_json(f_loc, orient="records"))
        ven_dfs.append(pd.read_json(f_ven, orient="records"))

ven_df = pd.concat(ven_dfs, axis=0, ignore_index=True)
loc_df = pd.concat(loc_dfs, axis=0, ignore_index=True)
loc_df = loc_df.dropna(subset=["city", "country"])

str_cols = ["title", "type", "venue", "url"]
ven_df[str_cols] = ven_df[str_cols].astype(str)

str_cols = ["city", "country", "venue"]
loc_df[str_cols] = loc_df[str_cols].astype(str)

df = pd.merge(ven_df, loc_df, on=["venue", "year"])
df = df.drop_duplicates()
df.head()

Unnamed: 0,title,type,venue,year,url,city,country
0,Word-Level Uncertainty Estimation for Black-Bo...,Conference and Workshop Papers,COLING,2020,https://dblp.org/db/conf/coling/index.html,Barcelona,Spain
2,The Two Shades of Dubbing in Neural Machine Tr...,Conference and Workshop Papers,COLING,2020,https://dblp.org/db/conf/coling/index.html,Barcelona,Spain
4,Exploring Amharic Sentiment Analysis from Soci...,Conference and Workshop Papers,COLING,2020,https://dblp.org/db/conf/coling/index.html,Barcelona,Spain
10,Towards Visual Data Science - An Exploration,Conference and Workshop Papers,IHIET,2019,https://dblp.org/db/conf/ihiet/index.html,Nice,France
12,Evaluating the Scaling of Graph-Algorithms for...,Conference and Workshop Papers,OBD,2016,https://dblp.org/db/conf/obd/index.html,Vienna,Austria


In [4]:
import os.path
import pickle

coord_fn = EXTRA_PATH + "city_coords.pkl"
if os.path.isfile(coord_fn):
    with open(coord_fn, 'rb') as f:
        city_coords = pickle.load(f)
else:
    city_coords = {}

In [5]:
import requests
from urllib.parse import quote_plus
from graph.oa_graph_json import OpenAlexGraph, _resource_from_uri


SAVE_AFTER_BATCH = 200


def get_dbr_resource(term: str):
    if not isinstance(term, str):
        return
    term = term.replace(' ', '_')
    resource = quote_plus(term)
    return resource


def get_coord_from_dbo(city: str):
    """
    Workaround for service queries in rdflib not working:
    Request node uri and get coordinates.

    :param city: City resource name
    :return: lat, lng
    """
    if not isinstance(city, str):
        return None, None
    if city in city_coords:
        return city_coords[city]
    uri = "http://dbpedia.org/resource/" + city
    resp = requests.get(uri, headers={"accept": "application/json"})
    try:
        node = resp.json()[uri]
        lng = node['http://www.w3.org/2003/01/geo/wgs84_pos#long'][0]["value"]
        lat = node['http://www.w3.org/2003/01/geo/wgs84_pos#lat'][0]["value"]
        city_coords[city] = (lat, lng)
        return lat, lng
    except KeyError:
        return None, None


g = OpenAlexGraph()

total = len(data_works)
for i, item in enumerate(data_works):
    print(
        f"\rAdding work {i}/{total} ({i/total * 100:.2g}%)      ",
        end=""
    )
    host_venue = item["host_venue"]
    host_id = _resource_from_uri(host_venue["id"])

    # Add work
    work_id = _resource_from_uri(item["id"])
    work_name = item["display_name"]
    g.add_work(
        identifier=work_id,
        name=work_name,
        date_published=item["publication_date"],
    )

    for cites_by_year in item["counts_by_year"]:
        g.add_citations_in_year(
            identifier=work_id,
            year=cites_by_year["year"],
            citations=cites_by_year["cited_by_count"]
        )

    # Add authors of paper and is_author relation
    prev_author = None
    for authorship in item["authorships"]:
        author = authorship["author"]
        author_id = _resource_from_uri(author["id"])
        author_name = author["display_name"]

        inst_id = None
        if authorship["institutions"]:
            inst = authorship["institutions"][0]
            inst_id = _resource_from_uri(inst["id"])
            inst_name = inst["display_name"]
            g.add_institution(inst_id, inst_name)
            g.add_associated_with_institution(work_id, inst_id)

        g.add_author(author_id, author_name, inst_id)
        if prev_author:
            g.add_colleague(author_id, prev_author)
        prev_author = author_id
        g.add_is_author(author_id, work_id)

    ven = df[df["title"] == work_name]
    if not ven.empty:
        ven = ven.iloc[0]
        ven_id = _resource_from_uri(ven["url"], res_loc=-2)
        ven_city = get_dbr_resource(ven["city"])
        ven_country = get_dbr_resource(ven["country"])
        lat, lng = get_coord_from_dbo(ven_city)  # workaround for SERVICE not working
        g.add_venue(
            work_id=work_id,
            identifier=ven_id,
            name=ven["venue"],
            year=ven["year"],
            city=ven_city,
            country=ven_country,
            lat=lat,
            lng=lng,

        )

    if i % SAVE_AFTER_BATCH == 0:
        print(f"\nSaving for iteration {i}\n")
        g.serialize(GRAPH_PATH)

g.serialize(GRAPH_PATH)

Adding work 0/4891 (0%)      
Saving for iteration 0

Adding work 200/4891 (4.1%)      
Saving for iteration 200

Adding work 400/4891 (8.2%)      
Saving for iteration 400

Adding work 600/4891 (12%)       
Saving for iteration 600

Adding work 800/4891 (16%)      
Saving for iteration 800

Adding work 1000/4891 (20%)      
Saving for iteration 1000

Adding work 1200/4891 (25%)      
Saving for iteration 1200

Adding work 1400/4891 (29%)      
Saving for iteration 1400

Adding work 1600/4891 (33%)      
Saving for iteration 1600

Adding work 1800/4891 (37%)      
Saving for iteration 1800

Adding work 2000/4891 (41%)      
Saving for iteration 2000

Adding work 2200/4891 (45%)      
Saving for iteration 2200

Adding work 2400/4891 (49%)      
Saving for iteration 2400

Adding work 2600/4891 (53%)      
Saving for iteration 2600

Adding work 2800/4891 (57%)      
Saving for iteration 2800

Adding work 3000/4891 (61%)      
Saving for iteration 3000

Adding work 3200/4891 (65%)      
Sa

<Graph identifier=N5289a4e85e114a349ae6394991c52742 (<class 'graph.oa_graph.OpenAlexGraph'>)>

In [6]:
from graph.oa_request import oa_request


SAVE_AFTER_BATCH = 50


def get_dbr_uri(term: str):
    if not term:
        return
    term = term.replace(' ', '_')
    resource = quote_plus(term)
    return "https://dbpedia.org/resource/" + resource


def get_institutions(g):
    q = """
        SELECT DISTINCT ?inst_id
        WHERE {
            ?inst_id a schema:EducationalOrganization .
            FILTER NOT EXISTS {
              ?inst_id schema:location ?x
            }
        }
    """
    q_res = g.query(q)
    result = []
    for row in q_res:
        result.append(row.inst_id)
    return result


institutions = get_institutions(g)
print(f"adding {len(institutions)} institutions\n")
for i, inst in enumerate(institutions):
    print(f"\rAdding location for institution, no. {i}", end="")
    inst_id = _resource_from_uri(inst)
    if inst_id.startswith("_"):
        continue
    _filter = {"openalex_id": str(inst)}
    loc = oa_request("institutions", _filter)[0]["geo"]
    dbo_country = get_dbr_uri(loc["country"])
    dbo_city = get_dbr_uri(loc["city"])
    g.add_located_at(
        identifier=inst_id,
        country=_resource_from_uri(dbo_country),
        city=_resource_from_uri(dbo_city),
        latitude=loc["latitude"],
        longitude=loc["longitude"]
    )
    if i % SAVE_AFTER_BATCH == 0:
        print(f"\nSaving for iteration {i}\n")
        g.serialize(GRAPH_PATH)

g.serialize(GRAPH_PATH)

adding 1755 institutions

Adding location for institution, no. 0
Saving for iteration 0

Adding location for institution, no. 50
Saving for iteration 50

Adding location for institution, no. 250
Saving for iteration 250

Adding location for institution, no. 300
Saving for iteration 300

Adding location for institution, no. 350
Saving for iteration 350

Adding location for institution, no. 400
Saving for iteration 400

Adding location for institution, no. 650
Saving for iteration 650

Adding location for institution, no. 850
Saving for iteration 850

Adding location for institution, no. 900
Saving for iteration 900

Adding location for institution, no. 950
Saving for iteration 950

Adding location for institution, no. 1050
Saving for iteration 1050

Adding location for institution, no. 1200
Saving for iteration 1200

Adding location for institution, no. 1400
Saving for iteration 1400

Adding location for institution, no. 1450
Saving for iteration 1450

Adding location for institution, n

<Graph identifier=N5289a4e85e114a349ae6394991c52742 (<class 'graph.oa_graph.OpenAlexGraph'>)>

In [7]:
len(g)

115139

In [8]:
with open("../out/city_coords.pkl", 'wb') as f:
    pickle.dump(city_coords, f)