In [2]:
########## CONFIGURATION ###########
# Destination file of the serialized graph (relative path); it is the
# argument handed to g.serialize() once the graph has been built.
GRAPH_PATH = "../out/coord_graph.ttl"

In [3]:
import json
from glob import glob

# Collect the author-search records of every JSON file written by the
# "1_get_author_ids" step into one flat list.
# The pattern uses forward slashes, which work on Windows as well as on
# Linux/macOS (a backslash-separated pattern matches nothing there and would
# silently leave the list empty). Sorting makes the load order independent
# of the order in which the filesystem lists the files.
data_authors = []
for filename in sorted(glob("../data_collection/1_get_author_ids_output/*.json")):
    with open(filename, encoding="utf8") as f:
        # Each file is expected to hold a JSON array of records
        data_authors.extend(json.load(f))

data_authors[:2]

[{'search_name': 'Christian Ahlf',
  'institution_id': 'https://openalex.org/I70451448',
  'results': []},
 {'search_name': 'Jakob Andersen',
  'institution_id': 'https://openalex.org/I70451448',
  'results': [{'id': 'https://openalex.org/A2525626150',
    'display_name': 'Jakob Smedegaard Andersen'},
   {'id': 'https://openalex.org/A4214226477',
    'display_name': 'Jakob Andersen'}]}]

In [4]:
# Collect the work records of every JSON file written by the "2_get_works"
# step into one flat list.
# The pattern uses forward slashes, which work on Windows as well as on
# Linux/macOS (a backslash-separated pattern matches nothing there and would
# silently leave the list empty). Sorting makes the load order independent
# of the order in which the filesystem lists the files.
data_works = []
for filename in sorted(glob("../data_collection/2_get_works_output/*.json")):
    with open(filename, encoding="utf8") as f:
        # Each file is expected to hold a JSON array of records
        data_works.extend(json.load(f))

data_works[0]

{'id': 'https://openalex.org/W3176618563',
 'doi': 'https://doi.org/10.18653/v1/2021.acl-demo.17',
 'title': 'REM: Efficient Semi-Automated Real-Time Moderation of Online Forums',
 'display_name': 'REM: Efficient Semi-Automated Real-Time Moderation of Online Forums',
 'publication_year': 2021,
 'publication_date': '2021-08-01',
 'ids': {'openalex': 'https://openalex.org/W3176618563',
  'doi': 'https://doi.org/10.18653/v1/2021.acl-demo.17',
  'mag': '3176618563'},
 'host_venue': {'id': None,
  'issn_l': None,
  'issn': None,
  'display_name': 'meeting of the association for computational linguistics',
  'publisher': 'Association for Computational Linguistics',
  'type': 'publisher',
  'url': 'https://doi.org/10.18653/v1/2021.acl-demo.17',
  'is_oa': True,
  'version': 'publishedVersion',
  'license': 'cc-by'},
 'type': 'proceedings-article',
 'open_access': {'is_oa': True,
  'oa_status': 'hybrid',
  'oa_url': 'https://aclanthology.org/2021.acl-demo.17.pdf'},
 'authorships': [{'author_po

In [5]:
from glob import glob

import pandas as pd

# NOTE(review): these two lists are not read by any cell visible here;
# kept in case another cell refers to them.
locations = []
venues = []

# Sorted so the row order of the concatenated frames does not depend on the
# order in which the filesystem lists the files.
ven_files = sorted(glob("../data_collection/venues/*.json"))
loc_files = sorted(glob("../data_collection/locations/*.json"))

# The two directories are read independently of each other: pairing the file
# lists with zip() would stop at the shorter list and silently skip files
# whenever the directories hold a different number of JSON files.
# Files are opened as UTF-8, like the other JSON inputs of this notebook
# (assumes the exports are UTF-8 encoded — TODO confirm).
ven_dfs = []
for fn_ven in ven_files:
    with open(fn_ven, encoding="utf8") as f_ven:
        ven_dfs.append(pd.read_json(f_ven, orient="records"))

loc_dfs = []
for fn_loc in loc_files:
    with open(fn_loc, encoding="utf8") as f_loc:
        loc_dfs.append(pd.read_json(f_loc, orient="records"))

ven_df = pd.concat(ven_dfs, axis=0, ignore_index=True)
loc_df = pd.concat(loc_dfs, axis=0, ignore_index=True)
# Rows without a city or a country are discarded
loc_df = loc_df.dropna(subset=["city", "country"])

In [6]:
# Cast the text columns of the venue frame to str, in place.
# NOTE(review): astype(str) also converts missing values into literal
# strings such as "nan" — confirm that no missing "venue" values take part
# in the merge on "venue".
str_cols = ["title", "type", "venue", "url"]
ven_df[str_cols] = ven_df[str_cols].astype(str)

In [7]:
# Cast the text columns of the location frame to str, in place.
# Rows with a missing city or country were dropped when loc_df was built;
# NOTE(review): a missing "venue" would still become a literal string such
# as "nan" here — confirm this cannot happen in the data.
str_cols = ["city", "country", "venue"]
loc_df[str_cols] = loc_df[str_cols].astype(str)

In [8]:
# Join the venue rows with the location rows that share the same venue name
# and year (pd.merge performs an inner join unless `how` is given), then
# remove rows that are exact duplicates.
df = pd.merge(ven_df, loc_df, on=["venue", "year"])
df = df.drop_duplicates()
df.head()

Unnamed: 0,title,type,venue,year,url,city,country
0,Word-Level Uncertainty Estimation for Black-Bo...,Conference and Workshop Papers,COLING,2020,https://dblp.org/db/conf/coling/index.html,Barcelona,Spain
2,The Two Shades of Dubbing in Neural Machine Tr...,Conference and Workshop Papers,COLING,2020,https://dblp.org/db/conf/coling/index.html,Barcelona,Spain
4,Exploring Amharic Sentiment Analysis from Soci...,Conference and Workshop Papers,COLING,2020,https://dblp.org/db/conf/coling/index.html,Barcelona,Spain
10,Towards Visual Data Science - An Exploration,Conference and Workshop Papers,IHIET,2019,https://dblp.org/db/conf/ihiet/index.html,Nice,France
12,Evaluating the Scaling of Graph-Algorithms for...,Conference and Workshop Papers,OBD,2016,https://dblp.org/db/conf/obd/index.html,Vienna,Austria


In [12]:
# Cache filled by get_coord_from_dbo(): city resource name -> (lat, lng)
city_coords = {}

In [15]:
import requests
from urllib.parse import quote_plus
from graph.oa_graph_json import OpenAlexGraph, _resource_from_uri


def get_dbr_resource(term: str):
    """
    Turn a plain-text name into a resource name: spaces are replaced by
    underscores and the result is percent-encoded with quote_plus.

    :param term: Name to convert, e.g. a city or a country
    :return: Encoded resource name, or None if term is not a string
    """
    if isinstance(term, str):
        return quote_plus(term.replace(' ', '_'))
    return None


def get_coord_from_dbo(city: str, timeout: float = 30):
    """
    Workaround for service queries in rdflib not working:
    Request node uri and get coordinates.

    Successful lookups are stored in the module-level ``city_coords`` dict
    and answered from there on later calls. Failed lookups are not cached,
    so they are retried the next time the same city is requested.

    :param city: City resource name
    :param timeout: Seconds to wait for the DBpedia server before giving up
    :return: lat, lng; (None, None) if city is not a string, the request
        fails, or the response holds no coordinates for the resource
    """
    import warnings

    if not isinstance(city, str):
        return None, None
    if city in city_coords:
        return city_coords[city]
    uri = "http://dbpedia.org/resource/" + city
    try:
        # Without a timeout a single stalled connection would block the
        # whole graph build indefinitely.
        resp = requests.get(
            uri, headers={"accept": "application/json"}, timeout=timeout
        )
        node = resp.json()[uri]
        lng = node['http://www.w3.org/2003/01/geo/wgs84_pos#long'][0]["value"]
        lat = node['http://www.w3.org/2003/01/geo/wgs84_pos#lat'][0]["value"]
    except (KeyError, IndexError, TypeError, ValueError):
        # Resource or coordinates missing, or the body is not the expected
        # JSON document (resp.json() raises a ValueError subclass then).
        return None, None
    except requests.RequestException as err:
        # Network failure or timeout: report it instead of aborting the
        # caller's loop; the city is looked up again on the next call.
        warnings.warn(f"DBpedia lookup for {city!r} failed: {err}")
        return None, None
    city_coords[city] = (lat, lng)
    return lat, lng


# Build the graph: one pass over all works, adding the work itself, its
# yearly citation counts, its authors/institutions and, when a matching row
# exists in df, the venue with its location.
g = OpenAlexGraph()

total = len(data_works)
for i, item in enumerate(data_works, start=1):
    # Progress on one line that is overwritten via "\r". The counter is
    # 1-based and the share uses the percent format, so the last line reads
    # "total/total (100%)" rather than "total-1/total (1e+02%)".
    print(
        f"\rAdding work {i}/{total} ({i / total:.0%})      ",
        end=""
    )

    # Add work
    work_id = _resource_from_uri(item["id"])
    work_name = item["display_name"]
    g.add_work(
        identifier=work_id,
        name=work_name,
        date_published=item["publication_date"],
    )

    # One citation-count entry per year reported for the work
    for cites_by_year in item["counts_by_year"]:
        g.add_citations_in_year(
            identifier=work_id,
            year=cites_by_year["year"],
            citations=cites_by_year["cited_by_count"]
        )

    # Add authors of paper and is_author relation
    prev_author = None
    for authorship in item["authorships"]:
        author = authorship["author"]
        author_id = _resource_from_uri(author["id"])
        author_name = author["display_name"]

        # Only the first listed institution of an authorship is used;
        # inst_id stays None when no institution is listed.
        inst_id = None
        if authorship["institutions"]:
            inst = authorship["institutions"][0]
            inst_id = _resource_from_uri(inst["id"])
            inst_name = inst["display_name"]
            g.add_institution(inst_id, inst_name)
            g.add_associated_with_institution(work_id, inst_id)

        g.add_author(author_id, author_name, inst_id)
        # Each author is linked as colleague to the author listed directly
        # before them on the same work (not to every co-author).
        if prev_author:
            g.add_colleague(author_id, prev_author)
        prev_author = author_id
        g.add_is_author(author_id, work_id)

    # Venue lookup: rows of df whose title equals the work's display name
    # exactly; when several rows match, the first one is used.
    ven = df[df["title"] == work_name]
    if not ven.empty:
        ven = ven.iloc[0]
        # NOTE(review): res_loc=-2 presumably selects the second-to-last
        # path segment of the venue url — confirm in _resource_from_uri.
        ven_id = _resource_from_uri(ven["url"], res_loc=-2)
        ven_city = get_dbr_resource(ven["city"])
        ven_country = get_dbr_resource(ven["country"])
        # lat/lng are None when no coordinates could be retrieved
        lat, lng = get_coord_from_dbo(ven_city)
        g.add_venue(
            work_id=work_id,
            identifier=ven_id,
            name=ven["venue"],
            year=ven["year"],
            city=ven_city,
            country=ven_country,
            lat=lat,
            lng=lng,
        )

Adding work 4890/4891 (1e+02%)      

In [16]:
# Size of the built graph — presumably the number of triples, if
# OpenAlexGraph keeps the usual rdflib Graph semantics (confirm).
len(g)

110175

In [17]:
# Write the graph to GRAPH_PATH.
# NOTE(review): no format is passed, so the serializer's default is used —
# confirm it is Turtle, as the ".ttl" extension of GRAPH_PATH suggests.
g.serialize(GRAPH_PATH)

<Graph identifier=N8bd7fa3cdeb34a7aa52f70edc4aa87c7 (<class 'graph.oa_graph.OpenAlexGraph'>)>

In [18]:
# Inspect the city -> (lat, lng) cache filled by get_coord_from_dbo()
city_coords

{'Barcelona': (41.38333511352539, 2.183333396911621),
 'Nice': (43.70339965820312, 7.266300201416016),
 'Vienna': (48.20833206176758, 16.37249946594238),
 'San_Francisco': (37.77750015258789, -122.416389465332),
 'Boppard': (50.23138809204102, 7.590833187103271),
 'Turku': (60.45000076293945, 22.26666641235352),
 'Prague': (50.08333206176758, 14.41666698455811),
 'Berlin': (52.52000045776367, 13.40499973297119),
 'Geneva': (46.20166778564453, 6.146944522857666),
 'Salzburg': (47.79999923706055, 13.03333377838135),
 'Osaka': (34.69388961791992, 135.5022277832031),
 'Rome': (41.89333343505859, 12.48277759552002),
 'Gothenburg': (57.70000076293945, 11.96666622161865),
 'Pisa': (43.71666717529297, 10.39999961853027),
 'Clearwater_Beach': (27.97993087768555, -82.82723236083984),
 'Paderborn': (51.71805572509766, 8.754166603088379),
 'Cannes': (43.55130004882812, 7.012800216674805),
 'Langenargen': (47.59999847412109, 9.541666984558105),
 'Marburg': (50.81000137329102, 8.770833015441895),
 '

In [21]:
import pickle
# Save the city -> (lat, lng) cache to disk as a pickle file.
# NOTE(review): no cell visible here loads this file back into city_coords.
with open("../out/city_coords.pkl", 'wb') as f:
    pickle.dump(city_coords, f)