In [2]:
import csv
import json
import urllib
import urllib.parse
import uuid
from collections import Counter, defaultdict
from copy import deepcopy

from utils import chunks, write_payloads, VERSION, IDENTITY, CITES

In [3]:
# Load every row of identifiers.csv into memory. Later cells unpack each row
# as (resource URI, identifier value, identifier scheme).
# newline='' is what the csv module asks for, so that line endings embedded in
# quoted fields reach the reader untouched.
with open('identifiers.csv', 'r', newline='') as csvfile:
    identifiers = list(csv.reader(csvfile, delimiter=','))
print(len(identifiers))

1800000


In [4]:
# Index the raw rows by record number (the integer that follows the
# '.../br/' part of the URI), keeping every (value, scheme) pair in file order.
ids_dict = {}
for id_, value, scheme in identifiers:
    id_num = int(id_.split('https://w3id.org/oc/corpus/br/')[1])
    ids_dict.setdefault(id_num, []).append((value, scheme))

In [5]:
# Spot-check: show every raw (value, scheme) pair stored for record 73676.
ids_dict[73676]

[('http://www.eurosurveillance.org/viewarticle.aspx%3Farticleid', 'url'),
 ('http://www.eurosurveillance.org/viewarticle.aspx%3Farticleid', 'url'),
 ('10.2807/1560-7917.es2014.19.9.20720', 'doi'),
 ('http://dx.doi.org/10.2807/1560-7917.es2014.19.9.20720', 'url'),
 ('24626205', 'pmid'),
 ('10.2807/1560-7917', 'doi'),
 ('http://dx.doi.org/10.2807/1560-7917', 'url')]

In [6]:
# URL prefixes that mark an identifier value as a DOI link. The cleaning step
# strips the matched prefix and keeps the remainder as the DOI.
# Each prefix is listed exactly once: matching is done with str.startswith, so
# a repeated entry can never match anything its first occurrence did not.
# NOTE(review): 'http://dx/doi.org' looks like a malformed variant kept on
# purpose to catch bad source data — confirm before removing it.
doi_prefixes = [
    'http://dx.doi.org',
    'http://doi.org',
    'http://www.dx.doi.org',
    'http://www.doi.org',
    'http://dx/doi.org',
    'https://dx.doi.org',
    'https://doi.org',
]

In [7]:
def doi_value_fixer(value):
    """Normalise a raw DOI string and return the cleaned value.

    Steps, in order: percent-decode (only when a '%' is present), trim
    surrounding whitespace, drop one leading '/', keep only the text before
    the first space, and drop one trailing '.'.

    Args:
        value: Raw DOI text, e.g. what is left of a DOI URL once its
            resolver prefix has been removed.

    Returns:
        The cleaned string. It may be empty (for example for '' or '/').
    """
    if '%' in value:
        value = urllib.parse.unquote(value)
    value = value.strip()
    if value.startswith('/'):
        value = value[1:].strip()
    # Everything after the first space is trailing junk (a dangling ' (' or
    # free text). The value has no leading whitespace at this point, so the
    # first token is the DOI itself; a single split therefore does the whole
    # job and no separate ' (' check is needed afterwards.
    value = value.split(' ', 1)[0].strip()
    if value.endswith('.'):
        value = value[:-1]
    return value.strip()


def url_value_fixer(value):
    """Return the URL percent-decoded (when it contains '%') and stripped of surrounding whitespace."""
    decoded = urllib.parse.unquote(value) if '%' in value else value
    return decoded.strip()

In [8]:
# Sanity check: the leading '/' is stripped from the DOI.
doi_value_fixer('/10.1148/radiology.212.3.r99au29711')

'10.1148/radiology.212.3.r99au29711'

In [9]:
fails = []  # Original values that were discarded because cleaning left them empty
ids_clean = []  # Cleaned (record number, value, scheme) triples
for id_, value, scheme in identifiers:
    v_orig = value
    id_num = int(id_.split('https://w3id.org/oc/corpus/br/')[1])

    # A value that starts with one of the DOI resolver URLs is really a DOI:
    # drop the prefix and re-label the scheme accordingly.
    doi_pref = next((pref for pref in doi_prefixes if value.startswith(pref)), None)
    if doi_pref:
        # Slice the prefix off rather than using split(doi_pref)[1]: split
        # would also cut the value at any later repeat of the prefix inside it.
        value = value[len(doi_pref):]
        scheme = 'doi'
    if scheme == 'doi':
        value = doi_value_fixer(value)
    elif scheme == 'url':
        value = url_value_fixer(value)
    if value:
        ids_clean.append((id_num, value, scheme))
    else:
        fails.append(v_orig)

In [10]:
Counter(fails)  # Tally of the original values that cleaning reduced to an empty string

Counter({'http://dx.doi.org': 37105, 'http://www.dx.doi.org': 1})

In [11]:
# Collapse the cleaned triples into one set of (value, scheme) pairs per
# record number; using a set also drops exact duplicate pairs.
ids_grouped = defaultdict(set)
for id_, value, scheme in ids_clean:
    pair = (value, scheme)
    ids_grouped[id_].add(pair)

In [12]:
# Collect the records that carry exactly two 'doi' identifiers whose values differ.
double_dois = []
for id_, ids in ids_grouped.items():
    doi_values = [v for v, s in ids if s == 'doi']
    if len(doi_values) != 2:
        continue
    first, second = doi_values
    if first != second:
        double_dois.append((id_, ids))

In [14]:
# Link the identifiers of each record into a chain of identity relationships:
# a record with n identifiers yields n - 1 links (first->second, second->third, ...).
identity_relationships = []
for id_, ids in ids_grouped.items():
    # A set of strings iterates in an order that can change from one
    # interpreter run to the next (string hashing is randomised), so sort the
    # pairs to make the emitted relationships reproducible.
    ids = sorted(ids)
    for (val1, scheme1), (val2, scheme2) in zip(ids, ids[1:]):
        rel = deepcopy(IDENTITY)  # fresh copy so the shared template is never mutated
        rel['Source']['Identifier'] = {"ID": val1, "IDScheme": scheme1}
        rel['Target']['Identifier'] = {"ID": val2, "IDScheme": scheme2}
        identity_relationships.append(rel)

In [15]:
# First the number of identity links produced, then the number of records
# that kept at least one cleaned identifier.
print(len(identity_relationships))
print(len(ids_grouped))

535715
541067


In [16]:
# Distribution of how many distinct identifiers each record has.
Counter(map(len, ids_grouped.values()))

Counter({1: 142387, 2: 262536, 3: 135300, 4: 810, 5: 29, 6: 3, 7: 1, 13: 1})

In [18]:
# Hand all identity relationships to write_payloads (imported from utils).
# NOTE(review): chunk_size presumably sets how many relationships go into each
# payload and size the total count — confirm against utils.write_payloads.
write_payloads('events/opencitations/identity', identity_relationships, chunk_size=100,
               size=len(identity_relationships))

In [19]:
# Load every row of citations.csv into memory. The next cell unpacks each row
# as a pair of resource URIs.
# newline='' is what the csv module asks for, so that line endings embedded in
# quoted fields reach the reader untouched.
with open('citations.csv', 'r', newline='') as csvfile:
    citations = list(csv.reader(csvfile, delimiter=','))
print(len(citations))

1900000


In [20]:
# Keep only the citation pairs whose two ends both have at least one cleaned
# identifier. .get() is used so that probing the defaultdict never creates
# empty entries for unknown record numbers.
cits = []
for src_uri, dst_uri in citations:
    idA = int(src_uri.split('https://w3id.org/oc/corpus/br/')[1])
    idB = int(dst_uri.split('https://w3id.org/oc/corpus/br/')[1])
    if not (ids_grouped.get(idA) and ids_grouped.get(idB)):
        continue
    cits.append((idA, idB))

In [21]:
# Number of citation pairs where both ends have at least one cleaned identifier.
len(cits)

644657

In [22]:
# Build one citation relationship per retained pair, representing each record
# by a single one of its identifiers.
citation_relationships = []
for idA, idB in cits:
    rel = deepcopy(CITES)  # fresh copy so the shared template is never mutated
    # list(a_set)[0] would pick whichever pair the set happens to yield first,
    # which can change between interpreter runs (string hashing is
    # randomised). min() makes the chosen identifier reproducible.
    val1, scheme1 = min(ids_grouped[idA])
    val2, scheme2 = min(ids_grouped[idB])
    rel['Source']['Identifier'] = {"ID": val1, "IDScheme": scheme1}
    rel['Target']['Identifier'] = {"ID": val2, "IDScheme": scheme2}
    citation_relationships.append(rel)

In [23]:
# Hand all citation relationships to write_payloads (imported from utils).
# NOTE(review): chunk_size presumably sets how many relationships go into each
# payload and size the total count — confirm against utils.write_payloads.
write_payloads('events/opencitations/cites', citation_relationships, chunk_size=100,
               size=len(citation_relationships))