In [None]:
from collections import Counter
from copy import deepcopy
import json
import uuid
import os
from utils import IDENTITY, VERSION, CITES, write_payloads
from pprint import pprint

### Fetch the data first
Run the `prepare_data.sh` script first; otherwise the next cell will fail because the JSON file it opens does not exist yet.

In [None]:
# Load the record dump: each non-empty line of the file is parsed as one JSON
# document. `json` is already imported in the notebook's first cell, and the
# file is expected to exist once prepare_data.sh has been run.
with open("records-2018-04-20T15:22:08.json", 'r', encoding='utf-8') as fp:
    # Iterate the file object directly instead of materialising every line
    # with readlines(), and skip blank lines, which json.loads would reject.
    records = [json.loads(line) for line in fp if line.strip()]
print(len(records))

In [None]:
# Keep only the records whose metadata carries a DOI; records without one
# (~300) don't have relations.
records = list(filter(lambda rec: 'doi' in rec['metadata'], records))

In [None]:
# Lookup table: Zenodo resource type -> type name used in the emitted payloads.
ZENODO_TYPE_MAP = {
    'dataset': 'dataset',
    'image': 'dataset',
    'lesson': 'literature',
    'other': 'unknown',
    'poster': 'literature',
    'presentation': 'literature',
    'publication': 'literature',
    'software': 'software',
    'video': 'dataset'
}


def zenodo_type_to_asclepias(record):
    """Translate a record's Zenodo resource type via ``ZENODO_TYPE_MAP``.

    Args:
        record: mapping that exposes ``record['metadata']['resource_type']['type']``.

    Returns:
        The mapped type name (one of the values of ``ZENODO_TYPE_MAP``).

    Raises:
        KeyError: if the record lacks the nested keys, or its resource type
            has no entry in ``ZENODO_TYPE_MAP``.
    """
    resource_type = record['metadata']['resource_type']
    zenodo_type = resource_type['type']
    return ZENODO_TYPE_MAP[zenodo_type]

In [None]:
# Build one relation from the VERSION template for every record that has a
# concept DOI, then hand the whole list to write_payloads.
version_rels = []
for record in records:
    if 'conceptdoi' not in record:
        continue  # no concept DOI, so there is nothing to link for this record
    payload = deepcopy(VERSION)  # deep copy so the VERSION object itself is never mutated
    asclepias_type = zenodo_type_to_asclepias(record)
    # Source is the concept DOI, Target is the record's own DOI.
    # NOTE(review): the previous comment read "DOI isVersionOf ConceptDOI";
    # the actual relation direction is defined by VERSION in utils - confirm.
    payload['Source']['Identifier'] = {"ID": record['conceptdoi'], "IDScheme": 'doi'}
    payload['Source']['Type'] = {'Name': asclepias_type}
    payload['Target']['Identifier'] = {"ID": record['doi'], "IDScheme": 'doi'}
    payload['Target']['Type'] = {'Name': asclepias_type}
    version_rels.append(payload)
# `size` is set to the full list length.
write_payloads('events/zenodo/version', version_rels, size=len(version_rels))
# Unbind the list; the name `version_rels` is undefined from here on.
del version_rels

In [None]:
# Build one relation from the IDENTITY template for every record, linking the
# record's DOI to its HTML landing-page URL, then hand the list to write_payloads.
identity_rels = []
for record in records:
    payload = deepcopy(IDENTITY)  # deep copy so the IDENTITY object itself is never mutated
    asclepias_type = zenodo_type_to_asclepias(record)
    # Source is the DOI, Target is the record's HTML URL.
    payload['Source']['Identifier'] = {"ID": record['doi'], "IDScheme": 'doi'}
    payload['Source']['Type'] = {'Name': asclepias_type}
    payload['Target']['Identifier'] = {"ID": record['links']['html'], "IDScheme": 'url'}
    payload['Target']['Type'] = {'Name': asclepias_type}
    identity_rels.append(payload)
# `size` is set to the full list length.
write_payloads('events/zenodo/identity', identity_rels, size=len(identity_rels))
# Unbind the list; the name `identity_rels` is undefined from here on.
del identity_rels

In [None]:
# Flatten every record's related identifiers into
# (record DOI, relation, identifier, scheme) tuples. Records whose metadata
# has no 'related_identifiers' key contribute nothing.
related_rels = [
    (record['doi'], related['relation'], related['identifier'], related['scheme'])
    for record in records
    if 'related_identifiers' in record['metadata']
    for related in record['metadata']['related_identifiers']
]

In [None]:
# Summary counts for the three relation kinds.
# `version_rels` and `identity_rels` are deleted by the cells that build them,
# so referencing them here raises NameError on a fresh Restart & Run All.
# Their sizes are recomputed from `records` with the same conditions the
# builder cells use.
print(sum(1 for r in records if 'conceptdoi' in r))  # one version relation per record with a concept DOI
print(len(records))                                  # one identity relation per record
print(len(related_rels))

In [None]:
# Count related identifiers by scheme (the last field of each tuple).
Counter(scheme for _doi, _relation, _identifier, scheme in related_rels)

In [None]:
# Count records by their Zenodo resource type.
Counter(map(lambda rec: rec['metadata']['resource_type']['type'], records))

In [None]:
# Display the first record to inspect its structure (raises IndexError if `records` is empty).
records[0]