In [1]:
from collections import Counter
from copy import deepcopy
import json
import uuid
import os
from utils import IDENTITY, VERSION, CITES, write_payloads
from pprint import pprint

### Fetch the data first
Run the ``fetch_zenodo_data.sh`` script before executing this notebook; otherwise the next cell will not be able to open the records JSON file.

In [2]:
# Load the Zenodo dump: each non-empty line of the file is parsed as one JSON record.
# `json` is already imported in the first cell, so it is not re-imported here.
with open("records-2018-04-20T15:22:08.json", 'r', encoding='utf-8') as fp:
    # Iterate over the file lazily instead of materialising every line with
    # readlines(), and skip blank lines, which json.loads would reject.
    records = [json.loads(line) for line in fp if line.strip()]
print(len(records))

391694


In [3]:
# Keep only the records whose metadata carries a DOI; the ~300 without one don't have relations.
records = list(filter(lambda rec: 'doi' in rec['metadata'], records))

In [4]:
# Zenodo resource type -> object type name used in the relation payloads,
# written grouped by the target type and flattened into a plain lookup dict.
ZENODO_TYPE_MAP = {
    zenodo_type: target_type
    for target_type, zenodo_types in (
        ('dataset', ('dataset', 'image', 'video')),
        ('literature', ('lesson', 'poster', 'presentation', 'publication')),
        ('software', ('software',)),
        ('unknown', ('other',)),
    )
    for zenodo_type in zenodo_types
}


def zenodo_type_to_asclepias(record):
    """Return the mapped object type name for a Zenodo record.

    The record's ``metadata.resource_type.type`` value is looked up in
    ``ZENODO_TYPE_MAP``. A ``KeyError`` is raised if the record lacks that
    field or if the resource type has no entry in the map.
    """
    resource_type = record['metadata']['resource_type']
    zenodo_type = resource_type['type']
    return ZENODO_TYPE_MAP[zenodo_type]

In [6]:
# Build one relation from the VERSION template for every record that has a concept DOI.
version_rels = []
for record in records:
    if 'conceptdoi' not in record:
        continue  # no concept DOI on this record -> nothing to emit
    payload = deepcopy(VERSION)  # never mutate the shared template
    # Source side: the concept DOI.
    payload['Source']['Identifier'] = dict(ID=record['conceptdoi'], IDScheme='doi')
    type_name = zenodo_type_to_asclepias(record)
    payload['Source']['Type'] = dict(Name=type_name)
    # Target side: the record's own DOI (DOI isVersionOf ConceptDOI).
    payload['Target']['Identifier'] = dict(ID=record['doi'], IDScheme='doi')
    payload['Target']['Type'] = dict(Name=type_name)
    version_rels.append(payload)
# size is set to the full list length; NOTE(review): presumably this makes
# write_payloads emit everything as one chunk -- confirm against utils.write_payloads.
write_payloads('events/zenodo/version', version_rels, size=len(version_rels))
# The payloads have been handed to write_payloads; drop the in-memory list.
del version_rels

In [8]:
# Build one relation from the IDENTITY template per record, pairing the
# record's DOI with the URL of its HTML page.
identity_rels = []
for record in records:
    payload = deepcopy(IDENTITY)  # never mutate the shared template
    # Source side: the record's DOI.
    payload['Source']['Identifier'] = dict(ID=record['doi'], IDScheme='doi')
    type_name = zenodo_type_to_asclepias(record)
    payload['Source']['Type'] = dict(Name=type_name)
    # Target side: the same record, identified by its `links.html` URL.
    payload['Target']['Identifier'] = dict(ID=record['links']['html'], IDScheme='url')
    payload['Target']['Type'] = dict(Name=type_name)
    identity_rels.append(payload)
# size is set to the full list length; NOTE(review): presumably this makes
# write_payloads emit everything as one chunk -- confirm against utils.write_payloads.
write_payloads('events/zenodo/identity', identity_rels, size=len(identity_rels))
# The payloads have been handed to write_payloads; drop the in-memory list.
del identity_rels

In [49]:
# Flatten every record's related identifiers into
# (record DOI, relation, related identifier, identifier scheme) tuples.
# Records whose metadata has no 'related_identifiers' key contribute nothing.
related_rels = [
    (rec['doi'], related['relation'], related['identifier'], related['scheme'])
    for rec in records
    if 'related_identifiers' in rec['metadata']
    for related in rec['metadata']['related_identifiers']
]

In [182]:
# version_rels and identity_rels are `del`-ed by the cells that write them out,
# so on a fresh top-to-bottom run they no longer exist when this cell executes
# and a bare len(...) would raise NameError. Report the size of each list that
# is still alive and say so explicitly for the ones that are gone. When all
# three lists exist, the output is the same three numbers as before.
for name in ('version_rels', 'identity_rels', 'related_rels'):
    rels = globals().get(name)
    if rels is not None:
        print(len(rels))
    else:
        print('{}: n/a (not defined in this session)'.format(name))

128457
391355
1336983


In [51]:
# Count related identifiers per relation type. Each tuple in related_rels is
# (doi, relation, identifier, scheme), so the relation is the second field;
# indexing with [-1] would tally identifier schemes instead.
Counter(relation for _doi, relation, _identifier, _scheme in related_rels)

Counter({'cites': 41440,
         'compiles': 771,
         'documents': 179,
         'hasPart': 465820,
         'isCitedBy': 218930,
         'isCompiledBy': 1383,
         'isDocumentedBy': 836,
         'isIdenticalTo': 6393,
         'isNewVersionOf': 3528,
         'isPartOf': 549031,
         'isPreviousVersionOf': 387,
         'isReferencedBy': 646,
         'isSupplementTo': 37175,
         'isSupplementedBy': 1041,
         'isVersionOf': 5428,
         'references': 3995})

In [168]:
# Tally how many records fall under each Zenodo resource type.
resource_types = (rec['metadata']['resource_type']['type'] for rec in records)
Counter(resource_types)

Counter({'dataset': 28298,
         'image': 180635,
         'lesson': 520,
         'other': 680,
         'poster': 1787,
         'presentation': 3864,
         'publication': 140515,
         'software': 34379,
         'video': 677})

In [150]:
# Display the first record to inspect the structure of a single Zenodo record
# (top-level keys such as 'doi', 'links' and 'metadata').
records[0]

{'conceptrecid': '659578',
 'created': '2016-11-02T07:21:21.734324+00:00',
 'doi': '10.5281/zenodo.164231',
 'id': 164231,
 'links': {'badge': 'https://zenodo.org/badge/doi/10.5281/zenodo.164231.svg',
  'bucket': 'https://zenodo.org/api/files/0869895b-ac3f-42f1-bc69-43fc85e88e5f',
  'doi': 'https://doi.org/10.5281/zenodo.164231',
  'html': 'https://zenodo.org/record/164231',
  'latest': 'https://zenodo.org/api/records/164231',
  'latest_html': 'https://zenodo.org/record/164231',
  'thumb250': 'https://zenodo.org/api/iiif/v2/0869895b-ac3f-42f1-bc69-43fc85e88e5f:18cc2de0-d511-4f97-8e21-9c544a25debb:figure.png/full/250,/0/default.png'},
 'metadata': {'access_right': 'open',
  'access_right_category': 'success',
  'communities': [{'id': 'biosyslit'}],
  'creators': [{'name': 'Shear, William A.'}, {'name': 'Warfel, Joseph G.'}],
  'description': 'FIGURES 58 – 63. Taracus marchingtoni. Fig. 58, female, dorsal view. Fig. 59, penis. Fig. 60, glans penis. Fig. 61, female, lateral view. Fig. 62,