Skip to content

Commit

Permalink
harvester: zenodo versioning harvester
Browse files Browse the repository at this point in the history
  • Loading branch information
ChiaraBi authored and slint committed Dec 12, 2018
1 parent f5a3ee4 commit 1bd0957
Show file tree
Hide file tree
Showing 4 changed files with 269 additions and 77 deletions.
2 changes: 2 additions & 0 deletions asclepias_broker/harvester/config.py
Expand Up @@ -10,6 +10,7 @@
from kombu import Exchange

from .metadata import ADSMetadataHarvester, DOIMetadataHarvester
from .zenodo import ZenodoVersioningHarvester

ASCLEPIAS_HARVESTER_HISTORY_PREFIX = 'asclepias-harvester'

Expand All @@ -35,6 +36,7 @@
ASCLEPIAS_HARVESTER_METADATA_HARVESTERS = {
'doi': (DOIMetadataHarvester, {}),
'ads': (ADSMetadataHarvester, {}),
'zenodo': (ZenodoVersioningHarvester, {}),
}

ASCLEPIAS_HARVESTER_ADS_API_TOKEN = None
Expand Down
180 changes: 104 additions & 76 deletions asclepias_broker/harvester/metadata.py
Expand Up @@ -18,6 +18,19 @@

from ..metadata.api import update_metadata
from .base import MetadataHarvester
from .crossref import CrossrefAPIException


class DataCiteAPIException(Exception):
    """Raised when a request to the DataCite REST API fails."""


class AdsAPIException(Exception):
    """Raised when a request to the ADS REST API fails."""


class MetadataAPIException(Exception):
    """Raised when a request to a metadata REST API fails."""


def _date_from_parts(parts):
def crossref_metadata(doi: str) -> dict:
    """Fetch metadata for a DOI from the Crossref REST API.

    :param doi: DOI to look up.
    :returns: dict with ``Identifier`` and ``Type`` always set and, when
        present in the Crossref response, ``Title``, ``Creator``,
        ``Publisher`` and ``PublicationDate``.
    :raises CrossrefAPIException: if the Crossref API request fails.
    """
    # TODO: Add "mailto" parameter as described in
    # https://www.eventdata.crossref.org/guide/service/query-api
    resp = requests.get(f'https://api.crossref.org/works/{doi}')
    # Guard clause instead of wrapping the whole parse in `if resp.ok:`.
    if not resp.ok:
        raise CrossrefAPIException()
    metadata = resp.json()['message']
    result = {'Identifier': [{'IDScheme': 'doi', 'ID': doi}]}
    res_type = metadata['type']
    # Everything that is not a dataset is classified as literature.
    result['Type'] = {
        'Name': res_type if res_type == 'dataset' else 'literature',
    }
    if metadata.get('title'):
        result['Title'] = metadata['title'][0]
    # Both authors and editors count as creators, as "Family, Given";
    # entries missing either name part are skipped.
    creators = [
        '{}, {}'.format(author['family'], author['given'])
        for author_field in ('author', 'editor')
        for author in metadata.get(author_field, [])
        if author.get('family') and author.get('given')
    ]
    if creators:
        result['Creator'] = [{'Name': c} for c in creators]

    if metadata.get('publisher'):
        result['Publisher'] = [{'Name': metadata['publisher']}]

    # Take the first available of the possible publication date fields.
    for date_field in ('issued', 'published-online', 'published-print'):
        if metadata.get(date_field):
            result['PublicationDate'] = _date_from_parts(
                metadata[date_field]['date-parts'][0])
            break
    return result


def datacite_metadata(doi: str) -> dict:
    """Fetch metadata for a DOI from the DataCite REST API.

    :param doi: DOI to look up.
    :returns: dict with ``Identifier``, ``Type`` and ``PublicationDate``
        always set and, when present in the DataCite response, ``Title``
        and ``Creator``.
    :raises DataCiteAPIException: if the DataCite API request fails.
    """
    # TODO: Consider using marshmallow for parsing these responses...
    mimetype = 'application/vnd.datacite.datacite+json'
    resp = requests.get(f'https://data.datacite.org/{mimetype}/{doi}')
    # Guard clause instead of wrapping the whole parse in `if resp.ok:`.
    if not resp.ok:
        raise DataCiteAPIException()
    metadata = resp.json()
    result = {'Identifier': [{'IDScheme': 'doi', 'ID': doi}]}

    # `alternate_identifier` may be missing, a single dict or a list.
    alt_ids = metadata.get('alternate_identifier') or []
    if not isinstance(alt_ids, list):
        alt_ids = [alt_ids]
    for ai in alt_ids:
        result['Identifier'].append({'IDScheme': ai['type'],
                                     'ID': ai['name']})

    # Only datasets and software keep their type; the rest is literature.
    res_type = metadata['resource_type_general'].lower()
    result['Type'] = {
        'Name': (res_type if res_type in ('dataset', 'software')
                 else 'literature')
    }
    if metadata.get('title'):
        result['Title'] = metadata['title']

    # Creators may be plain strings, or dicts carrying either a full
    # `name` or separate `familyName`/`givenName` parts.
    creators = []
    for author in metadata.get('creator') or []:
        if isinstance(author, str):
            creators.append(author)
        elif author.get('name'):
            creators.append(author['name'])
        elif author.get('familyName') and author.get('givenName'):
            creators.append('{}, {}'.format(author['familyName'],
                                            author['givenName']))
    if creators:
        result['Creator'] = [{'Name': c} for c in creators]

    result['PublicationDate'] = metadata['date_published']
    return result


class DOIMetadataHarvester(MetadataHarvester):
Expand Down Expand Up @@ -132,7 +153,7 @@ def harvest(self, identifier: str, scheme: str,
"""."""
data = self.get_metadata(identifier)
if data:
providers = set(providers)
providers = set(providers) if providers else set()
providers.add(self.provider_name)
update_metadata(
identifier, scheme, data,
Expand All @@ -154,7 +175,10 @@ def get_agency(self, doi: str) -> str:
def _agency_by_prefix(self, doi_prefix):
    """Return the registration agency name for a DOI prefix, lowercased.

    :param doi_prefix: DOI prefix (e.g. ``10.5281``) to resolve.
    :raises MetadataAPIException: if the agency lookup request fails.
    """
    res = requests.get(f'{self.doi_api_url}/{doi_prefix}')
    # Guard clause instead of `if res.ok: ... else: raise`.
    if not res.ok:
        raise MetadataAPIException()
    # NOTE(review): if 'RA' is absent this raises AttributeError on
    # None.lower() — presumably the API always returns it; confirm.
    return res.json()[0].get('RA').lower()


class ADSMetadataHarvester(MetadataHarvester):
Expand Down Expand Up @@ -212,7 +236,7 @@ def harvest(self, identifier: str, scheme: str,
"""."""
data = self.get_metadata(identifier)
if data:
providers = set(providers)
providers = set(providers) if providers else set()
providers.add(self.provider_name)
update_metadata(
identifier, scheme, data,
Expand All @@ -224,18 +248,22 @@ def get_metadata(self, bibcode: str) -> dict:
params['q'] = f'identifier:{bibcode}'
res = requests.get(
self.api_url, params=params, headers=self._req_headers)
data = res.json()
if data['response']['numFound'] == 1:
doc = data['response']['docs'][0]
return {
'Identifier': self._extract_identifiers(doc),
'Publisher': ([{'Name': doc['pub']}]
if doc.get('pub') else None),
'Creator': [{'Name': n} for n in doc.get('author', []) if n],
'Title': doc.get('title', [None])[0],
'PublicationDate': self._extract_date(doc),
'Type': {'Name': self._extract_type(doc)},
}
if res.ok:
data = res.json()
if data['response']['numFound'] == 1:
doc = data['response']['docs'][0]
return {
'Identifier': self._extract_identifiers(doc),
'Publisher': ([{'Name': doc['pub']}]
if doc.get('pub') else None),
'Creator': [{'Name': n} for n in doc.get('author', [])
if n],
'Title': doc.get('title', [None])[0],
'PublicationDate': self._extract_date(doc),
'Type': {'Name': self._extract_type(doc)},
}
else:
raise AdsAPIException()

@cached_property
def api_token(self):
Expand Down
2 changes: 1 addition & 1 deletion asclepias_broker/harvester/tasks.py
Expand Up @@ -31,7 +31,7 @@ def harvest_metadata(identifiers: Optional[List[dict]],
eager: bool = False):
"""."""
if identifiers:
identifiers_to_harvest = (dict(identifier=i, scheme=v)
identifiers_to_harvest = (dict(identifier=i, scheme=v, providers=None)
for i, v in identifiers)
else: # use queue
identifiers_to_harvest = current_harvester.metadata_queue.consume()
Expand Down

0 comments on commit 1bd0957

Please sign in to comment.