In [2]:
import csv
import json
import os
from copy import deepcopy
from datetime import datetime
from pathlib import Path

import idutils
import requests

In [4]:
# Get your ADS API token from https://ui.adsabs.harvard.edu/#user/settings/token
# and export it as the ADS_TOKEN environment variable so the secret is never
# saved inside the notebook. 'CHANGEME' is only a placeholder fallback.
ADS_TOKEN = os.environ.get('ADS_TOKEN', 'CHANGEME')
ADS_API_URL = 'https://api.adsabs.harvard.edu/v1/search/query'
# NOTE(review): current ADS examples show "Bearer <token>" (with a space);
# confirm that the colon-separated form below is still accepted.
ADS_API_HEADERS = {"Authorization": "Bearer:{}".format(ADS_TOKEN)}
# Template query parameters; fetch_metadata() copies them and overrides "q".
# "fl" is the list of record fields requested from the search endpoint.
ADS_API_PARAMS = {
    "q": "identifier:CHANGEME",
    "fl": "title,author,doi,bibcode,identifier,doctype,pub,year,pubdate",
}

# Maps the ADS "doctype" of a record to the type name written into the
# generated documents by build_document().
ADS_TYPE_TO_ASCLEPIAS = {
    'article': 'literature',
    'eprint': 'literature',
    'inproceedings': 'literature',
    'inbook': 'literature',
    'abstract': 'unknown',
    'book': 'literature',
    'bookreview': 'unknown',
    'catalog': 'unknown',
    'circular': 'unknown',
    'erratum': 'unknown',
    'mastersthesis': 'literature',
    'newsletter': 'unknown',
    'obituary': 'unknown',
    'phdthesis': 'literature',
    'pressrelease': 'unknown',
    'proceedings': 'unknown',
    'proposal': 'unknown',
    'software': 'software',
    'talk': 'unknown',
    'techreport': 'unknown',
    'misc': 'unknown',
}


# Directory (relative to the working directory) that receives the JSON files.
OUTPUT_DIR = 'ads_metadata'

In [6]:
def fetch_metadata(bibcode, timeout=30):
    """Fetch the single ADS record matching an identifier.

    Sends a search request to ``ADS_API_URL`` using a copy of
    ``ADS_API_PARAMS`` whose ``q`` is replaced by ``identifier:<bibcode>``.

    Args:
        bibcode: Identifier to look up (interpolated into the query as-is).
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the whole batch.

    Returns:
        dict: The only entry of ``response.docs`` in the JSON reply.

    Raises:
        requests.HTTPError: If the API answers with an error status (for
            example a rejected token or an exhausted rate limit).
        LookupError: If the query does not match exactly one record.
    """
    # Build a fresh dict so the module-level template is never mutated.
    params = dict(ADS_API_PARAMS, q='identifier:{}'.format(bibcode))
    res = requests.get(
        ADS_API_URL, params=params, headers=ADS_API_HEADERS, timeout=timeout)
    # Fail with the real HTTP error instead of a KeyError on the JSON body.
    res.raise_for_status()
    response = res.json()['response']
    found = response['numFound']
    # Explicit check rather than `assert`, which is removed under `python -O`.
    if found != 1:
        raise LookupError(
            'expected exactly one ADS record for {!r}, found {}'.format(
                bibcode, found))
    return response['docs'][0]


def build_identifiers(data):
    """Collect the unique identifiers of an ADS record.

    The bibcode is recorded with scheme ``'ads'``, every DOI with scheme
    ``'doi'``, and each entry of ``identifier`` with the first scheme that
    ``idutils.detect_identifier_schemes`` reports for it. Entries for which
    no scheme is detected are left out.

    Args:
        data: ADS record as a dict; the ``bibcode``, ``doi`` and
            ``identifier`` keys are all optional and may be ``None``.

    Returns:
        list[dict]: ``{'ID': ..., 'IDScheme': ...}`` entries without
        duplicates, sorted by scheme and then by identifier so that repeated
        runs produce identical output.
    """
    pairs = set()

    bibcode = data.get('bibcode')
    if bibcode:
        pairs.add((bibcode, 'ads'))

    # `or []` also covers a key that is present but set to None.
    for doi in data.get('doi') or []:
        if doi:
            pairs.add((doi, 'doi'))

    for identifier in data.get('identifier') or []:
        if not identifier:
            continue
        try:
            schemes = idutils.detect_identifier_schemes(identifier)
        except Exception:
            # Scheme detection is deliberately best-effort: one identifier
            # that cannot be analysed must not discard the whole record.
            continue
        if schemes:
            pairs.add((identifier, schemes[0]))

    # A set has no stable order, so sort before building the result.
    ordered = sorted(
        (pair for pair in pairs if pair[0] and pair[1]),
        key=lambda pair: (str(pair[1]), str(pair[0])),
    )
    return [{'ID': id_, 'IDScheme': scheme} for id_, scheme in ordered]


def extract_date(data):
    """Return the most precise publication date available for a record.

    ADS reports ``pubdate`` as ``YYYY-MM-DD`` but zero-fills the parts it
    does not know (for example ``2015-07-00``), which a strict date parser
    rejects. The result therefore depends on how much of the date is usable:

    * full date        -> ISO 8601 datetime string, e.g. ``2015-07-14T00:00:00``
    * day is ``00``    -> ``YYYY-MM``
    * anything else    -> the record's ``year`` field

    Args:
        data: ADS record as a dict; ``pubdate`` and ``year`` are optional.

    Returns:
        str or None: The date string as described above, or whatever
        ``data.get('year')`` yields (``None`` when absent).
    """
    year = data.get('year')
    pubdate = data.get('pubdate')
    if not isinstance(pubdate, str):
        return year

    try:
        return datetime.strptime(pubdate, '%Y-%m-%d').isoformat()
    except ValueError:
        pass

    parts = pubdate.split('-')
    if len(parts) == 3 and parts[2] == '00':
        try:
            # Validates the year/month and normalises them to YYYY-MM.
            return datetime.strptime('-'.join(parts[:2]), '%Y-%m').strftime('%Y-%m')
        except ValueError:
            # Month is unknown as well (e.g. "2015-00-00").
            pass
    return year

    
def build_document(data):
    """Convert an ADS record into the output document structure.

    Args:
        data: ADS record as a dict. Every field is optional; list-valued
            fields (``title``, ``author``) may be missing, ``None`` or empty.

    Returns:
        dict: ``{'Provider': ..., 'Object': {...}}`` where ``Object`` holds
        the identifiers, publisher, creators, title, publication date and
        type. ``Publisher`` is ``None`` when the record has no ``pub``;
        ``Title`` is ``None`` when the record has no title; the type name
        falls back to ``'unknown'`` for a doctype that is missing or not
        listed in ``ADS_TYPE_TO_ASCLEPIAS``.
    """
    publisher = data.get('pub')
    # `or` also covers an empty list / None, which would otherwise crash on [0].
    titles = data.get('title') or [None]
    authors = data.get('author') or []
    return {
        'Provider': 'SAO/NASA Astrophysics Data System',
        'Object': {
            'Identifier': build_identifiers(data),
            'Publisher': [{'Name': publisher}] if publisher else None,
            'Creator': [{'Name': name} for name in authors if name],
            'Title': titles[0],
            'PublicationDate': extract_date(data),
            # Same label the mapping itself uses for unclassifiable types.
            'Type': {
                'Name': ADS_TYPE_TO_ASCLEPIAS.get(data.get('doctype'), 'unknown'),
            },
        }
    }


def fetch_all_docs(csv_path):
    """Fetch and convert the record of every bibcode listed in a CSV file.

    The first row of the file is treated as a header and skipped. The first
    column of each remaining row is the bibcode; surrounding whitespace is
    stripped, blank rows and empty bibcodes are ignored, and duplicates are
    processed once, in order of first appearance.

    Progress (the running index) is printed every ten bibcodes.

    Args:
        csv_path: Path of the comma-separated citations file.

    Returns:
        tuple[list, list]: ``(docs, errors)`` where ``docs`` holds
        ``(bibcode, document)`` pairs for successful lookups and ``errors``
        holds ``(bibcode, exception)`` pairs for the ones that failed.
    """
    # newline='' is what the csv module requires for correct newline handling.
    with open(csv_path, 'r', newline='') as csvfile:
        rows = csv.reader(csvfile, delimiter=',')
        next(rows, None)  # skip the header row (no-op for an empty file)
        # dict.fromkeys de-duplicates while keeping the file order.
        bibcodes = list(dict.fromkeys(
            row[0].strip() for row in rows if row and row[0].strip()))
    print(len(bibcodes), 'citations read from CSV')

    errors = []
    docs = []
    for i, bibcode in enumerate(bibcodes):
        if i % 10 == 0:
            print(i)
        try:
            docs.append((bibcode, build_document(fetch_metadata(bibcode))))
        except Exception as ex:
            # One failing lookup must not abort the batch; keep it for review.
            errors.append((bibcode, ex))
    return docs, errors

def write_documents(documents, output_dir=None):
    """Write each document to ``<output_dir>/<bibcode>.json``.

    Args:
        documents: Iterable of ``(bibcode, document)`` pairs; each document
            must be JSON-serialisable.
        output_dir: Target directory. Defaults to the module-level
            ``OUTPUT_DIR``. It is created, including missing parents, when
            it does not exist.

    Bibcodes containing ``/`` cannot be used as a file name and are not
    written; they are reported on stdout instead of being dropped silently.
    """
    target = Path(OUTPUT_DIR if output_dir is None else output_dir)
    target.mkdir(parents=True, exist_ok=True)

    skipped = []
    for bibcode, data in documents:
        if '/' in bibcode:
            skipped.append(bibcode)
            continue
        (target / (bibcode + '.json')).write_text(json.dumps(data))

    if skipped:
        print(len(skipped),
              'documents skipped because their identifier contains "/":',
              ', '.join(skipped))
    

In [7]:
# Fetch and convert every citation listed in the CSV, then write the resulting
# documents as JSON files. `errors` holds (bibcode, exception) pairs for the
# lookups that failed; inspect it after the run, since failures are collected
# rather than printed.
docs, errors = fetch_all_docs('TriangleCornerCitations.csv')
write_documents(docs)

204 citations read from CSV
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
