In [1]:
#Get metadata from CaltechDATA

import os
from ames.harvesters import get_caltechdata

if os.path.isdir('data') == False:
    os.mkdir('data')
os.chdir('data')

production = True
collection = 'caltechdata.ds'

get_caltechdata(collection,production)

100% (1034 of 1034) |####################| Elapsed Time: 0:00:03 Time:  0:00:03


In [2]:
#Determine percentage of records have authors with ORCIDS in CaltechDATA

from ames.harvesters import get_records
from py_dataset import dataset

dot_paths = ['.dates','.resourceType','.subjects','.publicationYear','.creators']
keys = dataset.keys(collection)
author_metadata = get_records(dot_paths,'dois',collection,keys)
dates = []
categories = []
keys = []
orcid_records = 0
new_records = 0
total_orcids = 0
records = 0
orcids = set()
for record in author_metadata:
    records += 1
    orcid_count = 0
    for c in record['creators']:
        if 'nameIdentifiers' in c:
            for n in c['nameIdentifiers']:
                if n['nameIdentifierScheme'] == 'ORCID':
                    orcid_count += 1
                    orcids.add(n['nameIdentifier'])
    total_orcids += orcid_count
    if orcid_count > 0:
        orcid_records += 1
    if int(record['publicationYear']) > 2012:
        new_records += 1

print("Total ORCIDS: ",total_orcids)
print("Unique ORCIDS: ",len(orcids))
print("Records with at least one ORCID: ",orcid_records)
print("Total records: ",records)
print(f"Percent of records with at least one ORCID: {100*(orcid_records/records):.0f}%")
print(f"Modern records (post 2012 publication date): {new_records}")
print(f"Percent of modern records with at least one ORCID: {100*(orcid_records/new_records):.0f}%")

Total ORCIDS:  423
Unique ORCIDS:  124
Records with at least one ORCID:  217
Total records:  1034
Percent of records with at least one ORCID: 21%
Modern records (post 2012 publication date): 305
Percent of modern records with at least one ORCID: 71%


In [3]:
# Alternativly, harvest all metadata from DataCite

import requests, shutil
from progressbar import progressbar

def validate_response(response):
    if (response.status_code != 200):
        print(str(response.status_code) + " " + response.text)
        exit()
    else:
        return response.json()

clients = ['tind.caltech','caltech.library']

collection = 'datacite.ds'
#Always make a new collection
shutil.rmtree(collection)
dataset.init(collection)

for client in clients:
    endpoint = 'https://api.datacite.org/dois'
    query = endpoint + '?client-id=' + client
    response = requests.get(query)
    response = validate_response(response)
    pages = response['meta']['totalPages']
    records = response['data']

    orcid_records = 0
    new_records = 0
    record_num = 0
    page = 1
    for p in progressbar(range(pages)):
        for r in records:
            err = dataset.create(collection,r['id'],r['attributes'])
            if err != '':
                print(err)
            record_num += 1
        page = page + 1
        response = requests.get(query + '&page[number]='+str(page))
        response = validate_response(response)
        records = response['data']

100% (40 of 40) |########################| Elapsed Time: 0:00:46 Time:  0:00:46
 92% (95 of 103) |#####################  | Elapsed Time: 0:02:13 ETA:   0:00:10

10.7907/z93r0qt5 already exists in collection datacite.ds


100% (103 of 103) |######################| Elapsed Time: 0:02:24 Time:  0:02:24


In [4]:
#Now process metadata to get stats from DataCite

orcid_records = 0
total_orcids = 0
new_records = 0
record_num = 0
orcids = set()
page = 1
keys = dataset.keys(collection)
for k in keys:
    record,err = dataset.read(collection,k)
    record_num += 1
    orcid_count = 0
    for c in record['creators']:
        if 'nameIdentifiers' in c:
            for n in c['nameIdentifiers']:
                if n['nameIdentifierScheme'] == 'ORCID':
                    if 'nameIdentifier' in n:
                        orcid_count += 1
                        orcids.add(n['nameIdentifier'])
    total_orcids += orcid_count
    if orcid_count > 0:
        orcid_records += 1
    if record['publicationYear'] != None:
        if int(record['publicationYear']) > 2012:
            new_records += 1

print("Total ORCIDS: ",total_orcids)
print("Unique ORCIDS: ",len(orcids))
print("Records with at least one ORCID: ",orcid_records)
print("Total DOIs: ",record_num)
print(f"Percent of DOIs with at least one ORCID: {100*(orcid_records/record_num):.0f}%")
print(f"Modern DOIs (post 2012 publication date): {new_records}")
print(f"Percent of modern DOIs with at least one ORCID: {100*(orcid_records/new_records):.0f}%")

Total ORCIDS:  1125
Unique ORCIDS:  780
Records with at least one ORCID:  915
Total DOIs:  3549
Percent of DOIs with at least one ORCID: 26%
Modern DOIs (post 2012 publication date): 1855
Percent of modern DOIs with at least one ORCID: 49%
