In [2]:
import httpx
import pandas as pd

API_BASE = "https://api.datacite.org"
CONSORTIUM_ID = "blco"

cl = httpx.Client(
    timeout=httpx.Timeout(5.0, read=60.0),
)

In [3]:
r = cl.get(f"{API_BASE}/providers/totals")
r.raise_for_status()
totals_data = r.json()
totals = pd.json_normalize(totals_data) \
    .set_index('id')

In [4]:
def extract_counts(row):
    result = {}
    for st in row.states:
        result[st['id']] = st['count']
    for period in ['this_year', 'last_year', 'two_years_ago']:
        data = row['temporal.' + period][0]
        result['count.' + data['id']] = data['count']
       
    return result

totals = totals[['title', 'count']].join(totals.apply(extract_counts, axis=1, result_type='expand'))
totals

Unnamed: 0_level_0,title,count,findable,registered,count.2021,count.2020,count.2019
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
cern,CERN - European Organization for Nuclear Research,2874672,2436491.0,438181.0,566509.0,655451.0,833365.0
figshare,figshare,2332758,2196374.0,136384.0,342599.0,449720.0,386275.0
tawj,University of Tartu,1930980,1930964.0,16.0,1210609.0,21431.0,11704.0
stdp,ETH Zurich,1906558,1906492.0,66.0,119080.0,118294.0,181454.0
sage,SAGE Publishing,1793430,333716.0,1459714.0,81913.0,12728.0,13019.0
...,...,...,...,...,...,...,...
jugt,Mercator Ocean International,1,,1.0,1.0,0.0,0.0
mbf,MBF Bioscience,1,1.0,,0.0,0.0,0.0
mskcc,Memorial Sloan Kettering Cancer Center,1,1.0,,0.0,1.0,0.0
pvre,Leuphana Universität Lüneburg,1,,1.0,1.0,0.0,0.0


In [9]:
r = cl.get(f"{API_BASE}/providers/{CONSORTIUM_ID}")
r.raise_for_status()
provider_data = r.json()

In [16]:
co_ids = {x['id'] for x in provider_data['data']['relationships']['consortiumOrganizations']['data']}
co_totals = totals[totals.index.isin(co_ids)] \
    .reindex(co_ids) \
    .fillna(0)

In [27]:
def get_title(row):
    r = cl.get(f"{API_BASE}/providers/{row.name}")
    if r.status_code == httpx.codes.OK:
        data = r.json()
        return data['data']['attributes']['name']
co_totals[co_totals.title == 0].apply(get_title, axis=1)

id
rvth                                      None
puzp                   Health Data Research UK
uxvc                                      None
upnf                                      Jisc
aevn                    University of Stirling
qono                        Abertay University
cwat                      Diamond Light Source
iobo                                      None
kiai                          CREST Consortium
ueur               London Southbank University
mykg    United Kingdom Atomic Energy Authority
mhqr                                      None
jocq                                      None
wwps                                      None
siyf                       University of Derby
kgxr                                      None
bdeb                       Kingston University
qlbc                    Parliamentary Archives
dtype: object

In [13]:
import datetime as dt

today = dt.datetime.now()
co_totals.to_excel(f'{CONSORTIUM_ID}-totals-{today:%Y%m%d}.xlsx')