In [162]:
import json
import os
import pandas as pd
import requests
import time
import yaml


In [2]:
# /Users/nxo/Workspace/GitHub/arcsaef/config/reporting.yaml
with open('config/reporting.yaml', 'r') as file:
        cf = yaml.safe_load(file)

In [333]:
# Highlight all journal articles of interest
# Run get_doi_ipop.js
# Save results as doi_org.txt
# /Users/nxo/Workspace/GitHub/arcsaef/data/all/doi_org.txt'
with open('data/all/doi_org.txt', mode= "r", encoding="utf-8") as f:
    doi_org_raw = f.readlines()

In [334]:
doi_org_raw

['#2021\n',
 '10.1126/science.aay9165 : UOW\n',
 '10.1007/s43630-020-00001-x : UOW\n',
 '10.1002/ece3.8376 : JCU; WAM\n',
 '10.1073/pnas.2017384118 : MON\n',
 '10.5194/tc-15-5447-2021 : MON; BGC\n',
 '10.1073/pnas.2025322118 : MON\n',
 '10.1093/conphys/coab009 : UOW; MON\n',
 '10.1093/conphys/coab038 : MON\n',
 '10.1111/gcb.15841 : UOW\n',
 '10.1111/1365-2435.13829 : LAT ; MON\n',
 '#2022\n',
 '10.1016/j.oneear.2022.05.009 : LAT\n',
 '10.1016/j.ympev.2022.107429 : SAM\n',
 '10.1017/S0954102022000402 : QUT\n',
 '10.1002/fee.2560 : SAM; AAD\n',
 '10.1038/s43017-022-00348-y : MON\n',
 '10.1016/j.agwat.2022.107695 : UOW\n',
 '10.1111/gcb.16309 : UOW\n',
 '10.1029/2022GL099897 : MON\n',
 '10.1038/s41597-022-01514-z : LAT\n',
 '10.1111/gcb.16356 JCU; UOW; WAM; MON; ANSTO\n',
 '10.1007/s43630-022-00176-5 : UOW\n',
 '10.1111/gcb.15926 : UOW\n',
 '10.1111/conl.12918 : LAT\n',
 '10.1029/2022GL098539 : MON\n',
 '10.1038/s41467-022-34355-w : MON\n',
 '10.1111/gcb.16060 : MON; UOW\n',
 '10.1016/j.c

In [335]:
# Use doi a the key
# Use the values in ipop
doi_org = {}
for elem in doi_org_raw:
    if not elem.startswith("#"):
        if len(elem.split(":") ) == 2:
            doi_org[elem.split(":")[0].strip()] = elem.split(":")[1].strip()

In [336]:
# load organisation in config file
nrpt_orgs = json.loads(cf['non-report']['organisations'])
rpt_orgs  = json.loads(cf['report']['organisations'])
nrpt_orgs.update(rpt_orgs)
nrpt_orgs_uc = {}

for e in nrpt_orgs:
    nrpt_orgs_uc[e.upper()] = nrpt_orgs.get(e)

# update organisation abbreviation with it's long name
for o in doi_org:
    chopped = doi_org.get(o).split(";")
    if( len(chopped) == 1 ):
        for x in nrpt_orgs_uc:
            if x == doi_org.get(o).upper():
                doi_org[o] = nrpt_orgs.get(x)
# handle output with >1 ipop
    else:
        org_split = []
        for chop in chopped:
            if chop.strip().upper() in nrpt_orgs_uc.keys():
                org_split.append(nrpt_orgs_uc.get(chop.strip().upper()))
        doi_org[o] = org_split

In [11]:
url = "https://api.openalex.org/works/https://doi.org/"

for doi in doi_org:
    # there is no SAEF id that can be used to retrieve research output attributed to SAEF
    # therefore it is safer to use DOI's of known (recorded in Zotero) SAEF output 
    # /Users/nxo/Workspace/GitHub/arcsaef/data/jnls_202408/
    response = requests.get(f"{url}{doi}") 

    if response.status_code == 200:
        page  = json.loads(response.content) # dictionary 
        fname = f"{'data/jnls_202408/'}{page.get('id').split('/')[3]}.json"

        with open(fname, mode= "w", encoding="utf-8") as f:
            json.dump(page, f)

    time.sleep(1) # sleep for 1 second satisfies max 10 call per sec limit


In [71]:
# /Users/nxo/Workspace/GitHub/arcsaef/data/jnls_202408/W4392715378.json
with open('data/jnls_202408/W3210882272.json', mode= "r", encoding="utf-8") as f:
    twork = json.load(f)

In [486]:
# /Users/nxo/Workspace/GitHub/arcsaef/data/jnls_202408/
collab_orgs  = {}

# find institutions of all authors
for file in os.listdir('data/jnls_202408/'):
    
    with open(f"{'data/jnls_202408/'}{file}", mode= "r", encoding="utf-8") as f:
        work = json.load(f)
        doi = work.get('doi')[16: ] 

        if doi in doi_org.keys():
            pub_institutions = []
            for w in work.get('authorships'):
                institutions = w.get('institutions')

                # sanity check! Some authors have an empty institutions array
                if len(institutions) > 0:
                    # an author can choose to associate >1 institution with a work
                    # the most significant institution is deemed to be the first one
                    pub_institutions.append(institutions[0].get('display_name'))

            collab_orgs[ doi ] = pub_institutions

            # if len(pub_institutions) > 0:
            #     pub_institutions.insert(0, doi_org.get(doi))    
                


In [465]:
collab_orgs

{'10.1016/j.ympev.2022.107429': ['South Australia Museum',
  'British Antarctic Survey',
  'British Antarctic Survey',
  'British Antarctic Survey',
  'University of Bristol',
  'South Australian Museum',
  'British Antarctic Survey'],
 '10.1038/s41598-024-65081-6': ['Monash University',
  'University of Wollongong',
  'UNSW Sydney'],
 '10.1038/s41559-023-02171-0': ['Monash University',
  'Finnish Environment Institute',
  'Universidad Nacional Autónoma de México',
  'University of Victoria',
  'Griffith University',
  'Manaaki Whenua – Landcare Research',
  'McGill University',
  'German Centre for Integrative Biodiversity Research',
  'University of British Columbia',
  'Nord University',
  'Norwegian Institute for Nature Research',
  'Fondation Pour la Recherche Sur la Biodiversité',
  'University of St Andrews',
  'Joint Research Centre',
  'Smithsonian Environmental Research Center',
  'German Centre for Integrative Biodiversity Research',
  'German Centre for Integrative Biodiver

In [497]:
res = [val for val in collab_orgs.values()]
final_collab_orgs = {}
multi_key = []

for o in collab_orgs:
    if len(o) > 0:
        key = doi_org.get(o)     # ipop institution(s)
        val = collab_orgs.get(o) # author institutions

        if not isinstance(key, list):
            if key not in final_collab_orgs:
               final_collab_orgs[key] = ','.join(val)
            else:
                final_collab_orgs[key] = f"{final_collab_orgs.get(key)}, {', '.join(val)}"

        if isinstance(key, list):
            for k in key:
                if k in final_collab_orgs:
                    final_collab_orgs[k] = f"{final_collab_orgs[k]}, {', '.join(val)}"
                else:
                    final_collab_orgs[k] = ', '.join(val)
        #             # if not isinstance(val, list):

In [498]:
final_collab_orgs

{'South Australia Museum': "British Antarctic Survey,British Antarctic Survey,British Antarctic Survey,University of Bristol,South Australian Museum,British Antarctic Survey, University of Western Australia, University of Western Australia, Australian Antarctic Division, Australian Antarctic Division, Australian Antarctic Division, Cawthron Institute, Australian Antarctic Division, Natural History Museum Aarhus, British Antarctic Survey, University of Pretoria, Queensland University of Technology, British Antarctic Survey, University of Sydney, Australian Antarctic Division, Helmholtz Institute for Functional Marine Biodiversity, University of Waikato, University of Pretoria, University of Wollongong, Deakin University, Australian Antarctic Division, South Australian Museum, James Cook University, Cawthron Institute, Western Australian Museum, Cawthron Institute, Australian Antarctic Division, University of Waikato, University of Waikato, University of Adelaide, University of Otago, Un

## Visualisation !!

?