In [91]:

import json
import requests
import re
import pandas as pd

In [None]:
# Data file was exported from Zotero in CSL JSON format.
# The encoding is stated explicitly (JSON is UTF-8) so that author names containing
# non-ASCII characters load the same way regardless of the platform's default encoding.
with open('data/BIB_saef_20241206.json', 'r', encoding='utf-8') as f:
    bib = json.load(f)

In [7]:
# Build a lookup keyed by DOI. Each value is a two-item list:
#   [0] the SAEF author names parsed from the Zotero note ("saef: NameA; NameB; ...")
#   [1] the OpenAlex response restricted to the work's `authorships`
data = {}
for entry in bib:
    doi = entry.get('DOI')                                      # Use DOI as key
    if not doi:
        continue                                                # No DOI -> nothing to look up in OpenAlex

    # Reset for every entry: without this, an entry lacking a "saef:" line would either
    # raise NameError (first entry) or silently reuse the names of the previous entry.
    names_split = []

    # Use saef keyword as an identifier; case-insensitive to agree with the re.sub below
    saef_author_list = re.findall('saef:.*', entry.get('note') or '', flags=re.IGNORECASE)

    for names in saef_author_list:
        names_tidy  = re.sub('saef:', '', names, flags=re.IGNORECASE)
        names_split = names_tidy.strip().split(';')             # Create a list of saef names

    # retrieve openalex authorship for DOI
    url = f"https://api.openalex.org/works/https://doi.org/{doi}?select=authorships"
    response = requests.get(url, timeout=30)                    # OpenAlex call using DOI arg; never hang forever
    if response.ok:
        authorships = response.json()
    else:
        # Keep going, but store an empty authorship list so later cells that read
        # ['authorships'] do not fail on an error payload.
        print(f"OpenAlex lookup failed for {doi}: HTTP {response.status_code}")
        authorships = {'authorships': []}
    data[doi] = [names_split, authorships]


In [43]:
# Flatten the per-DOI lookup into one record per (DOI, author listed by OpenAlex).
collaboration = []
for doi_key, doi_payload in data.items():
    for authorship in doi_payload[1]['authorships']:
        person = authorship['author']
        display_name = person['display_name']

        # Store the bare ORCID iD (URL prefix removed); an all-zero placeholder marks a missing one
        orcid_url = person['orcid']
        bare_orcid = "0000-0000-0000-0000" if orcid_url is None else orcid_url.replace("https://orcid.org/", "")

        # Only the first listed institution is kept for each author
        affiliations = authorship['institutions']
        first_inst = "Missing" if len(affiliations) == 0 else affiliations[0]['display_name']

        collaboration.append(
            {
                "doi": doi_key,
                "saef_author": doi_payload[0],
                "collab_author": display_name,
                "collab_orcid": bare_orcid,
                "collab_inst": first_inst,
            }
        )

In [35]:
# Preview only the first few records; printing the whole list floods the cell output
print(collaboration[:5])

[{'doi': '10.1002/rse2.371', 'saef_author': ['KrystalRandall', ' SharonRobinson', ' MelindaWaterman'], 'collab_author': 'Darren Turner', 'collab_orcid': '0000-0002-3029-6717', 'collab_inst': 'University of Tasmania'}, {'doi': '10.1002/rse2.371', 'saef_author': ['KrystalRandall', ' SharonRobinson', ' MelindaWaterman'], 'collab_author': 'Emiliano Cimoli', 'collab_orcid': '0000-0001-7964-2716', 'collab_inst': 'University of Tasmania'}, {'doi': '10.1002/rse2.371', 'saef_author': ['KrystalRandall', ' SharonRobinson', ' MelindaWaterman'], 'collab_author': 'Arko Lucieer', 'collab_orcid': '0000-0002-9468-4516', 'collab_inst': 'University of Tasmania'}, {'doi': '10.1002/rse2.371', 'saef_author': ['KrystalRandall', ' SharonRobinson', ' MelindaWaterman'], 'collab_author': 'R. Haynes', 'collab_orcid': '0000-0002-2785-2700', 'collab_inst': 'University of Tasmania'}, {'doi': '10.1002/rse2.371', 'saef_author': ['KrystalRandall', ' SharonRobinson', ' MelindaWaterman'], 'collab_author': 'Krystal Randal

In [None]:
def _name_parts(camel_name):
    """Split a CamelCase name such as 'KrystalRandall' into ['Krystal', 'Randall'].

    Returns an empty list for an empty string.
    """
    # https://stackoverflow.com/questions/29916065/how-to-do-camelcase-split-in-python
    return re.sub('([A-Z][a-z]+)', r' \1', re.sub('([A-Z]+)', r' \1', camel_name)).split()


# Map each SAEF author to the set of institutions of their co-authors.
# A set is used from the start, so duplicate collaborations never accumulate.
saef_author_collab = {}
for record in collaboration:
    collab_name = record.get('collab_author') or ""   # a missing display name must not break re.search
    for raw_name in record.get('saef_author'):
        author = raw_name.strip().replace("(", "").replace(")", "")
        parts = _name_parts(author)
        if not parts:
            continue  # empty name, e.g. produced by a trailing ';' in the note

        # First name (optionally longer), optional middle name/initial, then surname
        # (optionally hyphenated). Raw f-string keeps \w and \- as regex escapes, and
        # re.escape stops characters in the name from being read as regex syntax.
        # E.g.  re.search(r'Aleks\w* ?\w*.? Terauds\-?\w*', 'Aleksander Terauds')
        pattern = rf"{re.escape(parts[0])}\w* ?\w*.? {re.escape(parts[-1])}\-?\w*"
        if re.search(pattern, collab_name):
            continue  # name match, don't count as a collaboration

        saef_author_collab.setdefault(author, set()).add(record.get('collab_inst'))

In [93]:
# Display the final mapping: cleaned SAEF author name -> set of co-author institutions
# (the "Missing" placeholder appears wherever a co-author had no institution listed).
saef_author_collab

{'KrystalRandall': {'Queensland University of Technology',
  'University of Tasmania',
  'University of Wollongong'},
 'SharonRobinson': {'Aristotle University of Thessaloniki',
  'Australian National University',
  'Biospherical Instruments (United States)',
  'California State University, Northridge',
  'Case Western Reserve University',
  'Centre for Research on Ecology and Forestry Applications',
  'Colorado State University',
  'Cooperative Institute for Research in Environmental Sciences',
  'Cornell University',
  'Donghua University',
  'Duke University',
  'ETH Zurich',
  'Environmental Protection Agency',
  'Exponent (United States)',
  'Finnish Meteorological Institute',
  'Indian Academy of Sciences',
  'Institute of Applied Ecology',
  'KTH Royal Institute of Technology',
  "King's College London",
  'Kingston University',
  'Leibniz Institute of Environmental Medicine',
  'Linnaeus University',
  'Loyola University New Orleans',
  'Manaaki Whenua – Landcare Research',
  '

In [86]:
# Scratch cell. The two commented lines are earlier experiments with the name-matching regex:
# re.search(r'Aleks\w* ?\w*.? Terauds\-?\w*', 'Aleksander Terauds')
# re.search(r'Sharon\w* ?\w*.? Robinson\-?\w*', 'Sharon A. Robinson')
# Check that stripping whitespace and removing parentheses yields a clean CamelCase name.
' M(e)lindaWaterman'.strip().replace("(", "").replace(")", "")


'MelindaWaterman'

In [None]:
# Worked example: reduce a CamelCase name to "<first> <last>", dropping any middle names.
# Split technique from https://stackoverflow.com/questions/29916065/how-to-do-camelcase-split-in-python
name = 'LarissaLubianaBotelho'
spaced_caps = re.sub('([A-Z]+)', r' \1', name)                 # put a space before each run of capitals
spaced_words = re.sub('([A-Z][a-z]+)', r' \1', spaced_caps)    # and before each Capitalised word
split = spaced_words.split()
first_name, last_name = split[0], split[-1]
print(f"{first_name} {last_name}")

Larissa Botelho
