In [45]:
import pyalex
from pyalex import Works
import json
import pandas as pd
from collections import defaultdict

In [61]:
# Contact email stored in pyalex's global config.
# NOTE(review): presumably pyalex sends this with each OpenAlex request to identify
# the caller (the API's "polite pool") — confirm against the pyalex documentation.
pyalex.config.email = "cthoyt@gmail.com"

# Two-letter country codes that are already well represented; institutions whose
# `country_code` is in this set are skipped when collecting under-represented countries.
# Kept in alphabetical order so additions are easy to spot.
usual_suspects = {
    "AT",  # Austria
    "AU",  # Australia
    "BE",  # Belgium
    "CA",  # Canada
    "CH",  # Switzerland
    "CN",  # China
    "CZ",  # Czechia
    "DE",  # Germany
    "DK",  # Denmark
    "ES",  # Spain
    "FI",  # Finland
    "FR",  # France
    "GB",  # Great Britain
    "HU",  # Hungary
    "IE",  # Ireland
    "IT",  # Italy
    "NL",  # Netherlands
    "NO",  # Norway
    "PT",  # Portugal
    "SE",  # Sweden
    "US",  # USA
}

def get_underrepresented_countries(work, excluded_countries=None, country_names=None):
    """Group a work's institutions by country, skipping well-represented countries.

    Parameters
    ----------
    work:
        A work record (dict) with an ``'authorships'`` list; each authorship has an
        ``'institutions'`` list of dicts carrying ``'id'``, ``'country_code'`` and
        ``'display_name'``.
    excluded_countries:
        Container of country codes to ignore. Defaults to the module-level
        ``usual_suspects`` set, resolved at call time.
    country_names:
        Mapping from country code to display name. Defaults to the module-level
        ``country_code_to_name`` dict, resolved at call time.

    Returns
    -------
    dict
        ``{country display name: {institution id: institution display name}}``,
        where the institution id has the ``https://openalex.org/`` prefix removed.
        Institutions lacking an id or a country code are skipped.
    """
    # Resolve the defaults lazily so the function can be defined before the
    # notebook cells that build these globals have been executed.
    if excluded_countries is None:
        excluded_countries = usual_suspects
    if country_names is None:
        country_names = country_code_to_name

    by_country = defaultdict(dict)
    for authorship in work['authorships']:
        for institution in authorship['institutions']:
            iid = institution.get('id')
            country_code = institution.get('country_code')
            if not iid or not country_code or country_code in excluded_countries:
                continue
            iid = iid.removeprefix("https://openalex.org/")
            # A code missing from the lookup used to raise KeyError and abort the
            # whole batch; fall back to the raw code so the institution is kept.
            country_display = country_names.get(country_code, country_code)
            by_country[country_display][iid] = institution['display_name']
    return dict(by_country)

In [49]:
# Lookup table from country code to display name, built from the groups returned
# when works are grouped by "institutions.country_code": each group record
# contributes its 'key' (the code) and 'key_display_name' (the label).
# NOTE(review): the query is restricted with institutions.is_global_south=True, yet the
# recorded output has 200 entries — presumably co-author countries are included; confirm.
# NOTE(review): get_underrepresented_countries() reads this name, so this cell has to
# run before any cell that calls it.
country_code_to_name = {
    r['key']: r['key_display_name']
    for r in (
        Works()
        .filter(institutions={"is_global_south":True})
        .group_by("institutions.country_code")
        .get()
    )
}
# Displayed as the cell output: number of country codes in the lookup.
len(country_code_to_name)

200

In [50]:
# Accumulator shared by the query cells:
# country display name -> {work key (DOI, else OpenAlex id) -> {institution id -> institution name}}
results = defaultdict(dict)


def _append(works):
    """Fold a batch of works into the module-level ``results`` mapping.

    For every work, the institutions returned by
    ``get_underrepresented_countries`` are stored under
    ``results[country][work_key]``. Re-appending the same work overwrites its
    previous entry, so the function can be called repeatedly.

    Parameters
    ----------
    works:
        Iterable of work records (dicts) with at least ``'authorships'`` and
        either a ``'doi'`` or an ``'id'``.
    """
    for work in works:
        # Works without a DOI would otherwise all be keyed by None and silently
        # overwrite each other; fall back to the work's own id in that case.
        work_key = work.get('doi') or work.get('id')
        for country, institutions in get_underrepresented_countries(work).items():
            results[country][work_key] = institutions

In [51]:
# Journals to scan, identified by ISSN. Every work returned for a journal is passed
# to _append(), which records its under-represented countries in `results`.
issns = [
    "1758-0463", # Database (Oxford)
    "2041-1480", # Journal of Biomedical Semantics
]
# NOTE(review): .get() presumably returns only a single page of results, so a journal
# with more matching works than one page would be truncated — confirm whether
# pagination is needed here.
for issn in issns:
    _append(
        Works()
        .filter(institutions={"is_global_south":True}, host_venue={"issn": issn})
        .get()
    )

In [57]:
# Free-text search terms. Each term is searched on its own, using the same
# is_global_south institution filter as the ISSN queries, and the returned works are
# passed to _append() so they accumulate in `results`.
keywords = [
    "biocuration",
    "biomedical ontology",
    "model organism database",
    "genomics database",
]
# NOTE(review): .get() presumably returns only a single page of search hits per
# keyword — confirm whether taking just that page is intended.
for keyword in keywords:
    _append(
        Works()
        .search(keyword)
        .filter(institutions={"is_global_south":True})
        .get()
    )
    


In [60]:
# DOIs that have already been looked at and should be hidden from the table below.
# Works flagged here did not hold up on inspection.
false_positives = {
    "https://doi.org/10.1093/nar/gkt1026",  # HPO 2013 paper
    "https://doi.org/10.1186/2041-1480-2-s2-s1",
}
# Works flagged here were checked and noted; the comments record what was found.
true_positives = {
    "https://doi.org/10.1186/s13326-016-0080-2",  # Olaf Wolkenhauer had a South Africa affiliation
    "https://doi.org/10.1111/nph.13557",  # Yves Van de Peer has a South Africa affiliation
    "https://doi.org/10.1093/database/bau061",  # Authors from many countries, but no disambiguation nor contact information available
}

# One column per country, one row per work key; each cell holds that work's
# institutions for the country (or NaN). Both reviewed sets are removed in one pass.
df = pd.DataFrame(dict(results))
df = df.loc[~df.index.isin(false_positives | true_positives)]
df

Unnamed: 0,Estonia,Thailand,South Africa,Austria,"Korea, Republic of",New Zealand,Japan,India,Peru,"Iran, Islamic Republic of",...,Costa Rica,Poland,Greece,Ethiopia,Algeria,Tunisia,Fiji,Colombia,Romania,Puerto Rico
https://doi.org/10.1186/s13326-015-0014-4,,,{'I26092322': 'Stellenbosch University'},,,,,,,,...,,,,,,,,,,
https://doi.org/10.1186/2041-1480-2-s2-s1,,,,"{'I5994924': 'Information Retrieval Facility',...",{'I4210092243': 'AstraZeneca'},,,,,,...,,,,,,,,,,
https://doi.org/10.1186/s13326-018-0176-y,,,,,{'I191879574': 'Inha University'},,,,,,...,,,,,,,,,,
https://doi.org/10.1093/nar/gkv350,,,,,{'I139264467': 'Seoul National University'},,,,{'I1318233029': 'International Potato Center'},,...,,,,,,,,,,
https://doi.org/10.1093/database/bar041,,,,,,,{'I74801974': 'University of Tokyo'},,{'I1318233029': 'International Potato Center'},,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
https://doi.org/10.1093/nar/gki450,,,,,,,,,,,...,,,,{'I2802457231': 'New Generation University Col...,,,,,,
https://doi.org/10.1007/978-3-319-49004-5_2,,,,,,,,,,,...,,,,,{'I4210149016': 'École Nationale Supérieure d'...,,,,,
https://doi.org/10.1145/3102254.3102284,,,,,,,,,,,...,,,,,,{'I108714496': 'Tunis University'},,,,
https://doi.org/10.1038/srep03376,,,,,,,,,,,...,,,,,,,{'I44666525': 'University of the South Pacific'},,,


In [17]:
# Build one JSON-friendly record per work in `w`, listing only the authors that have
# at least one institution outside the `usual_suspects` countries.
# NOTE(review): `w` is not assigned in any visible cell; it has to be a list of work
# records left over from an earlier query for this cell to run.
r = []
for work in w:
    authors = []
    for authorship in work['authorships']:
        countries = []
        for institution in authorship['institutions']:
            country_code = institution.get('country_code')
            if not institution.get('id') or not country_code or country_code in usual_suspects:
                continue
            # Fall back to the raw code instead of raising KeyError when a country
            # is missing from the lookup table.
            countries.append(country_code_to_name.get(country_code, country_code))
        if not countries:
            continue

        # CAVEAT (originally flagged as "this works badly"): the per-author country
        # attribution is unreliable — in the recorded output every listed author of
        # W2168644196 is tagged with the same country.
        author = authorship['author']
        author_id = author.get('id')
        if author_id:
            author_id = author_id.removeprefix("https://openalex.org/")
        author_dict = dict(
            openalex=author_id,
            name=author.get('display_name'),
            countries=countries,
        )
        orcid = author.get("orcid")
        if orcid:
            # Only emit the key when an ORCID exists, stored without the URL prefix.
            author_dict['orcid'] = orcid.removeprefix("https://orcid.org/")
        authors.append(author_dict)
    r.append(dict(
        id=work['id'],
        title=work['title'],
        authors=authors,
    ))

# ensure_ascii=False keeps non-ASCII characters in names and titles readable.
print(json.dumps(r, indent=2, ensure_ascii=False))

[
  {
    "id": "https://openalex.org/W2168644196",
    "title": "Text-mining-assisted biocuration workflows in Argo",
    "authors": [
      {
        "openalex": "A2158507047",
        "name": "Rafal Rak",
        "countries": [
          "Philippines"
        ]
      },
      {
        "openalex": "A2160147828",
        "name": "Andrew F. Rowley",
        "countries": [
          "Philippines"
        ]
      },
      {
        "openalex": "A2130666468",
        "name": "Jacob Carter",
        "countries": [
          "Philippines"
        ]
      },
      {
        "openalex": "A2141504082",
        "name": "Sophia Ananiadou",
        "countries": [
          "Philippines"
        ],
        "orcid": "0000-0002-4097-9191"
      }
    ]
  },
  {
    "id": "https://openalex.org/W1980075500",
    "title": "Bringing Biocuration to China",
    "authors": [
      {
        "openalex": "A2170102332",
        "name": "Zhang Zhang",
        "countries": [
          "China"
        ],
      

In [5]:
# Display the raw work records held in `w` (the recorded output is the full list of dicts).
# NOTE(review): `w` is never assigned in the visible cells, so this fails on a fresh
# kernel unless an earlier cell defines it.
w

[{'id': 'https://openalex.org/W2168644196',
  'doi': 'https://doi.org/10.1093/database/bau070',
  'title': 'Text-mining-assisted biocuration workflows in Argo',
  'display_name': 'Text-mining-assisted biocuration workflows in Argo',
  'relevance_score': 433.71744,
  'publication_year': 2014,
  'publication_date': '2014-01-01',
  'ids': {'openalex': 'https://openalex.org/W2168644196',
   'doi': 'https://doi.org/10.1093/database/bau070',
   'mag': '2168644196',
   'pmid': 'https://pubmed.ncbi.nlm.nih.gov/25037308',
   'pmcid': 'https://www.ncbi.nlm.nih.gov/pmc/articles/4103424'},
  'host_venue': {'id': 'https://openalex.org/V4210201630',
   'issn_l': '1758-0463',
   'issn': ['1758-0463'],
   'display_name': 'Database',
   'publisher': 'University of Oxford',
   'type': 'journal',
   'url': 'https://academic.oup.com/database/article-pdf/doi/10.1093/database/bau070/17472907/bau070.pdf',
   'is_oa': True,
   'version': 'publishedVersion',
   'license': 'cc-by'},
  'type': 'journal-article',