In [None]:
# ==============================
# ORCID -> TAXID Prototype
# ==============================

import requests
import json

# Demo-sized lookup table mapping an organism's scientific name to its TAXID.
# Entry order is kept as listed; only names present here can be detected.
organism_dict = dict([
    ("Homo sapiens", 9606),
    ("Escherichia coli", 562),
    ("Saccharomyces cerevisiae", 4932),
    ("Mus musculus", 10090),
    ("Drosophila melanogaster", 7227),
    ("Caenorhabditis elegans", 6239),
    ("Arabidopsis thaliana", 3702),
])

def get_publications_by_orcid(orcid_id, page_size=10, timeout=30):
    """
    Query Europe PMC for publications by the given ORCID.

    Only a single page of results is requested. Every failure mode
    (network error or timeout, non-200 status, body that is not JSON) is
    reported with ``print`` and produces an empty list rather than an
    exception, so callers only need to check for an empty result.

    :param orcid_id: ORCID string, e.g. "0000-0002-1825-0097"
    :param page_size: Number of results to retrieve per page (for demo).
    :param timeout: Seconds to wait for the server before giving up.
    :return: List of publication records (JSON); empty on any failure.
    """
    base_url = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
    query = f"AUTHORID:{orcid_id}"

    params = {
        "query": query,
        "format": "json",
        "resultType": "core",
        "pageSize": page_size
    }

    # Without a timeout, requests waits forever on an unresponsive server,
    # which would freeze the notebook cell.
    try:
        response = requests.get(base_url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Request failed: {exc}")
        return []

    if response.status_code != 200:
        print(f"Request failed with status code {response.status_code}")
        return []

    # A 200 response is not guaranteed to carry valid JSON (e.g. a proxy
    # error page); JSON decoding errors are ValueError subclasses.
    try:
        data = response.json()
    except ValueError as exc:
        print(f"Response was not valid JSON: {exc}")
        return []

    # `or` guards against keys that are present but null.
    result_list = data.get("resultList") or {}
    return result_list.get("result") or []

def extract_organisms_in_abstract(abstract_text, organism_dict):
    """
    Report which known organism names occur verbatim in an abstract.

    This is a deliberately simple, case-sensitive substring test: a name
    counts as found when it appears anywhere inside `abstract_text`.

    :param abstract_text: Text of the abstract (may be empty or None)
    :param organism_dict: Dictionary mapping organism names to TAXIDs
    :return: Set of matched organism names (empty when nothing matches
             or when there is no abstract text)
    """
    if abstract_text:
        return {name for name in organism_dict if name in abstract_text}
    # No text to search: nothing can match.
    return set()

def main(orcid_id):
    """
    Print a report of the publications found for an ORCID.

    For every publication returned by `get_publications_by_orcid`, prints
    the title, journal and author list, then finishes with a summary of
    the organisms from `organism_dict` that were detected in the abstracts.

    :param orcid_id: ORCID string to look up
    :return: None; all results are written to standard output
    """
    # 1. Retrieve publications
    pubs = get_publications_by_orcid(orcid_id)

    if not pubs:
        print("No publications found or an error occurred.")
        return

    all_found_organisms = set()

    # 2. Parse each publication
    for i, pub in enumerate(pubs, start=1):
        title = pub.get("title", "No title")
        abstract = pub.get("abstractText", "")
        journal = pub.get("journalInfo", {}).get("journal", {}).get("title", "Unknown journal")

        # Extract organisms from abstract
        found_in_pub = extract_organisms_in_abstract(abstract, organism_dict)
        all_found_organisms.update(found_in_pub)

        print(f"=== Publication #{i} ===")
        print(f"Title: {title}")
        print(f"Journal: {journal}")

        # 3. Print authors and their ORCIDs if available
        author_list = pub.get("authorList", {}).get("author", [])
        print("Authors:")
        for author in author_list:
            full_name = author.get("fullName", "Unknown author")
            # Some authors may have no authorId OR a different type
            # e.g. 'authorId': {'type': 'ORCID', 'value': '...'}
            author_id = author.get("authorId") or {}
            # Only label the value as an ORCID when the id is typed as one;
            # an id without a type is still shown, as before.
            if author_id.get("type", "ORCID") == "ORCID":
                orcid = author_id.get("value", "No ORCID found")
            else:
                orcid = "No ORCID found"
            print(f"  - {full_name} | ORCID: {orcid}")

        print()

    # 4. Print summary of organisms found
    print(f"Total publications found: {len(pubs)}")
    print("Organisms found across these publications:")
    # Sets have no stable iteration order; sort so the report is
    # identical from run to run.
    for org in sorted(all_found_organisms):
        tax_id = organism_dict[org]
        print(f"  - {org} (TAXID: {tax_id})")

# ORCID used for the demo run; replace with any valid ORCID.
test_orcid = "0000-0003-3130-8043"
# Alternative ORCID kept for experimentation:
# test_orcid = "0000-0003-4896-7109"
# Uncomment to print the full report (main() fetches publications over the network):
# main(test_orcid)


In [None]:
import os
import requests
from google.colab import files

# Define the default file name to look for
# default_file = "tuberculosis-paper.pdf"
default_file = "tuberculosis-abstract.txt"

# Use the file from the current working directory when it exists;
# otherwise ask the user to upload one through the Colab file picker.
if os.path.exists(default_file):
    file_path = default_file
    print(f"Found file: {file_path}")
else:
    uploaded = files.upload()
    # The first uploaded name is used as the path; fail with a clear message
    # instead of a bare StopIteration when nothing was uploaded.
    if not uploaded:
        raise FileNotFoundError(
            f"{default_file} was not found and no file was uploaded."
        )
    file_path = next(iter(uploaded))

# The API endpoint
endpoint = "https://finder.globalnames.org/api/v1/find"

# Data payload: leave 'text' empty so the file is used
payload = {
    "text": "",  # Leave empty to use the file
    "format": "json",
    "bytesOffset": "false",
    "returnContent": "false",
    "uniqueNames": "true",
    "ambiguousNames": "true",
    "allMatches": "false"
}

# Send the file as multipart form data. The timeout keeps the cell from
# hanging indefinitely if the service does not answer.
with open(file_path, "rb") as f:
    files_payload = {"file": f}
    response = requests.post(endpoint, data=payload, files=files_payload, timeout=60)

# Parse the JSON response and print each detected name with its log10 odds
if response.status_code == 200:
    data = response.json()  # Convert response to a dict
    # `or []` covers a response whose 'names' entry is missing or null.
    for item in data.get('names') or []:
        # .get avoids a KeyError if an entry lacks one of the fields;
        # a missing value is printed as an empty name / None.
        detected_word = item.get('verbatim', '')
        odds_ratio = item.get('oddsLog10')
        print(f"{detected_word}: {odds_ratio}")
else:
    print(f"Error {response.status_code}: {response.text}")


Saving tuberculosis-abstract.pdf to tuberculosis-abstract.pdf
Nigeria: -0.17319842126322213
Nigeria: -0.17319842126322213
M. bovis): 5.351755643347912


In [None]:
# Fetch the demo ORCID's publications, blank out every record's author list
# in place, and show the resulting records as the cell output.
test_orcid = "0000-0003-3130-8043"
pubs = get_publications_by_orcid(test_orcid)
for pub in pubs:
    pub.update({'authorList': {}})
pubs