In [203]:
import requests
from src.utils.epmc_utils import search_europe_pmc

# Get tools from bio.tools

* This example only requests the first four result pages (40 proteomics tools)

In [None]:
# Download the tools annotated with the "proteomics" topic from the bio.tools API.
# The page number is appended to `base_url` for every request.
base_url = "https://bio.tools/api/tool/?topic=%22proteomics%22&format=json&page="
page = 1
tools = []

# Only pages 1-4 are requested; the loop stops early on the first non-200 response.
while page < 5:
    # requests has no default timeout, so without one a stalled connection
    # would block the notebook forever.
    response = requests.get(base_url + str(page), timeout=30)

    if response.status_code != 200:
        break

    # Each page carries its tool records under the 'list' key.
    tools.extend(response.json()['list'])
    page += 1

print("Got {} tools from bio.tools".format(len(tools)))

# Separate tools

According to publication details in the json, tools are separated into:

* tools without publication details (tools_no_publication)
* tools with publication but no DOI (tools_no_doi)
* tools with at least one publication, whether or not it has a DOI (tools_subset)

In [None]:
# Split the downloaded tools by the publication details they carry:
#   tools_no_publication - tools whose 'publication' field is empty
#   tools_no_doi         - tools whose first publication entry has no DOI
#   tools_subset         - every tool that has at least one publication entry
#                          (this includes the tools in tools_no_doi)
tools_no_doi, tools_no_publication, tools_subset = [], [], []

for tool in tools:
    if not tool['publication']:
        tools_no_publication.append(tool)
        continue

    # Built in the same pass instead of re-scanning tools_no_publication with
    # `not in`, which deep-compared every tool dict against every excluded one.
    tools_subset.append(tool)

    # Only the first publication entry is inspected.
    if tool['publication'][0].get('doi') is None:
        tools_no_doi.append(tool)


print("Got {} tools without DOI".format(len(tools_no_doi)))
print("Got {} tools without publication".format(len(tools_no_publication)))

Check how many tools in `tools_subset` lack a PMID or a PMCID

In [None]:
# Count the tools whose first publication entry has no PMID / no PMCID
# before anything is updated from Europe PMC.
tools_no_id = list(
    filter(lambda entry: entry['publication'][0]['pmid'] is None, tools_subset)
)
tools_no_pmcid = list(
    filter(lambda entry: entry['publication'][0]['pmcid'] is None, tools_subset)
)

print(f"Got {len(tools_no_id)} tools without PMID")
print(f"Got {len(tools_no_pmcid)} tools without PMCID")

# Tools with Publication and DOI

For tools with a publication, query Europe PMC and update the publication fields.

In [None]:
# Look up the first publication of every tool in Europe PMC and, when the
# returned identifiers differ from the stored ones, replace the stored entry.
# NOTE(review): get_publication_from_match and save_tools_to_json are not
# imported or defined in the visible part of this notebook - confirm they are
# in scope before this cell runs on a fresh kernel.
updated_tools = []

for tool in tools_subset:
    publication = tool['publication'][0]

    # Query Europe PMC with the DOI, or with the PMID if no DOI is available.
    query = publication.get('doi') or publication.get('pmid')
    if not query:
        # Neither identifier is present: there is nothing to search for, and
        # passing None on could pull in an unrelated record.
        continue

    response = search_europe_pmc(query)
    if not response or response.get('hitCount', 0) <= 0:
        continue

    # Only the first hit of the result list is used.
    match = response['resultList']['result'][0]

    original_publication = {
        'doi': publication.get('doi'),
        'pmid': publication.get('pmid'),
        'pmcid': publication.get('pmcid'),
    }

    # presumably a dict of publication identifiers built from the hit - TODO confirm
    updated_publication = get_publication_from_match(match)
    if not updated_publication or updated_publication == original_publication:
        continue

    # Assuming Europe PMC has the most up-to-date information: an identifier
    # it does not report is stored as None rather than left out.
    for key in ('doi', 'pmid', 'pmcid'):
        updated_publication.setdefault(key, None)

    # NOTE(review): this replaces the whole publication entry, so any other
    # keys the original entry carried are dropped.
    tool['publication'][0] = updated_publication
    updated_tools.append(tool)


print(f"Updated {len(updated_tools)} tools with publication information.")

save_tools_to_json(updated_tools, "updated_tools.json")



In [None]:
# Re-count the tools without PMID / PMCID.
# `tools2` is not defined anywhere in the visible notebook, so this cell failed
# with a NameError on a fresh kernel. `tools_subset` is used instead.
# NOTE(review): confirm tools_subset is the intended collection here.
tools_no_id = [tool for tool in tools_subset if tool['publication'][0].get('pmid') is None]
tools_no_pmcid = [tool for tool in tools_subset if tool['publication'][0].get('pmcid') is None]

print("Got {} tools without PMID".format(len(tools_no_id)))
print("Got {} tools without PMCID".format(len(tools_no_pmcid)))

# Tools without DOI

In [None]:
# For tools whose first publication has no DOI, search Europe PMC by PMID and
# replace the stored entry when the returned identifiers differ.
# NOTE(review): this cell calls update_publication_with_match while the cell
# that processes tools_subset calls get_publication_from_match; neither is
# defined in the visible notebook - confirm which helper is intended.
updated_tools_doi = []

for tool in tools_no_doi:
    publication = tool['publication'][0]

    # Without a PMID there is nothing to search with.
    if publication['pmid'] is None:
        continue

    response = search_europe_pmc(publication['pmid'])
    if not (response and response.get('hitCount', 0) > 0):
        continue

    # Only the first hit of the result list is used.
    match = response['resultList']['result'][0]

    original_publication = {
        'doi': publication.get('doi'),
        'pmid': publication.get('pmid'),
        'pmcid': publication.get('pmcid'),
    }
    updated_publication = update_publication_with_match(match)

    if not (updated_publication and updated_publication != original_publication):
        continue

    # Identifiers missing from the Europe PMC result are stored as None.
    for key in ('doi', 'pmid', 'pmcid'):
        if key not in updated_publication:
            updated_publication[key] = None

    tool['publication'][0] = updated_publication
    updated_tools_doi.append(tool)

print(f"Updated {len(updated_tools_doi)} tools with publication information.")

# TODO: update the original publication in place instead of replacing it with the updated publication, overwriting only the changed fields so that all the other metadata is kept