# How to crawl bio.tools with its API?

This notebook shows you how to automate the crawling of bio.tools in order to filter or transform its content. Please send any comments or feedback to alban.gaignard@univ-nantes.fr.

In [None]:
import urllib3
import json
import argparse
from argparse import RawTextHelpFormatter
from json.decoder import JSONDecodeError
import time
import sys, os
from rdflib import Graph

In [None]:
def rdfize(json_entry):
    """
    Transforms a biotools json entry into RDF, and returns a JSON-LD serialization.

    The entry is annotated with a JSON-LD ``@context``, an ``@id`` (its ``biotoolsID``) and an
    ``@type`` (``biotools:Resource``). The following fields are additionally turned into lists of
    IRI references: publication (PubMed, PMC and DOI identifiers), EDAM operation, EDAM inputs &
    outputs, and EDAM topic. Contacts and download links are not converted.

    The input dictionary is left untouched: the conversion works on a copy.

    :param json_entry: a dictionary describing one bio.tools entry.
    :return: the annotated entry serialized as a JSON string. When a mandatory key is missing
        (``biotoolsID``, or the ``uri`` of an EDAM annotation), the conversion stops, the entry is
        printed, and whatever was converted so far is returned.
    """

    # Shallow copy: only top-level keys are written below, nested values are read-only.
    entry = dict(json_entry)

    # IRI references collected per JSON-LD property; merged into the entry once the walk is over.
    links = {}

    def add_link(prop, iri):
        links.setdefault(prop, []).append({"@id": iri})

    missing_key = None
    try:
        # NOTE(review): the "datacite" prefix used by several terms is not declared in this
        # context, so those terms will not expand to IRIs of a DataCite namespace. Declare the
        # prefix here once the intended namespace IRI is confirmed.
        ctx = {
            "@context": {
                "@base": "https://bio.tools/",
                "biotools": "https://bio.tools/ontology/",
                "edam": "http://edamontology.org/",
                "pubmed": "https://www.ncbi.nlm.nih.gov/pubmed/",
                "pmc": "https://www.ncbi.nlm.nih.gov/pmc/",
                "doi": "https://doi.org/",
                "dc": "http://purl.org/dc/terms/",
                "rdfs": "http://www.w3.org/2000/01/rdf-schema#",

                "id": "dc:identifier",
                "name": "dc:title",
                "description": "dc:description",
                "license": "dc:license",
                "hasContact": "datacite:contributor",
                "toolType": "datacite:resourceType",
                "additionDate": "datacite:date",
                "language": "datacite:format",
                "homepage": "datacite:alternateIdentifier",
                "hasPublication": "dc:references",
                "download": "datacite:alternateIdentifier",

                "hasOperation": "biotools:has_function",
                "hasInputData": "edam:has_input",
                "hasOutputData": "edam:has_output",
                "hasTopic": "edam:has_topic"
            }
        }
        entry['@id'] = str(entry['biotoolsID'])
        entry['@type'] = {"@id": 'biotools:Resource'}
        entry.update(ctx)

        # Optional sections and identifiers are read with .get() so that an absent or null
        # value skips that item instead of aborting the whole conversion.
        for publication in entry.get('publication') or []:
            pmid = publication.get('pmid')
            if pmid:
                add_link('hasPublication', 'pubmed:' + str(pmid))
            pmcid = publication.get('pmcid')
            if pmcid:
                add_link('hasPublication', 'pmc:' + str(pmcid))
            doi = publication.get('doi')
            # DOIs containing angle brackets are skipped: they would not form a valid IRI.
            if doi and "<" not in doi and ">" not in doi:
                add_link('hasPublication', "https://doi.org/" + doi)

        for function in entry.get('function') or []:
            for operation in function.get('operation') or []:
                add_link('hasOperation', operation['uri'])
            for input_item in function.get('input') or []:
                add_link('hasInputData', input_item['data']['uri'])
            for output_item in function.get('output') or []:
                add_link('hasOutputData', output_item['data']['uri'])

        for topic in entry.get('topic') or []:
            add_link('hasTopic', topic['uri'])

    except KeyError as error:
        # Best effort: remember the problem, keep what has been converted so far.
        missing_key = error

    # Extend (a copy of) any pre-existing list so the caller's nested lists are never modified.
    for prop, refs in links.items():
        entry[prop] = list(entry.get(prop) or []) + refs

    if missing_key is not None:
        print("Missing key " + str(missing_key) + " in bio.tools entry:")
        print(json.dumps(entry, indent=4, sort_keys=True))
        print()

    return json.dumps(entry)

In [9]:
import urllib3
import json

# Turn off urllib3's InsecureRequestWarning messages.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def crawl_biotools(keyword, limit=-1):
    """
    Go through all bio.tools entries and print the tool home page if the keyword is found in the tool description.

    The keyword is matched as a case-insensitive substring of the description. The total number
    of available entries is printed first, then a progress percentage every 500 entries.

    :param keyword: the text to look for in the description of each tool.
    :param limit: an integer value specifying the max number of entries to be crawled, -1 by default, means no limit.
    """

    page_url = 'https://bio.tools/api/tool/?format=json&page='
    # Descriptions are lower-cased before matching, so the keyword must be as well.
    needle = str(keyword).lower()

    http = urllib3.PoolManager()
    http.headers['Accept'] = 'application/json'
    http.headers['Content-type'] = 'application/json'

    count = 0
    nb_tools = 0  # number of entries processed so far
    page = 1
    has_next_page = True

    try:
        while has_next_page:
            req = http.request('GET', page_url + str(page))
            body = req.data.decode('utf-8')
            if req.status != 200:
                print("HTTP status " + str(req.status) + " for page " + str(page))
                break
            try:
                entry = json.loads(body)
            except json.JSONDecodeError:
                print("Json decode error for " + body)
                break

            # The total is read from the first page itself, no separate request is needed.
            if page == 1:
                count = int(entry['count'])
                print(str(count) + " available BioTools entries")

            has_next_page = entry['next'] is not None

            for tool in entry['list']:
                if needle in str(tool['description']).lower():
                    print(tool['homepage'])

                nb_tools += 1
                if nb_tools % 500 == 0 and count > 0:
                    print(str(round(nb_tools * 100 / count)) + " % done")
                if limit != -1 and nb_tools >= limit:
                    return
            page += 1

    except urllib3.exceptions.HTTPError as e:
        print(e)

Now we crawl only the first `1000` entries and search for tools with the `snp` keyword in their description field.

In [10]:
# Search tool descriptions for "snp", passing limit=1000 to bound the crawl.
crawl_biotools("snp", limit=1000)

14765 available BioTools entries
https://snps-and-go.biocomp.unibo.it/snps-and-go/
https://asia.ensembl.org
http://github.com/trmznt/vivaxgen-geo
https://www.ncbi.nlm.nih.gov/pubmed/?term=31552442
https://github.com/vitorpavinato/PypeAmplicon
https://github.com/ahvdk/SSNpipe
http://www.ncgd.nbri.res.in/PLANET-SNP-Pipeline.aspx
https://hirisplex.erasmusmc.nl
http://waltzdb.switchlab.org/
http://www.bioinfoindia.org/abcd
http://hzau.edu.cn/SNP2APA/
https://github.com/isglobal-brge/MADloy
3 % done
http://www.genemed.tech/ascrispr
http://www.maizegdb.org
http://snp-seek.irri.org/
https://dbmdega.shinyapps.io/dbMDEGA/
https://www.ncbi.nlm.nih.gov/pubmed/?term=31378650
https://www.ncbi.nlm.nih.gov/pubmed/?term=31373606
https://pubs.broadinstitute.org/mammals/haploreg/haploreg.php
https://github.com/wangying0128/IsomiR_Find
http://PHDB.switchlab.org/
7 % done
