# How to crawl bio.tools with its API?

This notebook shows you how to automate the crawling of bio.tools in order to filter or transform its content. Please send any comments or feedback to alban.gaignard@univ-nantes.fr.

In [33]:
import urllib3
import json

# Suppress urllib3's InsecureRequestWarning messages.
# NOTE(review): the requests in this notebook target https URLs through a default
# PoolManager; whether this warning would actually be emitted depends on the installed
# urllib3 version — confirm the suppression is still needed.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def crawl_biotools(keyword, limit=-1):
    """
    Go through the bio.tools entries page by page and print the home page of every
    tool whose description contains the keyword (case-insensitive match).

    The total number of entries reported by the API is printed once, and a progress
    percentage is printed every 500 examined entries.

    :param keyword: the text searched for in each tool description; it is lower-cased before comparison.
    :param limit: an integer value specifying the max number of entries to be crawled, -1 by default, means no limit.
    :return: None; matches and progress are printed on the standard output.
    """

    http = urllib3.PoolManager(headers={
        'Accept': 'application/json',
        'Content-type': 'application/json',
    })

    # Descriptions are lower-cased before matching, so the keyword must be as well.
    needle = str(keyword).lower()

    page = 1
    nb_tools = 0   # number of entries examined so far
    count = None   # total number of entries, read from the first page

    try:
        while True:
            req = http.request('GET', 'https://bio.tools/api/tool/?page=' + str(page) + '&format=json')
            if req.status != 200:
                # An error payload would not carry the 'count' / 'list' / 'next' keys.
                print("HTTP status " + str(req.status) + " for page " + str(page))
                break

            body = req.data.decode('utf-8')
            try:
                entry = json.loads(body)
            except json.JSONDecodeError:
                print("Json decode error for " + body)
                break

            if count is None:
                # The first page is reused for crawling instead of being fetched twice.
                count = int(entry['count'])
                print(str(count) + " available BioTools entries")

            for tool in entry['list']:
                # str() keeps the match safe when the description is missing (None).
                if needle in str(tool['description']).lower():
                    print(tool['homepage'])

                nb_tools += 1
                if nb_tools % 500 == 0 and count:
                    print(str(round(nb_tools * 100 / count)) + " % done")
                if limit != -1 and nb_tools >= limit:
                    return

            if entry['next'] is None:
                break
            page += 1

    except urllib3.exceptions.HTTPError as e:
        print(e)

Now we just crawl the first `1000` entries and search for tools with the `rare disease` keyword in their description field. 

In [34]:
# Search the descriptions of the first 1000 entries for "rare disease".
crawl_biotools(keyword="rare disease", limit=1000)

12157 available BioTools entries
https://muccg.github.io/rdrf/
http://www.findzebra.com/raredisease
4 % done
8 % done
