# How to crawl bio.tools with its API?

This notebook shows you how to automate the crawling of bio.tools in order to filter or transform its content. Please send any comments or feedback to alban.gaignard@univ-nantes.fr.

In [1]:
import urllib3
import requests
import json
import argparse
from argparse import RawTextHelpFormatter
from json.decoder import JSONDecodeError
import time
import sys, os
from rdflib import ConjunctiveGraph

# 1. Extending a JSON document with semantics (@context) -> JSON-LD

In [9]:
def rdfize(json_entry):
    """
    Transforms a biotools json entry into RDF, and returns a JSON-LD serialization.

    The entry is extended with a JSON-LD ``@context`` and with the following derived
    properties: author (from ``credit``), hasPublication (from ``publication``),
    hasOperation / hasInputData / hasOutputData (from ``function``) and hasTopic
    (from ``topic``).

    The input dictionary is never modified: enrichment happens on a shallow copy and
    every derived list is freshly built, so calling this function twice on the same
    entry yields the same document. Missing or null optional fields are skipped
    instead of aborting the whole conversion.

    :param json_entry: dict decoded from the bio.tools JSON description of one tool.
    :return: the JSON-LD document as a string (4-space indent, sorted keys).
    """

    ctx = {
        "@context": {
            "@base": "https://bio.tools/",
            "biotools": "https://bio.tools/ontology/",
            "edam": "http://edamontology.org/",
            "pubmed": "https://www.ncbi.nlm.nih.gov/pubmed/",
            "pmc": "https://www.ncbi.nlm.nih.gov/pmc/",
            "doi": "https://doi.org/",
            "dc": "http://purl.org/dc/terms/",
            "rdfs": "http://www.w3.org/2000/01/rdf-schema#",

            "sc": "http://schema.org/",
            "bsc": "http://bioschemas.org/",

            # Plain bio.tools keys mapped onto schema.org properties.
            "description": 'sc:description',
            "name": "sc:name",
            "homepage": "sc:url",
            "toolType": 'sc:additionalType',
            "hasTopic": 'sc:applicationSubCategory',
            "author": 'sc:author',
            "hasPublication": "sc:citation",
            "hasOperation": "sc:featureList",
            "license": "sc:license",
            "version": "sc:version",

            # Derived keys mapped onto EDAM properties.
            "hasInputData": "edam:has_input",
            "hasOutputData": "edam:has_output",
        }
    }

    # Shallow copy: all keys written below are (re)bound on the copy only.
    entry = dict(json_entry)
    entry.update(ctx)

    def add(key, value):
        # Re-bind to a new list rather than appending in place, so that a list
        # possibly shared with the caller's dictionary is never mutated.
        entry[key] = (entry.get(key) or []) + [value]

    if 'biotoolsID' in entry:
        entry['@id'] = str(entry['biotoolsID'])
    else:
        # Without an identifier the tool has no '@id' in the JSON-LD document.
        print("Warning: bio.tools entry without 'biotoolsID', no '@id' was set",
              file=sys.stderr)
    entry['@type'] = ['sc:SoftwareApplication']
    entry['applicationCategory'] = 'Computational science tool'

    for credit in entry.get('credit') or []:
        if credit.get('name'):
            add('author', credit['name'])

    for publication in entry.get('publication') or []:
        pmid = publication.get('pmid')
        if pmid:
            add('hasPublication', 'pubmed:' + str(pmid))
        pmcid = publication.get('pmcid')
        if pmcid:
            # 'pmc' is the prefix declared in the @context above.
            add('hasPublication', 'pmc:' + str(pmcid))
        doi = publication.get('doi')
        # DOIs holding angle brackets are skipped; presumably because '<' and '>'
        # cannot appear in the IRI built from them.
        if doi and '<' not in doi and '>' not in doi:
            add('hasPublication',
                {"@id": "https://doi.org/" + doi, "@type": "sc:CreativeWork"})

    for function in entry.get('function') or []:
        for operation in function.get('operation') or []:
            if operation.get('uri'):
                add('hasOperation', {"@id": operation['uri']})

        for key, side in (('hasInputData', 'input'), ('hasOutputData', 'output')):
            for datum in function.get(side) or []:
                uri = (datum.get('data') or {}).get('uri')
                if uri:
                    add(key, {"@id": uri})

    for topic in entry.get('topic') or []:
        if topic.get('uri'):
            add('hasTopic', {"@id": topic['uri']})

    raw_jld = json.dumps(entry, indent=4, sort_keys=True)
    return raw_jld

# 2. Crawling all the content of https://bio.tools

In [10]:
# Silence urllib3's InsecureRequestWarning for the rest of the kernel session.
# NOTE(review): no request visible in this notebook turns certificate
# verification off, so this may be unnecessary -- confirm before keeping it.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)



def crawl_biotools(keyword, limit=-1):
    """
    Go through the paginated bio.tools tool listing, convert every entry with
    rdfize() and load the resulting JSON-LD into a single RDF graph.

    :param keyword: currently unused; kept so that existing calls such as
        ``crawl_biotools("", limit=-1)`` keep working.
    :param limit: an integer value specifying the max number of entries to be crawled,
        -1 by default (like any negative value), means no limit.
    :return: a ConjunctiveGraph holding the triples of all crawled entries. The graph
        is partial when the crawl stops early (limit reached, undecodable page,
        non-200 response or urllib3 HTTP error).
    """
    graph = ConjunctiveGraph()
    if limit == 0:
        # Nothing requested: avoid any network traffic.
        return graph

    http = urllib3.PoolManager()
    http.headers['Accept'] = 'application/json'
    http.headers['Content-type'] = 'application/json'

    count = None      # total number of entries, read from the first page
    nb_tools = 0      # number of entries loaded into the graph so far
    page = 1
    has_next_page = True

    try:
        while has_next_page:
            # Query parameters are separated by '&' only.
            req = http.request('GET', 'https://bio.tools/api/tool/?page=' + str(page) + '&format=json')
            if req.status != 200:
                print("HTTP status " + str(req.status) + " for page " + str(page))
                break

            body = req.data.decode('utf-8')
            try:
                listing = json.loads(body)
            except JSONDecodeError:
                print("Json decode error for " + body)
                break

            if count is None:
                # The first page already carries the total, no extra request needed.
                count = int(listing['count'])
                print(str(count) + " available BioTools entries")

            has_next_page = listing['next'] is not None

            for tool in listing['list']:
                jsonld = rdfize(tool)
                graph.parse(data=jsonld, format='json-ld')

                nb_tools += 1
                if nb_tools % 500 == 0 and count:
                    print(str(round(nb_tools * 100 / count)) + " % done")
                if 0 <= limit <= nb_tools:
                    return graph
            page += 1

    except urllib3.exceptions.HTTPError as e:
        print(e)

    return graph

# 3. Testing Bioschemas for some entries

In [15]:
# Fetch a single bio.tools entry and print its RDF rendering as Turtle.
url = 'https://bio.tools/api/tool/rsat?format=json'
#url = 'https://bio.tools/api/tool/3SRP?format=json'

r = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of RDF-izing an error payload.
r.raise_for_status()
tool = r.json()

g = ConjunctiveGraph()
g.parse(data=rdfize(tool), format="json-ld")

# Graph.serialize() returns bytes with rdflib < 6 and str from rdflib 6 on,
# so only decode when bytes were actually returned.
turtle = g.serialize(format="turtle")
if isinstance(turtle, bytes):
    turtle = turtle.decode()
print()
print(turtle)


@prefix biotools: <https://bio.tools/ontology/> .
@prefix bsc: <http://bioschemas.org/> .
@prefix dc: <http://purl.org/dc/terms/> .
@prefix doi: <https://doi.org/> .
@prefix edam: <http://edamontology.org/> .
@prefix pmc: <https://www.ncbi.nlm.nih.gov/pmc/> .
@prefix pubmed: <https://www.ncbi.nlm.nih.gov/pubmed/> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix sc: <http://schema.org/> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<https://bio.tools/rsat> a sc:SoftwareApplication ;
    sc:additionalType "Suite",
        "Web application" ;
    sc:applicationSubCategory edam:topic_0203,
        edam:topic_0621,
        edam:topic_0749,
        edam:topic_3125,
        edam:topic_3512 ;
    sc:author "Jacques van Helden" ;
    sc:citation "pubmed:10641039",
        "pubmed:12824373",
        "pubmed:18495751",
        "pubmed:9719638" ;
    sc:descr

# 4. Crawling all the content and producing a dump

In [None]:
%%time
g = crawl_biotools("", limit=-1)
print(len(g))
g.serialize(format='turtle', destination='biotools.bioschema.ttl')
print('BioSchema output produced at biotools.bioschema.ttl')
g.serialize(format='json-ld', destination='biotools.bioschema.json-ld')
print('BioSchema output produced at biotools.bioschema.json-ld')

14767 available BioTools entries
3 % done
7 % done
10 % done
14 % done
17 % done
20 % done
24 % done
27 % done
30 % done
34 % done
37 % done
41 % done
44 % done
47 % done
51 % done
54 % done
58 % done
61 % done
64 % done
68 % done
71 % done
74 % done
78 % done
81 % done
85 % done
88 % done
91 % done
95 % done
98 % done
205719
BioSchema output produced at biotools.bioschema.ttl
