# Bioinformatics tools and training materials knowledge graph 
### BioSchemas to bridge Bio.Tools and TeSS 

**Issue and objective**: TeSS training materials are not always annotated with the EDAM Bioinformatics ontology. Since Bio.Tools software is annotated with EDAM topics and operations, the idea is to navigate from training materials to their associated software in order to retrieve relevant EDAM annotations. 

**Results**: We build a knowledge graph (KG) leveraging i) the EDAM ontology, ii) BioSchemas tools and training material profiles, and iii) lifted content from Bio.Tools and TeSS. This KG can further be exploited with semantic queries (SPARQL); for instance, we can search for training materials based on bioinformatics definitions or synonyms formalized in the EDAM ontology. 

Contact : alban.gaignard@univ-nantes.fr

Work done during the [Elixir BioHackathon 2019](https://www.biohackathon-europe.org) 

# 1. Transforming a TeSS entry to BioSchema

In [10]:
import json
import requests
import urllib3
import pandas as pd
from rdflib import ConjunctiveGraph
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Unfiltered listing of TeSS training materials. To restrict the listing to a
# single topic, use for instance:
#url = 'https://tess.elixir-europe.org/materials.json_api?scientific_topics=RNA-Seq'
url = 'https://tess.elixir-europe.org/materials.json_api'

# A timeout keeps the notebook from hanging forever on an unresponsive server,
# and raise_for_status() surfaces HTTP errors here instead of as a confusing
# JSON decoding failure on an error page.
r = requests.get(url, timeout=30)
r.raise_for_status()
training = r.json()

#print(json.dumps(training, indent=2))

def rdfize_tess(item):
    """
    Convert one TeSS training-material record into a JSON-LD document.

    One ``schema:CreativeWork`` node is emitted per external resource whose
    URL contains ``bio.tools``; each node links the training material to that
    resource through ``schema:hasPart``. Records without such a resource
    yield a document with an empty ``@graph``.

    :param item: a dict with an ``attributes`` entry; ``url`` and ``title``
        are read from it when a bio.tools resource is found, and
        ``external-resources`` is expected to be a list of dicts with a
        ``url`` key (it may also be missing, None or empty).
    :return: the JSON-LD document serialized as a string (4-space indent,
        sorted keys).
    """
    attributes = item['attributes']

    # The key may be missing or explicitly null; both mean "no resources".
    external_resources = attributes.get('external-resources') or []

    graph = [
        {
            '@id': attributes['url'],
            '@type': 'schema:CreativeWork',
            'name': attributes['title'],
            'hasPart': {'@id': resource['url']},
        }
        for resource in external_resources
        # A resource without a usable URL cannot point to bio.tools.
        if 'bio.tools' in (resource.get('url') or '')
    ]

    jsonld = {
        '@context': {
            "edam": "http://edamontology.org/",
            "schema": "http://schema.org/",
            "name": "schema:name",
            "hasPart": "schema:hasPart",
        },
        '@graph': graph,
    }
    return json.dumps(jsonld, indent=4, sort_keys=True)

print('Sample JSON-LD data for the a TeSS training material\n')
# Only the first record of the listing is shown, as an illustration.
for item in training['data'][:1]:
    print(rdfize_tess(item))

Sample JSON-LD data for the a TeSS training material

{
    "@context": {
        "edam": "http://edamontology.org/",
        "hasPart": "schema:hasPart",
        "name": "schema:name",
        "schema": "http://schema.org/"
    },
    "@graph": [
        {
            "@id": "https://doi.org/10.5281/zenodo.3689221",
            "@type": "schema:CreativeWork",
            "hasPart": {
                "@id": "https://bio.tools/tool/Data_Stewardship_Wizard"
            },
            "name": "Data Stewardship Wizard Workshop (Feb 2020)"
        }
    ]
}


# 2. Crawling the whole TeSS registry

In [3]:


def crawl_tess(limit=-1):
    """
    Crawl the TeSS training-material listing and collect it as an RDF graph.

    Pages of the TeSS JSON API are fetched one after the other; every entry is
    converted to JSON-LD with ``rdfize_tess`` and parsed into a single graph.
    Progress is printed every 200 entries.

    :param limit: an integer value specifying the max number of entries to be crawled, -1 by default, means no limit.
    :return: a ConjunctiveGraph with the triples gathered so far; the graph is
        partial when the limit was reached or when an HTTP or JSON decoding
        error interrupted the crawl (the error is printed, not raised).
    """
    base_url = 'https://tess.elixir-europe.org/materials.json_api?page='
    graph = ConjunctiveGraph()

    http = urllib3.PoolManager()
    http.headers['Accept'] = 'application/json'
    http.headers['Content-type'] = 'application/json'

    count = None  # total number of entries, read from the first page
    nb_tools = 0  # entries processed so far
    page = 1
    has_next_page = True

    try:
        while has_next_page:
            #print('Crawling page '+str(page))
            req = http.request('GET', base_url + str(page))
            body = req.data.decode('utf-8')
            try:
                entry = json.loads(body)
            except json.JSONDecodeError:
                print("Json decode error for " + body)
                break

            # The total is taken from the first page instead of issuing a
            # dedicated extra request for it.
            if count is None:
                count = int(entry['meta']['results-count'])

            # JSON:API allows an unavailable link to be either omitted or
            # null, so test the value rather than the mere presence of the key.
            has_next_page = bool((entry.get('links') or {}).get('next'))

            for tool in entry['data']:
                graph.parse(data=rdfize_tess(tool), format='json-ld')

                nb_tools += 1
                if nb_tools % 200 == 0 and count:
                    print(str(round(nb_tools * 100 / count)) + " % done")
                if limit != -1 and nb_tools >= limit:
                    return graph
            page += 1

    except urllib3.exceptions.HTTPError as e:
        print(e)

    return graph

# Crawl the complete registry, then dump the graph in two RDF syntaxes.
g = crawl_tess(limit=-1)
print(len(g))
for destination, options in (
    ('tess.bioschema.ttl', {'format': 'turtle'}),
    ('tess.bioschema.jsonld', {'format': 'json-ld', 'auto_compact': True}),
):
    g.serialize(destination=destination, **options)
    print('BioSchema output produced at ' + destination)

15 % done
29 % done
44 % done
59 % done
74 % done
88 % done
666
BioSchema output produced at tess.bioschema.ttl
BioSchema output produced at tess.bioschema.jsonld


# 3. Feeding the KG

In [4]:
%%time
from rdflib import ConjunctiveGraph

# Merge the TeSS dump, the Bio.Tools dump and the EDAM ontology into one
# graph, printing the cumulative number of triples after each load.
kg = ConjunctiveGraph()
for source, parse_options in (
    ('tess.bioschema.ttl', {'format': 'turtle'}),
    ('biotools.bioschema.ttl', {'format': 'turtle'}),
    # No explicit format here: rdflib is left to work it out for the OWL file.
    ('/Users/gaignard-a/Documents/Dev/fresh-toolbox/EDAM.owl', {}),
):
    kg.parse(source, **parse_options)
    print(len(kg))

666
196201
229690
CPU times: user 16.1 s, sys: 157 ms, total: 16.3 s
Wall time: 16.4 s


# 4. Querying the KG
This query searches for training materials that are linked to tools through the `schema:hasPart` relation. The descriptions of the tools are then navigated to retrieve their names, their associated topics, and the operations they perform.  

In [8]:
# Prefixes are declared explicitly so that the query does not depend on
# whichever namespace bindings the parsed files happened to register on the
# graph.
query = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX schema: <http://schema.org/>

SELECT ?mat_label ?t_label ?op_label ?tool_name WHERE {
    ?mat schema:hasPart ?tool ;
        schema:name ?mat_label .
    ?tool schema:applicationSubCategory ?topic ;
       schema:featureList ?operation ;
       schema:name ?tool_name .
    ?topic rdfs:label ?t_label .
    ?operation rdfs:label ?op_label .
}
"""
results = kg.query(query)

# One row per (training material, tool, topic, operation) combination.
# The loop variable is not called `r`, so the HTTP response bound to `r`
# earlier in the notebook is left intact.
res_data = [
    {
        'Training material': str(row['mat_label']),
        'Tool': str(row['tool_name']),
        'Topic': str(row['t_label']),
        'Operation': str(row['op_label']),
    }
    for row in results
]

df = pd.DataFrame(res_data)
df

Unnamed: 0,Operation,Tool,Topic,Training material
0,Statistical calculation,Bioconductor,Computational biology,R / Bioconductor for everyone
1,Analysis,Bioconductor,Computational biology,R / Bioconductor for everyone
2,Data handling,Bioconductor,Computational biology,R / Bioconductor for everyone
3,Statistical calculation,Bioconductor,Bioinformatics,R / Bioconductor for everyone
4,Analysis,Bioconductor,Bioinformatics,R / Bioconductor for everyone
5,Data handling,Bioconductor,Bioinformatics,R / Bioconductor for everyone
6,Statistical calculation,Bioconductor,Data management,R / Bioconductor for everyone
7,Analysis,Bioconductor,Data management,R / Bioconductor for everyone
8,Data handling,Bioconductor,Data management,R / Bioconductor for everyone
9,Statistical calculation,Bioconductor,Statistics and probability,R / Bioconductor for everyone
