In [None]:
import requests
import json

# TeSS JSON:API endpoint listing the training materials. A filter can be
# appended to narrow the listing, e.g. '?scientific_topics=RNA-Seq'.
url = 'https://tess.elixir-europe.org/materials.json_api'

# Bound the wait with a timeout and fail early on an HTTP error status instead
# of trying to decode an error page as JSON.
r = requests.get(url, timeout=30)
r.raise_for_status()
training = r.json()

#print(json.dumps(training, indent=2))

def rdfize_tess(item):
    """
    Convert one TeSS training material into a JSON-LD document.

    For each external resource of the material whose URL contains 'bio.tools',
    one ``schema:CreativeWork`` node is added to the graph. The node is
    identified by the material URL, named after the material title and linked
    to the resource URL through ``hasPart``.

    :param item: one element of the ``data`` list of a TeSS
        ``materials.json_api`` response. ``item['attributes']`` must provide
        ``url`` and ``title`` as soon as a bio.tools resource is present.
    :return: the JSON-LD document as a string (4-space indent, sorted keys).
        Its '@graph' is empty when the material has no bio.tools resource.
    """
    jsonld = {
        '@context': {
            "edam": "http://edamontology.org/",
            "schema": "http://schema.org/",
            "name": "schema:name",
            "hasPart": "schema:hasPart"
        },
        '@graph': [],
    }

    attributes = item['attributes']

    # 'external-resources' may be missing, null or empty: all three mean
    # "nothing to link".
    for extr in attributes.get('external-resources') or []:
        resource_url = extr.get('url')
        # A resource without URL cannot be matched; testing `in None` would
        # raise a TypeError.
        if not resource_url or 'bio.tools' not in resource_url:
            continue
        jsonld['@graph'].append({
            '@id': attributes['url'],
            '@type': 'schema:CreativeWork',
            'name': attributes['title'],
            'hasPart': {'@id': resource_url},
        })

    return json.dumps(jsonld, indent=4, sort_keys=True)

# Preview: print the JSON-LD produced for the first training material only.
for item in training['data'][:1]:
    print(rdfize_tess(item))

In [None]:
from rdflib import ConjunctiveGraph
g = ConjunctiveGraph()

# `jsonld` only exists as a local variable inside rdfize_tess, so the JSON-LD
# string is rebuilt here from the first fetched training material
# (raises IndexError if the API returned no material).
json_ld_as_string = rdfize_tess(training['data'][0])
g.parse(data=json_ld_as_string, format='json-ld')

# serialize() returns bytes with rdflib < 6 and str from rdflib 6 onwards;
# only decode when bytes were actually returned.
turtle = g.serialize(format='turtle')
print(turtle.decode() if isinstance(turtle, bytes) else turtle)

In [None]:
import urllib3

# Silence urllib3's InsecureRequestWarning.
# NOTE(review): nothing visible in this notebook turns certificate verification
# off, so this call may be unnecessary — confirm before keeping it.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def crawl_tess(limit=-1):
    """
    Crawl the TeSS training materials and aggregate them into one RDF graph.

    The pages of the ``materials.json_api`` endpoint are fetched one after the
    other. Every entry is converted to JSON-LD with ``rdfize_tess`` and parsed
    into a single graph. Progress is printed every 200 entries.

    :param limit: an integer value specifying the max number of entries to be
        crawled, -1 by default, means no limit.
    :return: the ConjunctiveGraph holding everything parsed so far. The graph
        may be partial when an HTTP error, an unexpected HTTP status or
        undecodable JSON interrupted the crawl (a message is printed then).
    """
    graph = ConjunctiveGraph()
    if limit == 0:
        return graph

    page_url = 'https://tess.elixir-europe.org/materials.json_api?page='

    http = urllib3.PoolManager()
    http.headers['Accept'] = 'application/json'
    http.headers['Content-type'] = 'application/json'

    count = None      # total number of entries, read from the first page
    nb_entries = 0    # entries processed so far
    page = 1
    has_next_page = True

    try:
        while has_next_page:
            req = http.request('GET', page_url + str(page))
            if req.status != 200:
                print("HTTP status " + str(req.status) + " for page " + str(page))
                break

            body = req.data.decode('utf-8')
            try:
                entry = json.loads(body)
            except json.JSONDecodeError:
                print("Json decode error for " + str(body))
                break

            if count is None:
                # The total is part of every page; reading it from the first
                # one avoids a dedicated extra request.
                count = int(entry['meta']['results-count'])

            # A missing or null 'next' link both mean this is the last page.
            has_next_page = bool((entry.get('links') or {}).get('next'))

            for material in entry['data']:
                graph.parse(data=rdfize_tess(material), format='json-ld')

                nb_entries += 1
                if nb_entries % 200 == 0 and count:
                    print(str(round(nb_entries * 100 / count)) + " % done")
                if limit != -1 and nb_entries >= limit:
                    return graph
            page += 1

    except urllib3.exceptions.HTTPError as e:
        # Best effort: report the failure and return what was collected.
        print(e)

    return graph

In [None]:
# Crawl every TeSS material (no limit) and report the size of the graph.
g = crawl_tess(limit=-1)
print(len(g))

# Export the crawled graph twice: as Turtle and as compacted JSON-LD.
TTL_OUTPUT = 'tess.bioschema.ttl'
JSONLD_OUTPUT = 'tess.bioschema.jsonld'

g.serialize(destination=TTL_OUTPUT, format='turtle')
print('BioSchema output produced at ' + TTL_OUTPUT)

g.serialize(destination=JSONLD_OUTPUT, format='json-ld', auto_compact=True)
print('BioSchema output produced at ' + JSONLD_OUTPUT)

In [4]:
%%time
from rdflib import ConjunctiveGraph
import os

# Location of the EDAM ontology file. The default is the path this notebook
# was originally run with; set the EDAM_OWL_PATH environment variable to run
# the cell on another machine without editing it.
EDAM_OWL_PATH = os.environ.get(
    'EDAM_OWL_PATH',
    '/Users/gaignard-a/Documents/Dev/fresh-toolbox/EDAM.owl',
)

# Merge the three sources into one graph, printing its size after each load.
kg = ConjunctiveGraph()
kg.parse('tess.bioschema.ttl', format='turtle')
print(len(kg))
kg.parse('biotools.bioschema.ttl', format='turtle')
print(len(kg))
kg.parse(EDAM_OWL_PATH)
print(len(kg))

627
196162
229651
CPU times: user 14.9 s, sys: 89.6 ms, total: 15 s
Wall time: 15 s


In [18]:
import pandas as pd
# For each training material linked to a tool, list the tool name together
# with the labels of the tool's topics and operations.
# The prefixes are declared in the query itself so that it does not depend on
# the prefix bindings the parsed files happened to leave in the graph.
# NOTE(review): schema: is mapped to http://schema.org/, the namespace used by
# the JSON-LD context in rdfize_tess — confirm biotools.bioschema.ttl uses the
# same one.
query = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX schema: <http://schema.org/>

SELECT ?mat_label ?t_label ?op_label ?tool_name WHERE {
    ?mat schema:hasPart ?tool ;
        schema:name ?mat_label .
    ?tool schema:applicationSubCategory ?topic ;
       schema:featureList ?operation ;
       schema:name ?tool_name .
    ?topic rdfs:label ?t_label .
    ?operation rdfs:label ?op_label .
}
"""
results = kg.query(query)

# One record per result row; `row` is used rather than `r`, which names the
# HTTP response in the first cell.
res_data = [
    {
        'Training material': str(row['mat_label']),
        'Tool': str(row['tool_name']),
        'Topic': str(row['t_label']),
        'Operation': str(row['op_label']),
    }
    for row in results
]

df = pd.DataFrame(res_data)
df

Unnamed: 0,Operation,Tool,Topic,Training material
0,Data handling,Bioconductor,Computational biology,05: Bioconductor Annotation Resources
1,Analysis,Bioconductor,Computational biology,05: Bioconductor Annotation Resources
2,Statistical calculation,Bioconductor,Computational biology,05: Bioconductor Annotation Resources
3,Data handling,Bioconductor,Statistics and probability,05: Bioconductor Annotation Resources
4,Analysis,Bioconductor,Statistics and probability,05: Bioconductor Annotation Resources
5,Statistical calculation,Bioconductor,Statistics and probability,05: Bioconductor Annotation Resources
6,Data handling,Bioconductor,Bioinformatics,05: Bioconductor Annotation Resources
7,Analysis,Bioconductor,Bioinformatics,05: Bioconductor Annotation Resources
8,Statistical calculation,Bioconductor,Bioinformatics,05: Bioconductor Annotation Resources
9,Data handling,Bioconductor,Data management,05: Bioconductor Annotation Resources
