Install external libraries

In [None]:
!pip install requests # library for making HTTP requests
!pip install lxml # library for parsing XML
!pip install bs4 # provides BeautifulSoup, used below to parse the tool XML files

Clone the Git repository with the tools (to follow the adopted contribution protocol, it may be useful to fork this repository on GitHub first)

In [None]:
# Creates ./tools-iuc in the current working directory; later cells read the tool XML files from ./tools-iuc/tools/
!git clone https://github.com/galaxyproject/tools-iuc

Import classes and functions from installed libraries

In [None]:
import requests
import json
from lxml import etree
from os import walk
import os
import glob
import re
from bs4 import BeautifulSoup
import csv
from urllib.request import urlopen

Create utility functions

Function to download bio.tools data

In [None]:
def fetch(p="", c=[]):
    """Download bio.tools entries page by page and return them as one list.

    Parameters:
        p: query string of the page to start from, e.g. "?page=1". The value
           of the `next` field of each response is used as the query string
           of the following request. An empty or None value means there is
           nothing to download.
        c: entries collected so far; they are placed in front of the
           downloaded ones. The list is copied, never modified.

    Returns:
        A new list holding the items of `c` followed by the `list` field of
        every page that could be downloaded.

    A page that cannot be downloaded or decoded stops the download: the
    problem is printed and the entries gathered up to that point are returned.
    """
    # Copy so that neither the caller's list nor the shared default is mutated.
    collected = list(c)

    # Iterative on purpose: one stack frame per page would hit the recursion
    # limit on long listings.
    while p:
        url = "https://bio.tools/api/t" + p + "&format=json"
        try:
            payload = requests.get(url).json()
            entries = payload['list']
            next_page = payload['next']
        except Exception as error:
            # Best effort: keep what was downloaded, but say why we stopped.
            print("Stopped at page {}: {!r}".format(p, error))
            break

        print("Page: {}".format(p))
        collected.extend(entries)
        p = next_page  # None on the last page, which ends the loop

    return collected

In [None]:
# Start at the first results page; fetch() follows the `next` link of each response from there
data = fetch(p="?page=1")

Save the data to a file (to reuse it in later runs; be careful, Google Colab provides no guarantees on data persistence)

In [None]:
# Cache the downloaded bio.tools entries on disk as JSON
with open('data.json', 'w') as cache_file:
    json.dump(obj=data, fp=cache_file)

Function that enriches data with doi lists

In [None]:
def enrich_publication_data(biotool_description):
    """Add a `dois` list to a bio.tools entry, built from its publications.

    For every item of `biotool_description['publication']` at most one record
    `{'doi': ..., 'type': ..., 'source': ...}` is produced:

    * if the publication carries a DOI, it is used as is (`source` = 'doi');
    * otherwise the first identifier present, PMID then PMCID, is converted
      with `get_doi` and `source` names the identifier that was converted
      ('pmid' or 'pmcid'). Only that one identifier is tried.

    Publications for which no DOI can be determined are skipped. Missing
    fields are treated like empty ones.

    Parameters:
        biotool_description: dict describing one tool; modified in place.

    Returns:
        None. The result is stored under `biotool_description['dois']`.
    """
    dois = []
    for publication in biotool_description.get('publication') or []:
        doi = publication.get('doi')
        source = 'doi'

        if not doi:
            for source in ('pmid', 'pmcid'):
                identifier = publication.get(source)
                if identifier:
                    doi = get_doi(identifier)
                    # Only the first identifier that is present is converted.
                    break

        if doi:
            dois.append({
                'doi': doi,
                'type': publication.get('type'),
                'source': source
            })

    biotool_description['dois'] = dois

Function to convert PMID and PMCID to DOI

In [None]:
def get_doi(pid):
    """Convert a PMID or PMCID to a DOI with the NCBI ID converter service.

    Parameters:
        pid: the identifier to convert; it is turned into a string before
             being appended to the request URL. An empty or None value
             yields None without any request being made.

    Returns:
        The value of the `doi` attribute of the first `record` element of
        the response, or None when there is no such element or attribute.

    Errors raised by the HTTP request or by the XML parser are not caught.
    """
    # Based on implementation of DOI fetcher by Kenzo-Hugo Hillion
    if not pid:
        return None

    url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids=" + str(pid)
    # Parse the raw bytes: lxml rejects str input that carries an XML
    # encoding declaration, bytes are accepted either way.
    xml = etree.fromstring(requests.get(url).content)

    record = xml.find('record')
    doi = record.attrib.get('doi') if record is not None else None

    if doi:
        print("DOI was found for {}".format(pid))
        return doi

    print("DOI was not found for {}".format(pid))
    return None

Enrich tools description with DOIs

In [None]:
# Add the `dois` list to every downloaded entry, reporting progress by position
for tool_number, tool in enumerate(data):
    print("Tool #{}".format(tool_number))
    enrich_publication_data(tool)

Save results to file

In [None]:
# Store the entries, now carrying their `dois` lists, as JSON
with open('data_enriched.json', 'w') as enriched_file:
    enriched_file.write(json.dumps(data))

Get the list of XML files

In [None]:
# Absolute path (with trailing slash) of the tools directory inside the tools-iuc clone,
# resolved against the current working directory
path ="{}/tools-iuc/tools/".format(os.getcwd())

In [None]:
# Collect the *.xml files located directly inside every directory below `path`.
# The directory path is built with os.path.join: walk() reports `dirpath`
# without a trailing separator for nested directories, so gluing the strings
# together would only be correct for the top level.
filepathes = []
for (dirpath, dirnames, filenames) in walk(path):
    for d in dirnames:
        filepathes += glob.glob(os.path.join(dirpath, d, "*.xml"))

Extract DOIs from the Galaxy tool descriptions

In [None]:
# Map every XML file to the text of its <citation type="doi"> elements;
# files without such citations get no entry.
tools_dois = {}
for filepath in filepathes:
    with open(filepath) as xml_file:
        soup = BeautifulSoup(xml_file, 'xml')

    doi_citations = soup.find_all('citation', {"type" : "doi"})
    if not doi_citations:
        continue

    tools_dois[filepath] = [citation.get_text() for citation in doi_citations]

Function to extract EDAM topics' and operations' IDs from bio.tools description

In [None]:
def enrich_from_biotools(biotool, galaxy_tool_path, results):
    """Copy the EDAM topic and operation IDs of a bio.tools entry into `results`.

    Parameters:
        biotool: dict describing one bio.tools entry. The keys read are
            `topic` (list of dicts with a `uri`), `function` (list of dicts
            whose `operation` is a list of dicts with a `uri`) and
            `biotoolsID`.
        galaxy_tool_path: not used by this function.
        results: dict to fill; modified in place and returned.

    Returns:
        `results`, with
        * `biotools_topics`: sorted, duplicate-free list of the topic IDs
          already present plus those of `biotool`;
        * `biotools_operations`: the groups already present plus one sorted,
          duplicate-free list of operation IDs per function that has any;
        * `biotools_id`: the `biotoolsID` of `biotool`, or None.

    An ID is the last `/`-separated segment of the `uri`. Accumulators that
    are missing from `results` (or None) are created as empty lists.
    """
    def edam_id(term):
        # Keep only what follows the last "/" of the URI.
        return term['uri'].split('/')[-1]

    # Create the accumulators only when absent, so existing content is kept.
    for key in ('biotools_topics', 'biotools_operations'):
        if results.get(key) is None:
            results[key] = []

    topics = biotool.get('topic') or []
    if topics:
        known_topics = set(results['biotools_topics'])
        known_topics.update(edam_id(topic) for topic in topics)
        # Sorted so that the output does not depend on set iteration order.
        results['biotools_topics'] = sorted(known_topics)

    for function in biotool.get('function') or []:
        operations = function.get('operation') or []
        if operations:
            results['biotools_operations'].append(
                sorted({edam_id(operation) for operation in operations})
            )

    results['biotools_id'] = biotool.get('biotoolsID', None)

    return results

Function to extract EDAM topics' and operations' IDs from Debian Med repositories

In [None]:
def enrich_from_debmed(debtool, galaxy_tool_path, results):
    """Copy the EDAM topics and operations of a Debian Med entry into `results`.

    Parameters:
        debtool: dict describing one Debian Med entry. The keys read are
            `topics` (list of names), `edam_scopes` (list of dicts whose
            `function` is either one name or a list of names) and
            `bio.tools`.
        galaxy_tool_path: not used by this function.
        results: dict to fill; modified in place and returned.

    Returns:
        `results`, with
        * `deb_topics` extended by one `{'url': ..., 'value': name}` dict
          per topic;
        * `deb_operations` extended by one list of such dicts per scope that
          names at least one operation;
        * `deb_biotools_id`: the `bio.tools` value of `debtool`, or None.

    `url` is whatever the module-level `edam_data` mapping holds for the
    name (None when the name is unknown). Accumulators that are missing from
    `results` (or None) are created as empty lists.
    """
    def describe(name):
        # Pair a topic/operation name with its `edam_data` lookup result.
        return {
            'url': edam_data.get(name, None),
            'value': name
        }

    for key in ('deb_topics', 'deb_operations'):
        if results.get(key) is None:
            results[key] = []

    for topic in debtool.get('topics') or []:
        results['deb_topics'].append(describe(topic))

    for function in debtool.get('edam_scopes') or []:
        operations = function.get('function') or []

        # A single operation may be given as a bare string: treat it as a
        # one-element list so that its own name ends up in `value`.
        if isinstance(operations, str):
            operations = [operations]

        if len(operations) > 0:
            results['deb_operations'].append([describe(operation) for operation in operations])

    results['deb_biotools_id'] = debtool.get('bio.tools', None)

    return results

In [None]:
# The script `edam.sh` is written by Andreas Tille (https://github.com/tillea)
# and copied from https://github.com/bio-tools/biotoolsConnect
# It generates a file `edam.json`, which the next cell loads.
# The script is looked up in the current working directory.
# NOTE(review): `-j` presumably selects the JSON output - confirm against the script.
!bash edam.sh -j

Load the JSON output of `edam.sh`

In [None]:
# Read the file written by edam.sh and decode it into Python objects
with open('edam.json') as edam_export:
    debian_data = json.loads(edam_export.read())

Download EDAM

In [None]:
version = '1.21'
url = 'http://edamontology.org/EDAM_{}.tsv'.format(version)

# Download the table to edam.tsv; the `with` block closes the HTTP response
# as well as the output file.
with urlopen(url) as response, open('edam.tsv', 'wb') as output:
    output.write(response.read())

# Build a lookup keyed by the first column of the table.
# `_version` records which EDAM release the lookup was built from.
edam_data = {'_version': version}

# newline='' is what the csv module expects for files handed to csv.reader.
with open('edam.tsv', 'r', newline='') as tsv_file:
    for row in csv.reader(tsv_file, delimiter='\t'):
        # Rows too short to contain column 54 (e.g. blank lines) are skipped.
        # NOTE(review): the column positions are tied to the layout of this
        # EDAM release - re-check them when `version` changes.
        if len(row) <= 54:
            continue
        edam_data[row[0]] = {
            'label': row[1],
            'synonyms': row[2].split('|'),
            'definition': row[54],
            'comments': row[3].split('|'),
        }

Create tools annotations (match Galaxy tool's DOI against bio.tools' DOI and Debian Med tools' DOI to get topics and operations)

In [None]:
# Match the DOIs cited by each Galaxy tool against the DOIs of the bio.tools
# entries and of the Debian Med entries; every match adds one annotation.
i = 0  # number of bio.tools matches
j = 0  # number of Debian Med matches
tool_annotations = {}

# The loop variable is `tool_path`, not `path`: `path` holds the tools
# directory set by an earlier cell and must not be overwritten here.
for tool_path, galaxy_dois in tools_dois.items():

    tool_annotations[tool_path] = []

    for galaxy_doi in galaxy_dois:
        for biotool in data:
            for biotool_doi in biotool['dois']:
                if galaxy_doi == biotool_doi['doi']:
                    i += 1
                    tool_edam = enrich_from_biotools(biotool, tool_path, {
                        'type': 'bio.tools',
                        'biotools_topics': [],
                        'biotools_operations': [],
                        'biotools_id': None,
                        'biotools_doi': biotool_doi
                    })
                    tool_annotations[tool_path].append(tool_edam)
        for deb_tool in debian_data:
            # .get(): an entry without a DOI simply never matches.
            if galaxy_doi == deb_tool.get('doi'):
                j += 1
                tool_edam = enrich_from_debmed(deb_tool, tool_path, {
                    'type': 'debmed',
                    'deb_topics': [],
                    'deb_operations': [],
                    'deb_biotools_id': None,
                })
                tool_annotations[tool_path].append(tool_edam)

print("Total bio.tools matches:", i)
print("Total Debian Med matches:", j)

In [None]:
# Write the annotations as JSON to the client source directory
with open('./client/src/tool_annotations.json', 'w') as annotations_file:
    annotations_file.write(json.dumps(tool_annotations))