## Set up
First, you need to install GraphDB locally on your machine. The queries below expect a repository served at `http://localhost:7200/repositories/Project25`.


In [3]:
from rdflib import ConjunctiveGraph
from SPARQLWrapper import SPARQLWrapper, JSON
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import numpy as np
from IPython.display import display, HTML


import pandas as pd

def remoteQuery(query, endpoint):
    """Run a SPARQL query on ``endpoint`` and return its bindings as a DataFrame.

    Parameters
    ----------
    query : str
        SPARQL query text, handed to ``endpoint.setQuery``.
    endpoint : object
        Any object exposing ``setQuery(query)`` and ``queryAndConvert()``.
        The converted result must be indexable as
        ``result['results']['bindings']``, a list of mappings
        ``{variable: {'value': ...}}``.

    Returns
    -------
    pandas.DataFrame or None
        One row per solution and one column per variable; every cell holds
        the ``'value'`` entry of the binding (other keys of the binding are
        discarded). A variable that is unbound in a given solution yields a
        missing value (NaN) in that row. If running or converting the query
        raises, the exception is printed and ``None`` is returned.

    Notes
    -----
    As a side effect, pandas display options are changed globally so that
    frames are rendered without row or column-width truncation.
    """
    endpoint.setQuery(query)
    try:
        result = endpoint.queryAndConvert()
        pd.set_option("display.max_rows", None)
        pd.set_option("display.max_colwidth", 6000)
        pd.set_option("display.width", 6000)
        # Pull the plain values out *before* building the frame: a solution
        # that leaves a variable unbound simply lacks that key, and pandas
        # fills the gap with NaN. Applying x['value'] cell by cell afterwards
        # would fail on those NaN cells and lose the whole result.
        rows = [
            {variable: term['value'] for variable, term in binding.items()}
            for binding in result['results']['bindings']
        ]
        return pd.DataFrame(rows)
    except Exception as e:
        # Best-effort behaviour kept on purpose: report the problem, return None.
        print(e)
        return None


In [4]:
# SPARQL endpoint of the local GraphDB repository used by every query below.
# JSON is requested as return format; remoteQuery indexes the converted
# result as result['results']['bindings'].
GRAPHDB_REPOSITORY_URL = "http://localhost:7200/repositories/Project25"
ep_biotools = SPARQLWrapper(GRAPHDB_REPOSITORY_URL)
ep_biotools.setReturnFormat(JSON)

## List all EDAM formats with _is_format_of_ relations (transitive)

In [5]:
# SPARQL SELECT over the EDAM ontology.
# Matches every ?format that is rdfs:subClassOf a node carrying
# owl:onProperty edam:is_format_of and owl:someValuesFrom ?is_format_of_data,
# where both ?format and ?is_format_of_data have an rdfs:label.
# Returns one row per (?format, ?format_label) with the number of matches,
# highest count first, ties ordered by ?format.
# The per-relation detail columns are commented out of the projection.
# NOTE(review): ORDER BY also lists ASC(?is_format_of_data), which is neither a
# GROUP BY key nor an aggregate — presumably it has no effect; confirm.
# NOTE(review): only the edam, owl and rdfs prefixes are used by the pattern.
q= """
PREFIX biotools: <https://bio.tools/ontology/>
PREFIX bsc: <http://bioschemas.org/>
PREFIX bsct: <http://bioschemas.org/types/>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX edam: <http://edamontology.org/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX sc: <http://schema.org/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT
    ?format ?format_label
    (COUNT(?is_format_of_data) as ?count)
    #?is_format_of_data ?is_format_of_data_label
WHERE
{
    ?format
        rdfs:subClassOf [owl:onProperty edam:is_format_of ; owl:someValuesFrom ?is_format_of_data] ;
        rdfs:label ?format_label .
    ?is_format_of_data rdfs:label ?is_format_of_data_label .
} GROUP BY ?format ?format_label
ORDER BY DESC(?count) ASC(?format) ASC(?is_format_of_data)
"""


In [6]:
# Execute the query currently stored in `q` and report how many rows came back.
results1 = remoteQuery(query=q, endpoint=ep_biotools)
print(f"nb of edam format with is_format_of relation: {len(results1)}")

# Show the complete table inside a fixed-height, scrollable container so the
# notebook stays readable however many rows are returned.
display(HTML("".join((
    "<div style='height: 200px; overflow: auto; width: fit-content'>",
    results1.to_html(),
    "</div>",
))))

nb of edam format with is_format_of relation: 533


Unnamed: 0,format,format_label,count
0,http://edamontology.org/format_2352,BioXSD (XML),9
1,http://edamontology.org/format_3772,BioJSON (BioXSD),9
2,http://edamontology.org/format_3773,BioYAML,9
3,http://edamontology.org/format_2572,BAM,4
4,http://edamontology.org/format_2573,SAM,4
5,http://edamontology.org/format_3007,PSL,4
6,http://edamontology.org/format_3771,UniProtKB RDF,4
7,http://edamontology.org/format_3774,BioJSON (Jalview),4
8,http://edamontology.org/format_3777,MCPD,4
9,http://edamontology.org/format_3826,proBAM,4


## List all EDAM concepts with _has_topic_ relations (transitive)

In [7]:
# SPARQL SELECT over the EDAM ontology, same shape as the is_format_of query
# but for the edam:has_topic property.
# Matches every ?format that is rdfs:subClassOf a node carrying
# owl:onProperty edam:has_topic and owl:someValuesFrom ?is_format_of_data,
# where both ends have an rdfs:label, and counts the matches per
# (?format, ?format_label), highest count first, ties ordered by ?format.
# NOTE(review): the variable names ?format / ?is_format_of_data are carried
# over from the is_format_of query; here they bind the subject and the target
# of a has_topic restriction (the saved output lists operation_* IRIs in the
# ?format column).
# NOTE(review): ORDER BY also lists ASC(?is_format_of_data), which is neither a
# GROUP BY key nor an aggregate — presumably it has no effect; confirm.
q= """
PREFIX biotools: <https://bio.tools/ontology/>
PREFIX bsc: <http://bioschemas.org/>
PREFIX bsct: <http://bioschemas.org/types/>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX edam: <http://edamontology.org/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX sc: <http://schema.org/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT
    ?format ?format_label
    (COUNT(?is_format_of_data) as ?count)
    #?is_format_of_data ?is_format_of_data_label
WHERE
{
    ?format
        rdfs:subClassOf [owl:onProperty edam:has_topic ; owl:someValuesFrom ?is_format_of_data] ;
        rdfs:label ?format_label .
    ?is_format_of_data rdfs:label ?is_format_of_data_label .
} GROUP BY ?format ?format_label
ORDER BY DESC(?count) ASC(?format) ASC(?is_format_of_data)
"""


In [8]:
# Execute the has_topic query stored in `q` and report how many rows came back.
results1 = remoteQuery(query=q, endpoint=ep_biotools)
# The message names the relation this query is actually about (has_topic);
# it previously repeated the text of the is_format_of cell.
print(f"nb of edam concepts with has_topic relation: {len(results1)}")

# Show the complete table inside a fixed-height, scrollable container.
display(HTML("<div style='height: 200px; overflow: auto; width: fit-content'>" +
             results1.to_html() +
             "</div>"))

nb of edam format with is_format_of relation: 618


Unnamed: 0,format,format_label,count
0,http://edamontology.org/operation_0269,Transmembrane protein prediction,8
1,http://edamontology.org/operation_0245,Structural motif discovery,7
2,http://edamontology.org/operation_0246,Protein domain recognition,7
3,http://edamontology.org/operation_0267,Protein secondary structure prediction,7
4,http://edamontology.org/operation_0268,Protein super-secondary structure prediction,7
5,http://edamontology.org/operation_0390,Protein peeling,7
6,http://edamontology.org/operation_0468,Protein secondary structure prediction (helices),7
7,http://edamontology.org/operation_0469,Protein secondary structure prediction (turns),7
8,http://edamontology.org/operation_0470,Protein secondary structure prediction (coils),7
9,http://edamontology.org/operation_2464,Protein-protein binding site prediction,7


## How many bio.tools entries have EDAM topics and operations which are consistent with the has_topic relation? +listing

In [9]:
# SPARQL SELECT: distinct bio.tools entries (sc:SoftwareApplication) having a
# sc:featureList value ?feature that is rdfs:subClassOf a node with
# owl:onProperty edam:has_topic and owl:someValuesFrom ?r, and for which the
# entry does NOT list ?r as sc:applicationSubCategory.
# The former path step "/owl:Restriction*" was dropped: owl:Restriction is a
# class IRI, not a predicate, so only the zero-length match of that step
# applied and the pattern is the plain rdfs:subClassOf written here.
# NOTE(review): because of FILTER NOT EXISTS this returns entries that *lack*
# the topic implied by has_topic; the section heading speaks of "consistent"
# entries — confirm which of the two is intended.
q = """
PREFIX edam: <http://edamontology.org/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX sc: <http://schema.org/>
SELECT DISTINCT ?biotools_id
WHERE
{
    ?biotools_id a sc:SoftwareApplication ;
        sc:featureList ?feature .
    ?feature rdfs:subClassOf ?rs .
    ?rs owl:onProperty edam:has_topic ;
        owl:someValuesFrom ?r .
    FILTER NOT EXISTS {
        ?biotools_id sc:applicationSubCategory ?r .
    }
}
"""

In [10]:
# Run the remote query once and reuse the frame for both the count and the
# listing (it used to be sent to the endpoint twice).
results = remoteQuery(query=q, endpoint=ep_biotools)
print(f"nb of EDAM operation and topics consistent by has_topic, used in bio.tools: {len(results)}")

# Show the complete table inside a fixed-height, scrollable container.
display(HTML("<div style='height: 200px; overflow: auto; width: fit-content'>" +
             results.to_html() +
             "</div>"))

nb of EDAM data and topics consistent by has_topic, used in bio.tools: 21031


Unnamed: 0,biotools_id
0,https://bio.tools/AmtDB
1,https://bio.tools/AnnotSV
2,https://bio.tools/Ciona_robusta_Anatomy_and_Development_Ontology
3,https://bio.tools/DiversityCollection
4,https://bio.tools/DiversityReferences
5,https://bio.tools/Python_Video_Annotator
6,https://bio.tools/RNAIndel
7,https://bio.tools/SecStrAnnotator
8,https://bio.tools/aceparser
9,https://bio.tools/ahtpdb


## How many bio.tools entries have EDAM data but no data format defined (input/output)? + listing

In [13]:
# SPARQL SELECT listing (?biotools_id, ?type) pairs, ?type being the literal
# 'input' or 'output'.
# Each UNION branch takes sc:SoftwareApplication entries with a bsc:input
# (resp. bsc:output) node that has an sc:additionalType ?data, then uses MINUS
# to remove the (?biotools_id, ?data) combinations for which an input
# (resp. output) node carries both that sc:additionalType and an
# sc:encodingFormat. VALUES tags the rows of each branch with its ?type.
# GROUP BY collapses the result to one row per entry and type, so an entry
# can appear twice (once as 'input', once as 'output').
q = """
PREFIX edam:<http://edamontology.org/>
PREFIX bsc: <http://bioschemas.org/>
PREFIX sc: <http://schema.org/>
SELECT
    ?biotools_id ?type
WHERE
{
    {
        {
           	?biotools_id a sc:SoftwareApplication;
			bsc:input [sc:additionalType ?data].
            VALUES ?type {'input'}.
        } MINUS 
    	{
            ?biotools_id a sc:SoftwareApplication;
      		bsc:input [sc:additionalType ?data; sc:encodingFormat ?data2];
		}
    } UNION {
        {
            ?biotools_id a sc:SoftwareApplication;
			bsc:output [sc:additionalType ?data].
            VALUES ?type {'output'}.
        } MINUS 
        {
                ?biotools_id a sc:SoftwareApplication;
                bsc:output [sc:additionalType ?data; sc:encodingFormat ?data2];
        }
    }

}
GROUP BY ?biotools_id ?type
ORDER BY ?biotools_id
"""

In [14]:
# Run the remote query once and reuse the frame for both the count and the
# listing (it used to be sent to the endpoint twice).
results = remoteQuery(query=q, endpoint=ep_biotools)
print(f"nb of EDAM data but not with data format definition, used in bio.tools: {len(results)}")

# Show the complete table inside a fixed-height, scrollable container.
display(HTML("<div style='height: 200px; overflow: auto; width: fit-content'>" +
             results.to_html() +
             "</div>"))

nb of EDAM data but not with data format definition, used in bio.tools: 1867


Unnamed: 0,biotools_id,type
0,https://bio.tools/1000genomes_id_history_converter,input
1,https://bio.tools/1000genomes_id_history_converter,output
2,https://bio.tools/2d-page,input
3,https://bio.tools/2d-page,output
4,https://bio.tools/2dx,input
5,https://bio.tools/3dbionotes,input
6,https://bio.tools/3dnetmod,output
7,https://bio.tools/4dxpress,input
8,https://bio.tools/Banana_Genome_Hub,output
9,https://bio.tools/BeanMine,input
