## Set up
First, you need to install GraphDB locally on your machine.


In [1]:
from rdflib import ConjunctiveGraph
from SPARQLWrapper import SPARQLWrapper, JSON
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import numpy as np
from IPython.display import display, HTML



import pandas as pd

def remoteQuery(query, endpoint):
    """Run a SPARQL SELECT query and return its solutions as a DataFrame.

    Parameters
    ----------
    query : str
        SPARQL query text; passed unchanged to ``endpoint.setQuery``.
    endpoint : object
        Any object exposing ``setQuery(query)`` and ``queryAndConvert()``.
        ``queryAndConvert()`` must return a dict shaped like the SPARQL JSON
        results format (``{'head': {'vars': [...]}, 'results': {'bindings': [...]}}``).

    Returns
    -------
    pandas.DataFrame or None
        One row per solution and one column per variable; each cell holds the
        ``'value'`` of the bound term. Variables that are unbound in a given
        solution are left as missing values. When the result carries
        ``head.vars``, it defines the columns (so an empty result still has
        its column names). ``None`` is returned if anything raises while
        querying or converting; the exception is printed, not re-raised.

    Side effects
    ------------
    Sets the global pandas display options ``display.max_rows``,
    ``display.max_colwidth`` and ``display.width`` on every call.
    """
    endpoint.setQuery(query)
    try:
        result = endpoint.queryAndConvert()

        # Global display settings so that long IRIs and full result sets are
        # not truncated when the frame is rendered.
        pd.set_option("display.max_rows", None)
        pd.set_option("display.max_colwidth", 6000)
        pd.set_option("display.width", 6000)

        # Flatten each binding {var: {'type': ..., 'value': ...}} to {var: value}.
        # Doing this per solution (instead of element-wise on the DataFrame)
        # keeps unbound variables from blowing up on NaN cells.
        rows = [
            {var: term['value'] for var, term in solution.items()}
            for solution in result['results']['bindings']
        ]

        # Use the declared projection, when present, for the column set/order.
        columns = result.get('head', {}).get('vars')
        return pd.DataFrame(rows, columns=columns)
    except Exception as e:
        # Best-effort behaviour: report the problem and hand back None.
        print(e)
        return None



### With a GraphDB SPARQL endpoint

The input for SPARQLWrapper is the URL of the GraphDB repository where you loaded the appropriate data sets. In this example, the dev version of EDAM (https://raw.githubusercontent.com/edamontology/edamontology/main/EDAM_dev.owl) and a bio.tools Bioschemas Turtle file (https://raw.githubusercontent.com/bio-tools/content/master/datasets/bioschemas-dump.ttl) were loaded into the GraphDB repository.


In [2]:
# URL of the local GraphDB repository that holds the EDAM and bio.tools data.
# Kept as a named constant so the repository name/port can be changed in one place.
GRAPHDB_REPOSITORY_URL = "http://localhost:7200/repositories/Project25"

# remoteQuery() indexes the converted result as a dict, so ask for JSON results.
ep_biotools = SPARQLWrapper(GRAPHDB_REPOSITORY_URL)
ep_biotools.setReturnFormat(JSON)

## How many non-obsolete formats are in EDAM?

In [3]:
# SPARQL: list every ?format that is rdfs:subClassOf edam:format_1915 together
# with its rdfs:label, ordered by IRI. Only the plain rdfs:subClassOf pattern is
# used; whether indirect subclasses match depends on the repository's inference
# settings (see the question noted inside the query).
q= """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX biotools: <https://bio.tools/ontology/>
PREFIX bsc: <http://bioschemas.org/>
PREFIX bsct: <http://bioschemas.org/types/>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX edam: <http://edamontology.org/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX sc: <http://schema.org/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT
    ?format
    ?label
WHERE
{
    ?format rdfs:subClassOf edam:format_1915   # 'subClassOf+' does not make a difference here. Is it because the inference is on?
    ;    rdfs:label ?label
    .
}
ORDER BY ASC(?format)
"""

In [4]:
# Query the endpoint once and reuse the frame for both the count and the table.
results = remoteQuery(query=q, endpoint=ep_biotools)
print(f"Number of non-obsolete formats in EDAM: {len(results)}")
# Render the full table inside a fixed-height, scrollable container.
display(HTML("<div style='height: 200px; overflow: auto; width: fit-content'>" +
             results.to_html() +
             "</div>"))

Number of non-obsolete formats in EDAM: 618


Unnamed: 0,format,label
0,http://edamontology.org/format_1196,SMILES
1,http://edamontology.org/format_1197,InChI
2,http://edamontology.org/format_1198,mf
3,http://edamontology.org/format_1199,InChIKey
4,http://edamontology.org/format_1200,smarts
5,http://edamontology.org/format_1206,unambiguous pure
6,http://edamontology.org/format_1207,nucleotide
7,http://edamontology.org/format_1208,protein
8,http://edamontology.org/format_1209,consensus
9,http://edamontology.org/format_1210,pure nucleotide


## How many EDAM formats are used to annotate bio.tools?

In [5]:
# SPARQL: for every node typed bsct:FormalParameter, take its sc:encodingFormat
# and that format's rdfs:label, then count the solutions per (format, label)
# and sort by that count, highest first.
q= """
PREFIX biotools: <https://bio.tools/ontology/>
PREFIX bsc: <http://bioschemas.org/>
PREFIX bsct: <http://bioschemas.org/types/>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX edam: <http://edamontology.org/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX sc: <http://schema.org/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT DISTINCT
    ?format
    (COUNT(?format) as ?count)
    ?label
WHERE
{
    ?param
        rdf:type bsct:FormalParameter ;
        sc:encodingFormat ?format .
    ?format rdfs:label ?label .
}
GROUP BY ?format ?label
ORDER BY DESC(?count)
#ORDER BY ASC(?format)
"""

In [6]:
# Query the endpoint once and reuse the frame for both the count and the table.
results = remoteQuery(query=q, endpoint=ep_biotools)
print(f"Number of EDAM formats used in Bio.tools: {len(results)}")
# Render the full table inside a fixed-height, scrollable container.
display(HTML("<div style='height: 200px; overflow: auto; width: fit-content'>" +
             results.to_html() +
             "</div>"))

Number of EDAM formats used in Bio.tools: 254


Unnamed: 0,format,count,label
0,http://edamontology.org/format_2330,1354,Textual format
1,http://edamontology.org/format_1929,760,FASTA
2,http://edamontology.org/format_3475,509,TSV
3,http://edamontology.org/format_2331,480,HTML
4,http://edamontology.org/format_3016,245,VCF
5,http://edamontology.org/format_1930,194,FASTQ
6,http://edamontology.org/format_1476,189,PDB
7,http://edamontology.org/format_3547,188,Image format
8,http://edamontology.org/format_2572,167,BAM
9,http://edamontology.org/format_2332,161,XML


## How many OBSOLETE concepts are in EDAM?

In [7]:
# SPARQL: list every ?concept carrying `owl:deprecated true` together with its
# rdfs:label, ordered by IRI. The pattern itself does not restrict ?concept to a
# particular namespace.
q= """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX biotools: <https://bio.tools/ontology/>
PREFIX bsc: <http://bioschemas.org/>
PREFIX bsct: <http://bioschemas.org/types/>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX edam: <http://edamontology.org/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX sc: <http://schema.org/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT
    ?concept
    ?label
WHERE
{
    ?concept
        owl:deprecated true
        #rdfs:subClassOf owl:DeprecatedClass
    ;   rdfs:label ?label
    .
}
ORDER BY ASC(?concept)
"""

In [8]:
# Query the endpoint once and reuse the frame for both the count and the table.
results = remoteQuery(query=q, endpoint=ep_biotools)
print(f"Number of OBSOLETE EDAM concepts: {len(results)}")
# Render the full table inside a fixed-height, scrollable container.
display(HTML("<div style='height: 200px; overflow: auto; width: fit-content'>" +
             results.to_html() +
             "</div>"))

Number of OBSOLETE EDAM concepts: 1117


Unnamed: 0,concept,label
0,http://edamontology.org/data_0005,Resource type
1,http://edamontology.org/data_0007,Tool
2,http://edamontology.org/data_0581,Database
3,http://edamontology.org/data_0583,Directory metadata
4,http://edamontology.org/data_0831,MeSH vocabulary
5,http://edamontology.org/data_0832,HGNC vocabulary
6,http://edamontology.org/data_0835,UMLS vocabulary
7,http://edamontology.org/data_0843,Database entry
8,http://edamontology.org/data_0848,Raw sequence
9,http://edamontology.org/data_0851,Sequence mask character


## How many OBSOLETE EDAM formats are used to annotate bio.tools?

In [9]:
# SPARQL: same shape as the "formats used in bio.tools" query (FormalParameter ->
# sc:encodingFormat -> rdfs:label, counted per format), with the extra condition
# that the format carries `owl:deprecated true`.
q= """
PREFIX biotools: <https://bio.tools/ontology/>
PREFIX bsc: <http://bioschemas.org/>
PREFIX bsct: <http://bioschemas.org/types/>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX edam: <http://edamontology.org/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX sc: <http://schema.org/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT DISTINCT
    ?format
    (COUNT(?format) as ?count)
    ?label
WHERE
{
    ?param
        rdf:type bsct:FormalParameter ;
        sc:encodingFormat ?format .
    ?format rdfs:label ?label .
    ?format owl:deprecated true .
}
GROUP BY ?format ?label
ORDER BY DESC(?count)
#ORDER BY ASC(?format)
"""

In [10]:
# Query the endpoint once and reuse the frame for both the count and the table.
results = remoteQuery(query=q, endpoint=ep_biotools)
print(f"Number of OBSOLETE EDAM formats used in Bio.tools: {len(results)}")
# Render the full table inside a fixed-height, scrollable container.
display(HTML("<div style='height: 200px; overflow: auto; width: fit-content'>" +
             results.to_html() +
             "</div>"))

Number of OBSOLETE EDAM formats used in Bio.tools: 0


## How many bio.tools entries are annotated with EDAM formats (as input)?

In [15]:
# SPARQL: select each ?tool that has a bsc:input node with an sc:encodingFormat
# whose value has an rdfs:label. GROUP BY ?tool collapses the result to one row
# per tool, so the row count is the number of such tools.
q= """
PREFIX biotools: <https://bio.tools/ontology/>
PREFIX bsc: <http://bioschemas.org/>
PREFIX bsct: <http://bioschemas.org/types/>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX edam: <http://edamontology.org/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX sc: <http://schema.org/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT
    ?tool
    #?format ?label
WHERE
{
    ?tool bsc:input [sc:encodingFormat ?format] .
    ?format rdfs:label ?label .   # Neeeded because the old data with URIs as literals for Data and Formats
}
GROUP BY ?tool
ORDER BY ASC(?tool)
#LIMIT 8
"""

In [16]:
# Query the endpoint once and reuse the frame for both the count and the table.
results = remoteQuery(query=q, endpoint=ep_biotools)
print(f"Number of Bio.tools entries annotated with EDAM formats (as input): {len(results)}")
# Render the full table inside a fixed-height, scrollable container.
display(HTML("<div style='height: 200px; overflow: auto; width: fit-content'>" +
             results.to_html() +
             "</div>"))

Number of Bio.tools entries annotated with EDAM formats (as input): 2223


Unnamed: 0,tool
0,https://bio.tools/1000genomes_assembly_converter
1,https://bio.tools/1000genomes_data_slicer
2,https://bio.tools/1000genomes_variation_pattern_finder
3,https://bio.tools/1000genomes_vcf2ped
4,https://bio.tools/2DProt
5,https://bio.tools/3SRP
6,https://bio.tools/3dnetmod
7,https://bio.tools/3drs
8,https://bio.tools/4pipe4
9,https://bio.tools/ABRicate


## How many bio.tools entries are annotated with EDAM formats (as output)?

In [17]:
# SPARQL: select each ?tool that has a bsc:output node with an sc:encodingFormat
# whose value has an rdfs:label. GROUP BY ?tool collapses the result to one row
# per tool, so the row count is the number of such tools.
q= """
PREFIX biotools: <https://bio.tools/ontology/>
PREFIX bsc: <http://bioschemas.org/>
PREFIX bsct: <http://bioschemas.org/types/>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX edam: <http://edamontology.org/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX sc: <http://schema.org/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT
    ?tool
    #?format ?label
WHERE
{
    ?tool bsc:output [sc:encodingFormat ?format] .
    ?format rdfs:label ?label .   # Neeeded because the old data with URIs as literals for Data and Formats
}
GROUP BY ?tool
ORDER BY ASC(?tool)
#LIMIT 8
"""

In [18]:
# Query the endpoint once and reuse the frame for both the count and the table.
results = remoteQuery(query=q, endpoint=ep_biotools)
print(f"Number of Bio.tools entries annotated with EDAM formats (as output): {len(results)}")
# Render the full table inside a fixed-height, scrollable container.
display(HTML("<div style='height: 200px; overflow: auto; width: fit-content'>" +
             results.to_html() +
             "</div>"))

Number of Bio.tools entries annotated with EDAM formats (as output): 1984


Unnamed: 0,tool
0,https://bio.tools/2DProt
1,https://bio.tools/2d-page
2,https://bio.tools/3SRP
3,https://bio.tools/3drs
4,https://bio.tools/4dxpress
5,https://bio.tools/4pipe4
6,https://bio.tools/ABRicate
7,https://bio.tools/AMICI
8,https://bio.tools/ANISEED
9,https://bio.tools/AmtDB


## How many bio.tools entries are annotated with EDAM formats (input or output)?

In [23]:
# SPARQL: select each ?tool linked through ANY predicate (?param is unconstrained)
# to a node with an sc:encodingFormat whose value has an rdfs:label.
# GROUP BY ?tool yields one row per tool.
q= """
PREFIX biotools: <https://bio.tools/ontology/>
PREFIX bsc: <http://bioschemas.org/>
PREFIX bsct: <http://bioschemas.org/types/>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX edam: <http://edamontology.org/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX sc: <http://schema.org/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT
    ?tool
    #?param ?format ?label
WHERE
{
    ?tool ?param [sc:encodingFormat ?format] .
    ?format rdfs:label ?label .   # Neeeded because the old data with URIs as literals for Data and Formats
}
GROUP BY ?tool
ORDER BY ASC(?tool)
#LIMIT 8
"""

In [24]:
# Query the endpoint once and reuse the frame for both the count and the table.
results = remoteQuery(query=q, endpoint=ep_biotools)
print(f"Number of Bio.tools entries annotated with EDAM formats (input or output): {len(results)}")
# Render the full table inside a fixed-height, scrollable container.
display(HTML("<div style='height: 200px; overflow: auto; width: fit-content'>" +
             results.to_html() +
             "</div>"))

Number of Bio.tools entries annotated with EDAM formats (input or output): 2484


Unnamed: 0,tool
0,https://bio.tools/1000genomes_assembly_converter
1,https://bio.tools/1000genomes_data_slicer
2,https://bio.tools/1000genomes_variation_pattern_finder
3,https://bio.tools/1000genomes_vcf2ped
4,https://bio.tools/2DProt
5,https://bio.tools/2d-page
6,https://bio.tools/3SRP
7,https://bio.tools/3dnetmod
8,https://bio.tools/3drs
9,https://bio.tools/4dxpress


## How many bio.tools entries are annotated with OBSOLETE EDAM formats (input or output)?

In [25]:
# SPARQL: select each ?tool linked through any predicate to a node whose
# sc:encodingFormat value carries `owl:deprecated true`. The rdfs:label pattern
# is commented out inside the query, so a label is not required here.
q= """
PREFIX biotools: <https://bio.tools/ontology/>
PREFIX bsc: <http://bioschemas.org/>
PREFIX bsct: <http://bioschemas.org/types/>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX edam: <http://edamontology.org/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX sc: <http://schema.org/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT
    ?tool
    #?param ?format ?label
WHERE
{
    ?tool ?param [sc:encodingFormat ?format] .
    ?format
        #rdfs:label ?label ;
        owl:deprecated true .
}
GROUP BY ?tool
ORDER BY ASC(?tool)
#LIMIT 8
"""

In [26]:
# Query the endpoint once and reuse the frame for both the count and the table.
results = remoteQuery(query=q, endpoint=ep_biotools)
print(f"Number of Bio.tools entries annotated with OBSOLETE EDAM formats (input or output): {len(results)}")
# Render the full table inside a fixed-height, scrollable container.
display(HTML("<div style='height: 200px; overflow: auto; width: fit-content'>" +
             results.to_html() +
             "</div>"))

Number of Bio.tools entries annotated with OBSOLETE EDAM formats (input or output): 0


## How many bio.tools entries are annotated with the root "Format" concept (input or output)?

In [58]:
# SPARQL: select each ?tool linked through any predicate to a node whose
# sc:encodingFormat is exactly edam:format_1915, with ?count = number of matching
# solutions for that tool. Rows are ordered by tool IRI.
q= """
PREFIX biotools: <https://bio.tools/ontology/>
PREFIX bsc: <http://bioschemas.org/>
PREFIX bsct: <http://bioschemas.org/types/>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX edam: <http://edamontology.org/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX sc: <http://schema.org/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT
    ?tool (COUNT(?tool) as ?count)
    #?param
WHERE
{
    ?tool ?param [sc:encodingFormat edam:format_1915] .
}
GROUP BY ?tool
ORDER BY ASC(?tool)
#LIMIT 8
"""

In [59]:
# Query the endpoint once and reuse the frame for both the count and the table.
results = remoteQuery(query=q, endpoint=ep_biotools)
print(f"Number of Bio.tools entries annotated with the root 'Format' concept (input or output): {len(results)}")
# Render the full table inside a fixed-height, scrollable container.
display(HTML("<div style='height: 200px; overflow: auto; width: fit-content'>" +
             results.to_html() +
             "</div>"))

Number of Bio.tools entries annotated with the root 'Format' concept (input or output): 10


Unnamed: 0,tool,count
0,https://bio.tools/GoldenMutagenesis,1
1,https://bio.tools/MeroX,1
2,https://bio.tools/beyondcell,1
3,https://bio.tools/bioregistry,1
4,https://bio.tools/canvas1,2
5,https://bio.tools/eosc_pipeline-reproducibility,2
6,https://bio.tools/kma,3
7,https://bio.tools/meme_suite,9
8,https://bio.tools/metagwgs,2
9,https://bio.tools/mmvec,2


In https://bio.tools/meme_suite, I can actually count 'Format' 11 times.

**Looks like a bug in the Bioschema**
Why does it only count 9 here?
Could it be that it only selects the 1st encodingFormat if there are multiple in the given FormalParameter?

## How many bio.tools entries are annotated with the root "Data" concept (input or output)?

In [66]:
# SPARQL: select each ?tool linked through any predicate to a node whose
# sc:additionalType is exactly edam:data_0006, with ?count = number of matching
# solutions for that tool. Rows are ordered by count (highest first), then tool IRI.
q= """
PREFIX biotools: <https://bio.tools/ontology/>
PREFIX bsc: <http://bioschemas.org/>
PREFIX bsct: <http://bioschemas.org/types/>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX edam: <http://edamontology.org/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX sc: <http://schema.org/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT
    ?tool (COUNT(?tool) as ?count)
    #?param
WHERE
{
    ?tool ?param [sc:additionalType edam:data_0006] .
}
GROUP BY ?tool
ORDER BY DESC(?count) ASC(?tool)
#LIMIT 8
"""

In [67]:
# Query the endpoint once and reuse the frame for both the count and the table.
results = remoteQuery(query=q, endpoint=ep_biotools)
print(f"Number of Bio.tools entries annotated with the root 'Data' concept (input or output): {len(results)}")
# Render the full table inside a fixed-height, scrollable container.
display(HTML("<div style='height: 200px; overflow: auto; width: fit-content'>" +
             results.to_html() +
             "</div>"))

Number of Bio.tools entries annotated with the root 'Data' concept (input or output): 339


Unnamed: 0,tool,count
0,https://bio.tools/gatk_depth_of_coverage,30
1,https://bio.tools/cuffdiff,26
2,https://bio.tools/osdb_api,19
3,https://bio.tools/getdifferentialexpression,13
4,https://bio.tools/gatk_base_recalibrator,12
5,https://bio.tools/meme_meme,12
6,https://bio.tools/plasmidspades,12
7,https://bio.tools/compareoverlapping,9
8,https://bio.tools/compareoverlappingsmallquery,9
9,https://bio.tools/compareoverlappingsmallref,9


## Bonus: How many bio.tools entries are annotated with EDAM topics OR operations?

In [33]:
# SPARQL: select each ?tool typed sc:SoftwareApplication that has some property
# (?top_or_op, unconstrained) pointing at a ?concept with an rdfs:label.
# GROUP BY ?tool yields one row per tool.
# NOTE(review): the predicate is not restricted to topic/operation properties, so
# any labelled object matches - confirm that only EDAM topics/operations qualify
# in the loaded data.
q= """
PREFIX biotools: <https://bio.tools/ontology/>
PREFIX bsc: <http://bioschemas.org/>
PREFIX bsct: <http://bioschemas.org/types/>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX edam: <http://edamontology.org/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX sc: <http://schema.org/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT
    ?tool
    #?top_or_op ?concept ?label
WHERE
{
    ?tool
        rdf:type sc:SoftwareApplication ;
        ?top_or_op ?concept .
    ?concept rdfs:label ?label .
}
GROUP BY ?tool
ORDER BY ASC(?tool)
#LIMIT 16
"""

In [34]:
# Query the endpoint once and reuse the frame for both the count and the table.
results = remoteQuery(query=q, endpoint=ep_biotools)
print(f"Number of Bio.tools entries annotated with topics or operations: {len(results)}")
# Render the full table inside a fixed-height, scrollable container.
display(HTML("<div style='height: 200px; overflow: auto; width: fit-content'>" +
             results.to_html() +
             "</div>"))

Number of Bio.tools entries annotated with topics or operations: 26222


Unnamed: 0,tool
0,https://bio.tools/1000genomes
1,https://bio.tools/1000genomes_assembly_converter
2,https://bio.tools/1000genomes_data_slicer
3,https://bio.tools/1000genomes_id_history_converter
4,https://bio.tools/1000genomes_variation_pattern_finder
5,https://bio.tools/1000genomes_vcf2ped
6,https://bio.tools/13Check_RNA
7,https://bio.tools/1433pred
8,https://bio.tools/16s_classifier
9,https://bio.tools/16spip


## Bonus: How many bio.tools entries are annotated with OBSOLETE EDAM topics OR operations?

In [35]:
# SPARQL: select each ?tool typed sc:SoftwareApplication that has some property
# (?top_or_op, unconstrained) pointing at a ?concept carrying `owl:deprecated true`.
# GROUP BY ?tool yields one row per tool.
# NOTE(review): the predicate is not restricted to topic/operation properties -
# confirm that no other deprecated objects can match in the loaded data.
q= """
PREFIX biotools: <https://bio.tools/ontology/>
PREFIX bsc: <http://bioschemas.org/>
PREFIX bsct: <http://bioschemas.org/types/>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX edam: <http://edamontology.org/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX sc: <http://schema.org/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT
    ?tool
    #?top_or_op ?concept ?label
WHERE
{
    ?tool
        rdf:type sc:SoftwareApplication ;
        ?top_or_op ?concept .
    ?concept
        owl:deprecated true .
}
GROUP BY ?tool
ORDER BY ASC(?tool)
#LIMIT 16
"""

In [36]:
# Query the endpoint once and reuse the frame for both the count and the table.
results = remoteQuery(query=q, endpoint=ep_biotools)
print(f"Number of Bio.tools entries annotated with OBSOLETE topics or operations: {len(results)}")
# Render the full table inside a fixed-height, scrollable container.
display(HTML("<div style='height: 200px; overflow: auto; width: fit-content'>" +
             results.to_html() +
             "</div>"))

Number of Bio.tools entries annotated with OBSOLETE topics or operations: 1058


Unnamed: 0,tool
0,https://bio.tools/1000genomes
1,https://bio.tools/3d-e-chem
2,https://bio.tools/3dproin
3,https://bio.tools/ADRAlert
4,https://bio.tools/AMICI
5,https://bio.tools/AMON
6,https://bio.tools/ANN-Glycolysis-Flux-Prediction
7,https://bio.tools/ASNR
8,https://bio.tools/ATEN
9,https://bio.tools/AbLIFT


## List all EDAM data-format pairs in Bio.tools

In [46]:
# SPARQL: for each ?tool, list every node (reached through any predicate, returned
# as ?formal_parameter) that has both an sc:additionalType (?data) and an
# sc:encodingFormat (?format), with the rdfs:label of each. One row per
# (tool, predicate, data, format) solution, ordered by tool, data, format.
q= """
PREFIX biotools: <https://bio.tools/ontology/>
PREFIX bsc: <http://bioschemas.org/>
PREFIX bsct: <http://bioschemas.org/types/>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX edam: <http://edamontology.org/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX sc: <http://schema.org/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT
    ?tool
    ?formal_parameter ?data ?data_label ?format ?format_label
WHERE
{
    ?tool ?formal_parameter [sc:additionalType ?data ; sc:encodingFormat ?format] .
    ?data rdfs:label ?data_label .
    ?format rdfs:label ?format_label .
}
#GROUP BY ?tool   # Uncomment this to count how many tool records (also comment out the 2nd line of selects)
ORDER BY ASC(?tool) ASC(?data) ASC(?format)
"""

In [47]:
# Query the endpoint once and reuse the frame for both the count and the table.
results = remoteQuery(query=q, endpoint=ep_biotools)
print(f"Number of data-format pairs in Bio.tools: {len(results)}")
# Render the full table inside a fixed-height, scrollable container.
display(HTML("<div style='height: 200px; overflow: auto; width: fit-content'>" +
             results.to_html() +
             "</div>"))

Number of data-format pairs in Bio.tools: 6953


Unnamed: 0,tool,formal_parameter,data,data_label,format,format_label
0,https://bio.tools/1000genomes_assembly_converter,http://bioschemas.org/input,http://edamontology.org/data_1276,Nucleic acid features,http://edamontology.org/format_3007,PSL
1,https://bio.tools/1000genomes_data_slicer,http://bioschemas.org/input,http://edamontology.org/data_2012,Sequence coordinates,http://edamontology.org/format_3016,VCF
2,https://bio.tools/1000genomes_variation_pattern_finder,http://bioschemas.org/input,http://edamontology.org/data_3498,Sequence variations,http://edamontology.org/format_3016,VCF
3,https://bio.tools/1000genomes_vcf2ped,http://bioschemas.org/input,http://edamontology.org/data_3498,Sequence variations,http://edamontology.org/format_3016,VCF
4,https://bio.tools/2DProt,http://bioschemas.org/input,http://edamontology.org/data_1460,Protein structure,http://edamontology.org/format_1475,PDB database entry format
5,https://bio.tools/2DProt,http://bioschemas.org/output,http://edamontology.org/data_2992,Protein structure image,http://edamontology.org/format_3547,Image format
6,https://bio.tools/2d-page,http://bioschemas.org/output,http://edamontology.org/data_0897,Protein property,http://edamontology.org/format_2331,HTML
7,https://bio.tools/3SRP,http://bioschemas.org/output,http://edamontology.org/data_2048,Report,http://edamontology.org/format_2331,HTML
8,https://bio.tools/3SRP,http://bioschemas.org/output,http://edamontology.org/data_3112,Gene expression matrix,http://edamontology.org/format_3751,DSV
9,https://bio.tools/3SRP,http://bioschemas.org/input,http://edamontology.org/data_3495,RNA sequence,http://edamontology.org/format_1931,FASTQ-illumina


## List all EDAM data-format pairs in Bio.tools, that are consistent with _is_format_of_ in EDAM

In [48]:
# SPARQL: same data-format listing as the previous query, but a row is kept only
# when ?format is rdfs:subClassOf an OWL restriction with
# owl:onProperty edam:is_format_of and owl:someValuesFrom equal to the very
# ?data the parameter is annotated with.
q= """
PREFIX biotools: <https://bio.tools/ontology/>
PREFIX bsc: <http://bioschemas.org/>
PREFIX bsct: <http://bioschemas.org/types/>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX edam: <http://edamontology.org/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX sc: <http://schema.org/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT
    ?tool
    ?formal_parameter ?data ?data_label ?format ?format_label
WHERE
{
    ?tool ?formal_parameter [sc:additionalType ?data ; sc:encodingFormat ?format] .
    ?data rdfs:label ?data_label .
    ?format
        rdfs:subClassOf [owl:onProperty edam:is_format_of ; owl:someValuesFrom ?data] ;   # Comment this out to see all
        rdfs:label ?format_label .
}
#GROUP BY ?tool   # Uncomment this to count how many tool records (also comment out the 2nd line of selects)
ORDER BY ASC(?tool) ASC(?data) ASC(?format)
"""

In [49]:
# Query the endpoint once and reuse the frame for both the count and the table.
results = remoteQuery(query=q, endpoint=ep_biotools)
print(f"Number of data-format pairs in Bio.tools, that are consistent with 'is format of' in EDAM: {len(results)}")
# Render the full table inside a fixed-height, scrollable container.
display(HTML("<div style='height: 200px; overflow: auto; width: fit-content'>" +
             results.to_html() +
             "</div>"))

Number of data-format pairs in Bio.tools, that are consistent with 'is format of' in EDAM: 1121


Unnamed: 0,tool,formal_parameter,data,data_label,format,format_label
0,https://bio.tools/1000genomes_variation_pattern_finder,http://bioschemas.org/input,http://edamontology.org/data_3498,Sequence variations,http://edamontology.org/format_3016,VCF
1,https://bio.tools/1000genomes_vcf2ped,http://bioschemas.org/input,http://edamontology.org/data_3498,Sequence variations,http://edamontology.org/format_3016,VCF
2,https://bio.tools/3SRP,http://bioschemas.org/output,http://edamontology.org/data_2048,Report,http://edamontology.org/format_2331,HTML
3,https://bio.tools/3drs,http://bioschemas.org/input,http://edamontology.org/data_3870,Trajectory data,http://edamontology.org/format_3910,trr
4,https://bio.tools/AMICI,http://bioschemas.org/input,http://edamontology.org/data_2600,Pathway or network,http://edamontology.org/format_2585,SBML
5,https://bio.tools/Anncolvar,http://bioschemas.org/input,http://edamontology.org/data_3870,Trajectory data,http://edamontology.org/format_3867,Trajectory format (binary)
6,https://bio.tools/Astrocyte_Quantification_and_Analysis_AQuA,http://bioschemas.org/input,http://edamontology.org/data_2968,Image,http://edamontology.org/format_3591,TIFF
7,https://bio.tools/CPSR,http://bioschemas.org/input,http://edamontology.org/data_3498,Sequence variations,http://edamontology.org/format_3016,VCF
8,https://bio.tools/CoBAMP,http://bioschemas.org/input,http://edamontology.org/data_2600,Pathway or network,http://edamontology.org/format_2585,SBML
9,https://bio.tools/Complex_Portal,http://bioschemas.org/output,http://edamontology.org/data_0906,Protein interaction data,http://edamontology.org/format_3158,PSI MI XML (MIF)


Oh no, this already shows something that might be an EDAM way of thinking from the past and needs re-assessment:

![image.png](attachment:baeda59b-0db5-4358-b3b2-4424fdef7123.png)