# 1. Generating a SHACL shape from a list of mandatory properties

In [None]:
from jinja2 import Template
# Imported here (and not only in the statistics section further down) so that
# this cell runs on a fresh kernel: ConjunctiveGraph is used at the end of it.
from rdflib import ConjunctiveGraph

# Properties every schema:SoftwareApplication is required to carry at least once.
mandatory_properties = ['schema:featureList',
                        'schema:applicationSubCategory',
                        'edam:has_input',
                        'edam:has_output']

# Jinja2 template of a SHACL node shape: one `sh:property` block with
# `sh:minCount 1` is emitted per entry of `properties`.
shape_template = """
@prefix dash: <http://datashapes.org/dash#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix schema: <http://schema.org/> .
@prefix sh: <http://www.w3.org/ns/shacl#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix edam: <http://edamontology.org/> .

schema:ToolShape
    a sh:NodeShape ;
    sh:targetClass schema:SoftwareApplication ;
    
    {% for prop_name in properties %}
    sh:property [
        sh:path {{prop_name}} ;
        sh:minCount 1 ;
    ] ;
    {% endfor %}
    .
"""

# Render the shape as Turtle text; `shape` is reused by the validation cells.
template = Template(shape_template)
shape = template.render(properties=mandatory_properties)
print(shape)

# Sanity check: the rendered text must parse as Turtle. Print the size of the
# resulting graph.
g = ConjunctiveGraph()
g.parse(data=shape, format='turtle')
print(len(g))

# 2. Defining test data to be validated

In [None]:
# Hand-written SHACL shape (Turtle) targeting schema:SoftwareApplication and
# requiring at least one schema:featureList and one
# schema:applicationSubCategory.
# NOTE(review): the validate() calls visible in this notebook pass `shape`
# (the generated one), not this variable — confirm whether it is still needed.
manual_shape_constraint = """
@prefix dash: <http://datashapes.org/dash#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix schema: <http://schema.org/> .
@prefix sh: <http://www.w3.org/ns/shacl#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

schema:ToolShape
    a sh:NodeShape ;
    sh:targetClass schema:SoftwareApplication ;
    sh:property [
        sh:path schema:featureList ;
        sh:minCount 1 ;
    ] ;
    sh:property [
        sh:path schema:applicationSubCategory ;
        sh:minCount 1 ;
    ] .
"""

# Test data (Turtle): a single resource, <https://bio.tools/PyBDA>, typed
# sc:SoftwareApplication (`sc:` is bound to http://schema.org/ here).
# It carries sc:featureList and sc:applicationSubCategory values but declares
# no edam:has_input or edam:has_output.
data = """
@prefix biotools: <https://bio.tools/ontology/> .
@prefix bsc: <http://bioschemas.org/> .
@prefix dc: <http://purl.org/dc/terms/> .
@prefix doi: <https://doi.org/> .
@prefix edam: <http://edamontology.org/> .
@prefix pmc: <https://www.ncbi.nlm.nih.gov/pmc/> .
@prefix pubmed: <https://www.ncbi.nlm.nih.gov/pubmed/> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix sc: <http://schema.org/> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .


<https://bio.tools/PyBDA> a sc:SoftwareApplication ;
    sc:additionalType "Command-line tool" ;
    sc:applicationSubCategory edam:topic_3391 ;
    sc:citation <https://doi.org/10.1186/s12859-019-3087-8> ;
    sc:description "A command-line tool for analysis of big biological data sets for distributed HPC clusters." ;
    sc:featureList edam:operation_2939,
        edam:operation_2945,
        edam:operation_3432,
        edam:operation_3659 ;
    sc:license "GPL-3.0" ;
    sc:name "PyBDA" ;
    sc:url "https://pybda.readthedocs.io/en/latest/#" .
"""

# 3. Validating the shape on the graph data

In [None]:
from pyshacl import validate
from IPython.display import display, Markdown, Latex

# Validate the in-memory test data (`data`, Turtle text) against the generated
# SHACL shape (`shape`, Turtle text). validate() returns a 3-tuple:
# (conforms flag, report graph, report as text).
conforms, results_graph, results_text = validate(
    data_graph=data,
    data_graph_format='turtle',
    shacl_graph=shape,
    shacl_graph_format='turtle',
    ont_graph=None,
    inference='rdfs',
    abort_on_error=False,
    meta_shacl=False,
    debug=True,
)

if conforms:
    print('Everything is fine')
else:
    report_ttl = results_graph.serialize(format='turtle')
    # rdflib < 6 returns bytes from serialize(); rdflib >= 6 returns str, on
    # which .decode() would raise AttributeError. Handle both.
    if isinstance(report_ttl, bytes):
        report_ttl = report_ttl.decode()
    print(report_ttl)

# 4. Producing an explanation for the validation graph

In [None]:
# SPARQL query over a SHACL validation report graph: for every validation
# result, return the focus node and the sh:path of the shape that produced it.
# The PREFIX declarations make the query self-contained, so it does not depend
# on which prefixes happen to be bound on the graph it is run against.
report_query = """
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX sh: <http://www.w3.org/ns/shacl#>

    SELECT ?node ?path WHERE {
        ?v rdf:type sh:ValidationReport ;
           sh:result ?r .
        ?r sh:focusNode ?node ;
           sh:sourceShape ?s .
        ?s sh:path ?path .   
    }
"""

# Render one human-readable line per (tool, missing property) pair.
results = results_graph.query(report_query)
for row in results:
    display(Markdown('The tool `{}` should be fixed, it is missing information for field {}'.format(str(row['node']), str(row['path']))))

# 5. Launching the validation on a large dataset

In [None]:
# Same validation as for the test data, but the data graph is now read from
# the file 'biotools.bioschema.ttl' instead of an in-memory string.
conforms, results_graph, results_text = validate(
    data_graph='biotools.bioschema.ttl',
    data_graph_format='turtle',
    shacl_graph=shape,
    shacl_graph_format='turtle',
    ont_graph=None,
    inference='rdfs',
    abort_on_error=False,
    meta_shacl=False,
    debug=True,
)

# Report every (focus node, path) pair returned by `report_query`.
results = results_graph.query(report_query)
for row in results:
    tool_iri = str(row['node'])
    missing_path = str(row['path'])
    message = 'The tool {} should be fixed, it is missing information for field {}'.format(tool_iri, missing_path)
    display(Markdown(message))

# 6. Basic statistics

In [3]:
%%time
import pandas as pd
from rdflib import ConjunctiveGraph

# Parse the Turtle file into an in-memory rdflib graph and print its length.
# The path is relative to the notebook's working directory.
# The recorded run of this cell printed 205719 and took about 13 s wall time.
g_dump = ConjunctiveGraph()
g_dump.parse("./biotools.bioschema.ttl", format='turtle')
print(len(g_dump))

205719
CPU times: user 12.9 s, sys: 71.8 ms, total: 13 s
Wall time: 13 s


In [None]:
%%time
# Count the distinct focus nodes that appear in at least one validation
# result of the report graph. The PREFIX declarations make the query
# self-contained, so it does not depend on which prefixes happen to be bound
# on the graph it is run against.
count_query = """
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX sh: <http://www.w3.org/ns/shacl#>

    SELECT (COUNT(DISTINCT ?node) AS ?to_be_fixed_entries) WHERE {
        ?v rdf:type sh:ValidationReport ;
           sh:result ?r .
        ?r sh:focusNode ?node ;
           sh:sourceShape ?s .
        ?s sh:path ?path .   
    }
"""

# The aggregate query yields a single row holding the count.
results = results_graph.query(count_query)
for row in results:
    display(Markdown('**{}** tools should be fixed.'.format(str(row['to_be_fixed_entries']))))



In [7]:
#%%time 
# Prefix map passed to the query as initial namespace bindings.
# NOTE(review): the query below uses no prefixed names, so these bindings are
# not exercised here. The "dc" and "edam" IRIs also differ from the @prefix
# declarations used in the test data earlier in this notebook — confirm which
# ones are intended before relying on them.
ns = {"nb": "http://bise-eu.info/core-ontology#",
      "dc": "http://dcterms/",
      "p-plan": "http://purl.org/net/p-plan#",
      "edam": "http://purl.obolibrary.org/obo/edam#",
      "sh": "http://www.w3.org/ns/shacl#"}

# Number of triples per predicate in the dump, in ascending order of count.
count_query = """
    SELECT  ?p (COUNT(?s) AS ?count ) { ?s ?p ?o } GROUP BY ?p ORDER BY ?count
"""

results = g_dump.query(count_query, initNs=ns)

# Build all records first and create the DataFrame once. Counts are stored as
# integers (not strings) so the column can be sorted and aggregated numerically.
res_data = [
    {'Property': str(row['p']), 'Count': int(row['count'])}
    for row in results
]

df = pd.DataFrame(res_data)
df

Unnamed: 0,Count,Property
0,3343,http://edamontology.org/has_output
1,3493,http://edamontology.org/has_input
2,4806,http://schema.org/version
3,5270,http://schema.org/license
4,14767,http://schema.org/url
5,14767,http://schema.org/description
6,14767,http://schema.org/name
7,15877,http://schema.org/additionalType
8,16000,http://schema.org/author
9,22460,http://schema.org/citation
