In [67]:
from SPARQLWrapper import SPARQLWrapper
from rdflib import Graph, RDF, DCAT
import requests
from pyshacl import validate

In [None]:
# First attempt: this fails with a certificate error:
# <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self-signed certificate in certificate chain (_ssl.c:1028)>
def run_query(query):
    """Send `query` through the module-level `sparql` client.

    The client is switched to the 'json' return format before the query is
    set. Returns whatever `sparql.queryAndConvert()` produces; if anything
    raises while querying, the exception is printed and None is returned.
    """
    sparql.setReturnFormat('json')
    sparql.setQuery(query)
    try:
        return sparql.queryAndConvert()
    except Exception as err:
        # Best-effort: report the failure and fall through to an explicit None.
        print(err)
        return None

# SPARQL client for the datos.gob.es Virtuoso endpoint; run_query() reads this global.
sparql = SPARQLWrapper('https://datos.gob.es/virtuoso/sparql')

In [None]:

# SPARQL endpoint queried over plain HTTP GET.
url = 'https://datos.gob.es/virtuoso/sparql'

# DESCRIBE every dataset (together with its distributions and their
# format / media-type resources) whose publisher name contains
# "ministerio de sanidad".
query = """
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX dcat: <http://www.w3.org/ns/dcat#>
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    
    DESCRIBE ?dataset ?distribution ?format ?media_type
    WHERE
    {
        ?dataset a dcat:Dataset ;
            dct:publisher ?publisher ;
            dcat:distribution ?distribution  .
        ?publisher foaf:name ?publisher_name.
        OPTIONAL {?distribution dct:format ?format }
        OPTIONAL {?distribution dcat:mediaType ?media_type }
        
        FILTER (CONTAINS (lcase(?publisher_name), "ministerio de sanidad"))
    }
"""

# Ask the endpoint to answer in Turtle.
headers = {
    "Accept": "text/turtle"
}

# NOTE(security): verify=False disables TLS certificate verification for this
# request. It is kept as-is, presumably to get past the CERTIFICATE_VERIFY_FAILED
# error noted earlier in this notebook. Prefer pointing `verify` at a CA bundle
# that contains the missing certificate.
# The timeout stops the cell from hanging forever on an unresponsive endpoint.
res = requests.get(
    url,
    params={"query": query},
    headers=headers,
    verify=False,
    timeout=60,
)
# Fail here on an HTTP error instead of handing an error page to the Turtle parser.
res.raise_for_status()



In [None]:
# Build the data graph from the Turtle text of the HTTP response.
graph = Graph().parse(data=res.text, format="turtle")
print(f"Grafo cargado con {len(graph)} tripletas")

# Load the SHACL shapes from the local file.
# NOTE(review): len(shapes) is the same measure reported as triples above,
# not necessarily the number of shapes - confirm the wording of the message.
shapes = Graph().parse('shapes.ttl', format='ttl')
print(f"{len(shapes)} shapes cargadas")

# Write the data graph to disk as Turtle. As the last expression of the cell,
# the return value of serialize() is echoed in the notebook output.
graph.serialize('output/datasets.ttl', format='ttl')


Grafo cargado con 408 tripletas
8 shapes cargadas


<Graph identifier=N7549cb07b8e448c4863a1b1bb1719556 (<class 'rdflib.graph.Graph'>)>

In [43]:
# Reload the SHACL shapes so this cell picks up edits to shapes.ttl.
shapes = Graph().parse('shapes.ttl', format='ttl')

# Validate the data graph against the shapes.
# Optional: add inference='rdfs' to the call (other values: "owlrl", "none").
conforms, report_graph, report_text = validate(
    graph,
    shacl_graph=shapes,
    abort_on_first=False, meta_shacl=False,
    advanced=True, debug=False,
)

# Human-readable validation report.
print(report_text)



Validation Report
Conforms: False
Results (46):
Constraint Violation in NodeConstraintComponent (http://www.w3.org/ns/shacl#NodeConstraintComponent):
	Severity: sh:Violation
	Source Shape: [ sh:minCount Literal("1", datatype=xsd:integer) ; sh:node :DistributionShape ; sh:path dcat:distribution ]
	Focus Node: ns2:e05070101-acuerdos-adoptados-en-pleno-del-sistema-nacional-de-salud-sns
	Value Node: ns7:914ee207-6c81-48a5-848e-e8db6acd284a
	Result Path: dcat:distribution
	Message: Value does not conform to Shape :DistributionShape. See details for more information.
	Details:
		Constraint Violation in MinCountConstraintComponent (http://www.w3.org/ns/shacl#MinCountConstraintComponent):
			Severity: sh:Violation
			Source Shape: [ sh:minCount Literal("1", datatype=xsd:integer) ; sh:path dcat:mediaType ]
			Focus Node: ns7:914ee207-6c81-48a5-848e-e8db6acd284a
			Result Path: dcat:mediaType
			Message: Less than 1 values on ns7:914ee207-6c81-48a5-848e-e8db6acd284a->dcat:mediaType
Constraint Vi

In [81]:
def url_accessibility(graph, predicate, violation_nodes):
    """Probe every URL attached to the graph's distributions via `predicate`.

    Args:
        graph: object exposing `subjects(predicate, object)` and
            `objects(subject, predicate)`; nodes typed `DCAT.Distribution`
            are inspected.
        predicate: property whose values are the URLs to check.
        violation_nodes: iterable of distribution identifiers to skip.
            Plain strings and rdflib terms are both accepted.

    Returns:
        dict mapping each checked URL (as `str`) to 'OK' for a 200 response,
        'Error <status>' for any other status code, or 'Failed: <exception>'
        when the request itself raised.
    """
    # rdflib terms compare unequal to a plain `str` carrying the same text, so
    # `dist not in violation_nodes` never matched a list of strings. Comparing
    # on the string form makes the skip list work for either representation.
    skipped = {str(node) for node in violation_nodes}

    results = {}
    for dist in graph.subjects(RDF.type, DCAT.Distribution):
        if str(dist) in skipped:
            continue
        for url in graph.objects(dist, predicate):
            target = str(url)
            try:
                # HEAD keeps the probe cheap; redirects are followed to the final target.
                response = requests.head(target, allow_redirects=True, timeout=5)
            except Exception as exc:
                # Best-effort check: record the failure and keep going.
                results[target] = f'Failed: {exc}'
            else:
                if response.status_code == 200:
                    results[target] = 'OK'
                else:
                    results[target] = f'Error {response.status_code}'
    return results

# Select the focus nodes of validation results whose result path is
# dcat:accessURL or dcat:downloadURL.
# The dcat prefix is declared explicitly so the query does not depend on which
# namespaces happen to be bound on the report graph.
# NOTE(review): this cell rebinds `query` and `res`, which the fetch cell also
# uses for the DESCRIBE query and the HTTP response. Re-run that cell before
# re-running the cell that parses `res.text`.
query = """
    PREFIX sh: <http://www.w3.org/ns/shacl#>
    PREFIX dcat: <http://www.w3.org/ns/dcat#>
    SELECT DISTINCT ?node 
    WHERE {
        ?s sh:focusNode ?node ;
        sh:resultPath ?property
        VALUES ?property { dcat:accessURL dcat:downloadURL }
    }
"""

res = list(report_graph.query(query))
# String form of each selected focus node; passed below as the nodes to skip.
violation_nodes = [str(row['node']) for row in res]

# Run the accessibility check once per URL property.
accessURL_acc = url_accessibility(graph, DCAT.accessURL, violation_nodes)
downloadURL_acc = url_accessibility(graph, DCAT.downloadURL, violation_nodes)


In [82]:
# Display the accessibility results collected for dcat:accessURL.
accessURL_acc
# Uncomment the line below (and make it the last expression) to show the dcat:downloadURL results instead.
#downloadURL_acc

{'http://www.mscbs.gob.es/organizacion/consejoInterterritorial.do': 'OK',
 'http://www.adobe.com/prodindex/acrobat/readstep.html': 'OK',
 'http://www.mscbs.gob.es/estadEstudios/estadisticas/bancoDatos.htm': 'OK',
 'https://www.sanidad.gob.es/estadEstudios/estadisticas/sisInfSanSNS/ofertaRecursos/centrosSalud/home.htm': 'OK',
 'https://www.sanidad.gob.es/ciudadanos/centros.do': 'OK',
 'http://www.mscbs.gob.es/biblioPublic/biblioDocum/biblioCentral/libros.htm': 'OK',
 'http://www.mscbs.gob.es/biblioPublic/publicaciones.do': 'OK',
 'http://www.mscbs.gob.es/ciudadanos/centros.do': 'OK',
 'http://www.mscbs.gob.es/ciudadanos/prestaciones/centrosServiciosSNS/hospitales/home.htm': 'OK',
 'http://www.mscbs.gob.es/organizacion/consejoInterterri/ordenes.htm': 'OK',
 'https://www.mscbs.gob.es/profesionales/saludPublica/ccayes/alertasActual/nCov/vacunaCovid19.htm': 'Error 404',
 'https://cnecovid.isciii.es/covid19/': 'OK',
 'https://portalcne.isciii.es/enecovid19/': 'OK',
 'https://cnecovid.isciii.