In [162]:
from SPARQLWrapper import SPARQLWrapper
from rdflib import Graph, RDF, RDFS, DCAT, XSD, URIRef, Literal, Namespace
import requests
from pyshacl import validate

## Datasets metadata retrieval

In [None]:
# First attempt; it fails with a certificate error:
# <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self-signed certificate in certificate chain (_ssl.c:1028)>
def run_query(query):
    """Run ``query`` through the module-level ``sparql`` wrapper and return the converted result.

    The wrapper is switched to the 'json' return format before the query is sent.
    If querying or converting raises, the exception is printed and the function
    falls through, returning ``None``.
    """
    sparql.setReturnFormat('json')
    sparql.setQuery(query)
    try:
        converted = sparql.queryAndConvert()
    except Exception as err:
        print(err)
    else:
        return converted

# Wrapper around the datos.gob.es SPARQL endpoint; read as a global by run_query.
sparql = SPARQLWrapper('https://datos.gob.es/virtuoso/sparql')

In [204]:

# Fetch, as Turtle, the description of every dataset (plus its distributions,
# formats and media types) whose publisher name contains "ministerio de sanidad".
url = 'https://datos.gob.es/virtuoso/sparql'
query = """
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX dcat: <http://www.w3.org/ns/dcat#>
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    
    DESCRIBE ?dataset ?distribution ?format ?media_type
    WHERE
    {
        ?dataset a dcat:Dataset ;
            dct:publisher ?publisher ;
            dcat:distribution ?distribution  .
        ?publisher foaf:name ?publisher_name.
        OPTIONAL {?distribution dct:format ?format }
        OPTIONAL {?distribution dcat:mediaType ?media_type }
        
        FILTER (CONTAINS (lcase(?publisher_name), "ministerio de sanidad"))
    }
"""

headers = {
    "Accept": "text/turtle"
}

# NOTE(security): verify=False turns off TLS certificate verification. It is kept
# because the endpoint's certificate chain failed verification in the first attempt
# (CERTIFICATE_VERIFY_FAILED); prefer passing a CA bundle via `verify=` when available.
# The timeout keeps a stalled endpoint from hanging the kernel forever.
res = requests.get(
    url,
    params={"query": query},
    headers=headers,
    verify=False,
    timeout=60,
)
# Fail here, with the HTTP status, instead of later with an obscure Turtle parse
# error when the endpoint answers with an error page.
res.raise_for_status()



In [206]:
# Build the data graph from the Turtle body of the HTTP response.
graph = Graph().parse(data=res.text, format="turtle")
print(f"Grafo cargado con {len(graph)} tripletas")

# Read the SHACL shapes from disk.
shapes = Graph().parse('input/shapes.ttl', format='ttl')
print(f"{len(shapes)} shapes cargadas")

# Optional dump of the retrieved metadata (left disabled):
#graph.serialize('output/datasets.ttl', format='ttl')


Grafo cargado con 408 tripletas
26 shapes cargadas


## Shapes validation

In [None]:
# Reload the SHACL shapes from the same file the loading cell reads
# ('input/shapes.ttl'), so that both cells validate against one shapes file.
shapes = Graph()
shapes.parse('input/shapes.ttl', format='ttl')

# Validate the whole data graph; abort_on_first=False collects every result
# instead of stopping at the first violation.
conforms, shacl_report_graph, shacl_report_text = validate(
    graph,
    shacl_graph=shapes,
    abort_on_first=False,
    meta_shacl=False,
    advanced=True,
    debug=False
)

print(shacl_report_text)
# Persist the report graph; later cells query `shacl_report_graph` in memory.
shacl_report_graph.serialize('output/shacl_report.ttl', format='ttl')


Validation Report
Conforms: False
Results (46):
Constraint Violation in NodeConstraintComponent (http://www.w3.org/ns/shacl#NodeConstraintComponent):
	Severity: sh:Violation
	Source Shape: [ sh:minCount Literal("1", datatype=xsd:integer) ; sh:node :DistributionShape ; sh:path dcat:distribution ]
	Focus Node: ns2:e05070101-acuerdos-adoptados-en-pleno-del-sistema-nacional-de-salud-sns
	Value Node: ns7:914ee207-6c81-48a5-848e-e8db6acd284a
	Result Path: dcat:distribution
	Message: Value does not conform to Shape :DistributionShape. See details for more information.
	Details:
		Constraint Violation in MinCountConstraintComponent (http://www.w3.org/ns/shacl#MinCountConstraintComponent):
			Severity: sh:Violation
			Source Shape: [ sh:minCount Literal("1", datatype=xsd:integer) ; sh:path dcat:mediaType ]
			Focus Node: ns7:914ee207-6c81-48a5-848e-e8db6acd284a
			Result Path: dcat:mediaType
			Message: Less than 1 values on ns7:914ee207-6c81-48a5-848e-e8db6acd284a->dcat:mediaType
Constraint Vi

## Accessibility of dcat:accessURL and dcat:downloadURL 
Deprecated: this check is superseded by the URL availability step performed below, when the DQV report is generated.

In [None]:
def url_accessibility(graph, predicate, violation_nodes):
    """Probe every URL attached via ``predicate`` to the distributions of ``graph``.

    NOTE: a one-argument function with this same name is defined further down in
    the notebook and replaces this one once that cell has run.

    Args:
        graph: graph providing ``subjects`` / ``objects`` lookups.
        predicate: property whose values are the URLs to probe
            (the notebook passes DCAT.accessURL and DCAT.downloadURL).
        violation_nodes: distributions to skip, given as strings or graph terms.

    Returns:
        dict keyed by URL string. The value is ``[is_200, status_code]`` when a
        response was received, or the string ``'Failed: <error>'`` when the
        request raised.
    """
    # The caller passes plain strings while graph.subjects() yields rdflib terms,
    # and an rdflib term never compares equal to a plain str; normalise both
    # sides to str so that the exclusion actually takes effect.
    skipped = {str(node) for node in violation_nodes}

    res = {}
    for dist in graph.subjects(RDF.type, DCAT.Distribution):
        if str(dist) in skipped:
            continue
        for url in graph.objects(dist, predicate):
            target = str(url)
            try:
                r = requests.head(target, allow_redirects=True, timeout=5)
            except Exception as e:
                # Best effort: keep the failure reason instead of aborting the scan.
                res[target] = f'Failed: {e}'
            else:
                res[target] = [r.status_code == 200, r.status_code]
    return res

# Focus nodes that the SHACL report flags on dcat:accessURL or dcat:downloadURL.
# Both prefixes are declared explicitly so the query does not depend on the
# namespaces that happen to be bound on the report graph.
violation_query = """
    PREFIX sh: <http://www.w3.org/ns/shacl#>
    PREFIX dcat: <http://www.w3.org/ns/dcat#>
    SELECT DISTINCT ?node 
    WHERE {
        ?s sh:focusNode ?node ;
        sh:resultPath ?property
        VALUES ?property { dcat:accessURL dcat:downloadURL }
    }
"""

# Distributions whose URL did not pass SHACL validation are not probed for
# accessibility (presumably the shape requires the value to start with
# 'http(s)://' -- confirm against the shapes file).
# Dedicated names are used so that the module-level `res` (the HTTP response
# consumed by the graph-loading cell) is not overwritten by this cell.
violation_rows = shacl_report_graph.query(violation_query)
violation_nodes = [str(row['node']) for row in violation_rows]

accessURL_acc = url_accessibility(graph, DCAT.accessURL, violation_nodes)
downloadURL_acc = url_accessibility(graph, DCAT.downloadURL, violation_nodes)
print(accessURL_acc)

{'http://www.mscbs.gob.es/organizacion/consejoInterterritorial.do': [True, 200], 'http://www.adobe.com/prodindex/acrobat/readstep.html': [True, 200], 'http://www.mscbs.gob.es/estadEstudios/estadisticas/bancoDatos.htm': [True, 200], 'https://www.sanidad.gob.es/estadEstudios/estadisticas/sisInfSanSNS/ofertaRecursos/centrosSalud/home.htm': [True, 200], 'https://www.sanidad.gob.es/ciudadanos/centros.do': [True, 200], 'http://www.mscbs.gob.es/biblioPublic/biblioDocum/biblioCentral/libros.htm': [True, 200], 'http://www.mscbs.gob.es/biblioPublic/publicaciones.do': [True, 200], 'http://www.mscbs.gob.es/ciudadanos/centros.do': [True, 200], 'http://www.mscbs.gob.es/ciudadanos/prestaciones/centrosServiciosSNS/hospitales/home.htm': [True, 200], 'http://www.mscbs.gob.es/organizacion/consejoInterterri/ordenes.htm': [True, 200], 'https://www.mscbs.gob.es/profesionales/saludPublica/ccayes/alertasActual/nCov/vacunaCovid19.htm': [False, 404], 'https://cnecovid.isciii.es/covid19/': [True, 200], 'https://

## Controlled vocabs for formats and media types
Deprecated: this check is superseded by the interoperability step performed below, when the DQV report is generated.

In [111]:
def is_vocabulary(types_list):
    """Tell, for each type URI, whether it is declared as a vocabulary encoding scheme.

    Every URI in ``types_list`` is dereferenced asking for Turtle, the response is
    parsed, and the URI is marked ``True`` when one of its rdf:type values is
    http://purl.org/dc/dcam/VocabularyEncodingScheme, ``False`` otherwise.

    Returns:
        dict mapping each URI to that boolean (empty for an empty input).
    """
    scheme_class = 'http://purl.org/dc/dcam/VocabularyEncodingScheme'
    request_headers = {'Accept': 'text/turtle'}

    outcome = {}
    for type_uri in types_list:
        # The server returns the whole vocabulary, not just this resource's triples.
        response = requests.get(type_uri, headers=request_headers)
        vocabulary = Graph()
        vocabulary.parse(data=response.text, format='ttl')
        # The declared rdf:type tells whether this is a controlled vocabulary.
        declared = [str(kind) for kind in vocabulary.objects(URIRef(type_uri), RDF.type)]
        outcome[type_uri] = scheme_class in declared
    return outcome


# Template selecting the rdf:type of the resources that distributions point to
# through `{property}`; the doubled braces survive str.format as literal braces.
query = """
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX dcat: <http://www.w3.org/ns/dcat#>

    SELECT DISTINCT ?resource_type 
    WHERE {{
        ?distribution a dcat:Distribution ;
            {property} ?resource .
        ?resource a ?resource_type .
    }}
"""

# Type URIs of the format values first, then of the media-type values.
formats, media_types = (
    [str(hit['resource_type']) for hit in graph.query(query.format(property=prop))]
    for prop in ('dct:format', 'dcat:mediaType')
)

formats_vocab = is_vocabulary(formats)
media_types_vocab = is_vocabulary(media_types)

print(formats_vocab, media_types_vocab)

{'http://purl.org/dc/terms/IMT': True} {}


# Generating report following DQV

## Datasets

Dimension: completeness of dcat:keyword and dcat:theme in Datasets

In [189]:
# Turtle snippet declaring :datasetCompletenessMetric, a dqv:Metric in the
# ldqd:completeness dimension with expected data type xsd:double.
# It is parsed into the DQV report graph before the dataset measurements are added.
dataset_metric = """
    @prefix dct: <http://purl.org/dc/terms/> .
    @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
    @prefix dcat: <http://www.w3.org/ns/dcat#> .
    @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
    @prefix dqv: <http://www.w3.org/ns/dqv#> .
    @prefix skos: <http://www.w3.org/2004/02/skos/core#> .
    @prefix ldqd:    <http://www.w3.org/2016/05/ldqd#> .
    @prefix : <http://example.org/> .

    :datasetCompletenessMetric
        a dqv:Metric ; 	
        skos:definition "Ratio between the metadata properties observed in the dataset and the number of properties expected to be represented (dcat:keyword and dcat:theme)." ;
        dqv:expectedDataType xsd:double ;
        dqv:inDimension ldqd:completeness
        .
"""

In [190]:
def find_violation_nodes(shacl_report_graph, property):
    """List the focus nodes that the SHACL report flags on a given result path.

    Args:
        shacl_report_graph: object exposing ``query(sparql_text)``; each returned
            row must be indexable by ``'node'``.
        property: prefixed name of the result path (``sh:``, ``dcat:`` and
            ``dct:`` are declared in the query), e.g. ``'dcat:keyword'``.

    Returns:
        list of str, one per distinct focus node returned by the query.
    """
    query_template = """
        PREFIX sh: <http://www.w3.org/ns/shacl#>
        PREFIX dcat: <http://www.w3.org/ns/dcat#>
        PREFIX dct: <http://purl.org/dc/terms/>
        SELECT DISTINCT ?node 
        WHERE {{
            ?s sh:focusNode ?node ;
            sh:resultPath {property}
        }}
    """
    sparql_text = query_template.format(property=property)
    return [str(hit['node']) for hit in shacl_report_graph.query(sparql_text)]

In [196]:
# Dataset completeness: every dataset starts with a score of 1.0 (both
# dcat:keyword and dcat:theme present); each property reported as a violation
# in the SHACL report lowers the score by 0.5 and adds an explanatory comment.
DQV = Namespace('http://www.w3.org/ns/dqv#')
EX = Namespace('http://example.org/')
SH = Namespace ('http://www.w3.org/ns/shacl#')
DCT = Namespace('http://purl.org/dc/terms/')

dqv_report = Graph()
dqv_report.parse(data=dataset_metric, format='ttl')

violation_keywords = find_violation_nodes(shacl_report_graph, 'dcat:keyword')
violation_theme = find_violation_nodes(shacl_report_graph, 'dcat:theme')

for uri in graph.subjects(RDF.type, DCAT.Dataset):
    dqv_report.add((uri, RDF.type, DCAT.Dataset))
    # Copy only this dataset's distribution links, instead of re-copying every
    # dcat:distribution triple of the whole graph on each iteration.
    for triple in graph.triples((uri, DCAT.distribution, None)):
        dqv_report.add(triple)

    measurement_uri = URIRef(str(uri).replace('https://datos.gob.es/catalogo/', 'http://example.org/measurement-'))
    dqv_report.add((uri, DQV.hasQualityMeasurement, measurement_uri))
    dqv_report.add((measurement_uri, RDF.type, DQV.QualityMeasurement))
    dqv_report.add((measurement_uri, DQV.computedOn, uri))
    dqv_report.add((measurement_uri, DQV.isMeasurementOf, EX.datasetCompletenessMetric))

    value = 1.0
    # find_violation_nodes returns plain strings, hence the str() on the graph term.
    if str(uri) in violation_keywords:
        value -= 0.5
        dqv_report.add((measurement_uri, RDFS.comment, Literal('Missing keywords (dcat:keyword).')))
    if str(uri) in violation_theme:
        value -= 0.5
        dqv_report.add((measurement_uri, RDFS.comment, Literal('Missing category (dcat:theme).')))

    # The metric declares dqv:expectedDataType xsd:double, so the measured value
    # is typed as xsd:double as well.
    dqv_report.add((measurement_uri, DQV.value, Literal(value, datatype=XSD.double)))

print(len(dqv_report))


105


## Distributions

* First, assess completeness of formats and media types
* Then, assess availability of provided accessURLs and downloadURLs
* Last, check type of format/media types (controlled vocab or not)

### Definition of Metrics and Dimensions

In [197]:
# Turtle definitions of the five distribution-level metrics. Every metric is
# attached to its quality dimension with dqv:inDimension (the property DQV defines
# for linking a metric to a dimension); dqv:inCategory links a dimension to a
# category and must not be used on a metric.
distribution_metrics = """
    @prefix dct: <http://purl.org/dc/terms/> .
    @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
    @prefix dcat: <http://www.w3.org/ns/dcat#> .
    @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
    @prefix dqv: <http://www.w3.org/ns/dqv#> .
    @prefix skos: <http://www.w3.org/2004/02/skos/core#> .
    @prefix ldqd:    <http://www.w3.org/2016/05/ldqd#> .
    @prefix : <http://example.org/> .

    :distributionCompletenessMetric
        a dqv:Metric ;
        skos:definition "Ratio between the metadata properties observed in the distribution and the number of properties expected to be represented (dcat:mediaType and dct:format)." ;
        dqv:expectedDataType xsd:double ;
        dqv:inDimension ldqd:completeness
        .

    :downloadURLAvailabilityMetric
        a dqv:Metric ;
        skos:definition "It checks if dcat:downloadURL value provided is available." ;
        dqv:expectedDataType xsd:boolean ;
        dqv:inDimension ldqd:availability
        .

    :accessURLAvailabilityMetric
        a dqv:Metric ;
        skos:definition "It checks if dcat:accessURL value provided is available." ;
        dqv:expectedDataType xsd:boolean ;
        dqv:inDimension ldqd:availability
        .

    :formatInControlledVocabularyMetric
        a dqv:Metric ;
        skos:definition "It checks if dct:format value is reused from a controlled vocabulary." ;
        dqv:expectedDataType xsd:boolean ;
        dqv:inDimension ldqd:interoperability
        .

    :mediaTypeInControlledVocabularyMetric
        a dqv:Metric ;
        skos:definition "It checks if dcat:mediaType value is reused from a controlled vocabulary." ;
        dqv:expectedDataType xsd:boolean ;
        dqv:inDimension ldqd:interoperability
        .

"""

### Completeness

In [198]:

# Distribution completeness: each distribution starts at 1.0 (both dct:format and
# dcat:mediaType present) and loses 0.5 per property flagged in the SHACL report.
dqv_report.parse(data=distribution_metrics, format='ttl')

violation_format = find_violation_nodes(shacl_report_graph, 'dct:format')
violation_media_type = find_violation_nodes(shacl_report_graph, 'dcat:mediaType')

for uri in graph.subjects(RDF.type, DCAT.Distribution):
    dqv_report.add((uri, RDF.type, DCAT.Distribution))
    measurement_uri = URIRef(str(uri).replace('https://datos.gob.es/catalogo/', 'http://example.org/completeness-measurement-'))
    dqv_report.add((uri, DQV.hasQualityMeasurement, measurement_uri))
    dqv_report.add((measurement_uri, RDF.type, DQV.QualityMeasurement))
    dqv_report.add((measurement_uri, DQV.computedOn, uri))
    dqv_report.add((measurement_uri, DQV.isMeasurementOf, EX.distributionCompletenessMetric))

    value = 1.0
    # find_violation_nodes returns plain strings, hence the str() on the graph term.
    if str(uri) in violation_format:
        value -= 0.5
        dqv_report.add((measurement_uri, RDFS.comment, Literal('Missing format (dct:format).')))
    if str(uri) in violation_media_type:
        value -= 0.5
        dqv_report.add((measurement_uri, RDFS.comment, Literal('Missing media type (dcat:mediaType).')))

    # The metric declares dqv:expectedDataType xsd:double, so the measured value
    # is typed as xsd:double as well.
    dqv_report.add((measurement_uri, DQV.value, Literal(value, datatype=XSD.double)))

print(len(dqv_report))

286


### URL availability

In [199]:

def url_accessibility(url):
    """Return ``True`` only when a HEAD request to ``url`` ends with status 200.

    Redirects are followed and the request uses a 5 second timeout. Any
    exception raised while requesting (including a malformed URL) yields ``False``.
    """
    try:
        response = requests.head(str(url), allow_redirects=True, timeout=5)
    except Exception:
        return False
    return response.status_code == 200

# URL availability: one boolean measurement per distribution and URL property.
violation_nodes_acc = find_violation_nodes(shacl_report_graph, 'dcat:accessURL')
violation_nodes_down = find_violation_nodes(shacl_report_graph, 'dcat:downloadURL')

# (URL property, label used in the measurement URI, metric, distributions whose
# URL failed SHACL validation). find_violation_nodes returns plain strings, so
# the lookups below compare against str(dist_uri).
availability_checks = (
    (DCAT.downloadURL, 'download', EX.downloadURLAvailabilityMetric, set(violation_nodes_down)),
    (DCAT.accessURL, 'access', EX.accessURLAvailabilityMetric, set(violation_nodes_acc)),
)

for dist_uri in graph.subjects(RDF.type, DCAT.Distribution):
    # The distribution itself was already typed in the completeness step.
    for url_property, label, metric, malformed in availability_checks:
        measure_uri = URIRef(str(dist_uri).replace(
            'https://datos.gob.es/catalogo/',
            f'http://example.org/{label}-accessibility-measurement-',
        ))
        for uri in graph.objects(dist_uri, url_property):
            # A URL flagged by the shape is malformed: record it as unavailable
            # without probing it. Previously this verdict was always overwritten
            # by the result of the HTTP request.
            is_accesible = str(dist_uri) not in malformed and url_accessibility(uri)
            dqv_report.add((dist_uri, DQV.hasQualityMeasurement, measure_uri))
            dqv_report.add((measure_uri, RDF.type, DQV.QualityMeasurement))
            dqv_report.add((measure_uri, DQV.computedOn, dist_uri))
            dqv_report.add((measure_uri, DQV.isMeasurementOf, metric))
            dqv_report.add((measure_uri, DQV.value, Literal(is_accesible, datatype=XSD.boolean)))

print(len(dqv_report))

401


### Interoperability

In [200]:
# Verdicts already computed, keyed by URI, so that a vocabulary shared by many
# distributions is downloaded and parsed only once.
_VOCABULARY_CHECK_CACHE = {}


def chekc_if_vocabulary(uri):
    """Tell whether ``uri`` is declared as a vocabulary encoding scheme.

    The URI is dereferenced asking for Turtle and the response is parsed; the
    result is ``True`` when one of the rdf:type values of ``uri`` is
    http://purl.org/dc/dcam/VocabularyEncodingScheme. Results are memoised per URI.

    Args:
        uri: URI (str) of the type to check.

    Returns:
        bool

    Raises:
        Whatever the HTTP request or the Turtle parser raises; failed lookups
        are not cached.
    """
    if uri not in _VOCABULARY_CHECK_CACHE:
        headers = {'Accept': 'text/turtle'}
        # The server returns the whole vocabulary, not only this resource's
        # triples; the timeout keeps a stalled server from blocking the notebook.
        res = requests.get(uri, headers=headers, timeout=30)
        g = Graph()
        g.parse(data=res.text, format='ttl')
        # The declared rdf:type tells whether this is a controlled vocabulary.
        types = {str(o) for o in g.objects(URIRef(uri), RDF.type)}
        _VOCABULARY_CHECK_CACHE[uri] = 'http://purl.org/dc/dcam/VocabularyEncodingScheme' in types
    return _VOCABULARY_CHECK_CACHE[uri]

# Interoperability: one boolean measurement per distribution and property, telling
# whether the type of the format / media-type value is a controlled vocabulary.
# (property, label used in the measurement URI, metric)
interoperability_checks = (
    (URIRef('http://purl.org/dc/terms/format'), 'format', EX.formatInControlledVocabularyMetric),
    (DCAT.mediaType, 'mediatype', EX.mediaTypeInControlledVocabularyMetric),
)

for dist_uri in graph.subjects(RDF.type, DCAT.Distribution):
    # The distribution itself was already typed in the completeness step.
    for value_property, label, metric in interoperability_checks:
        measure_uri = URIRef(str(dist_uri).replace(
            'https://datos.gob.es/catalogo/',
            f'http://example.org/{label}-interop-measurement-',
        ))
        for uri in graph.objects(dist_uri, value_property):
            dqv_report.add((dist_uri, DQV.hasQualityMeasurement, measure_uri))
            dqv_report.add((measure_uri, RDF.type, DQV.QualityMeasurement))
            dqv_report.add((measure_uri, DQV.computedOn, dist_uri))
            dqv_report.add((measure_uri, DQV.isMeasurementOf, metric))
            # First rdf:type of the value, or None when the graph declares none.
            # A value without a declared type cannot be shown to come from a
            # controlled vocabulary, so it is recorded as False instead of
            # raising IndexError.
            value_type = next(iter(graph.objects(uri, RDF.type)), None)
            in_vocabulary = value_type is not None and chekc_if_vocabulary(str(value_type))
            dqv_report.add((measure_uri, DQV.value, Literal(in_vocabulary, datatype=XSD.boolean)))

print(len(dqv_report))
dqv_report.serialize('output/dqv_report.ttl', format='ttl')

516


<Graph identifier=Ne2e9469b9b2f4919b8379a4563654464 (<class 'rdflib.graph.Graph'>)>