# Analyse EuroSciVoc
Get the number of concepts and the depth of the ontology

This notebook requires rdflib, SPARQLWrapper, pandas, and numpy:
* pip install rdflib
* pip install SPARQLWrapper
* pip install pandas
* pip install numpy

Download the latest version of EuroSciVoc from https://op.europa.eu/en/web/eu-vocabularies/dataset/-/resource?uri=http://publications.europa.eu/resource/dataset/euroscivoc and save the RDF file as `EuroSciVoc-skos-ap-eu.rdf` next to this notebook.

In [1]:
from rdflib import Graph
from rdflib.namespace import RDFS
from rdflib import URIRef
import rdflib
import json
from collections import deque
import numpy as np
import pandas as pd

In [2]:
# Path of the EuroSciVoc SKOS dump, relative to the notebook's working directory.
input_file = "EuroSciVoc-skos-ap-eu.rdf"

# Load the whole vocabulary into an in-memory rdflib graph.
# The parse call is left as the last expression so the cell still displays its result.
g = Graph()
g.parse(source=input_file)

<Graph identifier=N2e3b9874b1cf406faeffa427247a8932 (<class 'rdflib.graph.Graph'>)>

In [3]:
# Select every distinct resource typed as skos:Concept.
qres = g.query(
    """PREFIX skos:<http://www.w3.org/2004/02/skos/core#>
       SELECT DISTINCT ?a
       WHERE {
          ?a a skos:Concept .
       }""")

# Map each concept URI (first column of the result row) to the flag True.
topics = {row[0]: True for row in qres}

print("Number of concepts: {}".format(len(topics)))

Number of concepts: 991


In [4]:
# Select every distinct (narrower, broader) pair linked by skos:broader.
qres = g.query(
    """PREFIX skos:<http://www.w3.org/2004/02/skos/core#>
       SELECT DISTINCT ?a ?b
       WHERE {
          ?a skos:broader ?b .
       }""")

# broaders:  concept -> list of its direct broader concepts
# narrowers: concept -> list of its direct narrower concepts
broaders = dict()
narrowers = dict()
for row in qres:
    child, parent = row[0], row[1]
    # setdefault creates the list on first sight of the key, then we append to it.
    broaders.setdefault(child, list()).append(parent)
    narrowers.setdefault(parent, list()).append(child)

In [5]:
# Compute, for every concept in `topics`, its maximum depth in the hierarchy:
# the number of concepts on the longest chain of skos:broader links that starts
# at the concept and climbs upwards. A concept without any broader has depth 1.
#
# The result is written to a fresh `concepts` dict (concept -> int depth) rather
# than back into `topics`, so re-running this cell always yields the same depths
# and every depth is a plain int (never the bool placeholder stored in `topics`).
#
# NOTE(review): the traversal assumes the skos:broader relation is acyclic; a
# cycle would keep the queue growing forever.
unhier = broaders
concepts = dict()
for concept in topics:
    queue = deque()
    max_depth = 1
    queue.append({"t": concept, "d": 1})
    # Breadth-first walk up the broader links, tracking the deepest level reached.
    while len(queue) > 0:
        dequeued = queue.popleft()
        if dequeued["t"] in unhier:
            new_depth = dequeued["d"] + 1
            if new_depth > max_depth:
                max_depth = new_depth
            for broader in unhier[dequeued["t"]]:
                queue.append({"t": broader, "d": new_depth})

    concepts[concept] = max_depth

In [9]:
# One row per concept (the index), with a single 'depth' column,
# ordered from the deepest concept to the shallowest.
list_of_depths = (
    pd.DataFrame.from_dict(concepts, orient='index', columns=['depth'])
    .sort_values('depth', ascending=False)
)

In [10]:
# Show the 20 rows at the top of the depth table.
print("Concepts are ranked by maximum depth")
list_of_depths.head(n=20)

Concepts are ranked by maximum depth


Unnamed: 0,depth
http://data.europa.eu/8mn/euroscivoc/aa115772-abcd-4903-8750-31e7739e733e,7
http://data.europa.eu/8mn/euroscivoc/a77e9ec8-7f65-4307-9030-80be41333538,7
http://data.europa.eu/8mn/euroscivoc/934ce75d-7b9c-43b2-ad73-8b46d1dcbdd3,7
http://data.europa.eu/8mn/euroscivoc/28d7156b-f2c0-4e72-b6a2-72480832ada4,7
http://data.europa.eu/8mn/euroscivoc/818e3157-3df3-448d-80af-c7822c9a58f6,7
http://data.europa.eu/8mn/euroscivoc/89f2e45e-af45-40f5-b25f-15fabc3579b1,7
http://data.europa.eu/8mn/euroscivoc/4924e464-4d3b-43cf-83b5-a3582f8b9b0a,7
http://data.europa.eu/8mn/euroscivoc/3c7bbe7a-db5d-4270-8141-6f8a49fa7a68,6
http://data.europa.eu/8mn/euroscivoc/4a54085a-ff4f-4b91-b572-472262d210d5,6
http://data.europa.eu/8mn/euroscivoc/cc4c298e-7c41-46b4-ac6c-6a2b71d476c5,6


In [8]:
# Report every concept that has more than one direct broader concept.
print("If it does not print anything after this line, it means EuroSciVoc is monohierarchical (a narrower has only one broader)")
multi_parent = {child: parents for child, parents in broaders.items() if len(parents) > 1}
for child, parents in multi_parent.items():
    print("{} has {} parents".format(child, len(parents)))

If it does not print anything after this line, it means EuroSciVoc is monohierarchical (a narrower has only one broader)
