# Analyse EuroVoc
Get the number of concepts and the depth of the ontology

This notebook requires the following packages: rdflib, SPARQLWrapper, pandas, and numpy
* pip install rdflib
* pip install SPARQLWrapper
* pip install pandas
* pip install numpy

Download the latest version of EuroVoc from https://op.europa.eu/en/web/eu-vocabularies/dataset/-/resource?uri=http://publications.europa.eu/resource/dataset/eurovoc

In [1]:
from rdflib import Graph
from rdflib.namespace import RDFS
from rdflib import URIRef
import rdflib
import json
from collections import deque
import numpy as np
import pandas as pd

In [2]:
# Local path of the EuroVoc SKOS dump (see the download link above).
input_file = "eurovoc-skos-ap-eu.rdf"

# Read the dump into an in-memory rdflib graph. The parse call is kept as the
# last expression so the cell still echoes its return value.
g = Graph()
g.parse(source=input_file)

<Graph identifier=N54875dd346774bdcb7eaa55d21c471bb (<class 'rdflib.graph.Graph'>)>

In [5]:
# Select every resource that is explicitly typed as a skos:Concept.
concept_query = """PREFIX skos:<http://www.w3.org/2004/02/skos/core#>
       SELECT DISTINCT ?a
       WHERE {
          ?a a skos:Concept .
       }"""
qres = g.query(concept_query)

# Keyed by concept URI; the value is only a placeholder flag.
topics = {row[0]: True for row in qres}

print("Number of concepts: {}".format(len(topics)))

Number of concepts: 7339


In [7]:
# Fetch every (narrower concept, broader concept) pair linked by skos:broader.
broader_query = """PREFIX skos:<http://www.w3.org/2004/02/skos/core#>
       SELECT DISTINCT ?a ?b
       WHERE {
          ?a skos:broader ?b .
       }"""
qres = g.query(broader_query)

# broaders:  concept -> list of its direct broader concepts (parents)
# narrowers: concept -> list of its direct narrower concepts (children)
broaders = dict()
narrowers = dict()
for row in qres:
    child, parent = row[0], row[1]
    broaders.setdefault(child, []).append(parent)
    narrowers.setdefault(parent, []).append(child)

In [8]:
# Compute the maximum depth of every concept: the number of concepts on the
# longest chain of skos:broader links that starts at the concept and ends at a
# concept with no broader (which therefore has depth 1).
#
# The result is written to a fresh `concepts` dict instead of overwriting the
# values stored in `topics`, so this cell can be re-run without the depths
# growing on every execution.
unhier = broaders  # concept -> list of direct broader concepts


def _longest_broader_chain(concept, parents_of, cache, on_path):
    """Return the number of concepts on the longest broader-chain from `concept`.

    Parameters
    ----------
    concept : hashable
        Concept whose depth is wanted.
    parents_of : dict
        Maps a concept to the list of its direct broader concepts; concepts
        that are absent are treated as having no broader concept.
    cache : dict
        Depths that were already computed; filled in as a side effect so each
        concept is only expanded once instead of once per path reaching it.
    on_path : set
        Concepts on the chain currently being explored, used to detect cycles.

    Returns
    -------
    int
        1 for a concept without broader concepts, otherwise 1 + the largest
        depth among its broader concepts.
    """
    if concept in cache:
        return cache[concept]
    on_path.add(concept)
    depth = 1
    for parent in parents_of.get(concept, ()):
        if parent in on_path:
            # A skos:broader cycle: ignore the back-edge so the traversal
            # terminates (depths inside a cycle are then a lower bound).
            continue
        depth = max(depth, 1 + _longest_broader_chain(parent, parents_of, cache, on_path))
    on_path.remove(concept)
    cache[concept] = depth
    return depth


_depth_cache = dict()
# Same key order as `topics`, so downstream tables list concepts as before.
concepts = {
    concept: _longest_broader_chain(concept, unhier, _depth_cache, set())
    for concept in topics
}

In [9]:
# One row per concept (index = concept URI) with its maximum depth,
# ordered from the deepest concept to the shallowest.
list_of_depths = (
    pd.DataFrame.from_dict(concepts, orient='index', columns=['depth'])
    .sort_values(by='depth', ascending=False)
)

In [10]:
# Show the 20 deepest concepts; the frame is already sorted by depth.
print("Concepts are ranked by maximum depth")
list_of_depths.head(n=20)

Concepts are ranked by maximum depth


Unnamed: 0,depth
http://eurovoc.europa.eu/199,6
http://eurovoc.europa.eu/5318,6
http://eurovoc.europa.eu/4355,6
http://eurovoc.europa.eu/4247,6
http://eurovoc.europa.eu/4053,6
http://eurovoc.europa.eu/198,6
http://eurovoc.europa.eu/4799,5
http://eurovoc.europa.eu/6370,5
http://eurovoc.europa.eu/6273,5
http://eurovoc.europa.eu/5022,5


In [14]:
# Report every concept that has more than one direct broader concept.
for child, parents in broaders.items():
    parent_count = len(parents)
    if parent_count > 1:
        print("{} has {} parents".format(child, parent_count))

http://eurovoc.europa.eu/1236 has 4 parents
http://eurovoc.europa.eu/2084 has 6 parents
http://eurovoc.europa.eu/4092 has 4 parents
http://eurovoc.europa.eu/5693 has 2 parents
http://eurovoc.europa.eu/5563 has 4 parents
http://eurovoc.europa.eu/690 has 6 parents
http://eurovoc.europa.eu/4816 has 6 parents
http://eurovoc.europa.eu/4246 has 2 parents
http://eurovoc.europa.eu/2037 has 4 parents
http://eurovoc.europa.eu/5652 has 3 parents
http://eurovoc.europa.eu/2543 has 8 parents
http://eurovoc.europa.eu/1019 has 5 parents
http://eurovoc.europa.eu/4466 has 4 parents
http://eurovoc.europa.eu/2058 has 4 parents
http://eurovoc.europa.eu/4620 has 2 parents
http://eurovoc.europa.eu/5965 has 6 parents
http://eurovoc.europa.eu/205 has 3 parents
http://eurovoc.europa.eu/4862 has 3 parents
http://eurovoc.europa.eu/249 has 4 parents
http://eurovoc.europa.eu/4841 has 7 parents
http://eurovoc.europa.eu/314 has 6 parents
http://eurovoc.europa.eu/8373 has 2 parents
http://eurovoc.europa.eu/1220 has 2 