# Analyse the UNESCO Thesaurus
Get the number of concepts and the depth of the ontology

This notebook requires rdflib, SPARQLWrapper, pandas, and numpy:
* pip install rdflib
* pip install SPARQLWrapper
* pip install pandas
* pip install numpy

Download the latest version of the UNESCO Thesaurus from http://vocabularies.unesco.org/browser/thesaurus/en/ and save it as `unesco-thesaurus.rdf` in the notebook's working directory.

In [1]:
import json
from collections import deque

import numpy as np
import pandas as pd
import rdflib
from rdflib import Graph
from rdflib import URIRef
from rdflib.namespace import RDFS

In [2]:
# Relative path of the RDF file that holds the thesaurus.
input_file = "unesco-thesaurus.rdf"
g = Graph()
# Kept as the cell's last expression so the notebook echoes the parsed graph.
g.parse(source=input_file)

<Graph identifier=N11d2a993a4474c3e80029abb53e72462 (<class 'rdflib.graph.Graph'>)>

In [3]:
# Select every resource typed as skos:Concept.
# NOTE(review): the query does not declare the skos: prefix itself; presumably
# it is resolved from the namespaces bound on the graph - confirm.
qres = g.query(
    """SELECT DISTINCT ?a
       WHERE {
          ?a a skos:Concept .
       }""")

# Map each concept URI to its starting depth. The depth cell further down adds
# to this value, so it is seeded with the integer 1 rather than the boolean
# True: True + 1 only works because bool subclasses int, and concepts without a
# broader term would otherwise keep a bool in what is meant to be a depth.
topics = {row[0]: 1 for row in qres}

print("Number of concepts: {}".format(len(topics)))

Number of concepts: 4436


In [4]:
# Fetch every (concept, broader term) pair, keeping only subjects that are
# typed as skos:Concept.
qres = g.query(
    """SELECT DISTINCT ?a ?b
       WHERE {
          ?a skos:broader ?b .
          ?a a skos:Concept
       }""")

# broaders : concept -> list of its direct broader terms
# narrowers: broader term -> list of the concepts directly beneath it
broaders = dict()
narrowers = dict()
for row in qres:
    child, parent = row[0], row[1]
    broaders.setdefault(child, []).append(parent)
    narrowers.setdefault(parent, []).append(child)

In [7]:
def _max_broader_depth(node, broader_of, memo, on_path):
    """Return the number of nodes on the longest broader-chain starting at ``node``.

    A node without any broader term has depth 1; otherwise its depth is one more
    than the deepest of its direct broader terms.

    Parameters:
        node: the term whose depth is wanted.
        broader_of: mapping term -> list of its direct broader terms.
        memo: dict of already computed depths, shared across calls so every
            term is resolved only once.
        on_path: set of terms on the chain currently being followed; used to
            detect cycles.

    Returns:
        The depth of ``node`` as an int (>= 1).

    Note: recursion depth equals the length of the chain being followed.
    """
    if node in memo:
        return memo[node]
    on_path.add(node)
    depth = 1
    for parent in broader_of.get(node, ()):
        if parent in on_path:
            # The broader links loop back onto the current chain. Following the
            # link would never terminate, so it is ignored; depths inside such a
            # loop are therefore a best-effort lower bound.
            continue
        depth = max(depth, 1 + _max_broader_depth(parent, broader_of, memo, on_path))
    on_path.discard(node)
    memo[node] = depth
    return depth


unhier = broaders
# `concepts` is the same dict object as `topics`; the depths are written into it.
concepts = topics
_depth_memo = {}
# Every depth is derived from the hierarchy alone (base depth 1), never from the
# value already stored for the concept, so re-running this cell gives the same
# result instead of stacking new depths on top of old ones.
for concept in list(concepts):
    concepts[concept] = _max_broader_depth(concept, unhier, _depth_memo, set())

In [9]:
# One row per concept (the dict keys become the index) with a single 'depth'
# column holding the value stored for that concept.
# pandas is imported as `pd` in the notebook's import cell.
list_of_depths = pd.DataFrame.from_dict(concepts, orient='index', columns=['depth'])

In [10]:
# Order the concepts from deepest to shallowest, then show the top 20.
list_of_depths = list_of_depths.sort_values(by='depth', ascending=False)
print("Concepts are ranked by maximum depth")
list_of_depths.head(n=20)

Concepts are ranked by maximum depth


Unnamed: 0,depth
http://vocabularies.unesco.org/thesaurus/concept3676,6
http://vocabularies.unesco.org/thesaurus/concept2016,6
http://vocabularies.unesco.org/thesaurus/concept7488,6
http://vocabularies.unesco.org/thesaurus/concept3525,6
http://vocabularies.unesco.org/thesaurus/concept2052,6
http://vocabularies.unesco.org/thesaurus/concept2226,5
http://vocabularies.unesco.org/thesaurus/concept6672,5
http://vocabularies.unesco.org/thesaurus/concept10893,5
http://vocabularies.unesco.org/thesaurus/concept2206,5
http://vocabularies.unesco.org/thesaurus/concept6836,5


In [6]:
print("If results are showed here, it means that one concept can have more than one parent: polyhierarchical")
# List every concept that has more than one direct broader term.
for term, parents in broaders.items():
    if len(parents) <= 1:
        continue
    print(term, parents)

If results are showed here, it means that one concept can have more than one parent: polyhierarchical
http://vocabularies.unesco.org/thesaurus/concept17085 [rdflib.term.URIRef('http://vocabularies.unesco.org/thesaurus/concept17034'), rdflib.term.URIRef('http://vocabularies.unesco.org/thesaurus/concept767')]
http://vocabularies.unesco.org/thesaurus/concept7367 [rdflib.term.URIRef('http://vocabularies.unesco.org/thesaurus/concept935'), rdflib.term.URIRef('http://vocabularies.unesco.org/thesaurus/concept942'), rdflib.term.URIRef('http://vocabularies.unesco.org/thesaurus/concept934'), rdflib.term.URIRef('http://vocabularies.unesco.org/thesaurus/concept3557'), rdflib.term.URIRef('http://vocabularies.unesco.org/thesaurus/concept956')]
http://vocabularies.unesco.org/thesaurus/concept848 [rdflib.term.URIRef('http://vocabularies.unesco.org/thesaurus/concept856'), rdflib.term.URIRef('http://vocabularies.unesco.org/thesaurus/concept17034')]
http://vocabularies.unesco.org/thesaurus/concept918 [rdf