# Analyse AGROVOC Thesaurus
Count the concepts in the AGROVOC thesaurus and compute, for each concept, the depth of its chain of broader concepts.

This notebook requires rdflib, SPARQLWrapper, pandas, and numpy. Install them with:
* pip install rdflib
* pip install SPARQLWrapper
* pip install pandas
* pip install numpy

In [1]:
from rdflib import Graph
from rdflib.namespace import RDFS
from rdflib import URIRef
import rdflib
import json
from collections import deque
import numpy as np
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd

In [2]:
# SPARQL endpoint that every query in this notebook is sent to.
SPARQL_ENDPOINT = "https://agrovoc.fao.org/sparql"

In [3]:
# Single client bound to SPARQL_ENDPOINT; the query cells below reuse it
# and replace its query text with setQuery() before each request.
sparql = SPARQLWrapper(SPARQL_ENDPOINT)

In [15]:
# Ask the endpoint for every resource typed as skos:Concept.
sparql.setQuery("""
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT * WHERE {
  ?topic rdf:type skos:Concept .
}
""")

sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Map each concept URI to True; using the URI as a dict key collapses
# duplicate bindings, so len(topics) is the number of distinct concepts.
topics = {
    binding["topic"]["value"]: True
    for binding in results["results"]["bindings"]
}

In [5]:
# topics is keyed by concept URI, so its length is the number of distinct concepts.
print("Number of concepts: {}".format(len(topics)))

Number of concepts: 39276


In [6]:
# Fetch every skos:narrower edge (?father is the broader concept, ?son the narrower one).
sparql.setQuery("""
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT * WHERE {
  ?father skos:narrower ?son .
}
""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Two adjacency maps built from the same edge list:
#   broaders[uri]  -> list of URIs directly above `uri`
#   narrowers[uri] -> list of URIs directly below `uri`
broaders = {}
narrowers = {}

for binding in results["results"]["bindings"]:
    child = binding["son"]["value"]
    parent = binding["father"]["value"]
    broaders.setdefault(child, []).append(parent)
    narrowers.setdefault(parent, []).append(child)


In [16]:
# Depth reported for a concept whose chain of broader concepts never ends,
# i.e. the hierarchy contains a loop above it. Also bounds the work per concept.
MAX_DEPTH = 100


def broader_chain_depth(concept, broader_map, depth_cap=MAX_DEPTH):
    """Return how many levels the longest chain of broader concepts spans.

    The concept itself counts as level 1, so a concept with no broader
    concept has depth 1 and each further step up the hierarchy adds 1.

    Parameters
    ----------
    concept : str
        URI of the concept to start from.
    broader_map : dict
        Maps a concept URI to the list of its directly broader concept URIs.
        Concepts without broader concepts may simply be absent.
    depth_cap : int
        Upper bound on the returned depth. Reaching it means the walk kept
        finding broader concepts, which happens when the hierarchy loops.

    Returns
    -------
    int
        The depth, never larger than ``depth_cap``.
    """
    depth = 1
    # All concepts reachable in exactly `depth - 1` steps. Using a set visits
    # each concept once per level instead of once per path leading to it.
    frontier = {concept}
    while depth < depth_cap:
        parents = set()
        for node in frontier:
            parents.update(broader_map.get(node, ()))
        if not parents:
            break
        frontier = parents
        depth += 1
    return depth


# Alias kept so the name `unhier` remains available after this cell runs.
unhier = broaders

# Build a NEW dict instead of writing depths back into `topics`: re-running
# this cell then always yields the same result, and every depth is a plain int.
concepts = {concept: broader_chain_depth(concept, unhier) for concept in topics}

In [17]:
# One row per concept URI (index) with its computed depth, deepest first.
list_of_depths = (
    pd.DataFrame.from_dict(concepts, orient='index', columns=['depth'])
    .sort_values('depth', ascending=False)
)

In [18]:
# The depth computation stops at 100 to escape loops in the hierarchy,
# so a depth of exactly 100 flags a loop rather than a real depth.
print("REMEMBER: if you find depth == 100, it means the algorithm found a loop and it was forced to break.")
# list_of_depths is sorted by depth in descending order, so these are the ten deepest concepts.
list_of_depths.head(10)

REMEMBER: if you find depth == 100, it means the algorithm found a loop and it was forced to break.


Unnamed: 0,depth
http://aims.fao.org/aos/agrovoc/c_c91f415e,100
http://aims.fao.org/aos/agrovoc/c_fb856911,100
http://aims.fao.org/aos/agrovoc/c_8313dd7c,14
http://aims.fao.org/aos/agrovoc/c_0b6d37e4,14
http://aims.fao.org/aos/agrovoc/c_b1f84777,14
http://aims.fao.org/aos/agrovoc/c_2d9bc13d,14
http://aims.fao.org/aos/agrovoc/c_46b0b671,14
http://aims.fao.org/aos/agrovoc/c_e9077677,14
http://aims.fao.org/aos/agrovoc/c_f72596bd,14
http://aims.fao.org/aos/agrovoc/c_d5b64061,14
