# Analyse Art and Architecture Thesaurus
Get the number of concepts and the depth of the ontology

This notebook requires the following packages: rdflib, SPARQLWrapper, pandas, numpy, and tqdm. Install any that are missing:
* pip install rdflib
* pip install SPARQLWrapper
* pip install pandas
* pip install numpy
* pip install tqdm

Please check that the SPARQL Endpoint is running: http://vocab.getty.edu/sparql

In [1]:
import json
import time
from collections import defaultdict
from collections import deque
from time import gmtime, strftime
from urllib.error import HTTPError

import numpy as np
import rdflib
from rdflib import Graph
from rdflib import URIRef
from rdflib.namespace import RDFS
from SPARQLWrapper import SPARQLWrapper, JSON
from tqdm import tqdm

In [2]:
def query_wrapper(sparql, query, retry_delay=20, max_retries=None):
    """Run ``query`` through ``sparql`` and return the converted JSON result.

    The request is repeated whenever it fails with ``urllib.error.HTTPError``,
    pausing ``retry_delay`` seconds between attempts.

    Args:
        sparql: object exposing ``setQuery``, ``setReturnFormat`` and
            ``query`` (a ``SPARQLWrapper`` instance in this notebook).
        query: SPARQL query text.
        retry_delay: seconds to wait after a failed attempt (default 20).
        max_retries: number of retries allowed after the first attempt;
            ``None`` (the default) keeps retrying indefinitely.

    Returns:
        Whatever ``sparql.query().convert()`` returns.

    Raises:
        HTTPError: the most recent error once ``max_retries`` retries have
            been used up.
    """
    # The query text and return format do not change between attempts.
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    retries = 0
    while True:
        try:
            return sparql.query().convert()
        except HTTPError:
            if max_retries is not None and retries >= max_retries:
                raise
            retries += 1
            print("Going to sleep for {} secs".format(retry_delay))
            # Needs the `time` module itself; `from time import gmtime, strftime`
            # alone does not bind the name `time`.
            time.sleep(retry_delay)

In [11]:
# Fetch every pair of concepts linked through gvp:broaderGeneric from the
# Getty vocabulary endpoint.
# NOTE(review): the query declares no PREFIX, so it presumably relies on the
# endpoint predefining `gvp:` -- verify if the endpoint is ever swapped.
sparql = SPARQLWrapper("http://vocab.getty.edu/sparql")
query = """
        SELECT *
        WHERE{
          ?x gvp:broaderGeneric ?y .
        }
            """
qres = query_wrapper(sparql=sparql, query=query)

In [12]:
# Build two adjacency maps from the query bindings:
#   broaders[x]  -> ids bound to ?y for that ?x
#   narrowers[y] -> ids bound to ?x for that ?y
broaders = defaultdict(list)
narrowers = defaultdict(list)
results = qres["results"]["bindings"]
for binding in results:
    # Use only the text after the last '/' of each value as the identifier.
    narrower_id, broader_id = (
        binding[var]["value"].rsplit('/', 1)[-1] for var in ("x", "y")
    )
    broaders[narrower_id].append(broader_id)
    narrowers[broader_id].append(narrower_id)

In [29]:
# A concept is any id that appears on either side of a broader/narrower link.
topics_list = list(set(broaders) | set(narrowers))
# Every concept starts with the value 1.
topics = dict.fromkeys(topics_list, 1)

In [30]:
# Report how many distinct concept ids were collected.
print("The total number of concepts is", len(topics_list))

The total number of concepts is 55273


# Analysing the depth

In [31]:
# The hierarchy contains loops, so every concept is expanded at most once.
def find_depth(concept, unhier):
    """Measure how many levels lie above ``concept`` and record the result.

    A breadth-first walk follows the broader links in ``unhier`` starting
    from ``concept`` (level 1). Each expanded concept that has an entry in
    ``unhier`` raises the depth to its own level plus one.

    Args:
        concept: identifier of the starting concept.
        unhier: mapping from a concept id to the list of its broader ids.
            Membership is tested with ``in`` before indexing, so a
            ``defaultdict`` is not grown by the walk.

    Returns:
        The computed depth; 1 when ``concept`` has no entry in ``unhier``.

    Side effects:
        Stores the depth in the module-level ``concepts`` dict under
        ``concept``; that dict must exist before the call.
    """
    # Mark the start as seen too: otherwise a loop leading back to it would
    # enqueue and expand it a second time and inflate the depth.
    inspected = {concept}
    queue = deque([(concept, 1)])
    max_depth = 1

    while queue:
        topic, depth = queue.popleft()
        if topic not in unhier:
            continue
        next_depth = depth + 1
        max_depth = max(max_depth, next_depth)
        for broader in unhier[topic]:
            if broader not in inspected:
                inspected.add(broader)
                queue.append((broader, next_depth))

    concepts[concept] = max_depth
    return max_depth

In [32]:
# Compute the depth of every concept by walking its broader links.
unhier = broaders
# find_depth() stores each result in the module-level `concepts` dict, so the
# name must be bound (and reset to 1 for every id) before the loop starts.
topics = {k: 1 for k in topics_list}
concepts = topics
# Iterate over the id list rather than the dict that find_depth() writes to;
# the ids are unique, so the list has the same length as `concepts`.
for concept in tqdm(topics_list, total=len(topics_list)):
    find_depth(concept, unhier)

100%|██████████| 55273/55273 [00:00<00:00, 104073.65it/s]


In [24]:
import pandas as pd

# One row per concept id (the index) with its computed depth in one column.
list_of_depths = pd.DataFrame.from_dict(
    data=concepts,
    orient="index",
    columns=["depth"],
)

In [33]:
# Concepts whose depth is 1 are reported as top concepts.
top_concepts = list_of_depths[list_of_depths["depth"] == 1]
print("There are {} top concepts".format(len(top_concepts)))

There are 47 top concepts


In [34]:
print("Sorting by highest depth")
# Deepest concepts first; the name is rebound to the sorted frame.
list_of_depths = list_of_depths.sort_values(by="depth", ascending=False)
list_of_depths.head(50)

Sorting by highest depth


Unnamed: 0,depth
300001352,20
300002156,20
300002142,19
300002528,19
300404751,19
300002155,19
300001355,19
300002533,19
300002150,19
300130953,19
