# Code to explore the Art & Architecture Thesaurus

This code relies on Getty's SPARQL Endpoint [http://vocab.getty.edu/sparql](http://vocab.getty.edu/sparql)

In [9]:
# Requirements: third-party packages used by this notebook.
# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): pandas and numpy are imported further down but are not
# installed here - presumably they are expected to be present already; confirm.
%pip install rdflib
%pip install SPARQLWrapper
%pip install tqdm

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [1]:
import json
import time
from collections import defaultdict
from collections import deque
from time import gmtime, strftime
from urllib.error import HTTPError

import numpy as np
import pandas as pd
import rdflib
from rdflib import Graph
from rdflib import URIRef
from rdflib.namespace import RDFS
from SPARQLWrapper import SPARQLWrapper, JSON
from tqdm import tqdm

# Set Up

In [2]:
# Single client object, reused by every query in this notebook.
sparql = SPARQLWrapper(endpoint="http://vocab.getty.edu/sparql")

# Functionalities

In [3]:
def query_wrapper(sparql, query, retry_delay=20, max_retries=None):
    """Run ``query`` through ``sparql`` and return the JSON-converted result.

    The request is repeated whenever it fails with ``HTTPError``, pausing
    ``retry_delay`` seconds between attempts.

    Args:
        sparql: client object exposing ``setQuery``, ``setReturnFormat`` and
            ``query().convert()`` (a ``SPARQLWrapper`` instance in this notebook).
        query: query text to send.
        retry_delay: seconds to sleep after a failed attempt (default 20).
        max_retries: maximum number of retries after the first attempt;
            ``None`` (the default) retries indefinitely.

    Returns:
        Whatever ``sparql.query().convert()`` returns for the JSON format.

    Raises:
        HTTPError: only when ``max_retries`` is set and has been exhausted.
        Any other exception raised by the client propagates unchanged.
    """
    # The query text and return format do not change between attempts.
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    failures = 0
    while True:
        try:
            return sparql.query().convert()
        except HTTPError:
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            print(f"Going to sleep for {retry_delay} secs")
            # Needs the `time` module itself, not just names imported from it.
            time.sleep(retry_delay)

# Number of concepts

In [4]:
# Count the gvp:Subject resources in the AAT scheme that carry a GVP preferred label.
query_concepts = """
SELECT (count(?s) as ?ct)
WHERE
{ ?s rdf:type gvp:Subject ; skos:inScheme aat: ; gvp:prefLabelGVP/xl:literalForm ?l}
"""
qres = query_wrapper(sparql, query_concepts)
# The aggregate comes back as a single binding row holding ?ct.
print(
    "Total concepts in the scheme {}".format(
        qres["results"]["bindings"][0]["ct"]["value"]
    )
)

Total concepts in the scheme 58625


# Getting hierarchy

In [5]:
# Fetch every (?x, ?y) pair where ?x skos:broader ?y and both ends belong to
# the AAT scheme; the bindings are turned into lookup tables in the next cell.
query = """SELECT *
WHERE{
  ?x skos:broader ?y .
  ?x skos:inScheme aat: .
  ?y skos:inScheme aat: .
}
"""
qres = query_wrapper(sparql, query)

In [6]:
# Index the edges by bare concept id, i.e. the text after the last '/' of each URI.
broaders = defaultdict(list)   # concept id -> ids of its broader concepts
narrowers = defaultdict(list)  # concept id -> ids of its narrower concepts
results = qres["results"]["bindings"]
for result in results:
    x = result["x"]["value"].rpartition('/')[2]
    y = result["y"]["value"].rpartition('/')[2]
    narrowers[y].append(x)
    broaders[x].append(y)
print(f"Len of broaders: {len(broaders)}")
print(f"Len of narrowers: {len(narrowers)}")

Len of broaders: 55407
Len of narrowers: 10005


In [7]:
# Spot check: ids of the broader concepts recorded for concept 300043202.
# `broaders` is a defaultdict, so indexing an id that is absent would insert
# an empty entry for it.
broaders["300043202"]

['300043196', '300194567']

In [8]:
#narrowers["300194567"]

# Extracting the tree and finding depth

In [9]:
def find_depth(concept, unhier):
    """Compute how many levels lie above ``concept`` in ``unhier``.

    Walks breadth-first from ``concept`` along the broader links in
    ``unhier``. The start concept is level 1 and every broader link followed
    adds one level. The depth is the highest level reached: 1 when
    ``concept`` has no entry in ``unhier``, otherwise one more than the
    level of the deepest dequeued concept that has an entry.

    Each broader concept is queued at most once, at the level where it is
    first met, so the walk terminates even if the links form a cycle.
    NOTE(review): because of that, in a poly-hierarchy the value is not
    guaranteed to equal the longest chain of broader links - confirm this is
    the intended notion of depth.

    Args:
        concept: id of the concept to start from.
        unhier: mapping from a concept id to a list of its broader concept
            ids. Only membership tests and lookups of existing keys are
            performed, so a ``defaultdict`` is not grown.

    Returns:
        The depth as an int. Earlier the function returned ``None``.

    Side effects:
        Also stores the depth in the module-level ``concepts`` dict under
        ``concept``; existing callers rely on this, so ``concepts`` must be
        defined before the function is called.
    """
    seen = set()
    pending = deque([(concept, 1)])
    max_depth = 1
    while pending:
        node, level = pending.popleft()
        if node not in unhier:
            # Nothing broader than this node: it ends this branch.
            continue
        next_level = level + 1
        max_depth = max(max_depth, next_level)
        for broader in unhier[node]:
            if broader not in seen:
                seen.add(broader)
                pending.append((broader, next_level))

    concepts[concept] = max_depth
    return max_depth

In [11]:
# Every concept id that appears on either side of a broader/narrower edge.
topics_list = list(set(broaders.keys()).union(set(narrowers.keys())))
unhier = broaders
# `topics` and `concepts` name the same dict: every id starts at depth 1 and
# find_depth records the computed depth under the id it is called with.
topics = {k:1 for k in topics_list}
concepts = topics
# Iterate over a snapshot of the ids, not over the dict that find_depth is
# writing into while the loop runs; tqdm takes the total from the list length.
for concept in tqdm(list(concepts)):
    find_depth(concept, unhier)

100%|██████████| 55652/55652 [00:00<00:00, 142698.27it/s]


In [12]:
# Spot check: depth stored for concept 300265311.
concepts["300265311"]

10

In [13]:
# One row per concept id (used as the index) with its depth in the 'depth' column.
list_of_depths = pd.DataFrame.from_dict(
    data=concepts,
    orient="index",
    columns=["depth"],
)

In [14]:
# Deepest concepts first; the sorted frame is bound back to the same name.
list_of_depths = list_of_depths.sort_values(by='depth', ascending=False)
list_of_depths.head()

Unnamed: 0,depth
300417319,13
300183986,13
300375655,13
300446230,13
300387259,13


# Languages

In [15]:
# Count the xl:prefLabel / xl:altLabel labels of AAT concepts per language.
# Groups are keyed by the language's GVP preferred label text (?lang) and
# ordered by descending count.
query_languages = """
select (count(*) as ?c) ?lang {
  ?concept skos:inScheme aat:; xl:prefLabel|xl:altLabel ?lab.
  ?lab dct:language ?lng.
  ?lng gvp:prefLabelGVP/xl:literalForm ?lang
} group by ?lang order by desc(?c)
"""
qres = query_wrapper(sparql, query_languages)

In [16]:
# One "<label count> -> <language name>" line per binding, in the order returned.
results = qres["results"]["bindings"]
print("Number of Languages available: {}".format(len(results)))
print("=================")
for result in results:
    print("{} -> {}".format(result['c']['value'], result['lang']['value']))

Number of Languages available: 167
174961 -> English (language)
74814 -> Dutch (language)
56014 -> Spanish (language)
47313 -> Chinese (traditional) (language)
27725 -> German (language)
25030 -> Chinese (transliterated Wade-Giles) (language)
25007 -> Chinese (transliterated Hanyu Pinyin) (language)
24964 -> Chinese (transliterated Pinyin without tones) (language)
22348 -> French (language)
5239 -> American English (language)
3934 -> Portuguese (language)
3806 -> Italian (language)
2969 -> British English (language)
2403 -> Latin (language)
2121 -> Hebrew (language)
1447 -> Classical Nahuatl (language)
1365 -> Sanskrit (transliterated) (language)
919 -> Serbian (language)
855 -> Greek (modern language)
771 -> Arabic (language)
763 -> Eastern Huasteca Nahuatl
569 -> Swedish (language)
470 -> Norwegian, New (Nynorsk) (language)
462 -> Norwegian (Bokmål) (written standard)
319 -> African language (language)
214 -> undetermined (language)
167 -> Japanese (transliterated) (language)
119 -> 

# Other queries

```sparql
SELECT (count(?s) as ?ct)
WHERE
{ ?s rdf:type gvp:Concept}


SELECT *
WHERE {
  ?x gvp:broaderGeneric ?y }
LIMIT 100


SELECT ?x (COUNT(?y) as ?ct)
WHERE {
  ?x gvp:broaderGeneric ?y }
GROUP BY ?x 
LIMIT 100

SELECT ?x (COUNT(?y) as ?ct) 
WHERE{
  ?x gvp:broaderGeneric ?y .
}
GROUP BY ?x


SELECT (count(*) as ?ct) 
WHERE{
  ?x gvp:broaderExtended ?y
  }


SELECT (COUNT(*) as ?ct) 
WHERE{
  ?x gvp:broaderGeneric ?y .
}

SELECT ?s
WHERE{
  ?s <http://vocab.getty.edu/ontology#parentString> "Top of the AAT hierarchies" .
  }
```