In [1]:
#! pip install -U -q odfpy SPARQLWrapper bigjson ijson rdflib

In [2]:
import pandas as pd
import numpy as np
import rdflib
import urllib.parse
import time
import pickle
#import cPickle as pickle
import bz2
import json
import ijson
import bigjson #https://github.com/henu/bigjson.git
from SPARQLWrapper import SPARQLWrapper, JSON  #https://rdflib.dev/sparqlwrapper/

from functools import lru_cache as cache

### Reading FoodOn vocabulary

In [3]:
vocabularies = pd.read_excel("../data/vocabularies/FoodOn_concepts.ods", engine="odf")
vocabularies.rename(columns={'Food Concept':'FoodOn'}, inplace=True)
vocabularies['FoodOn'] = vocabularies['FoodOn'].apply(str.title)
vocabularies.sort_values('FoodOn', inplace=True)
vocabularies.reset_index(inplace=True, drop=True)
vocabularies.head(10)

Unnamed: 0,FoodOn
0,Agar
1,Alcohol
2,Ale
3,Alligator
4,Almond
5,Almond Butter
6,Almond Paste
7,Aloe Vera
8,Aluminum
9,Amaranth


### Building a function to query SPARQL endpoints

In [4]:
def query_sparql_endpoint(endpoint, query, term):
    term = urllib.parse.quote(term)
    sparql = SPARQLWrapper(endpoint)
    sparql.setQuery(query%term)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    time.sleep(1)
    return(results["results"]["bindings"])

### Querying Getty [AAT Vocabulary](http://www.getty.edu/research/tools/vocabularies/aat/)  

[downloads](https://www.getty.edu/research/tools/vocabularies/obtain/download.html)

In [5]:
vocabularies['Getty-AAT'] = np.empty((len(vocabularies), 0)).tolist()
endpoint = "http://vocab.getty.edu/sparql"  #http://vocab.getty.edu/queries#_Toc485115879

query = '''
    SELECT ?Subject ?Term ?Parents ?Descr ?ScopeNote ?Type (coalesce(?Type1,?Type2) as ?ExtraType) {
  ?Subject luc:term "%s"; a ?typ.
  ?typ rdfs:subClassOf gvp:Subject; rdfs:label ?Type.
  FILTER (?typ != gvp:Subject)
  OPTIONAL {?Subject gvp:placeTypePreferred [gvp:prefLabelGVP [xl:literalForm ?Type1]]}
  OPTIONAL {?Subject gvp:agentTypePreferred [gvp:prefLabelGVP [xl:literalForm ?Type2]]}
  OPTIONAL {?Subject gvp:prefLabelGVP [xl:literalForm ?Term]}
  OPTIONAL {?Subject gvp:parentStringAbbrev ?Parents}
  OPTIONAL {?Subject foaf:focus/gvp:biographyPreferred/schema:description ?Descr}
  OPTIONAL {?Subject skos:scopeNote [dct:language gvp_lang:en; rdf:value ?ScopeNote]}}
'''

#for idx in vocabularies.index:
#    term = vocabularies.loc[idx,'FoodOn']
#    results = query_sparql_endpoint(endpoint, query, term)
#    for result in results:
#        vocabularies.loc[idx, 'Getty-AAT'].append({result["Term"]["value"]:result['Subject']['value']})

#vocabularies.to_pickle('./temp.pkl')

In [6]:
vocabularies = pd.read_pickle("./temp.pkl")
vocabularies.reset_index(inplace=True, drop=True)

In [7]:
vocabularies.head()

Unnamed: 0,FoodOn,Getty-AAT
0,Agar,"[{'Agar, Syrt': 'http://vocab.getty.edu/tgn/76..."
1,Alcohol,[{'alcohol (general)': 'http://vocab.getty.edu...
2,Ale,[{'Ale Water': 'http://vocab.getty.edu/tgn/746...
3,Alligator,"[{'Alligator, The': 'http://vocab.getty.edu/tg..."
4,Almond,[{'Almond': 'http://vocab.getty.edu/tgn/112112...


### Querying [Iconclass](http://www.iconclass.nl/home)  
#### Using [iconclass database](http://iconclass.org/data/iconclass_20200529_skos_jsonld.ndjson.gz). See Also [LOD](http://www.iconclass.org/help/lod) and [ML Experiment](https://labs.brill.com/ictestset/)  

In [8]:
@cache(maxsize=None)
def search_iconclas(term):
    results = []
    with open('../data/vocabularies/iconclass_20200710_skos_jsonld.ndjson', 'rb') as f:
        for line in f:
            element = json.loads(line)
            try:
                for item in element['skos:prefLabel']:
                    if item['@language'] == 'en':
                        if term.lower() in item['@value'].lower():
                            results.append({item['@value']:element['@id']})
            except:
                #print("-", end='')
                pass;
    return(results)

In [9]:
#t0 = time.time()

#vocabularies['Iconclass'] = np.empty((len(vocabularies), 0)).tolist()

#for idx in vocabularies.index:
#    term = vocabularies.loc[idx,'FoodOn']
#    result = search_iconclas(term)
#    vocabularies.loc[idx, 'Iconclass'].append(result)
#    print(f"{idx}+", end='');
            
#print(time.time() - t0)
#vocabularies.to_pickle('./temp2.pkl')

In [11]:
vocabularies = pd.read_pickle("./temp2.pkl")
vocabularies.head()

#### Alternatively querying SPARQL endpoint

### Querying [DBPedia](http://dbpedia.org/sparql)

In [13]:
vocabularies['DBPedia'] = np.empty((len(vocabularies), 0)).tolist()
endpoint = 'http://dbpedia.org/sparql'
query = '''
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT ?label
    WHERE { <http://dbpedia.org/resource/%s> rdfs:label ?label }
'''

for idx in vocabularies.index:
    term = vocabularies.loc[idx,'FoodOn']
    results = query_sparql_endpoint(endpoint, query, term)
    for result in results:
        if result["label"]["xml:lang"] == 'en':
            vocabularies.loc[idx, 'DBPedia'].append(result["label"]["value"])

In [14]:
with bz2.BZ2File('../data/vocabularies/vocabularies.pbz2', 'w') as f:
    pickle.dump(vocabularies, f)
    
#vocabularies.to_excel("../data/vocabularies/vocabularies.xlsx")

In [16]:
#with bz2.BZ2File('../data/vocabularies/vocabularies.pbz2', 'r') as f:
#    vocabularies = pickle.load(f)

In [40]:
vocabularies.Iconclass = vocabularies.Iconclass.apply(lambda x:x[0])
vocabularies.head(20)

Unnamed: 0,FoodOn,Getty-AAT,Iconclass,DBPedia
0,Agar,"[{'Agar, Syrt': 'http://vocab.getty.edu/tgn/76...",[{'Hagar (not in biblical context)': 'http://i...,[Agar]
1,Alcohol,[{'alcohol (general)': 'http://vocab.getty.edu...,[{'alcoholism': 'http://iconclass.org/31B531'}...,[Alcohol]
2,Ale,[{'Ale Water': 'http://vocab.getty.edu/tgn/746...,[{'ex-voto of the woman healed from the issue ...,[Ale]
3,Alligator,"[{'Alligator, The': 'http://vocab.getty.edu/tg...",[{'crocodiles: alligator': 'http://iconclass.o...,[Alligator]
4,Almond,[{'Almond': 'http://vocab.getty.edu/tgn/112112...,[{'oval or almond-shaped mandorla': 'http://ic...,[Almond]
5,Almond Butter,[{'Almond': 'http://vocab.getty.edu/tgn/112112...,[],[]
6,Almond Paste,[{'Almond': 'http://vocab.getty.edu/tgn/112112...,[],[]
7,Aloe Vera,[{'Aloe (genus)': 'http://vocab.getty.edu/aat/...,[],[]
8,Aluminum,[{'Aluminum Pond': 'http://vocab.getty.edu/tgn...,[],[Aluminum]
9,Amaranth,[{'Amaranth Island': 'http://vocab.getty.edu/t...,[{'plants and herbs: amaranth': 'http://iconcl...,[Amaranth]


#### Finding the exact Matches

In [54]:
vocabularies['Getty_exact'] = ""
vocabularies['Iclass_exact'] = ""
for idx in vocabularies.index:
    field1 = []
    for item in vocabularies.loc[idx, 'Getty-AAT']:
        if list(item.items())[0][0] == vocabularies.loc[idx, 'FoodOn']:
            field1.append(item)
    if field1:
        vocabularies.at[idx, 'Getty_exact'] = field1
    else:
        vocabularies.at[idx, 'Getty_exact'] = np.nan
    field2 = []        
    for item in vocabularies.loc[idx, 'Iconclass']:
        if list(item.items())[0][0] == vocabularies.loc[idx, 'FoodOn']:
            field2.append(item)
    if field2:
        vocabularies.at[idx, 'Iclass_exact'] = field2
    else:
        vocabularies.at[idx, 'Iclass_exact'] = np.nan

In [55]:
vocabularies.head()

Unnamed: 0,FoodOn,Getty-AAT,Iconclass,DBPedia,Getty_exact,Iclass_exact
0,Agar,"[{'Agar, Syrt': 'http://vocab.getty.edu/tgn/76...",[{'Hagar (not in biblical context)': 'http://i...,[Agar],[{'Agar': 'http://vocab.getty.edu/tgn/2096996'...,
1,Alcohol,[{'alcohol (general)': 'http://vocab.getty.edu...,[{'alcoholism': 'http://iconclass.org/31B531'}...,[Alcohol],,
2,Ale,[{'Ale Water': 'http://vocab.getty.edu/tgn/746...,[{'ex-voto of the woman healed from the issue ...,[Ale],[{'Ale': 'http://vocab.getty.edu/tgn/2128755'}...,
3,Alligator,"[{'Alligator, The': 'http://vocab.getty.edu/tg...",[{'crocodiles: alligator': 'http://iconclass.o...,[Alligator],[{'Alligator': 'http://vocab.getty.edu/tgn/205...,
4,Almond,[{'Almond': 'http://vocab.getty.edu/tgn/112112...,[{'oval or almond-shaped mandorla': 'http://ic...,[Almond],[{'Almond': 'http://vocab.getty.edu/tgn/112112...,


#### Percent of exact matches

In [56]:
p1 = len(vocabularies.dropna(axis=0, subset=["Getty_exact"]))/len(vocabularies) * 100
print(f'AAT: {p1:.2f}')
p2 = len(vocabularies.dropna(axis=0, subset=["Iclass_exact"]))/len(vocabularies) * 100
print(f'Iconclass: {p2:.2f}')

AAT: 20.58
Iconclass: 0.09


In [60]:
vocabularies[['FoodOn', 'DBPedia',
              'Getty-AAT', 'Getty_exact',
              'Iconclass', 'Iclass_exact']].to_excel("../data/vocabularies/vocabularies.xlsx")

## TBD
### Querying Europeana

https://pro.europeana.eu/page/search  
https://pro.europeana.eu/page/sparql  
http://matthewlincoln.net/2014/07/10/sparql-for-humanists.html  