In [3]:
import pandas as pd
import numpy as np
import rdflib
import urllib.parse
import time
import pickle
#import cPickle as pickle
import bz2
import json
import ijson
import bigjson #https://github.com/henu/bigjson.git
from SPARQLWrapper import SPARQLWrapper, JSON  #https://rdflib.dev/sparqlwrapper/

### Reading FoodOn vocabulary

In [4]:
# Load the FoodOn concept list from the OpenDocument spreadsheet ("odf" engine),
# rename the concept column to 'FoodOn', title-case it and sort by it.
# The steps are chained instead of using inplace=True, so re-running this cell
# always rebuilds `vocabularies` from the file.
vocabularies = (
    pd.read_excel("../data/vocabularies/FoodOn_concepts.ods", engine="odf")
    .rename(columns={'Food Concept': 'FoodOn'})
    # .str.title() leaves missing cells as NaN, whereas apply(str.title)
    # raises TypeError on any value that is not a str.
    .assign(FoodOn=lambda frame: frame['FoodOn'].str.title())
    .sort_values('FoodOn')
)
vocabularies.head(10)

Unnamed: 0,FoodOn
861,Agar
799,Alcohol
730,Ale
586,Alligator
43,Almond
260,Almond Butter
307,Almond Paste
276,Aloe Vera
163,Aluminum
693,Amaranth


### Building a function to query SPARQL endpoints

In [5]:
def query_sparql_endpoint(endpoint, query, term, delay=1):
    """Run a SPARQL query for one search term and return its result rows.

    Parameters
    ----------
    endpoint : str
        URL of the SPARQL endpoint.
    query : str
        Query template containing a single ``%s`` placeholder for the term.
    term : str
        Search term. It is percent-encoded before being substituted into the
        template, so quotes and spaces never appear raw inside the query.
    delay : float, optional
        Seconds to pause after the request has completed. Defaults to 1,
        the pause that used to be hard-coded.

    Returns
    -------
    list
        The ``["results"]["bindings"]`` list of the JSON response.
    """
    # NOTE(review): multi-word terms are sent as e.g. "Almond%20Butter";
    # confirm the endpoint's text index matches that as intended.
    encoded_term = urllib.parse.quote(term)
    sparql = SPARQLWrapper(endpoint)
    sparql.setQuery(query % encoded_term)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    # Pause between consecutive calls; a non-positive delay skips the pause.
    if delay > 0:
        time.sleep(delay)
    return results["results"]["bindings"]

### Querying Getty [AAT Vocabulary](http://www.getty.edu/research/tools/vocabularies/aat/)  

[downloads](https://www.getty.edu/research/tools/vocabularies/obtain/download.html)

In [20]:
# Every row starts with its own empty list that collects the Getty matches.
vocabularies['Getty-AAT'] = np.empty((len(vocabularies), 0)).tolist()
endpoint = "http://vocab.getty.edu/sparql"  #http://vocab.getty.edu/queries#_Toc485115879

# Full-text lookup of one term (substituted for %s) with optional label,
# parent string, description and English scope note.
# NOTE(review): the query has no vocabulary filter, so subjects from Getty
# vocabularies other than AAT (e.g. tgn URIs) can be returned as well.
query = '''
    SELECT ?Subject ?Term ?Parents ?Descr ?ScopeNote ?Type (coalesce(?Type1,?Type2) as ?ExtraType) {
  ?Subject luc:term "%s"; a ?typ.
  ?typ rdfs:subClassOf gvp:Subject; rdfs:label ?Type.
  FILTER (?typ != gvp:Subject)
  OPTIONAL {?Subject gvp:placeTypePreferred [gvp:prefLabelGVP [xl:literalForm ?Type1]]}
  OPTIONAL {?Subject gvp:agentTypePreferred [gvp:prefLabelGVP [xl:literalForm ?Type2]]}
  OPTIONAL {?Subject gvp:prefLabelGVP [xl:literalForm ?Term]}
  OPTIONAL {?Subject gvp:parentStringAbbrev ?Parents}
  OPTIONAL {?Subject foaf:focus/gvp:biographyPreferred/schema:description ?Descr}
  OPTIONAL {?Subject skos:scopeNote [dct:language gvp_lang:en; rdf:value ?ScopeNote]}}
'''

for idx in vocabularies.index:
    term = vocabularies.loc[idx,'FoodOn']
    results = query_sparql_endpoint(endpoint, query, term)
    # The cell holds a list object, so appending to it updates the frame in place.
    matches = vocabularies.loc[idx, 'Getty-AAT']
    for result in results:
        subject_uri = result['Subject']['value']
        # ?Term is bound inside an OPTIONAL clause, so a row may come back
        # without it; fall back to the subject URI instead of raising
        # KeyError in the middle of this long-running loop.
        label = result.get('Term', {}).get('value', subject_uri)
        matches.append({label: subject_uri})

In [22]:
# Checkpoint for `vocabularies`: the save call is left commented out, so this
# cell only reloads the frame from ./temp.pkl and replaces the in-memory one.
# NOTE(review): unpickling can execute code stored in the file; only read
# pickles you created yourself.
#vocabularies.to_pickle('./temp.pkl')
vocabularies = pd.read_pickle("./temp.pkl")

In [7]:
# Preview the first rows of the reloaded frame.
vocabularies.head()

Unnamed: 0,FoodOn,Getty-AAT
861,Agar,"[{'Agar, Syrt': 'http://vocab.getty.edu/tgn/76..."
799,Alcohol,[{'alcohol (general)': 'http://vocab.getty.edu...
730,Ale,[{'Ale Water': 'http://vocab.getty.edu/tgn/746...
586,Alligator,"[{'Alligator, The': 'http://vocab.getty.edu/tg..."
43,Almond,[{'Almond': 'http://vocab.getty.edu/tgn/112112...


### Querying [Iconclass](http://www.iconclass.nl/home)  
#### Using the [Iconclass database](http://iconclass.org/data/iconclass_20200529_skos_jsonld.ndjson.gz). See also [LOD](http://www.iconclass.org/help/lod) and [ML Experiment](https://labs.brill.com/ictestset/)  

In [19]:
def search_iconclas(term, path='../data/vocabularies/iconclass_20200710_skos_jsonld.ndjson'):
    """Find Iconclass records whose English preferred label contains ``term``.

    The file at ``path`` is read as newline-delimited JSON, one record per
    line. The match is a case-insensitive substring test against every
    ``skos:prefLabel`` entry whose ``@language`` is ``'en'``.

    Parameters
    ----------
    term : str
        Text to look for.
    path : str, optional
        Location of the NDJSON dump. Defaults to the path that used to be
        hard-coded in this function.

    Returns
    -------
    list of dict
        One ``{label: record '@id'}`` dict per matching label, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    needle = term.lower()  # hoisted: the term is the same for every line
    results = []
    with open(path, 'rb') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            element = json.loads(line)
            if not isinstance(element, dict) or '@id' not in element:
                continue
            labels = element.get('skos:prefLabel')
            # A single label may be stored as a bare object rather than a list.
            if isinstance(labels, dict):
                labels = [labels]
            elif not isinstance(labels, list):
                continue
            for item in labels:
                # Skip malformed or non-English labels one at a time, so one
                # odd entry no longer hides the remaining labels of the record.
                if not isinstance(item, dict) or item.get('@language') != 'en':
                    continue
                value = item.get('@value')
                if isinstance(value, str) and needle in value.lower():
                    results.append({value: element['@id']})
    return results

In [None]:
# Time the whole pass: search_iconclas scans the Iconclass dump once per term.
t0 = time.time()

# Every row starts with its own empty list that collects the Iconclass matches.
vocabularies['Iconclass'] = np.empty((len(vocabularies), 0)).tolist()

for idx in vocabularies.index:
    term = vocabularies.loc[idx,'FoodOn']
    result = search_iconclas(term)
    # extend, not append: `result` is already a list of {label: id} dicts, and
    # appending it would nest the whole list as a single element of the cell.
    vocabularies.loc[idx, 'Iconclass'].extend(result)
    print("+", end='')  # one progress mark per processed term

print(time.time() - t0)

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

In [None]:
# Checkpoint the frame to ./temp.pkl; the matching reload is left commented out.
vocabularies.to_pickle('./temp.pkl')
#vocabularies = pd.read_pickle("./temp.pkl")

#### Alternatively querying SPARQL endpoint

### Querying [DBPedia](http://dbpedia.org/sparql)

In [48]:
# Replace `vocabularies` with a frame unpickled from a bz2-compressed file.
# NOTE(review): no visible cell writes ../data/vocabularies.pbz2, and the
# frame it yields uses the column name 'Getty-ATT' rather than 'Getty-AAT';
# confirm where this file comes from.
# pickle.load can execute code stored in the file; only open trusted files.
with bz2.BZ2File('../data/vocabularies.pbz2', 'r') as f:
    vocabularies = pickle.load(f)

In [49]:
# Preview up to the first 20 rows of the loaded frame.
vocabularies.head(20)

Unnamed: 0,FoodOn,Getty-ATT,Iconclass,DBPedia
0,Worcestershire Sauce,"[Worcestershire Beacon, Worcestershire, Willia...",[],[]
1,Cream Cheese,"[Cream Ridge, Cream Run, Cream, Cream Ridge, C...",[],[]
2,Mixture,"[Mixture Bayou, Mixture Lake, color mixture, s...",[Jewish dietary law ~ prohibited mixtures (Exo...,[Mixture]
3,Walnuts,"[The Walnuts, walnuts (nuts), Mnemiopsis (genus)]",[],[Walnuts]
4,Onion,"[Onion Creek, Onion, Bayou, Onion Creek, Onion...",[(non-fruit) products of plants or trees: onio...,[Onion]
5,Firm,"[Aponem (Firm), etekt (Firm), Court'art (Firm)...",[male saints (with NAME) - male saint ~ confir...,[Firm]
6,Mozzarella Cheese,[],[],[]
7,Mayonnaise,"[mayonnaise ladle, mayonnaise set, mayonnaise ...",[],[Mayonnaise]
8,Artichoke,"[Artichoke River, Artichoke, Artichoke Creek, ...","[plants and herbs: artichoke, plants and herbs...",[Artichoke]
9,Tortilla,"[Tortilla Mountain, Tortilla Creek, La Tortill...",[],[Tortilla]


#### Finding the exact matches

In [50]:
# For each vocabulary, keep the FoodOn term when the row's candidate list
# contains an identical label, otherwise NaN.
# Built as whole columns rather than filled cell by cell:
#   - rows are paired by position, so it no longer relies on the index
#     labels being 0..n-1 (enumerate() positions were used as .loc labels);
#   - the columns get object dtype, so no string is written into a float column;
#   - np.nan replaces the np.NaN alias, which NumPy 2.0 removed.
# NOTE(review): the comparison is case-sensitive while 'FoodOn' is title-cased,
# so lower-case labels never count as exact matches; confirm this is intended.
vocabularies['Getty_exact'] = [
    term if term in labels else np.nan
    for term, labels in zip(vocabularies['FoodOn'], vocabularies['Getty-ATT'])
]
vocabularies['Iclass_exact'] = [
    term if term in labels else np.nan
    for term, labels in zip(vocabularies['FoodOn'], vocabularies['Iconclass'])
]

In [51]:
# Show only the rows that have a value in 'Getty_exact' (display only; the frame is not modified).
vocabularies.dropna(axis=0, subset=['Getty_exact'])

Unnamed: 0,FoodOn,Getty-ATT,Iconclass,DBPedia,Getty_exact,Iclass_exact
4,Onion,"[Onion Creek, Onion, Bayou, Onion Creek, Onion...",[(non-fruit) products of plants or trees: onio...,[Onion],Onion,
8,Artichoke,"[Artichoke River, Artichoke, Artichoke Creek, ...","[plants and herbs: artichoke, plants and herbs...",[Artichoke],Artichoke,
9,Tortilla,"[Tortilla Mountain, Tortilla Creek, La Tortill...",[],[Tortilla],Tortilla,
12,Water,"[Minnoch, Water of, water damage, sig water, A...",[the mocking of St. Alexis: a bucket of water ...,[Water],Water,
19,Honey,"[Honey Creek, Honey, Mount, Honey, Michael, Be...",[representations of the parable of the frailty...,[Honey],Honey,
...,...,...,...,...,...,...
1018,Cork,"[Cork, The, Cork, Lough of, Cork Harbour, Cork...",[],[Cork],Cork,
1042,Verbena,"[Verbena, Cerro, Verbena, Verbena, La Verbena,...",[],[Verbena],Verbena,
1043,Barbados,"[Barbados Museum, Barbados Valley, Barbados, B...",[],[Barbados],Barbados,
1053,Pail,"[Chawa Pail, Bardi Pail, Pail, Pail Minor, Chi...","[pails (fire-extinguishing equipment), pails (...",[Pail],Pail,


In [52]:
# Show only the rows that have a value in 'Iclass_exact' (display only; the frame is not modified).
vocabularies.dropna(axis=0, subset=['Iclass_exact'])

Unnamed: 0,FoodOn,Getty-ATT,Iconclass,DBPedia,Getty_exact,Iclass_exact
345,Chinese,"[Chinese festivals, Chinese yellow, unknown Ch...","[traditional Chinese religions, conceptions an...",[Chinese],,Chinese


#### Percent of exact matches

In [53]:
# Percentage of rows with an exact match, per vocabulary.
# Series.count() tallies the non-missing entries of a column, which is the
# number of rows that survive a dropna on that column.
p1 = int(vocabularies["Getty_exact"].count()) / len(vocabularies) * 100
print(f'AAT: {p1:.2f}')
p2 = int(vocabularies["Iclass_exact"].count()) / len(vocabularies) * 100
print(f'Iconclass: {p2:.2f}')

AAT: 20.58
Iconclass: 0.09


In [54]:
# Export the current `vocabularies` frame to an Excel workbook.
vocabularies.to_excel("../data/vocabularies.xlsx")

## TBD
### Querying Europeana

https://pro.europeana.eu/page/search  
https://pro.europeana.eu/page/sparql  
http://matthewlincoln.net/2014/07/10/sparql-for-humanists.html  