In [1]:
#! pip install -U -q odfpy SPARQLWrapper bigjson ijson rdflib

In [2]:
import pandas as pd
import numpy as np
import rdflib
import urllib.parse
import time
import pickle
#import cPickle as pickle
import bz2
import json
import ijson
import bigjson #https://github.com/henu/bigjson.git
from SPARQLWrapper import SPARQLWrapper, JSON  #https://rdflib.dev/sparqlwrapper/

from functools import lru_cache as cache

### Reading FoodOn vocabulary

In [3]:
# Load the FoodOn concept sheet, expose the concept column as 'FoodOn',
# title-case every concept, then order the rows alphabetically with a fresh
# 0..n-1 index.
vocabularies = pd.read_excel("../data/vocabularies/FoodOn_concepts.ods", engine="odf")
vocabularies = vocabularies.rename(columns={'Food Concept': 'FoodOn'})
vocabularies['FoodOn'] = vocabularies['FoodOn'].apply(str.title)
vocabularies = vocabularies.sort_values('FoodOn').reset_index(drop=True)

# Preview the first ten concepts.
vocabularies.head(10)

Unnamed: 0,FoodOn
0,Agar
1,Alcohol
2,Ale
3,Alligator
4,Almond
5,Almond Butter
6,Almond Paste
7,Aloe Vera
8,Aluminum
9,Amaranth


### Building a function to query SPARQL endpoints

In [4]:
def query_sparql_endpoint(endpoint, query, term):
    """Run a templated SPARQL query for one term and return its result bindings.

    Parameters
    ----------
    endpoint : str
        URL of the SPARQL endpoint handed to ``SPARQLWrapper``.
    query : str
        SPARQL text that is %-formatted with the encoded term (so it is
        expected to contain a single ``%s`` placeholder).
    term : str
        Search term; it is percent-encoded with ``urllib.parse.quote`` before
        being substituted into ``query``.

    Returns
    -------
    The ``["results"]["bindings"]`` member of the converted JSON response.
    """
    # NOTE(review): the term is URL-encoded even though it lands inside the
    # query text (e.g. spaces become %20) - confirm the endpoint expects that.
    encoded_term = urllib.parse.quote(term)

    client = SPARQLWrapper(endpoint)
    client.setQuery(query % encoded_term)
    client.setReturnFormat(JSON)
    response = client.query().convert()

    # Pause for one second after every request before handing back the rows.
    time.sleep(1)
    return response["results"]["bindings"]

### Querying Getty [AAT Vocabulary](http://www.getty.edu/research/tools/vocabularies/aat/)  

[downloads](https://www.getty.edu/research/tools/vocabularies/obtain/download.html)

In [5]:
# Give every concept its own empty list; the (currently disabled) loop below
# appends one {term: subject URI} dict per SPARQL hit to it.
vocabularies['Getty-AAT'] = [[] for _ in range(len(vocabularies))]
endpoint = "http://vocab.getty.edu/sparql"  # sample queries: http://vocab.getty.edu/queries#_Toc485115879

# Query template; the "%s" placeholder receives the search term.
query = '''
    SELECT ?Subject ?Term ?Parents ?Descr ?ScopeNote ?Type (coalesce(?Type1,?Type2) as ?ExtraType) {
  ?Subject luc:term "%s"; a ?typ.
  ?typ rdfs:subClassOf gvp:Subject; rdfs:label ?Type.
  FILTER (?typ != gvp:Subject)
  OPTIONAL {?Subject gvp:placeTypePreferred [gvp:prefLabelGVP [xl:literalForm ?Type1]]}
  OPTIONAL {?Subject gvp:agentTypePreferred [gvp:prefLabelGVP [xl:literalForm ?Type2]]}
  OPTIONAL {?Subject gvp:prefLabelGVP [xl:literalForm ?Term]}
  OPTIONAL {?Subject gvp:parentStringAbbrev ?Parents}
  OPTIONAL {?Subject foaf:focus/gvp:biographyPreferred/schema:description ?Descr}
  OPTIONAL {?Subject skos:scopeNote [dct:language gvp_lang:en; rdf:value ?ScopeNote]}}
'''

# The endpoint pass is disabled; the next cell reads its saved result from
# ./temp.pkl instead. Un-comment to query the endpoint again.
#for idx in vocabularies.index:
#    term = vocabularies.loc[idx,'FoodOn']
#    results = query_sparql_endpoint(endpoint, query, term)
#    for result in results:
#        vocabularies.loc[idx, 'Getty-AAT'].append({result["Term"]["value"]:result['Subject']['value']})

#vocabularies.to_pickle('./temp.pkl')

In [6]:
# Reload the frame saved in ./temp.pkl and renumber its rows from 0.
# NOTE(review): unpickling can execute arbitrary code - only load files you created.
vocabularies = pd.read_pickle("./temp.pkl").reset_index(drop=True)

In [7]:
# Show the first five rows of the reloaded frame.
vocabularies.head(n=5)

Unnamed: 0,FoodOn,Getty-AAT
0,Agar,"[{'Agar, Syrt': 'http://vocab.getty.edu/tgn/76..."
1,Alcohol,[{'alcohol (general)': 'http://vocab.getty.edu...
2,Ale,[{'Ale Water': 'http://vocab.getty.edu/tgn/746...
3,Alligator,"[{'Alligator, The': 'http://vocab.getty.edu/tg..."
4,Almond,[{'Almond': 'http://vocab.getty.edu/tgn/112112...


### Querying [Iconclass](http://www.iconclass.nl/home)  
#### Using the [Iconclass database](http://iconclass.org/data/iconclass_20200529_skos_jsonld.ndjson.gz). See also [LOD](http://www.iconclass.org/help/lod) and [ML experiment](https://labs.brill.com/ictestset/)  

In [8]:
@cache(maxsize=None)
def search_iconclas(term, path='../data/vocabularies/iconclass_20200710_skos_jsonld.ndjson'):
    """Find Iconclass concepts whose English preferred label contains ``term``.

    The file at ``path`` is read as newline-delimited JSON, one concept per
    line. A concept matches when any ``skos:prefLabel`` entry tagged
    ``'@language': 'en'`` contains ``term`` as a case-insensitive substring.

    Parameters
    ----------
    term : str
        Text to look for. A non-string term yields an empty list.
    path : str, optional
        Location of the ``.ndjson`` file; defaults to the path this notebook
        has always used.

    Returns
    -------
    list of dict
        One ``{label: concept '@id'}`` dict per matching label, in file order.
        Results are memoised per ``(term, path)``, so repeated calls return
        the same list object - copy it before mutating.

    Raises
    ------
    OSError
        If ``path`` cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    if not isinstance(term, str):
        return []
    needle = term.lower()

    results = []
    with open(path, 'rb') as f:
        for line in f:
            # A blank line (e.g. a trailing newline) is not a record.
            if not line.strip():
                continue
            element = json.loads(line)
            if not isinstance(element, dict) or '@id' not in element:
                continue

            labels = element.get('skos:prefLabel', [])
            # A lone label may be stored as a single object instead of a list.
            if isinstance(labels, dict):
                labels = [labels]
            elif not isinstance(labels, list):
                continue

            for item in labels:
                # Skip only the malformed entry, not the rest of the record.
                if not isinstance(item, dict) or item.get('@language') != 'en':
                    continue
                value = item.get('@value')
                if isinstance(value, str) and needle in value.lower():
                    results.append({value: element['@id']})
    return results

In [9]:
# Look up every FoodOn concept in the Iconclass dump, timing the whole pass.
t0 = time.time()

# Start each concept with its own empty list of hits.
vocabularies['Iconclass'] = np.empty((len(vocabularies), 0)).tolist()

for idx in vocabularies.index:
    term = vocabularies.loc[idx, 'FoodOn']
    result = search_iconclas(term)
    # extend (not append) keeps the cell a flat list of {label: URI} dicts,
    # the same shape as the 'Getty-AAT' column. It also copies the items out
    # of the memoised result list instead of storing that shared list itself.
    vocabularies.loc[idx, 'Iconclass'].extend(result)
    print(f"{idx}+", end='')

# Elapsed seconds for the full pass, then save the enriched frame.
print(time.time() - t0)
vocabularies.to_pickle('./temp2.pkl')

0+1+2+3+4+5+6+7+8+9+10+11+12+13+14+15+16+17+18+19+20+21+22+23+24+25+26+27+28+29+30+31+32+33+34+35+36+37+38+39+40+41+42+43+44+45+46+47+48+49+50+51+52+53+54+55+56+57+58+59+60+61+62+63+64+65+66+67+68+69+70+71+72+73+74+75+76+77+78+79+80+81+82+83+84+85+86+87+88+89+90+91+92+93+94+95+96+97+98+99+100+101+102+103+104+105+106+107+108+109+110+111+112+113+114+115+116+117+118+119+120+121+122+123+124+125+126+127+128+129+130+131+132+133+134+135+136+137+138+139+140+141+142+143+144+145+146+147+148+149+150+151+152+153+154+155+156+157+158+159+160+161+162+163+164+165+166+167+168+169+170+171+172+173+174+175+176+177+178+179+180+181+182+183+184+185+186+187+188+189+190+191+192+193+194+195+196+197+198+199+200+201+202+203+204+205+206+207+208+209+210+211+212+213+214+215+216+217+218+219+220+221+222+223+224+225+226+227+228+229+230+231+232+233+234+235+236+237+238+239+240+241+242+243+244+245+246+247+248+249+250+251+252+253+254+255+256+257+258+259+260+261+262+263+264+265+266+267+268+269+270+271+272+273+274+275+276+27

In [None]:
#vocabularies = pd.read_pickle("./temp2.pkl")

#### Alternatively querying SPARQL endpoint

### Querying [DBPedia](http://dbpedia.org/sparql)

In [None]:
# Read the bz2-compressed pickle of the vocabularies frame.
# NOTE(review): pickle.load can execute arbitrary code - only open trusted files.
with bz2.open('../data/vocabularies.pbz2', 'rb') as f:
    vocabularies = pickle.load(f)

In [None]:
# Show the first twenty rows of the loaded frame.
vocabularies.head(n=20)

#### Finding the exact matches

In [None]:
def _iter_labels(entries):
    """Yield candidate labels from one vocabulary cell.

    Handles plain strings, ``{label: URI}`` dicts (dict keys are treated as
    labels) and nested lists/tuples of either; anything else is ignored.
    """
    for entry in entries:
        if isinstance(entry, str):
            yield entry
        elif isinstance(entry, dict):
            yield from entry.keys()
        elif isinstance(entry, (list, tuple)):
            yield from _iter_labels(entry)


def _first_exact_label(entries, target):
    """Return the first label in ``entries`` equal to ``target``, else NaN."""
    # A cell that is not a list (e.g. a missing value) has no candidates.
    if not isinstance(entries, (list, tuple)):
        return np.nan
    return next((label for label in _iter_labels(entries) if label == target), np.nan)


# The Getty column is built as 'Getty-AAT' in this notebook; fall back to the
# 'Getty-ATT' spelling in case the loaded file uses it.
getty_column = 'Getty-AAT' if 'Getty-AAT' in vocabularies.columns else 'Getty-ATT'

# NOTE(review): the comparison is exact and case-sensitive - confirm that is
# intended, since the FoodOn names may be cased differently from the labels.
for source_column, exact_column in ((getty_column, 'Getty_exact'), ('Iconclass', 'Iclass_exact')):
    # Object dtype so label strings and NaN can share a column; np.nan
    # replaces the np.NaN alias that NumPy 2.0 removed.
    vocabularies[exact_column] = pd.Series(
        [
            _first_exact_label(entries, target)
            for entries, target in zip(vocabularies[source_column], vocabularies['FoodOn'])
        ],
        index=vocabularies.index,
        dtype=object,
    )

In [None]:
# Keep only the concepts that have an exact Getty match.
vocabularies[vocabularies['Getty_exact'].notna()]

In [None]:
# Keep only the concepts that have an exact Iconclass match.
vocabularies[vocabularies['Iclass_exact'].notna()]

#### Percent of exact matches

In [None]:
# Share of concepts (in percent) with an exact match in each vocabulary.
total_rows = len(vocabularies)

getty_matched = vocabularies.dropna(axis=0, subset=["Getty_exact"])
p1 = len(getty_matched) / total_rows * 100
print(f'AAT: {p1:.2f}')

iconclass_matched = vocabularies.dropna(axis=0, subset=["Iclass_exact"])
p2 = len(iconclass_matched) / total_rows * 100
print(f'Iconclass: {p2:.2f}')

In [None]:
# Write the frame, including the exact-match columns, to an Excel workbook.
vocabularies.to_excel(excel_writer="../data/vocabularies.xlsx")

## TBD
### Querying Europeana

https://pro.europeana.eu/page/search  
https://pro.europeana.eu/page/sparql  
http://matthewlincoln.net/2014/07/10/sparql-for-humanists.html  