In [1]:
import json, wikipedia, os, random
from tqdm import tqdm
from wikidata.client import Client
from requests_ratelimiter import LimiterSession

In [2]:
# Root of the KAMEL dataset; every sub-directory name is used later as a
# Wikidata property ID (e.g. "P571").
KAMEL_DIRECTORY = "data/KAMEL"
# Fixed seed so that Restart Kernel -> Run All selects the same property sample.
RANDOM_SEED = 42
# os.listdir() returns entries in arbitrary order, so sort before shuffling;
# otherwise even a seeded shuffle would not be reproducible across machines.
KAMEL_PROPERTIES = sorted(
    name
    for name in os.listdir(KAMEL_DIRECTORY)
    if os.path.isdir(os.path.join(KAMEL_DIRECTORY, name))
)
# A private Random instance keeps the module-level RNG state untouched.
random.Random(RANDOM_SEED).shuffle(KAMEL_PROPERTIES)
# Number of properties (taken from the front of the shuffled list) to process.
SAMPLE_SIZE = 20
# Headers sent with every SPARQL request to identify this client.
QUERY_HEADERS = {
    'User-Agent': 'NeSyIntSem/0.1 (https://github.com/bradleypallen/nesy-intentional-semantics; b.p.allen@uva.nl)',
}

In [3]:
def get_wikipedia_summary(page_url):
    """Return the summary text of the Wikipedia page that ``page_url`` points to.

    The page title is taken from the part of the URL after ``/wiki/`` (or,
    when that marker is absent, from the last path segment), percent-decoded,
    and with underscores turned into spaces.  The lookup is done through the
    ``wikipedia`` package with ``auto_suggest=False`` so the exact title is
    used rather than a search suggestion, matching the other lookups in this
    notebook.

    Args:
        page_url: URL (or bare title) of a Wikipedia page.

    Returns:
        The page summary, ``"Page not found."`` when the title is empty or the
        page does not exist, or a message listing the candidate titles when
        the title resolves to a disambiguation page.
    """
    from urllib.parse import unquote

    # A trailing slash would otherwise yield an empty last segment.
    path = page_url.rstrip("/")
    marker = "/wiki/"
    if marker in path:
        # Keep everything after /wiki/ so titles containing "/" stay intact.
        raw_title = path.split(marker, 1)[1]
    else:
        raw_title = path.rsplit("/", 1)[-1]
    # URLs carry percent-encoded titles with underscores instead of spaces.
    page_title = unquote(raw_title).replace("_", " ")
    if not page_title:
        return "Page not found."
    try:
        page = wikipedia.page(page_title, auto_suggest=False)
        return page.summary
    except wikipedia.exceptions.PageError:
        return "Page not found."
    except wikipedia.exceptions.DisambiguationError as e:
        return f"Disambiguation page. Possible matches: {e.options}"

In [4]:
def get_entity_id_from_label(session, property, label, obj):
    """Look up the Wikidata ID of an item by its English label and one statement.

    Sends a SPARQL query to the Wikidata query service asking for one item
    that has the statement ``wdt:<property> wd:<obj>`` and whose English
    ``rdfs:label`` or ``skos:altLabel`` equals ``label`` (with its first
    character upper-cased).

    Args:
        session: object exposing ``get(url, params=...)`` and returning a
            response with ``status_code`` and ``json()``.
        property: Wikidata property ID, interpolated after ``wdt:``.
        label: English label of the item being searched for.
        obj: Wikidata item ID of the statement's object, interpolated after ``wd:``.

    Returns:
        The last path segment of the matched item URI (e.g. ``"Q42"``), or
        ``None`` when the response status is not 200 or nothing matched.
    """
    label = label[:1].upper() + label[1:]
    # Escape characters that would terminate or corrupt a SPARQL string
    # literal; without this a label containing '"' or '\' yields a malformed
    # query and the lookup silently fails.  Backslash must be replaced first.
    escaped = (
        label.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
        .replace("\t", "\\t")
    )
    query = f'SELECT ?item WHERE {{ {{ ?item wdt:{property} wd:{obj} ; rdfs:label "{escaped}"@en . }} UNION {{ ?item wdt:{property} wd:{obj} ; skos:altLabel "{escaped}"@en . }} }} LIMIT 1'
    url = "https://query.wikidata.org/sparql"
    params = {"query": query, "format": "json"}
    response = session.get(url, params=params)
    if response.status_code != 200:
        return None

    bindings = response.json()["results"]["bindings"]
    if not bindings:
        return None
    # The item value is a full entity URI; keep only the trailing ID.
    return bindings[0]["item"]["value"].split("/")[-1]

In [9]:
# For each sampled KAMEL property, resolve every test record's subject to a
# Wikidata item, enrich subject and object with Wikipedia summaries, and write
# the resulting positive examples to data/ISWC/<property>.json.
for property in KAMEL_PROPERTIES[:SAMPLE_SIZE]:
    filename = f"data/ISWC/{property}.json"
    if os.path.isfile(filename):
        # An existing output file marks this property as already processed.
        print(f'Skipping {property}')
        continue

    source_path = os.path.join(KAMEL_DIRECTORY, property, "test.jsonl")
    with open(source_path, "r", encoding="utf-8") as source_file:
        # One JSON object per line; blank lines are ignored.
        data = [json.loads(line) for line in source_file if line.strip()]

    wikidata_client = Client()
    prop = wikidata_client.get(property, load=True)
    prop_label = str(prop.label)
    prop_definition = f'{prop_label}: {str(prop.description)}.'
    session = LimiterSession(per_minute=30)
    session.headers.update(QUERY_HEADERS)

    new_data = []
    skipped = {}  # exception class name -> number of records dropped because of it
    for datum in tqdm(data, desc=f'{property}', total=len(data)):
        obj_id = datum['obj_uri'][0]
        subj_id = get_entity_id_from_label(session, property, datum['sub_label'], obj_id)
        if not subj_id:
            continue
        try:
            subj = wikidata_client.get(subj_id, load=True)
            obj = wikidata_client.get(obj_id, load=True)
            subj_label = str(subj.label)
            obj_label = str(obj.label)
            new_datum = {
                "predicate": {
                    "id": property,
                    "label": prop_label,
                    "definition": prop_definition
                },
                "arguments": [
                    {
                        "id": str(subj.id),
                        "label": subj_label,
                        "description": f'{subj_label}: {wikipedia.summary(subj_label, auto_suggest=False)}'
                    },
                    {
                        "id": str(obj.id),
                        "label": obj_label,
                        "description": f'{obj_label}: {wikipedia.summary(obj_label, auto_suggest=False)}'
                    }
                ],
                "in_extension": "1"
            }
        except Exception as exc:
            # Best effort: a record whose entities or summaries cannot be
            # fetched is dropped, but the failure is counted rather than
            # hidden.  Catching Exception (not a bare except) keeps
            # KeyboardInterrupt able to stop this long-running loop.
            reason = type(exc).__name__
            skipped[reason] = skipped.get(reason, 0) + 1
            continue
        new_data.append(new_datum)

    if skipped:
        print(f'{property}: skipped {sum(skipped.values())} of {len(data)} records {skipped}')

    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, "w", encoding="utf-8") as output_file:
        json.dump(new_data, output_file)

P5353: 100%|██████████| 200/200 [06:41<00:00,  2.01s/it]
P571: 100%|██████████| 200/200 [06:03<00:00,  1.82s/it]
P4552: 100%|██████████| 200/200 [06:39<00:00,  2.00s/it]


In [10]:
# Save the full shuffled property list (not only the processed sample) so the
# order used for sampling is recorded next to the generated files.  The
# context manager guarantees the file is flushed and closed.
with open("data/ISWC/kamel_properties.json", "w") as properties_file:
    json.dump(KAMEL_PROPERTIES, properties_file)