In [1]:
import json, wikipedia, os, random
from urllib.parse import unquote
from tqdm import tqdm
from wikidata.client import Client
from requests_ratelimiter import LimiterSession

In [2]:
# Directory that holds the per-property KAMEL files.
KAMEL_DIRECTORY = "data/KAMEL"

# The property list is read once; the context manager closes the file handle
# as soon as the JSON has been parsed instead of leaving it open for the
# lifetime of the kernel.
with open("data/ISWC/kamel_properties.json", "r") as properties_file:
    KAMEL_PROPERTIES = json.load(properties_file)

# Headers attached to every SPARQL request; the User-Agent identifies this
# project and gives a contact address.
QUERY_HEADERS = {
    'User-Agent': 'NeSyIntSem/0.1 (https://github.com/bradleypallen/nesy-intentional-semantics; b.p.allen@uva.nl)',
}

In [3]:
def get_wikipedia_summary(page_url):
    """Return the summary of the Wikipedia article that ``page_url`` points to.

    The article title is everything after ``/wiki/`` when that marker is
    present (so titles that themselves contain a slash, such as ``AC/DC``,
    survive); otherwise it is the last non-empty path segment. The title is
    percent-decoded and underscores are turned into spaces before the lookup.

    Args:
        page_url: URL (or bare title) of a Wikipedia article.

    Returns:
        The article summary, ``"Page not found."`` when no title can be
        extracted or the page does not exist, or a message listing the
        candidate titles when the title is a disambiguation page.
    """
    wiki_marker = "/wiki/"
    if wiki_marker in page_url:
        raw_title = page_url.split(wiki_marker, 1)[1]
    else:
        # A trailing slash would otherwise leave an empty last segment.
        raw_title = page_url.rstrip("/").split("/")[-1]

    page_title = unquote(raw_title).replace("_", " ").strip()
    if not page_title:
        return "Page not found."

    try:
        # The title comes from a URL, so look it up exactly; auto-suggest may
        # substitute a different article. This matches the summary lookups
        # elsewhere in this notebook, which also pass auto_suggest=False.
        return wikipedia.page(page_title, auto_suggest=False).summary
    except wikipedia.exceptions.PageError:
        return "Page not found."
    except wikipedia.exceptions.DisambiguationError as e:
        return f"Disambiguation page. Possible matches: {e.options}"

In [4]:
def get_entity_id_from_label(session, property, label, obj):
    """Find the Wikidata item with the given English label that has ``property`` -> ``obj``.

    The label's first character is upper-cased, then one SPARQL query is sent
    that matches the label against either ``rdfs:label`` or ``skos:altLabel``
    (English) of items having the statement ``wdt:<property> wd:<obj>``.

    Args:
        session: Object with a ``get(url, params=...)`` method returning a
            response that exposes ``status_code`` and ``json()``.
        property: Wikidata property id, e.g. ``"P19"``.
        label: English label (or alias) of the subject being looked up.
        obj: Wikidata item id of the statement's object, e.g. ``"Q84"``.

    Returns:
        The item id (last path segment of the entity URI) of the first match,
        or ``None`` when the request does not return HTTP 200 or nothing matches.
    """
    label = label[:1].upper() + label[1:]

    # The label ends up inside a double-quoted SPARQL string literal, so the
    # characters that would terminate or corrupt the literal must be escaped.
    # The backslash is handled first so later escapes are not doubled.
    literal = (
        label.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )

    query = f'SELECT ?item WHERE {{ {{ ?item wdt:{property} wd:{obj} ; rdfs:label "{literal}"@en . }} UNION {{ ?item wdt:{property} wd:{obj} ; skos:altLabel "{literal}"@en . }} }} LIMIT 1'
    url = "https://query.wikidata.org/sparql"
    params = {"query": query, "format": "json"}
    response = session.get(url, params=params)

    if response.status_code != 200:
        return None

    bindings = response.json()["results"]["bindings"]
    if not bindings:
        return None

    # The binding value is an entity URI; its last segment is the item id.
    return bindings[0]["item"]["value"].split("/")[-1]

In [5]:
def _describe_entity(client, entity_id):
    """Load a Wikidata entity and build its ``id`` / ``label`` / ``description`` record.

    The description is the entity label followed by the Wikipedia summary
    looked up under that same label (exact title, no auto-suggest). Any error
    raised by the Wikidata or Wikipedia lookup propagates to the caller.
    """
    entity = client.get(entity_id, load=True)
    label = str(entity.label)
    return {
        "id": str(entity.id),
        "label": label,
        "description": f'{label}: {wikipedia.summary(label, auto_suggest=False)}',
    }


def _statement_key(predicate, subject, objekt):
    """Return the ``label(subject,object)`` string that identifies one statement."""
    return f'{predicate["label"]}({subject["label"]},{objekt["label"]})'


def _sample_negatives(predicate, subjects, objects, known_statements, count):
    """Pick up to ``count`` distinct subject/object pairings not in ``known_statements``.

    All pairings are enumerated up front, so the function always terminates
    and returns fewer than ``count`` examples when not enough unused pairings
    exist. Each returned example is marked ``"in_extension": "0"``.
    """
    candidates = {}
    for subject in subjects:
        for objekt in objects:
            key = _statement_key(predicate, subject, objekt)
            if key not in known_statements and key not in candidates:
                candidates[key] = (subject, objekt)

    # Sorting gives random.sample a stable sequence to draw from.
    chosen = random.sample(sorted(candidates), min(count, len(candidates)))
    return [
        {
            "predicate": predicate,
            "arguments": [candidates[key][0], candidates[key][1]],
            "in_extension": "0",
        }
        for key in chosen
    ]


# One Wikidata client and one rate-limited session serve every property, so
# the per-minute request budget is shared across the whole run.
wikidata_client = Client()
session = LimiterSession(per_minute=30)
session.headers.update(QUERY_HEADERS)

for property_id in KAMEL_PROPERTIES:
    filename = f"data/NESY/{property_id}.json"
    if os.path.isfile(filename):
        # An existing output file acts as the cache marker for this property.
        print(f'Skipping {property_id}')
        continue

    test_path = os.path.join(KAMEL_DIRECTORY, property_id, "test.jsonl")
    with open(test_path, "r", encoding="utf-8") as test_file:
        data = [json.loads(line) for line in test_file if line.strip()]

    prop = wikidata_client.get(property_id, load=True)
    predicate = {
        "id": property_id,
        "label": str(prop.label),
        "definition": f'{str(prop.label)}: {str(prop.description)}.'
    }

    positives = []
    subjects = []
    objects = []
    statements = set()
    skipped = 0

    for datum in tqdm(data, desc=f'{property_id}', total=len(data)):
        object_id = datum['obj_uri'][0]
        subj_id = get_entity_id_from_label(session, property_id, datum['sub_label'], object_id)
        if not subj_id:
            continue
        try:
            subject = _describe_entity(wikidata_client, subj_id)
            subjects.append(subject)
            objekt = _describe_entity(wikidata_client, object_id)
            objects.append(objekt)
        except Exception:
            # Best effort: an entity that cannot be loaded or has no matching
            # Wikipedia summary is dropped, but counted so the loss is
            # visible. KeyboardInterrupt is deliberately not caught.
            skipped += 1
            continue

        positives.append({
            "predicate": predicate,
            "arguments": [subject, objekt],
            "in_extension": "1"
        })
        # Positives and negatives must share one key format; otherwise a
        # sampled "negative" could silently repeat a positive statement.
        statements.add(_statement_key(predicate, subject, objekt))

    negatives = _sample_negatives(predicate, subjects, objects, statements, len(positives))

    if skipped:
        print(f'{property_id}: skipped {skipped} examples whose entities could not be described')
    if len(negatives) < len(positives):
        print(f'{property_id}: only {len(negatives)} negatives available for {len(positives)} positives')

    with open(filename, "w") as output_file:
        json.dump(positives + negatives, output_file)

Skipping P607
Skipping P277


P585: 100%|██████████| 200/200 [06:03<00:00,  1.82s/it]


Skipping P467
Skipping P197
Skipping P915
Skipping P200
Skipping P87
Skipping P7959
Skipping P159
Skipping P931
Skipping P509
Skipping P306
Skipping P1038
Skipping P30




  lis = BeautifulSoup(html).find_all('li')
P39: 100%|██████████| 200/200 [06:40<00:00,  2.00s/it]
P710: 100%|██████████| 200/200 [06:42<00:00,  2.01s/it]
P2094: 100%|██████████| 200/200 [06:17<00:00,  1.89s/it]
P1350: 100%|██████████| 200/200 [06:03<00:00,  1.82s/it]
P171: 100%|██████████| 200/200 [06:38<00:00,  1.99s/it]
P5353: 100%|██████████| 200/200 [06:33<00:00,  1.97s/it]
P571: 100%|██████████| 200/200 [06:02<00:00,  1.81s/it]
P4552: 100%|██████████| 200/200 [06:32<00:00,  1.96s/it]
P7153: 100%|██████████| 200/200 [06:39<00:00,  2.00s/it]
P1308: 100%|██████████| 200/200 [06:33<00:00,  1.97s/it]
P1408: 100%|██████████| 200/200 [06:54<00:00,  2.07s/it]
P488: 100%|██████████| 200/200 [06:38<00:00,  1.99s/it]
P2437: 100%|██████████| 200/200 [06:02<00:00,  1.81s/it]
P2031: 100%|██████████| 200/200 [06:04<00:00,  1.82s/it]
P737: 100%|██████████| 200/200 [06:44<00:00,  2.02s/it]
P131: 100%|██████████| 200/200 [06:46<00:00,  2.03s/it]
P183:   4%|▍         | 8/200 [00:17<06:55,  2.16s/i