In [3]:
import csv
import json
import re
import pandas as pd
import numpy as np
import time
from datetime import datetime
from collections import Counter
from SPARQLWrapper import SPARQLWrapper, JSON

In [5]:
# In-memory cache of SPARQL results: maps the full query text to the list of
# triples that query produced, so re-running a relation skips the network call.
sparql_query_cache = {}

In [98]:
def convert_date_string(date_string):
    """Turn a ``YYYY-MM-DDTHH:MM:SSZ`` timestamp into ``"DD Month YYYY"``.

    The day keeps its two-digit form from the input (``"03 February 2001"``)
    and the month name comes from ``strftime("%B")``, so it follows the active
    locale.

    Args:
        date_string: Text to inspect; only strings that match the timestamp
            pattern exactly are converted.

    Returns:
        The reformatted date, or ``None`` when ``date_string`` does not match
        the pattern or does not denote a real calendar date (for example a
        month or day of ``00``).
    """
    pattern = r"^(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})Z$"
    match = re.match(pattern, date_string)
    if not match:
        return None

    # The time-of-day groups are matched for validation only.
    year, month, day = match.groups()[:3]
    try:
        date = datetime(int(year), int(month), int(day))
    except ValueError:
        # Pattern matched but the values are not a valid date (e.g. "1900-00-00");
        # report "not convertible" instead of crashing the caller.
        return None

    month_name = date.strftime("%B")
    return f"{day} {month_name} {year}"
        
def get_splits(triples, splits=(0.4, 0.3, 0.3), seed=None):
    """Randomly partition ``triples`` into train / validation / test lists.

    Rows are selected by shuffled position from the original Python objects,
    so element types are preserved (no round-trip through a NumPy array, which
    would turn mixed int/str rows into all-string rows).

    Args:
        triples: Sequence of rows (lists/tuples) to split.
        splits: Fractions for train and validation at index 0 and 1. The test
            split receives every remaining row, so index 2 is informational.
        seed: Optional seed for a dedicated NumPy generator. When ``None``
            (default) the global ``np.random`` state is used, as before.

    Returns:
        A ``(train, val, test)`` tuple of lists. Each row is returned as a new
        list, so the caller may modify the result without touching the input.
    """
    rows = list(triples)
    total = len(rows)

    if seed is None:
        order = np.random.permutation(total)
    else:
        order = np.random.default_rng(seed).permutation(total)

    train_count = int(total * splits[0])
    val_count = int(total * splits[1])
    val_end = train_count + val_count

    def as_plain(row):
        # Mirror the previous ``ndarray.tolist()`` output: rows come back as lists.
        if isinstance(row, np.ndarray):
            return row.tolist()
        if isinstance(row, (list, tuple)):
            return list(row)
        return row

    def pick(positions):
        return [as_plain(rows[i]) for i in positions]

    return pick(order[:train_count]), pick(order[train_count:val_end]), pick(order[val_end:])

def _triple_record(record_id, tr):
    """Build the full JSON record (labels, ids and sentence) for one triple row."""
    return {"id": record_id, "sub_label": tr[0], "rel_label": tr[1], "obj_label": tr[2], "sent": tr[6], "sub": tr[3], "rel": tr[4], "obj": tr[5]}


def _write_jsonl(path, records):
    """Write ``records`` to ``path`` as JSON Lines, creating parent folders if needed."""
    import os  # local import: ``os`` is not among the notebook's top-level imports

    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w") as out_file:
        for record in records:
            out_file.write(f"{json.dumps(record)}\n")


def save_triples(onto_id, train_all, val_all, test_all):
    """Write the train / validation / test splits of one ontology as JSONL files.

    Files written (relative to the working directory):
      * ``data/{onto_id}/{onto_id}_train.jsonl``       - full records
      * ``data/{onto_id}/{onto_id}_validation.jsonl``  - full records
      * ``data/ground_truth/{onto_id}_ground_truth.jsonl`` - full test records
      * ``data/{onto_id}/{onto_id}_test.jsonl``        - test ``id`` and ``sent`` only

    Args:
        onto_id: Ontology identifier used in directory, file and record names.
        train_all, val_all, test_all: Lists of rows indexed as
            ``[sub_label, rel_label, obj_label, sub, rel, obj, sent]``.

    Record ids are 1-based and look like ``{onto_id}_train_1``,
    ``{onto_id}_val_1`` and ``{onto_id}_test_1``. Missing output directories
    are created instead of raising ``FileNotFoundError``.
    """
    _write_jsonl(
        f"data/{onto_id}/{onto_id}_train.jsonl",
        (_triple_record(f"{onto_id}_train_{idx+1}", tr) for idx, tr in enumerate(train_all)),
    )
    _write_jsonl(
        f"data/{onto_id}/{onto_id}_validation.jsonl",
        (_triple_record(f"{onto_id}_val_{idx+1}", tr) for idx, tr in enumerate(val_all)),
    )
    _write_jsonl(
        f"data/ground_truth/{onto_id}_ground_truth.jsonl",
        (_triple_record(f"{onto_id}_test_{idx+1}", tr) for idx, tr in enumerate(test_all)),
    )
    # The public test file deliberately omits the triple: only id and sentence.
    _write_jsonl(
        f"data/{onto_id}/{onto_id}_test.jsonl",
        ({"id": f"{onto_id}_test_{idx+1}", "sent": tr[6]} for idx, tr in enumerate(test_all)),
    )
    
def get_triples_with_sentences(relation_pid: str, relation_label: str, rel_domain: str, rel_range: str, limit: int = 200):
    """Collect Wikidata triples for one relation and attach a sentence to each.

    A SPARQL query selects subjects that are instances (``P31`` followed by any
    number of ``P279`` steps) of ``rel_domain`` and carry a ``relation_pid``
    statement; when ``rel_range`` is given the objects are restricted to it.
    Query results are memoised in the module-level ``sparql_query_cache`` under
    the query text. Every triple is then looked up in the module-level
    ``sent_index`` via ``create_key``; triples without a sentence are skipped
    and collection stops as soon as ``limit`` triples have one.

    The module-level flags ``show_query`` and ``show_sample`` control whether
    the query text and the first five triples are printed.

    Args:
        relation_pid: Property id of the relation (must be non-empty).
        relation_label: Label stored in each triple and used in the lookup key.
        rel_domain: Entity id that subjects must be an instance of (non-empty).
        rel_range: Entity id that objects must belong to; falsy disables the filter.
        limit: Maximum number of rows returned; one tenth of it is also the
            per-subject / per-object threshold used to order the raw triples.

    Returns:
        A list of new 7-item lists
        ``[sub_label, rel_label, obj_label, sub_id, rel_pid, obj_id, sentence]``.
        ``obj_id`` is ``None`` when the result had no English object label.

    Raises:
        AssertionError: If ``relation_pid`` or ``rel_domain`` is empty.
    """
    assert relation_pid, "relation id can't be empty"
    assert rel_domain, "domain can't be empty"

    ## build the SPARQL query
    sparql = SPARQLWrapper("https://query.wikidata.org/bigdata/namespace/wdq/sparql")
    query = "PREFIX wdt: <http://www.wikidata.org/prop/direct/> \n PREFIX wd: <http://www.wikidata.org/entity/> \n"
    query += "SELECT DISTINCT ?sub ?subEntity ?objEntity ?objLabel { \n ?subEntity wdt:P31/wdt:P279* wd:" + rel_domain + " . \n"
    query += '?subEntity rdfs:label ?sub . FILTER (lang(?sub) = "en") \n '
    query += '?subEntity wdt:' + relation_pid + ' ?objEntity . \n'
    if rel_range:
        query += '?objEntity wdt:P31*/wdt:P279* wd:' + rel_range + ' . \n '
    query += 'OPTIONAL { ?objEntity rdfs:label ?objLabel . FILTER (lang(?objLabel) = "en") } \n } '
    # Fetch far more rows than `limit`: many are dropped later because no
    # sentence exists for them or because their subject/object repeats.
    query += "LIMIT 10000"
    if show_query:
        print(query)

    if query in sparql_query_cache:
        # NOTE: the cached ordering reflects the `limit` of the call that filled the cache.
        triples = sparql_query_cache[query]
    else:
        # execute the query and get a set of triples
        triples = list()
        subject_counter, object_counter = Counter(), Counter()
        secondary_triples = list()
        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        sparql.setTimeout(1500)
        results = sparql.query().convert()
        print(f'  {len(results["results"]["bindings"])} SPARQL results.')
        for result in results["results"]["bindings"]:
            t_subject = result['sub']['value']
            if 'objLabel' in result:
                t_object = result['objLabel']['value']
                t_object_id = result['objEntity']['value'].replace("http://www.wikidata.org/entity/","")
            else:
                # No English label: keep the raw value, prettified when it is a timestamp.
                t_object = result['objEntity']['value']
                date_string = convert_date_string(t_object)
                if date_string:
                    t_object = date_string
                t_object_id = None
            t_subject_id = result['subEntity']['value'].replace("http://www.wikidata.org/entity/","")
            triple = [t_subject, relation_label, t_object, t_subject_id, relation_pid, t_object_id]
            # For a diverse dataset, triples whose subject or object has already
            # appeared more than 10% of `limit` times are deferred to the end.
            subject_counter[t_subject] += 1
            object_counter[t_object] += 1
            if subject_counter[t_subject] > (limit / 10) or object_counter[t_object] > (limit / 10):
                secondary_triples.append(triple)
                continue
            triples.append(triple)

        # diverse triples first, all the deferred ones afterwards
        triples += secondary_triples
        sparql_query_cache[query] = triples

    print(f"  collected {len(triples)} triples")
    if show_sample:
        print("  sample:")
        for tr in triples[:5]:
            print(f"    {tr[:3]}")

    triples_with_sentences = list()
    for tr in triples:
        search_key = create_key(tr[0], tr[1], tr[2])
        if search_key not in sent_index:
            continue
        # Build a NEW row instead of appending to `tr`: `tr` lives in
        # sparql_query_cache and must stay a 6-item triple across calls.
        triples_with_sentences.append(tr + [sent_index[search_key]])

        # only triples that actually have a sentence count towards the limit
        if len(triples_with_sentences) >= limit:
            break

    return triples_with_sentences

def create_key(sub_label, rel_label, obj_label):
    """Build the lookup key for a (subject, relation, object) label triple.

    Each label has all whitespace removed and is lower-cased; the three
    results are concatenated in subject, relation, object order with no
    separator.
    """
    whitespace = re.compile(r"\s+")
    normalised = (
        whitespace.sub('', label).lower()
        for label in (sub_label, rel_label, obj_label)
    )
    return "".join(normalised)

In [8]:
# Lookup table from a normalised triple key (as built by create_key) to the
# sentence associated with that triple.
sent_index = {}

In [9]:
# Build the triple -> sentence index from the TekGen CSV.
# Columns 0-2 of each row feed create_key and column 4 is stored as the sentence;
# when several rows share a key, the last one wins.
start_time = time.time()
print("TekGen corpus processing started!")
# newline='' is what the csv module expects, so quoted fields containing line
# breaks are parsed correctly.
with open('tekgen.csv', newline='') as csv_in_file:
    sent_reader = csv.reader(csv_in_file)
    # Skip the first row (presumably a header - confirm against the file);
    # the default avoids StopIteration on an empty file.
    next(sent_reader, None)
    for row in sent_reader:
        tr_key = create_key(row[0], row[1], row[2])
        sent = row[4]
        sent_index[tr_key] = sent
# Measure once after the loop: cheaper than per row, and defined even when the
# file contains no data rows.
elapsed_time = (time.time()-start_time)/60
print(f"\ttriple-to-sent index with {len(sent_index)} triples loaded in {elapsed_time:.2f} mins!")

TekGen corpus processing started!
	triple-to-sent index with 11358950 triples loaded in 2.53 mins!


In [93]:
# Ontology definition files (JSON) to process, in order.
ontology_paths = [
    '5_military_ontology.json',
]

In [96]:
# Parse every ontology definition listed in ontology_paths, preserving order.
ontologies = []
for ontology_path in ontology_paths:
    # Read JSON as UTF-8 explicitly rather than relying on the platform's
    # default text encoding.
    with open(ontology_path, encoding="utf-8") as in_file:
        ontologies.append(json.load(in_file))

In [99]:
# Verbosity flags read by get_triples_with_sentences.
show_sample = True
show_query = False

# Maximum number of sentence-aligned triples collected per relation.
TRIPLES_PER_RELATION = 200
# Fixed seed so the random train/validation/test assignment is reproducible.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

for onto in ontologies:
    print(f"Ontology: {onto['title']} ({onto['id']})")
    onto_id = onto['id']
    train_all, val_all, test_all = [], [], []
    for rel in onto['relations']:
        print(f"\nprocessing \"{rel['label']}\" ({rel['pid']}) relation:")
        start_time = time.time()
        triples_with_sentences = get_triples_with_sentences(rel['pid'], rel['label'], rel['domain'], rel['range'], TRIPLES_PER_RELATION)
        elapsed_time = (time.time()-start_time)
        print(f"    {len(triples_with_sentences)} triples with sentences in {elapsed_time:.2f} seconds!")
        # Split each relation separately so every relation is represented in all three sets.
        train, val, test = get_splits(triples_with_sentences)
        train_all += train
        val_all += val
        test_all += test
    # One set of output files per ontology, covering all of its relations.
    save_triples(onto_id, train_all, val_all, test_all)

Ontology: Military Ontology (ont_5_military)

processing "military rank" (P410) relation:
  collected 10000 triples
  sample:
    ['Wilhelm von Tegetthoff', 'military rank', 'admiral']
    ['Maurice of Nassau', 'military rank', 'admiral']
    ['Prince Luigi Amedeo, Duke of the Abruzzi', 'military rank', 'admiral']
    ['Roger of Lauria', 'military rank', 'admiral']
    ['Kantarō Suzuki', 'military rank', 'admiral']
    200 triples with sentences in 0.01 seconds!

processing "military branch" (P241) relation:
  collected 10000 triples
  sample:
    ['Sean Connery', 'military branch', 'Royal Navy']
    ['Mikhail Katukov', 'military branch', 'Red Army']
    ['Manuel Blanco Encalada', 'military branch', 'Chilean Navy']
    ['Park Chung-hee', 'military branch', 'Republic of Korea Army']
    ['Erwin Rommel', 'military branch', 'Imperial German Army']
    200 triples with sentences in 0.01 seconds!

processing "military casualty classification " (P1347) relation:
  collected 5664 triples
  sa