In [94]:
import csv
import json
import os
import re
import time
from collections import Counter
from datetime import datetime

import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON

In [None]:
def convert_date_string(date_string):
    """Convert a ``YYYY-MM-DDTHH:MM:SSZ`` timestamp into ``"DD Month YYYY"``.

    Args:
        date_string: Text to convert.

    Returns:
        ``"<day> <month name> <year>"`` where day and year are copied verbatim
        from the input (so the day keeps its leading zero), or ``None`` when
        the input does not match the timestamp pattern or does not denote a
        valid calendar date.
    """
    pattern = r"^(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})Z$"
    match = re.match(pattern, date_string)
    if not match:
        return None
    # Only the date part is used; the time-of-day groups are ignored.
    year, month, day = match.groups()[:3]
    try:
        date = datetime(int(year), int(month), int(day))
    except ValueError:
        # The pattern matched but the numbers are not a real date
        # (e.g. month 00 or 13, day 31 in a 30-day month, year 0000).
        return None
    # %B yields the full month name for the active locale.
    return f"{day} {date.strftime('%B')} {year}"
    
def get_triples_with_sentences(relation_pid: str, relation_label: str, rel_domain: str, rel_range: str, limit: int = 100, candidate_limit: int = 10000):
    """Fetch Wikidata triples for one relation and pair them with indexed sentences.

    Queries the Wikidata SPARQL endpoint for subjects that are instances
    (``wdt:P31``) of ``rel_domain`` and carry a ``relation_pid`` statement,
    then keeps the triples for which the module-level ``sent_index`` holds a
    sentence under ``create_key(subject, relation_label, object)``.

    To keep the result diverse, a triple whose subject label or object label
    has already been seen more than ``limit / 10`` times is parked in a
    fallback list, which is only used to top the candidates up to ``limit``.

    Args:
        relation_pid: Wikidata property id of the relation; must be non-empty.
        relation_label: Label of the relation; stored in each triple and used
            for the sentence lookup key.
        rel_domain: Wikidata entity id the subjects must be an instance of;
            must be non-empty.
        rel_range: Accepted for interface compatibility; not used here.
        limit: Maximum number of rows in the returned frame.
        candidate_limit: ``LIMIT`` applied to the SPARQL query. It should be
            well above ``limit`` because candidates are dropped when they
            repeat too often or have no matching sentence.

    Returns:
        A DataFrame with the columns ``subject``, ``relation``, ``object``,
        ``subject_entity``, ``object_entity`` and ``sentence``.

    Raises:
        AssertionError: If ``relation_pid`` or ``rel_domain`` is empty.
    """
    assert relation_pid, "relation id can't be empty"
    assert rel_domain, "domain can't be empty"
    print(f"processing {relation_label} relation:")

    entity_prefix = "http://www.wikidata.org/entity/"

    # Build the SPARQL query. Every prefix used below is declared explicitly so
    # the query does not depend on the endpoint predefining ``rdfs:``.
    sparql = SPARQLWrapper("https://query.wikidata.org/bigdata/namespace/wdq/sparql")
    query = (
        "PREFIX wdt: <http://www.wikidata.org/prop/direct/> \n"
        " PREFIX wd: <http://www.wikidata.org/entity/> \n"
        " PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> \n"
        "SELECT DISTINCT ?sub ?subEntity ?objEntity ?objLabel { "
        f"?subEntity wdt:P31 wd:{rel_domain} . "
        '?subEntity rdfs:label ?sub . FILTER (lang(?sub) = "en") '
        f"?subEntity wdt:{relation_pid} ?objEntity . "
        'OPTIONAL { ?objEntity rdfs:label ?objLabel . FILTER (lang(?objLabel) = "en") }} '
        # Over-fetch: many candidates are discarded further down.
        f"LIMIT {int(candidate_limit)}"
    )

    # Execute the query and turn each binding into a triple.
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    triples = list()
    secondary_triples = list()
    subject_counter, object_counter = Counter(), Counter()
    max_repeats = limit / 10
    for result in results["results"]["bindings"]:
        t_subject = result['sub']['value']
        t_subject_id = result['subEntity']['value'].replace(entity_prefix, "")
        if 'objLabel' in result:
            t_object = result['objLabel']['value']
            t_object_id = result['objEntity']['value'].replace(entity_prefix, "")
        else:
            # No English label: the object is used as-is, except that
            # timestamps are rewritten into a readable date.
            raw_object = result['objEntity']['value']
            t_object = convert_date_string(raw_object) or raw_object
            t_object_id = None
        triple = [t_subject, relation_label, t_object, t_subject_id, t_object_id]

        # Count before checking so the cap applies to the current occurrence too.
        subject_counter[t_subject] += 1
        object_counter[t_object] += 1
        if subject_counter[t_subject] > max_repeats or object_counter[t_object] > max_repeats:
            secondary_triples.append(triple)
        else:
            triples.append(triple)

    # If the diverse set is smaller than requested, top it up with triples
    # that were set aside because of repetition.
    if len(triples) < limit:
        triples += secondary_triples[:limit - len(triples)]

    print(f"\tcollected {len(triples)} triples")

    # Keep only triples that have a sentence in the index; stop at the limit.
    triples_with_sentences = list()
    for tr in triples:
        search_key = create_key(tr[0], tr[1], tr[2])
        if search_key not in sent_index:
            continue
        # Build a new row instead of mutating the collected triple.
        triples_with_sentences.append(tr + [sent_index[search_key]])
        if len(triples_with_sentences) >= limit:
            break

    columns = ["subject", "relation", "object", "subject_entity", "object_entity", "sentence"]
    df = pd.DataFrame(triples_with_sentences, columns=columns)
    return df

In [None]:
def create_key(sub_label, rel_label, obj_label):
    """Build a lookup key from a (subject, relation, object) label triple.

    Each label has all whitespace removed and is lower-cased; the three
    normalized labels are then concatenated with no separator.
    """
    whitespace = re.compile(r"\s+")
    normalized_parts = (
        whitespace.sub('', label).lower()
        for label in (sub_label, rel_label, obj_label)
    )
    return "".join(normalized_parts)


# Build the sentence index: normalized triple key (see create_key) -> sentence.
# relation_counter tallies how many rows each relation label (column 1) has.
sent_index = dict()
relation_counter = Counter()
start_time = time.time()
# newline='' is what the csv module requires for files handed to csv.reader.
# The encoding is pinned so decoding does not depend on the platform default
# (assumes tekgen.csv is UTF-8 -- TODO confirm).
with open('tekgen.csv', newline='', encoding='utf-8') as csv_in_file:
    sent_reader = csv.reader(csv_in_file)
    # Discard the first row (presumably the header); tolerate an empty file.
    next(sent_reader, None)
    for row in sent_reader:
        # Columns 0-2 are passed as subject/relation/object labels; column 4 is
        # stored as the sentence. A later row with the same key overwrites an
        # earlier one.
        tr_key = create_key(row[0], row[1], row[2])
        relation_counter[row[1]] += 1
        sent = row[4]
        sent_index[tr_key] = sent
# Measured once after the loop, so it is defined even when there are no data rows.
elapsed_time = (time.time()-start_time)/60
print(f"sent index loaded in {elapsed_time:.2f} mins!")

In [39]:
# Ontology JSON files to process; add further paths here.
ontology_paths = [
    "movie_ontology.json",
]

In [40]:
# Parse every file listed in ontology_paths; results are kept in list order.
ontologies = []
for ontology_path in ontology_paths:
    # Decode explicitly as UTF-8 (the standard JSON encoding) instead of
    # relying on the platform's default text encoding.
    with open(ontology_path, encoding="utf-8") as in_file:
        ontologies.append(json.load(in_file))

In [93]:
# Collect sentence-aligned triples for every relation of every ontology and
# write one CSV per relation.
OUTPUT_DIR = "data"
TRIPLES_PER_RELATION = 250  # maximum number of rows kept per relation

# DataFrame.to_csv does not create missing directories; make sure the target
# exists up front so a run does not fail after the first (slow) SPARQL query.
os.makedirs(OUTPUT_DIR, exist_ok=True)

rel_df_list = list()
for onto in ontologies:
    onto_id = onto['id']
    for rel in onto['relations']:
        start_time = time.time()
        tr_df = get_triples_with_sentences(rel['pid'], rel['label'], rel['domain'], rel['range'], TRIPLES_PER_RELATION)
        elapsed_time = (time.time()-start_time)
        print(f"\t{len(tr_df)} triples with sentences in {elapsed_time:.2f} seconds!")
        rel_df_list.append(tr_df)
        # Whitespace in the relation label is replaced so it is file-name friendly.
        rel_label = re.sub(r"\s+", '_', rel['label'])
        tr_df.to_csv(f"{OUTPUT_DIR}/{onto_id}_{rel['pid']}_{rel_label}.csv", encoding='utf-8')

processing director relation:
	collected 9975 triples
[['Kick-Ass', 'director', 'Matthew Vaughn', 'Q2201', 'Q2593'], ['The Last Winter', 'director', 'Frank Dunlop', 'Q24911', 'Q5486336'], ['A Gang Story', 'director', 'Olivier Marchal', 'Q593', 'Q694259']]
	250 triples with sentences in 5.05 seconds!
processing screenwriter relation:
	collected 9139 triples
[['A Clockwork Orange', 'screenwriter', 'Stanley Kubrick', 'Q181086', 'Q2001'], ['Night Magic', 'screenwriter', 'Leonard Cohen', 'Q3341388', 'Q1276'], ['The Master and Margarita', 'screenwriter', 'Mikhail Bulgakov', 'Q1963505', 'Q835']]
	250 triples with sentences in 4.81 seconds!
processing genre relation:
	collected 2268 triples
[['July 14', 'genre', 'comedy film', 'Q24585', 'Q157443'], ['The Trial', 'genre', 'drama film', 'Q35725', 'Q130232'], ['July 14', 'genre', 'comedy drama', 'Q24585', 'Q859369']]
	167 triples with sentences in 2.38 seconds!
processing based on relation:
	collected 9978 triples
[['Clue', 'based on', 'Cluedo', 

In [90]:
# Display the DataFrame stored at index 9 of rel_df_list (one frame is appended per processed relation).
rel_df_list[9]

Unnamed: 0,subject,relation,object,subject_entity,object_entity,sentence
0,The Robe,characters,Jesus,Q1139023,Q302,The Robe is a 1942 historical novel about the ...
1,Chopin: Desire for Love,characters,George Sand,Q670904,Q3816,The plot covers the affair between Chopin and ...
2,Copying Beethoven,characters,Ludwig van Beethoven,Q1347019,Q255,Copying Beethoven is a 2006 dramatic film rele...
3,The King's Speech,characters,Winston Churchill,Q160060,Q8016,Other commentators discussed the film's repres...
4,Mr. Holmes,characters,Sherlock Holmes,Q17344458,Q4653,Mr. Holmes is a 2015 mystery film directed by ...
...,...,...,...,...,...,...
153,Les liaisons dangereuses,characters,Marquise de Merteuil,Q1498136,Q3294811,Les Liaisons dangereuses is the story of the M...
154,Men in Black 3,characters,Agent J,Q327713,Q3390587,This forces Agent J to go back in time and tea...
155,The Man from Majorca,characters,Bo Jarnebring,Q1195494,Q10431801,The plot follows the policemen Bo Jarnebring a...
156,Pride and Prejudice,characters,Mr Bennet,Q940014,Q6929362,Mr Bennet of Longbourn estate has five daughte...
