In [37]:
import json
import os
import re
from collections import Counter

import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON

In [53]:
_WD_ENTITY_PREFIX = "http://www.wikidata.org/entity/"


def _build_triples_query(relation_pid: str, rel_domain: str, row_limit: int) -> str:
    """Return the SPARQL query listing subject/object pairs of one relation.

    Subjects are restricted to instances (wdt:P31) of ``rel_domain`` that have
    an English label; the object's English label is optional.
    """
    parts = [
        "PREFIX wdt: <http://www.wikidata.org/prop/direct/> \n PREFIX wd: <http://www.wikidata.org/entity/> \n",
        "SELECT DISTINCT ?sub ?subEntity ?objEntity ?objLabel { ?subEntity wdt:P31 wd:" + rel_domain + " . ",
        '?subEntity rdfs:label ?sub . FILTER (lang(?sub) = "en") ',
        "?subEntity wdt:" + relation_pid + ' ?objEntity . OPTIONAL { ?objEntity rdfs:label ?objLabel . FILTER (lang(?objLabel) = "en") }} ',
        f"LIMIT {row_limit}",
    ]
    return "".join(parts)


def _binding_to_triple(binding: dict, relation_label: str) -> list:
    """Turn one SPARQL result binding into ``[subject, relation, object, subject_id, object_id]``."""
    subject_label = binding['sub']['value']
    object_value = binding['objEntity']['value']
    if 'objLabel' in binding:
        object_label = binding['objLabel']['value']
        object_id = object_value.replace(_WD_ENTITY_PREFIX, "")
    elif object_value.startswith(_WD_ENTITY_PREFIX):
        # The object is an entity that merely lacks an English label: the raw
        # URI remains the displayed object, but its id must not be thrown away.
        object_label = object_value
        object_id = object_value[len(_WD_ENTITY_PREFIX):]
    else:
        # Anything else (e.g. a literal value) has no entity id.
        object_label = object_value
        object_id = None
    subject_id = binding['subEntity']['value'].replace(_WD_ENTITY_PREFIX, "")
    return [subject_label, relation_label, object_label, subject_id, object_id]


def _find_sentence(triple: list):
    """Placeholder for the sentence lookup of a triple.

    TODO: search the sentence index for the triple (the draft lookup key was the
    lower-cased subject, relation and object concatenated) and return the
    matching sentence, or None when there is none.
    """
    return "sentence from index"


def get_triples_with_sentences(relation_pid: str, relation_label: str, rel_domain: str, rel_range: str, limit: int = 100):
    """Collect triples of one relation from Wikidata and pair each with a sentence.

    Args:
        relation_pid: Wikidata property id of the relation; must be non-empty.
        relation_label: Human-readable relation name, stored in every triple.
        rel_domain: Wikidata class id the subjects must be instances of; must
            be non-empty.
        rel_range: Accepted for interface compatibility; currently unused.
        limit: Maximum number of rows in the returned frame.

    Returns:
        A pandas DataFrame with the columns ``subject``, ``relation``,
        ``object``, ``subject_entity``, ``object_entity`` and ``sentence``.

    Raises:
        AssertionError: If ``relation_pid`` or ``rel_domain`` is empty.
    """
    assert relation_pid, "relation id can't be empty"
    assert rel_domain, "domain can't be empty"
    print(f"processing {relation_label} relation:")

    # Ask for 15 times more rows than needed (the factor can be adjusted):
    # some triples are dropped later for lack of a sentence or because their
    # subject/object repeats too often.
    sparql = SPARQLWrapper("https://query.wikidata.org/bigdata/namespace/wdq/sparql")
    sparql.setQuery(_build_triples_query(relation_pid, rel_domain, limit * 15))
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    # For a diverse dataset a subject/object may occur at most 10% of the limit
    # times. The floor of 1 keeps the filter meaningful when limit < 10, where
    # limit / 10 would otherwise reject every single triple.
    max_repeats = max(1, limit / 10)
    triples, secondary_triples = [], []
    subject_counter, object_counter = Counter(), Counter()
    for binding in results["results"]["bindings"]:
        triple = _binding_to_triple(binding, relation_label)
        subject_label, object_label = triple[0], triple[2]
        subject_counter[subject_label] += 1
        object_counter[object_label] += 1
        if subject_counter[subject_label] > max_repeats or object_counter[object_label] > max_repeats:
            secondary_triples.append(triple)
        else:
            triples.append(triple)

    # If fewer triples than requested survived, top up with the repetitive ones.
    if len(triples) < limit:
        triples += secondary_triples[:limit - len(triples)]

    print(f"\tcollected {len(triples)} triples")

    triples_with_sentences = []
    for triple in triples:
        sentence = _find_sentence(triple)
        # A triple without a sentence is of no use, so it is skipped.
        if not sentence:
            continue
        triples_with_sentences.append(triple + [sentence])

        # Stop as soon as the requested number of rows is reached.
        if len(triples_with_sentences) >= limit:
            break

    columns = ["subject", "relation", "object", "subject_entity", "object_entity", "sentence"]
    return pd.DataFrame(triples_with_sentences, columns=columns)

In [39]:
# Ontology description files to process; relative paths are resolved by `open`
# against the notebook's working directory.
ontology_paths = ["movie_ontology.json"]

In [40]:
# Parse every ontology file listed in `ontology_paths`, keeping the input order.
ontologies = []
for ontology_path in ontology_paths:
    # Decode explicitly as UTF-8 instead of the platform default encoding, so
    # non-ASCII labels load identically everywhere (the CSV export below is
    # written as UTF-8 as well).
    with open(ontology_path, encoding="utf-8") as in_file:
        ontologies.append(json.load(in_file))

In [54]:
# Directory receiving one CSV file per (ontology, relation) pair.
output_dir = "data"
# Create it up front: `to_csv` does not create missing parent directories, so a
# fresh checkout would otherwise fail on the very first relation.
os.makedirs(output_dir, exist_ok=True)

# One DataFrame per relation, in processing order, kept for inspection below.
rel_df_list = list()
for onto in ontologies:
    onto_id = onto['id']
    for rel in onto['relations']:
        tr_df = get_triples_with_sentences(rel['pid'], rel['label'], rel['domain'], rel['range'])
        rel_df_list.append(tr_df)
        # Whitespace in the label is replaced so the file name has no spaces.
        rel_label = re.sub(r"\s+", '_', rel['label'])
        tr_df.to_csv(f"{output_dir}/{onto_id}_{rel['pid']}_{rel_label}.csv", encoding='utf-8')

processing director relation:
	collected 1466 triples
processing screenwriter relation:
	collected 1141 triples
processing genre relation:
	collected 536 triples
processing based on relation:
	collected 1349 triples
processing cast member relation:
	collected 884 triples
processing award received relation:
	collected 958 triples
processing production company relation:
	collected 248 triples
processing country of origin relation:
	collected 299 triples
processing publication date relation:
	collected 1353 triples
processing characters relation:
	collected 1364 triples
processing narrative location relation:
	collected 106 triples
processing filming location relation:
	collected 141 triples
processing main subject relation:
	collected 1265 triples
processing nominated for relation:
	collected 101 triples
processing cost relation:
	collected 988 triples


In [55]:
# Display the DataFrame collected for the first relation that the loop processed.
rel_df_list[0]

Unnamed: 0,subject,relation,object,subject_entity,object_entity,sentence
0,Family Plot,director,Alfred Hitchcock,Q47296,Q7374,sentence from index
1,Plumíferos,director,Dani De Felippo,Q46916,Q5798262,sentence from index
2,Kick-Ass,director,Matthew Vaughn,Q2201,Q2593,sentence from index
3,A Gang Story,director,Olivier Marchal,Q593,Q694259,sentence from index
4,Fat Girl,director,Catherine Breillat,Q59503,Q289054,sentence from index
...,...,...,...,...,...,...
95,Anna Karamazoff,director,Rustam Khamdamov,Q24340,Q24583,sentence from index
96,Onegin,director,Martha Fiennes,Q50969,Q458730,sentence from index
97,To Die like a Man,director,João Pedro Rodrigues,Q33139,Q782982,sentence from index
98,Yojimbo,director,Akira Kurosawa,Q20475,Q8006,sentence from index
