In [17]:
#imports
import requests
import math

In [18]:
# Constants shared by the cells below.
# SPARQL endpoint that wikidata_request() sends its queries to.
URL = 'https://query.wikidata.org/sparql'
# URI prefixes that get_id_from_uri() removes to leave the bare id.
URI_BEFORE_ID_PART_ENT = 'http://www.wikidata.org/entity/'
URI_BEFORE_ID_PART_REL = 'http://www.wikidata.org/prop/direct/'

# Selector values for the input_type argument of get_id_from_uri().
TYPE_ENTITY = 0
TYPE_RELATION = 1

# Number of UNION branches packed into a single CONSTRUCT query.
UNION_CNT = 50

# Property id passed as `rel` to build_entity_query(), followed by the entity
# ids passed as `tail` with it. Meanings are taken from the constant names
# only -- NOTE(review): confirm against Wikidata.
FROM_FICTIONAL_UNIVERSE_REL = 'P1080'
ASOIF_WORLD_ENT = 'Q2461698'
HP_UNIVERSE_ENT = 'Q5410773'

# Second property id and the entity ids queried as its `tail` (same caveat:
# meanings inferred from names -- TODO confirm).
PRESENT_IN_WORK_REL = 'P1441'
ASOIF_ENT = 'Q45875'
HP_LIT_SERIES = 'Q8337'
HP_FILM_SERIES = 'Q216930'

In [19]:
def get_id_from_uri(uri, input_type):
    """Strip the Wikidata URI prefix from ``uri`` and return what is left.

    uri        -- full URI string
    input_type -- TYPE_ENTITY to strip the entity prefix, TYPE_RELATION to
                  strip the direct-property prefix

    Returns the shortened string, or None when ``input_type`` is neither of
    the two known values.
    """
    if input_type == TYPE_ENTITY:
        prefix = URI_BEFORE_ID_PART_ENT
    elif input_type == TYPE_RELATION:
        prefix = URI_BEFORE_ID_PART_REL
    else:
        return None
    return uri.replace(prefix, '')


def build_entity_query(rel, tail):
    """Build a SPARQL SELECT for every subject linked to ``tail`` through ``rel``.

    The query selects the subject as ``?id`` together with its ``rdfs:label``
    as ``?label``, restricted to labels whose language is 'en'.

    rel  -- property id, interpolated after the ``wdt:`` prefix
    tail -- entity id, interpolated after the ``wd:`` prefix
    """
    template = "\n".join([
        "SELECT ?id ?label",
        "    WHERE {",
        "        ?id wdt:%s wd:%s.",
        "        ?id rdfs:label ?label .",
        "        FILTER(lang(?label) = 'en')",
        "      }",
    ])
    return template % (rel, tail)


def wikidata_request(query, timeout=120):
    """Send a SPARQL query to the endpoint in ``URL`` and return the decoded JSON body.

    query   -- SPARQL query text, sent as the ``query`` GET parameter
    timeout -- seconds to wait for the server before giving up; without a
               timeout the call could block indefinitely

    Raises ``requests.HTTPError`` when the endpoint answers with a 4xx/5xx
    status and ``requests.Timeout`` when it does not answer in time.
    """
    resp = requests.get(URL, params={'format': 'json', 'query': query}, timeout=timeout)
    # Surface HTTP errors here instead of as an opaque JSON decoding failure
    # on an error page below.
    resp.raise_for_status()
    return resp.json()
    
    
def retrieve_id_with_label(items_list):
    """Reduce raw result bindings to a list of ``{'id': ..., 'label': ...}`` dicts.

    Each element of ``items_list`` is read at ``['id']['value']`` (a URI that
    is shortened with get_id_from_uri as an entity) and ``['label']['value']``.
    """
    return [
        {
            'id': get_id_from_uri(binding['id']['value'], TYPE_ENTITY),
            'label': binding['label']['value'],
        }
        for binding in items_list
    ]

In [20]:
def _collect_entity_ids(responses):
    """Return the distinct entity ids from the ``id`` column of every response, in first-seen order."""
    ids = []
    for response in responses:
        ids += [get_id_from_uri(item['id']['value'], TYPE_ENTITY) for item in response['results']['bindings']]
    # dict.fromkeys drops repeated ids while keeping the first-seen order.
    return list(dict.fromkeys(ids))


# One request per (relation, tail) pair. The responses are kept as separate
# list elements; `**` is not valid inside a list display.
datasets = [
    wikidata_request(build_entity_query(FROM_FICTIONAL_UNIVERSE_REL, ASOIF_WORLD_ENT)),
    wikidata_request(build_entity_query(FROM_FICTIONAL_UNIVERSE_REL, HP_UNIVERSE_ENT)),
]
from_universe_entities = _collect_entity_ids(datasets)

# Every response has the same top-level keys ('results' among them), so merging
# them into one dict with ** would keep only the last response's bindings.
# Collect the ids from each response instead.
work_datasets = [
    wikidata_request(build_entity_query(PRESENT_IN_WORK_REL, ASOIF_ENT)),
    wikidata_request(build_entity_query(PRESENT_IN_WORK_REL, HP_LIT_SERIES)),
    wikidata_request(build_entity_query(PRESENT_IN_WORK_REL, HP_FILM_SERIES)),
]
present_in_work_entities = _collect_entity_ids(work_datasets)

    

In [21]:
# Pool every id collected so far together with the five anchor ids themselves.
# The round-trip through set() removes repeats; the resulting order is arbitrary.
entities = list(set([
    *from_universe_entities,
    *present_in_work_entities,
    ASOIF_WORLD_ENT,
    ASOIF_ENT,
    HP_UNIVERSE_ENT,
    HP_LIT_SERIES,
    HP_FILM_SERIES,
]))

In [22]:
def create_bind_statement(id):
    """Return one braced SPARQL group for the entity ``id``.

    The group binds ``wd:<id>`` to ``?s`` and matches every ``?p ?o`` pair of
    that same entity. The caller joins several such groups with UNION.
    """
    template = (
        '\n'
        '    { BIND (wd:%(entity)s as ?s)\n'
        '      wd:%(entity)s ?p ?o . }\n'
        '    '
    )
    return template % {'entity': id}
    
# One UNION branch per distinct entity id. dict.fromkeys drops repeated ids
# (keeping first-seen order) so the same subject is never requested twice.
# NOTE(review): only present_in_work_entities are expanded here, not the full
# `entities` list -- confirm that is intended.
binds = [create_bind_statement(e) for e in dict.fromkeys(present_in_work_entities)]

res = []
# Send the branches in batches of UNION_CNT. The batch offset gets a real name
# rather than `_`, which IPython uses for the last cell output.
for start in range(0, len(binds), UNION_CNT):
    query = '''CONSTRUCT {?s ?p ?o}
    WHERE {''' + ' UNION '.join(binds[start:start + UNION_CNT]) + ' }'
    response = wikidata_request(query)
    # The rows are later read through their 'subject', 'predicate' and
    # 'object' keys.
    res += response['results']['bindings']


In [23]:
rel_triples = []
rels = []
for binding in res:
    object_uri = binding['object']['value']
    # Only rows whose object URI starts with the entity prefix plus 'Q' are kept.
    if not object_uri.startswith('http://www.wikidata.org/entity/Q'):
        continue
    head = get_id_from_uri(binding['subject']['value'], TYPE_ENTITY)
    rel = get_id_from_uri(binding['predicate']['value'], TYPE_RELATION)
    tail = get_id_from_uri(object_uri, TYPE_ENTITY)
    rels.append(rel)
    # Triples are stored as (head, tail, relation).
    rel_triples.append((head, tail, rel))
    # The tail may not be in the entity list yet, so record it as well.
    entities.append(tail)

In [24]:
# Deduplicate, then sort: the list positions are used as numeric ids by the
# cells below, and plain set iteration order for strings can change from one
# interpreter session to the next. Sorting makes the ids reproducible.
entities = sorted(set(entities))
rels = sorted(set(rels))

In [25]:
# Build the position lookups once instead of running a linear list.index()
# scan for every element of every triple. setdefault keeps the first position
# of a value, which is exactly what list.index() would return.
entity_index = {}
for position, entity in enumerate(entities):
    entity_index.setdefault(entity, position)
rel_index = {}
for position, rel_id in enumerate(rels):
    rel_index.setdefault(rel_id, position)

# Each output triple is (e1, e2, r) expressed as list positions.
final_triples = [
    (entity_index[e1], entity_index[e2], rel_index[r])
    for e1, e2, r in rel_triples
]

In [26]:
# First line: number of entities. Then one "<entity id>\t<position>" line each.
with open('datasets/entity2id.txt', 'w') as out_file:
    out_file.write("%s\n" % len(entities))
    out_file.writelines(
        "%s\t%s\n" % (entity, position) for position, entity in enumerate(entities)
    )

In [27]:
# First line: number of relations. Then one "<relation id>\t<position>" line each.
with open('datasets/relation2id.txt', 'w') as out_file:
    out_file.write("%s\n" % len(rels))
    out_file.writelines(
        "%s\t%s\n" % (relation, position) for position, relation in enumerate(rels)
    )

In [28]:
# First line: number of triples. Then one tab-separated line per triple, with
# the three fields in the order they are stored in final_triples.
with open('datasets/train2id.txt', 'w') as out_file:
    out_file.write("%s\n" % len(final_triples))
    out_file.writelines('%s\t%s\t%s\n' % triple for triple in final_triples)