In [1]:
import pickle
from tqdm import tqdm

In [68]:
def loadTKG(filename):
    """Read a tab-separated temporal KG file into a list of 5-field rows.

    Each line is stripped of surrounding whitespace and split on tabs.
    Lines that do not yield exactly five fields are reported on stdout
    (field count followed by the parsed fields) and skipped.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of rows, each a list of five strings. Judging by how the
        other helpers in this notebook index rows, the fields are
        head, relation, tail, start, end.
    """
    lines = []
    # The context manager closes the handle even if parsing raises midway.
    with open(filename, 'r') as f:
        for line in f:
            line = line.strip().split('\t')
            if len(line) != 5:
                # Surface malformed rows instead of dropping them silently.
                print(len(line), line)
                continue
            lines.append(line)
    return lines

def discretizeKG(lines):
    """Expand interval rows into one row per integer point of the interval.

    For every input row the last two fields are parsed as integers
    (start, end). The first three fields are then repeated once for each
    integer from start to end inclusive, with that integer appended as a
    fourth field.

    Args:
        lines: Iterable of list rows whose last two fields are
            int-convertible.

    Returns:
        A list of new 4-element rows ``[f0, f1, f2, t]`` where ``t`` is an
        int. A row whose start is greater than its end contributes nothing.
    """
    expanded = []
    for row in tqdm(lines):
        first, last = int(row[-2]), int(row[-1])
        triple = row[:3]
        # Each emitted row is a fresh list, so rows never alias each other.
        expanded.extend(triple + [t] for t in range(first, last + 1))
    return expanded

def getHeads(dkg, tail, relation):
    """Return every row whose third field is `tail` and second is `relation`.

    Args:
        dkg: Iterable of rows (indexable sequences with at least 3 fields).
        tail: Value to match against ``row[2]``.
        relation: Value to match against ``row[1]``.

    Returns:
        A list of the matching rows, in input order. The rows are the
        original objects, not copies.
    """
    return [
        row
        for row in tqdm(dkg)
        if row[2] == tail and row[1] == relation
    ]

def getTails(dkg, head, relation):
    """Return every row whose first field is `head` and second is `relation`.

    Args:
        dkg: Iterable of rows (indexable sequences with at least 2 fields).
        head: Value to match against ``row[0]``.
        relation: Value to match against ``row[1]``.

    Returns:
        A list of the matching rows, in input order. The rows are the
        original objects, not copies.
    """
    return [
        row
        for row in tqdm(dkg)
        if row[0] == head and row[1] == relation
    ]

def openFileAsDict(filename):
    """Load a two-column tab-separated file into a dict.

    Each line is split on tabs; the first field becomes the key and the
    second field the value. Later lines overwrite earlier ones that share
    a key.

    Args:
        filename: Path of the text file to read.

    Returns:
        Dict mapping the first field of each line to its second field.

    Raises:
        IndexError: If a line contains no tab character.
    """
    out = {}
    # The context manager closes the handle even if a malformed line raises.
    with open(filename, 'r') as f:
        for line in f:
            # Remove only the newline terminator: strip() would destroy
            # values that are pure whitespace, and a blind [:-1] would chop
            # a real character off a final line that has no newline.
            fields = line.rstrip('\n').split('\t')
            out[fields[0]] = fields[1]
    return out

def triplesToText(lines,
                  entFile='../data/wikidata_big/kg/wd_id2entity_text.txt',
                  relFile='../data/wikidata_big/kg/wd_id2relation_text.txt'):
    """Replace the ids in the first three fields of each row with text labels.

    Fields 0 and 2 are looked up in the entity mapping and field 1 in the
    relation mapping. Any remaining fields are carried over unchanged.
    The input rows are not modified.

    Args:
        lines: Iterable of rows with at least three fields.
        entFile: Tab-separated id-to-entity-text file. Defaults to the
            path this notebook has always used.
        relFile: Tab-separated id-to-relation-text file. Defaults to the
            path this notebook has always used.

    Returns:
        A list of new rows with the first three fields translated.

    Raises:
        KeyError: If an id is missing from the corresponding mapping.
    """
    # Both mapping files are re-read on every call.
    id2ent = openFileAsDict(entFile)
    id2rel = openFileAsDict(relFile)
    out = []
    for line in lines:
        # Work on a copy so the caller's rows keep their raw ids.
        text_line = list(line)
        text_line[0] = id2ent[text_line[0]]
        text_line[1] = id2rel[text_line[1]]
        text_line[2] = id2ent[text_line[2]]
        out.append(text_line)
    return out
    
        

In [69]:
# Load the raw interval KG; loadTKG keeps only rows with exactly 5 tab-separated fields.
filename = '../data/wikidata_big/kg/full.txt'
kg = loadTKG(filename)

In [70]:
# Number of 5-field rows kept by loadTKG.
len(kg)

328635

In [71]:
# Expand each interval row into one 4-field row per integer from start to end (inclusive).
dkg = discretizeKG(kg)

100%|██████████| 328635/328635 [00:02<00:00, 121181.24it/s]


In [46]:
# Row count after the interval expansion.
len(dkg)

1834755

In [87]:
# Query the interval KG (kg, not dkg) for rows with relation P39 and third field Q1371091,
# then display them with ids replaced by their text labels.
tail = 'Q1371091'
relation = 'P39'
out = getHeads(kg, tail, relation)
triplesToText(out)

100%|██████████| 328635/328635 [00:00<00:00, 2369622.34it/s]


[['Boris Johnson',
  'position held',
  'Secretary of State for Foreign and Commonwealth Affairs',
  '2016',
  '2018'],
 ['William Wyndham Grenville, 1st Baron Grenville',
  'position held',
  'Secretary of State for Foreign and Commonwealth Affairs',
  '1791',
  '1801'],
 ['Robert Stewart, Viscount Castlereagh',
  'position held',
  'Secretary of State for Foreign and Commonwealth Affairs',
  '1812',
  '1822'],
 ['George Brown, Baron George-Brown',
  'position held',
  'Secretary of State for Foreign and Commonwealth Affairs',
  '1966',
  '1968'],
 ['Rab Butler',
  'position held',
  'Secretary of State for Foreign and Commonwealth Affairs',
  '1963',
  '1964'],
 ['Philip Hammond',
  'position held',
  'Secretary of State for Foreign and Commonwealth Affairs',
  '2014',
  '2016'],
 ['Archibald Primrose, 5th Earl of Rosebery',
  'position held',
  'Secretary of State for Foreign and Commonwealth Affairs',
  '1886',
  '1886'],
 ['Robert Jenkinson, 2nd Earl of Liverpool',
  'position hel

In [86]:
# Query the expanded KG (dkg) for rows with first field Q1061541 and relation P166,
# translate ids to text labels, and display the result.
head = 'Q1061541'
relation = 'P166'
out = triplesToText(getTails(dkg, head, relation))
out

100%|██████████| 1834755/1834755 [00:00<00:00, 3162025.81it/s]


[['A Prophet', 'award received', 'César Award for Best Cinematography', 2010],
 ['A Prophet', 'award received', 'César Award for Best Director', 2010],
 ['A Prophet',
  'award received',
  'BIFA Award for Best Foreign Independent Film',
  2010],
 ['A Prophet', 'award received', 'César Award for Best Editing', 2010],
 ['A Prophet',
  'award received',
  'César Award for Best Original Screenplay',
  2010],
 ['A Prophet', 'award received', 'Louis Delluc Prize', 2009],
 ['A Prophet', 'award received', 'César Award for Best Actor', 2010],
 ['A Prophet', 'award received', 'César', 2010],
 ['A Prophet', 'award received', 'Cannes Film Festival Grand Prix', 2009],
 ['A Prophet',
  'award received',
  'César Award for Best Production Design',
  2010],
 ['A Prophet',
  'award received',
  'National Board of Review Award for Best Foreign Language Film',
  2009],
 ['A Prophet',
  'award received',
  'BAFTA Award for Best Film Not in the English Language',
  2010],
 ['A Prophet', 'award received', '

In [83]:
# Print every row of the interval KG whose third field equals the global `head`.
# NOTE(review): no output is captured for this cell; it depends on whatever `head`
# held when it was run (execution counts are out of order) — re-run top to bottom to confirm.
for x in kg:
    if x[2] == head:
        print(x)

In [84]:
# Peek at the first row to check the field layout.
kg[0]

['Q25559009', 'P39', 'Q41582555', '1847', '1852']

In [85]:
# Echo the current value of `head`.
# NOTE(review): the captured output has a leading space that the assignment cell does not;
# this likely reflects an earlier kernel value from out-of-order execution — confirm on a fresh run.
head

' Q1061541'