In [1]:
import pickle
import scipy.spatial.distance
import operator
import numpy as np
import pprint
from wikidata.client import Client

In [2]:
#Wikidata client which allows to query wikidata and get a title of the passed entity, etc
client = Client()

#path to the projects with datasets and models dirs without a trailing slash
PROJECT_DIR = '/Users/alexp/wrk/kgemb'

#set the dimension of embeddings and the model used
KG_DIMENSION = 100
MODEL = 'transR' #could be transH as well ?

#nothing to change
ENTITY_EMBEDDING = PROJECT_DIR + '/models/' + MODEL + '/' + str(KG_DIMENSION) + '/entity2vec.vec'
RELATION_EMBEDDING = PROJECT_DIR + '/models/' + MODEL + '/' + str(KG_DIMENSION) + '/relation2vec.vec'
MATRIX = PROJECT_DIR + '/models/' + MODEL + '/' + str(KG_DIMENSION) + '/A.vec'

ENTITY2ID = PROJECT_DIR + '/datasets/entity2id.txt'
RELATION2ID = PROJECT_DIR + '/datasets/relation2id.txt'
TRAIN2ID = PROJECT_DIR + '/datasets/train2id.txt'

In [3]:
names_ids = {
    'Robert': 'Q12056060',
    'Jon': 'Q3183235',
    'Robb': 'Q13634884',
    'Jaime': 'Q3806180',
    'Cersei': 'Q3665163',
    'Sansa': 'Q3472490',
    'Joffrey': 'Q12900597',
    'Tywin': 'Q12902445',
    'Tyrion': 'Q2076759',
    'Stannis': 'Q12056060',
    'Catelyn': 'Q2941743', #not provided in the pretrained embeddings
    'Bran': 'Q3643599',
    'Arya': 'Q3624677',
    'Ned': 'Q259818',
    'Renly': 'Q18920105',
    'Hound': 'Q3948140',
    'Varys': 'Q4008842',
    'Lysa': 'Q19791067',
    'Tommen': 'Q19792294',
#    'Pycelle': 'Maester Pycelle', #not provided in the pretrained embeddings
    'Balon': 'Q23746101',
    'Loras': 'Q18920137',
    'Theon': 'Q1120793',
    'Petyr': 'Q4360302',
    'Edmure': 'Q23749012',
    'Barristan': 'Q5721186',
    'Roose': 'Q19799430',
    'Margaery': 'Q12900933',
    'Hoster': 'Q23730371',
    'Myrcella': 'Q19799435'
}

In [4]:
def dump_entities_ids():
    """
    Save list of entities on which KG embeddings are trained
    Using the index of element in this array we can retrieve
    the coordinates of the vector from model.
    """
    entities = []
    with open(ENTITY2ID, 'r') as file:
        entities_cnt = int(file.readline())
        for _ in range(entities_cnt):
            entities.append(file.readline().split('\t')[0])
    pickle.dump(entities, open(MODEL + '_entities.pickle', 'wb'))


def dump_relations_ids():
    """
    Save list of relations on which KG embeddings are trained
    Using the index of element in this array we can retrieve
    the coordinates of the vector from model.
    """
    entities = []
    with open(RELATION2ID, 'r') as file:
        entities_cnt = int(file.readline())
        for _ in range(entities_cnt):
            entities.append(file.readline().split('\t')[0])
    pickle.dump(entities, open(MODEL + '_relation.pickle', 'wb'))


def get_entity_embedding():
    """
    Read embedding from file
    :return: numpy.memmap
    """
    return np.loadtxt(ENTITY_EMBEDDING)


def get_relation_embedding():
    """
    Read embedding from file
    :return: numpy.memmap
    """
    return np.loadtxt(RELATION_EMBEDDING)


def get_vec_from_embedding(embedding, index):
    """
    get the vector for embedding
    :param embedding: numpy.memmap object with embedding
    :param index: index of the required element
    :return:
    """
    return embedding[index]


In [5]:
# Do not have to execute if there are entities.pickle and relations.pickle already
dump_relations_ids()
dump_entities_ids()

In [6]:
entities_ids = pickle.load(open(MODEL + '_entities.pickle', 'rb'))
relation_ids = pickle.load(open(MODEL + '_relation.pickle', 'rb'))
matrix = np.loadtxt(MATRIX)

entity_embedding = get_entity_embedding()
relation_embedding = get_entity_embedding()

In [7]:
#download labels for each relation from wikidata
relation_labels = {}
for relation in relation_ids:
    relation_labels[relation] = str(client.get(relation).label)

In [18]:
tywin = get_vec_from_embedding(entity_embedding, entities_ids.index(names_ids['Tywin']))
tywin_matrix = get_vec_from_embedding(matrix, entities_ids.index(names_ids['Tywin']))
jaime = get_vec_from_embedding(entity_embedding, entities_ids.index(names_ids['Jaime']))
jaime_matrix = get_vec_from_embedding(matrix, entities_ids.index(names_ids['Jaime']))
cersei = get_vec_from_embedding(entity_embedding, entities_ids.index(names_ids['Cersei']))
cersei_matrix = get_vec_from_embedding(matrix, entities_ids.index(names_ids['Cersei']))
father = get_vec_from_embedding(relation_embedding, relation_ids.index('P22'))
mother = get_vec_from_embedding(relation_embedding, relation_ids.index('P25'))

In [19]:
# if the relation between chosen entities is correct, head+relation-tail should tend to zero
# or at least closer to zero than other relations
print(np.mean(abs(tywin*tywin_matrix+father-jaime*jaime_matrix)))
print(np.mean(abs(jaime*jaime_matrix+father-tywin*tywin_matrix)))

print(np.mean(abs(jaime*jaime_matrix+father-tywin*tywin_matrix)) > np.mean(abs(jaime*jaime_matrix+mother-tywin*tywin_matrix)))
print(np.mean(abs(tywin*tywin_matrix+father-jaime*jaime_matrix)) > np.mean(abs(tywin*tywin_matrix+mother-jaime*jaime_matrix)))

0.025467354481829997
0.025285558446349995
False
False


In [22]:
# experiments with the idea provided above

rels = {}
for relation in relation_ids:
    rel = get_vec_from_embedding(relation_embedding, relation_ids.index(relation))
    #rels[relation] = np.mean(abs(tywin+rel-jaime))
    rels[relation_labels[relation]] = np.mean(abs(tywin*tywin_matrix+rel-jaime*jaime_matrix))
    
sorted_rels = sorted(rels.items(), key=operator.itemgetter(1))

print('first five items:')
pprint.pprint(sorted_rels[:5])
print()
print('last five items:')
pprint.pprint(sorted_rels[-5:])


print('\n########\n')


rels = {}
for relation in relation_ids:
    rel = get_vec_from_embedding(relation_embedding, relation_ids.index(relation))
    #rels[relation] = np.mean(abs(tywin+rel-jaime))
    rels[relation_labels[relation]] = np.mean(abs(jaime*jaime_matrix+rel-tywin*tywin_matrix))
    
sorted_rels = sorted(rels.items(), key=operator.itemgetter(1))

print('first five items:')
pprint.pprint(sorted_rels[:5])
print()
print('last five items:')
pprint.pprint(sorted_rels[-5:])

first five items:
[('place of death', 0.010780149672650001),
 ('named after', 0.01436815275057),
 ('participant of', 0.016830264063669997),
 ('medical condition', 0.020509286698489997),
 ('given name', 0.021314651194150006)]

last five items:
[('from fictional universe', 0.05534917011431),
 ('sexual orientation', 0.058503168988069995),
 ('country', 0.06328725118483),
 ("topic's main category", 0.06614846593307),
 ('partner', 0.06702763906555)]

########

first five items:
[('place of death', 0.011403133205329999),
 ('named after', 0.01448901302655),
 ('participant of', 0.01691039182307),
 ('medical condition', 0.020579526169590003),
 ('given name', 0.02122639282465)]

last five items:
[('from fictional universe', 0.05562800876498999),
 ('sexual orientation', 0.05854643101193),
 ('country', 0.06331701534163),
 ("topic's main category", 0.06636317406693),
 ('partner', 0.06680780093444999)]


In [57]:
valid_triples = []
with open(TRAIN2ID, 'r') as file:
        triples_cnt = int(file.readline())
        for _ in range(triples_cnt):
            valid_triples.append(file.readline().split())

success = []
fail = []
changed_list = []
for index, triple in enumerate(valid_triples):
    e1 = get_vec_from_embedding(entity_embedding, int(triple[0]))
    e1_matrix = get_vec_from_embedding(matrix, int(triple[0]))
    e2 = get_vec_from_embedding(entity_embedding, int(triple[1]))
    e2_matrix = get_vec_from_embedding(matrix, int(triple[1]))
    r = get_vec_from_embedding(relation_embedding, int(triple[2]))
    r_mean = min_r = np.mean(abs(e1*e1_matrix+r-e2*e2_matrix))
    min_r_vec = r
    
    for relation in relation_ids:
        rel = get_vec_from_embedding(relation_embedding, relation_ids.index(relation))
        curr_r = np.mean(abs(e1*e1_matrix+rel-e2*e2_matrix))
        if curr_r < min_r:
            min_r = curr_r
            min_r_vec = rel
    
    
    if np.array_equal(min_r_vec,r):
        success.append(triple)
    else:
        fail.append(triple)

In [59]:
print("Successfully predicted:")
for triple in success:
    e1 = client.get(entities_ids[int(triple[0])]).label
    e2 = client.get(entities_ids[int(triple[1])]).label
    r = client.get(relation_ids[int(triple[2])]).label
    print(f'{e1} - {r} - {e2}')
    

Successfully predicted:
Lancel Lannister - place of death - Great Sept of Baelor
False
The High Sparrow - place of death - Great Sept of Baelor
False
Margaery Tyrell - place of death - Great Sept of Baelor
False
Loras Tyrell - place of death - Great Sept of Baelor
False
