In [1]:
import pickle
import scipy.spatial.distance
import operator
import numpy as np

In [2]:
#pass the path to WikiData embeddings which could be downloaded from http://139.129.163.161/download/wikidata
WIKIDATA_DIR = '/Users/alexp/Downloads/Wikidata'

#set the dimension of embeddings. The archive above provide embedings with dimensions 50 and 100
KG_DIMENSION = 100

#nothing to change
ENTITY_EMBEDDING = WIKIDATA_DIR + '/embeddings/dimension_' + str(KG_DIMENSION) + '/transe/entity2vec.bin'
RELATION_EMBEDDING = WIKIDATA_DIR + '/embeddings/dimension_' + str(KG_DIMENSION) + '/transe/relation2vec.bin'

ENTITY2ID = WIKIDATA_DIR + '/knowledge graphs/entity2id.txt'
RELATION2ID = WIKIDATA_DIR + '/knowledge graphs/relation2id.txt'

In [3]:
names_ids = {
    'Robert': 'Q12056060',
    'Jon': 'Q3183235',
    'Robb': 'Q13634884',
    'Jaime': 'Q3806180',
    'Cersei': 'Q3665163',
    'Sansa': 'Q3472490',
    'Joffrey': 'Q12900597',
    'Tywin': 'Q12902445',
    'Tyrion': 'Q2076759',
    'Stannis': 'Q12056060',
    'Catelyn': 'Catelyn Stannis', #not provided in the pretrained embeddings
    'Bran': 'Q3643599',
    'Arya': 'Q3624677',
    'Ned': 'Q259818',
    'Renly': 'Q18920105',
    'Hound': 'Q3948140',
    'Varys': 'Q4008842',
    'Lysa': 'Q19791067',
    'Tommen': 'Q19792294',
    'Pycelle': 'Maester Pycelle', #not provided in the pretrained embeddings
    'Balon': 'Q23746101',
    'Loras': 'Q18920137',
    'Theon': 'Q1120793',
    'Petyr': 'Q4360302',
    'Edmure': 'Q23749012',
    'Barristan': 'Q5721186',
    'Roose': 'Q19799430',
    'Margaery': 'Q12900933',
    'Hoster': 'Q23730371',
    'Myrcella': 'Q19799435'
}

In [4]:
def dump_entities_ids():
    """
    Save list of entities on which KG embeddings are trained
    Using the index of element in this array we can retrieve
    the coordinates of the vector from model.
    """
    entities = []
    with open(ENTITY2ID, 'r') as file:
        entities_cnt = int(file.readline())
        for _ in range(entities_cnt):
            entities.append(file.readline().split('\t')[0])
    pickle.dump(entities, open('entities.pickle', 'wb'))


def dump_relations_ids():
    """
    Save list of relations on which KG embeddings are trained
    Using the index of element in this array we can retrieve
    the coordinates of the vector from model.
    """
    entities = []
    with open(RELATION2ID, 'r') as file:
        entities_cnt = int(file.readline())
        for _ in range(entities_cnt):
            entities.append(file.readline().split('\t')[0])
    pickle.dump(entities, open('relation.pickle', 'wb'))


def get_entity_embedding():
    """
    Read embedding from file
    :return: numpy.memmap
    """
    return np.memmap(ENTITY_EMBEDDING, dtype='float32', mode='r')


def get_relation_embedding():
    """
    Read embedding from file
    :return: numpy.memmap
    """
    return np.memmap(RELATION_EMBEDDING, dtype='float32', mode='r')


def get_vec_from_embedding(embedding, index):
    """
    get the vector for embedding
    :param embedding: numpy.memmap object with embedding
    :param index: index of the required element
    :return:
    """
    return embedding[index * KG_DIMENSION:index * KG_DIMENSION + KG_DIMENSION]


In [5]:
# Do not have to execute if there are entities.pickle and relations.pickle already
dump_relations_ids()
dump_entities_ids()

In [7]:
entities_ids = pickle.load(open('entities.pickle', 'rb'))
relation_ids = pickle.load(open('relation.pickle', 'rb'))

entity_embedding = get_entity_embedding()
relation_embedding = get_entity_embedding()

In [8]:
jaime = get_vec_from_embedding(entity_embedding, entities_ids.index(names_ids['Jaime']))
tywin = get_vec_from_embedding(entity_embedding, entities_ids.index(names_ids['Tywin']))

rels = {}

#iterate through all the relations in the pretrained embeddings
for relation in relation_ids:
    rel = get_vec_from_embedding(relation_embedding, relation_ids.index(relation))
    #save cosine similarity of tywin + relation and jaime
    rels[relation] = (1 - scipy.spatial.distance.cosine(tywin + rel, jaime))

#iterate through resulting similarities and print them as well as the most similar relation.
for key,val in rels.items():
    print(f'{key}\t{val}')
print(max(rels.items(), key=operator.itemgetter(1))[0])

P2670	0.33083656430244446
P2959	0.21230417490005493
P2184	0.3552343249320984
P793	0.19461022317409515
P31	0.3930019736289978
P1419	0.3731709420681
P527	0.4134470224380493
P361	0.2253141701221466
P910	0.22658906877040863
P1343	0.24602903425693512
P607	0.1714085340499878
P237	0.36817416548728943
P1412	0.3043231666088104
P119	0.2742873430252075
P19	0.24249154329299927
P509	0.29892992973327637
P9	0.36560001969337463
P7	0.33977776765823364
P26	0.2947688400745392
P463	0.2959870398044586
P460	0.18194805085659027
P106	0.305629163980484
P103	0.38589000701904297
P102	0.3492889404296875
P25	0.42297402024269104
P27	0.26766863465309143
P21	0.27564355731010437
P20	0.2735394537448883
P22	0.2477220743894577
P40	0.27122557163238525
P39	0.30612894892692566
P241	0.3188403844833374
P166	0.24964125454425812
P410	0.2842424511909485
P734	0.3992866277694702
P735	0.2982926666736603
P140	0.3640022277832031
P1196	0.25536900758743286
P1441	0.41698604822158813
P512	0.2515535354614258
P451	0.32968440651893616
P108	

In [16]:
#get some relations
father = get_vec_from_embedding(relation_embedding, relation_ids.index('P22'))
mother = get_vec_from_embedding(relation_embedding, relation_ids.index('P25'))
child = get_vec_from_embedding(relation_embedding, relation_ids.index('P40'))

#get some characters
jaime = get_vec_from_embedding(entity_embedding, entities_ids.index(names_ids['Jaime']))
tywin = get_vec_from_embedding(entity_embedding, entities_ids.index(names_ids['Tywin']))
cersei = get_vec_from_embedding(entity_embedding, entities_ids.index(names_ids['Cersei']))
renly =  get_vec_from_embedding(entity_embedding, entities_ids.index(names_ids['Renly']))

#print some cosine similarities
print(1 - scipy.spatial.distance.cosine(tywin,jaime))
print(1 - scipy.spatial.distance.cosine(tywin, cersei))
print(1 - scipy.spatial.distance.cosine(tywin, renly))

0.3895951509475708
0.42600196599960327
0.06363288313150406


In [10]:
# experiment inspired by https://github.com/thunlp/OpenKE/blob/master/models/TransE.py
# if the relation between chosen entities is correct, head+relation-tail should tend to zero
np.mean(abs(tywin+father-jaime))


0.108608685

In [11]:
# experiments with the idea provided above

rels = {}
for relation in relation_ids:
    rel = get_vec_from_embedding(relation_embedding, relation_ids.index(relation))
    rels[relation] = np.mean(abs(tywin+rel-jaime))
    
sorted_rels = sorted(rels.items(), key=operator.itemgetter(1))
print(sorted_rels[:5])
print(sorted_rels[-5:])
import pprint
pprint.pprint(sorted_rels)

[('P2633', 0.08812601), ('P25', 0.09186284), ('P163', 0.092917986), ('P501', 0.09311284), ('P790', 0.09323021)]
[('P504', 0.12497418), ('P518', 0.12520437), ('P2512', 0.12551242), ('P769', 0.12590435), ('P567', 0.12990887)]
[('P2633', 0.08812601),
 ('P25', 0.09186284),
 ('P163', 0.092917986),
 ('P501', 0.09311284),
 ('P790', 0.09323021),
 ('P1018', 0.09428894),
 ('P3150', 0.094352625),
 ('P1885', 0.09454864),
 ('P3190', 0.09460247),
 ('P411', 0.095032066),
 ('P516', 0.0951595),
 ('P2184', 0.095491536),
 ('P1441', 0.09551218),
 ('P608', 0.095518515),
 ('P694', 0.09569063),
 ('P1038', 0.095892966),
 ('P737', 0.09617087),
 ('P376', 0.0967441),
 ('P1074', 0.096958406),
 ('P2894', 0.09713523),
 ('P105', 0.09746348),
 ('P720', 0.0974896),
 ('P1382', 0.09750522),
 ('P122', 0.09767021),
 ('P2289', 0.09773627),
 ('P138', 0.09812271),
 ('P69', 0.098308094),
 ('P2371', 0.0984037),
 ('P1889', 0.09846036),
 ('P681', 0.09861391),
 ('P527', 0.09863968),
 ('P1881', 0.098734766),
 ('P1072', 0.098874845

In [12]:
# the same playing around
rels = {}
for relation in relation_ids:
    rel = get_vec_from_embedding(relation_embedding, relation_ids.index(relation))
    rels[relation] = np.mean(abs(jaime+rel-tywin))
    
sorted_rels = sorted(rels.items(), key=operator.itemgetter(1))
print(sorted_rels[:5])
print(sorted_rels[-5:])
import pprint
pprint.pprint(sorted_rels)

[('P1192', 0.09041003), ('P567', 0.09148895), ('P2512', 0.09370851), ('P1560', 0.09431634), ('P2293', 0.09525961)]
[('P177', 0.12650509), ('P2175', 0.1268054), ('P706', 0.12775719), ('P1596', 0.12776172), ('P501', 0.12809077)]
[('P1192', 0.09041003),
 ('P567', 0.09148895),
 ('P2512', 0.09370851),
 ('P1560', 0.09431634),
 ('P2293', 0.09525961),
 ('P853', 0.09538241),
 ('P1340', 0.09556075),
 ('P2869', 0.095647074),
 ('P822', 0.09567438),
 ('P1312', 0.0957092),
 ('P2379', 0.0958906),
 ('P54', 0.09607203),
 ('P2291', 0.09646633),
 ('P694', 0.096479386),
 ('P84', 0.0966073),
 ('P749', 0.0966363),
 ('P1302', 0.09663895),
 ('P1574', 0.09671621),
 ('P3005', 0.09693881),
 ('P1547', 0.09701071),
 ('P2679', 0.09709818),
 ('P209', 0.09720605),
 ('P1057', 0.09752768),
 ('P1533', 0.09777663),
 ('P275', 0.09785069),
 ('P825', 0.0978722),
 ('P3189', 0.09797583),
 ('P2321', 0.09812177),
 ('P114', 0.098166086),
 ('P655', 0.098338805),
 ('P1151', 0.09840383),
 ('P872', 0.098700315),
 ('P2417', 0.0991804

In [13]:
# and again
rels = {}
for relation in relation_ids:
    rel = get_vec_from_embedding(relation_embedding, relation_ids.index(relation))
    rels[relation] = np.mean(abs(cersei+rel-jaime))
    
sorted_rels = sorted(rels.items(), key=operator.itemgetter(1))
print(sorted_rels[:5])
print(sorted_rels[-5:])

[('P664', 0.085954696), ('P2633', 0.08675995), ('P121', 0.08891115), ('P85', 0.09041537), ('P25', 0.091012165)]
[('P914', 0.124485314), ('P504', 0.124696836), ('P682', 0.125526), ('P2959', 0.12599167), ('P518', 0.12992796)]


In [15]:
#experiment on one-to-one relation, but still inappropriate results
rels = {}
moscow = get_vec_from_embedding(entity_embedding, entities_ids.index('Q649'))
europe = get_vec_from_embedding(entity_embedding, entities_ids.index('Q46'))

for relation in relation_ids:
    rel = get_vec_from_embedding(relation_embedding, relation_ids.index(relation))
    rels[relation] = np.mean(abs(moscow+rel-europe))
    
sorted_rels = sorted(rels.items(), key=operator.itemgetter(1))
print(sorted_rels[:5])
print(sorted_rels[-5:])

[('P407', 0.07587111), ('P2554', 0.08289944), ('P2157', 0.08944348), ('P20', 0.089468725), ('P25', 0.0898647)]
[('P971', 0.12674052), ('P2291', 0.12753363), ('P1071', 0.12976506), ('P414', 0.13242318), ('P2758', 0.1415138)]
