# TransE 实践

在这个演示中，我们使用TransE([论文链接](https://www.utc.fr/~bordesan/dokuwiki/_media/en/transe_nips13.pdf))对示例中文知识图谱进行链接预测，从而达到补全知识图谱的目的。

希望在这个demo中帮助大家了解知识图谱表示学习的作用原理和机制。

本demo建议使用python3运行。

## 数据集
这个示例中，我们使用的是一个人工构造的小的中文数据集，示例图谱可视化如下：
<img src="./pic/示例图谱可视化.png" style="zoom:50%;" />

train.txt：包含28个训练三元组，文件的每一行表示一个三元组，按头实体、关系、尾实体的顺序排列，并用 '\t' 分隔。

test.txt：包含4个测试三元组，文件的每一行表示一个三元组，按头实体、关系、尾实体的顺序排列，并用 '\t' 分隔。

valid.txt：包含4个验证三元组，格式与 train.txt 相同（下面的代码在加载数据时也会读取该文件）。

entity2id.txt：存储了实体对应id的信息，示例图谱中共包含17个实体，为每个实体分配一个id从0-16，文件每一行表示一个实体和对应的id并用’\t’分隔。

relation2id.txt： 存储了关系对应id的信息，示例图谱中共包含6个关系，为每个关系分配一个id从0-5，文件每一行表示一个关系和对应的id并用’\t’分隔


### TransE 原理回顾
TransE将每个实体和关系都表示成一个向量，并假设对于一个存在在知识图谱中的三元组$(h,r,t)$, $h, r, t$的向量表示$\mathbf{h}, \mathbf{r}, \mathbf{t}$满足：
<img src="./pic/TransE-向量空间假设.png" style="zoom:50%;" />
即
$$\mathbf{h} + \mathbf{r} = \mathbf{t}$$



对于每个正确的三元组的优化目标是：
$$\mathbf{h} + \mathbf{r} \approx \mathbf{t}$$
对于一个三元组的评分函数为：
$$f_r(h,t) = \| \mathbf{h} + \mathbf{r} - \mathbf{t} \| _{L_1/L_2} $$
TransE的损失函数：
$$ L = \sum_{(h,r,t)\in S} \sum_{(h^\prime, r^\prime, t^\prime) \in S^\prime} \max(0, f_r(h,t) + \gamma - f_{r^\prime} (h^\prime, t^\prime)) $$
其中$S$是所有正样本的集合，$S^\prime$是所有负样本的集合，对于一个正样本$(h,r,t)$负样本通过随机替换$h$或$t$得到， $\gamma$表示间隔，是一个超参。

### 代码实践

In [1]:
import tensorflow as tf 
import time 
import argparse
import random
import numpy as np 
import os.path
import math
import timeit
from multiprocessing import JoinableQueue, Queue, Process
from collections import defaultdict

In [2]:
class TransE:
    """TransE knowledge-graph embedding model built on the TensorFlow 1.x graph API.

    The constructor loads the id mappings and the triples stored in
    ``data_dir`` and creates one embedding matrix for entities and one for
    relations.  ``train`` and ``test`` only build graph tensors; evaluating
    them is left to the caller's ``tf.Session``.
    """

    def __init__(self, data_dir, negative_sampling, learning_rate,
                 batch_size, max_iter, margin, dimension, norm, evaluation_size, regularizer_weight):
        """Load the data set and create the embedding variables.

        Args:
            data_dir: directory holding ``entity2id.txt``, ``relation2id.txt``,
                ``train.txt``, ``test.txt`` and ``valid.txt``.
            negative_sampling: ``'unif'`` or ``'bern'``; see
                ``training_data_batch``.
            learning_rate, batch_size, max_iter, evaluation_size: stored on
                the instance only; the methods of this class do not read them.
            margin: margin used by the hinge loss in ``train``.
            dimension: embedding size for both entities and relations.
            norm: stored on the instance; the scores built in ``train`` and
                ``test`` always use the L1 distance regardless of this value.
            regularizer_weight: stored on the instance; the regularizer
                computed in ``train`` is not added to the returned loss.
        """
        # this part for data prepare
        self.__data_dir = data_dir
        self.__negative_sampling = negative_sampling
        self.__regularizer_weight = regularizer_weight
        self.__norm = norm

        self.__entity2id = {}
        self.__id2entity = {}
        self.__relation2id = {}
        self.__id2relation = {}

        self.__triple_train = []  # [(head_id, relation_id, tail_id), ...]
        self.__triple_test = []
        self.__triple_valid = []
        self.__triple = []

        self.__num_entity = 0
        self.__num_relation = 0
        self.__num_triple_train = 0
        self.__num_triple_test = 0
        self.__num_triple_valid = 0

        # load all the files: entity2id.txt, relation2id.txt, train.txt, test.txt, valid.txt
        self.load_data()
        print('finish preparing data. ')

        # this part for the model:
        self.__learning_rate = learning_rate
        self.__batch_size = batch_size
        self.__max_iter = max_iter
        self.__margin = margin
        self.__dimension = dimension
        self.__variables = []
        self.__evaluation_size = evaluation_size
        # Both embedding tables are drawn uniformly from [-bound, bound].
        bound = 6 / math.sqrt(self.__dimension)
        with tf.device('/cpu'):
            self.__embedding_entity = tf.get_variable('embedding_entity', [self.__num_entity, self.__dimension],
                                                       initializer=tf.random_uniform_initializer(minval=-bound, maxval=bound, seed=123))
            self.__embedding_relation = tf.get_variable('embedding_relation', [self.__num_relation, self.__dimension],
                                                         initializer=tf.random_uniform_initializer(minval=-bound, maxval=bound, seed=124))
            self.__variables.append(self.__embedding_entity)
            self.__variables.append(self.__embedding_relation)
            print('finishing initializing')

    @property
    def variables(self):
        """List of trainable variables: [entity embeddings, relation embeddings]."""
        return self.__variables

    @property
    def num_triple_train(self):
        """Number of triples loaded from train.txt."""
        return self.__num_triple_train

    @property
    def num_triple_test(self):
        """Number of triples loaded from test.txt."""
        return self.__num_triple_test

    @property
    def testing_data(self):
        """Test triples as a list of (head_id, relation_id, tail_id) tuples."""
        return self.__triple_test

    @property
    def num_entity(self):
        """Number of entities listed in entity2id.txt."""
        return self.__num_entity

    @property
    def embedding_entity(self):
        """Entity embedding variable of shape [num_entity, dimension]."""
        return self.__embedding_entity

    @property
    def embedding_relation(self):
        """Relation embedding variable of shape [num_relation, dimension]."""
        return self.__embedding_relation

    @property
    def hr_t(self):
        """Mapping (head_id, relation_id) -> set of tail ids over train, test and valid triples."""
        return self.__hr_t

    @property
    def tr_h(self):
        """Mapping (tail_id, relation_id) -> set of head ids over train, test and valid triples."""
        return self.__tr_h

    @property
    def entity2id(self):
        """Mapping entity name -> id."""
        return self.__entity2id

    @property
    def relation2id(self):
        """Mapping relation name -> id."""
        return self.__relation2id

    @property
    def id2entity(self):
        """Mapping id -> entity name."""
        return self.__id2entity

    @property
    def id2relation(self):
        """Mapping id -> relation name."""
        return self.__id2relation

    def training_data_batch(self, batch_size=512):
        """Yield shuffled training batches with one corrupted triple per positive.

        For every positive triple either the head or the tail is replaced by
        an entity id drawn uniformly at random.  The head is replaced with
        probability 0.5 under ``'unif'`` sampling, or with the per-relation
        probability computed in ``load_data`` under ``'bern'`` sampling.  No
        check is made that the corrupted triple is absent from the graph.

        Args:
            batch_size: maximum number of positive triples per batch.

        Yields:
            ``(positives, negatives, prepare_time)`` where ``positives`` is an
            integer array with one ``(head, relation, tail)`` row per triple,
            ``negatives`` is a list of tuples of the same length and
            ``prepare_time`` is the number of seconds spent building the batch.

        Raises:
            NotImplementedError: if the sampling mode is neither ``'unif'``
                nor ``'bern'`` (raised while building the first batch).
        """
        n_triple = len(self.__triple_train)
        rand_idx = np.random.permutation(n_triple)
        start = 0
        while start < n_triple:
            start_t = timeit.default_timer()
            end = min(start + batch_size, n_triple)
            train_triple_positive = np.asarray([self.__triple_train[x] for x in rand_idx[start:end]])
            train_triple_negative = []
            for t in train_triple_positive:
                replace_entity_id = np.random.randint(self.__num_entity)
                random_num = np.random.random()

                if self.__negative_sampling == 'unif':
                    replace_head_probability = 0.5
                elif self.__negative_sampling == 'bern':
                    replace_head_probability = self.__relation_property[t[1]]
                else:
                    # The original formatted this message with an undefined
                    # local name, which raised NameError instead.
                    raise NotImplementedError("Dose not support %s negative_sampling" % self.__negative_sampling)

                if random_num < replace_head_probability:
                    train_triple_negative.append((replace_entity_id, t[1], t[2]))
                else:
                    train_triple_negative.append((t[0], t[1], replace_entity_id))

            start = end
            prepare_t = timeit.default_timer() - start_t

            yield train_triple_positive, train_triple_negative, prepare_t

    def load_data(self):
        """Read the id mappings and triples from ``data_dir`` and derive statistics.

        Populates the name<->id dictionaries, the train/test/valid triple
        lists, the ``hr_t`` / ``tr_h`` lookup tables (filled from all three
        triple files) and the counters.  For ``'bern'`` sampling it also
        computes, per relation, the probability of replacing the head:
        ``distinct tails / (distinct heads + distinct tails)`` over the
        training triples.  Blank lines in the input files are skipped.
        """
        def read_id_map(filename):
            # Each non-blank line is "<name>\t<id>".
            mapping = {}
            with open(os.path.join(self.__data_dir, filename), encoding='utf-8') as f:
                for line in f:
                    fields = line.strip().split('\t')
                    if fields == ['']:
                        continue
                    mapping[fields[0]] = int(fields[1])
            return mapping

        def load_triple(triplefile):
            # Each non-blank line is "<head>\t<relation>\t<tail>" given by name.
            triple_list = []  # [(head_id, relation_id, tail_id), ...]
            with open(os.path.join(self.__data_dir, triplefile), encoding='utf-8') as f:
                for line in f:
                    line_list = line.strip().split('\t')
                    if line_list == ['']:
                        continue
                    assert len(line_list) == 3, 'malformed triple line in %s: %r' % (triplefile, line)
                    headid = self.__entity2id[line_list[0]]
                    relationid = self.__relation2id[line_list[1]]
                    tailid = self.__entity2id[line_list[2]]
                    triple_list.append((headid, relationid, tailid))
                    self.__hr_t[(headid, relationid)].add(tailid)
                    self.__tr_h[(tailid, relationid)].add(headid)
            return triple_list

        print('loading entity2id.txt ...')
        self.__entity2id = read_id_map('entity2id.txt')
        self.__id2entity = {value: key for key, value in self.__entity2id.items()}

        print('loading reltion2id.txt ...')
        self.__relation2id = read_id_map('relation2id.txt')
        self.__id2relation = {value: key for key, value in self.__relation2id.items()}

        self.__hr_t = defaultdict(set)
        self.__tr_h = defaultdict(set)
        self.__triple_train = load_triple('train.txt')
        self.__triple_test = load_triple('test.txt')
        self.__triple_valid = load_triple('valid.txt')
        self.__triple = np.concatenate([self.__triple_train, self.__triple_test, self.__triple_valid], axis=0)

        self.__num_relation = len(self.__relation2id)
        self.__num_entity = len(self.__entity2id)
        self.__num_triple_train = len(self.__triple_train)
        self.__num_triple_test = len(self.__triple_test)
        self.__num_triple_valid = len(self.__triple_valid)

        print('entity number: ' + str(self.__num_entity))
        print('relation number: ' + str(self.__num_relation))
        print('training triple number: ' + str(self.__num_triple_train))
        print('testing triple number: ' + str(self.__num_triple_test))
        print('valid triple number: ' + str(self.__num_triple_valid))

        if self.__negative_sampling == 'bern':
            self.__relation_property_head = {x: [] for x in range(self.__num_relation)}  # {relation_id: [headid1, headid2, ...]}
            self.__relation_property_tail = {x: [] for x in range(self.__num_relation)}  # {relation_id: [tailid1, tailid2, ...]}
            self.__relation_property = {x: [] for x in range(self.__num_relation)}
            for t in self.__triple_train:
                self.__relation_property_head[t[1]].append(t[0])
                self.__relation_property_tail[t[1]].append(t[2])
            for x in self.__relation_property_head:
                n_tail = len(set(self.__relation_property_tail[x]))
                n_head = len(set(self.__relation_property_head[x]))
                # The small constant avoids a division by zero for relations
                # that have no training triple.
                self.__relation_property[x] = float(n_tail) / (n_head + n_tail + 0.000000001)
        else:
            print("unif set don't need to calculate hpt and tph")

    def train(self, inputs):
        """Build the margin-based ranking loss for a batch.

        Args:
            inputs: pair ``(triple_positive, triple_negative)`` of integer
                tensors whose columns are (head_id, relation_id, tail_id).

        Returns:
            ``(loss_triple, loss_every, norm_entity_l2sum)``: the summed hinge
            loss, the hinge loss per triple pair, and the L2 norm of every
            row of the normalised entity embeddings.
        """
        embedding_relation = self.__embedding_relation
        embedding_entity = self.__embedding_entity

        triple_positive, triple_negative = inputs  # columns: (head_id, relation_id, tail_id)

        # Scores are computed on row-wise L2-normalised embeddings.
        norm_entity = tf.nn.l2_normalize(embedding_entity, axis=1)
        norm_relation = tf.nn.l2_normalize(embedding_relation, axis=1)
        norm_entity_l2sum = tf.sqrt(tf.reduce_sum(norm_entity**2, axis=1))

        embedding_positive_head = tf.nn.embedding_lookup(norm_entity, triple_positive[:, 0])
        embedding_positive_tail = tf.nn.embedding_lookup(norm_entity, triple_positive[:, 2])
        embedding_positive_relation = tf.nn.embedding_lookup(norm_relation, triple_positive[:, 1])

        embedding_negative_head = tf.nn.embedding_lookup(norm_entity, triple_negative[:, 0])
        embedding_negative_tail = tf.nn.embedding_lookup(norm_entity, triple_negative[:, 2])
        embedding_negative_relation = tf.nn.embedding_lookup(norm_relation, triple_negative[:, 1])

        # L1 distance ||h + r - t||_1 for every triple.
        score_positive = tf.reduce_sum(tf.abs(embedding_positive_head + embedding_positive_relation - embedding_positive_tail), axis=1)
        score_negative = tf.reduce_sum(tf.abs(embedding_negative_head + embedding_negative_relation - embedding_negative_tail), axis=1)

        loss_every = tf.maximum(0., score_positive + self.__margin - score_negative)
        loss_triple = tf.reduce_sum(loss_every)
        # Kept on the instance for inspection; it is not part of the returned loss.
        self.__loss_regularizer = tf.reduce_sum(tf.abs(self.__embedding_relation)) + tf.reduce_sum(tf.abs(self.__embedding_entity))
        return loss_triple, loss_every, norm_entity_l2sum

    def test(self, inputs):
        """Build the ranking ops used for link prediction on one triple.

        Args:
            inputs: integer tensor ``(head_id, relation_id, tail_id)``.

        Returns:
            ``(id_replace_head, id_replace_tail, norm_id_replace_head,
            norm_id_replace_tail)``.  Each is the index output of
            ``tf.nn.top_k`` over the L1 distances of all candidate entities,
            with ``k`` equal to the number of entities; the first two use the
            raw embeddings, the last two the L2-normalised ones.  ``top_k``
            sorts by descending value, so the closest candidate is the last
            element of each tensor.
        """
        embedding_relation = self.__embedding_relation
        embedding_entity = self.__embedding_entity

        triple_test = inputs  # (head_id, relation_id, tail_id)
        head_vec = tf.nn.embedding_lookup(embedding_entity, triple_test[0])
        rel_vec = tf.nn.embedding_lookup(embedding_relation, triple_test[1])
        tail_vec = tf.nn.embedding_lookup(embedding_entity, triple_test[2])

        norm_embedding_entity = tf.nn.l2_normalize(embedding_entity, axis=1)
        norm_embedding_relation = tf.nn.l2_normalize(embedding_relation, axis=1)
        norm_head_vec = tf.nn.embedding_lookup(norm_embedding_entity, triple_test[0])
        norm_rel_vec = tf.nn.embedding_lookup(norm_embedding_relation, triple_test[1])
        norm_tail_vec = tf.nn.embedding_lookup(norm_embedding_entity, triple_test[2])

        # Every entity is tried in place of the head (resp. the tail).
        _, id_replace_head = tf.nn.top_k(tf.reduce_sum(tf.abs(embedding_entity + rel_vec - tail_vec), axis=1), k=self.__num_entity)
        _, id_replace_tail = tf.nn.top_k(tf.reduce_sum(tf.abs(head_vec + rel_vec - embedding_entity), axis=1), k=self.__num_entity)

        _, norm_id_replace_head = tf.nn.top_k(tf.reduce_sum(tf.abs(norm_embedding_entity + norm_rel_vec - norm_tail_vec), axis=1), k=self.__num_entity)
        _, norm_id_replace_tail = tf.nn.top_k(tf.reduce_sum(tf.abs(norm_head_vec + norm_rel_vec - norm_embedding_entity), axis=1), k=self.__num_entity)

        return id_replace_head, id_replace_tail, norm_id_replace_head, norm_id_replace_tail

In [3]:
def train_operation(model, learning_rate=0.01, margin=1.0, optimizer_str = 'gradient'):
    """Build the training part of the graph for ``model``.

    Creates two ``[None, 3]`` int32 placeholders (positive and negative
    triples), asks the model for its loss tensors and attaches an optimizer
    step over ``model.variables``.

    Args:
        model: object exposing ``train(inputs)`` and ``variables``.
        learning_rate: learning rate handed to the optimizer.
        margin: accepted for interface compatibility; not read here.
        optimizer_str: ``'gradient'``, ``'rms'`` or ``'adam'``.

    Returns:
        ``(positive_placeholder, negative_placeholder, loss, op_train,
        loss_every, norm_entity)``.

    Raises:
        NotImplementedError: for any other ``optimizer_str``.
    """
    with tf.device('/cpu'):
        positive_ph = tf.placeholder(tf.int32, [None, 3])
        negative_ph = tf.placeholder(tf.int32, [None, 3])

        loss, loss_every, norm_entity = model.train([positive_ph, negative_ph])

        # Pick the optimizer class first, instantiate it once afterwards.
        if optimizer_str == 'gradient':
            optimizer_cls = tf.train.GradientDescentOptimizer
        elif optimizer_str == 'rms':
            optimizer_cls = tf.train.RMSPropOptimizer
        elif optimizer_str == 'adam':
            optimizer_cls = tf.train.AdamOptimizer
        else:
            raise NotImplementedError("Dose not support %s optimizer" %optimizer_str)
        optimizer = optimizer_cls(learning_rate = learning_rate)

        gradients = optimizer.compute_gradients(loss, model.variables)
        op_train = optimizer.apply_gradients(gradients)

        return positive_ph, negative_ph, loss, op_train, loss_every, norm_entity

In [4]:
def test_operation(model):
    """Build the evaluation part of the graph for ``model``.

    Creates a length-3 int32 placeholder for one test triple and feeds it to
    ``model.test``.

    Returns:
        ``(test_triple, head_rank, tail_rank, norm_head_rank,
        norm_tail_rank)``: the placeholder followed by the four tensors
        returned by ``model.test``.
    """
    with tf.device('/cpu'):
        triple_ph = tf.placeholder(tf.int32, [3])
        rankings = model.test(triple_ph)
        head_rank, tail_rank, norm_head_rank, norm_tail_rank = rankings
        return triple_ph, head_rank, tail_rank, norm_head_rank, norm_tail_rank

In [5]:
def test_job(inputqueue, outputqueue, hr_t, tr_h):
    while True:
        dat = inputqueue.get()
        if dat is None:
            inputqueue.task_done()
            continue
        t, id_replace_head, id_replace_tail, norm_id_replace_head, norm_id_replace_tail = dat 
        hrank = 0
        fhrank = 0
        for i in range(len(id_replace_head)):
            val = id_replace_head[-i-1]
            if val == t[0]:
                break
            else: 
                hrank += 1
                fhrank += 1 
                if val in tr_h[(t[2],t[1])]:
                    fhrank -= 1

        norm_hrank = 0
        norm_fhrank = 0
        for i in range(len(norm_id_replace_head)):
            val = norm_id_replace_head[-i-1]
            if val == t[0]:
                break
            else: 
                norm_hrank += 1
                norm_fhrank += 1 
                if val in tr_h[(t[2],t[1])]:
                    norm_fhrank -= 1


        trank = 0
        ftrank = 0
        for i in range(len(id_replace_tail)):
            val = id_replace_tail[-i-1]
            if val == t[2]:
                break
            else:
                trank += 1
                ftrank += 1
                if val in hr_t[(t[0], t[1])]:
                    ftrank -= 1

        norm_trank = 0
        norm_ftrank = 0
        for i in range(len(norm_id_replace_tail)):
            val = norm_id_replace_tail[-i-1]
            if val == t[2]:
                break
            else:
                norm_trank += 1
                norm_ftrank += 1
                if val in hr_t[(t[0], t[1])]:
                    norm_ftrank -= 1
        #print(hrank, fhrank, trank, ftrank,norm_hrank, norm_fhrank, norm_trank, norm_ftrank)
        outputqueue.put((hrank, fhrank, trank, ftrank,norm_hrank, norm_fhrank, norm_trank, norm_ftrank))
        inputqueue.task_done()

In [6]:
class Args:
    """Empty attribute container; settings are attached to an instance as plain attributes."""

In [7]:
# Hyper-parameters and other settings, attached to a plain Args instance.
args = Args()
vars(args).update(
    data_dir='./data/',
    learning_rate=0.005,
    batch_size=1,
    max_iter=100,
    optimizer='adam',
    dimension=10,
    margin=1.0,
    norm='L1',
    evaluation_size=500,
    save_dir='output/',
    negative_sampling='bern',
    evaluate_per_iteration=10,
    evaluate_worker=3,
    regularizer_weight=1e-5,
    n_test=1,
    save_per=100,
    n_worker=5,
)
print(args)

# Build the model: this loads the data set and creates the embedding variables.
model = TransE(
    data_dir=args.data_dir,
    negative_sampling=args.negative_sampling,
    learning_rate=args.learning_rate,
    batch_size=args.batch_size,
    max_iter=args.max_iter,
    margin=args.margin,
    dimension=args.dimension,
    norm=args.norm,
    evaluation_size=args.evaluation_size,
    regularizer_weight=args.regularizer_weight,
)

# Graph handles for training ...
(train_triple_positive_input, train_triple_negative_input,
 loss, op_train, loss_every, norm_entity) = train_operation(
    model,
    learning_rate=args.learning_rate,
    margin=args.margin,
    optimizer_str=args.optimizer,
)
# ... and for evaluation.
(test_triple, head_rank, tail_rank,
 norm_head_rank, norm_tail_rank) = test_operation(model)

<__main__.Args object at 0xb3cbc8898>
loading entity2id.txt ...
loading reltion2id.txt ...
entity number: 17
relation number: 6
training triple number: 28
testing triple number: 4
valid triple number: 4
finish preparing data. 
finishing initializing
Instructions for updating:
dim is deprecated, use axis instead


In [8]:
# Train the model
args.max_iter = 100

config = tf.ConfigProto()
config.gpu_options.allow_growth = False
config.log_device_placement = False
config.allow_soft_placement = True
config.gpu_options.per_process_gpu_memory_fraction = 0.68
# The session is passed explicitly to every run call below, so it is not
# installed as the default session.
session = tf.Session(config=config)

# tf.initialize_all_variables() is deprecated in favour of this initializer.
tf.global_variables_initializer().run(session=session)
saver = tf.train.Saver()

# Start from row-wise L2-normalised embeddings.
norm_rel = session.run(tf.nn.l2_normalize(model.embedding_relation, axis=1))
session.run(tf.assign(model.embedding_relation, norm_rel))
norm_ent = session.run(tf.nn.l2_normalize(model.embedding_entity, axis=1))
session.run(tf.assign(model.embedding_entity, norm_ent))

test_input_queue = JoinableQueue()
test_output_queue = Queue()

# Evaluation workers; they are terminated in the last cell of the notebook.
workers = list()
for i in range(args.n_worker):
    worker = Process(target=test_job, args=(test_input_queue, test_output_queue, model.hr_t, model.tr_h))
    worker.start()
    workers.append(worker)

# The checkpoint directory has to exist before saver.save() is called.
os.makedirs('./save', exist_ok=True)

for n_iter in range(args.max_iter):
    accu_loss = 0.
    batch = 0
    # The last batch may be smaller than batch_size, hence the ceiling.
    num_batch = math.ceil(model.num_triple_train / args.batch_size)
    start_time = timeit.default_timer()
    prepare_time = 0.

    for tp, tn, t in model.training_data_batch(batch_size=args.batch_size):
        l, _, l_every, norm_e = session.run([loss, op_train, loss_every, norm_entity],
                                            {train_triple_positive_input: tp, train_triple_negative_input: tn})
        accu_loss += l
        batch += 1
        print('[%.2f sec](%d/%d): -- loss: %.5f' %(timeit.default_timer()-start_time, batch, num_batch , l), end='\r')
        prepare_time += t
    print('iter[%d] ---loss: %.5f ---time: %.2f ---prepare time : %.2f' %(n_iter, accu_loss, timeit.default_timer()-start_time, prepare_time))

    # Checkpoint every save_per iterations (which includes iteration 0) and
    # after the final iteration.
    if n_iter % args.save_per == 0 or n_iter == args.max_iter - 1:
        save_path = saver.save(session, './save/TransE_' + str(n_iter) + '.ckpt')
        print('Model saved at %s' % save_path)

    if n_iter % args.evaluate_per_iteration == 0 or n_iter == args.max_iter - 1:
        # NOTE(review): only the evaluation state is prepared here; nothing in
        # this cell enqueues test jobs for the workers. The evaluation itself
        # is carried out by the cells that follow.
        rank_head = []
        rank_tail = []
        filter_rank_head = []
        filter_rank_tail = []

        norm_rank_head = []
        norm_rank_tail = []
        norm_filter_rank_head = []
        norm_filter_rank_tail = []

        start = timeit.default_timer()
        testing_data = model.testing_data
        hr_t = model.hr_t
        tr_h = model.tr_h
        n_test = args.n_test
        # Use the whole test set after the final iteration.
        if n_iter == args.max_iter - 1:
            n_test = model.num_triple_test


Instructions for updating:
Use `tf.global_variables_initializer` instead.
iter[0] ---loss: 31.93908 ---time: 0.66 ---prepare time : 0.00
Model saved at ./save/TransE_0.ckpt
iter[1] ---loss: 36.24439 ---time: 0.06 ---prepare time : 0.00
iter[2] ---loss: 22.75225 ---time: 0.07 ---prepare time : 0.00
iter[3] ---loss: 15.13599 ---time: 0.07 ---prepare time : 0.00
iter[4] ---loss: 17.93469 ---time: 0.05 ---prepare time : 0.00
iter[5] ---loss: 14.20885 ---time: 0.06 ---prepare time : 0.00
iter[6] ---loss: 13.51436 ---time: 0.05 ---prepare time : 0.00
iter[7] ---loss: 17.23384 ---time: 0.05 ---prepare time : 0.00
iter[8] ---loss: 13.62076 ---time: 0.05 ---prepare time : 0.00
iter[9] ---loss: 9.69718 ---time: 0.06 ---prepare time : 0.00
iter[10] ---loss: 10.03317 ---time: 0.05 ---prepare time : 0.00
iter[11] ---loss: 6.70868 ---time: 0.05 ---prepare time : 0.00
iter[12] ---loss: 9.66001 ---time: 0.05 ---prepare time : 0.00
iter[13] ---loss: 9.46227 ---time: 0.07 ---prepare time : 0.00
iter[14]

In [9]:
# 测试一个样本函数
def test_one_sample(model, trp, session):
    """Rank one triple against every entity for head and tail prediction.

    Runs the module-level ranking tensors ``head_rank``, ``tail_rank``,
    ``norm_head_rank`` and ``norm_tail_rank`` with ``trp`` fed into the
    module-level placeholder ``test_triple``, then reads each returned id
    sequence from its last element backwards.

    Args:
        model: object exposing the ``hr_t`` and ``tr_h`` lookup tables.
        trp: ``(head_id, relation_id, tail_id)``.
        session: session providing ``run(fetches, feed_dict)``.

    Returns:
        A 12-tuple ``(hrank, fhrank, trank, ftrank, norm_hrank, norm_fhrank,
        norm_trank, norm_ftrank, predicted_head, predicted_tail,
        norm_predicted_head, norm_predicted_tail)``.  A rank is the number of
        candidates placed before the correct entity; the "f" (filtered)
        variants do not count candidates that form a known triple according
        to ``model.tr_h`` / ``model.hr_t``.  Each ``predicted_*`` list holds
        all candidate ids in the order they were read; the ``norm_*`` lists
        come from the normalised rankings.
    """
    def rank_candidates(ranking, target, known):
        # Returns (raw rank, filtered rank, candidates in reading order).
        ordered = list(reversed(ranking))
        raw = filtered = 0
        for candidate in ordered:
            if candidate == target:
                break
            raw += 1
            if candidate not in known:
                filtered += 1
        return raw, filtered, ordered

    id_replace_head, id_replace_tail, norm_id_replace_head, norm_id_replace_tail = session.run(
        [head_rank, tail_rank, norm_head_rank, norm_tail_rank], {test_triple: trp})

    # .get() avoids inserting empty sets into the model's defaultdicts.
    known_heads = model.tr_h.get((trp[2], trp[1]), ())
    known_tails = model.hr_t.get((trp[0], trp[1]), ())

    hrank, fhrank, predicted_head_tmp = rank_candidates(id_replace_head, trp[0], known_heads)
    # The normalised candidate lists are built from the normalised rankings;
    # the original built them from the un-normalised ones by mistake.
    norm_hrank, norm_fhrank, norm_predicted_head_tmp = rank_candidates(norm_id_replace_head, trp[0], known_heads)
    trank, ftrank, predicted_tail_tmp = rank_candidates(id_replace_tail, trp[2], known_tails)
    norm_trank, norm_ftrank, norm_predicted_tail_tmp = rank_candidates(norm_id_replace_tail, trp[2], known_tails)

    return hrank, fhrank, trank, ftrank, norm_hrank, norm_fhrank, norm_trank, norm_ftrank, \
            predicted_head_tmp, predicted_tail_tmp, norm_predicted_head_tmp, norm_predicted_tail_tmp

In [10]:
# Evaluate the model on the test set
predicted_tail, norm_predicted_tail = [], []
predicted_head, norm_predicted_head = [], []

rank_head, rank_tail = [], []
filter_rank_head, filter_rank_tail = [], []

norm_rank_head, norm_rank_tail = [], []
norm_filter_rank_head, norm_filter_rank_tail = [], []

start = timeit.default_timer()
testing_data = model.testing_data
n_test = args.n_test
# n_iter is left over from the training loop; after a complete run it equals
# max_iter - 1, which selects the whole test set.
if n_iter == args.max_iter - 1:
    n_test = model.num_triple_test

for i in range(n_test):
    print('[%.2f sec] --- testing[%d/%d]' %(timeit.default_timer()-start, i+1, n_test))
    t = testing_data[i]
    (hrank, fhrank, trank, ftrank,
     norm_hrank, norm_fhrank, norm_trank, norm_ftrank,
     predicted_head_tmp, predicted_tail_tmp,
     norm_predicted_head_tmp, norm_predicted_tail_tmp) = test_one_sample(model, t, session)

    rank_head.append(hrank)
    rank_tail.append(trank)
    filter_rank_head.append(fhrank)
    filter_rank_tail.append(ftrank)

    norm_rank_head.append(norm_hrank)
    norm_rank_tail.append(norm_trank)
    norm_filter_rank_head.append(norm_fhrank)
    norm_filter_rank_tail.append(norm_ftrank)

    predicted_tail.append(predicted_tail_tmp)
    norm_predicted_tail.append(norm_predicted_tail_tmp)
    predicted_head.append(predicted_head_tmp)
    norm_predicted_head.append(norm_predicted_head_tmp)


def _mean_rank(ranks):
    # Average rank over the evaluated triples.
    return np.sum(ranks, dtype=np.float32)/n_test


def _hit_at_10(ranks):
    # Fraction of evaluated triples whose rank is below 10.
    return np.sum(np.asarray(np.asarray(ranks)<10 , dtype=np.float32))/n_test


mean_rank_head = _mean_rank(rank_head)
mean_rank_tail = _mean_rank(rank_tail)
filter_mean_rank_head = _mean_rank(filter_rank_head)
filter_mean_rank_tail = _mean_rank(filter_rank_tail)

norm_mean_rank_head = _mean_rank(norm_rank_head)
norm_mean_rank_tail = _mean_rank(norm_rank_tail)
norm_filter_mean_rank_head = _mean_rank(norm_filter_rank_head)
norm_filter_mean_rank_tail = _mean_rank(norm_filter_rank_tail)

hit10_head = _hit_at_10(rank_head)
hit10_tail = _hit_at_10(rank_tail)
filter_hit10_head = _hit_at_10(filter_rank_head)
filter_hit10_tail = _hit_at_10(filter_rank_tail)

norm_hit10_head = _hit_at_10(norm_rank_head)
norm_hit10_tail = _hit_at_10(norm_rank_tail)
norm_filter_hit10_head = _hit_at_10(norm_filter_rank_head)
norm_filter_hit10_tail = _hit_at_10(norm_filter_rank_tail)

# Head and tail prediction are averaged into one figure per metric.
print('iter:%d --mean rank: %.2f --hit@10: %.2f' %(n_iter, (mean_rank_head+ mean_rank_tail)/2, (hit10_tail+hit10_head)/2))
print('iter:%d --filter mean rank: %.2f --filter hit@10: %.2f' %(n_iter, (filter_mean_rank_head+ filter_mean_rank_tail)/2, (filter_hit10_tail+filter_hit10_head)/2))

print('iter:%d --norm mean rank: %.2f --norm hit@10: %.2f' %(n_iter, (norm_mean_rank_head+ norm_mean_rank_tail)/2, (norm_hit10_tail+norm_hit10_head)/2))
print('iter:%d --norm filter mean rank: %.2f --norm filter hit@10: %.2f' %(n_iter, (norm_filter_mean_rank_head+ norm_filter_mean_rank_tail)/2, (norm_filter_hit10_tail+norm_filter_hit10_head)/2))


[0.00 sec] --- testing[1/4]
[0.13 sec] --- testing[2/4]
[0.13 sec] --- testing[3/4]
[0.13 sec] --- testing[4/4]
iter:99 --mean rank: 1.75 --hit@10: 1.00
iter:99 --filter mean rank: 1.38 --filter hit@10: 1.00
iter:99 --norm mean rank: 1.62 --norm hit@10: 1.00
iter:99 --norm filter mean rank: 1.25 --norm filter hit@10: 1.00


In [11]:
# Test a single sample.
# One triple is ranked for both tail and head prediction. For each case the
# rank is printed together with the first rank + 3 candidates, so the correct
# entity and the candidates around it can be inspected.

sample = ['北京市','包含区','海淀区']
# sample = ['鼓楼区','位于省','江苏省']
# sample = ['合肥市','位于省','安徽省']
# sample = ['浙江大学','位于市','杭州市']
head_name, relation_name, tail_name = sample
trp = [model.entity2id[head_name],
       model.relation2id[relation_name],
       model.entity2id[tail_name]]
print(sample, trp)

(hrank, fhrank, trank, ftrank,
 norm_hrank, norm_fhrank, norm_trank, norm_ftrank,
 predicted_head_tmp, predicted_tail_tmp,
 norm_predicted_head_tmp, norm_predicted_tail_tmp) = test_one_sample(model, trp, session)

# Map the candidate ids back to entity names.
id2entity = model.id2entity
tail_prediction = [id2entity[ent] for ent in predicted_tail_tmp]
head_prediction = [id2entity[ent] for ent in predicted_head_tmp]

norm_tail_prediction = [id2entity[ent] for ent in norm_predicted_tail_tmp]
norm_head_prediction = [id2entity[ent] for ent in norm_predicted_head_tmp]

print('-- trank:', trank, tail_prediction[:trank + 3])
print('-- norm_trank', norm_trank, norm_tail_prediction[:norm_trank + 3])
print('-- hrank', hrank, head_prediction[:hrank + 3])
print('-- norm_hrank', norm_hrank, norm_head_prediction[:norm_hrank + 3])


['北京市', '包含区', '海淀区'] [15, 5, 9]
-- trank: 0 ['海淀区', '北京市', '清华大学']
-- norm_trank 0 ['海淀区', '北京市', '清华大学']
-- hrank 0 ['北京市', '海淀区', '北京大学']
-- norm_hrank 0 ['北京市', '海淀区', '北京大学']


In [12]:
# Stop every evaluation worker process, then release the TensorFlow session.
num_worker = 0
for num_worker, process in enumerate(workers, start=1):
    process.terminate()
session.close()
print("FINISHED~")

FINISHED~


本demo中不包括调参的部分，有兴趣的同学可以自行尝试不同的参数组合，并观察对模型训练和预测结果的影响 :-)