In [1]:
from embeddings.demo import Demo
import numpy as np
from heapq import nlargest

def token_align(pharses:list, length:int):
    """Tokenize phrases on whitespace and right-pad them to a fixed length.

    Phrases with more than ``length`` whitespace-separated tokens are dropped.

    Args:
        pharses: Iterable of phrase strings.
        length: Maximum (and padded) number of tokens per phrase.

    Returns:
        A pair ``(padded, joined)`` of parallel lists: ``padded[i]`` is the
        token list of the i-th kept phrase extended with ``'<pad>'`` up to
        ``length`` entries, and ``joined[i]`` is the same tokens re-joined
        with single spaces (so surrounding/repeated whitespace is normalized).
    """
    padded_rows = []
    joined_phrases = []
    for phrase in pharses:
        words = phrase.split()
        if len(words) > length:
            # Too long to fit the fixed width: drop the phrase entirely.
            continue
        padded_rows.append(words + ['<pad>'] * (length - len(words)))
        joined_phrases.append(' '.join(words))
    return padded_rows, joined_phrases

# Build the demo wrapper from the saved checkpoint, its config and the relation
# list (argument meaning inferred from the paths -- confirm against Demo).
d = Demo('data/result_3/best_model.pt', 'data/result_3/saved_config.json', 'data/rel_20.txt')

# Candidate keywords: one phrase per line. Read through a context manager so
# the file handle is closed instead of being left to the garbage collector.
with open('data/keyword_p.txt') as keyword_file:
    keywords = keyword_file.read().strip().split('\n')
# Keep only phrases of at most 6 tokens; `keywords` is rebound to the kept,
# whitespace-normalized phrases so it stays parallel to `keywords_token`.
keywords_token, keywords = token_align(keywords, 6)

# Relations: one per line, possibly containing '<pad>' tokens.
with open('data/rel_20.txt') as relation_file:
    relations = relation_file.read().strip().split('\n')
# Token lists are taken from the raw lines, i.e. they still include '<pad>'.
relations_token = [rel.split() for rel in relations]
# Human-readable form: cut each relation at its first '<pad>' (if any).
relations = [(rel[:rel.index('<pad>')].strip() if '<pad>' in rel else rel) for rel in relations]

Loaded checkpoint 'data/result_3/best_model.pt' (epoch 17 iter: 540001 train_loss: 1.632509862942454, dev_loss: 2.3585431747436525, train_pos:0.5035273432731628, train_neg: 0.017596999183297157, dev_pos: 0.31248798966407776, dev_neg: 0.016466999426484108)


In [2]:
def ugly_normalize(vecs:np.ndarray):
    """Scale every row of a 2-D array to unit Euclidean length.

    Rows whose norm is exactly zero are divided by 1 instead, so they are
    returned unchanged (all zeros) rather than producing NaN/inf.

    Args:
        vecs: 2-D array with one vector per row.

    Returns:
        A new array of the same shape with each row divided by its L2 norm.
    """
    row_norms = np.sqrt((vecs * vecs).sum(axis=1))
    # Swap zero norms for 1 so the division below is a no-op for zero rows.
    safe_norms = np.where(row_norms == 0, 1, row_norms)
    # Divide in the transposed view so the 1-D norms broadcast per row.
    return (vecs.T / safe_norms).T

def ntopidx(n, score:np.ndarray):
    """Return the indices of the ``n`` largest entries of ``score``.

    Args:
        n: How many indices to return (fewer if ``score`` is shorter).
        score: 1-D sequence of comparable scores.

    Returns:
        List of indices ordered from highest to lowest score; equal scores
        keep their original relative order.
    """
    positions = np.arange(len(score))
    # Rank the positions themselves, looking each score up on demand.
    return nlargest(n, positions, key=lambda idx: score[idx])

def read_test_file(file_name:str):
    """Parse a test file of ``central keyword:kw1,kw2,...`` lines.

    Blank lines (for example a trailing newline at end of file) are skipped.
    Only the first ``:`` on a line separates the central keyword from the
    comma-separated keyword list, so keywords may themselves contain ``:``.

    Args:
        file_name: Path of the test file to read.

    Returns:
        List of ``(central_kw, kws)`` tuples, where ``kws`` is the list of
        comma-separated keyword strings (not stripped of inner whitespace).

    Raises:
        ValueError: If a non-blank line contains no ``:`` separator.
    """
    ret = []
    with open(file_name) as f_in:
        for line_no, raw_line in enumerate(f_in, start=1):
            line = raw_line.strip()
            if not line:
                # Tolerate empty lines instead of failing on tuple unpacking.
                continue
            central_kw, separator, kws = line.partition(':')
            if not separator:
                raise ValueError(
                    "%s:%d: expected 'central_kw:kw1,kw2,...', got %r"
                    % (file_name, line_no, line)
                )
            ret.append((central_kw, kws.split(',')))
    return ret

def run_test(test_data:list, d:Demo, keywords:list, keywords_token:list, relations:list, max_len:int=6, top_n:int=40):
    """For every (keyword, central keyword) pair, list the closest relations and keywords.

    For each test entry the model is asked for a prediction vector for every
    candidate keyword paired with the central keyword, and for every test
    keyword paired with the central keyword. Rows are L2-normalized, then each
    test keyword is scored by dot product against
    ``d.relation_representation`` and against the candidate-keyword
    predictions.

    Args:
        test_data: Iterable of ``(central_kw, kws)`` as produced by
            ``read_test_file``.
        d: Demo wrapper exposing ``model.get_prediction`` and
            ``relation_representation``.
        keywords: Candidate keyword phrases, parallel to ``keywords_token``.
        keywords_token: Padded token lists of the candidate keywords.
        relations: Relation strings; presumably parallel to the rows of
            ``d.relation_representation`` -- TODO confirm.
        max_len: Token limit / padding width applied to the test keywords.
            NOTE(review): should probably match the width used to build
            ``keywords_token`` -- confirm against the model's expectations.
        top_n: Number of top-scoring relations and keywords to report.

    Returns:
        List of ``(kw, central_kw, top_relations, top_keywords)`` tuples, one
        per test keyword that survived the ``max_len`` filter.
    """
    ret = []
    for central_kw, kws in test_data:
        # NOTE(review): the central keyword is tokenized but not padded,
        # unlike the other phrases -- verify this is what the model expects.
        central_kw_token = central_kw.split()
        kws_token, kws = token_align(kws, max_len)
        if not kws_token:
            # Every keyword was filtered out, so the loop below would emit
            # nothing; skip the model calls for this entry.
            continue
        general_rel_prediction = ugly_normalize(
            d.model.get_prediction(keywords_token, [central_kw_token] * len(keywords_token)).numpy()
        )
        test_rel_prediction = ugly_normalize(
            d.model.get_prediction(kws_token, [central_kw_token] * len(kws_token)).numpy()
        )
        # Row i holds the scores of test keyword i against every relation /
        # every candidate keyword respectively.
        rel_score = np.matmul(test_rel_prediction, d.relation_representation.T)
        rel_predict_score = np.matmul(test_rel_prediction, general_rel_prediction.T)
        for i, kw in enumerate(kws):
            top_relations = [relations[idx] for idx in ntopidx(top_n, rel_score[i])]
            top_keywords = [keywords[idx] for idx in ntopidx(top_n, rel_predict_score[i])]
            ret.append((kw, central_kw, top_relations, top_keywords))
    return ret

def write_result(data:list, file_name):
    """Write test results to ``file_name`` as UTF-8 text.

    Each record becomes a ``!kw<=>central_kw`` header followed by a blank
    line, a ``>Similar Relation`` section listing one relation per line, a
    blank line, and a ``>Similar Keyword`` section listing one keyword per
    line; records are separated by blank lines.

    Args:
        data: Iterable of ``(kw, central_kw, similar_rels, similar_kws)``.
        file_name: Destination path; an existing file is overwritten.
    """
    with open(file_name, 'w', encoding='utf-8') as f_out:
        lines = []
        for kw, central_kw, similar_rels, similar_kws in data:
            # The embedded '\n' characters combine with the join separator
            # below to produce the blank lines of the output format.
            lines.extend([
                '!%s<=>%s\n' % (kw, central_kw),
                '>Similar Relation',
                *similar_rels,
                '\n>Similar Keyword',
                *similar_kws,
                '\n',
            ])
        f_out.write('\n'.join(lines))

def do_test(test_file:str, result_file:str, d:Demo, keywords:list, keywords_token:list, relations:list):
    """Run the whole evaluation: read ``test_file``, score it, write ``result_file``.

    Args:
        test_file: Path of the test file passed to ``read_test_file``.
        result_file: Path of the report written by ``write_result``.
        d: Demo wrapper forwarded to ``run_test``.
        keywords: Candidate keyword phrases forwarded to ``run_test``.
        keywords_token: Padded candidate tokens forwarded to ``run_test``.
        relations: Relation strings forwarded to ``run_test``.
    """
    write_result(
        run_test(read_test_file(test_file), d, keywords, keywords_token, relations),
        result_file,
    )

In [5]:
# Evaluate the single-word test set and write the report under result/.
do_test(
    test_file='test/single_word_test.txt',
    result_file='result/single_word_out.txt',
    d=d,
    keywords=keywords,
    keywords_token=keywords_token,
    relations=relations,
)