In [2]:
import pickle
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="3"


In [3]:
import os
from tqdm import tqdm
from typing import Dict
from collections import defaultdict
import numpy as np

def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    return np.exp(x) / np.sum(np.exp(x), axis=0)

def numLines(fname):
    with open(fname) as f:
        for i, l in enumerate(f):
            pass
    return i + 1
def loadData(filename, max_points):
    file_len = numLines(filename)
    f = open(filename, 'r')
    inputs = []
    outputs = []
    for i in tqdm(range(file_len)):
        if i == max_points:
            break
        line = f.readline()
        if line[-1] == '\n':
            line = line[:-1]
        line = line.split('\t')
        inputs.append(line[0])
        outputs.append(line[1])
    data = {'inputs': inputs, 'outputs': outputs}
    return data
        
def load_entity_strings(filename):
    with open(filename) as f:
        lines = f.read().splitlines()
    return lines

def get_entity_wd_id_dict(filename):
    out = {}
    f = open(filename, 'r')
    for line in f:
        if line[-1] == '\n':
            line = line[:-1]
        line = line.split('\t')
        out[line[1]] = line[0]
    return out
    

def create_filter_dict(data) -> Dict[str, int]:
    filter_dict = defaultdict(list)
    for input, output in zip(data["inputs"], data["outputs"]):
        filter_dict[input].append(output)
    return filter_dict

def getAllFilteringEntities(input, filter_dicts):
    entities = []
    splits = ['train', 'test', 'valid']
    for s in splits:
        entities.extend(filter_dicts[s][input])
    return list(set(entities))

def wikidata_link_from_id(id):
    uri = 'https://www.wikidata.org/wiki/' + id
    return uri

In [4]:
dataset_name = 'yago3-10_2'
entity_strings = load_entity_strings(os.path.join("data", dataset_name, "entity_strings.txt"))

In [5]:
entity_strings_set = set(entity_strings)

In [6]:
data = {}
splits = ['train', 'valid', 'test']
dataset_name = 'yago3-10_2'
for split in splits:
    data[split] = loadData(os.path.join('data', dataset_name, split + '.txt'), -1)

100%|█████████████████████████████████████████████████████| 2158080/2158080 [00:01<00:00, 1116726.25it/s]
100%|██████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 595046.46it/s]
100%|██████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 560983.32it/s]


In [7]:
filter_dicts = {}
splits = ['train', 'valid', 'test']
for split in splits:
    filter_dicts[split] = create_filter_dict(data[split])

In [8]:
# fname = 'scores.pickle'
fname = 'scores/scores_full_yago2.pickle'
# fname = 'scores_500_base_trie.pickle'
scores_data = pickle.load(open(fname, 'rb'))

In [9]:
predictions_scores_dicts = []
for string_arr, score_arr in tqdm(zip(scores_data['prediction_strings'], scores_data['scores'])):
    ps_pairs = [(p,s) for p,s in zip(string_arr, score_arr)]
    ps_pairs = list(set(ps_pairs)) # while sampling, duplicates are created
    # remove predictions that are not entities
    ps_dict_only_entities = defaultdict(list)
    for ps in ps_pairs:
        if ps[0] in entity_strings_set:
            ps_dict_only_entities[ps[0]] = ps[1]
    predictions_scores_dicts.append(ps_dict_only_entities)

10000it [00:01, 7107.19it/s]


In [10]:
max([len(predictions_scores_dicts[x]) for x in range(500)])

372

In [12]:
len(predictions_scores_dicts)

10000

In [17]:
import numpy as np
predictions_filtered = []
head_num_filter = 0
tail_num_filter = 0
count = 0
for i in tqdm(range(len(predictions_scores_dicts))):
    ps_dict = predictions_scores_dicts[i].copy()
    target = scores_data['target_strings'][i]
    inputs = scores_data['input_strings'][i]
    prediction_strings = ps_dict.keys()
    if target in prediction_strings:
        original_score = ps_dict[target]
    # get filtering entities
    filtering_entities = getAllFilteringEntities(inputs, filter_dicts)
    if len(filtering_entities) <= 20:
        count += 1
    if 'head' in inputs:
        head_num_filter += len(filtering_entities)
    else:
        tail_num_filter += len(filtering_entities)
    for ent in filtering_entities:
        if ent in ps_dict:
            ps_dict[ent] = -float("inf")
    if target in prediction_strings:
        ps_dict[target] = original_score
    # softmax for scores
    names_arr = []
    scores_arr = []
    for k, v in ps_dict.items():
        names_arr.append(k)
        scores_arr.append(v)
    scores_arr = np.array(scores_arr)
#     scores_arr = softmax(scores_arr)
    for name, score in zip(names_arr, scores_arr):
        ps_dict[name] = score
    predictions_filtered.append(ps_dict)
head_num_filter/len(predictions_filtered), tail_num_filter/len(predictions_filtered), count

100%|████████████████████████████████████████████████████████████| 10000/10000 [00:05<00:00, 1816.10it/s]


(1826.6029, 4.5389, 6084)

In [23]:
from tqdm.notebook import tqdm
count = {}
reciprocal_ranks = 0.0
k_list = [1,3,10]
for k in k_list:
    count[k] = 0
num_small_arrs = 0
count2 = 0
total_count = 0
for i in tqdm(range(len(predictions_filtered))):
    target = scores_data['target_strings'][i]
    inputs = scores_data['input_strings'][i]
    filtering_entities = getAllFilteringEntities(inputs, filter_dicts)
    if len(filtering_entities) <= 1:
        total_count += 1
    else:
        continue
    ps_dict = predictions_filtered[i]
    ps_sorted = sorted(ps_dict.items(), key=lambda item: -item[1])
    if len(ps_dict) == 0:
        preds = []
    else:
        preds = [x[0] for x in ps_sorted]
    if target in preds:
        rank = preds.index(target) + 1
        reciprocal_ranks += 1./rank
    for k in k_list:
        if target in preds[:k]:
            count[k] += 1
    if len(preds) < 10 and target not in preds:
        num_small_arrs += 1
    if target in preds:
        count2 += 1
        
# total_count = len(predictions_filtered)
for k in k_list:
    hits_at_k = count[k]/total_count
    print('hits@{}'.format(k), hits_at_k)
print('mrr', reciprocal_ranks/total_count)
print(num_small_arrs/total_count, 'were <10 length preds array without answer')
print('For excel:')
print(round(reciprocal_ranks/total_count, 3))
for k in k_list:
    hits_at_k = count[k]/total_count
    print(round(hits_at_k, 3))


  0%|          | 0/10000 [00:00<?, ?it/s]

hits@1 0.49946977730646874
hits@3 0.5673382820784729
hits@10 0.6171792152704135
mrr 0.5397657936869451
0.16542948038176034 were <10 length preds array without answer
For excel:
0.54
0.499
0.567
0.617


In [264]:
count2/500

0.554

In [346]:
id = 23
inputs = scores_data['input_strings'][id]
preds = predictions_filtered[id]
preds = sorted(preds.items(), key=lambda item: -item[1])
target = scores_data['target_strings'][id]
print(inputs, 'Target:', target)
preds[:10], target

predict head: Portsmouth F.C. | is affiliated to | Target: Mark Blake (footballer born 1970)


([('John Wark', -6.662752151489258),
  ('James Hurst (footballer)', -7.524306297302246),
  ('Paul Moody (footballer)', -7.6999406814575195),
  ('Kevin Miller (footballer)', -7.935149192810059),
  ('James Baird (footballer)', -8.028887748718262),
  ('Bill Rawlings', -8.164727210998535),
  ('Bill White (footballer born 1907)', -8.24087142944336),
  ("John O'Rourke (footballer)", -8.302714347839355),
  ('David Knight (English footballer)', -8.331277847290039),
  ('Kevin Amankwaah', -8.336922645568848)],
 'Mark Blake (footballer born 1970)')

predict tail: Rosemere, Quebec | is located in | Target: Quebec


([('Mauricie', -6.508617401123047),
  ('Louiseville', -6.886135101318359),
  ("Marguerite-D'Youville Regional County Municipality", -6.97409725189209),
  ('Burgundy', -7.875734329223633),
  ('Urban agglomeration of Longueuil', -10.717867851257324),
  ('Therese-De Blainville Regional County Municipality', -inf),
  ('Laurentides', -inf)],
 'Quebec')

In [274]:
x = getAllFilteringEntities(inputs, filter_dicts)
for ent in x:
    if 'Joe' in ent:
        print(ent)
len(x)

Joe Johnson (footballer born 1911)
Joe Mercer
Joe Payne (footballer)
Joe Smith (footballer born 1890)
Joe Spence (footballer born 1898)
Joe Corrigan
Joe Baker
Joe Smith (football forward born 1889)
Joe Bradford
Joe Royle
Joey Barton
Joe Bacuzzi


556

In [281]:
for e in entity_strings:
    if 'Surinam' in e:
        print(e)

Suriname national football team
Suriname


In [206]:
def getRelation(input):
    relation = input.split('|')[1].strip()
    return relation

In [212]:
for id in range(100):
    input=scores_data['input_strings'][id]
    preds = predictions_filtered[id]
    preds = sorted(preds.items(), key=lambda item: -item[1])
    first_pred = preds[0][0]
    relation = getRelation(input)
    target = scores_data['target_strings'][id]
    if target != first_pred:
        print(relation)
    

plays for
plays for
is located in
plays for
plays for
has gender
plays for
plays for
is affiliated to
is affiliated to
plays for
plays for
is affiliated to
is affiliated to
is affiliated to
plays for
has gender
plays for
is affiliated to
is affiliated to
acted in
acted in
is affiliated to
is affiliated to
is affiliated to
plays for
is affiliated to
is affiliated to
is affiliated to
is affiliated to
plays for
plays for
plays for
plays for
plays for
plays for
wrote music for
wrote music for
plays for
plays for
is affiliated to
plays for
plays for
is affiliated to
is affiliated to
is affiliated to
plays for
has gender
edited
edited
is affiliated to
was born in
was born in
wrote music for
wrote music for
is affiliated to
is affiliated to
is affiliated to
plays for
plays for
is affiliated to
plays for
edited
edited


In [482]:
# only head/tails
count = 0
for id in range(60,120, 2):
    inputs = scores_data['input_strings'][id]
    preds = predictions_filtered[id]
    preds = sorted(preds.items(), key=lambda item: -item[1])
    target = scores_data['target_strings'][id]
    pred1 = preds[0][0]
    if pred1 == target:
        print(int(id/2), inputs, pred1)
        count += 1
'count', count

32 predict tail: ali kazemaini | birthplace | tehran
34 predict tail: ashta, maharashtra | instance of | human settlement
37 predict tail: roy shaw 0 | instance of | human being
40 predict tail: t. canby jones | has surname | jones (family name)
45 predict tail: naveen kumar | instance of | human being
46 predict tail: barlow respiratory hospital | host country | united states of america
48 predict tail: camiling | office held by head of government | mayor
51 predict tail: hazel soan | instance of | human being
52 predict tail: efrain herrera | sport played | association football
53 predict tail: oluf munck | instance of | human being
54 predict tail: thomas gilchrist | instance of | human being
58 predict tail: desmoplastic fibroma | subclass of | fibroma


('count', 12)

In [419]:
%%html
print("<a href='your_url_here'>Showing Text</a>")


In [364]:
e2wdid['pakistan']

'Q4121082'

In [174]:
sequences = ['english', 'english language', 'french']
t = Trie(sequences)

In [178]:
t.get('x')

[]