In [1]:
import pickle

In [2]:
import os
from tqdm import tqdm
from typing import Dict
from collections import defaultdict
import numpy as np

def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    return np.exp(x) / np.sum(np.exp(x), axis=0)

def numLines(fname):
    with open(fname) as f:
        for i, l in enumerate(f):
            pass
    return i + 1
def loadData(filename, max_points):
    file_len = numLines(filename)
    f = open(filename, 'r')
    inputs = []
    outputs = []
    for i in tqdm(range(file_len)):
        if i == max_points:
            break
        line = f.readline()
        if line[-1] == '\n':
            line = line[:-1]
        line = line.split('\t')
        inputs.append(line[0])
        outputs.append(line[1])
    data = {'inputs': inputs, 'outputs': outputs}
    return data
        
def load_entity_strings(filename):
    with open(filename) as f:
        lines = f.read().splitlines()
    return lines

def get_entity_wd_id_dict(filename):
    out = {}
    f = open(filename, 'r')
    for line in f:
        if line[-1] == '\n':
            line = line[:-1]
        line = line.split('\t')
        out[line[1]] = line[0]
    return out
    

def create_filter_dict(data) -> Dict[str, int]:
    filter_dict = defaultdict(list)
    for input, output in zip(data["inputs"], data["outputs"]):
        filter_dict[input].append(output)
    return filter_dict

def getAllFilteringEntities(input, filter_dicts):
    entities = []
    splits = ['train', 'test', 'valid']
    for s in splits:
        entities.extend(filter_dicts[s][input])
    return list(set(entities))

def wikidata_link_from_id(id):
    uri = 'https://www.wikidata.org/wiki/' + id
    return uri

In [3]:
dataset_name = 'MetaQA_half'
entity_strings = load_entity_strings(os.path.join("data", dataset_name, "entity_strings.txt"))

In [4]:
entity_strings_set = set(entity_strings)

In [5]:
data = {}
splits = ['train', 'valid', 'test']
dataset_name = 'MetaQA_half'
for split in splits:
    data[split] = loadData(os.path.join('data', dataset_name, split + '.txt'), -1)

100%|██████████████████████████████████████████████████████| 150160/150160 [00:00<00:00, 801905.64it/s]
100%|██████████████████████████████████████████████████████████| 8000/8000 [00:00<00:00, 509465.73it/s]
100%|██████████████████████████████████████████████████████████| 8000/8000 [00:00<00:00, 493027.01it/s]


In [6]:
filter_dicts = {}
splits = ['train', 'valid', 'test']
for split in splits:
    filter_dicts[split] = create_filter_dict(data[split])

In [7]:
def getHeadEntity(input):
    x = input.split(':')[1][1:]
    ent = x.split('|')[0][:-1]
    return ent

In [9]:
# fname = 'scores.pickle'
fname = 'scores/scores_3hop.pickle'
# fname = 'scores/scores_metaqa_2hop_test.pickle'
# fname = 'scores_500_base_trie.pickle'
scores_data = pickle.load(open(fname, 'rb'))

In [20]:
id = 3
scores_data['prediction_strings'][id]

['2013', '2003']

In [21]:
scores_data['target_strings'][id]

['1995',
 '1996',
 '1999',
 '1998',
 '1989',
 '2002',
 '2000',
 '2008',
 '2011',
 '2010']

In [15]:
scores_data['input_strings'][id]

'predict answer: Catch Me If You Can | the films that share directors with the film NE were in which languages |'

In [11]:
scores_data.keys()

dict_keys(['prediction_strings', 'scores', 'target_strings', 'input_strings'])

In [12]:
scores_data['input_strings'][0]

'predict answer: Catch Me If You Can | the films that share directors with the film NE were in which languages |'

In [136]:
scores_data['target_strings'][0]

['Before the Rain']

In [137]:
predictions_scores_dicts = []
for pred, score, input in tqdm(zip(scores_data['prediction_strings'], scores_data['scores'],
                           scores_data['input_strings'])):
    head = getHeadEntity(input)
    ps_dict_only_entities = {}
    if not isinstance(pred, list):
#         ps_pairs = [(pred, score)]
        ps_dict_only_entities[pred] = score
    else:
        ps_pairs = [(p, s.item()) for p, s in zip(pred, score)]
        # remove predictions that head entity
        for ps in ps_pairs:
            if ps[0] != head:
                ps_dict_only_entities[ps[0]] = ps[1]
    predictions_scores_dicts.append(ps_dict_only_entities)

9947it [00:00, 17260.05it/s]


In [138]:
predictions_scores_dicts

[{'The Man from Elysian Fields': -0.2834974527359009,
  'The Apparition': -0.670163631439209},
 {'The Inbetweeners 2': -0.38184690475463867,
  'The Inbetweeners Movie': -0.7623830437660217},
 {'Inspector Gadget': -0.25948861241340637,
  'Black Christmas': -0.2646867036819458},
 {'The Girl in the Red Velvet Swing': -0.28876930475234985,
  'Kiss and Make-Up': -0.33719396591186523},
 {'Jab We Met': -0.310052752494812, 'The Perfect Murder': -0.8263988494873047},
 {'Six Shooter': -0.4660927951335907, 'The General': -0.8173674941062927},
 {"The Adventure of Sherlock Holmes' Smarter Brother": -0.36984407901763916,
  'Murder at 1600': -0.7800700664520264},
 {'First Daughter': -1.0208216905593872, 'Firstborn': -1.6512309312820435},
 {'The Butterfly Effect 2': -0.541078507900238,
  "Valentine's Day": -0.6579490303993225},
 {'Plan 9 from Outer Space': -0.2834526598453522,
  'The Rainmaker': -0.6207447052001953},
 {'The Last of the High Kings': -0.2930285632610321,
  'After.Life': -0.6374243497848

In [139]:
predictions_scores_dicts[0]

{'The Man from Elysian Fields': -0.2834974527359009,
 'The Apparition': -0.670163631439209}

In [140]:
import numpy as np
# no filtering for QA
predictions_filtered = predictions_scores_dicts


In [141]:
count = {}
reciprocal_ranks = 0.0
k_list = [1,3,10]
for k in k_list:
    count[k] = 0
num_small_arrs = 0
for i in range(len(predictions_filtered)):
    targets = scores_data['target_strings'][i]
    ps_dict = predictions_filtered[i]
    ps_sorted = sorted(ps_dict.items(), key=lambda item: -item[1])
#     print(ps_dict)
    if len(ps_dict) == 0:
        preds = []
    else:
        preds = [x[0] for x in ps_sorted]
    pred = preds[0]
    if pred in targets:
        rank = preds.index(pred) + 1
        reciprocal_ranks += 1./rank
for k in k_list:
    hits_at_k = count[k]/len(predictions_filtered)
    print('hits@{}'.format(k), hits_at_k)
print('mrr', reciprocal_ranks/len(predictions_filtered))
print(num_small_arrs/len(predictions_filtered), 'were <10 length preds array without answer')

hits@1 0.0
hits@3 0.0
hits@10 0.0
mrr 0.5956569820046245
0.0 were <10 length preds array without answer


In [148]:
id = 3
inputs = scores_data['input_strings'][id]
preds = predictions_filtered[id]
preds = sorted(preds.items(), key=lambda item: -item[1])
target = scores_data['target_strings'][id]
print(inputs, 'Target:', target)
preds, target

predict answer: Helen Mack | what does NE star in | Target: ['The Son of Kong', 'Kiss and Make-Up', 'Divorce']


([('The Girl in the Red Velvet Swing', -0.28876930475234985),
  ('Kiss and Make-Up', -0.33719396591186523)],
 ['The Son of Kong', 'Kiss and Make-Up', 'Divorce'])

In [144]:
# see if answer is head entity
count = 0
for id in range(len(predictions_filtered)):
    input = scores_data['input_strings'][id]
    head = getHeadEntity(input)
    pred = scores_data['prediction_strings'][id]
    if head in pred:
        count += 1
        print(input)
count/len(predictions_filtered)

predict answer: Lemmy | what films did NE act in |
predict answer: Lemmy | who acted in the movie NE |


0.00020106564793405046

In [113]:
# only head/tails
count = 0
for id in range(60,120, 2):
    inputs = scores_data['input_strings'][id]
    preds = predictions_filtered[id]
    preds = sorted(preds.items(), key=lambda item: -item[1])
    target = scores_data['target_strings'][id]
    pred1 = preds[0][0]
    if pred1 == target:
        print(int(id/2), inputs, pred1)
        count += 1
'count', count

('count', 0)

In [419]:
%%html
print("<a href='your_url_here'>Showing Text</a>")


In [364]:
e2wdid['pakistan']

'Q4121082'

In [174]:
sequences = ['english', 'english language', 'french']
t = Trie(sequences)

In [178]:
t.get('x')

[]