In [1]:
import argparse
import logging
import pickle
import re
from collections import OrderedDict
from collections import defaultdict
from datetime import datetime
from typing import Dict

import numpy as np
import torch
from torch import optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

import utils
from qa_datasets import QA_Dataset, QA_Dataset_model1, QA_Dataset_EaE, QA_Dataset_EmbedKGQA, QA_Dataset_EaE_replace
from qa_models import QA_model, QA_model_Only_Embeddings, QA_model_BERT, QA_model_EaE, QA_model_EmbedKGQA, QA_model_EaE_replace, QA_model_EmbedKGQA_complex
from utils import loadTkbcModel, loadTkbcModel_complex


In [2]:
import os
# Pin this kernel to one GPU: enumerate devices in PCI bus order and expose
# only device 0 to CUDA.
# NOTE(review): these variables are set after `torch` was imported in the first
# cell; presumably that still works because no CUDA call has happened yet — confirm.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="0"


In [5]:
# Dataset and KG-embedding checkpoint used throughout the notebook; the
# checkpoint is resolved to models/<dataset_name>/kg_embeddings/<tkbc_model_file>.
dataset_name = 'wikidata_big'
tkbc_model_file = 'tkbc_model_17dec.ckpt'
# `tkbc_model` is later handed to the QA model constructor.
tkbc_model = loadTkbcModel_complex('models/{dataset_name}/kg_embeddings/{tkbc_model_file}'.format(
    dataset_name = dataset_name, tkbc_model_file=tkbc_model_file
))

Loading complex tkbc model from models/wikidata_big/kg_embeddings/tkbc_model_17dec.ckpt
Number ent,rel,ts from loaded model: 125726 406 9621
Loaded complex tkbc model


In [6]:
class Args:
    """Bare attribute holder passed as ``args`` when constructing the QA model."""
    # NOTE(review): presumably freezes the language-model weights — confirm in qa_models.
    lm_frozen = 1
    # NOTE(review): presumably freezes the pretrained KG embeddings — confirm in qa_models.
    frozen = 1
def openFileAsDict(filename):
    """Read a tab-separated file into a dict mapping column 0 to column 1.

    Args:
        filename: Path of a text file with one ``key<TAB>value`` pair per line.

    Returns:
        dict: ``{first_column: second_column}``. If a key occurs more than
        once, the last occurrence wins. Columns after the second are ignored.

    Raises:
        IndexError: If a non-empty line contains no tab separator.
    """
    out = {}
    # The context manager closes the handle even if a line is malformed.
    with open(filename, 'r') as f:
        for line in f:
            # Drop only the line terminator: values may consist of whitespace,
            # so strip() is not an option, and the last line may have no
            # trailing newline, so blindly cutting one character is wrong.
            if line.endswith('\n'):
                line = line[:-1]
            if not line:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            fields = line.split('\t')
            out[fields[0]] = fields[1]
    return out

def convertToDataPoint(question_text, entities, times, answer_type='entity', answers=None, id2ent=None):
    """Build a question dict in the layout of the QA dataset entries.

    Args:
        question_text: Question in which entities appear as ids (e.g. ``Q1487425``).
        entities: Iterable of entity ids mentioned in ``question_text``.
        times: Iterable of time values mentioned in the question.
        answer_type: Stored verbatim under ``'answer_type'``.
        answers: Iterable of gold answers. When omitted, the placeholder
            ``{'Q888504'}`` is stored, which is what this helper always did
            before ``answers`` was honoured.
            NOTE(review): presumably downstream preparation needs a non-empty
            answer set — confirm whether the placeholder is still required.
        id2ent: Optional mapping from entity id to text label. When omitted it
            is loaded from ``data/wikidata_big/kg/wd_id2entity_text.txt``;
            pass a preloaded mapping to avoid re-reading that file per call.

    Returns:
        dict with keys ``question``, ``answers``, ``answer_type``, ``entities``,
        ``times`` and ``paraphrases`` (a one-element list holding the question
        with every entity id replaced by its label).

    Raises:
        KeyError: If an id in ``entities`` is missing from ``id2ent``.
    """
    # A None sentinel avoids sharing one mutable default set between calls.
    if answers is None:
        answers = ['Q888504']
    if id2ent is None:
        entFile = 'data/wikidata_big/kg/wd_id2entity_text.txt'
        id2ent = openFileAsDict(entFile)

    entity_ids = set(entities)
    # Resolve every label up front so an unknown id fails fast.
    labels = {e: id2ent[e] for e in entity_ids}

    paraphrase = question_text
    if labels:
        # Replace whole ids only, in a single pass: chained str.replace would
        # rewrite 'Q1' inside 'Q12' and could also touch text that an earlier
        # substitution inserted. Longest ids come first in the alternation.
        alternatives = '|'.join(
            re.escape(e) for e in sorted(labels, key=len, reverse=True)
        )
        id_pattern = re.compile(r'(?<!\w)(?:' + alternatives + r')(?!\w)')
        paraphrase = id_pattern.sub(lambda m: labels[m.group(0)], question_text)

    question = {}
    question['question'] = question_text
    question['answers'] = set(answers)
    question['answer_type'] = answer_type
    question['entities'] = entity_ids
    question['times'] = set(times)
    question['paraphrases'] = [paraphrase]
    return question
    


In [7]:
# Notebook-level entity id -> text mapping, used for ad-hoc label lookups in later cells.
entFile = 'data/wikidata_big/kg/wd_id2entity_text.txt'
id2ent = openFileAsDict(entFile)

In [10]:
# Build the QA model on top of the loaded KG embeddings and restore its
# trained weights from models/<dataset_name>/qa_models/<model_file>.ckpt.
args = Args()
qa_model = QA_model_EmbedKGQA(tkbc_model, args)
filename = 'models/{dataset_name}/qa_models/{model_file}.ckpt'.format(
    dataset_name=dataset_name,
    model_file='embedkgqa_dual_frozen_lm_fix_order_ce'
)
print('Loading model from', filename)
# Deserialize onto the CPU first: without map_location, torch.load restores
# tensors to the device they were saved from, which fails when that GPU index
# is not visible here (only device 0 is exposed to this kernel).
qa_model.load_state_dict(torch.load(filename, map_location='cpu'))
print('Loaded qa model from ', filename)
# Move the fully initialised model to the GPU in one step.
qa_model = qa_model.cuda()



  0%|          | 0/267967963 [00:00<?, ?B/s][A[ACouldn't reach server at 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-pytorch_model.bin' to download pretrained weights.


OSError: [Errno 28] No space left on device

In [None]:
# Two independent copies of the validation split: `valid_dataset` has its
# `data` / `prepared_data` overwritten with ad-hoc questions (and is mutated by
# predict()), while `original_dataset` is left untouched for inspection.
valid_dataset = QA_Dataset_EmbedKGQA(split='valid', dataset_name=dataset_name)
original_dataset = QA_Dataset_EmbedKGQA(split='valid', dataset_name=dataset_name)

In [None]:
def predict(qa_model, dataset, batch_size = 128, split='valid', k=10, ids=(0,)):
    """Run the QA model on selected questions and print its top answers.

    For every selected question this prints the first paraphrase, the raw
    question, the top-5 predicted answers with their softmax scores, and the
    actual answers. Nothing is returned.

    Side effect: ``dataset.data`` and ``dataset.prepared_data`` are replaced
    in place by the subset selected through ``ids``, so indices passed to a
    later call refer to the already-reduced dataset.

    Args:
        qa_model: Model exposing ``eval()``, ``forward(batch)`` and
            ``loss(scores, targets)``.
        dataset: Dataset exposing ``data``, ``prepared_data``, ``_collate_fn``,
            ``getAnswersFromScoresWithScores`` and ``getEntityToText``.
        batch_size: Batch size for the evaluation ``DataLoader``.
        split: Name of the split; only used in the printed/logged header.
        k: Currently unused; kept because callers pass it by name. The number
            of reported answers is fixed by ``k_list`` below.
        ids: Indices of the questions to evaluate. Defaults to the first
            question only, which was previously hard-coded.

    Raises:
        IndexError: If an index in ``ids`` is out of range for the dataset.
    """
    num_workers = 4
    qa_model.eval()
    # NOTE(review): eval_log is filled but never returned or printed.
    eval_log = []
    # k_list = [1, 3, 10]
    # k_list = [1, 10]
    k_list = [1, 5]
    max_k = max(k_list)
    eval_log.append("Split %s" % (split))
    print('Evaluating split', split)

    # Restrict the dataset to the requested questions. The loop variable is
    # deliberately not called `k`, so the `k` parameter is no longer clobbered.
    ids = list(ids)
    dataset.prepared_data = {
        field: [values[i] for i in ids]
        for field, values in dataset.prepared_data.items()
    }
    dataset.data = [dataset.data[i] for i in ids]

    # dataset.print_prepared_data()

    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False,
                            num_workers=num_workers, collate_fn=dataset._collate_fn)
    topk_answers = []
    topk_scores = []
    total_loss = 0
    loader = tqdm(data_loader, total=len(data_loader), unit="batches")

    # Built once instead of once per batch.
    softmax = torch.nn.Softmax(dim=1)

    # Pure inference: skip autograd bookkeeping.
    with torch.no_grad():
        for i_batch, batch in enumerate(loader):
            # if size of split is multiple of batch size, we need this
            # todo: is there a more elegant way?
            if i_batch * batch_size == len(dataset.data):
                break
            answers_khot = batch[-1] # last one assumed to be target
            scores = softmax(qa_model.forward(batch))
            # scores = torch.nn.functional.normalize(scores, p=2, dim=1)

            for s in scores:
                pred_s, pred = dataset.getAnswersFromScoresWithScores(s, k=max_k)
                topk_answers.append(pred)
                topk_scores.append(pred_s)
            # Put the targets on whatever device the model produced scores on
            # instead of assuming CUDA.
            # NOTE(review): the loss is computed on softmax-normalised scores; if
            # qa_model.loss expects raw logits this number is off — confirm.
            loss = qa_model.loss(scores, answers_khot.to(scores.device))
            total_loss += loss.item()
    eval_log.append('Loss %f' % total_loss)
    eval_log.append('Eval batch size %d' % batch_size)

    def to_text(answer):
        # TODO: hack to check whether entity or time predicted
        if 'Q' in str(answer):
            return dataset.getEntityToText(answer)
        return answer

    for i, question in enumerate(dataset.data):
        predicted_answers = topk_answers[i]
        predicted_scores = topk_scores[i]
        actual_answers = question['answers']

        if question['answer_type'] == 'entity':
            predicted_answers = [to_text(a) for a in predicted_answers]
            # Convert ids to labels exactly once. Converting twice sent any
            # label that happens to contain 'Q' back through the id lookup.
            actual_answers = [to_text(a) for a in actual_answers]

        print(question['paraphrases'][0])
        print(question['question'])
        answers_with_scores_text = [
            '{answer} ({score})'.format(answer = pa, score=ps)
            for pa, ps in zip(predicted_answers, predicted_scores)
        ]
        print('Predicted:', ', '.join(answers_with_scores_text))
        print('Actual:', ', '.join([str(x) for x in actual_answers]))
        print()
    

In [None]:
def getEntities(question_text):
    """Extract Wikidata-style entity ids (``Q`` followed by digits) from a question.

    Ids are returned in order of appearance, duplicates included. Only
    stand-alone ids match, so ordinary words starting with ``Q`` are ignored
    and adjacent punctuation (``Q42?``) is not part of the id. Empty text or
    repeated spaces simply yield no match instead of raising.

    Args:
        question_text: Question string with entities written as ids.

    Returns:
        list[str]: The entity ids found, possibly empty.
    """
    # TODO: hack — assumes every entity id has the form Q<digits>.
    return re.findall(r'(?<!\w)Q\d+(?!\w)', question_text)

In [None]:
# Ask the model a hand-written question: entities are written as ids inside the text.
question_text = 'What is the name of the first team that Q1487425 was part of'
entities = getEntities(question_text)
times = []  # this question carries no time annotations
dataPoint = convertToDataPoint(question_text, entities, times)
data = [dataPoint]
# Replace the contents of valid_dataset with this single data point and
# regenerate the prepared fields from it.
valid_dataset.data = data
valid_dataset.prepared_data = valid_dataset.prepare_data(data)

predict(qa_model, valid_dataset)

In [None]:
# Look up the text label stored for entity id Q1543.
id2ent['Q1543']

 90%|█████████ | 242308096/267967963 [01:16<00:04, 5946768.28B/s]

In [109]:
# Inspect a few raw validation questions from the untouched dataset copy.
original_dataset.data[7:10]

[{'question': 'What is the name of the last team that Q1487425 was part of',
  'answers': {'Q1543'},
  'answer_type': 'entity',
  'template': 'What is the name of the last team that {head} was part of',
  'entities': {'Q1487425'},
  'times': set(),
  'relations': {'P54'},
  'type': 'first_last',
  'annotation': {'head': 'Q1487425', 'adj': 'last'},
  'uniq_id': 47,
  'paraphrases': ['What is the name of the last team that Gianni Bui was part of']},
 {'question': 'Q1237590 received Q351723 in what year',
  'answers': {2004},
  'answer_type': 'time',
  'template': '{head} received {tail} in what year',
  'entities': {'Q1237590', 'Q351723'},
  'times': set(),
  'relations': {'P166'},
  'type': 'simple_time',
  'annotation': {'head': 'Q1237590', 'tail': 'Q351723'},
  'uniq_id': 23016,
  'paraphrases': ['Dominic Joyce received Adams Prize in what year']},
 {'question': 'Who played with Q1616158 on the Q8428',
  'answers': {'Q1612196',
   'Q1616158',
   'Q16268364',
   'Q257588',
   'Q2824948

NameError: name 'id2ent' is not defined

In [31]:
# Show the prepared fields currently held by valid_dataset.
valid_dataset.prepared_data

{'question_text': ['Name the award that Bobby Clarke first received'],
 'head': [120143],
 'tail': [120143],
 'time': [125726],
 'answers_arr': [[]]}

In [64]:
# List the field names present in the prepared data.
original_dataset.prepared_data.keys()

dict_keys(['question_text', 'head', 'tail', 'time', 'answers_arr'])

In [70]:
# First raw validation question, for comparison with its prepared fields.
original_dataset.data[0]

{'question': 'Name the award that Q888504 first received',
 'answers': {'Q795029'},
 'answer_type': 'entity',
 'template': 'Name the award that {head} first received',
 'entities': {'Q888504'},
 'times': set(),
 'relations': {'P166'},
 'type': 'first_last',
 'annotation': {'head': 'Q888504', 'adj': 'first'},
 'uniq_id': 23280,
 'paraphrases': ['Name the award that Bobby Clarke first received']}

In [66]:
# Prepared 'head' value of the first question.
original_dataset.prepared_data['head'][0]

120143

In [67]:
# Prepared 'tail' value of the first question.
original_dataset.prepared_data['tail'][0]

120143

In [68]:
# Prepared 'time' value of the first question.
# NOTE(review): the question has no times; presumably this is a padding/dummy index — confirm.
original_dataset.prepared_data['time'][0]

125726

In [69]:
# Prepared answer list of the first question.
original_dataset.prepared_data['answers_arr'][0]

[116438]