In [2]:
import argparse
from typing import Dict
import logging
import torch
from torch import optim

from tkbc.datasets import TemporalDataset
from optimizers import TKBCOptimizer, IKBCOptimizer
from models import ComplEx, TComplEx, TNTComplEx
from regularizers import N3, Lambda3

import os
# Restrict CUDA device visibility for this process through environment variables.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",  # see issue #152
    "CUDA_VISIBLE_DEVICES": "3",
})


ImportError: cannot import name 'TKBCModel' from 'models' (unknown location)

In [None]:
# scratch list of pairs used by the following cells
a = [
    (1, 2),
    (3, 1),
    (2, 2),
]

In [100]:
# In-place sort: tuples compare element by element, so the pairs end up ordered
# by their first item and then by their second. The bare `a` displays the result.
a.sort()
a

[(1, 2), (2, 2), (3, 1)]

In [105]:
# second item of every pair, in the list's current order
[second for _first, second in a]

[2, 2, 1]

In [2]:
import argparse
from typing import Dict
import logging
import torch
from torch import optim
import pickle
import numpy as np

from datasets import TemporalDataset
from models import ComplEx, TComplEx, TNTComplEx
from regularizers import N3, Lambda3
from qa_models import QA_model

import utils

class Args:
    """Plain attribute container holding the run configuration.

    Of these settings, the visible cells read only ``dataset``, ``rank`` and
    ``no_time_emb`` (when building the TComplEx model); the remaining values
    are carried along unchanged.
    """

    # read when constructing the dataset / TComplEx model
    dataset = 'wikidata_small'
    rank = 256
    no_time_emb = False

    # not read by the visible cells
    model = 'TComplEx'
    max_epochs = 50
    valid_freq = 5
    batch_size = 512
    learning_rate = 0.01
    emb_reg = 0.01
    time_reg = 0.01
    
args = Args()

# Build the TComplEx model with the shape reported by the temporal dataset
# and move it to the GPU.
dataset = TemporalDataset(args.dataset)
sizes = dataset.get_shape()
tkbc_model = TComplEx(sizes, args.rank, no_time_emb=args.no_time_emb).cuda()
print('loading tkbc model')

# Restore the saved weights, then switch the model to evaluation mode.
# path = 'model2.ckpt'
path = 'model_tkbc_60kent.ckpt'
tkbc_model.load_state_dict(torch.load(path))
print('tkbc model loaded')
tkbc_model.eval()

fname = '/scratche/home/apoorv/tempqa/data/questions/questions_position_held_small_1_paraphrases_shuffled.pickle'

# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
# The context manager closes the file handle once the questions are loaded.
with open(fname, 'rb') as question_file:
    questions = pickle.load(question_file)

all_dicts = utils.getAllDicts()

# question type -> (prediction function, template selecting the questions to evaluate)
question_type_specs = {
    'predictHead': (utils.predictHead, 'Who was the {tail} in {time}?'),
    'predictTime': (utils.predictTime, 'When did {head} hold the position of {tail}?'),
}

k = 1 # hit at k
for question_type in ['predictHead', 'predictTime']:
    # the function/template pair only depends on the question type, so pick it once
    which_question_function, target_template = question_type_specs[question_type]
    correct_count = 0
    total_count = 0
    for question in questions:
        if question['template'] != target_template:
            continue
        total_count += 1
        predicted = which_question_function(question, tkbc_model, all_dicts, k)
        # a question counts as correct when any gold answer is among the predictions
        if question['answers'].intersection(predicted):
            correct_count += 1

    # avoid ZeroDivisionError when no question uses this template
    accuracy = correct_count / total_count if total_count else float('nan')
    print(question_type, correct_count, total_count, accuracy)

loading tkbc model
tkbc model loaded
predictHead 8408 8603 0.9773334883180286
predictTime 12224 12272 0.9960886571056062


In [34]:
from pathlib import Path
import pkg_resources
import pickle
from collections import defaultdict
from typing import Dict, Tuple, List

import numpy as np
import torch
# from qa_models import QA_model
import utils
from tqdm import tqdm
# WARNING: id 0 is used as the padding id, which can cause issues like the one seen in Tucker.
# However, since there are so many entities (and timestamps?), it may not pose a problem.

class QA_Dataset(object):
    """Question dataset that turns pickled question dicts into model-ready batches.

    Entities and timestamps share a single id space: ids below
    ``len(all_dicts['ent2id'])`` are entity ids, and a timestamp id ``t`` is
    represented as ``t + len(all_dicts['ent2id'])``.
    """

    def __init__(self,
                 filename='/scratche/home/apoorv/tempqa/data/questions/questions_position_held_small_with_paraphrases_v2_shuffled.pickle',
                 num_valid=500,
                 num_test=500):
        """
        :param filename: path to a pickled sequence of question dicts
        :param num_valid: number of leading questions used as the validation split
        :param num_test: number of questions right after the validation split used as the test split
        """
        # NOTE: pickle can execute arbitrary code while loading; only open trusted files.
        with open(filename, 'rb') as question_file:
            questions = pickle.load(question_file)
        self.all_dicts = utils.getAllDicts()
        self.valid = questions[:num_valid]
        self.test = questions[num_valid: num_valid + num_test]
        self.train = questions[num_valid + num_test:]
        print('Total questions = ', len(questions))

        # split name -> list of questions, used by get_batch
        self.data = {
            'valid': self.valid,
            'train': self.train,
            'test': self.test,
        }

    # todo: implement this
    def getOrderedEntities(self, question):
        """Return the question's entities as a list (no particular ordering is applied yet)."""
        return list(question['entities'])

    # todo: implement this
    def getOrderedTimes(self, question):
        """Return the question's times as a list (no particular ordering is applied yet)."""
        return list(question['times'])

    def entitiesToIds(self, entities):
        """Map each entity to its id through ``all_dicts['ent2id']``; raises KeyError for unknown entities."""
        ent2id = self.all_dicts['ent2id']
        return [ent2id[e] for e in entities]

    def idToEntTime(self, id):
        """Map an id from the combined entity/time id space back to its entity or timestamp.

        Time ids are stored shifted by the number of entities, so that shift is
        removed before the ``id2ts`` lookup.
        """
        if self.getIdType(id) == 'entity':
            return self.all_dicts['id2ent'][id]
        return self.all_dicts['id2ts'][id - len(self.all_dicts['ent2id'])]

    def getIdType(self, id):
        """Return 'entity' for ids below the number of entities, 'time' otherwise."""
        if id < len(self.all_dicts['ent2id']):
            return 'entity'
        return 'time'

    def getEntityToText(self, entity_wd_id):
        """Look up the text stored for an entity key in ``all_dicts['wd_id_to_text']``."""
        return self.all_dicts['wd_id_to_text'][entity_wd_id]

    def getEntityIdToText(self, id):
        """Return the text of the entity with the given entity id."""
        ent = self.all_dicts['id2ent'][id]
        return self.getEntityToText(ent)

    def timesToIds(self, times):
        """Map each time ``t`` to its (un-shifted) timestamp id; ``ts2id`` is keyed by ``(t, 0, 0)``."""
        ts2id = self.all_dicts['ts2id']
        return [ts2id[(t, 0, 0)] for t in times]

    # from pytorch Transformer:
    # If a BoolTensor is provided, the positions with the value of True will be ignored
    # while the position with the value of False will be unchanged.
    #
    # so we want to pad with True
    def padding_tensor(self, sequences):
        """
        Right-pad 1-D tensors with 0 up to the length of the longest one.

        :param sequences: non-empty list of 1-D tensors
        :return: ``(padded, mask)``; ``padded`` has shape (len(sequences), max_len) and
                 ``mask`` is a bool tensor of the same shape that is True on padding
                 and False on real positions
        :raises ValueError: if ``sequences`` is empty
        """
        if not sequences:
            raise ValueError('padding_tensor needs at least one sequence')
        num = len(sequences)
        max_len = max(s.size(0) for s in sequences)
        # same dtype/device as the inputs, zero-filled
        out_tensor = sequences[0].new_zeros((num, max_len))
        mask = torch.ones((num, max_len), dtype=torch.bool)  # fills with True
        for i, tensor in enumerate(sequences):
            length = tensor.size(0)
            out_tensor[i, :length] = tensor
            mask[i, :length] = False  # fills good area with False
        return out_tensor, mask

    def toOneHot(self, indices, vec_len):
        """Return a float vector of length ``vec_len`` that is 1 at every index in ``indices`` and 0 elsewhere.

        The vector lives on the GPU when CUDA is available and on the CPU otherwise.
        """
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        indices = torch.tensor(list(indices), dtype=torch.long, device=device)
        one_hot = torch.zeros(vec_len, dtype=torch.float, device=device)
        one_hot.scatter_(0, indices, 1)
        return one_hot

    def process_data(self, data):
        """Convert a list of question dicts into model inputs and targets.

        :param data: list of question dicts; reads the keys 'paraphrases', 'entities',
                     'times', 'answer_type' and 'answers'
        :return: ``(question_text, entities_times_padded, entities_times_padded_mask, answers_khot)``
                 where ``question_text`` is the first paraphrase of each question, the padded
                 ids/mask come from ``padding_tensor``, and ``answers_khot`` stacks one
                 ``toOneHot`` vector of length (#entities + #timestamps) per question
        """
        num_total_entities = len(self.all_dicts['ent2id'])
        num_total_times = len(self.all_dicts['ts2id'])

        question_text = []
        entity_time_ids = []
        answers_khot = []

        for question in data:
            question_text.append(question['paraphrases'][0])
            # question_text.append(question['template']) # todo: this is incorrect
            entity_ids = self.entitiesToIds(self.getOrderedEntities(question))
            # shift every time id past the entity ids so both share one id space
            time_ids = [t + num_total_entities
                        for t in self.timesToIds(self.getOrderedTimes(question))]
            # todo: maybe we want ordering as is in question? here entities first, time 2nd
            entity_time_ids.append(torch.tensor(entity_ids + time_ids, dtype=torch.long))
            if question['answer_type'] == 'entity':
                answers = self.entitiesToIds(question['answers'])
            else:
                # time answers get the same shift as time ids above
                answers = [x + num_total_entities for x in self.timesToIds(question['answers'])]
            answers_khot.append(self.toOneHot(answers, num_total_entities + num_total_times))

        entities_times_padded, entities_times_padded_mask = self.padding_tensor(entity_time_ids)
        answers_khot = torch.stack(answers_khot)

        return question_text, entities_times_padded, entities_times_padded_mask, answers_khot

    def get_batch(self, split='train', start_index=0, batch_size=50):
        """Process the slice ``[start_index, start_index + batch_size)`` of the given split."""
        # just example
        return self.process_data(self.data[split][start_index: start_index + batch_size])


In [4]:
import math
import torch
from torch import nn
import numpy as np
from models import TComplEx
from sentence_transformers import SentenceTransformer


# training data: questions
# model:
# 1. tkbc model embeddings (may or may not be frozen)
# 2. question sentence embeddings (may or may not be frozen)
# 3. linear layer to project the question embeddings (unfrozen)
# 4. transformer that takes these embeddings (unfrozen); they are concatenated along a dimension, and it also takes a mask
# 5. average the output embeddings of the transformer, or take the last token embedding?
# 6. linear projection of this embedding to the tkbc embedding dimension
# 7. score against all possible entities/times and apply a sigmoid
# or
# 7. directly project to dimension num_entity + num_time and apply a sigmoid
# 8. BCE loss (multiple correct answers are possible)


class QA_model(nn.Module):
    """Transformer QA model that scores every entity and timestamp as a candidate answer.

    A sentence embedding of the question is projected to the transformer width and
    prepended to the (frozen) embeddings of the question's entities/timestamps; the
    transformer output is summed over positions and compared, by dot product, with
    every row of the combined entity/time embedding table.
    """

    def __init__(self, tkbc_model):
        """
        :param tkbc_model: model exposing ``embeddings[0]`` (used as the entity table) and
                           ``embeddings[2]`` (used as the time table); their weights are
                           copied into a frozen combined embedding, entities first
        """
        super().__init__()
        self.st_model = SentenceTransformer('distilbert-base-nli-mean-tokens')
        self.tkbc_embedding_dim = tkbc_model.embeddings[0].weight.shape[1]
        self.sentence_embedding_dim = 768 # hardwired from sentence_transformers?

        # transformer
        self.transformer_dim = self.tkbc_embedding_dim # keeping same so no need to project embeddings
        self.nhead = 8
        self.num_layers = 6
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=self.transformer_dim, nhead=self.nhead)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=self.num_layers)

        self.project_sentence_to_transformer_dim = nn.Linear(self.sentence_embedding_dim, self.transformer_dim)
        # not needed:
        # self.project_tkbc_to_transformer_dim = nn.Linear(self.tkbc_embedding_dim, self.transformer_dim)

        # creating combined embedding of time and entities (entities come first)
        num_entities = tkbc_model.embeddings[0].weight.shape[0]
        num_times = tkbc_model.embeddings[2].weight.shape[0]
        ent_emb_matrix = tkbc_model.embeddings[0].weight.data
        time_emb_matrix = tkbc_model.embeddings[2].weight.data
        full_embed_matrix = torch.cat([ent_emb_matrix, time_emb_matrix], dim=0)
        self.entity_time_embedding = nn.Embedding(num_entities + num_times, self.tkbc_embedding_dim)
        self.entity_time_embedding.weight.data.copy_(full_embed_matrix)
        # the copied TKBC embeddings stay frozen
        self.entity_time_embedding.weight.requires_grad = False
        self.loss = nn.BCEWithLogitsLoss(reduction='mean')
#         self.final_linear = nn.Linear(self.transformer_dim, num_entities + num_times)

    def forward(self, question_text, entities_times_padded, entities_times_padded_mask):
        """Score all entities/timestamps for a batch of questions.

        :param question_text: list of question strings (its length is the batch size)
        :param entities_times_padded: long tensor (batch, max_len) of combined entity/time ids
        :param entities_times_padded_mask: bool tensor (batch, max_len), True on padded positions
        :return: tensor (batch, num_entities + num_times) of raw scores (no sigmoid applied)
        """
        # follow the device the module lives on instead of assuming CUDA
        device = self.entity_time_embedding.weight.device
        entity_time_embedding = self.entity_time_embedding(entities_times_padded)
        question_embedding = torch.from_numpy(self.st_model.encode(question_text)).to(device)
        question_embedding = self.project_sentence_to_transformer_dim(question_embedding)
        # (batch, dim) -> (batch, 1, dim) so it can be prepended as the first position
        question_embedding = question_embedding.unsqueeze(1)
        sequence = torch.cat([question_embedding, entity_time_embedding], dim=1)
        # the encoder is fed (seq, batch, dim)
        sequence = torch.transpose(sequence, 0, 1)
        batch_size = len(question_text)
        # the question position is never padding, so its mask column is all False
        false_vector = torch.zeros((batch_size, 1), dtype=torch.bool,
                                   device=entities_times_padded_mask.device)
        mask = torch.cat([false_vector, entities_times_padded_mask], dim=1)
        output = self.transformer_encoder(sequence, src_key_padding_mask=mask)
        output = torch.transpose(output, 0, 1)
        # summing token embeddings
        # NOTE(review): the sum runs over every position, padded ones included - confirm this is intended
        output = torch.sum(output, dim=1)
        # now we can either project output to final dim, or we can take dot-product with
        # entity/time embedding weight matrix
        scores = torch.matmul(output, self.entity_time_embedding.weight.data.T)
#         scores = self.final_linear(output)
        # scores = torch.sigmoid(scores)
        return scores
        


In [47]:
def getAnswersFromScores(scores, dataset, k=10):
    """Return readable answers for the highest-scoring ids of the first row of ``scores``.

    :param scores: 2-D tensor of scores over the combined entity/time id space;
                   only row 0 is decoded
    :param dataset: object providing ``getIdType``, ``getEntityIdToText`` and
                    ``all_dicts`` (with 'ent2id' and 'id2ts')
    :param k: number of answers to return; clamped to the number of candidates
    :return: list of answers, best first - entity text for entity ids, and the first
             component of the timestamp tuple for time ids
    """
    # never ask topk for more candidates than there are columns
    k = min(k, scores.size(-1))
    _, ind = torch.topk(scores, k, largest=True)
    num_entities = len(dataset.all_dicts['ent2id'])
    answers = []
    for a_id in ind[0].tolist():
        if dataset.getIdType(a_id) == 'entity':
            answers.append(dataset.getEntityIdToText(a_id))
        else:
            # time ids are shifted by the number of entities in the combined space
            timestamp = dataset.all_dicts['id2ts'][a_id - num_entities]
            answers.append(timestamp[0])
    return answers
    

In [36]:
# learning rate handed to the Adam optimizer of the QA model
lr = 1e-2

In [86]:
# Build the QA dataset from the single-paraphrase question file.
# Note: this rebinds `dataset`, which held the TemporalDataset in the TKBC cell;
# QA_Dataset prints the total number of questions while it loads.
questions_file = '/scratche/home/apoorv/tempqa/data/questions/questions_position_held_small_1_paraphrases_shuffled.pickle'
dataset = QA_Dataset(questions_file)


Total questions =  57954


In [87]:
# Rebuild the QA model around the loaded TKBC model and restore its checkpoint.
path = 'qa_model_frozen_3.ckpt'
qa_model = QA_model(tkbc_model)
qa_model.load_state_dict(torch.load(path))
print('Loaded qa model from ', path)

qa_model = qa_model.cuda()
# Only parameters that still require gradients are handed to the optimizer.
optimizer = torch.optim.Adam(
    [p for p in qa_model.parameters() if p.requires_grad],
    lr=lr,
)
optimizer.zero_grad()

Loaded qa model from  qa_model_frozen_3.ckpt


In [10]:
batch_size = 100
# Integer division: a trailing partial batch is not visited.
num_batches = len(dataset.train) // batch_size
qa_model.train()
for epoch in range(1):
    epoch_loss = 0  # sum of the per-batch mean losses
    for i in range(num_batches):
        qa_model.zero_grad()
        question_text, entities_times_padded, entities_times_padded_mask, answers_khot = dataset.get_batch(
            split='train',
            start_index=i * batch_size,
            batch_size=batch_size)
        # call the module itself (not .forward) so that registered hooks run
        scores = qa_model(question_text, entities_times_padded.cuda(), entities_times_padded_mask.cuda())
        loss = qa_model.loss(scores, answers_khot.cuda())
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print('epoch loss', epoch_loss)

epoch loss 1.3238293676404282


In [88]:
# Score a single question from the chosen split in evaluation mode.
qa_model.eval()
start_index = 0
split = 'valid'

question_text, entities_times_padded, entities_times_padded_mask, answers_khot = dataset.get_batch(
    split=split,
    start_index=start_index,
    batch_size=1)
# Inference only: no_grad avoids building an autograd graph for scores/loss,
# and calling the module (not .forward) lets registered hooks run.
with torch.no_grad():
    scores = qa_model(question_text, entities_times_padded.cuda(), entities_times_padded_mask.cuda())
    loss = qa_model.loss(scores, answers_khot.cuda())


In [89]:
# BCE-with-logits loss of the single example scored in the previous cell
loss

tensor(0.0015, device='cuda:0', grad_fn=<BinaryCrossEntropyWithLogitsBackward>)

In [90]:
# the question string(s) that were fed to the model (first paraphrase of each question)
question_text

['When was the first time that Henry George Boldero was the Member of the 14th Parliament of the United Kingdom?']

In [92]:
# raw question dict at the same index of the validation split
dataset.data['valid'][start_index]

{'question': 'When was the first time that Q5721867 was the Q41582553?',
 'answers': {1841},
 'answer_type': 'time',
 'template': 'When was the {adj} time that {head} was the {tail}?',
 'entities': {'Q41582553', 'Q5721867'},
 'times': set(),
 'relations': {'P39'},
 'paraphrases': ['When was the first time that Henry George Boldero was the Member of the 14th Parliament of the United Kingdom?']}

In [93]:
# position of the largest entry in the first target vector (id in the combined entity/time space)
answers_khot[0].argmax()

tensor(61071, device='cuda:0')

In [94]:
# ten highest scores and their ids; show the ids for the first (only) question
val, ind = scores.topk(10, largest=True)
ind[0]

tensor([26196, 35754, 13940, 61204, 23946, 61226, 61228, 61235, 61231, 61236],
       device='cuda:0')

In [95]:
# decode the top-scoring ids into entity text / timestamp values
getAnswersFromScores(scores, dataset)

['Ann Taylor, Baroness Taylor of Bolton',
 'Porter Goss',
 'John Hume',
 1974,
 'Bill Richardson',
 1996,
 1998,
 2005,
 2001,
 2006]

In [96]:
# number of rows in `scores` (one per question in the batch)
scores.shape[0]

1

In [441]:
# total of all entries in the target tensor, i.e. how many answers are marked
answers_khot.sum()

tensor(1.)

In [377]:
# number of entries in the id -> entity mapping
len(all_dicts['id2ent'])

21322