In [1]:
import argparse
from typing import Dict
import logging
import torch
from torch import optim
import pickle
import numpy as np

from qa_models import QA_model, QA_model_Only_Embeddings, QA_model_BERT, QA_model_EaE, QA_model_EmbedKGQA, QA_model_EaE_replace, QA_model_EmbedKGQA_complex
from qa_datasets import QA_Dataset, QA_Dataset_model1, QA_Dataset_EaE, QA_Dataset_EmbedKGQA, QA_Dataset_EaE_replace
from torch.utils.data import Dataset, DataLoader
import utils
from tqdm import tqdm
from utils import loadTkbcModel, loadTkbcModel_complex
from collections import defaultdict
from datetime import datetime
from collections import OrderedDict


In [2]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="0"


In [3]:
dataset_name = 'wikidata_big'
tkbc_model_file = 'tkbc_model_17dec.ckpt'
tkbc_model = loadTkbcModel('models/{dataset_name}/kg_embeddings/{tkbc_model_file}'.format(
    dataset_name = dataset_name, tkbc_model_file=tkbc_model_file
))

Loading tkbc model from models/wikidata_big/kg_embeddings/tkbc_model_17dec.ckpt
Number ent,rel,ts from loaded model: 125726 406 9621
Loaded tkbc model


In [4]:
class Args:
    lm_frozen = 1
    frozen = 1
    multi_label=0
    combine_all_ents = 'None'
    attention = False


In [5]:
args = Args()
qa_model = QA_model_EmbedKGQA(tkbc_model, args)
filename = 'models/{dataset_name}/qa_models/{model_file}.ckpt'.format(
    dataset_name=dataset_name,
    model_file='cronkgqa_finalds2'
)
print('Loading model from', filename)
# add cpu thing here if no gpu available
qa_model.load_state_dict(torch.load(filename))
print('Loaded qa model from ', filename)
qa_model = qa_model.cuda()

Freezing LM params
Freezing entity/time embeddings
Loading model from models/wikidata_big/qa_models/cronkgqa_finalds2.ckpt
Loaded qa model from  models/wikidata_big/qa_models/cronkgqa_finalds2.ckpt


In [60]:
def makeProbeData(dataset):
    filtered_questions = []
    for q in dataset.data:
        question_type = q['type']
        answer_type = q['answer_type']
        entities = q['entities']
        paraphrases = q['paraphrases']
        if question_type == 'first_last' and answer_type == 'time' and len(entities) == 1:
            is_first = False
            for pp in paraphrases:
                if 'first' in pp:
                    is_first = True
                    break
            if is_first:
                filtered_questions.append(q)
    final_data = []
    for q in filtered_questions:
        rel = list(q['relations'])[0]
        ent = list(q['entities'])[0]
        answer = list(q['answers'])[0]
        item = {}
        item['relation_id'] = dataset.all_dicts['rel2id'][rel]
        item['entity_id'] = dataset.all_dicts['ent2id'][ent]
        item['time'] = int(answer)
        final_data.append(item)
    X_list = []
    y_list = []
    # do device cpu if no gpu
    device = torch.device('cuda:0')
    for item in final_data:
        relation_id = torch.tensor([item['relation_id']]).long().to(device)
        rel_vector = tkbc_model.embeddings[1](relation_id)[0].cpu().numpy()
        entity_id = torch.tensor([item['entity_id']]).long().to(device)
        ent_vector = tkbc_model.embeddings[0](entity_id)[0].cpu().numpy()
        input_vector = np.concatenate((rel_vector, ent_vector))
        X_list.append(input_vector)
        time = item['time']
        y = time
        y_list.append(y)
    X_data = np.array(X_list)
    y_data = np.array(y_list)
    
    return X_data, y_data


In [61]:
train = QA_Dataset_EmbedKGQA(split='train', dataset_name=dataset_name)
X_train, y_train = makeProbeData(train)
X_train.shape, y_train.shape

Total questions =  350000
Preparing data for split train


((10237, 1024), (10237,))

In [62]:
valid = QA_Dataset_EmbedKGQA(split='valid', dataset_name=dataset_name)
X_valid, y_valid = makeProbeData(valid)
X_valid.shape, y_valid.shape

Total questions =  30000
Preparing data for split valid


((1060, 1024), (1060,))

In [66]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
X, y = X_train, y_train
rf = RandomForestRegressor(n_estimators=500, oob_score=True, random_state=100, verbose=1)
reg = rf.fit(X,y)


In [67]:
import pickle
with open('probe_randomforest_regressor_fulltrain.pkl', 'wb') as fid:
    pickle.dump(reg, fid)    

In [81]:
reg2 = LinearRegression().fit(X_train, y_train)

In [94]:
id = 4
reg.predict([X_valid[id]]), y_valid[id]

(array([1975.49]), 1983)

In [105]:
from tqdm import tqdm
import math
exact = 0
error = 0
predictor = reg
predictions = []
for id in tqdm(range(len(X_valid))):
    pred = predictor.predict([X_valid[id]])[0]
    true = y_valid[id]
    predictions.append(pred)
    error += (pred - true)**2
math.sqrt(error/len(X_valid) )

100%|██████████| 1060/1060 [00:42<00:00, 24.70it/s]


77.49095063461692

In [106]:
num_correct = 0
total = len(X_valid)
gap = 3
for i in range(len(predictions)):
    pred = predictions[i]
    true = y_valid[i]
    range_low, range_high = true - gap, true + gap
    if pred >= range_low and pred <= range_high:
        num_correct += 1
num_correct/total

0.47924528301886793