In [53]:
from tqdm.notebook import tqdm
import pickle
import os
# GPU selection is configured before utils_accelerate is imported -- presumably
# because that import is what loads torch/CUDA, and these variables must be set
# before CUDA is initialised (TODO confirm). Devices are ordered by PCI bus id
# and only device 1 is made visible to this kernel.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="1"
# NOTE(review): names used below without an explicit import (torch, T5_Dataset,
# load_accelerator_model) presumably come from this star import; explicit
# imports would make their origin clear.
from utils_accelerate import *

In [52]:
def readLines(fname):
    """Read a text file and return its lines without trailing whitespace.

    Args:
        fname: Path of the file to read (opened in text mode with the
            platform default encoding).

    Returns:
        list[str]: One entry per line, in file order. Each line is passed
        through ``str.rstrip()``, so the newline and any other trailing
        whitespace (including trailing tabs) are removed.
    """
    # The context manager closes the handle even if reading fails part-way.
    with open(fname) as f:
        return [line.rstrip() for line in f]

def readTriples(fname):
    """Load tab-separated records from ``fname``.

    Every line returned by ``readLines`` is split on tab characters and the
    resulting field lists are returned in file order.
    """
    return [row.split('\t') for row in readLines(fname)]

def tripleToText(t, aliases_dict):
    """Translate every element of a triple into its alias text.

    Args:
        t: Iterable of identifiers to translate.
        aliases_dict: Mapping from identifier to alias text.

    Returns:
        list: The aliases, in the same order as ``t``. If any lookup fails,
        an error line is printed and ``t`` itself is returned unchanged
        (best-effort behaviour).
    """
    try:
        return [aliases_dict[x] for x in t]
    except (KeyError, TypeError):
        # KeyError: an identifier has no alias. TypeError: an element is
        # unhashable or t is not iterable. Unlike a bare ``except`` this does
        # not swallow KeyboardInterrupt / SystemExit.
        print('Error for ', t)
        return t

def writeLines(lines, fname):
    """Write each string in ``lines`` to ``fname``, one per line.

    Args:
        lines: Iterable of strings; a newline is appended to each.
        fname: Destination path. Opened in ``'w'`` mode, so an existing file
            is truncated.

    Returns:
        None
    """
    # ``with`` guarantees the file is flushed and closed even if a write fails.
    with open(fname, 'w') as f:
        f.writelines(line + '\n' for line in lines)

def writeTriples(triples, fname):
    """Write triples to ``fname`` as tab-separated lines.

    Args:
        triples: Iterable of string sequences; the elements of each one are
            joined with a tab and written on their own line.
        fname: Destination path. Opened in ``'w'`` mode, so an existing file
            is truncated.

    Returns:
        None
    """
    # ``with`` guarantees the file is flushed and closed even if a write fails.
    with open(fname, 'w') as f:
        f.writelines('\t'.join(t) + '\n' for t in triples)

In [4]:
# Checkpoint to evaluate; the path is resolved under the models/ directory.
ckpt = 'wd5m-sentencepiece-rp/530000.pt'
# Only `model` is referenced by the cells shown below; the optimizer and the
# two trailing return values are not used again in this part of the notebook.
model, optimizer, _, _ = load_accelerator_model('models/{}'.format(ckpt))

Model args
Namespace(batch_size=100, dataset='wikidata5m', epochs=15, learning_rate=None, load_checkpoint='wd5m-sentencepiece-rp/75000.pt', loss_steps=250, max_checkpoints=5, model_size='t5-small', num_workers=3, optimizer='adafactor', relation_prediction=1, resume=None, save_prefix='wd5m-sentencepiece-rp', save_steps=5000, start_steps=0, tokenizer='sentencepiece')
Vocab size is 30000


100%|██████████████████████████████████████████████████████| 10714/10714 [00:00<00:00, 507702.61it/s]


In [50]:
# Move the model to the GPU; the scoring cells below send their inputs to CUDA.
model.cuda()
# Trailing literal so the cell displays `1` -- presumably to avoid showing the
# return value of model.cuda() as the cell output.
1

1

In [8]:
# Dataset object built for the 'valid' split; in the cells shown here it is
# only used for its tokenizer (dataset.tokenizer).
dataset = T5_Dataset('valid', dataset_name='wikidata5m', tokenizer_type='sentencepiece')

Vocab size is 30000


100%|██████████████████████████████████████████████████████| 10714/10714 [00:00<00:00, 686240.50it/s]


In [54]:
# Build the id -> alias lookup used to turn identifiers into text.
# Each line of aliases.txt is tab-separated: field 0 is the id and field 1 is
# the alias that is kept (any further fields are ignored).
fname = '/scratche/home/apoorv/transformer-kgc/data/wikidata5m/aliases.txt'
lines = readLines(fname)
aliases_dict = {}
for line in tqdm(lines):
    fields = line.split('\t')
    if len(fields) < 2:
        # Id with no alias on its line: report it and move on. An explicit
        # check replaces the former bare `except`, which would also have
        # hidden unrelated errors and keyboard interrupts.
        print(fields)
        continue
    aliases_dict[fields[0]] = fields[1]

  0%|          | 0/4819507 [00:00<?, ?it/s]

['Q11159396']


In [55]:
def getTripleScore(model, triple, aliases_dict, direction='tail'):
    """Score a triple by the log-likelihood of its tail text.

    The prompt ``'predict tail: <head> | <relation> |'`` is fed to the model
    with the tokenized tail alias as labels, and the log-probabilities the
    model assigns to the target tokens are summed.

    Args:
        model: Model called as ``model(input_ids=..., labels=...)`` whose
            output exposes ``.logits`` with the vocabulary on the last
            (third) dimension.
        triple: Sequence of identifiers; elements 0, 1 and 2 are used as
            head, relation and tail. All must be keys of ``aliases_dict``.
        aliases_dict: Mapping from identifier to text.
        direction: Currently ignored -- the tail-prediction prompt is always
            built. Kept so existing callers keep working.

    Returns:
        float: Sum of the per-token log-probabilities of the target text
        (values closer to 0 mean the model finds the tail more likely).

    Raises:
        KeyError: If an identifier in ``triple`` is missing from
            ``aliases_dict``.

    Note:
        The tokenizer is taken from the notebook-level ``dataset`` object.
    """
    t = [aliases_dict[x] for x in triple]
    input_text = 'predict tail: {0} | {1} |'.format(t[0], t[1])
    target_text = t[2]
    model.eval()
    # Follow the model's device instead of assuming CUDA.
    device = next(model.parameters()).device
    inputs_tokenized = dataset.tokenizer([input_text], padding=True, truncation=True, return_tensors="pt")
    input_ids = inputs_tokenized.input_ids.to(device)
    target_tokenized = dataset.tokenizer([target_text], padding=True, truncation=True, return_tensors="pt")
    target_ids = target_tokenized.input_ids.to(device)
    # Scoring needs no gradients; without no_grad every call would build and
    # hold an autograd graph.
    with torch.no_grad():
        outputs = model(input_ids=input_ids, labels=target_ids)
        # Index the single batch element explicitly: squeeze() would also
        # collapse the sequence dimension when the target is one token long.
        log_probs = torch.log_softmax(outputs.logits, dim=2)[0]
    # Pick, for each target position, the log-probability of the target token.
    token_log_probs = log_probs.gather(1, target_ids[0].unsqueeze(1)).squeeze(1)
    # Accumulate as Python floats in sequence order, as the original loop did.
    return sum(token_log_probs.tolist())

In [56]:
# Load the validation and test triples (tab-separated fields per line).
# NOTE(review): absolute, user-specific paths; `fname` is reused as a scratch
# variable across cells.
fname = '/scratche/home/apoorv/repos/probe_kge/data/wikidata5m/valid.txt'
valid = readTriples(fname)
fname = '/scratche/home/apoorv/repos/probe_kge/data/wikidata5m/test.txt'
test = readTriples(fname)

In [57]:
# Load the companion "_neg" triple files -- presumably negative (corrupted)
# counterparts of the valid/test triples, judging by the file names.
fname = '/scratche/home/apoorv/repos/probe_kge/wd5m_valid_neg.txt'
valid_neg = readTriples(fname)
fname = '/scratche/home/apoorv/repos/probe_kge/wd5m_test_neg.txt'
test_neg = readTriples(fname)

In [72]:
# Score every triple in the chosen set, one model call per triple.
# Change `triples_to_score` to score a different set; the pickle file name in
# the next cell is hard-coded separately and must be kept in sync by hand.
triples_to_score = test_neg
scores_list = []
for t in tqdm(triples_to_score):
    score = getTripleScore(model, t, aliases_dict)
    # One record per triple: the raw ids plus the score returned for them.
    item = {}
    item['triple'] = t
    item['score'] = score
    scores_list.append(item)

  0%|          | 0/9789 [00:00<?, ?it/s]

In [73]:
import pickle
# Persist the scored triples. The file name is hard-coded and should match the
# set chosen in `triples_to_score` in the scoring cell.
fname = 'wd5m_t5_test_neg_scores.pickle'
# `with` closes the file deterministically so the pickle is fully flushed to
# disk; the previous inline open() left the handle to the garbage collector.
with open(fname, 'wb') as f:
    pickle.dump(scores_list, f)

In [74]:
# Show the first ten records as a quick sanity check.
scores_list[:10]

[{'triple': ['Q268615', 'P27', 'Q851'], 'score': -1.576493263244629},
 {'triple': ['Q49452', 'P27', 'Q30'], 'score': -4.784881591796875},
 {'triple': ['Q1452648', 'P509', 'Q11081'], 'score': -7.18287992477417},
 {'triple': ['Q19199', 'P106', 'Q177220'], 'score': -2.7694578170776367},
 {'triple': ['Q104049', 'P1412', 'Q9288'], 'score': -6.375690460205078},
 {'triple': ['Q187884', 'P27', 'Q29999'], 'score': -1.4096927642822266},
 {'triple': ['Q436693', 'P20', 'Q60'], 'score': -3.654574394226074},
 {'triple': ['Q315391', 'P20', 'Q220'], 'score': -5.577649116516113},
 {'triple': ['Q47878', 'P27', 'Q183'], 'score': -7.386842727661133},
 {'triple': ['Q168821', 'P495', 'Q40'], 'score': -5.70074462890625}]

In [32]:
# Hand-written example used to step through the scoring logic manually.
# The prompt uses the same 'predict tail: <head> | <relation> |' layout that
# getTripleScore builds.
input_text = 'predict tail: barack obama | coo |'
target_text = 'united states of america'
model.eval()
# Tokenize prompt and target as single-element batches of PyTorch tensors.
inputs_tokenized = dataset.tokenizer([input_text], padding=True, truncation=True, return_tensors="pt")
input_ids = inputs_tokenized.input_ids
target_tokenized = dataset.tokenizer([target_text], padding=True, truncation=True, return_tensors="pt")
target_ids = target_tokenized.input_ids

In [33]:
# Forward pass with the target supplied as labels. It is not wrapped in
# torch.no_grad(), so the results carry autograd history (note the grad_fn in
# the tensor printed further down).
outputs = model(input_ids = input_ids.cuda(),
               labels = target_ids.cuda())

In [43]:
# Log-probabilities over dimension 2 of the logits; squeeze() then drops every
# size-1 dimension (here presumably the batch dimension -- TODO confirm).
logits = torch.log_softmax(outputs.logits, dim=2).squeeze()

In [45]:
# NOTE(review): exploratory first attempt, superseded by the following cell.
# target_ids is not squeezed here, so the loop runs over its leading dimension
# (a single iteration, as the one printed tensor shows): `x` is the whole row
# of target token ids and logits[0][x] reads position 0 for all of them rather
# than one log-probability per position. `score` is initialised but never
# updated.
score = 0
for i, x in enumerate(target_ids):
    s = logits[i][x]
    print(s)

tensor([ -0.0375, -10.0608, -14.5330, -12.3434, -13.9325], device='cuda:0',
       grad_fn=<IndexBackward>)


In [48]:
# Sum, over target positions, the log-probability assigned to the target token
# at that position -- the same accumulation getTripleScore performs.
score = 0
for i, x in enumerate(target_ids.squeeze()):
    s = logits[i][x]
    score += s.item()
score

-0.05280494689941406

In [15]:
# Let the model generate its own completion for the same prompt, using the
# tokenizer's EOS token id as the end-of-sequence marker.
output_ids = model.generate(input_ids = input_ids.cuda(),
                           eos_token_id = dataset.tokenizer.eos_token_id,)

In [16]:
# Decode the generated ids back to text, dropping special tokens.
predicted_text = dataset.tokenizer.batch_decode(output_ids, skip_special_tokens=True)

In [17]:
# Display the decoded prediction for comparison with target_text.
predicted_text

['united states of america']