In [1]:
import math
from tqdm import tqdm
import numpy as np
from torch.utils.data import DataLoader
import torch
from pymongo import MongoClient
from torch.utils.data import Dataset
from transformers import AutoConfig, AutoTokenizer, HfArgumentParser, T5ForConditionalGeneration, T5Config, PretrainedConfig
import os
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class EvalBatch:
    """Evaluation mini-batch that keeps the raw strings next to the tokenized inputs.

    Built from a list of dataset items, each a mapping with an ``'input'`` and
    an ``'outputs'`` entry. Exposes ``pin_memory`` so the batch object can be
    pinned as a unit.
    """

    def __init__(self, items, tokenizer, max_length=512):
        """Collect the strings of ``items`` and tokenize the inputs.

        Args:
            items: list of mappings with the keys ``'input'`` and ``'outputs'``.
            tokenizer: callable applied to the list of input strings; it
                receives ``padding``, ``truncation``, ``max_length`` and
                ``return_tensors`` keyword arguments.
            max_length: truncation limit handed to the tokenizer. Defaults to
                512, the limit ``KGLMDataset._collate_eval`` uses.
        """
        # self.inputs: raw input strings, in batch order.
        self.inputs = [item['input'] for item in items]
        # self.target_text: the 'outputs' entry of every item, in batch order.
        self.target_text = [item["outputs"] for item in items]
        # max_length is passed explicitly: with truncation=True alone the limit
        # falls back to the tokenizer's own default, which the t5-base
        # tokenizer warns should not be relied upon.
        self.inputs_tokenized = tokenizer(self.inputs, padding=True, truncation=True,
                                          max_length=max_length, return_tensors="pt")

    # custom memory pinning method on custom type
    def pin_memory(self):
        """Pin ``input_ids`` and ``attention_mask`` and return ``self``."""
        # Assign through the mapping interface: setting an attribute on the
        # encoding would only shadow the entry and leave
        # inputs_tokenized['input_ids'] pointing at the unpinned tensor.
        for key in ('input_ids', 'attention_mask'):
            self.inputs_tokenized[key] = self.inputs_tokenized[key].pin_memory()
        return self

In [3]:
class KGLMDataset(Dataset):
    """Map-style dataset that reads examples from a MongoDB collection.

    Every example is a document whose ``_id`` is the stringified index and
    which carries a ``'verbalization'`` (model input) and a ``'target'`` field.

    NOTE(review): the MongoClient is created once in ``__init__``. When a
    DataLoader runs this dataset in worker processes, PyMongo's fork-safety
    guidance applies to that shared client — confirm this is acceptable for
    the installed PyMongo version.
    """

    def __init__(self, port, db, collection):
        """Connect to MongoDB on localhost and load the tokenizer.

        Args:
            port: port of the MongoDB server on ``localhost``.
            db: database name.
            collection: collection name inside ``db``.
        """
        self.client = MongoClient('localhost', port)
        self.db_name = db
        self.collection_name = collection
        self.collection = self.client[db][collection]
        self.tokenizer = AutoTokenizer.from_pretrained("t5-base")
        self.tokenizer.add_tokens(['[SEP]'], special_tokens=True)
        # Dataset size is the document count reported by the collection statistics.
        self.length = self.client[self.db_name].command("collstats", self.collection_name)['count']

    def __getitem__(self, idx):
        """Return ``{'input': ..., 'outputs': ...}`` for the document with ``_id == str(idx)``.

        Raises:
            IndexError: if the collection holds no document with that id.
        """
        doc = self.collection.find_one({'_id': str(idx)})
        if doc is None:
            # find_one returns None when nothing matches; fail with the id and
            # collection instead of an opaque TypeError on the subscript below.
            raise IndexError('No document with _id %r in %s.%s'
                             % (str(idx), self.db_name, self.collection_name))
        return {"input": doc['verbalization'], "outputs": doc['target']}

    def __len__(self):
        """Number of documents counted when the dataset was created."""
        return self.length

    def _collate_eval(self, batch):
        """Tokenize the inputs of ``batch`` and return plain tensors plus targets.

        Returns:
            A tuple ``(input_ids, attention_mask, target_text)``, where
            ``target_text`` is the list of ``'outputs'`` entries in batch order.
        """
        encode_plus_kwargs = {'truncation': True, 'padding': 'longest', 'pad_to_multiple_of': 1}
        inputs = [b['input'] for b in batch]
        target_text = [b["outputs"] for b in batch]
        inputs_tokenized = self.tokenizer.batch_encode_plus(inputs, max_length=512, return_tensors='pt',
                                                            **encode_plus_kwargs)
        return inputs_tokenized.input_ids, inputs_tokenized.attention_mask, target_text

    def _collate_eval_with_input_strings(self, items):
        """Wrap ``items`` in an ``EvalBatch`` built with this dataset's tokenizer."""
        return EvalBatch(items, self.tokenizer)


In [4]:
# Dataset backed by the 'test' collection of the 'KGLM' database on localhost:27017.
dataset = KGLMDataset(port=27017, db='KGLM', collection='test')

# Stand-alone loader for the tuple-returning collate function, left disabled:
# data_loader = DataLoader(
#     dataset,
#     batch_size=1,
#     shuffle=False,
#     num_workers=1,
#     collate_fn=dataset._collate_eval
# )

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [8]:
# Presumably the training-run directory the checkpoint came from; it is not
# referenced again in this cell.
path = 'lr5e-05_constant_with_warmup_adamw_wd1e-03_512-512_bs64_iters4000000/run_1/'

# The checkpoint file is looked up in the current working directory.
model_cpt = os.path.join('.', 'model_best-3.pth')
# Path to a local config file; not referenced again in this cell (the config
# below is fetched by model name instead).
config_path = os.path.join('t5configs', 't5-small.json')

# Build the architecture from the 't5-small' configuration only; the trained
# weights come from the checkpoint loaded below.
model_cfg = AutoConfig.from_pretrained('t5-small')
model = T5ForConditionalGeneration(config=model_cfg)

# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (consider weights_only=True if the checkpoint holds plain
# tensors and containers — TODO confirm).
# Loading onto the CPU first; the checkpoint is expected to be a mapping with a
# 'model_state_dict' entry.
cpt = torch.load(model_cpt, map_location='cpu')
model.load_state_dict(cpt['model_state_dict'])

RuntimeError: PytorchStreamReader failed reading file data/123724256: invalid header or archive is corrupted

In [None]:
class Args:
    """Plain option holder for decoding and bookkeeping settings.

    Each constructor argument is stored unchanged as an attribute of the same
    name: ``length_penalty``, ``max_output_length``, ``length_normalization``,
    ``batch_size``, ``beam_size``, ``save_file`` and ``num_predictions``.
    """

    def __init__(self, length_penalty=1.0, max_output_length=20, length_normalization=0,
                 batch_size=1, beam_size=1, save_file='neigh_model', num_predictions=200):
        options = (
            ('batch_size', batch_size),
            ('beam_size', beam_size),
            ('save_file', save_file),
            ('num_predictions', num_predictions),
            ('length_penalty', length_penalty),
            ('max_output_length', max_output_length),
            ('length_normalization', length_normalization),
        )
        for option_name, option_value in options:
            setattr(self, option_name, option_value)


# Options for this run: all defaults, results saved under the name 'baseline_model'.
args = Args(save_file='baseline_model')

In [None]:
def getScores(ids, scores, pad_token_id, length_normalization = 0):
    """Sum the log-probabilities of the generated tokens of every sequence.

    Args:
        ids: integer tensor with one row of token ids per sequence. Column 0
            is the start token and is dropped before scoring.
        scores: list of tensors, one per generated position, each holding the
            score of every vocabulary token for every sequence. Stacked, they
            have shape (num sequences, num tokens in sequence, vocab size).
        pad_token_id: id of the padding token; padded positions contribute 0.
        length_normalization: when equal to 1, each sum is divided by the
            number of non-padding tokens of its sequence.

    Returns:
        A NumPy array with one score per sequence.

    Raises:
        ValueError: if ``ids`` (without the start token) and ``scores``
            disagree on the number of sequences or positions.
    """
    # (num sequences, num positions, vocab size) log-probabilities
    log_probs = torch.log_softmax(torch.stack(scores, dim=1), dim=2)
    # remove start token
    ids = ids[:, 1:]
    if ids.shape != log_probs.shape[:2]:
        raise ValueError('ids of shape %s do not match scores of shape %s'
                         % (tuple(ids.shape), tuple(log_probs.shape[:2])))
    # Pick the log-probability of each generated token. Indexing with a
    # trailing singleton dimension avoids materializing a vocab-sized result
    # just to keep its first column.
    token_log_probs = torch.gather(log_probs, 2, ids.unsqueeze(-1)).squeeze(-1)
    padded_mask = (ids == pad_token_id)
    token_log_probs = token_log_probs.masked_fill(padded_mask, 0)
    final_scores = token_log_probs.sum(dim=-1)
    if length_normalization == 1:
        # clamp so a row consisting only of padding yields 0 instead of 0/0 = NaN
        sequence_lengths = torch.sum(~padded_mask, dim=1).clamp(min=1)
        final_scores = final_scores/sequence_lengths

    return final_scores.cpu().detach().numpy()

In [None]:
def grouper(arr, n):
    """Split ``arr`` into consecutive chunks of exactly ``n`` items.

    grouper('ABCDEF', 3) --> ['ABC', 'DEF']

    Chunks are produced by slicing, so each one has the type that slicing
    ``arr`` yields. No padding is applied.

    Raises:
        ValueError: if ``n`` is not positive or ``len(arr)`` is not a
            multiple of ``n``.
    """
    if n <= 0:
        raise ValueError('Chunk size must be positive, got %d' % n)
    total = len(arr)
    if total % n != 0:
        raise ValueError('Cannot divide %d by %d' % (total, n))
    return [arr[start_id:start_id + n] for start_id in range(0, total, n)]

In [None]:
def eval(model, dataset, args):
    """Generate predictions for every item of ``dataset``, save them, return the hit rate.

    NOTE: this name shadows Python's builtin ``eval`` for the rest of the
    notebook; it is kept because callers use it.

    Args:
        model: model exposing ``generate`` (called with ``do_sample=True``).
        dataset: dataset providing ``tokenizer`` and
            ``_collate_eval_with_input_strings``.
        args: options object; reads ``batch_size``, ``beam_size``,
            ``num_predictions``, ``length_penalty``, ``max_output_length``,
            ``length_normalization`` and ``save_file``.

    Returns:
        Fraction of items whose target string is among the predictions made
        for that item (0.0 when the dataset is empty).

    Side effects:
        Writes prediction strings, scores, targets and inputs to
        ``scores/<args.save_file>.pickle``, creating the directory if needed.
    """
    num_workers = 1
    batch_size = args.batch_size
    # Use the GPU when there is one, otherwise run on the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()
    print('Using model.generate')

    # Pinned host memory is only requested when batches are copied to a GPU.
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers,
        collate_fn=dataset._collate_eval_with_input_strings, pin_memory=(device.type == 'cuda'))

    pad_token_id = dataset.tokenizer.pad_token_id
    targets = []
    predictions = []
    prediction_scores = []
    model_inputs = []

    with torch.inference_mode():
        for batch in tqdm(data_loader, total=len(data_loader), unit="batches"):
            input_ids = batch.inputs_tokenized.input_ids
            attention_mask = batch.inputs_tokenized.attention_mask
            target_text = batch.target_text
            input_text = batch.inputs

            outputs = model.generate(input_ids = input_ids.to(device), attention_mask=attention_mask.to(device),
                        temperature=1.0, #TODO: make this argument?
                        do_sample=True,
                        num_return_sequences = args.num_predictions,
                        num_beams = args.beam_size,
                        eos_token_id = dataset.tokenizer.eos_token_id,
                        pad_token_id = pad_token_id,
                        output_scores = True,
                        return_dict_in_generate=True,
                        length_penalty = args.length_penalty,
                        max_length=args.max_output_length,
                        # top_p=0.95,
                        # top_k=250,
                        #  prefix_allowed_tokens_fn=prefixFn,
                        )
            sequences = outputs.sequences

            if args.beam_size > 1:
                final_scores = outputs.sequences_scores
                if args.length_normalization == 1:
                    # Count non-padding tokens after the start token; clamp to
                    # avoid dividing by zero for an all-padding row.
                    sequence_lengths = torch.sum((sequences[:,1:] != pad_token_id), dim=1).clamp(min=1)
                    final_scores = final_scores/sequence_lengths
                # NumPy, like the getScores branch, so saved scores have one type.
                final_scores = final_scores.cpu().detach().numpy()
            else:
                final_scores = getScores(sequences, outputs.scores, pad_token_id,
                                         length_normalization = args.length_normalization)

            predicted_text = dataset.tokenizer.batch_decode(sequences, skip_special_tokens=True)

            if len(predicted_text) == len(input_text):
                # one prediction per input: keep flat strings and float scores
                final_scores = final_scores.tolist()
            else:
                predicted_text = grouper(predicted_text, args.num_predictions) # grouping only needed if multiple predictions
                final_scores = grouper(final_scores, args.num_predictions)

            targets.extend(target_text)
            model_inputs.extend(input_text)
            predictions.extend(predicted_text)
            prediction_scores.extend(final_scores)

    correct = 0
    for p, t in zip(predictions, targets):
        # An ungrouped prediction is a single string: compare for equality,
        # because `t in p` on a string would accept any substring match.
        hit = (t == p) if isinstance(p, str) else (t in p)
        if hit:
            correct += 1

    data_to_save = {'prediction_strings': predictions,
                    'scores': prediction_scores,
                    'target_strings': targets,
                    'input_strings': model_inputs}
    fname = 'scores/' + args.save_file + '.pickle'
    os.makedirs(os.path.dirname(fname), exist_ok=True)
    with open(fname, 'wb') as handle:
        pickle.dump(data_to_save, handle)
    accuracy = correct/len(targets) if targets else 0.0
    return accuracy
            

In [None]:
# Run the evaluation; the returned accuracy is shown as this cell's output.
eval(model=model, dataset=dataset, args=args)