In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from utils_accelerate import *

tokenizer = T5Tokenizer.from_pretrained('t5-small')

model = T5ForConditionalGeneration.from_pretrained('t5-small')
model.eval()
# model.cpu()

Downloading:   0%|          | 0.00/242M [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dro

In [2]:
input = "obama is the president of"
input_ids = tokenizer(input, return_tensors="pt").input_ids  # Batch size 1


In [3]:
# outputs = model.sample(input_ids)
from transformers import (
    LogitsProcessorList,
    MinLengthLogitsProcessor,
    BeamSearchScorer,
)

In [4]:
fname = 'data/codex-m/valid.txt'
f = open(fname, 'r')
data = []
for line in f:
    data.append(line.strip())
f.close()

In [5]:
len(data)

20620

In [6]:
data[0]

'predict tail: novalis | occupation |\tphilosopher'

In [7]:
import torch
# data_point = 'predict tail: novalis | occupation |    philosopher'
id = 0
data_point = data[id]
encoder_input_str, target = data_point.split('\t')
encoder_input_str = [encoder_input_str]
encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids
num_beams = 10
num_predictions = 3
input_ids = torch.ones((len(encoder_input_str) * num_beams, 1), device=model.device, dtype=torch.long)
input_ids = input_ids * model.config.decoder_start_token_id
model_kwargs = {
    "encoder_outputs": model.get_encoder()(encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True)
}
beam_scorer = BeamSearchScorer(
    batch_size=len(encoder_input_str),
    max_length=model.config.max_length,
    num_beams=num_beams,
    device=model.device,
    num_beam_hyps_to_keep=num_predictions,
    length_penalty=0.3
)
logits_processor = LogitsProcessorList([])

In [54]:
input = 'JDK is developed by <extra_id_0>'
encoder_input_str = [input]
encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids
encoder_input_str

['JDK is developed by <extra_id_0>']

In [55]:
# outputs = model.beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs)
# print("Beam:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
outputs = model.generate(encoder_input_ids)
print('Greedy:', tokenizer.batch_decode(outputs, skip_special_tokens=True))
print('Target:', target)

Greedy: ['a team of experts.']
Target: philosopher


In [131]:
def getGreedyOutput(model, tokenizer, encoder_input_str):
    encoder_input_str = [encoder_input_str]
    encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids
    outputs = model.generate(encoder_input_ids)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [132]:
def getBeamOutput(model, tokenizer, encoder_input_str, num_beams=10, 
                  num_predictions=3, length_penalty=0.3):
    encoder_input_str = [encoder_input_str]
    encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids
    input_ids = torch.ones((len(encoder_input_str) * num_beams, 1), device=model.device, dtype=torch.long)
    input_ids = input_ids * model.config.decoder_start_token_id
    model_kwargs = {
        "encoder_outputs": model.get_encoder()(encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True)
    }
    beam_scorer = BeamSearchScorer(
        batch_size=len(encoder_input_str),
        max_length=model.config.max_length,
        num_beams=num_beams,
        device=model.device,
        num_beam_hyps_to_keep=num_predictions,
        length_penalty=length_penalty
    )
    logits_processor = LogitsProcessorList([])
    outputs = model.beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [119]:
model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 64)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 64)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=64, out_features=16, bias=False)
              (k): Linear(in_features=64, out_features=16, bias=False)
              (v): Linear(in_features=64, out_features=16, bias=False)
              (o): Linear(in_features=16, out_features=64, bias=False)
              (relative_attention_bias): Embedding(32, 2)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=64, out_features=256, bias=False)
              (wo): Linear(in_features=256, out_features=64, bias=False)
              (dropout): Dropout(p=0.1, in

In [136]:
from tqdm import tqdm
# id = 100
scorer_function = getBeamOutput 
# scorer_function = getGreedyOutput 
num_points = 200
correct = 0
for id in tqdm(range(0, num_points)):
    data_point = data[id]
    input, target = data_point.split('\t')
    predicted = set(scorer_function(model, tokenizer, input))
    if target in predicted:
        correct += 1
print(correct/num_points)

100%|██████████| 200/200 [00:17<00:00, 11.73it/s]

0.155





In [118]:
outputs.shape

torch.Size([20, 15])

In [71]:
outputs

tensor([[    0,  1038,  2137,    21, 20532,    11,   606,     1]])

Generated: ['international bank for reconstruction and development']


In [3]:
print(input)
print(''.join(tokenizer.convert_ids_to_tokens(outputs[0])))

predict tail: united states of america | member of |
<pad>▁international▁bank▁for▁reconstruction▁and▁development</s>


In [4]:
from dataset import T5_Dataset

In [5]:
valid_dataset = T5_Dataset('test', dataset_name='codex-m')

100%|██████████| 20622/20622 [00:00<00:00, 840393.08it/s]


In [6]:
from eval_accelerate import removePadding, eval

In [7]:
class Args:
    batch_size = 200
args=Args()

In [8]:
acc = eval(model, valid_dataset, args)

100%|██████████| 104/104 [00:59<00:00,  1.74batches/s]


In [9]:
acc

0.10876733585491223

In [8]:
actual = tokenizer("international development association", return_tensors="pt").input_ids[0].numpy()

In [9]:
actual

array([1038,  606, 6028,    1])

In [10]:
predicted = outputs[0][1:].numpy()

In [11]:
predicted

array([ 1038,  2137,    21, 20532,    11,   606,     1])

In [30]:
actual == predicted

array([ True,  True,  True,  True])

In [25]:
actual.numpy()

array([1038,  606, 6028,    1])