In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
from transformers import AutoTokenizer, AutoConfig, AutoModel
# ~/.cache/huggingface/hub
from datasets import load_dataset
import numpy as np
import pandas as pd
import scienceplots
import matplotlib.pyplot as plt

plt.style.use(['science', 'notebook', 'grid', 'ieee'])

- t5: text-to-text transfer transformer
    - https://ai.googleblog.com/2020/02/exploring-transfer-learning-with-t5.html
    - https://huggingface.co/docs/transformers/model_doc/t5
- Model size
    - t5-small, 512 (64*8)
    - t5-base, 768 (64*12)
    - t5-large, 1024 (64*16)
    - t5-3b, 4096 (128*32)
    - t5-11b, 16384 (128*128)

In [3]:
model_name = 't5-base'
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)
config

Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "pre

In [4]:
model

T5Model(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dropout(p=0.1, inplace

#### Forward

In [41]:
text = 'Studies have been shown that owning a dog is good for you'
batch_inputs = tokenizer(text, return_tensors='pt')
input_ids = batch_inputs['input_ids']
input_ids, len(*input_ids)

(tensor([[6536,   43,  118, 2008,   24,  293,   53,    3,    9, 1782,   19,  207,
            21,   25,    1]]),
 15)

In [42]:
decoder_input_ids = tokenizer('Studies show that', return_tensors='pt')
decoder_input_ids = decoder_input_ids['input_ids']
print(decoder_input_ids)
decoder_input_ids = model._shift_right(decoder_input_ids)
print(decoder_input_ids)

tensor([[6536,  504,   24,    1]])
tensor([[   0, 6536,  504,   24]])


In [43]:
model.eval()
# forward pass
outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
last_hidden_states = outputs.last_hidden_state
last_hidden_states, last_hidden_states.shape

(tensor([[[ 0.0991,  0.1743, -0.1142,  ..., -0.0117,  0.0558,  0.1053],
          [ 0.0314, -0.1558,  0.0109,  ..., -0.0440, -0.0196, -0.0842],
          [-0.1150, -0.1574,  0.0051,  ..., -0.0378, -0.0137, -0.0972],
          [-0.1110,  0.0093, -0.0019,  ...,  0.0514,  0.0396, -0.0774]]],
        grad_fn=<MulBackward0>),
 torch.Size([1, 4, 768]))

In [44]:
# Details in the T5 forward
encoder_outputs = model.encoder(input_ids=input_ids)
hidden_states = encoder_outputs[0]
print(hidden_states.shape)

torch.Size([1, 15, 768])


In [45]:
decoder_outputs = model.decoder(input_ids=decoder_input_ids, encoder_hidden_states=hidden_states)
decoder_last_hidden_states = decoder_outputs.last_hidden_state
decoder_last_hidden_states

tensor([[[ 0.0991,  0.1743, -0.1142,  ..., -0.0117,  0.0558,  0.1053],
         [ 0.0314, -0.1558,  0.0109,  ..., -0.0440, -0.0196, -0.0842],
         [-0.1150, -0.1574,  0.0051,  ..., -0.0378, -0.0137, -0.0972],
         [-0.1110,  0.0093, -0.0019,  ...,  0.0514,  0.0396, -0.0774]]],
       grad_fn=<MulBackward0>)

In [46]:
last_hidden_states == decoder_last_hidden_states

tensor([[[True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True]]])

#### If we are doing a inference

In [99]:
text = 'Translate it: Studies have been shown that owning a dog is good for you.'
batch_inputs = tokenizer(text, return_tensors='pt')
input_ids = batch_inputs['input_ids']
input_ids, len(*input_ids)

(tensor([[30355,    15,    34,    10,  6536,    43,   118,  2008,    24,   293,
             53,     3,     9,  1782,    19,   207,    21,    25,     5,     1]]),
 20)

In [100]:
encoder_ouputs = model.get_encoder()(input_ids)
encoder_outputs, encoder_outputs.last_hidden_state.shape

(BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=tensor([[[ 0.0902, -0.1033, -0.4092,  ...,  0.0406,  0.0204, -0.1073],
          [ 0.0528,  0.0487, -0.2332,  ..., -0.2115, -0.0991, -0.3346],
          [-0.0057, -0.0381, -0.2648,  ..., -0.3037, -0.1655, -0.3205],
          ...,
          [ 0.4811,  0.0481, -0.4502,  ..., -0.1001, -0.2813, -0.2189],
          [ 0.4262,  0.1389, -0.5530,  ..., -0.2203, -0.0630,  0.0804],
          [ 0.0226, -0.0007, -0.0135,  ..., -0.0020,  0.0204, -0.0342]]],
        grad_fn=<MulBackward0>), past_key_values=None, hidden_states=None, attentions=None, cross_attentions=None),
 torch.Size([1, 15, 768]))

In [76]:
decoder_start_token = model.config.decoder_start_token_id
decoder_input_ids = torch.tensor([[decoder_start_token]], dtype=torch.long)

decoder_outputs = model.get_decoder()(input_ids=decoder_input_ids, 
                                      encoder_hidden_states=encoder_output.last_hidden_state)

# decoder_outputs.keys()
output_tokens = decoder_outputs[0]
probabilites = F.softmax(output_tokens, dim=-1)
next_token_id = torch.argmax(probabilites, dim=-1).item()
output_text = tokenizer.decode([next_token_id])
#print(output_tokens.shape)
#print(output_tokens)
#output_text = tokenizer.decode(output_tokens[0][0].tolist())
output_text

'can'

#### To the next word

In [101]:
decoder_start_token = model.config.decoder_start_token_id
decoder_input_ids = torch.tensor([[decoder_start_token]], dtype=torch.long)

generated_tokens = []

for _ in range(100): # maximum length of the output sequence
    decoder_outputs = model.get_decoder()(input_ids=decoder_input_ids, 
                                          encoder_hidden_states=encoder_output.last_hidden_state)
    logits = decoder_outputs[0]
    next_token_id = logits.argmax(-1)[:, -1]
    decoder_input_ids = torch.cat([decoder_input_ids, next_token_id.unsqueeze(-1)], dim=-1)
    generated_tokens.append(next_token_id.item())
    if next_token_id.item() == tokenizer.eos_token_id:
        break

print(generated_tokens)

[54, 54, 484, 86, 86, 86, 54, 484, 86, 54, 86, 54, 86, 54, 86, 54, 86, 54, 86, 54, 86, 54, 86, 54, 86, 54, 86, 54, 86, 54, 86, 54, 86, 54, 86, 54, 219, 54, 219, 54, 219, 54, 219, 54, 219, 54, 219, 54, 219, 54, 219, 54, 219, 54, 219, 54, 219, 86, 219, 219, 54, 219, 219, 54, 219, 86, 54, 219, 54, 219, 54, 219, 54, 219, 54, 219, 54, 219, 219, 54, 219, 86, 219, 219, 54, 219, 219, 54, 219, 219, 54, 219, 219, 54, 219, 86, 54, 219, 54, 219]


In [102]:
output_text = tokenizer.decode(generated_tokens)
output_text

'can can book In In In can book In can In can In can In can In can In can In can In can In can In can In can In can In can In can auf can auf can auf can auf can auf can auf can auf can auf can auf can auf can auf In auf auf can auf auf can auf In can auf can auf can auf can auf can auf can auf auf can auf In auf auf can auf auf can auf auf can auf auf can auf In can auf can auf'

#### Using a bigger model

In [112]:
from transformers import T5ForConditionalGeneration
model = T5ForConditionalGeneration.from_pretrained(model_name)

In [115]:
text = 'Translate English to German: Studies have been shown that owning a dog is good for you.'
input_ids = tokenizer.encode(text, return_tensors="pt") 
result = model.generate(input_ids)
tokenizer.decode(result[0])

'<pad> Studien haben gezeigt, dass der Besitz eines Hundes gut für Sie ist.</s>'

In [114]:
input_ids = tokenizer.encode("translate English to German: Hello, my dog is cute", return_tensors="pt") 
result = model.generate(input_ids)
tokenizer.decode(result[0])

'<pad> Hallo, mein Hund ist süß</s>'

In [116]:
text = 'Tell me which part of this sentence is true?:Studies have been shown that owning a dog is good for you.'
input_ids = tokenizer.encode(text, return_tensors="pt") 
result = model.generate(input_ids)
tokenizer.decode(result[0])

'<pad> What is the correct part of this sentence?</s>'

#### Greedy search

In [47]:
sample_text = 'A long long time ago,'
model_inputs = tokenizer(sample_text, return_tensors='pt')
input_ids = model_inputs.input_ids

In [48]:
n_steps = 10
top_x = 5


decoder_start_token = model.config.decoder_start_token_id
decoder_input_ids = torch.tensor([[decoder_start_token]], dtype=torch.long)

iterations = []
with torch.no_grad():
    for _ in range(n_steps):
        iteration = {}
        # the first row
        iteration['input'] = tokenizer.decode(input_ids[0])
        encoder_outputs = model.get_encoder()(input_ids)
        decoder_outputs = model.get_decoder()(input_ids=decoder_input_ids, 
                                              encoder_hidden_states=encoder_outputs.last_hidden_state)
        last_token_logits = decoder_outputs.last_hidden_state[0, -1, :]
        last_token_prob = torch.softmax(last_token_logits, dim=-1)
        sorted_ids = torch.argsort(last_token_prob, dim=-1, descending=True)
        for choice_idx in range(top_x):
            token_id = sorted_ids[top_x]
            token_prob = last_token_prob[token_id]
            token_choice = f'{tokenizer.decode(token_id)}({100*token_prob:.2f}%)'
            iteration[f'choice {choice_idx + 1}'] = token_choice
        
        print('before append input_ids.shape', input_ids.shape)
        input_ids = torch.cat([input_ids, sorted_ids[None, 0, None]], dim=-1)
        print('after append input_ids.shape', input_ids.shape)
        
        iterations.append(iteration)

before append input_ids.shape torch.Size([1, 7])
after append input_ids.shape torch.Size([1, 8])
before append input_ids.shape torch.Size([1, 8])
after append input_ids.shape torch.Size([1, 9])
before append input_ids.shape torch.Size([1, 9])
after append input_ids.shape torch.Size([1, 10])
before append input_ids.shape torch.Size([1, 10])
after append input_ids.shape torch.Size([1, 11])
before append input_ids.shape torch.Size([1, 11])
after append input_ids.shape torch.Size([1, 12])
before append input_ids.shape torch.Size([1, 12])
after append input_ids.shape torch.Size([1, 13])
before append input_ids.shape torch.Size([1, 13])
after append input_ids.shape torch.Size([1, 14])
before append input_ids.shape torch.Size([1, 14])
after append input_ids.shape torch.Size([1, 15])
before append input_ids.shape torch.Size([1, 15])
after append input_ids.shape torch.Size([1, 16])
before append input_ids.shape torch.Size([1, 16])
after append input_ids.shape torch.Size([1, 17])


In [49]:
pd.DataFrame(iterations)

Unnamed: 0,input,choice 1,choice 2,choice 3,choice 4,choice 5
0,"A long long time ago,</s>",V(0.01%),V(0.01%),V(0.01%),V(0.01%),V(0.01%)
1,"A long long time ago,</s> can",V(0.01%),V(0.01%),V(0.01%),V(0.01%),V(0.01%)
2,"A long long time ago,</s> can can",V(0.01%),V(0.01%),V(0.01%),V(0.01%),V(0.01%)
3,"A long long time ago,</s> can can can",has(0.01%),has(0.01%),has(0.01%),has(0.01%),has(0.01%)
4,"A long long time ago,</s> can can can can",has(0.02%),has(0.02%),has(0.02%),has(0.02%),has(0.02%)
5,"A long long time ago,</s> can can can can can",has(0.03%),has(0.03%),has(0.03%),has(0.03%),has(0.03%)
6,"A long long time ago,</s> can can can can can can",has(0.03%),has(0.03%),has(0.03%),has(0.03%),has(0.03%)
7,"A long long time ago,</s> can can can can can ...",has(0.04%),has(0.04%),has(0.04%),has(0.04%),has(0.04%)
8,"A long long time ago,</s> can can can can can ...",has(0.03%),has(0.03%),has(0.03%),has(0.03%),has(0.03%)
9,"A long long time ago,</s> can can can can can ...",In(0.04%),In(0.04%),In(0.04%),In(0.04%),In(0.04%)
