In [None]:
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import dataset

import numpy as np
import matplotlib.pyplot as plt

In [None]:

class MultiHeadAttention(nn.Module):
  """Multi-head scaled dot-product attention with optional causal masking.

  Args:
    d_k: per-head size of the query/key/value projections (d_k = d_q = d_v).
    d_model: feature size of the inputs and of the returned tensor.
    n_heads: number of attention heads.
    max_len: side length of the pre-built causal mask (only used if causal).
    causal: when truthy, query position i may only attend to key positions <= i.
  """

  def __init__(self, d_k, d_model, n_heads, max_len, causal=False):
    super().__init__()

    self.d_k = d_k
    self.n_heads = n_heads
    proj_dim = d_k * n_heads  # all heads are projected with a single Linear

    self.key = nn.Linear(d_model, proj_dim)
    self.query = nn.Linear(d_model, proj_dim)
    self.value = nn.Linear(d_model, proj_dim)

    # merges the concatenated heads back to d_model
    self.fc = nn.Linear(proj_dim, d_model)

    self.causal = causal
    if self.causal:
      # lower triangle (incl. diagonal) is 1 = allowed, upper triangle is 0 = blocked;
      # stored as (1, 1, max_len, max_len) so it broadcasts over batch and heads
      allowed = torch.ones(max_len, max_len).tril()
      self.register_buffer("causal_mask", allowed.unsqueeze(0).unsqueeze(0))

  def forward(self, q, k, v, pad_mask=None):
    """Attend from `q` over `k`/`v`.

    Args:
      q: (N, T_output, d_model) inputs the queries are projected from.
      k: (N, T_input, d_model) inputs the keys are projected from.
      v: (N, T_input, d_model) inputs the values are projected from.
      pad_mask: optional (N, T_input) mask; key positions equal to 0 are ignored.

    Returns:
      Tensor of shape (N, T_output, d_model).
    """
    # raw inputs -> Q, K, V of the attention equation, each (N, T, h * d_k)
    Q = self.query(q)
    K = self.key(k)
    V = self.value(v)

    batch_size = Q.size(0)
    len_out = Q.size(1)
    len_in = K.size(1)

    def to_heads(t, steps):
      # (N, T, h * d_k) -> (N, T, h, d_k) -> (N, h, T, d_k)
      return t.view(batch_size, steps, self.n_heads, self.d_k).transpose(1, 2)

    Q = to_heads(Q, len_out)
    K = to_heads(K, len_in)
    V = to_heads(V, len_in)

    # (N, h, T_output, d_k) x (N, h, d_k, T_input) -> (N, h, T_output, T_input)
    scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
    neg_inf = float('-inf')

    if pad_mask is not None:
      # (N, T_input) -> (N, 1, 1, T_input): hide padded keys from every head and query
      blocked = pad_mask.unsqueeze(1).unsqueeze(2) == 0
      scores = scores.masked_fill(blocked, neg_inf)

    if self.causal:
      # crop the pre-built mask to the current sequence lengths
      future = self.causal_mask[:, :, :len_out, :len_in] == 0
      scores = scores.masked_fill(future, neg_inf)

    weights = torch.softmax(scores, dim=-1)

    # (N, h, T_output, T_input) x (N, h, T_input, d_k) -> (N, h, T_output, d_k)
    context = torch.matmul(weights, V)

    # (N, h, T_output, d_k) -> (N, T_output, h, d_k) -> (N, T_output, h * d_k)
    merged = context.transpose(1, 2).reshape(
        batch_size, len_out, self.d_k * self.n_heads
    )

    return self.fc(merged)

In [None]:
class EncoderBlock(nn.Module):
  """One encoder layer: self-attention then a feed-forward net (post-LayerNorm).

  Each sub-layer is wrapped as LayerNorm(x + sublayer(x)); dropout is applied
  to the block output.

  Args:
    d_k: per-head projection size passed to the attention layer.
    d_model: feature size of the block input/output.
    n_heads: number of attention heads.
    max_len: forwarded to MultiHeadAttention.
    dropout_prob: dropout probability used in the feed-forward net and on the output.
  """

  def __init__(self, d_k, d_model, n_heads, max_len, dropout_prob=0.1):
    super().__init__()

    self.ln1 = nn.LayerNorm(d_model)
    self.ln2 = nn.LayerNorm(d_model)
    # encoder attention is bidirectional, hence no causal mask
    self.mha = MultiHeadAttention(d_k, d_model, n_heads, max_len, causal=False)
    # position-wise feed-forward network: expand 4x, GELU, project back
    hidden_dim = d_model * 4
    self.ann = nn.Sequential(
        nn.Linear(d_model, hidden_dim),
        nn.GELU(),
        nn.Linear(hidden_dim, d_model),
        nn.Dropout(dropout_prob),
    )
    self.dropout = nn.Dropout(p=dropout_prob)

  def forward(self, x, mask=None):
    """Run the block on `x`; `mask` is handed to the attention layer as pad_mask."""
    # self-attention sub-layer: residual add, then normalize
    attended = self.mha(x, x, x, mask)
    x = self.ln1(x + attended)

    # feed-forward sub-layer: residual add, then normalize
    transformed = self.ann(x)
    x = self.ln2(x + transformed)

    return self.dropout(x)

In [None]:
class DecoderBlock(nn.Module):
  """One decoder layer: causal self-attention, cross-attention, feed-forward.

  Each sub-layer is wrapped as LayerNorm(x + sublayer(x)) (post-LayerNorm);
  dropout is applied to the block output.

  Args:
    d_k: per-head projection size passed to both attention layers.
    d_model: feature size of the block input/output.
    n_heads: number of attention heads.
    max_len: size of the causal mask built by the self-attention layer.
    dropout_prob: dropout probability used in the feed-forward net and on the output.
  """

  def __init__(self, d_k, d_model, n_heads, max_len, dropout_prob=0.1):
    super().__init__()

    self.ln1 = nn.LayerNorm(d_model)
    self.ln2 = nn.LayerNorm(d_model)
    self.ln3 = nn.LayerNorm(d_model)
    # mha1: masked (causal) self-attention over the decoder sequence
    self.mha1 = MultiHeadAttention(d_k, d_model, n_heads, max_len, causal=True)
    # mha2: cross-attention, queries from the decoder, keys/values from the encoder
    self.mha2 = MultiHeadAttention(d_k, d_model, n_heads, max_len, causal=False)
    # position-wise feed-forward network: expand 4x, GELU, project back
    hidden_dim = d_model * 4
    self.ann = nn.Sequential(
        nn.Linear(d_model, hidden_dim),
        nn.GELU(),
        nn.Linear(hidden_dim, d_model),
        nn.Dropout(dropout_prob),
    )
    self.dropout = nn.Dropout(p=dropout_prob)

  def forward(self, enc_output, dec_input, enc_mask=None, dec_mask=None):
    """Run the block.

    Args:
      enc_output: encoder representations attended to by the cross-attention.
      dec_input: decoder-side representations entering this block.
      enc_mask: pad mask applied to the encoder positions in cross-attention.
      dec_mask: pad mask applied to the decoder positions in self-attention.
    """
    # causal self-attention sub-layer
    self_attended = self.mha1(dec_input, dec_input, dec_input, dec_mask)
    x = self.ln1(dec_input + self_attended)

    # cross-attention sub-layer over the encoder output
    cross_attended = self.mha2(x, enc_output, enc_output, enc_mask)
    x = self.ln2(x + cross_attended)

    # feed-forward sub-layer
    transformed = self.ann(x)
    x = self.ln3(x + transformed)

    return self.dropout(x)

In [None]:
class PositionalEncoding(nn.Module):
  """Adds fixed sinusoidal position encodings to the input, then applies dropout.

  Even feature indices hold sin(pos * w_i) and odd indices hold cos(pos * w_i),
  with w_i = 10000^(-2i / d_model).

  Args:
    d_model: feature size of the inputs (may be even or odd).
    max_len: number of positions for which encodings are pre-computed.
    dropout_prob: dropout probability applied after adding the encodings.
  """

  def __init__(self, d_model, max_len=2048, dropout_prob=0.1):
    super().__init__()
    self.dropout = nn.Dropout(p=dropout_prob)

    position = torch.arange(max_len).unsqueeze(1)   # (max_len, 1)
    exp_term = torch.arange(0, d_model, 2)          # ceil(d_model / 2) frequencies
    div_term = torch.exp(exp_term * (-math.log(10000.0) / d_model))
    angles = position * div_term                    # (max_len, ceil(d_model / 2))

    pe = torch.zeros(1, max_len, d_model)
    pe[0, :, 0::2] = torch.sin(angles)
    # For odd d_model there is one fewer odd (cosine) column than even (sine)
    # columns, so drop the last frequency; for even d_model this is a no-op.
    pe[0, :, 1::2] = torch.cos(angles[:, : d_model // 2])
    self.register_buffer('pe', pe) # buffer: saved with the module, moved by .to(), not trained

  def forward(self, x):
    """Return dropout(x + pe[:, :T]) for x of shape (N, T, d_model)."""
    # slice to the actual sequence length T, which varies between batches
    x = x + self.pe[:, :x.size(1), :]
    return self.dropout(x)

In [None]:
class Encoder(nn.Module):
  """Transformer encoder: embedding + positional encoding + stacked EncoderBlocks.

  Args:
    vocab_size: number of rows in the token embedding table.
    max_len: maximum sequence length (positional encoding / attention blocks).
    d_k: per-head projection size.
    d_model: model feature size.
    n_heads: number of attention heads per block.
    n_layers: number of EncoderBlocks.
    dropout_prob: dropout probability used throughout the encoder.
  """

  def __init__(self,
               vocab_size,
               max_len,
               d_k,
               d_model,
               n_heads,
               n_layers,
               dropout_prob):
    super().__init__()

    self.embedding = nn.Embedding(vocab_size, d_model)
    self.pos_encoding = PositionalEncoding(d_model, max_len, dropout_prob)
    # EncoderBlock's signature is (d_k, d_model, n_heads, max_len, dropout_prob);
    # max_len must be passed, otherwise dropout_prob lands in the max_len slot
    # and the blocks silently fall back to their default dropout.
    transformer_blocks = [
        EncoderBlock(
            d_k,
            d_model,
            n_heads,
            max_len,
            dropout_prob) for _ in range(n_layers)
    ]
    # Sequential is used only as a container; blocks need the extra mask
    # argument, so forward() iterates over them explicitly.
    self.transformer_blocks = nn.Sequential(*transformer_blocks)
    self.ln = nn.LayerNorm(d_model)

  def forward(self, x, mask=None):
    """Encode token ids `x` (N, T) into (N, T, d_model); `mask` is the pad mask."""
    x = self.embedding(x)
    x = self.pos_encoding(x)
    for block in self.transformer_blocks:
      x = block(x, mask)

    x = self.ln(x)

    return x

In [None]:
class Decoder(nn.Module):
  """Transformer decoder: embedding + positional encoding + DecoderBlocks + vocab head.

  Args:
    vocab_size: size of the target vocabulary (embedding rows and output logits).
    max_len: maximum sequence length (positional encoding / causal mask).
    d_k: per-head projection size.
    d_model: model feature size.
    n_heads: number of attention heads per block.
    n_layers: number of DecoderBlocks.
    dropout_prob: dropout probability used throughout the decoder.
  """

  def __init__(self,
               vocab_size,
               max_len,
               d_k,
               d_model,
               n_heads,
               n_layers,
               dropout_prob):
    super().__init__()

    self.embedding = nn.Embedding(vocab_size, d_model)
    self.pos_encoding = PositionalEncoding(d_model, max_len, dropout_prob)

    layers = []
    for _ in range(n_layers):
      layers.append(DecoderBlock(d_k, d_model, n_heads, max_len, dropout_prob))
    # Sequential only serves as a container here; forward() loops over the
    # blocks itself because each one needs several inputs.
    self.transformer_blocks = nn.Sequential(*layers)

    self.ln = nn.LayerNorm(d_model)
    self.fc = nn.Linear(d_model, vocab_size)  # projects to vocabulary logits

  def forward(self, enc_output, dec_input, enc_mask=None, dec_mask=None):
    """Return logits of shape (N, T, vocab_size) for decoder token ids `dec_input`."""
    hidden = self.pos_encoding(self.embedding(dec_input))

    for layer in self.transformer_blocks:
      hidden = layer(enc_output, hidden, enc_mask, dec_mask)

    return self.fc(self.ln(hidden))

In [None]:
class Transformer(nn.Module):
  """Thin encoder-decoder wrapper: encodes the source, then decodes against it."""

  def __init__(self, encoder, decoder):
    super().__init__()
    # registered as sub-modules, so parameters()/train()/eval() cover both
    self.encoder, self.decoder = encoder, decoder

  def forward(self, enc_input, dec_input, enc_mask, dec_mask):
    """Return the decoder output for `dec_input` conditioned on `enc_input`.

    `enc_mask` is used twice: by the encoder itself and by the decoder when it
    attends over the encoder output.
    """
    memory = self.encoder(enc_input, enc_mask)
    return self.decoder(memory, dec_input, enc_mask, dec_mask)

In [None]:
# test
# Smoke test: build a small encoder/decoder pair to check that shapes line up.
# The two sides deliberately use different vocabulary sizes (20000 vs 10000),
# while d_model and max_len match so the decoder can attend over the encoder output.
encoder = Encoder(vocab_size=20000,
                  max_len=512,
                  d_k=16,
                  d_model=64,
                  n_heads=4,
                  n_layers=2,
                  dropout_prob=0.1)
decoder = Decoder(vocab_size=10000,
                  max_len=512,
                  d_k=16,
                  d_model=64,
                  n_heads=4,
                  n_layers=2,
                  dropout_prob=0.1)
# Transformer registers both as sub-modules and chains encoder -> decoder.
transformer = Transformer(encoder, decoder)

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
decoder.to(device)
encoder.to(device)

cuda:0


Encoder(
  (embedding): Embedding(20000, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): EncoderBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mha): MultiHeadAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
      )
      (ann): Sequential(
        (0): Linear(in_features=64, out_features=256, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=256, out_features=64, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): EncoderBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementw

In [None]:
xe = np.random.randint(0, 20000, size=(8, 512))
xe_t = torch.tensor(xe).to(device)

In [None]:
xd = np.random.randint(0, 10000, size=(8, 256))
xd_t = torch.tensor(xd).to(device)

In [None]:
maske = np.ones((8, 512))
maske[:, 256:] = 0
maske_t = torch.tensor(maske).to(device)

In [None]:
maskd = np.ones((8, 256))
maskd[:, 128:] = 0
maskd_t = torch.tensor(maskd).to(device)

In [None]:
out = transformer(xe_t, xd_t, maske_t, maskd_t)
out.shape

torch.Size([8, 256, 10000])

In [None]:
out

tensor([[[ 0.0639, -0.4200,  0.6526,  ..., -0.8834, -1.2211,  0.3076],
         [-0.0927, -0.1376,  0.1103,  ..., -0.3979, -0.1186,  0.1184],
         [-0.6172,  0.2420,  1.2010,  ..., -1.1709, -0.2810,  0.5104],
         ...,
         [ 0.2467, -0.3210,  0.3986,  ...,  0.2655,  0.3781,  0.5840],
         [-0.9368,  0.6027,  0.3442,  ..., -0.4211,  0.6294,  0.9663],
         [-0.7184, -0.2712,  0.9349,  ...,  0.3786,  1.2733, -0.1544]],

        [[-0.0653, -0.2495,  0.8296,  ..., -0.9223, -1.4675, -0.2312],
         [-0.2412,  0.2624, -0.0802,  ..., -1.0010, -0.3584,  0.1867],
         [-0.9290, -0.2613,  1.0994,  ..., -1.0747, -0.6560, -0.3446],
         ...,
         [-0.3883,  1.2130,  0.5230,  ..., -0.0742,  0.1759,  0.0272],
         [-0.7353,  0.2821,  0.6887,  ..., -0.4546, -0.8286,  0.4511],
         [-0.7028,  0.3109, -0.1455,  ...,  0.2722, -0.4206,  0.9529]],

        [[ 0.1920,  0.1854,  0.0256,  ..., -1.1710, -1.0312, -0.8665],
         [-0.9228, -0.1647, -0.0440,  ..., -0

In [None]:
import pandas as pd
df = pd.read_csv('../../../data/spa.txt', sep="\t", header=None)
df.head()

Unnamed: 0,0,1
0,Go.,Ve.
1,Go.,Vete.
2,Go.,Vaya.
3,Hi.,Hola.
4,Run!,¡Corre!


In [None]:
df.shape

(115245, 2)

In [None]:
df = df.iloc[:30000]

In [None]:
df.columns = ['en', 'es']
df.to_csv("spa.csv", index=None)

In [None]:
!head spa.csv

en,es
Go.,Ve.
Go.,Vete.
Go.,Vaya.
Hi.,Hola.
Run!,¡Corre!
Who?,¿Quién?
Wow!,¡Órale!
Fire!,¡Fuego!
Fire!,¡Incendio!


In [None]:
!pip install transformers datasets sentencepiece sacremoses accelerate -U

Collecting transformers
  Downloading transformers-4.41.1-py3-none-any.whl (9.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m43.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m41.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from datasets import load_dataset
raw_dataset = load_dataset('csv', data_files='spa.csv')

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['en', 'es'],
        num_rows: 30000
    })
})

In [None]:
split = raw_dataset['train'].train_test_split(test_size=0.3, seed=42)
split

DatasetDict({
    train: Dataset({
        features: ['en', 'es'],
        num_rows: 21000
    })
    test: Dataset({
        features: ['en', 'es'],
        num_rows: 9000
    })
})

In [None]:
from transformers import AutoTokenizer

In [None]:
model_checkpoint = 'Helsinki-NLP/opus-mt-en-es'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

In [None]:
en_sentence = split["train"][0]["en"]
es_sentence = split["train"][0]["es"]

inputs = tokenizer(en_sentence)
targets = tokenizer(text_target=es_sentence)

tokenizer.convert_ids_to_tokens(targets['input_ids'])

['▁Yo', '▁puedo', '▁arreglarlo', '.', '</s>']

In [None]:
es_sentence

'Yo puedo arreglarlo.'

In [None]:
max_input_length = 128
max_target_length = 128

In [None]:
def preprocess_function(batch):
  """Tokenize a batch of sentence pairs for seq2seq training.

  English text (`batch["en"]`) becomes the model inputs; Spanish text
  (`batch["es"]`) is tokenized as the target and its token ids are stored
  under "labels". Both sides are truncated to the configured maximum lengths.
  """
  encoded = tokenizer(
      batch["en"], truncation=True, max_length=max_input_length
  )

  # text_target= makes the tokenizer treat the text as the target language
  target_ids = tokenizer(
      text_target=batch["es"], truncation=True, max_length=max_target_length
  )["input_ids"]

  encoded["labels"] = target_ids
  return encoded

In [None]:
tokenized_datasets = split.map(
    preprocess_function,
    batched=True,
    remove_columns=split["train"].column_names
)

Map:   0%|          | 0/21000 [00:00<?, ? examples/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 9000
    })
})

In [None]:
from transformers import DataCollatorForSeq2Seq

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer)

In [None]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(0, 5)])

In [None]:
batch.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [None]:
batch["input_ids"]

tensor([[   33,    88,  9222,    48,     3,     0, 65000, 65000],
        [  552, 11490,     9,   310,   255,     3,     0, 65000],
        [  143,    31,   125,  1208,     3,     0, 65000, 65000],
        [ 1093,   220,  1890,    23,    48,     3,     0, 65000],
        [  124,    20,   100, 18422,    48,   141,     3,     0]])

In [None]:
batch["attention_mask"]

tensor([[1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1]])

In [None]:
batch["labels"]

tensor([[  711,  1039, 44159,     3,     0,  -100,  -100,  -100],
        [ 2722, 18663,   239,   212,     3,     0,  -100,  -100],
        [  539,    43,   155,   960,     3,     0,  -100,  -100],
        [15165,  1250,   380,  3564,    36,  1016,     3,     0],
        [  350,     8, 19153,    29, 31326,     3,     0,  -100]])

In [None]:
tokenizer.all_special_ids

[0, 1, 65000]

In [None]:
tokenizer.all_special_tokens

['</s>', '<unk>', '<pad>']

In [None]:
tokenizer('<pad>')

{'input_ids': [65000, 0], 'attention_mask': [1, 1]}

In [None]:
from torch.utils.data import DataLoader

In [None]:
# Batches are assembled by data_collator, which pads each batch to a common
# length (inputs with the pad token id, labels with -100, as the sample batch
# displayed earlier shows).
# Only the training set is shuffled.
train_loader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=32,
    collate_fn=data_collator
)
valid_loader = DataLoader(
    tokenized_datasets["test"],
    batch_size=32,
    collate_fn=data_collator
)

In [None]:
for batch in train_loader:
  for k, v in batch.items():
    print("k:", k, "v.shape:", v.shape)
  break

k: input_ids v.shape: torch.Size([32, 9])
k: attention_mask v.shape: torch.Size([32, 9])
k: labels v.shape: torch.Size([32, 10])


In [None]:
tokenizer.vocab_size

65001

In [None]:
tokenizer.decode([60000])

'ѕэр'

In [None]:
tokenizer.add_special_tokens({"cls_token": "<s>"})

1

In [None]:
tokenizer("<s>")

{'input_ids': [65001, 0], 'attention_mask': [1, 1]}

In [None]:
tokenizer.vocab_size

65001

In [None]:
# Build the real translation model.
# tokenizer.vocab_size still reports 65001 after "<s>" was added (the added
# token received id 65001), so the embedding tables need one extra row.
encoder = Encoder(
    vocab_size=tokenizer.vocab_size + 1,
    max_len=512,
    d_k=16,
    d_model=64,
    n_heads=4,
    n_layers=2,
    dropout_prob=0.1
)

# The decoder uses the same vocabulary size: the tokenizer is shared between
# source and target text.
decoder = Decoder(
    vocab_size=tokenizer.vocab_size + 1,
    max_len=512,
    d_k=16,
    d_model=64,
    n_heads=4,
    n_layers=2,
    dropout_prob=0.1
)
transformer = Transformer(encoder, decoder)

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
encoder.to(device)
decoder.to(device)

cuda:0


Decoder(
  (embedding): Embedding(65002, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): DecoderBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln3): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mha1): MultiHeadAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
      )
      (mha2): MultiHeadAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
 

In [None]:
criterion = nn.CrossEntropyLoss(ignore_index=-100)
optimizer = torch.optim.Adam(transformer.parameters())

In [None]:
from datetime import datetime

def _make_decoder_inputs(targets, start_token_id, pad_token_id):
  """Build teacher-forcing decoder inputs and their padding mask from labels.

  The labels are shifted one step to the right and prefixed with the start
  token:
    Target         I     Like   Cats   </s>
    Decoder input  <s>   I      Like   Cats
  Label padding (-100) is replaced by the pad token id, and the returned mask
  is 0 at those padded positions and 1 elsewhere.
  """
  dec_input = targets.detach().clone()  # never modify the labels themselves
  dec_input = torch.roll(dec_input, shifts=1, dims=1)
  dec_input[:, 0] = start_token_id  # overwrite the element that wrapped around

  # -100 is only meaningful to the loss; the embedding needs a valid token id
  dec_input = dec_input.masked_fill(dec_input == -100, pad_token_id)

  dec_mask = torch.ones_like(dec_input)
  dec_mask = dec_mask.masked_fill(dec_input == pad_token_id, 0)
  return dec_input, dec_mask


def _batch_loss(model, criterion, batch, start_token_id):
  """Move one collated batch to `device`, run the model and return the loss."""
  batch = {k: v.to(device) for k, v in batch.items()}

  enc_input = batch['input_ids']
  enc_mask = batch['attention_mask']
  targets = batch['labels']

  dec_input, dec_mask = _make_decoder_inputs(
      targets, start_token_id, tokenizer.pad_token_id
  )

  outputs = model(enc_input, dec_input, enc_mask, dec_mask)
  # CrossEntropyLoss expects class scores on dim 1: (N, T, V) -> (N, V, T)
  return criterion(outputs.transpose(2, 1), targets)


def train(model, criterion, optimizer, train_loader, valid_loader, epochs,
          start_token_id=65001):
  """Train `model` for `epochs` epochs, evaluating on `valid_loader` after each.

  Args:
    model: seq2seq model called as model(enc_input, dec_input, enc_mask, dec_mask).
    criterion: loss taking (N, V, T) scores and (N, T) labels.
    optimizer: optimizer over the model parameters.
    train_loader: iterable of batches with 'input_ids', 'attention_mask', 'labels'.
    valid_loader: same format, used for evaluation only.
    epochs: number of passes over the training data.
    start_token_id: id placed at the first decoder position; the default 65001
      is the id the tokenizer assigned to the added "<s>" token.

  Returns:
    (train_losses, test_losses): numpy arrays with the mean loss per epoch.
  """
  train_losses = np.zeros(epochs)
  test_losses = np.zeros(epochs)

  for it in range(epochs):
    model.train()
    t0 = datetime.now()
    train_loss = []
    for batch in train_loader:
      optimizer.zero_grad()
      loss = _batch_loss(model, criterion, batch, start_token_id)

      # backward and optimize
      loss.backward()
      optimizer.step()
      train_loss.append(loss.item())

    train_loss = np.mean(train_loss)

    model.eval()
    test_loss = []
    # evaluation only: skip autograd bookkeeping to save memory and time
    with torch.no_grad():
      for batch in valid_loader:
        loss = _batch_loss(model, criterion, batch, start_token_id)
        test_loss.append(loss.item())
    test_loss = np.mean(test_loss)

    # save losses
    train_losses[it] = train_loss
    test_losses[it] = test_loss

    dt = datetime.now() - t0
    print(f"Epoch {it+1}/{epochs}, Train Loss: {train_loss:.4f}, "
          f"      Test Loss: {test_loss:.4f}, Duration: {dt}")
  return train_losses, test_losses

In [None]:
train_losses, test_losses = train(
    transformer, criterion, optimizer, train_loader, valid_loader, epochs=2
)

Epoch 1/2, Train Loss: 1.2092,       Test Loss: 2.4242, Duration: 0:00:22.536353
Epoch 2/2, Train Loss: 1.1713,       Test Loss: 2.4086, Duration: 0:00:29.628188


In [None]:
input_sentence = split["test"][10]["en"]
input_sentence

'Can I take a day off?'

In [None]:
enc_input = tokenizer(input_sentence, return_tensors='pt')
enc_input

{'input_ids': tensor([[1283,   33,  273,    8,  502,  843,   21,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
dec_input_str = '<s>'
dec_input = tokenizer(text_target=dec_input_str, return_tensors='pt')
dec_input

{'input_ids': tensor([[65001,     0]]), 'attention_mask': tensor([[1, 1]])}

In [None]:
enc_input.to(device)
dec_input.to(device)
output = transformer(
    enc_input["input_ids"],
    dec_input["input_ids"][:, :-1],
    enc_input["attention_mask"],
    dec_input["attention_mask"][:, :-1]
)
output

tensor([[[  2.0436,  -8.6198,  -1.9503,  ..., -10.1670, -10.2276, -11.6991]]],
       device='cuda:0', grad_fn=<ViewBackward0>)

In [None]:
output.shape # N x T x V

torch.Size([1, 1, 65002])

In [None]:
enc_output = encoder(enc_input["input_ids"], enc_input["attention_mask"])
enc_output.shape

torch.Size([1, 8, 64])

In [None]:
dec_output = decoder(
    enc_output,
    dec_input["input_ids"][:, :-1],
    enc_input["attention_mask"],
    dec_input["attention_mask"][:, :-1]
)
dec_output.shape

torch.Size([1, 1, 65002])

In [None]:
torch.allclose(output, dec_output)

True

In [None]:
dec_input_ids = dec_input['input_ids'][:, :-1]
dec_attn_mask = dec_input["attention_mask"][:, :-1]

In [None]:
# Greedy decoding: repeatedly feed the tokens generated so far to the decoder
# and append the most likely next token, for at most 32 new tokens.
decoder.eval()  # make sure dropout is off, independent of earlier cells
with torch.no_grad():  # inference only: no autograd graph needed
  for _ in range(32):
    dec_output = decoder(
        enc_output,
        dec_input_ids,
        enc_input['attention_mask'],
        dec_attn_mask
    )

    # only the logits at the last position predict the next token
    prediction_id = torch.argmax(dec_output[:, -1, :], dim=-1)
    dec_input_ids = torch.hstack((dec_input_ids, prediction_id.view(1, 1)))
    dec_attn_mask = torch.ones_like(dec_input_ids)

    # stop once the end-of-sentence token has been generated
    if prediction_id.item() == tokenizer.eos_token_id:
      break

In [None]:
tokenizer.decode(dec_input_ids[0])

'<s> ¿Puedo tomar un día libre?</s>'

In [None]:
# TODO: wrap the inference steps above (tokenize, encode, greedy decode, detokenize) in a reusable function