<a href="https://colab.research.google.com/github/archyyu/translation-from-RNN-to-transformer/blob/main/machine_translation_by_transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import math

In [2]:
url = "https://raw.githubusercontent.com/archyyu/publicResource/main/eng-fra.txt"
response = requests.get(url)
lines = response.text.split('\n')
en_lines = []
fr_lines = []
fr_targets = []

start_character = '<'
end_character = '>'
padding_character = '&'

for i in range(0,10000):
  item = lines[i].split('\t')
  en_lines.append('<' + item[0] + '>')
  fr_lines.append('<' + item[1])
  fr_targets.append(item[1] + '>')

max_len_line_en = max([len(l) for l in en_lines])
max_len_line_fr = max([len(l) for l in fr_lines])
max_len_line_en = max_len_line_fr

for i in range(len(en_lines)):
  if (len(en_lines[i]) < max_len_line_en):
    en_lines[i] = en_lines[i].ljust(max_len_line_en, padding_character)
  if (len(fr_lines[i]) < max_len_line_fr):
    fr_lines[i] = fr_lines[i].ljust(max_len_line_fr, padding_character)
    fr_targets[i] = fr_targets[i].ljust(max_len_line_fr, padding_character)


source_vocab = sorted(set(''.join(en_lines)))
target_vocab = sorted(set(''.join(fr_lines)))
target_vocab.append('>')

source_vocab_size = len(set(''.join(source_vocab)))
target_vocab_size = len(set(''.join(target_vocab)))

source_char_to_ix = {ch: i for i, ch in enumerate(source_vocab)}
source_ix_to_char = {i: ch for i, ch in enumerate(source_vocab)}

target_char_to_ix = {ch: i for i, ch in enumerate(target_vocab)}
target_ix_to_char = {i: ch for i, ch in enumerate(target_vocab)}

padding_token_index = target_char_to_ix[padding_character]

In [3]:
# Hyperparameters
embedding_dim = 200
seq_length = 80
learning_rate = 0.0001
batch_size = 20
num_heads = 10
head_size = 20
layer_num = 4
dropout = 0.1

In [4]:
def line_to_tensor(line):
  result = []
  line_ten = torch.tensor([source_char_to_ix[ch] for ch in test_line], dtype=torch.long).view(1, -1)
  result.append(line_ten)
  return torch.cat(result, dim=0)

def target_line_to_tensor(line):
  result = []
  line_ten = torch.tensor([target_char_to_ix[ch] for ch in test_line], dtype=torch.long).view(1, -1)
  result.append(line_ten)
  return torch.cat(result, dim=0)

en_data = []
fr_data = []
fr_targets_data = []
for i in range(len(en_lines)):
  e = torch.tensor([source_char_to_ix[ch] for ch in en_lines[i]], dtype=torch.long).view(1, -1)
  en_data.append(e)
  f = torch.tensor([target_char_to_ix[ch] for ch in fr_lines[i]], dtype=torch.long).view(1, -1)
  fr_data.append(f)
  f = torch.tensor([target_char_to_ix[ch] for ch in fr_targets[i]], dtype=torch.long).view(1, -1)
  fr_targets_data.append(f)

en_data = torch.cat(en_data, dim=0)
fr_data = torch.cat(fr_data, dim=0)
fr_targets_data = torch.cat(fr_targets_data, dim=0)

In [53]:
class AttentionHead(nn.Module):
  def __init__(self, embed_size, head_size):
    super(AttentionHead, self).__init__()
    self.C = embed_size
    self.head_size = head_size
    self.q = nn.Linear(self.C, head_size, bias=False)
    self.k = nn.Linear(self.C, head_size, bias=False)
    self.v = nn.Linear(self.C, head_size, bias=False)
    self.dropout = nn.Dropout(dropout)

  def forward(self, q, k, v, mask):
    B,T,C = q.shape
    q = self.q(q)
    k = self.k(k)
    v = self.v(v)

    wei = q @ k.transpose(-2, -1) * (self.head_size ** -0.5)
    wei.masked_fill_(mask == 0, -1e9)
    wei = F.softmax(wei, dim=-1)

    out = self.dropout(wei @ v)
    return out


class MultiHeadAttentionBlock(nn.Module):
  def __init__(self, embedding_size, num_heads):
    super(MultiHeadAttentionBlock, self).__init__()
    self.num_heads = num_heads
    head_size = embedding_size // num_heads

    self.heads = nn.ModuleList([
        AttentionHead(embedding_size, head_size) for _ in range(num_heads)
    ])

    self.final_linear = nn.Linear(num_heads * head_size, embedding_size)
    self.dropout = nn.Dropout(dropout)

  def forward(self, q, k, v, mask):

    head_outputs = [head(q, k, v, mask) for head in self.heads]
    concatenated_output = torch.cat(head_outputs, dim=-1)
    final_output = self.final_linear(concatenated_output)
    final_output = self.dropout(final_output)
    return final_output


class LayerNormalization(nn.Module):
  def __init__(self, features):
    super().__init__()
    self.norm = nn.LayerNorm(features)

  def forward(self, x):
    return self.norm(x)

class FeedForwardBlock(nn.Module):

  def __init__(self, d_model: int, d_ff: int) -> None:
    super().__init__()
    self.linear_1 = nn.Linear(d_model, d_ff) # w1 and b1
    self.dropout = nn.Dropout(dropout)
    self.linear_2 = nn.Linear(d_ff, d_model) # w2 and b2

  def forward(self, x):
    # (batch, seq_len, d_model) --> (batch, seq_len, d_ff) --> (batch, seq_len, d_model)
    return self.linear_2(self.dropout(torch.relu(self.linear_1(x))))

class InputEmbeddings(nn.Module):

  def __init__(self, d_model: int, vocab_size: int) -> None:
    super().__init__()
    self.d_model = d_model
    self.vocab_size = vocab_size
    self.embedding = nn.Embedding(vocab_size, d_model)

  def forward(self, x):
    # (batch, seq_len) --> (batch, seq_len, d_model)
    # Multiply by sqrt(d_model) to scale the embeddings according to the paper
    return self.embedding(x) * math.sqrt(self.d_model)


class PositionalEncoding(nn.Module):
  def __init__(self, d_model, seq_len):
    super().__init__()
    self.C = d_model
    T = seq_len
    n = 10000
    pe = torch.zeros((T, self.C))
    for k in range(T):
      for i in torch.arange(int(self.C/2)):
        denominator = torch.pow(n, 2*i/self.C)
        pe[k, 2*i] += torch.sin(k/denominator)
        pe[k, 2*i+1] += torch.cos(k/denominator)

    pe.requires_grad_(False)
    self.register_buffer('pe', pe)

  def forward(self, x):
    return x + self.pe

class ResidualConnection(nn.Module):

  def __init__(self, features: int) -> None:
    super().__init__()
    self.dropout = nn.Dropout(dropout)
    self.norm = LayerNormalization(features)

  def forward(self, x, sublayer):
    return x + self.dropout(sublayer(self.norm(x)))

class EncoderBlock(nn.Module):

  def __init__(self, h, d_model) -> None:
    super().__init__()
    self.self_attention_block = MultiHeadAttentionBlock(d_model, h)
    self.feed_forward_block = FeedForwardBlock(d_model, d_model * 4)
    self.residual_connections = nn.ModuleList([ResidualConnection(d_model) for _ in range(2)])

  def forward(self, x, src_mask):
    x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, src_mask))
    x = self.residual_connections[1](x, self.feed_forward_block)
    return x

class Encoder(nn.Module):

  def __init__(self, h, d_model) -> None:
    super().__init__()
    self.layers = nn.ModuleList([EncoderBlock(h, d_model) for _ in range(layer_num)])
    self.norm = LayerNormalization(d_model)

  def forward(self, x, mask):
    for layer in self.layers:
      x = layer(x, mask)
    return self.norm(x)

class DecoderSelfAttentionBlock(nn.Module):
  def __init__(self, h, d_model):
    super().__init__()
    self.self_attention_block = MultiHeadAttentionBlock(d_model, h)
    self.residual_conn = ResidualConnection(d_model)

  def forward(self, x, k, v, mask):
    x = self.residual_conn(x, lambda x: self.self_attention_block(x, k, v, mask))
    return x

class DecoderCrossAttentionBlock(nn.Module):
  def __init__(self, h, d_model):
    super().__init__()
    self.cross_attention_block = MultiHeadAttentionBlock(d_model, h)
    self.residual_conn = ResidualConnection(d_model)

  def forward(self, x, encoder_output, mask):
    x = self.residual_conn(x, lambda x: self.cross_attention_block(x, encoder_output, encoder_output, mask))
    return x

class DecoderFeedforwardBlock(nn.Module):
  def __init__(self, d_model):
    super().__init__()
    self.feed_forward_block = FeedForwardBlock(d_model, 4*d_model)
    self.residual_conn = ResidualConnection(d_model)

  def forward(self, x):
    x = self.residual_conn(x, self.feed_forward_block)
    return x

class DecoderBlock(nn.Module):

  def __init__(self, h, d_model) -> None:
    super().__init__()
    self.self_attention_block = DecoderSelfAttentionBlock(h, d_model)
    self.cross_attention_block = DecoderCrossAttentionBlock(h, d_model)
    self.feed_forward_block = DecoderFeedforwardBlock(d_model)

  def forward(self, x, encoder_output, src_mask, tgt_mask):
    x = self.self_attention_block(x, x, x, tgt_mask)
    x = self.cross_attention_block(x, encoder_output, src_mask)
    x = self.feed_forward_block(x)
    return x

class Decoder(nn.Module):

  def __init__(self, h, d_model) -> None:
    super().__init__()
    self.layers = nn.ModuleList([DecoderBlock(h, d_model) for i in range(layer_num)])
    self.norm = LayerNormalization(d_model)

  def forward(self, x, encoder_output, src_mask, tgt_mask):
    for layer in self.layers:
      x = layer(x, encoder_output, src_mask, tgt_mask)
    return self.norm(x)

class ProjectionLayer(nn.Module):

  def __init__(self, d_model, vocab_size) -> None:
    super().__init__()
    self.proj = nn.Linear(d_model, vocab_size)

  def forward(self, x) -> None:
    return self.proj(x)

class Transformer(nn.Module):

  def __init__(self, h, d_model, src_vocab_size, tgt_vocab_size, seq_len) -> None:
    super().__init__()
    self.encoder = Encoder(h, d_model)
    self.decoder = Decoder(h, d_model)
    self.src_embed = InputEmbeddings(d_model, src_vocab_size)
    self.tgt_embed = InputEmbeddings(d_model, tgt_vocab_size)
    self.src_pos = PositionalEncoding(d_model, seq_len)
    self.tgt_pos = PositionalEncoding(d_model, seq_len)
    self.projection_layer = ProjectionLayer(d_model, tgt_vocab_size)

  def encode(self, src, src_mask):
    src = self.src_embed(src)
    src = self.src_pos(src)
    return self.encoder(src, src_mask)

  def decode(self, encoder_output, src_mask, tgt, tgt_mask):
    tgt = self.tgt_embed(tgt)
    tgt = self.tgt_pos(tgt)
    return self.decoder(tgt, encoder_output, src_mask, tgt_mask)

  def project(self, x):
    return self.projection_layer(x)

  def forward(self, src, tgt, src_mask, tgt_mask):
    encoder_output = self.encode(src, src_mask)
    decoder_output = self.decode(encoder_output, src_mask, tgt, tgt_mask)
    return self.project(decoder_output)


In [54]:
criterion = nn.CrossEntropyLoss(ignore_index=target_char_to_ix['&'],label_smoothing=0.1)
# model = Transformer(num_heads, embedding_dim, source_vocab_size, target_vocab_size, max_len_line_en, max_len_line_fr, head_size)
model = Transformer(num_heads, embedding_dim, source_vocab_size, target_vocab_size, max_len_line_en)
optimizer = optim.Adam(model.parameters(), lr=learning_rate, eps=1e-9)

def causal_mask(size):
  mask = torch.triu(torch.ones((1, size, size)), diagonal=1).type(torch.int)
  return mask == 0

In [56]:
#training
import torch.optim as optim
batch_size = 8
num_epochs = 10
for epoch in range(num_epochs):
  for p in range(0,len(en_data) - batch_size - 1,batch_size):
    source_batch = en_data[p:p+batch_size]
    target_batch = fr_data[p:p+batch_size]
    actual_target_batch = fr_targets_data[p:p+batch_size]

    src_mask = (source_batch != source_char_to_ix['&']).unsqueeze(1).int()
    tgt_mask = (target_batch != target_char_to_ix['&']).unsqueeze(1).int() & causal_mask(target_batch.shape[1])

    cross_mask = (target_batch != target_char_to_ix['&']).unsqueeze(1).int()

    # print(src_mask.shape)
    # print(tgt_mask.shape)
    # print(mm.shape)

    # break

    optimizer.zero_grad(set_to_none=True)
    output = model(source_batch, target_batch, src_mask, tgt_mask)

    # print(torch.argmax(output, dim=-1))

    # padding_mask = (actual_target_batch != padding_token_index).float()
    loss = criterion(output.view(-1, target_vocab_size), actual_target_batch.reshape(-1))
    # loss = (loss * padding_mask).sum() / padding_mask.sum()

    loss.backward()
    optimizer.step()

    if p % 500 == 0:
      print(f'p {p}, Loss: {loss.item()}')

    # break
  # break

p 0, Loss: 4.702218532562256
p 1000, Loss: 2.973327159881592
p 2000, Loss: 2.353698968887329
p 3000, Loss: 2.5687243938446045
p 4000, Loss: 2.864623785018921
p 5000, Loss: 2.68341326713562
p 6000, Loss: 2.676335096359253
p 7000, Loss: 2.36032772064209
p 8000, Loss: 2.2943074703216553
p 9000, Loss: 2.40937876701355
p 0, Loss: 3.0346388816833496
p 1000, Loss: 2.3620543479919434
p 2000, Loss: 2.03411865234375
p 3000, Loss: 2.3095972537994385
p 4000, Loss: 2.6267452239990234
p 5000, Loss: 2.382294178009033
p 6000, Loss: 2.190484046936035
p 7000, Loss: 2.1710896492004395
p 8000, Loss: 2.1295042037963867
p 9000, Loss: 2.287872076034546
p 0, Loss: 2.778531074523926


KeyboardInterrupt: 

In [57]:
with torch.no_grad():
  output = '<'
  test_line = "Hurry home.>&"
  top_three_values = []
  top_three_values.append(['<', 1])

  for i in range(20):

    top_three_values.sort(key=lambda x: x[1],reverse=True)
    top_three_values = top_three_values[:3]

    new_top_three_values = []

    for prefix in top_three_values:

      input = line_to_tensor(test_line)
      target = target_line_to_tensor(prefix[0])
      src_mask = (input != source_char_to_ix['&']).unsqueeze(1).int()
      tgt_mask = (target != target_char_to_ix['&']).unsqueeze(1).int() & causal_mask(target.shape[1])

      print(input)
      print(target)
      print(src_mask)
      print(tgt_mask)

      exit(False)

      outputs = model(input, target, src_mask, tgt_mask)
      outputs = outputs.squeeze(0)
      softmax_probs = F.softmax(outputs, dim=-1)

      topk_item, topk_indices = torch.topk(softmax_probs[len(output) - 1], k=3)

      for i in range(3):
        sss = prefix[0] + target_ix_to_char[topk_indices[i].item()]
        item = [sss, prefix[1] * topk_item[i]]
        new_top_three_values.append(item)

    top_three_values = new_top_three_values

top_three_values.sort(key=lambda x: x[1],reverse=True)
top_three_values = top_three_values[:3]

for o in top_three_values:
  print(o[0],o[1])

tensor([[28, 66, 63, 63, 70,  0, 53, 60, 58, 50,  7, 19,  3]])
tensor([[23, 60, 57, 57, 64,  0, 47, 54, 52, 44,  7, 90,  3]])
tensor([[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]], dtype=torch.int32)
tensor([[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]], dtype=torch.int32)


RuntimeError: The size of tensor a (13) must match the size of tensor b (58) at non-singleton dimension 1

It is about the Chinese New Year

So I will halt the project contemparaly

and continue after the festival

compared with translation of RNN
the transformer actually need more parameters to train.
if those two model have the same amount of parameters, then rnn will perform better

the main advantage of transformer, I think, is it could scale well. which means if we add more layers, then the performance will increase linearly(i think), but the rnn is not.

in the GPT project, I have tested rnn, if I made the rnn more deep, the overall performance didnot go well.