<a href="https://colab.research.google.com/github/archyyu/translation-from-RNN-to-transformer/blob/main/machine_translation_by_transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import requests
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import math

In [14]:
url = "https://raw.githubusercontent.com/archyyu/publicResource/main/eng-fra.txt"
response = requests.get(url)
lines = response.text.split('\n')
en_lines = []
fr_lines = []

start_character = '<'
end_character = '>'
padding_character = '&'

for i in range(20000,30000):
  item = lines[i].split('\t')
  en_lines.append('<' + item[0] + '>')
  fr_lines.append('<' + item[1] + '>')

max_len_line_en = max([len(l) for l in en_lines])
max_len_line_fr = max([len(l) for l in fr_lines])
max_len_line_en = max_len_line_fr

for i in range(len(en_lines)):
  if (len(en_lines[i]) < max_len_line_en):
    en_lines[i] = en_lines[i].ljust(max_len_line_en, padding_character)
  if (len(fr_lines[i]) < max_len_line_fr):
    fr_lines[i] = fr_lines[i].ljust(max_len_line_fr, padding_character)


source_vocab = sorted(set(''.join(en_lines)))
target_vocab = sorted(set(''.join(fr_lines)))

source_vocab_size = len(set(''.join(source_vocab)))
target_vocab_size = len(set(''.join(target_vocab)))

source_char_to_ix = {ch: i for i, ch in enumerate(source_vocab)}
source_ix_to_char = {i: ch for i, ch in enumerate(source_vocab)}

target_char_to_ix = {ch: i for i, ch in enumerate(target_vocab)}
target_ix_to_char = {i: ch for i, ch in enumerate(target_vocab)}

In [15]:
# Hyperparameters
embedding_dim = 64
seq_length = 80
learning_rate = 1e-1
batch_size = 20
num_heads = 4
head_size = 12
head_num = 4
layer_num = 4
dropout = 0.2

In [16]:
def line_to_tensor(line):
  result = []
  line_ten = torch.tensor([source_char_to_ix[ch] for ch in test_line], dtype=torch.long).view(1, -1)
  result.append(line_ten)
  return torch.cat(result, dim=0)

def target_line_to_tensor(line):
  result = []
  line_ten = torch.tensor([target_char_to_ix[ch] for ch in test_line], dtype=torch.long).view(1, -1)
  result.append(line_ten)
  return torch.cat(result, dim=0)

en_data = []
fr_data = []
for i in range(len(en_lines)):
  e = torch.tensor([source_char_to_ix[ch] for ch in en_lines[i]], dtype=torch.long).view(1, -1)
  en_data.append(e)
  f = torch.tensor([target_char_to_ix[ch] for ch in fr_lines[i]], dtype=torch.long).view(1, -1)
  fr_data.append(f)

en_data = torch.cat(en_data, dim=0)
fr_data = torch.cat(fr_data, dim=0)

In [17]:
class AttentionHead(nn.Module):
  def __init__(self, embed_size, head_size):
    super(AttentionHead, self).__init__()
    self.C = embed_size
    self.head_size = head_size
    self.q = nn.Linear(self.C, head_size, bias=False)
    self.k = nn.Linear(self.C, head_size, bias=False)
    self.v = nn.Linear(self.C, head_size, bias=False)

  def forward(self, x):
    B,T,C = x.shape
    q = self.q(x)
    k = self.k(x)
    v = self.v(x)

    wei = q @ k.transpose(-2, -1) * (self.head_size ** -0.5)
    wei = F.softmax(wei, dim=-1)

    out = wei @ v
    return out

class CrossAttentionHead(nn.Module):
  def __init__(self, embed_size, head_size):
    super(CrossAttentionHead, self).__init__()
    self.C = embed_size
    self.head_size = head_size
    self.q = nn.Linear(self.C, head_size, bias=False)
    self.k = nn.Linear(self.C, head_size, bias=False)
    self.v = nn.Linear(self.C, head_size, bias=False)
    self.register_buffer('tril',torch.tril(torch.ones(seq_length, seq_length)))

  def forward(self, x, q):
    B,T,C = q.shape
    q = self.q(q)
    k = self.k(x)
    v = self.v(x)

    wei = q @ k.transpose(-2, -1) * (self.head_size ** -0.5)
    wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
    wei = F.softmax(wei, dim=-1)

    out = wei @ v
    return out


class EncoderMultiHeadAttention(nn.Module):
  def __init__(self, num_heads, embedding_size, head_size):
    super(EncoderMultiHeadAttention, self).__init__()
    self.num_heads = num_heads

    self.heads = nn.ModuleList([
        AttentionHead(embedding_size, head_size) for _ in range(num_heads)
    ])

    self.final_linear = nn.Linear(num_heads * head_size, embedding_size)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):

    head_outputs = [head(x) for head in self.heads]
    concatenated_output = torch.cat(head_outputs, dim=-1)
    final_output = self.final_linear(concatenated_output)
    final_output = self.dropout(final_output)
    return final_output

class DecoderMultiHeadAttention(nn.Module):
  def __init__(self, num_heads, embedding_size, head_size):
    super(DecoderMultiHeadAttention, self).__init__()
    self.num_heads = num_heads

    self.heads = nn.ModuleList([
        CrossAttentionHead(embedding_size, head_size) for _ in range(num_heads)
    ])

    self.final_linear = nn.Linear(num_heads * head_size, embedding_size)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x, encoder_x):
    #x, encoder_x if those two are same, then it is self-attention, else cross-attention
    head_outputs = [head(x, encoder_x) for head in self.heads]
    concatenated_output = torch.cat(head_outputs, dim=-1)
    final_output = self.final_linear(concatenated_output)
    final_output = self.dropout(final_output)
    return final_output



class FeedFoward(nn.Module):
  def __init__(self, embedding_size):
    super().__init__()
    self.net = nn.Sequential(
      nn.Linear(embedding_size, 4 * embedding_size),
      nn.ReLU(),
      nn.Linear(4 * embedding_size, embedding_size),
      nn.Dropout(dropout),
    )

  def forward(self, x):
    return self.net(x)

class EncoderBlockAttention(nn.Module):
  def __init__(self, num_heads, embedding_size, head_size):
    super(EncoderBlockAttention, self).__init__()
    self.multiheads = EncoderMultiHeadAttention(num_heads, embedding_size, head_size)
    self.fw = FeedFoward(embedding_size)
    self.norm1 = nn.LayerNorm(embedding_size)
    self.norm2 = nn.LayerNorm(embedding_size)

  def forward(self, x):
    inter_result = x + self.multiheads(self.norm1(x))
    final_output = x + self.fw(self.norm2(inter_result))
    return final_output


class DecoderBlockAttention(nn.Module):
  def __init__(self, num_heads, embedding_size, head_size):
    super(DecoderBlockAttention, self).__init__()
    self.multiheads = DecoderMultiHeadAttention(num_heads, embedding_size, head_size)
    self.fw = FeedFoward(embedding_size)
    self.norm1 = nn.LayerNorm(embedding_size)
    self.norm2 = nn.LayerNorm(embedding_size)

  def forward(self, x, encoder_x):
    inter_result = x + self.multiheads(self.norm1(x), encoder_x)
    final_output = x + self.fw(self.norm2(inter_result))
    return final_output

In [18]:
class Decoder(nn.Module):
  def __init__(self, num_heads, vocab_size, embedding_size, head_size):
    super(Decoder, self).__init__()

    self.em = nn.Embedding(vocab_size, embedding_size)
    self.pos_encode = nn.Embedding(seq_length, embedding_size)
    self.selfattentionblocks = nn.ModuleList([DecoderBlockAttention(num_heads, embedding_size, head_size) for _ in range(4)])
    self.crossattentionblocks = nn.ModuleList([DecoderBlockAttention(num_heads, embedding_size, head_size) for _ in range(4)])
    self.f_norm = nn.LayerNorm(embedding_size)
    self.fw = nn.Linear(embedding_size, vocab_size, bias=False)

  def forward(self, x, encoder_x):
    B,T = x.shape
    x_em = self.em(x)
    p_em = self.pos_encode(torch.arange(T))
    x = x_em + p_em
    for block in self.selfattentionblocks:
      x = block(x, encoder_x)
    for block in self.crossattentionblocks:
      x = block(x, encoder_x)
    x = self.f_norm(x)
    x = self.fw(x)
    return x

class Encoder(nn.Module):
  def __init__(self, num_heads, vocab_size, embedding_size, head_size):
    super(Encoder, self).__init__()
    self.em = nn.Embedding(vocab_size, embedding_size)
    self.pos_encode = nn.Embedding(seq_length, embedding_size)
    self.blocks = nn.ModuleList([EncoderBlockAttention(num_heads, embedding_size, head_size) for _ in range(4)])
    self.f_norm = nn.LayerNorm(embedding_size)
    self.fw = nn.LayerNorm(embedding_size, vocab_size, bias=False)
  def forward(self, x):
    B,T = x.shape
    x_em = self.em(x)
    p_em = self.pos_encode(torch.arange(T))
    x = x_em + p_em
    for block in self.blocks:
      x = block(x)
    x = self.f_norm(x)
    x = self.fw(x)
    return x

class Transformer(nn.Module):
  def __init__(self, num_heads, embed_size, input_vocab_size, target_vocab_size, source_seq_length, target_seq_length, head_size):
    super(Transformer, self).__init__()
    self.encoder = Encoder(num_heads, input_vocab_size, embed_size, head_size)
    self.decoder = Decoder(num_heads, target_vocab_size, embed_size, head_size)

  def forward(self, encoder_input, decoder_input):
    encoder_output = self.encoder(encoder_input)
    decoder_output = self.decoder(decoder_input, encoder_output)
    return decoder_output

In [19]:
criterion = nn.CrossEntropyLoss()
model = Transformer(num_heads, embedding_dim, source_vocab_size, target_vocab_size, max_len_line_en, max_len_line_fr, head_size)
optimizer = optim.Adagrad(model.parameters(), lr=learning_rate)

In [20]:
#training
import torch.optim as optim

num_epochs = 30

# Training loop
for epoch in range(num_epochs):
  for p in range(0,len(en_data) - batch_size - 1,batch_size):

    source_batch = en_data[p:p+batch_size]
    target_batch = fr_data[p:p+batch_size]

    optimizer.zero_grad()
    output = model(source_batch, target_batch)

    B,T,C = output.shape

    # Compute the loss with the padding mask
    loss = criterion(output.view(B*T, -1), target_batch.view(-1))

    loss.backward()
    for param in model.parameters():
      if param.grad is not None:
        param.grad.data.clamp_(-5, 5)
    optimizer.step()

    if p%500 == 0:
      # Print or log the training loss for each epoch
      print(f'p {p}, Loss: {loss.item()}')


p 0, Loss: 4.861046314239502
p 500, Loss: 1.9869235754013062
p 1000, Loss: 1.2868788242340088
p 1500, Loss: 1.2082597017288208
p 2000, Loss: 1.5616120100021362
p 2500, Loss: 1.3507484197616577
p 3000, Loss: 1.0626707077026367
p 3500, Loss: 0.6152390837669373
p 4000, Loss: 0.04362526163458824
p 4500, Loss: 0.03119976632297039
p 5000, Loss: 0.05766305327415466
p 5500, Loss: 0.051149796694517136
p 6000, Loss: 0.008226211182773113


KeyboardInterrupt: 

In [45]:
test_line = "We're armed>&"

input = line_to_tensor(test_line)

outputs = model(input,target_line_to_tensor('<'))
outputs = outputs.squeeze(0)

softmax_probs = F.softmax(outputs, dim=-1)
max_indices = torch.argmax(softmax_probs, dim=-1)

result = []
for i in range(max_indices.size(0)):
  result.append(target_ix_to_char[max_indices[i].item()])


print(''.join(result))

# outputs = model(input,1)
# result = []
# for i in range(outputs.shape[0]):

#   p = nn.functional.softmax(outputs[i], dim=-1).detach().numpy().ravel()
#   ix = np.random.choice(range(target_vocab_size), p=p)

#   result.append(target_ix_to_char[ix])

# print(''.join(result))

&eere armed>&


It is about the Chinese New Year
So will stop that repo contemparaly

will continue after the festival