<a href="https://colab.research.google.com/github/archyyu/translation-from-RNN-to-transformer/blob/main/machine_translation_by_rnn_with_attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

# Set random seed for reproducibility
torch.manual_seed(42)

<torch._C.Generator at 0x79573d38c2b0>

In [7]:
hidden_size = 100
embedding_dim = 30
learning_rate = 0.001
batch_size = 50
beam_width = 3

In [3]:
url = "https://raw.githubusercontent.com/archyyu/publicResource/main/eng-fra.txt"
response = requests.get(url)
lines = response.text.split('\n')
en_lines = []
fr_lines = []

start_character = '<'
end_character = '>'
padding_character = '&'

for i in range(2000,3000):
  item = lines[i].split('\t')
  en_lines.append(item[0] + '>')
  fr_lines.append('<' + item[1] + '>')

max_len_line_en = max([len(l) for l in en_lines])
max_len_line_fr = max([len(l) for l in fr_lines])

for i in range(len(en_lines)):
  if (len(en_lines[i]) < max_len_line_en):
    en_lines[i] = en_lines[i].ljust(max_len_line_en, padding_character)
  if (len(fr_lines[i]) < max_len_line_fr):
    fr_lines[i] = fr_lines[i].ljust(max_len_line_fr, padding_character)


source_vocab = sorted(set(''.join(en_lines)))
target_vocab = sorted(set(''.join(fr_lines)))

source_vocab_size = len(set(''.join(source_vocab)))
target_vocab_size = len(set(''.join(target_vocab)))

source_char_to_ix = {ch: i for i, ch in enumerate(source_vocab)}
source_ix_to_char = {i: ch for i, ch in enumerate(source_vocab)}

target_char_to_ix = {ch: i for i, ch in enumerate(target_vocab)}
target_ix_to_char = {i: ch for i, ch in enumerate(target_vocab)}

padding_token_index = target_char_to_ix[padding_character]

In [4]:
def line_to_tensor(line):
  result = []
  line_ten = torch.tensor([source_char_to_ix[ch] for ch in test_line], dtype=torch.long).view(1, -1)
  result.append(line_ten)
  return torch.cat(result, dim=0)

def target_line_to_tensor(line):
  result = []
  line_ten = torch.tensor([target_char_to_ix[ch] for ch in test_line], dtype=torch.long).view(1, -1)
  result.append(line_ten)
  return torch.cat(result, dim=0)

en_data = []
fr_data = []
for i in range(len(en_lines)):
  e = torch.tensor([source_char_to_ix[ch] for ch in en_lines[i]], dtype=torch.long).view(1, -1)
  en_data.append(e)
  f = torch.tensor([target_char_to_ix[ch] for ch in fr_lines[i]], dtype=torch.long).view(1, -1)
  fr_data.append(f)

en_data = torch.cat(en_data, dim=0)
fr_data = torch.cat(fr_data, dim=0)

In [5]:
class Attention(nn.Module):
  def __init__(self, hidden_size):
    super(Attention, self).__init__()
    self.attn = nn.Linear(hidden_size * 2, hidden_size)
    self.v = nn.Parameter(torch.rand(hidden_size))

  def forward(self, hidden, encoder_outputs):
    seq_len = encoder_outputs.size(1)
    # Repeat the hidden state for all timesteps
    hidden = hidden.unsqueeze(1).repeat(1, seq_len, 1)
    # Concatenate the hidden state and encoder outputs
    energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=-1)))
    # Calculate attention scores
    attention_scores = torch.matmul(energy, self.v)
    # Convert attention scores to probabilities
    attention_weights = torch.softmax(attention_scores, dim=1)
    # Calculate the attention-weighted sum of encoder outputs
    context_vector = torch.sum(attention_weights.unsqueeze(2) * encoder_outputs, dim=1)
    return context_vector

class Encoder(nn.Module):
  def __init__(self, vocab_size, embedding_dim, hidden_size):
    super(Encoder, self).__init__()
    self.hidden_size = hidden_size
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.i2h = nn.Linear(embedding_dim, hidden_size, bias=False)
    self.h2h = nn.Linear(hidden_size, hidden_size, bias=False)
    self.hb2 = nn.Parameter(torch.zeros(1, hidden_size))

  def forward(self, x):
    h = torch.zeros(1, self.hidden_size)
    h_list = []
    for i in range(x.shape[1]):
      t = self.embedding(x[:,i])
      h = torch.tanh(self.i2h(t) + self.h2h(h) + self.hb2)
      h_list.append(h)
    return torch.stack(h_list, dim=0)


class Decoder(nn.Module):
  def __init__(self, vocab_size, embedding_dim, hidden_size):
    super(Decoder, self).__init__()
    self.hidden_size = hidden_size
    self.embedding_dim = embedding_dim
    self.embedding = nn.Embedding(vocab_size, self.embedding_dim)
    self.i2h = nn.Linear(self.embedding_dim, self.hidden_size, bias=False)
    self.h2h = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
    self.h2o = nn.Linear(self.hidden_size, vocab_size, bias=False)

    self.e2d = nn.Linear(self.hidden_size, vocab_size, bias=False)

    self.hb2 = nn.Parameter(torch.zeros(1, self.hidden_size))
    self.ob = nn.Parameter(torch.zeros(1, vocab_size))

    self.att = Attention(hidden_size)

  def init_state(self, encode_state):
    T,B,C = encode_state.shape
    self.encode_state = encode_state.reshape(B,T,C)


  def forward(self, batch_size):

    # if x is None:
    h = torch.zeros(batch_size, self.hidden_size)
    x = torch.tensor([target_char_to_ix[start_character] for _ in range(batch_size)],dtype=torch.long)
    output = []
    for i in range(max_len_line_fr):
      t = self.embedding(x)
      h = torch.tanh(self.i2h(t) + self.h2h(h) + self.hb2)
      context_state = self.att(h, self.encode_state)
      y = self.e2d(context_state) + self.h2o(h) + self.ob
      p = nn.functional.softmax(y, dim=1)
      ix = torch.argmax(p, dim=-1)
      x = ix
      output.append(y)
    return torch.stack(output, dim=0).permute(1, 0, 2)

  def beam_search(self):
    """
    Perform beam search to generate sequences.
    """
    beams = [(torch.tensor([target_char_to_ix[start_character]], dtype=torch.long), 1.0)]
    h = torch.zeros(1, self.hidden_size)

    for i in range(max_len_line_fr):
      new_beams = []

      for seq, score in beams:
        x = seq[-1].view(1, -1)  # Take the last predicted token

        t = self.embedding(x)
        h = torch.tanh(self.i2h(t) + self.h2h(h) + self.hb2)
        context_state = self.att(h.reshape(-1, self.hidden_size), self.encode_state)
        y = self.e2d(context_state) + self.h2o(h) + self.ob
        p = F.softmax(y, dim=-1)
        top_probs, top_ix = torch.topk(p, beam_width, dim=-1)

        for prob, token_ix in zip(top_probs[0][0], top_ix[0][0]):
          new_seq = torch.cat((seq, torch.tensor([token_ix], dtype=torch.long)), dim=0)
          new_beams.append((new_seq, score * prob.item()))

      beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_width]

    return beams

class Seq2Seq(nn.Module):
  def __init__(self, source_vocab_size, target_vocab_size, embedding_dim, hidden_size):
    super(Seq2Seq, self).__init__()
    self.embedding_dim = embedding_dim
    self.hidden_size = hidden_size
    self.encoder = Encoder(source_vocab_size, self.embedding_dim, self.hidden_size)
    self.decoder = Decoder(target_vocab_size, self.embedding_dim, self.hidden_size)
  def forward(self, source, batch_size):
    hidden_state = self.encoder(source)
    self.decoder.init_state(hidden_state)
    output = self.decoder(batch_size)
    return output

  def translate(self, source):
    hidden_state = self.encoder(source)
    self.decoder.init_state(hidden_state)
    beams = self.decoder.beam_search()
    return beams


# Define your model, loss function, and optimizer
model = Seq2Seq(source_vocab_size, target_vocab_size, embedding_dim, hidden_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

In [27]:
#training
import torch.optim as optim

num_epochs = 3

# Training loop
for epoch in range(num_epochs):
  for p in range(len(en_data) - batch_size - 1):

    source_batch = en_data[p:p+batch_size]
    target_batch = fr_data[p:p+batch_size]

    optimizer.zero_grad()
    # encoder = Encoder(source_vocab_size, embedding_dim, hidden_size)
    # output = encoder(source_batch)
    output = model(source_batch, batch_size)

    output = output.reshape(-1, target_vocab_size)
    # remove the padding tokens when calculate the loss
    # Create a mask to ignore padding tokens
    padding_mask = (target_batch != padding_token_index).float()

    # Compute the loss with the padding mask
    loss = criterion(output, target_batch.view(-1))
    loss = (loss * padding_mask.view(-1)).sum() / padding_mask.sum()

    loss.backward()
    for param in model.parameters():
      if param.grad is not None:
        param.grad.data.clamp_(-5, 5)
    optimizer.step()

    if p%100 == 0:
      # Print or log the training loss for each epoch
      print(f'p {p}, Loss: {loss.item()}')

    p += batch_size


p 0, Loss: 0.912079930305481
p 100, Loss: 0.9531503915786743
p 200, Loss: 1.029158592224121
p 300, Loss: 0.7917560338973999
p 400, Loss: 0.834060788154602
p 500, Loss: 0.7916983962059021
p 600, Loss: 0.7827950716018677
p 700, Loss: 1.1014405488967896
p 800, Loss: 1.0037251710891724
p 900, Loss: 0.9997793436050415
p 0, Loss: 0.924619197845459
p 100, Loss: 0.9614687561988831
p 200, Loss: 1.0369371175765991
p 300, Loss: 0.7861080765724182


KeyboardInterrupt: 

In [31]:
test_line = "I cut myself>"

input = line_to_tensor(test_line)

outputs = model.translate(input)
for tensor,p in outputs:
  result = [target_ix_to_char[j.item()] for j in tensor]
  print(''.join(result))

# outputs = model(input,1)
# result = []
# for i in range(outputs.shape[0]):

#   p = nn.functional.softmax(outputs[i], dim=-1).detach().numpy().ravel()
#   ix = np.random.choice(range(target_vocab_size), p=p)

#   result.append(target_ix_to_char[ix])

# print(''.join(result))

<<Nn   s&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
<<Nn  o &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
<<Nn o &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&


In [20]:
en_lines

["I'm chicken.>&",
 "I'm chicken.>&",
 "I'm correct.>&",
 "I'm curious.>&",
 "I'm curious.>&",
 "I'm dancing.>&",
 "I'm dieting.>&",
 "I'm driving.>&",
 "I'm driving.>&",
 "I'm engaged.>&",
 "I'm engaged.>&",
 "I'm excited.>&",
 "I'm excited.>&",
 "I'm fasting.>&",
 "I'm fasting.>&",
 "I'm finicky.>&",
 "I'm finicky.>&",
 "I'm frantic.>&",
 "I'm frantic.>&",
 "I'm furious.>&",
 "I'm healthy.>&",
 "I'm humming.>&",
 "I'm in luck.>&",
 "I'm in luck.>&",
 "I'm jealous.>&",
 "I'm jittery.>&",
 "I'm kidding.>&",
 "I'm kidding.>&",
 "I'm leaving.>&",
 "I'm married.>&",
 "I'm married.>&",
 "I'm no fool.>&",
 "I'm no hero.>&",
 "I'm no liar.>&",
 "I'm not fat.>&",
 "I'm not mad.>&",
 "I'm not mad.>&",
 "I'm not old.>&",
 "I'm not old.>&",
 "I'm not sad.>&",
 "I'm not shy.>&",
 "I'm on duty.>&",
 "I'm patient.>&",
 "I'm patient.>&",
 "I'm popular.>&",
 "I'm psyched.>&",
 "I'm psychic.>&",
 "I'm psychic.>&",
 "I'm puzzled.>&",
 "I'm reading.>&",
 "I'm relaxed.>&",
 "I'm relaxed.>&",
 "I'm retire

I trained all day, but the performance is still poor. I'll figure out why later. Maybe the RNN is too simple? Let's see.

The problem is that in order to train the model by minibatch, I have to add a lot of padding tokens into the training set. This hurts the performance of the model. As seen in the above example, when I tried to translate the English sentence 'closer look' into French, there are a lot of padding tokens (&&&), which is annoying.

I think if I train the model with individual examples, then the problem could be relieved. However, the downside is that it could be time-consuming.

Anyway, I will try to update another file that will use attention mechanism.

See you in another Colab file

see you in the next week?

I have to take care the new baby in my family. so in the weekend, I didnot have the time to adjust my code, and train the model. I will try to do that in the next week. see you then.

I know, this is a small project, I just want to write those projects to be more familiar with the NLP, which is helpful for me in my future phd study.


I finally found the where the problem.

first I should not only use the final hidden state of the encoder. I should concate all the hidden state, and convert them into decoder.

second for the decoder, for the prediction, the predicted character should be conditioned by the previous characters, the current state, and the state of the encoder.

third, it is for the loss function. when I calcualte the loss, if encounter the padding, I should stop, because the error in the loss could cause the model to go at the wrong direction.

okay, I will ask for more advices from chatGPT, continue this project.

one more thing to mention is that, by doing the second step, I could easily add the attention machaism between the encoder and the decoder.