<a href="https://colab.research.google.com/github/archyyu/translation-from-RNN-to-transformer/blob/main/machine_translation_by_rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

# Set random seed for reproducibility
torch.manual_seed(42)

<torch._C.Generator at 0x7d12903743d0>

In [2]:
hidden_size = 100
embedding_dim = 30
learning_rate = 1e-1 * 0.1
batch_size = 50
beam_width = 3

In [3]:
url = "https://raw.githubusercontent.com/archyyu/publicResource/main/eng-fra.txt"
response = requests.get(url)
lines = response.text.split('\n')
en_lines = []
fr_lines = []

start_character = '<'
end_character = '>'
padding_character = '&'

for i in range(1000,2000):
  item = lines[i].split('\t')
  en_lines.append(item[0] + '>')
  fr_lines.append(item[1] + '>')

max_len_line_en = min([len(l) for l in en_lines])
max_len_line_fr = min([len(l) for l in fr_lines])

for i in range(len(en_lines)):
  if (len(en_lines[i]) > max_len_line_en):
    en_lines[i] = en_lines[i][0:max_len_line_en]
  if (len(fr_lines[i]) > max_len_line_fr):
    fr_lines[i] = fr_lines[i][0:max_len_line_fr]


source_vocab = sorted(set(''.join(en_lines)))
target_vocab = sorted(set(''.join(fr_lines)))
target_vocab.append('<')

source_vocab_size = len(set(''.join(source_vocab)))
target_vocab_size = len(set(''.join(target_vocab)))

source_char_to_ix = {ch: i for i, ch in enumerate(source_vocab)}
source_ix_to_char = {i: ch for i, ch in enumerate(source_vocab)}

target_char_to_ix = {ch: i for i, ch in enumerate(target_vocab)}
target_ix_to_char = {i: ch for i, ch in enumerate(target_vocab)}

# padding_token_index = target_char_to_ix[padding_character]

In [4]:
def line_to_tensor(line):
  result = []
  line_ten = torch.tensor([source_char_to_ix[ch] for ch in test_line], dtype=torch.long).view(1, -1)
  result.append(line_ten)
  return torch.cat(result, dim=0)

def target_line_to_tensor(line):
  result = []
  line_ten = torch.tensor([target_char_to_ix[ch] for ch in test_line], dtype=torch.long).view(1, -1)
  result.append(line_ten)
  return torch.cat(result, dim=0)

en_data = []
fr_data = []
for i in range(len(en_lines)):
  e = torch.tensor([source_char_to_ix[ch] for ch in en_lines[i]], dtype=torch.long).view(1, -1)
  en_data.append(e)
  f = torch.tensor([target_char_to_ix[ch] for ch in fr_lines[i]], dtype=torch.long).view(1, -1)
  fr_data.append(f)

en_data = torch.cat(en_data, dim=0)
fr_data = torch.cat(fr_data, dim=0)

In [38]:
class Encoder(nn.Module):
  def __init__(self, vocab_size, embedding_dim, hidden_size):
    super(Encoder, self).__init__()
    self.hidden_size = hidden_size
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.i2h = nn.Linear(embedding_dim, hidden_size, bias=False)
    self.h2h = nn.Linear(hidden_size, hidden_size, bias=False)
    self.hb2 = nn.Parameter(torch.zeros(1, hidden_size))

  def forward(self, x):
    h = torch.zeros(1, self.hidden_size)
    for i in range(x.shape[1]):
      t = self.embedding(x[:,i])
      h = torch.tanh(self.i2h(t) + self.h2h(h) + self.hb2)
    return h


class Decoder(nn.Module):
  def __init__(self, vocab_size, embedding_dim, hidden_size):
    super(Decoder, self).__init__()
    self.hidden_size = hidden_size
    self.embedding_dim = embedding_dim
    self.embedding = nn.Embedding(vocab_size, self.embedding_dim)
    self.i2h = nn.Linear(self.embedding_dim, self.hidden_size, bias=False)
    self.h2h = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
    self.h2o = nn.Linear(self.hidden_size * 2, vocab_size, bias=False)

    self.hb2 = nn.Parameter(torch.zeros(1, self.hidden_size))
    self.ob = nn.Parameter(torch.zeros(1, vocab_size))

  def init_state(self, encode_state):
    self.encode_state = encode_state


  # def forward(self, target):

  #   # if x is None:
  #   h = torch.zeros(1, self.hidden_size)
  #   output = []
  #   for i in range(max_len_line_fr):
  #     x = target[:,i]
  #     t = self.embedding(x)
  #     h = torch.tanh(self.i2h(t) + self.h2h(h) + self.hb2)
  #     y = self.h2o(torch.cat((self.encode_state, h), dim=-1)) + self.ob
  #     output.append(y)
  #   return torch.stack(output, dim=0)

  def forward(self, batch_size):

    # if x is None:
    h = torch.zeros(1, self.hidden_size)
    x = torch.tensor([target_char_to_ix[start_character] for _ in range(batch_size)],dtype=torch.long)
    output = []
    for i in range(max_len_line_fr):
      t = self.embedding(x)
      h = torch.tanh(self.i2h(t) + self.h2h(h) + self.hb2)
      y = self.h2o(torch.cat((self.encode_state, h), dim=-1)) + self.ob
      p = nn.functional.softmax(y, dim=1)
      ix = torch.argmax(p, dim=-1)
      x = ix
      output.append(y)
    return torch.stack(output, dim=0).permute(1, 0, 2)

  def beam_search(self):
    """
    Perform beam search to generate sequences.
    """
    beams = [(torch.tensor([target_char_to_ix[start_character]], dtype=torch.long), 1.0)]
    h = torch.zeros(1, self.hidden_size)

    for i in range(max_len_line_fr):
      new_beams = []

      for seq, score in beams:
        x = seq[-1].view(1, -1)  # Take the last predicted token

        t = self.embedding(x)
        h = torch.tanh(self.i2h(t) + self.h2h(h) + self.hb2)
        y = self.h2o(torch.cat((self.encode_state.unsqueeze(1), h), dim=-1)) + self.ob
        p = F.softmax(y, dim=-1)
        top_probs, top_ix = torch.topk(p, beam_width, dim=-1)

        for prob, token_ix in zip(top_probs[0][0], top_ix[0][0]):
          new_seq = torch.cat((seq, torch.tensor([token_ix], dtype=torch.long)), dim=0)
          new_beams.append((new_seq, score * prob.item()))

      beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_width]

    return beams

class Seq2Seq(nn.Module):
  def __init__(self, source_vocab_size, target_vocab_size, embedding_dim, hidden_size):
    super(Seq2Seq, self).__init__()
    self.embedding_dim = embedding_dim
    self.hidden_size = hidden_size
    self.encoder = Encoder(source_vocab_size, self.embedding_dim, self.hidden_size)
    self.decoder = Decoder(target_vocab_size, self.embedding_dim, self.hidden_size)
  def forward(self, source, batch_size):
    hidden_state = self.encoder(source)
    self.decoder.init_state(hidden_state)
    output = self.decoder(batch_size)
    return output

  def translate(self, source):
    hidden_state = self.encoder(source)
    self.decoder.init_state(hidden_state)
    beams = self.decoder.beam_search()
    return beams


# Define your model, loss function, and optimizer
model = Seq2Seq(source_vocab_size, target_vocab_size, embedding_dim, hidden_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

In [49]:
#training
import torch.optim as optim

num_epochs = 300

# Training loop
for epoch in range(num_epochs):
  for p in range(0,len(en_data) - batch_size - 1,batch_size):

    source_batch = en_data[p:p+batch_size]
    target_batch = fr_data[p:p+batch_size]

    optimizer.zero_grad()

    output = model(source_batch, batch_size)

    # remove the padding tokens when calculate the loss
    # Create a mask to ignore padding tokens
    # padding_mask = (target_batch != padding_token_index).float()
    output = output.reshape(-1, target_vocab_size)

    # Compute the loss with the padding mask
    loss = criterion(output, target_batch.view(-1))
    # loss = (loss * padding_mask.view(-1)).sum() / padding_mask.sum()

    # for i in range(target_batch.shape[0]):
    #   line = []
    #   for j in range(len(target_batch[i])):
    #     line.append(target_ix_to_char[target_batch[i][j].item()])
    #   print(''.join(line))

    # print("---")
    # probability_np = output.detach().numpy()
    # Find the index of the maximum probability along the last dimension
    # max_index_np = np.argmax(probability_np, axis=-1)
    # Convert the resulting NumPy array back to a PyTorch tensor
    # output = torch.tensor(max_index_np).reshape(target_batch.shape)
    # for i in range(output.shape[0]):
    #   line = []
    #   for j in range(len(output[i])):
    #     line.append(target_ix_to_char[output[i][j].item()])
    #   print(''.join(line))
    # print(loss.item())
    # print(target_batch.shape)
    # print(output.shape)
    # break

    loss.backward()
    for param in model.parameters():
      if param.grad is not None:
        param.grad.data.clamp_(-5, 5)
    optimizer.step()

    if p%300 == 0:
      # Print or log the training loss for each epoch
      print(f'p {p}, Loss: {loss.item()}')

    # break
  # break

p 0, Loss: 2.0768420696258545
p 300, Loss: 2.582000494003296
p 600, Loss: 2.5315101146698
p 900, Loss: 2.0100529193878174
p 0, Loss: 2.0279698371887207
p 300, Loss: 2.4813873767852783
p 600, Loss: 2.4581587314605713
p 900, Loss: 1.7988905906677246
p 0, Loss: 2.1648037433624268
p 300, Loss: 2.6059515476226807
p 600, Loss: 2.6002471446990967
p 900, Loss: 1.9809889793395996
p 0, Loss: 2.0555474758148193
p 300, Loss: 2.4510719776153564
p 600, Loss: 2.4972212314605713
p 900, Loss: 1.905595302581787
p 0, Loss: 2.194617748260498
p 300, Loss: 2.6806352138519287
p 600, Loss: 2.361604928970337
p 900, Loss: 2.4408352375030518
p 0, Loss: 1.993909239768982
p 300, Loss: 2.4522523880004883
p 600, Loss: 2.6946771144866943
p 900, Loss: 2.058340549468994
p 0, Loss: 2.3536906242370605
p 300, Loss: 2.731320858001709
p 600, Loss: 2.4768028259277344
p 900, Loss: 2.1593918800354004
p 0, Loss: 2.3043787479400635
p 300, Loss: 2.5196640491485596
p 600, Loss: 2.6683099269866943
p 900, Loss: 1.9398373365402222
p 

In [50]:
test_line = "I like art.>"

input = line_to_tensor(test_line)


outputs = model.translate(input)
for tensor,p in outputs:
  result = [target_ix_to_char[j.item()] for j in tensor]
  print(''.join(result))

# outputs = model(input,1)
# result = []
# for i in range(outputs.shape[0]):

#   p = nn.functional.softmax(outputs[i], dim=-1).detach().numpy().ravel()
#   ix = np.random.choice(range(target_vocab_size), p=p)

#   result.append(target_ix_to_char[ix])

# print(''.join(result))

<Je'JÊeJ
<JeeJMes
<JeeJMe 


In [None]:
en_lines

['I guess so.>',
 'I guess so.>',
 'I had help.>',
 'I hate you.>',
 'I hate you.>',
 'I have one.>',
 'I have one.>',
 'I have won.>',
 'I have won.>',
 'I help him.>',
 'I hope not.>',
 'I hope not.>',
 'I know CPR.>',
 'I know her.>',
 'I know him.>',
 'I like art.>',
 'I like him.>',
 'I like him.>',
 'I like tea.>',
 'I like you.>',
 'I like you.>',
 'I like you.>',
 'I liked it.>',
 'I liked it.>',
 'I love Tom.>',
 'I love tea.>',
 'I love you.>',
 'I love you.>',
 'I loved it.>',
 'I made tea.>',
 'I made two.>',
 'I made two.>',
 'I met them.>',
 'I met them.>',
 'I met them.>',
 'I must run.>',
 'I must run.>',
 'I need air.>',
 'I need air.>',
 'I need ice.>',
 'I need you.>',
 'I need you.>',
 'I panicked.>',
 'I promised.>',
 'I ran away.>',
 'I ran home.>',
 'I remember.>',
 'I remember.>',
 'I remember.>',
 'I said yes.>',
 'I sat down.>',
 'I saw that.>',
 'I saw them.>',
 'I saw them.>',
 'I saw them.>',
 'I screamed.>',
 'I see them.>',
 'I survived.>',
 'I threw up.>

I trained all day, but the performance is still poor. I'll figure out why later. Maybe the RNN is too simple? Let's see.

The problem is that in order to train the model by minibatch, I have to add a lot of padding tokens into the training set. This hurts the performance of the model. As seen in the above example, when I tried to translate the English sentence 'closer look' into French, there are a lot of padding tokens (&&&), which is annoying.

I think if I train the model with individual examples, then the problem could be relieved. However, the downside is that it could be time-consuming.

Anyway, I will try to update another file that will use attention mechanism.

See you in another Colab file

see you in the next week?

I have to take care the new baby in my family. so in the weekend, I didnot have the time to adjust my code, and train the model. I will try to do that in the next week. see you then.

I know, this is a small project, I just want to write those projects to be more familiar with the NLP, which is helpful for me in my future phd study.


I finally found the where the problem.

first I should not only use the final hidden state of the encoder. I should concate all the hidden state, and convert them into decoder.

second for the decoder, for the prediction, the predicted character should be conditioned by the previous characters, the current state, and the state of the encoder.

third, it is for the loss function. when I calcualte the loss, if encounter the padding, I should stop, because the error in the loss could cause the model to go at the wrong direction.

okay, I will ask for more advices from chatGPT, continue this project.

one more thing to mention is that, by doing the second step, I could easily add the attention machaism between the encoder and the decoder.

Now
I changed some code in here, but the performance is still poor.
what I change:
1: use the final hidden state of the encoder, when predict the next token.
2: when calculating the loss function, remove the padding tokens.


okay, I think I could pause now,
I will create another file to run the seq-2seq with attention, and check its performance. then use the transformers.

if the performance is still poor, then I will use the word-level token not the character level.

see you

I have changed the approach in which the hidden_state of the encoder influlence the decoder prediction, but the performance is still poor.
before using the word-level token, I will debug the training.
let see.

after debugging, I finnaly find the clue why the perforamcne is poor
after each training step. the return result of the forward function is [T,B,C].
when I use view(-1,C) to feed into the criterion. but then the order of the tokens is wrong. So I use permute to reshape the result.

the result is still not perfect, but get better know.
