<a href="https://colab.research.google.com/github/archyyu/translation-from-RNN-to-transformer/blob/main/machine_translation_by_rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

# Set random seed for reproducibility
torch.manual_seed(42)

<torch._C.Generator at 0x7a0dc909c390>

In [None]:
hidden_size = 100
embedding_dim = 30
learning_rate = 0.001
batch_size = 50
beam_width = 3

In [None]:
url = "https://raw.githubusercontent.com/archyyu/publicResource/main/eng-fra.txt"
response = requests.get(url)
lines = response.text.split('\n')
en_lines = []
fr_lines = []

start_character = '<'
end_character = '>'
padding_character = '&'

for i in range(0,3000):
  item = lines[i].split('\t')
  en_lines.append(item[0] + '>')
  fr_lines.append('<' + item[1] + '>')

max_len_line_en = max([len(l) for l in en_lines])
max_len_line_fr = max([len(l) for l in fr_lines])

for i in range(len(en_lines)):
  if (len(en_lines[i]) < max_len_line_en):
    en_lines[i] = en_lines[i].ljust(max_len_line_en, padding_character)
  if (len(fr_lines[i]) < max_len_line_fr):
    fr_lines[i] = fr_lines[i].ljust(max_len_line_fr, padding_character)


source_vocab = sorted(set(''.join(en_lines)))
target_vocab = sorted(set(''.join(fr_lines)))

source_vocab_size = len(set(''.join(source_vocab)))
target_vocab_size = len(set(''.join(target_vocab)))

source_char_to_ix = {ch: i for i, ch in enumerate(source_vocab)}
source_ix_to_char = {i: ch for i, ch in enumerate(source_vocab)}

target_char_to_ix = {ch: i for i, ch in enumerate(target_vocab)}
target_ix_to_char = {i: ch for i, ch in enumerate(target_vocab)}

padding_token_index = target_char_to_ix[padding_character]

In [None]:
def line_to_tensor(line):
  result = []
  line_ten = torch.tensor([source_char_to_ix[ch] for ch in test_line], dtype=torch.long).view(1, -1)
  result.append(line_ten)
  return torch.cat(result, dim=0)

def target_line_to_tensor(line):
  result = []
  line_ten = torch.tensor([target_char_to_ix[ch] for ch in test_line], dtype=torch.long).view(1, -1)
  result.append(line_ten)
  return torch.cat(result, dim=0)

en_data = []
fr_data = []
for i in range(len(en_lines)):
  e = torch.tensor([source_char_to_ix[ch] for ch in en_lines[i]], dtype=torch.long).view(1, -1)
  en_data.append(e)
  f = torch.tensor([target_char_to_ix[ch] for ch in fr_lines[i]], dtype=torch.long).view(1, -1)
  fr_data.append(f)

en_data = torch.cat(en_data, dim=0)
fr_data = torch.cat(fr_data, dim=0)

In [None]:
class Encoder(nn.Module):
  def __init__(self, vocab_size, embedding_dim, hidden_size):
    super(Encoder, self).__init__()
    self.hidden_size = hidden_size
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.i2h = nn.Linear(embedding_dim, hidden_size, bias=False)
    self.h2h = nn.Linear(hidden_size, hidden_size, bias=False)
    self.hb2 = nn.Parameter(torch.zeros(1, hidden_size))

  def forward(self, x):
    h = torch.zeros(1, self.hidden_size)
    for i in range(x.shape[1]):
      t = self.embedding(x[:,i])
      h = torch.tanh(self.i2h(t) + self.h2h(h) + self.hb2)
    return h


class Decoder(nn.Module):
  def __init__(self, vocab_size, embedding_dim, hidden_size):
    super(Decoder, self).__init__()
    self.hidden_size = hidden_size
    self.embedding_dim = embedding_dim
    self.embedding = nn.Embedding(vocab_size, self.embedding_dim)
    self.i2h = nn.Linear(self.embedding_dim, self.hidden_size, bias=False)
    self.h2h = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
    self.h2o = nn.Linear(self.hidden_size, vocab_size, bias=False)

    self.e2d = nn.Linear(self.hidden_size, vocab_size, bias=False)

    self.hb2 = nn.Parameter(torch.zeros(1, self.hidden_size))
    self.ob = nn.Parameter(torch.zeros(1, vocab_size))

  def init_state(self, encode_state):
    self.encode_state = encode_state


  def forward(self, target):

    # if x is None:
    h = torch.zeros(1, self.hidden_size)
    output = []
    for i in range(max_len_line_fr):
      x = target[:,i]
      t = self.embedding(x)
      h = torch.tanh(self.i2h(t) + self.h2h(h) + self.hb2)
      y = self.e2d(self.encode_state) + self.h2o(h) + self.ob
      output.append(y)
    return torch.stack(output, dim=0)

  def forward1(self, batch_size):

    # if x is None:
    h = torch.zeros(1, self.hidden_size)
    x = torch.tensor([target_char_to_ix[start_character] for _ in range(batch_size)],dtype=torch.long)
    output = []
    for i in range(max_len_line_fr):
      t = self.embedding(x)
      h = torch.tanh(self.i2h(t) + self.h2h(h) + self.hb2)
      y = self.e2d(self.encode_state) + self.h2o(h) + self.ob
      p = nn.functional.softmax(y, dim=1)
      ix = torch.argmax(p, dim=-1)
      x = ix
      output.append(y)
    return torch.stack(output, dim=0)

  def beam_search(self):
    """
    Perform beam search to generate sequences.
    """
    beams = [(torch.tensor([target_char_to_ix[start_character]], dtype=torch.long), 1.0)]
    h = torch.zeros(1, self.hidden_size)

    for i in range(max_len_line_fr):
      new_beams = []

      for seq, score in beams:
        x = seq[-1].view(1, -1)  # Take the last predicted token

        t = self.embedding(x)
        h = torch.tanh(self.i2h(t) + self.h2h(h) + self.hb2)
        y = self.e2d(self.encode_state) + self.h2o(h) + self.ob
        p = F.softmax(y, dim=-1)
        top_probs, top_ix = torch.topk(p, beam_width, dim=-1)

        for prob, token_ix in zip(top_probs[0][0], top_ix[0][0]):
          new_seq = torch.cat((seq, torch.tensor([token_ix], dtype=torch.long)), dim=0)
          new_beams.append((new_seq, score * prob.item()))

      beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_width]

    return beams

class Seq2Seq(nn.Module):
  def __init__(self, source_vocab_size, target_vocab_size, embedding_dim, hidden_size):
    super(Seq2Seq, self).__init__()
    self.embedding_dim = embedding_dim
    self.hidden_size = hidden_size
    self.encoder = Encoder(source_vocab_size, self.embedding_dim, self.hidden_size)
    self.decoder = Decoder(target_vocab_size, self.embedding_dim, self.hidden_size)
  def forward(self, source, batch_size):
    hidden_state = self.encoder(source)
    self.decoder.init_state(hidden_state)
    output = self.decoder(batch_size)
    return output

  def translate(self, source):
    hidden_state = self.encoder(source)
    self.decoder.init_state(hidden_state)
    beams = self.decoder.beam_search()
    return beams


# Define your model, loss function, and optimizer
model = Seq2Seq(source_vocab_size, target_vocab_size, embedding_dim, hidden_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

In [None]:
encoder = Encoder(source_vocab_size, embedding_dim, hidden_size)

p = 1

source_batch = en_data[p:p+batch_size]
target_batch = fr_data[p:p+batch_size]

hidden = encoder(source_batch)
hidden

tensor([[ 0.5469, -0.3315, -0.6971,  ...,  0.2071,  0.8874,  0.0376],
        [ 0.5469, -0.3315, -0.6971,  ...,  0.2071,  0.8874,  0.0376],
        [ 0.5468, -0.3316, -0.6972,  ...,  0.2073,  0.8874,  0.0374],
        ...,
        [ 0.5477, -0.3328, -0.6926,  ...,  0.2077,  0.8883,  0.0472],
        [ 0.5477, -0.3343, -0.6931,  ...,  0.2077,  0.8885,  0.0441],
        [ 0.5477, -0.3343, -0.6931,  ...,  0.2077,  0.8885,  0.0441]],
       grad_fn=<TanhBackward0>)

In [None]:
x = torch.tensor([target_char_to_ix['<'] for _ in range(batch_size)],dtype=torch.long)
print(x.shape)

torch.Size([50])


In [None]:
target_batch = fr_data[p:p+batch_size]

In [None]:
target_batch[:,10].shape

torch.Size([50])

In [None]:
#training
import torch.optim as optim

num_epochs = 3

# Training loop
for epoch in range(num_epochs):
  for p in range(len(en_data) - batch_size - 1):

    source_batch = en_data[p:p+batch_size]
    target_batch = fr_data[p:p+batch_size]

    optimizer.zero_grad()
    # encoder = Encoder(source_vocab_size, embedding_dim, hidden_size)
    # output = encoder(source_batch)
    output = model(source_batch, target_batch)

    # remove the padding tokens when calculate the loss
    # Create a mask to ignore padding tokens
    padding_mask = (target_batch != padding_token_index).float()

    # Compute the loss with the padding mask
    loss = criterion(output.view(-1, target_vocab_size), target_batch.view(-1))
    loss = (loss * padding_mask.view(-1)).sum() / padding_mask.sum()


    # loss = criterion(output.view(-1, target_vocab_size), target_batch.view(-1))

    loss.backward()
    for param in model.parameters():
      if param.grad is not None:
        param.grad.data.clamp_(-5, 5)
    optimizer.step()

    if p%100 == 0:
      # Print or log the training loss for each epoch
      print(f'p {p}, Loss: {loss.item()}')

    p += batch_size


p 0, Loss: 4.697800636291504
p 100, Loss: 1.4145365953445435
p 200, Loss: 1.5208317041397095
p 300, Loss: 1.4787399768829346
p 400, Loss: 1.4482678174972534
p 500, Loss: 1.4810984134674072
p 600, Loss: 1.5129822492599487
p 700, Loss: 1.478727102279663
p 800, Loss: 1.7247045040130615
p 900, Loss: 1.5345492362976074
p 1000, Loss: 1.5424134731292725
p 1100, Loss: 1.527912974357605
p 1200, Loss: 1.5975698232650757
p 1300, Loss: 1.4835697412490845
p 1400, Loss: 1.7118548154830933
p 1500, Loss: 1.6063047647476196
p 1600, Loss: 1.6118443012237549
p 1700, Loss: 1.6741050481796265
p 1800, Loss: 1.5356875658035278
p 1900, Loss: 1.5401149988174438
p 2000, Loss: 1.5905791521072388
p 2100, Loss: 1.7025511264801025
p 2200, Loss: 1.7367477416992188
p 2300, Loss: 1.468629002571106
p 2400, Loss: 1.72535240650177
p 2500, Loss: 1.5578110218048096
p 2600, Loss: 1.6707231998443604
p 2700, Loss: 1.7332000732421875
p 2800, Loss: 1.724569320678711
p 2900, Loss: 1.8029965162277222
p 0, Loss: 1.4752938747406006

In [94]:
test_line = "I fell>"

input = line_to_tensor(test_line)
start = target_line_to_tensor("<")


outputs = model.translate(input)
for tensor,p in outputs:
  result = [target_ix_to_char[j.item()] for j in tensor]
  print(''.join(result))

# outputs = model(input,1)
# result = []
# for i in range(outputs.shape[0]):

#   p = nn.functional.softmax(outputs[i], dim=-1).detach().numpy().ravel()
#   ix = np.random.choice(range(target_vocab_size), p=p)

#   result.append(target_ix_to_char[ix])

# print(''.join(result))

<&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
< &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
<e&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&


I trained all day, but the performance is still poor. I'll figure out why later. Maybe the RNN is too simple? Let's see.

The problem is that in order to train the model by minibatch, I have to add a lot of padding tokens into the training set. This hurts the performance of the model. As seen in the above example, when I tried to translate the English sentence 'closer look' into French, there are a lot of padding tokens (&&&), which is annoying.

I think if I train the model with individual examples, then the problem could be relieved. However, the downside is that it could be time-consuming.

Anyway, I will try to update another file that will use attention mechanism.

See you in another Colab file

see you in the next week?

I have to take care the new baby in my family. so in the weekend, I didnot have the time to adjust my code, and train the model. I will try to do that in the next week. see you then.

I know, this is a small project, I just want to write those projects to be more familiar with the NLP, which is helpful for me in my future phd study.


I finally found the where the problem.

first I should not only use the final hidden state of the encoder. I should concate all the hidden state, and convert them into decoder.

second for the decoder, for the prediction, the predicted character should be conditioned by the previous characters, the current state, and the state of the encoder.

third, it is for the loss function. when I calcualte the loss, if encounter the padding, I should stop, because the error in the loss could cause the model to go at the wrong direction.

okay, I will ask for more advices from chatGPT, continue this project.

one more thing to mention is that, by doing the second step, I could easily add the attention machaism between the encoder and the decoder.

Now
I changed some code in here, but the performance is still poor.
what I change:
1: use the final hidden state of the encoder, when predict the next token.
2: when calculating the loss function, remove the padding tokens.


okay, I think I could pause now,
I will create another file to run the seq-2seq with attention, and check its performance. then use the transformers.

if the performance is still poor, then I will use the word-level token not the character level.

see you