<a href="https://colab.research.google.com/github/archyyu/translation-from-RNN-to-transformer/blob/main/machine_translation_by_rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

# Set random seed for reproducibility
torch.manual_seed(42)

In [24]:
hidden_size = 100
embedding_dim = 30
learning_rate = 0.001
batch_size = 50
beam_width = 3

In [104]:
url = "https://raw.githubusercontent.com/archyyu/publicResource/main/eng-fra.txt"
response = requests.get(url)
lines = response.text.split('\n')
en_lines = []
fr_lines = []

start_character = '<'
end_character = '>'
padding_character = '&'

for i in range(20000,30000):
  item = lines[i].split('\t')
  en_lines.append('<' + item[0] + '>')
  fr_lines.append(item[1] + '>')

max_len_line_en = max([len(l) for l in en_lines])
max_len_line_fr = max([len(l) for l in fr_lines])

for i in range(len(en_lines)):
  if (len(en_lines[i]) < max_len_line_en):
    en_lines[i] = en_lines[i].ljust(max_len_line_en, padding_character)
  if (len(fr_lines[i]) < max_len_line_fr):
    fr_lines[i] = fr_lines[i].ljust(max_len_line_fr, padding_character)


source_vocab = sorted(set(''.join(en_lines)))
target_vocab = sorted(set(''.join(fr_lines)))
target_vocab.append('<')

source_vocab_size = len(set(''.join(source_vocab)))
target_vocab_size = len(set(''.join(target_vocab)))

source_char_to_ix = {ch: i for i, ch in enumerate(source_vocab)}
source_ix_to_char = {i: ch for i, ch in enumerate(source_vocab)}

target_char_to_ix = {ch: i for i, ch in enumerate(target_vocab)}
target_ix_to_char = {i: ch for i, ch in enumerate(target_vocab)}

In [None]:
target_vocab_size

98

In [105]:
def line_to_tensor(line):
  result = []
  line_ten = torch.tensor([source_char_to_ix[ch] for ch in test_line], dtype=torch.long).view(1, -1)
  result.append(line_ten)
  return torch.cat(result, dim=0)

def target_line_to_tensor(line):
  result = []
  line_ten = torch.tensor([target_char_to_ix[ch] for ch in test_line], dtype=torch.long).view(1, -1)
  result.append(line_ten)
  return torch.cat(result, dim=0)

en_data = []
fr_data = []
for i in range(len(en_lines)):
  e = torch.tensor([source_char_to_ix[ch] for ch in en_lines[i]], dtype=torch.long).view(1, -1)
  en_data.append(e)
  f = torch.tensor([target_char_to_ix[ch] for ch in fr_lines[i]], dtype=torch.long).view(1, -1)
  fr_data.append(f)

en_data = torch.cat(en_data, dim=0)
fr_data = torch.cat(fr_data, dim=0)

In [106]:
class Encoder(nn.Module):
  def __init__(self, vocab_size, embedding_dim, hidden_size):
    super(Encoder, self).__init__()
    self.hidden_size = hidden_size
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.i2h = nn.Linear(embedding_dim, hidden_size, bias=False)
    self.h2h = nn.Linear(hidden_size, hidden_size, bias=False)
    self.hb2 = nn.Parameter(torch.zeros(1, hidden_size))

  def forward(self, x):
    h = torch.zeros(1, self.hidden_size)
    for i in range(x.shape[1]):
      t = self.embedding(x[:,i])
      h = torch.tanh(self.i2h(t) + self.h2h(h) + self.hb2)
    return h


class Decoder(nn.Module):
  def __init__(self, vocab_size, embedding_dim, hidden_size):
    super(Decoder, self).__init__()
    self.hidden_size = hidden_size
    self.embedding_dim = embedding_dim
    self.embedding = nn.Embedding(vocab_size, self.embedding_dim)
    self.i2h = nn.Linear(self.embedding_dim, self.hidden_size, bias=False)
    self.h2h = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
    self.h2o = nn.Linear(self.hidden_size, vocab_size, bias=False)

    self.hb2 = nn.Parameter(torch.zeros(1, self.hidden_size))
    self.ob = nn.Parameter(torch.zeros(1, vocab_size))

  def forward(self, h, batch_size):

    # if x is None:
    x = torch.tensor([target_char_to_ix[start_character] for _ in range(batch_size)],dtype=torch.long)
    output = []
    for i in range(max_len_line_fr):
      t = self.embedding(x)
      h = torch.tanh(self.i2h(t) + self.h2h(h) + self.hb2)
      y = self.h2o(h) + self.ob
      p = nn.functional.softmax(y, dim=1)
      ix = torch.argmax(p, dim=-1)
      x = ix
      output.append(y)
    return torch.stack(output, dim=0)

  def beam_search(self, h):
    """
    Perform beam search to generate sequences.
    """
    beams = [(torch.tensor([target_char_to_ix[start_character]], dtype=torch.long), 1.0)]


    for i in range(max_len_line_fr):
      new_beams = []

      for seq, score in beams:
        x = seq[-1].view(1, -1)  # Take the last predicted token

        t = self.embedding(x)
        h = torch.tanh(self.i2h(t) + self.h2h(h) + self.hb2)
        y = self.h2o(h) + self.ob
        p = F.softmax(y, dim=-1)
        top_probs, top_ix = torch.topk(p, beam_width, dim=-1)

        for prob, token_ix in zip(top_probs[0][0], top_ix[0][0]):
          new_seq = torch.cat((seq, torch.tensor([token_ix], dtype=torch.long)), dim=0)
          new_beams.append((new_seq, score * prob.item()))

      beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_width]

    return beams

class Seq2Seq(nn.Module):
  def __init__(self, source_vocab_size, target_vocab_size, embedding_dim, hidden_size):
    super(Seq2Seq, self).__init__()
    self.embedding_dim = embedding_dim
    self.hidden_size = hidden_size
    self.encoder = Encoder(source_vocab_size, self.embedding_dim, self.hidden_size)
    self.decoder = Decoder(target_vocab_size, self.embedding_dim, self.hidden_size)
  def forward(self, source, batch_size):
    hidden_state = self.encoder(source)
    output = self.decoder(hidden_state, batch_size)
    return output

  def translate(self, source):
    hidden_state = self.encoder(source)
    beams = self.decoder.beam_search(hidden_state)
    return beams


# Define your model, loss function, and optimizer
model = Seq2Seq(source_vocab_size, target_vocab_size, embedding_dim, hidden_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adagrad(model.parameters(), lr=learning_rate)

In [117]:
#training
import torch.optim as optim

num_epochs = 3

# Training loop
for epoch in range(num_epochs):
  for p in range(len(en_data) - batch_size - 1):

    source_batch = en_data[p:p+batch_size]
    target_batch = fr_data[p:p+batch_size]

    optimizer.zero_grad()
    # encoder = Encoder(source_vocab_size, embedding_dim, hidden_size)
    # output = encoder(source_batch)
    output = model(source_batch, batch_size)

    loss = criterion(output.view(-1, target_vocab_size), target_batch.view(-1))

    loss.backward()
    for param in model.parameters():
      if param.grad is not None:
        param.grad.data.clamp_(-5, 5)
    optimizer.step()

    if p%100 == 0:
      # Print or log the training loss for each epoch
      print(f'p {p}, Loss: {loss.item()}')

    p += batch_size


p 0, Loss: 1.9107469320297241
p 100, Loss: 1.7525817155838013
p 200, Loss: 1.963807225227356
p 300, Loss: 1.8432202339172363
p 400, Loss: 1.8197369575500488
p 500, Loss: 1.9237337112426758
p 600, Loss: 2.0000388622283936
p 700, Loss: 1.980014681816101
p 800, Loss: 1.8810944557189941
p 900, Loss: 2.054216146469116
p 1000, Loss: 1.8385647535324097
p 1100, Loss: 1.6707831621170044
p 1200, Loss: 1.9431257247924805
p 1300, Loss: 1.7543889284133911
p 1400, Loss: 1.8599927425384521
p 1500, Loss: 1.8338054418563843
p 1600, Loss: 2.081207752227783
p 1700, Loss: 2.0136971473693848
p 1800, Loss: 1.9742344617843628
p 1900, Loss: 1.7932943105697632
p 2000, Loss: 2.209684371948242
p 2100, Loss: 1.912713646888733
p 2200, Loss: 1.8072633743286133
p 2300, Loss: 1.7550612688064575
p 2400, Loss: 1.7997758388519287
p 2500, Loss: 2.0657882690429688
p 2600, Loss: 2.0518338680267334
p 2700, Loss: 2.079054355621338
p 2800, Loss: 1.940521240234375
p 2900, Loss: 1.8966559171676636
p 3000, Loss: 1.85182976722717

KeyboardInterrupt: 

In [120]:
test_line = "closer look!>"

input = line_to_tensor(test_line)
start = target_line_to_tensor("<")


outputs = model.translate(input)
for tensor,p in outputs:
  result = [target_ix_to_char[j.item()] for j in tensor]
  print(''.join(result))

# outputs = model(input,1)
# result = []
# for i in range(outputs.shape[0]):

#   p = nn.functional.softmax(outputs[i], dim=-1).detach().numpy().ravel()
#   ix = np.random.choice(range(target_vocab_size), p=p)

#   result.append(target_ix_to_char[ix])

# print(''.join(result))

<&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
< &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
<& &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&


trained all day, but the performance is still poor, will figure out why later.
may be the rnn is too simple?
let see

the problem is that, in order to train the model by minibatch, I have to add a lot of padding tokens into the training set. this will hurt the performance of the model.  as seen the above example, I tried to translate the english sentence closer look into france, there is alot of padding token &&&. which is anonying.

I think If I train the model by individual example, then the problem could be relieved?

but the problem is that coudl be time consuming.

anyway, I will try to update an other file which will use attention machisum.

see you in another colab file.