# Translator
Here we implement 3+1 encoder-decoder models to understand neural machine translation
- 0: Reverse sentence encoder-decoder model(No Learning!)
- 1: Encoder-decoder model with the simplest architecture
- 2: Teacher forcing encoder-decoder model
- 3: Above models plus embedding layer instead of one-hot encoding

![Alt text](images/12_encoder_decoder_2.png)

In [24]:
# Import libraries

import numpy as np
import tensorflow.keras as keras
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

## 1. Basics

### 1.1. Onehot encoding

In [2]:
# Define a sample word2index dictionary (word -> integer ID)
word2index = {'I': 0, 'like': 1, 'cats': 2}

# Create a list of words and convert them to indices by dictionary lookup
words = ['I', 'like', 'cats']
word_ids = [word2index[w] for w in words]
# Bare expression: the notebook displays the resulting list of IDs
word_ids

[0, 1, 2]

In [3]:
def words2onehot(word_list, word2index, num_classes=None):
  """Convert a list of words to an array of one-hot vectors.

  Args:
    word_list: Iterable of words; every word must be a key of `word2index`.
    word2index: Mapping from word to integer ID.
    num_classes: Width of each one-hot vector. With the default `None` the
      width is left to `to_categorical`, which derives it from the largest
      ID present in `word_list`, so it can differ between calls. Pass
      `len(word2index)` to get the same width for every call.

  Returns:
    The one-hot array produced by `to_categorical`, one row per word.

  Raises:
    KeyError: If a word is not present in `word2index`.
  """
  # Convert words to word IDs
  word_ids = [word2index[w] for w in word_list]
  # Convert word IDs to onehot vectors and return the onehot array
  return to_categorical(word_ids, num_classes=num_classes)

# Vocabulary of ten words with IDs 0..9
word2index = {"He":0, "drank": 1, "milk": 2, "like": 3, "cats": 4, "We": 5, "dogs": 6, "hates": 7, "rabbits": 8, "I": 9}

words_1 = ["I", "like", "cats", "We", "like", "dogs", "He", "hates", "rabbits"]
# Call words2onehot on words_1 and read the one-hot width (second axis)
length_1 = words2onehot(words_1, word2index).shape[1]

words_2 = ["We", "like", "dogs", "We", "like", "cats"]
# Call words2onehot on words_2 and read the one-hot width (second axis)
length_2 = words2onehot(words_2, word2index).shape[1]

# Print length_1 and length_2
# NOTE: the widths differ because no num_classes is given to to_categorical,
# so the width follows the largest ID in each list
# ("I" -> 9 in words_1, "dogs" -> 6 in words_2).
print("length_1 =>", length_1, " and length_2 => ", length_2)

length_1 => 10  and length_2 =>  7


### 1.2. Text Reversing Model
A simple encoder-decoder model that does nothing but reverse the input sentence.

In [4]:
# Three-word vocabulary plus the inverse (ID -> word) mapping
word2index = {"I":0, "like": 1, "cats":2}
words = list(word2index.keys())
index2word = {v:k for k,v in word2index.items()}

# Convert words to onehot vectors using words2onehot
onehot = words2onehot(words, word2index)

# Show each word next to its one-hot vector
print([(w,ohe.tolist()) for w,ohe in zip(words, onehot)])

[('I', [1.0, 0.0, 0.0]), ('like', [0.0, 1.0, 0.0]), ('cats', [0.0, 0.0, 1.0])]


In [5]:
def encoder(onehot):
  """Collapse one-hot rows back into word IDs.

  The ID of each row is the position of its maximum along axis 1.
  """
  return np.argmax(onehot, axis=1)

# One-hot encode the words of the toy vocabulary
onehot = words2onehot(words, word2index)
# Get the context vector by using the encoder function
# (in this toy model the "context" is simply the array of word IDs)
context = encoder(onehot)
print(context)

[0 1 2]


In [6]:
# Define the onehot2words function that returns words for a set of onehot vectors
def onehot2words(onehot, index2word):
  """Translate one-hot rows into words.

  Each row's argmax along axis 1 is looked up in `index2word`; the words
  are returned as a list in row order.
  """
  decoded = []
  for word_id in np.argmax(onehot, axis=1):
    decoded.append(index2word[word_id])
  return decoded

# Define the decoder function that returns reversed onehot vectors
def decoder(context_vector, num_classes=3):
  """Return one-hot vectors for the word IDs of `context_vector` in reverse order.

  Args:
    context_vector: Sequence of word IDs as produced by `encoder`.
    num_classes: Width of the one-hot vectors. Defaults to 3, the size of
      the toy vocabulary used with this function; pass the vocabulary size
      when working with a different vocabulary.

  Returns:
    The one-hot array produced by `to_categorical` for the reversed IDs.
  """
  # Reversing the ID sequence is all the "decoding" this toy model does
  word_ids_rev = context_vector[::-1]
  return to_categorical(word_ids_rev, num_classes=num_classes)

['cats', 'like', 'I']


In [None]:
# Convert context to reversed onehot vectors using decoder
onehot_rev = decoder(context)
# Get the reversed words using the onehot2words function
reversed_words = onehot2words(onehot_rev, index2word)

# The original words, in reverse order
print(reversed_words)

## 2. Neural Machine Translation Model(V1)

![Alt text](images/ch28_full_model.png)

### 2.1. Load Data

In [7]:
def load_sentences(file_path):
    """Read a UTF-8 text file and return its lines as a list.

    Leading and trailing whitespace (including the newline) is stripped
    from every line; empty lines are kept as empty strings.
    """
    stripped_lines = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines

# Load the parallel English/French files; each line becomes one list entry
en_text = load_sentences('datasets/en2fr/vocab_en.txt')
fr_text = load_sentences('datasets/en2fr/vocab_fr.txt')

# Alternative dataset kept for reference
# (presumably English-Persian, judging by the en2fa path; confirm before use)
# en_text = load_sentences('datasets/en2fa/ak-test-1k.en')
# fr_text = load_sentences('datasets/en2fa/ak-test-1k.fa')

In [8]:
# Iterate through the first 2 English and French sentences in the dataset
for en_sent, fr_sent in zip(en_text[:2], fr_text[:2]):
  print('*'*50)
  print("English: ", en_sent)
  print("French: ", fr_sent)

**************************************************
English:  new jersey is sometimes quiet during autumn , and it is snowy in april .
French:  new jersey est parfois calme pendant l' automne , et il est neigeux en avril .
**************************************************
English:  the united states is usually chilly during july , and it is usually freezing in november .
French:  les états-unis est généralement froid en juillet , et il gèle habituellement en novembre .


In [9]:
# Compute length of sentences (number of space-separated tokens, punctuation included)
sent_lengths = [len(en_sent.split(' ')) for en_sent in en_text]
print('(English) Mean and Max sentence length: ', np.mean(sent_lengths), np.max(sent_lengths))

# Collect every token of every English sentence into one flat list
all_words = []
for sent in en_text:
  all_words.extend(sent.split(' '))

# Compute the length of the set containing all_words (number of distinct tokens)
vocab_size = len(set(all_words))
print("(English) Vocabulary size: ", vocab_size)

(English) Mean and Max sentence length:  13.225678224285508 17
(English) Vocabulary size:  228


In [10]:
# Compute length of sentences (number of space-separated tokens, punctuation included)
sent_lengths = [len(fr_sent.split(' ')) for fr_sent in fr_text]
print('(French) Mean and Max sentence length: ', np.mean(sent_lengths), np.max(sent_lengths))

# Collect every token of every French sentence into one flat list
all_words = []
for sent in fr_text:
  all_words.extend(sent.split(' '))
  
# Compute the length of the set containing all_words (number of distinct tokens)
# NOTE: this overwrites the English vocab_size computed in the previous cell
vocab_size = len(set(all_words))
print("(French) Vocabulary size: ", vocab_size)

(French) Mean and Max sentence length:  14.226730015958218 23
(French) Vocabulary size:  356


### 2.2. Fit Tokenizers

In [None]:
# set some parameters
# Fixed sequence lengths (in tokens) used for the model inputs and for padding
en_len = 25
fr_len = 25

# Vocabulary caps: given to the tokenizers as num_words and used as the one-hot width
en_vocab = 250
fr_vocab = 250

In [None]:
# English tokenizer capped at en_vocab word IDs; 'UNK' is the out-of-vocabulary token
en_tok = Tokenizer(num_words=en_vocab, oov_token='UNK')

# Fit the tokenizer on en_text
en_tok.fit_on_texts(en_text)

# Convert the sentence to a word ID sequence
seq_new = en_tok.texts_to_sequences(['she likes grapefruit , peaches , and lemons .'])
print('Word ID sequence (with UNK): ', seq_new)
# Presumably ID 1 is the 'UNK' token; the print below shows the actual word
print('The ID 1 represents the word: ', en_tok.index_word[1])

In [None]:
# Wrap every French sentence with a start ('sos') and an end ('eos') marker.
# NOTE: this rebinds fr_text, so running the cell again wraps the sentences a second time.
fr_text = [" ".join(['sos', sent, 'eos']) for sent in fr_text]
# French tokenizer capped at fr_vocab word IDs; 'UNK' is the out-of-vocabulary token
fr_tok = Tokenizer(num_words=fr_vocab, oov_token='UNK')

# Fit the tokenizer on fr_text
fr_tok.fit_on_texts(fr_text)

# Convert the sentence to a word ID sequence
seq_new = fr_tok.texts_to_sequences(['sos les états-unis est généralement froid en juillet . eos'])
print('Word ID sequence (with UNK): ', seq_new)
# Presumably ID 1 is the 'UNK' token; the print below shows the actual word
print('The ID 1 represents the word: ', fr_tok.index_word[1])

### 2.3. Encoder

In [11]:
# Number of GRU units (size of the hidden state) used by encoder and decoder
hsize = 48

# Input layer: en_len time steps, each a one-hot vector of width en_vocab
en_inputs = keras.layers.Input(shape=(en_len, en_vocab))

# GRU layer which returns the state and get the output and state from the GRU
en_out, en_state = keras.layers.GRU(hsize, return_state=True)(en_inputs)

# Stand-alone encoder model: one-hot English sentence in, final GRU state out
# NOTE: this rebinds the name `encoder`, previously the toy function from section 1.2
encoder = keras.models.Model(inputs=en_inputs, outputs=en_state)
# summary() prints the table itself; print() then shows its return value (None)
print(encoder.summary())

2023-09-08 13:34:21.482783: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 25, 250)]         0         
                                                                 
 gru (GRU)                   [(None, 48),              43200     
                              (None, 48)]                        
                                                                 
Total params: 43200 (168.75 KB)
Trainable params: 43200 (168.75 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


### 2.4. Decoder

In [12]:
# Repeat the encoder state fr_len times so the decoder receives it at every time step
de_inputs = keras.layers.RepeatVector(fr_len)(en_state) 

# Decoder GRU: returns an output per time step and is initialised with the encoder state
decoder_gru = keras.layers.GRU(hsize, return_sequences=True)
gru_outputs = decoder_gru(de_inputs, initial_state=en_state)

# Define a softmax dense layer that has fr_vocab outputs and wrap the dense layer in a TimeDistributed layer
de_dense = keras.layers.Dense(fr_vocab, activation='softmax')
de_dense_time = keras.layers.TimeDistributed(de_dense)

# Get the final prediction of the model
de_pred = de_dense_time(gru_outputs)
print("Prediction shape: ", de_pred.shape)

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 25, 250)]            0         []                            
                                                                                                  
 gru (GRU)                   [(None, 48),                 43200     ['input_1[0][0]']             
                              (None, 48)]                                                         
                                                                                                  
 repeat_vector (RepeatVecto  (None, 25, 48)               0         ['gru[0][1]']                 
 r)                                                                                               
                                                                                            

### 2.5. Encoder-Decoder Model

In [14]:
# Define a model with encoder input and decoder output
nmt = Model(inputs=en_inputs, outputs=de_pred)

# Compile the model with an optimizer and a loss
nmt.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

# View the summary of the model 
nmt.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 25, 250)]            0         []                            
                                                                                                  
 gru (GRU)                   [(None, 48),                 43200     ['input_1[0][0]']             
                              (None, 48)]                                                         
                                                                                                  
 repeat_vector (RepeatVecto  (None, 25, 48)               0         ['gru[0][1]']                 
 r)                                                                                               
                                                                                            

In [19]:
def sents2seqs(input_type, sentences, onehot=False, pad_type='post', reverse=False):
    """Convert raw sentences to padded word-ID arrays, optionally one-hot encoded.

    Args:
        input_type: 'source' to use the English settings (`en_tok`, `en_len`,
            `en_vocab`) or 'target' to use the French ones (`fr_tok`,
            `fr_len`, `fr_vocab`).
        sentences: List of sentence strings.
        onehot: If True, one-hot encode the padded IDs with the vocabulary
            size of the selected language.
        pad_type: Padding side handed to `pad_sequences` ('pre' or 'post').
        reverse: If True, reverse every padded sequence along the time axis.

    Returns:
        The padded ID array from `pad_sequences`, reversed and/or one-hot
        encoded as requested.

    Raises:
        ValueError: If `input_type` is neither 'source' nor 'target'.
    """
    # Pick tokenizer, sequence length and vocabulary size for the requested side,
    # so target sentences are no longer processed with the English settings.
    if input_type == 'source':
        tokenizer, seq_len, vocab = en_tok, en_len, en_vocab
    elif input_type == 'target':
        tokenizer, seq_len, vocab = fr_tok, fr_len, fr_vocab
    else:
        raise ValueError(
            "input_type must be 'source' or 'target', got {!r}".format(input_type))

    encoded_text = tokenizer.texts_to_sequences(sentences)
    preproc_text = pad_sequences(encoded_text, padding=pad_type, truncating='post', maxlen=seq_len)

    # Reversal happens after padding, so the padding is mirrored as well
    if reverse:
        preproc_text = preproc_text[:, ::-1]

    if onehot:
        preproc_text = to_categorical(preproc_text, num_classes=vocab)

    return preproc_text

	Reversed:  july during rainy never is california


In [None]:
sentences = ["california is never rainy during july ."]

# Encode as reversed word IDs (no one-hot) so the IDs can be mapped back to words
pad_seq = sents2seqs('source', sentences, reverse=True, onehot=False)
# Skip ID 0 (the padding entries) when converting IDs back to words
rev_sent = [en_tok.index_word[wid] for wid in pad_seq[0] if wid != 0] 
print('\tReversed: ',' '.join(rev_sent))

In [20]:
sentences = ['she likes grapefruit , peaches , and lemons .'  ]

# With pad_type='pre' the padding zeros are placed before the word IDs
pad_seq = sents2seqs('source', sentences, pad_type='pre')
# Bare expression: the notebook displays the padded ID array
pad_seq

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0, 28, 71, 29, 77,  8, 73]], dtype=int32)

### 2.6. Train and Evaluate NMT

In [21]:
train_size, valid_size = 50000, 5000

# Define a sequence of indices from 0 to len(en_text) and shuffle it in place
inds = np.arange(len(en_text))
np.random.shuffle(inds)
# Define train_inds: the first train_size shuffled indices
train_inds = inds[:train_size]

# Define valid_inds: the valid_size shuffled indices that follow the training indices
valid_inds = inds[train_size: train_size + valid_size]

# Define tr_en (train EN sentences) and tr_fr (train FR sentences)
tr_en = [en_text[ti] for ti in train_inds]
tr_fr = [fr_text[ti] for ti in train_inds]

# Define v_en (valid EN sentences) and v_fr (valid FR sentences)
v_en = [en_text[vi] for vi in valid_inds]
v_fr = [fr_text[vi] for vi in valid_inds]
# Show a few pairs from each split as a sanity check
print('Training (EN):\n', tr_en[:3], '\nTraining (FR):\n', tr_fr[:3])
print('\nValid (EN):\n', v_en[:3], '\nValid (FR):\n', v_fr[:3])

Training (EN):
 ['paris is quiet during winter , and it is usually beautiful in november .', 'the united states is usually rainy during winter , but it is never chilly in spring .', 'the mango is his most loved fruit , but the grapefruit is their most loved .'] 
Training (FR):
 ["paris est calme pendant l' hiver , et il est généralement beau en novembre .", "les états-unis est généralement pluvieux pendant l' hiver , mais il est jamais froid au printemps .", 'la mangue est son fruit le plus cher , mais le pamplemousse est leur plus aimé .']

Valid (EN):
 ['paris is beautiful during august , and it is never quiet in july .', 'birds are his least favorite animals .', 'china is freezing during summer , and it is rainy in february .'] 
Valid (FR):
 ["paris est beau au mois d' août , et il est jamais tranquille en juillet .", 'les oiseaux sont moins ses animaux préférés .', "chine est le gel pendant l' été , et il pleut en février ."]


In [None]:
# Convert validation data to onehot (English input reversed, as for the training batches)
v_en_x = sents2seqs('source', v_en, onehot=True, reverse=True)
v_de_y = sents2seqs('target', v_fr, onehot=True)

n_epochs, bsize = 10, 250
for ei in range(n_epochs):
  # Walk through the training set in consecutive batches of bsize sentences
  for i in range(0,train_size,bsize):
    # Get a single batch of inputs and outputs
    en_x = sents2seqs('source', tr_en[i:i+bsize], onehot=True, reverse=True)
    de_y = sents2seqs('target', tr_fr[i:i+bsize], onehot=True)

    # Train the model on a single batch of data
    nmt.train_on_batch(en_x, de_y)    
    
    # Obtain the eval metrics for the training data
    # (re-evaluates the batch that was just used for the update)
    res = nmt.evaluate(en_x, de_y, batch_size=bsize, verbose=0)
    print("{} => Train Loss:{}, Train Acc: {}".format(ei+1,res[0], res[1]*100.0)) 
    
  # Evaluate the trained model on the validation data
  # (once per epoch, with the whole validation set as a single batch)
  res = nmt.evaluate(v_en_x, v_de_y, batch_size=valid_size, verbose=0)
  print("{} => Valid Loss:{}, Val Acc: {}".format(ei+1,res[0], res[1]*100.0))

In [23]:
en_st = ['the united states is sometimes chilly during december , but it is sometimes freezing in june .']
print('English: {}'.format(en_st))

# Convert the English sentence to a sequence
# (same preprocessing as in training: reversed and one-hot encoded)
en_seq = sents2seqs('source', en_st, onehot=True, reverse=True)
# print(en_seq)

# Predict probabilities of words using en_seq
fr_pred = nmt.predict(en_seq, verbose=False)
# print(fr_pred)

# Get the sequence indices (max argument) of fr_pred
# ([0] selects the single sentence in the batch)
fr_seq = np.argmax(fr_pred, axis=-1)[0]
# print(fr_seq)

# Convert the sequence of IDs to a sentence and print
# Invert word_index (word -> ID) into an ID -> word map; reverse_word_map is a second name for the same dict
fr_id2word = reverse_word_map = dict(map(reversed, fr_tok.word_index.items()))

# Skip ID 0 (padding), which has no word in the map
fr_sent = [fr_id2word[i] for i in fr_seq if i != 0]
# fr_sent = fr_tok.sequences_to_texts(fr_)
print("French (Custom): {}".format(' '.join(fr_sent)))
print("French (Google Translate): les etats-unis sont parfois froids en décembre, mais parfois gelés en juin")

English: ['the united states is sometimes chilly during december , but it is sometimes freezing in june .']
French (Custom): les états unis est parfois agréable en en mais il est parfois en en
French (Google Translate): les etats-unis sont parfois froids en décembre, mais parfois gelés en juin


## 3. Teacher Forcing NMT

![Alt text](images/ch41_teacher_force_encoder_decoder.png)

### 3.1. Encoder

In [None]:
# same as before: one-hot English input fed to a GRU that also returns its final state
# NOTE(review): this creates a new GRU layer; the `encoder` Model built in section 2.3
# is not redefined here and presumably still wraps the earlier layer; confirm before
# using it together with the teacher-forcing model.
en_inputs = keras.layers.Input(shape=(en_len, en_vocab))
en_out, en_state = keras.layers.GRU(hsize, return_state=True)(en_inputs)

### 3.2 Decoder

In [None]:
# Decoder input layer
de_inputs = keras.layers.Input(shape=(fr_len-1, fr_vocab))
de_gru = keras.layers.GRU(hsize, return_sequences=True)
de_out = de_gru(de_inputs, initial_state=en_state)

# a TimeDistributed Dense softmax layer with fr_vocab nodes
de_dense = keras.layers.TimeDistributed(keras.layers.Dense(fr_vocab, activation='softmax'))
de_pred = de_dense(de_out)

### 3.3. Encoder-Decoder Model

In [None]:
# Define a model
nmt_tf = Model(inputs=[en_inputs, de_inputs], outputs=de_pred)

# Compile the model with optimizer and loss
nmt_tf.compile(optimizer='adam', loss='categorical_crossentropy', metrics=["acc"])

# Print the summary of the model
nmt_tf.summary()

In [None]:
# Much smaller split than in section 2.6; overwrites train_size/valid_size and the tr_*/v_* lists
train_size, valid_size = 800, 200

# Define a sequence of indices from 0 to size of en_text and shuffle it in place
inds = np.arange(len(en_text))
np.random.shuffle(inds)

# Define train_inds as first train_size indices
train_inds = inds[:train_size]
# Define valid_inds as the next valid_size indices
valid_inds = inds[train_size:train_size+valid_size]

# Define tr_en (train EN sentences) and tr_fr (train FR sentences)
tr_en = [en_text[ti] for ti in train_inds]
tr_fr = [fr_text[ti] for ti in train_inds]

# Define v_en (valid EN sentences) and v_fr (valid FR sentences)
v_en = [en_text[vi] for vi in valid_inds]
v_fr = [fr_text[vi] for vi in valid_inds]

print('Training (EN):\n', tr_en[:3], '\nTraining (FR):\n', tr_fr[:3])
print('\nValid (EN):\n', v_en[:3], '\nValid (FR):\n', v_fr[:3])

In [None]:
n_epochs, bsize = 3, 250

# The validation set does not change between epochs, so preprocess it once.
v_en_x = sents2seqs('source', v_en, onehot=True, reverse=True)
# Validation targets must be built from the French sentences (v_fr), not from v_en
v_de_xy = sents2seqs('target', v_fr, onehot=True)
# Teacher forcing: decoder input is the target without its last step,
# decoder output is the target shifted one step ahead
v_de_x, v_de_y = v_de_xy[:,:-1,:], v_de_xy[:,1:,:]

for ei in range(n_epochs):
  for i in range(0,train_size,bsize):
    # Get a single batch of English inputs and French targets
    en_x = sents2seqs('source', tr_en[i:i+bsize], onehot=True, reverse=True)
    de_xy = sents2seqs('target', tr_fr[i:i+bsize], onehot=True)

    # Same one-step shift as for the validation data
    de_x, de_y = de_xy[:,:-1,:], de_xy[:,1:,:]
    nmt_tf.train_on_batch([en_x,de_x], de_y)

  # Evaluate the trained model on the validation data
  res = nmt_tf.evaluate([v_en_x,v_de_x], v_de_y, batch_size=valid_size, verbose=0)
  print("{} => Loss:{}, Val Acc: {}".format(ei+1,res[0], res[1]*100.0))

### 3.4. Inference Model

The decoder of the inference model differs from the decoder of the training model. At inference time we cannot feed the decoder with French words, because those are exactly what we want to predict.
However, we can feed the inference decoder with the French word it predicted at the previous time step.

Therefore, to generate a translation, the decoder produces one word at a time, consuming its previous output as the next input.

In [None]:
# Define an input layer that accepts a single onehot encoded word
de_inputs = keras.layers.Input(shape=(1, fr_vocab))
# Define an input to accept the t-1 state
de_state_in = keras.layers.Input(shape=(hsize,))
# NOTE(review): de_gru and de_dense are rebound to newly created layers here, so they
# start with fresh weights unless the trained weights are copied in (the cell that would
# do so is commented out); confirm that this is intended.
de_gru = keras.layers.GRU(hsize, return_state=True)
# Get the output and state from the GRU layer
de_out, de_state_out = de_gru(de_inputs, initial_state=de_state_in)
de_dense = keras.layers.Dense(fr_vocab, activation='softmax')
de_pred = de_dense(de_out)

# Define a model: (current word, previous state) -> (word probabilities, new state)
# NOTE: this rebinds the name `decoder`, previously the toy function from section 1.2
decoder = Model(inputs=[de_inputs, de_state_in], outputs=[de_pred,de_state_out])
# summary() prints the table itself; print() then shows its return value (None)
print(decoder.summary())

In [None]:
# You may load and set the weights of the trained model by uncommenting this cell


# # Load the weights to the encoder GRU from the trained model
# en_gru_w = tr_en_gru.get_weights()
# # Set the weights of the encoder GRU of the inference model
# en_gru.set_weights(en_gru_w)
# # Load and set the weights to the decoder GRU
# de_gru.set_weights(tr_de_gru.get_weights())
# # Load and set the weights to the decoder Dense
# de_dense.set_weights(tr_de_dense.get_weights())

In [None]:
def probs2word(probs, tok):
    """Return the most probable word for the first row of `probs`.

    The argmax of `probs[0, :]` is used as the word ID and looked up in
    `tok.index_word`.
    """
    first_row = probs[0, :]
    best_id = np.argmax(first_row, axis=-1)
    return tok.index_word[best_id]

def word2onehot(tokenizer, word, vocab_size):
    """One-hot encode a single word for use as decoder input.

    The word is converted to its ID with `tokenizer`, one-hot encoded with
    `vocab_size` classes, and an extra axis is inserted at position 1.
    """
    word_ids = tokenizer.texts_to_sequences([[word]])
    encoded = to_categorical(word_ids, num_classes=vocab_size)
    return np.expand_dims(encoded, axis=1)

In [None]:
en_sent = ['the united states is sometimes chilly during december , but it is sometimes freezing in june .']
print('English: {}'.format(en_sent))
en_seq = sents2seqs('source', en_sent, onehot=True, reverse=True)
# Predict the initial decoder state with the encoder
# NOTE(review): `encoder` is the Model defined in section 2.3; presumably it does not
# share weights with the teacher-forcing model trained above; confirm.
de_s_t = encoder.predict(en_seq)
# The first decoder input is the start-of-sentence marker
de_seq = word2onehot(fr_tok, 'sos', fr_vocab)
fr_sent = ''
# Generate at most fr_len words, one per iteration
for i in range(fr_len):    
  # Predict from the decoder and recursively assign the new state to de_s_t
  de_prob, de_s_t = decoder.predict([de_seq, de_s_t])
  # Get the word from the probability output using probs2word
  # NOTE(review): presumably index_word has no entry for ID 0 (padding), in which case
  # a predicted 0 raises KeyError here; verify.
  de_w = probs2word(de_prob, fr_tok)
  # Convert the word to a onehot sequence using word2onehot
  de_seq = word2onehot(fr_tok, de_w, fr_vocab)
  # Stop at the end-of-sentence marker without adding it to the output
  if de_w == 'eos': break
  fr_sent += de_w + ' '
print("French (Ours): {}".format(fr_sent))
print("French (Google Translate): les etats-unis sont parfois froids en décembre, mais parfois gelés en juin")

### 3.5. Embedding layer
Add embedding layer as an alternative to one-hot encoding for capturing word's meaning.

In [None]:
# Define an input layer which accepts a sequence of word IDs
en_inputs = keras.layers.Input(shape=(en_len,))

# Define an Embedding layer which accepts en_inputs
en_emb = keras.layers.Embedding(en_vocab, 96, input_length=en_len)(en_inputs)
en_out, en_state = keras.layers.GRU(hsize, return_state=True)(en_emb)

de_inputs = keras.layers.Input(shape=(fr_len-1,))
# Define an Embedding layer which accepts de_inputs
de_emb = keras.layers.Embedding(fr_vocab, 96, input_length=fr_len-1)(de_inputs)
de_out, _ = keras.layers.GRU(hsize, return_sequences=True, return_state=True)(de_emb, initial_state=en_state)
de_pred = keras.layers.TimeDistributed(Dense(fr_vocab, activation='softmax'))(de_out)

# Define the Model which accepts encoder/decoder inputs and outputs predictions 
nmt_emb = Model([en_inputs, de_inputs], de_pred)
nmt_emb.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

In [None]:
# Train the embedding model for 3 epochs; train_size, bsize, tr_en and tr_fr come from earlier cells
for ei in range(3):
  for i in range(0, train_size, bsize):    
    # English inputs stay as word IDs because the Embedding layer consumes IDs
    en_x = sents2seqs('source', tr_en[i:i+bsize], onehot=False, reverse=True)
    # Get a single batch of French sentences with no onehot encoding
    de_xy = sents2seqs('target', tr_fr[i:i+bsize], onehot=False)
    # Get all words except the last word in that batch
    de_x = de_xy[:,:-1]
    # The targets are still one-hot, as required by the categorical cross-entropy loss
    de_xy_oh = sents2seqs('target', tr_fr[i:i+bsize], onehot=True)
    # Get all words except the first from de_xy_oh
    de_y = de_xy_oh[:,1:,:]
    # Training the model on a single batch of data
    nmt_emb.train_on_batch([en_x,de_x], de_y)    
    # Report loss and accuracy on the batch that was just used for the update
    res = nmt_emb.evaluate([en_x, de_x], de_y, batch_size=bsize, verbose=0)
    print("{} => Loss:{}, Train Acc: {}".format(ei+1,res[0], res[1]*100.0))

## Acknowledgements
This notebook is adapted from Thushan Ganegedara's valuable course on DataCamp. Special thanks to him for providing slides and resources on understanding machine translation models.