# Machine Translation using Applied Deep Learning (RNN)

Setting everything up.

In [0]:
!pip install -q tensorflow-gpu==2.0.0-alpha0

In [2]:
!pip install sacrebleu # https://github.com/mjpost/sacreBLEU



In [0]:
# importing necessary packages
import numpy as np
import pandas as pd
import re
import sacrebleu
import tensorflow as tf
import time
import unicodedata

## English to Spanish Model
Training a model to translate between two languages (from English to Spanish) using a couple thousand sentences. 

First, we read in the txt file with a number of spanish-english translations available.

In [4]:
# reading in the spanish-english dataset
data = pd.read_csv('spa.txt', delimiter="\t", header=None, names=['eng_sen', 'spa_sen'])
# randomly selecting 2 thousand sentences for the working dataset
data_sub = data.sample(n=2000)
# looking into the subset
data_sub.head(3)

Unnamed: 0,eng_sen,spa_sen
92457,"For more information, visit our website.","Para más información, visite nuestro sitio."
78484,"When in Rome, do as the Romans do.","Allá donde fueres, haz lo que vieres."
54463,This song's in the key of G.,Esta canción está en clave de sol.


In [0]:
train_indexes = data_sub.index.tolist()
data_test = data.drop(data.index[train_indexes])
data_test = data_test.sample(n=1000)

In [0]:
# convering the dataframe into a list representation
sentences = data_sub.values.tolist()
sentences_test = data_test.values.tolist()

Next, we apply the necessary data preprocessing.

In [0]:
def preprocess(s):
  # for details, see https://www.tensorflow.org/alpha/tutorials/sequences/nmt_with_attention
  s = ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')
  s = re.sub(r"([?.!,¿])", r" \1 ", s)
  s = re.sub(r'[" "]+', " ", s)
  s = re.sub(r"[^a-zA-Z?.!,¿]+", " ", s)
  s = s.strip()
  s = '<start> ' + s + ' <end>'
  return s

In [8]:
# looking into what preprocessing does exactly
print("Original:", sentences[0])
sentences = [(preprocess(source), preprocess(target)) for (source, target) in sentences]
sentences_test = [(preprocess(source), preprocess(target)) for (source, target) in sentences_test]
print("Preprocessed:", sentences[0])

Original: ['For more information, visit our website.', 'Para más información, visite nuestro sitio.']
Preprocessed: ('<start> For more information , visit our website . <end>', '<start> Para mas informacion , visite nuestro sitio . <end>')


In [0]:
# creating a target and source zip view
source_sentences, target_sentences = list(zip(*sentences))

source_sentences_test, target_sentences_test = list(zip(*sentences_test))

In [10]:
# fitting tokenization
source_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='outofvocab')
source_tokenizer.fit_on_texts(source_sentences)
source_data = source_tokenizer.texts_to_sequences(source_sentences)
source_data_test = source_tokenizer.texts_to_sequences(source_sentences_test)
print("Sequence:", source_data[0])
# adding some padding
source_data = tf.keras.preprocessing.sequence.pad_sequences(source_data, padding='post')
source_data_test = tf.keras.preprocessing.sequence.pad_sequences(source_data_test, padding='post')
print("Padded:", source_data[0])

Sequence: [2, 29, 76, 639, 21, 286, 112, 966, 4, 3]
Padded: [  2  29  76 639  21 286 112 966   4   3   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0]


In [0]:
# do the same with target
target_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='outofvocab')
target_tokenizer.fit_on_texts(target_sentences)
target_data = target_tokenizer.texts_to_sequences(target_sentences)
target_data = tf.keras.preprocessing.sequence.pad_sequences(target_data, padding='post')
target_data_test = target_tokenizer.texts_to_sequences(target_sentences_test)
target_data_test = tf.keras.preprocessing.sequence.pad_sequences(target_data_test, padding='post')

In [12]:
# Create labels for the decoder by shifting the target sequence
# one to the right.
target_labels = np.zeros(target_data.shape)
target_labels[:,0:target_data.shape[1] -1] = target_data[:,1:]

target_labels_test = np.zeros(target_data_test.shape)
target_labels_test[:,0:target_data_test.shape[1] -1] = target_data_test[:,1:]

print("Target sequence", target_data[0])
print("Target label", target_labels[0])

Target sequence [  2  30  31 637  20 445 186 368   4   3   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0]
Target label [ 30.  31. 637.  20. 445. 186. 368.   4.   3.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.]


In [0]:
source_vocab_size = len(source_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1

In [14]:
def decode(encoded, tokenizer):
  for number in encoded:
    if number !=0:
      print ("%d -> %s" % (number, tokenizer.index_word[number]))
      
decode(source_data[0], source_tokenizer)

2 -> <start>
29 -> for
76 -> more
639 -> information
21 -> ,
286 -> visit
112 -> our
966 -> website
4 -> .
3 -> <end>


I chose the batch size to be 16 instead of 5 based on the advanced tutorial documentation (they are using 56 there).

In [0]:
batch_size = 16
dataset = tf.data.Dataset.from_tensor_slices((source_data, target_data, target_labels)).batch(batch_size)

In [16]:
# looking at a specific example
example_batch = next(iter(dataset))
source, target, taget_labels = example_batch
print("Shapes:", source.shape, target.shape, taget_labels.shape)

Shapes: (16, 30) (16, 29) (16, 29)


For the RNN model, I have settled on 128 depth guided by the advanced documentation as well (their default in actually 1024). In addition, the embedding size is set to 256.

In [0]:
# choosing our parameters
embedding_size = 256
rnn_size = 128

In [0]:
class Encoder(tf.keras.Model):
  def __init__(self):
    super(Encoder, self).__init__()
    
    self.embedding = tf.keras.layers.Embedding(source_vocab_size,
                                               embedding_size)
    self.gru = tf.keras.layers.GRU(rnn_size, 
                                   return_sequences=True, 
                                   return_state=True)
    
  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state=hidden)        
    return output, state
  
  def init_state(self, batch_size):
    return tf.zeros((batch_size, rnn_size))

In [19]:
# Create a batch of one sentence
ex_sentence = tf.expand_dims(source_data[0], axis=0)
ex_translation = tf.expand_dims(target_data[0], axis=0)
ex_labels = tf.expand_dims(target_labels[0], axis=0)
print(ex_sentence.shape)

encoder = Encoder()
hidden_state = encoder.init_state(batch_size=1)
print(hidden_state.shape)

output, hidden_state = encoder(ex_sentence, hidden_state)
print(output.shape)

(1, 30)
(1, 128)
(1, 30, 128)


In [0]:
class Decoder(tf.keras.Model):
  def __init__(self):
    super(Decoder, self).__init__()
    self.embedding = tf.keras.layers.Embedding(target_vocab_size, 
                                               embedding_size)
    self.gru = tf.keras.layers.GRU(rnn_size, 
                                   return_sequences=True, 
                                   return_state=True)

    self.dense = tf.keras.layers.Dense(target_vocab_size)


  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state=hidden)
    logits = self.dense(output)
    return logits, state

In [21]:
# calling the decoder
decoder = Decoder()
decoder_output, decoder_state = decoder(ex_labels, hidden_state)
print(decoder_output.shape)

(1, 29, 2846)


In [22]:
crossentropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

def calc_loss(targets, logits):
  mask = tf.math.logical_not(tf.math.equal(targets, 0))
  mask = tf.cast(mask, dtype=tf.int64)
  return crossentropy(targets, logits, sample_weight=mask)

print("Loss", calc_loss(ex_labels, decoder_output))

Loss tf.Tensor(2.4681187, shape=(), dtype=float32)


In [0]:
def translate(sentences, source_data, idx=None):
  
    if idx == None: 
      idx = np.random.choice(len(sentences))
    
    input_sent = source_data[idx]
    input_sent = tf.expand_dims(input_sent, axis=0)
    
    hidden_state = encoder.init_state(batch_size=1)
    output, hidden_state = encoder(input_sent, hidden_state)
    
    decoder_input = tf.expand_dims([target_tokenizer.word_index['<start>']], 0)
    out_words = []
    
    decoder_state = hidden_state

    while True:
      
        decoder_output, decoder_state = decoder(decoder_input, decoder_state)
        decoder_input = tf.argmax(decoder_output, -1)
        word_idx = decoder_input.numpy()[0][0]
        # if we've predicted 0 (which is reserved, usually this will only happen
        # before the decoder is trained, just stop translating and return
        # what we have)
        if word_idx == 0: 
          out_words.append('<end>')
        else:
          out_words.append(target_tokenizer.index_word[word_idx])

        if out_words[-1] == '<end>' or len(out_words) >= 20:
          break
          
    translation = ' '.join(out_words)    
    return sentences[idx][0], sentences[idx][1], translation

In [24]:
input_sent, target_sent, translation = translate(sentences, source_data)
print("Input: %s\nTarget: %s\nTranslation: %s\n" % (input_sent, target_sent, translation))

Input: <start> Do you have an alarm clock in your room ? <end>
Target: <start> ¿ Tienes despertador en tu dormitorio ? <end>
Translation: sente subirse investigacion haga dieron perdisteis casada asi dolares cena atrapar perdisteis casada hijos tratado fatigas pintores hasta anciano tus



Since we have not yet trained our model, the translation looks pretty crazy and impossible to comprehend (to be expected though). Nonetheless, we expect that to change once the model is trained. Therefore, let us train the model.

In [0]:
optimizer = tf.keras.optimizers.Adam()

In [0]:
@tf.function # remove this annotation when debugging
def train_step(source_seq, target_seq, target_labels, initial_state):
  
  with tf.GradientTape() as tape:
    encoder_output, encoder_state = encoder(source_seq, initial_state)
    logits, decoder_state = decoder(target_seq, encoder_state)
    loss = calc_loss(target_labels, logits)

  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return loss

In [27]:
EPOCHS = 101

for epoch in range(EPOCHS):
    start = time.time()
  
    en_initial_states = encoder.init_state(batch_size)
    
    for batch, (source_seq, target_seq, target_labels) in enumerate(dataset):
      loss = train_step(source_seq, target_seq, target_labels, en_initial_states)
      elapsed = time.time() - start
    
    if epoch % 10 == 0:
      print("Epoch #%d, Loss %.4f, Time %.2f sec" % (epoch, loss, elapsed))
      input_sent, target_sent, translation = translate(sentences, source_data)
      print("Input: %s\nTarget: %s\nTranslation: %s\n" % (input_sent, target_sent, translation))

Epoch #0, Loss 1.6153, Time 23.18 sec
Input: <start> I ve waited for this too long . <end>
Target: <start> He esperado demasiado tiempo para esto . <end>
Translation: no no no . <end>

Epoch #10, Loss 0.9153, Time 19.02 sec
Input: <start> I don t have your number . <end>
Target: <start> No tengo tu numero . <end>
Translation: ¿ que es el unico que esta ? <end>

Epoch #20, Loss 0.6159, Time 19.11 sec
Input: <start> I came here to see if there was something I could do to help , but there doesn t seem to be anything for me to do . <end>
Target: <start> Vine aqui para ver si habia algo que pudiera hacer para ayudar , pero parece que no hay nada para que yo haga . <end>
Translation: ¿ que es el unico que esta pasando ? <end>

Epoch #30, Loss 0.4169, Time 19.16 sec
Input: <start> My neighbor s dog won t eat dry dog food . <end>
Target: <start> El perro de mi vecino no come comida de perros seca . <end>
Translation: el hombre confeso que el se dijo que se abstuviera de tomar vino . <end>

Epo

In [28]:
# calculating BLEAU score
references, hypotheses = [], []

for i in range(len(sentences)):
  input_sent, target_sent, translation = translate(sentences, source_data)
  references.append(target_sent)
  hypotheses.append("<start> " + translation)
  
results = sacrebleu.raw_corpus_bleu(hypotheses, [references])
print(results)

BLEU(score=53.457128343533576, counts=[13812, 9304, 6899, 5408], totals=[18727, 16727, 14727, 12727], precisions=[73.75447215250708, 55.622646021402524, 46.845929245603315, 42.492339121552604], bp=1.0, sys_len=18727, ref_len=18486)


Looks like we got a relatively decent score BLEAU of **54** on the training set. It is important to mention that since we assignment did not specify, I did not apply the model on a hidden test set.

In [0]:
# saving the results
input_text, targets, translations = [], [], []

for i in range(len(sentences)):
  input_sent, target_sent, translation = translate(sentences, source_data, idx=i)
  targets.append(target_sent)
  input_text.append(input_sent)
  translations.append("<start> " + translation)

In [30]:
df = pd.DataFrame(input_text, columns=['input'])
df['target'] = targets
df['translation'] = translations
df.to_csv('eng_to_spa.csv')
df.head()

Unnamed: 0,input,target,translation
0,"<start> For more information , visit our websi...","<start> Para mas informacion , visite nuestro ...","<start> para mas informacion , visite nuestro ..."
1,"<start> When in Rome , do as the Romans do . <...","<start> Alla donde fueres , haz lo que vieres ...","<start> alla donde fueres , haz lo que vieres ..."
2,<start> This song s in the key of G . <end>,<start> Esta cancion esta en clave de sol . <end>,<start> esta cancion esta en clave de sol . <end>
3,<start> I ll always love you . <end>,<start> Siempre te amare . <end>,<start> siempre te amare . <end>
4,<start> I would like to repay your kindness in...,<start> Me gustaria compensar su gentileza ape...,<start> me gustaria compensar su gentileza ape...


Looking at the resulting dataframe above, we can see that the translation is relatively accurate and captures most of the context.

In [0]:
from google.colab import files
files.download('eng_to_spa.csv')

## Spanish to English Model
Here, we will train a second model to translate between the same two languages in reverse order.

In [0]:
# pulling the data again 
# convering the dataframe into a list representation
sentences2 = data_sub.values.tolist()

Using the same data but flipping between targets and sources to reverse the languages. Next, we are following the same logic as in the previous task.

In [33]:
print("Original:", sentences2[0])
sentences2 = [(preprocess(target), preprocess(source)) for (source, target) in sentences2]
print("Preprocessed:", sentences2[0])

Original: ['For more information, visit our website.', 'Para más información, visite nuestro sitio.']
Preprocessed: ('<start> Para mas informacion , visite nuestro sitio . <end>', '<start> For more information , visit our website . <end>')


In [0]:
# creating a target and source zip view
source_sentences2, target_sentences2 = list(zip(*sentences2))

In [35]:
# fitting tokenization
source_tokenizer2 = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='outofvocab')
source_tokenizer2.fit_on_texts(source_sentences2)
source_data2 = source_tokenizer2.texts_to_sequences(source_sentences2)
print("Sequence:", source_data2[0])
# adding some padding
source_data2 = tf.keras.preprocessing.sequence.pad_sequences(source_data2, padding='post')
print("Padded:", source_data2[0])

Sequence: [2, 30, 31, 637, 20, 445, 186, 368, 4, 3]
Padded: [  2  30  31 637  20 445 186 368   4   3   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0]


In [0]:
# do the same with target
target_tokenizer2 = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='outofvocab')
target_tokenizer2.fit_on_texts(target_sentences2)
target_data2 = target_tokenizer2.texts_to_sequences(target_sentences2)
target_data2 = tf.keras.preprocessing.sequence.pad_sequences(target_data2, padding='post')

In [37]:
# Create labels for the decoder by shifting the target sequence
# one to the right.
target_labels2 = np.zeros(target_data2.shape)
target_labels2[:,0:target_data2.shape[1] -1] = target_data2[:,1:]

print("Target sequence", target_data2[0])
print("Target label", target_labels2[0])

Target sequence [  2  29  76 639  21 286 112 966   4   3   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0]
Target label [ 29.  76. 639.  21. 286. 112. 966.   4.   3.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.]


In [0]:
source_vocab_size2 = len(source_tokenizer2.word_index) + 1
target_vocab_size2 = len(target_tokenizer2.word_index) + 1

In [39]:
batch_size = 16
dataset2 = tf.data.Dataset.from_tensor_slices((source_data2, target_data2, target_labels2)).batch(batch_size)
# looking at a specific example
example_batch = next(iter(dataset2))
source, target, taget_labels = example_batch
print("Shapes:", source.shape, target.shape, taget_labels.shape)

Shapes: (16, 29) (16, 30) (16, 30)


We are using the same model parameters for this task as well for it to be an apples-to-apples comparison between this and the English model.

In [0]:
# choosing our parameters
embedding_size = 256
rnn_size = 128

In [0]:
class Encoder2(tf.keras.Model):
  def __init__(self):
    super(Encoder2, self).__init__()
    
    self.embedding = tf.keras.layers.Embedding(source_vocab_size2,
                                               embedding_size)
    self.gru = tf.keras.layers.GRU(rnn_size, 
                                   return_sequences=True, 
                                   return_state=True)
    
  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state=hidden)        
    return output, state
  
  def init_state(self, batch_size):
    return tf.zeros((batch_size, rnn_size))
  
class Decoder2(tf.keras.Model):
  def __init__(self):
    super(Decoder2, self).__init__()
    self.embedding = tf.keras.layers.Embedding(target_vocab_size2, 
                                               embedding_size)
    self.gru = tf.keras.layers.GRU(rnn_size, 
                                   return_sequences=True, 
                                   return_state=True)

    self.dense = tf.keras.layers.Dense(target_vocab_size2)


  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state=hidden)
    logits = self.dense(output)
    return logits, state

In [42]:
# Create a batch of one sentence
ex_sentence = tf.expand_dims(source_data2[0], axis=0)
ex_translation = tf.expand_dims(target_data2[0], axis=0)
ex_labels = tf.expand_dims(target_labels2[0], axis=0)
print(ex_sentence.shape)

encoder2 = Encoder2()
hidden_state = encoder2.init_state(batch_size=1)
print(hidden_state.shape)

output, hidden_state = encoder2(ex_sentence, hidden_state)
print(output.shape)

(1, 29)
(1, 128)
(1, 29, 128)


In [43]:
# calling the decoder
decoder2 = Decoder2()
decoder_output, decoder_state = decoder2(ex_labels, hidden_state)
print(decoder_output.shape)

(1, 30, 2088)


In [44]:
crossentropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

def calc_loss(targets, logits):
  mask = tf.math.logical_not(tf.math.equal(targets, 0))
  mask = tf.cast(mask, dtype=tf.int64)
  return crossentropy(targets, logits, sample_weight=mask)

print("Loss", calc_loss(ex_labels, decoder_output))

Loss tf.Tensor(2.2935064, shape=(), dtype=float32)


In [45]:
# defining a new translation
def translate2(sentences2, source_data2, idx=None):
  
    if idx == None: 
      idx = np.random.choice(len(sentences2))
    
    input_sent = source_data2[idx]
    input_sent = tf.expand_dims(input_sent, axis=0)
    
    hidden_state = encoder2.init_state(batch_size=1)
    output, hidden_state = encoder2(input_sent, hidden_state)
    
    decoder_input = tf.expand_dims([target_tokenizer2.word_index['<start>']], 0)
    out_words = []
    
    decoder_state = hidden_state

    while True:
      
        decoder_output, decoder_state = decoder2(decoder_input, decoder_state)
        decoder_input = tf.argmax(decoder_output, -1)
        word_idx = decoder_input.numpy()[0][0]
        # if we've predicted 0 (which is reserved, usually this will only happen
        # before the decoder is trained, just stop translating and return
        # what we have)
        if word_idx == 0: 
          out_words.append('<end>')
        else:
          out_words.append(target_tokenizer2.index_word[word_idx])

        if out_words[-1] == '<end>' or len(out_words) >= 20:
          break
          
    translation = ' '.join(out_words)    
    return sentences2[idx][0], sentences2[idx][1], translation
  

input_sent, target_sent, translation = translate2(sentences2, source_data2)
print("Input: %s\nTarget: %s\nTranslation: %s\n" % (input_sent, target_sent, translation))

Input: <start> Ayer fui a nadar al rio . <end>
Target: <start> I went to swim in the river yesterday . <end>
Translation: egg swim modern silk enjoying behind modest commanding smelled paris thought green theft injured golf draw floor floor disobey floor



In [0]:
optimizer = tf.keras.optimizers.Adam()

In [0]:
@tf.function # remove this annotation when debugging
def train_step2(source_seq, target_seq, target_labels, initial_state):
  
  with tf.GradientTape() as tape:
    encoder_output, encoder_state = encoder2(source_seq, initial_state)
    logits, decoder_state = decoder2(target_seq, encoder_state)
    loss = calc_loss(target_labels, logits)

  variables = encoder2.trainable_variables + decoder2.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return loss

In [48]:
EPOCHS = 101

for epoch in range(EPOCHS):
    start = time.time()
  
    en_initial_states = encoder2.init_state(batch_size)
    
    for batch, (source_seq, target_seq, target_labels) in enumerate(dataset2):
      loss = train_step2(source_seq, target_seq, target_labels, en_initial_states)
      elapsed = time.time() - start
    
    if epoch % 10 == 0:
      print("Epoch #%d, Loss %.4f, Time %.2f sec" % (epoch, loss, elapsed))
      input_sent, target_sent, translation = translate2(sentences2, source_data2)
      print("Input: %s\nTarget: %s\nTranslation: %s\n" % (input_sent, target_sent, translation))

Epoch #0, Loss 1.4276, Time 21.95 sec
Input: <start> Dejale que pague . <end>
Target: <start> Let him pay for it . <end>
Translation: i i the <end>

Epoch #10, Loss 0.8836, Time 17.81 sec
Input: <start> El tren llego con retraso por culpa de la nieve . <end>
Target: <start> Owing to the snow , the train was delayed . <end>
Translation: i m not a lot of the city . <end>

Epoch #20, Loss 0.6167, Time 17.65 sec
Input: <start> Nadie me esta escuchando . <end>
Target: <start> No one is listening to me . <end>
Translation: i m not a lot of rain in . <end>

Epoch #30, Loss 0.4590, Time 17.90 sec
Input: <start> Ahora esta leyendo una novela . <end>
Target: <start> He s reading a novel now . <end>
Translation: i m not as stupid as you as the party . <end>

Epoch #40, Loss 0.3767, Time 17.75 sec
Input: <start> ¿ Esto es lo que quereis ? <end>
Target: <start> Is this what you want ? <end>
Translation: i m not as stupid as you as tom s in your hand . <end>

Epoch #50, Loss 0.2688, Time 17.89 sec
I

In [49]:
# calculating BLEAU score
references2, hypotheses2 = [], []

for i in range(len(sentences2)):
  input_sent, target_sent, translation = translate2(sentences2, source_data2)
  references2.append(target_sent)
  hypotheses2.append("<start> " + translation)
  
results2 = sacrebleu.raw_corpus_bleu(hypotheses2, [references2])
print(results2)

BLEU(score=65.09860706495068, counts=[16120, 11541, 9184, 7278], totals=[19336, 17336, 15336, 13336], precisions=[83.3678113363674, 66.57245039224735, 59.88523735002608, 54.57408518296341], bp=0.9975206624286667, sys_len=19336, ref_len=19384)


We are getting yet another decent BLEAU score of **61** (similar to the English model developed).

In [0]:
# saving the results
input_text2, targets2, translations2 = [], [], []

for i in range(len(sentences2)):
  input_sent, target_sent, translation = translate2(sentences2, source_data2, i)
  targets2.append(target_sent)
  input_text2.append(input_sent)
  translations2.append("<start> " + translation)

In [0]:
df2 = pd.DataFrame(input_text2, columns=['input'])
df2['target'] = targets2
df2['translation'] = translations2
df2.to_csv('spa_to_eng.csv')
df2.head()

In [0]:
from google.colab import files
files.download('spa_to_eng.csv')

## Playing with back-translation
Using the two models to translate a sentence from English to Spanish, and then
back to English.

In [52]:
# USING ENGLISH TO SPANISH MODEL
# translating a random sentence form a test set 
input_sent, target_sent, translation_test = translate(sentences_test, source_data_test, idx=560)
print("Input: %s\nTarget: %s\nTranslation: %s\n" % (input_sent, target_sent, translation_test))

Input: <start> Tom wrote some country songs . <end>
Target: <start> Tom escribio unas canciones country . <end>
Translation: tom bebio un sorbo de su trago y puso el vaso de vuelta en la mesa . <end>



In [53]:
# USING SPANISH TO ENGLISH MODEL
# running quick pre-processing on the translated sentence
translation_test_ex = 'quiero matar a alguien.'
target_test_ex = 'I want your opinion'
sentences_ex = [[translation_test_ex, target_test_ex]]
sentences_ex = [(preprocess(source), preprocess(target)) for (source, target) in sentences_ex]
source_sentences_ex, target_sentences_ex = list(zip(*sentences_ex))
source_data_ex = source_tokenizer2.texts_to_sequences(source_sentences_ex)
source_data_ex = tf.keras.preprocessing.sequence.pad_sequences(source_data_ex, padding='post')

# translating the resulting sentence back into English
input_sent, target_sent, translation_test = translate2(sentences_ex, source_data_ex, idx=0)
print("Input: %s\nTarget: %s\nTranslation: %s\n" % (input_sent, target_sent, translation_test))

Input: <start> quiero matar a alguien . <end>
Target: <start> I want your opinion <end>
Translation: i want to be alone ! <end>



Looking at a single sentence expriment, the results of back-transaltion are not very good. The final sentence does not capture the original very well! Let us see how bad the score is when working with 1000 sentences.

In [0]:
# USING ENGLISH TO SPANISH MODEL
# translating the entire test corpus
input_text_test1, targets_test1, translations_test1 = [], [], []

for i in range(len(sentences_test)):
  input_sent, target_sent, translation = translate(sentences_test, source_data_test, idx=i)
  targets_test1.append(target_sent)
  input_text_test1.append(input_sent)
  translations_test1.append("<start> " + translation)
  
df_test = pd.DataFrame(input_text_test1, columns=['input'])
df_test['target'] = targets_test1
df_test['translation'] = translations_test1

In [0]:
df_test.to_csv('eng2spa_test.csv')
df_test.head()

In [0]:
from google.colab import files
files.download('eng2spa_test.csv')

In [0]:
# running necessary preprocessing
sentences_test = df_test[['translation', 'input']]
sentences_test = [tuple(x) for x in sentences_test.values]
source_sentences_test, target_sentences_test = list(zip(*sentences_test))
source_data_test = source_tokenizer2.texts_to_sequences(source_sentences_test)
source_data_test = tf.keras.preprocessing.sequence.pad_sequences(source_data_test, padding='post')

In [56]:
# calculating BLEAU score
references3, hypotheses3 = [], []

for i in range(len(sentences_test)):
  input_sent, target_sent, translation = translate2(sentences_test, source_data_test)
  references2.append(target_sent)
  hypotheses2.append("<start> " + translation)
  
results3 = sacrebleu.raw_corpus_bleu(hypotheses3, [references3])
print(results3)

BLEU(score=0.0, counts=[0, 0, 0, 0], totals=[0, 0, 0, 0], precisions=[0, 0, 0, 0], bp=1.0, sys_len=0, ref_len=0)


As expected, the score has dropped quite a bit (now down to **~0**) when working with back-translation:(. This is to be expected since we are now trying to translate a translated sentence which is a much harder task.