In [0]:
!pip install -q tensorflow-gpu==2.0.0-alpha0

In [0]:
!pip install sacrebleu # https://github.com/mjpost/sacreBLEU



In [0]:
import numpy as np
import re
import sacrebleu
import tensorflow as tf
import time
import unicodedata

In [0]:

import random
lines = [line.rstrip('\n') for line in open('spa.txt')]

my_data = []
for line in lines:
  cols = line.split('\t')
  my_data.append(tuple(cols))
        
print(my_data[1:10])

random.seed(42)
random.shuffle(my_data)
print(my_data[1:10])

en_es_sentences = my_data[10:2010]



[('Go.', 'Vete.'), ('Go.', 'Vaya.'), ('Go.', 'Váyase.'), ('Hi.', 'Hola.'), ('Run!', '¡Corre!'), ('Run.', 'Corred.'), ('Who?', '¿Quién?'), ('Fire!', '¡Fuego!'), ('Fire!', '¡Incendio!')]
[('Do you really want to be here?', '¿Realmente querés estar acá?'), ('She is as beautiful as Snow White.', 'Ella es bella como Blancanieves.'), ("There are few men who don't know that.", 'Hay pocos hombres que no lo saben.'), ('Tom changes channels during commercials.', 'Tom cambia de canal durante los comerciales.'), ("It's a wonder they're still awake.", 'Es un milagro que sigan despiertas.'), ('I will clean this up later.', 'Lo limpiaré más tarde.'), ('Let him go!', '¡Déjale irse!'), ('I study math harder than you do.', 'Yo estudio mates mucho más que tú.'), ('We need to clean the car.', 'Necesitamos lavar el auto.')]


In [0]:
def preprocess(s):
  # for details, see https://www.tensorflow.org/alpha/tutorials/sequences/nmt_with_attention
  s = ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')
  s = re.sub(r"([?.!,¿])", r" \1 ", s)
  s = re.sub(r'[" "]+', " ", s)
  s = re.sub(r"[^a-zA-Z?.!,¿]+", " ", s)
  s = s.strip()
  s = '<start> ' + s + ' <end>'
  return s

# Part 1: Converting English to Spanish

To convert English to Spanish, we simply follow the method given in the sample tutorial. We train the model on 2000 rows of randomly selected data. The tuples containing the English and Spanish translations is stored in variable *sentences*.

The BLEU score achieved is 67.4

We also store the hypotheses and the corresponding inputs given tot the model as this will later be used for back translation. 


Note: As discussed in the OH, I have not used a train and test split. Instead it is tested on the entire dataset. 


In [0]:
sentences = en_es_sentences
sentences = [(preprocess(source), preprocess(target)) for (source, target) in sentences]
source_sentences, target_sentences = list(zip(*sentences))

source_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
source_tokenizer.fit_on_texts(source_sentences)
source_data = source_tokenizer.texts_to_sequences(source_sentences)
source_data = tf.keras.preprocessing.sequence.pad_sequences(source_data, padding='post')
target_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
target_tokenizer.fit_on_texts(target_sentences)
target_data = target_tokenizer.texts_to_sequences(target_sentences)
target_data = tf.keras.preprocessing.sequence.pad_sequences(target_data, padding='post')
target_labels = np.zeros(target_data.shape)
target_labels[:,0:target_data.shape[1] -1] = target_data[:,1:]

source_vocab_size = len(source_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1

In [0]:
def decode(encoded, tokenizer):
  for number in encoded:
    if number !=0:
      print ("%d -> %s" % (number, tokenizer.index_word[number]))
      
class Encoder(tf.keras.Model):
  def __init__(self):
    super(Encoder, self).__init__()
    
    self.embedding = tf.keras.layers.Embedding(source_vocab_size,
                                               embedding_size)
    self.gru = tf.keras.layers.GRU(rnn_size, 
                                   return_sequences=True, 
                                   return_state=True)
    
  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state=hidden)        
    return output, state
  
  def init_state(self, batch_size):
    return tf.zeros((batch_size, rnn_size))
  
class Decoder(tf.keras.Model):
  def __init__(self):
    super(Decoder, self).__init__()
    self.embedding = tf.keras.layers.Embedding(target_vocab_size, 
                                               embedding_size)
    self.gru = tf.keras.layers.GRU(rnn_size, 
                                   return_sequences=True, 
                                   return_state=True)

    self.dense = tf.keras.layers.Dense(target_vocab_size)


  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state=hidden)
    logits = self.dense(output)
    return logits, state
  
def calc_loss(targets, logits):
  mask = tf.math.logical_not(tf.math.equal(targets, 0))
  mask = tf.cast(mask, dtype=tf.int64)
  return crossentropy(targets, logits, sample_weight=mask)

def translate(idx=None):

  if idx == None: 
    idx = np.random.choice(len(sentences))

  input_sent = source_data[idx]
  input_sent = tf.expand_dims(input_sent, axis=0)

  hidden_state = encoder.init_state(batch_size=1)
  output, hidden_state = encoder(input_sent, hidden_state)

  decoder_input = tf.expand_dims([target_tokenizer.word_index['<start>']], 0)
  out_words = []

  decoder_state = hidden_state

  while True:

      decoder_output, decoder_state = decoder(decoder_input, decoder_state)
      decoder_input = tf.argmax(decoder_output, -1)
      word_idx = decoder_input.numpy()[0][0]
      # if we've predicted 0 (which is reserved, usually this will only happen
      # before the decoder is trained, just stop translating and return
      # what we have)
      if word_idx == 0: 
        out_words.append('<end>')
      else:
        out_words.append(target_tokenizer.index_word[word_idx])

      if out_words[-1] == '<end>' or len(out_words) >= 20:
        break

  translation = ' '.join(out_words)    
  return sentences[idx][0], sentences[idx][1], translation
@tf.function
def train_step(source_seq, target_seq, target_labels, initial_state):

  with tf.GradientTape() as tape:
    encoder_output, encoder_state = encoder(source_seq, initial_state)
    logits, decoder_state = decoder(target_seq, encoder_state)
    loss = calc_loss(target_labels, logits)

  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return loss

In [0]:
batch_size = 5
dataset = tf.data.Dataset.from_tensor_slices((source_data, target_data, target_labels)).batch(batch_size)
embedding_size = 32
rnn_size = 64
optimizer = tf.keras.optimizers.Adam()
crossentropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

EPOCHS = 300
encoder = Encoder()
decoder = Decoder()
for epoch in range(EPOCHS):
  start = time.time()

  en_initial_states = encoder.init_state(batch_size)

  for batch, (source_seq, target_seq, target_labels) in enumerate(dataset):
    loss = train_step(source_seq, target_seq, target_labels, en_initial_states)
    elapsed = time.time() - start

  if epoch % 10 == 0:
    print("Epoch #%d, Loss %.4f, Time %.2f sec" % (epoch, loss, elapsed))
    input_sent, target_sent, translation = translate(idx=None)
    print("Input: %s\nTarget: %s\nTranslation: %s\n" % (input_sent, target_sent, translation))

references_eng, hypotheses_eng, input_eng = [], [], []

for i in range(len(sentences)):
  input_sent, target_sent, translation = translate()
  references_eng.append(target_sent)
  hypotheses_eng.append("<start> " + translation)
  input_eng.append(input_sent)
results = sacrebleu.raw_corpus_bleu(hypotheses_eng, [references_eng])
print(results)


Epoch #0, Loss 1.4114, Time 6.57 sec
Input: <start> It s getting harder for me to concentrate . <end>
Target: <start> Cada vez me cuesta mas concentrarme . <end>
Translation: tom no que . <end>

Epoch #10, Loss 0.9640, Time 2.20 sec
Input: <start> Tom cut his finger on a piece of glass . <end>
Target: <start> Tom se corto el dedo con un pedazo de vidrio . <end>
Translation: tom no es un poco . <end>

Epoch #20, Loss 0.7012, Time 2.06 sec
Input: <start> I know who you want to talk to . <end>
Target: <start> Se con quien quieres hablar . <end>
Translation: no me gusta el vacuno . <end>

Epoch #30, Loss 0.5251, Time 2.05 sec
Input: <start> No one knows the answer . <end>
Target: <start> Nadie sabe la respuesta . <end>
Translation: la noticia la entristecio . <end>

Epoch #40, Loss 0.3845, Time 2.08 sec
Input: <start> Maybe they will come and maybe they won t . <end>
Target: <start> Puede que vengan y puede que no . <end>
Translation: a mi tambien es una persona nueva . <end>

Epoch #50, L

In [0]:
references_eng, hypotheses_eng, input_eng = [], [], []

for i in range(len(sentences)):
  input_sent, target_sent, translation = translate()
  references_eng.append(target_sent)
  hypotheses_eng.append("<start> " + translation)
  input_eng.append(input_sent)
results = sacrebleu.raw_corpus_bleu(hypotheses_eng, [references_eng])
print(results)

BLEU(score=67.43784658833205, counts=[15983, 11649, 9150, 7220], totals=[18741, 16741, 14741, 12741], precisions=[85.28360279600875, 69.5836568902694, 62.07177260701445, 56.66745153441645], bp=0.9977081955412436, sys_len=18741, ref_len=18784)


# Part 2: Converting Spanish to English

To convert Spanish to English, we first need to change the order of tuples in *sentences* . We then train a new model (overwriting the previous one ).

The BLEU score achieved is 65.4

In [0]:
es_en_sentences = []

for i in range(0,len(en_es_sentences)):
  sentence = en_es_sentences[i]
  data_point = []
  data_point.append(sentence[1])
  data_point.append(sentence[0])
  es_en_sentences.append(tuple(data_point))

sentences = es_en_sentences
sentences = [(preprocess(source), preprocess(target)) for (source, target) in sentences]
source_sentences, target_sentences = list(zip(*sentences))

source_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
source_tokenizer.fit_on_texts(source_sentences)
source_data = source_tokenizer.texts_to_sequences(source_sentences)
source_data = tf.keras.preprocessing.sequence.pad_sequences(source_data, padding='post')
target_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
target_tokenizer.fit_on_texts(target_sentences)
target_data = target_tokenizer.texts_to_sequences(target_sentences)
target_data = tf.keras.preprocessing.sequence.pad_sequences(target_data, padding='post')
target_labels = np.zeros(target_data.shape)
target_labels[:,0:target_data.shape[1] -1] = target_data[:,1:]

source_vocab_size = len(source_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1

In [0]:
def decode(encoded, tokenizer):
  for number in encoded:
    if number !=0:
      print ("%d -> %s" % (number, tokenizer.index_word[number]))
      
class Encoder(tf.keras.Model):
  def __init__(self):
    super(Encoder, self).__init__()
    
    self.embedding = tf.keras.layers.Embedding(source_vocab_size,
                                               embedding_size)
    self.gru = tf.keras.layers.GRU(rnn_size, 
                                   return_sequences=True, 
                                   return_state=True)
    
  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state=hidden)        
    return output, state
  
  def init_state(self, batch_size):
    return tf.zeros((batch_size, rnn_size))
  
class Decoder(tf.keras.Model):
  def __init__(self):
    super(Decoder, self).__init__()
    self.embedding = tf.keras.layers.Embedding(target_vocab_size, 
                                               embedding_size)
    self.gru = tf.keras.layers.GRU(rnn_size, 
                                   return_sequences=True, 
                                   return_state=True)

    self.dense = tf.keras.layers.Dense(target_vocab_size)


  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state=hidden)
    logits = self.dense(output)
    return logits, state
  
def calc_loss(targets, logits):
  mask = tf.math.logical_not(tf.math.equal(targets, 0))
  mask = tf.cast(mask, dtype=tf.int64)
  return crossentropy(targets, logits, sample_weight=mask)

def translate(idx=None):

  if idx == None: 
    idx = np.random.choice(len(sentences))

  input_sent = source_data[idx]
  input_sent = tf.expand_dims(input_sent, axis=0)

  hidden_state = encoder.init_state(batch_size=1)
  output, hidden_state = encoder(input_sent, hidden_state)

  decoder_input = tf.expand_dims([target_tokenizer.word_index['<start>']], 0)
  out_words = []

  decoder_state = hidden_state

  while True:

      decoder_output, decoder_state = decoder(decoder_input, decoder_state)
      decoder_input = tf.argmax(decoder_output, -1)
      word_idx = decoder_input.numpy()[0][0]
      # if we've predicted 0 (which is reserved, usually this will only happen
      # before the decoder is trained, just stop translating and return
      # what we have)
      if word_idx == 0: 
        out_words.append('<end>')
      else:
        out_words.append(target_tokenizer.index_word[word_idx])

      if out_words[-1] == '<end>' or len(out_words) >= 20:
        break

  translation = ' '.join(out_words)    
  return sentences[idx][0], sentences[idx][1], translation
@tf.function
def train_step(source_seq, target_seq, target_labels, initial_state):

  with tf.GradientTape() as tape:
    encoder_output, encoder_state = encoder(source_seq, initial_state)
    logits, decoder_state = decoder(target_seq, encoder_state)
    loss = calc_loss(target_labels, logits)

  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return loss

In [0]:
batch_size = 5
dataset = tf.data.Dataset.from_tensor_slices((source_data, target_data, target_labels)).batch(batch_size)
embedding_size = 32
rnn_size = 64
optimizer = tf.keras.optimizers.Adam()
crossentropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

EPOCHS = 300
encoder = Encoder()
decoder = Decoder()
for epoch in range(EPOCHS):
  start = time.time()

  en_initial_states = encoder.init_state(batch_size)

  for batch, (source_seq, target_seq, target_labels) in enumerate(dataset):
    loss = train_step(source_seq, target_seq, target_labels, en_initial_states)
    elapsed = time.time() - start

  if epoch % 10 == 0:
    print("Epoch #%d, Loss %.4f, Time %.2f sec" % (epoch, loss, elapsed))
    input_sent, target_sent, translation = translate(idx=None)
    print("Input: %s\nTarget: %s\nTranslation: %s\n" % (input_sent, target_sent, translation))



Epoch #0, Loss 1.5785, Time 4.39 sec
Input: <start> ¿ Te sorprendiste ? <end>
Target: <start> Were you surprised ? <end>
Translation: i i the lot . <end>

Epoch #10, Loss 1.0896, Time 2.10 sec
Input: <start> Habia muchos ninos en la plaza . <end>
Target: <start> There were many children in the square . <end>
Translation: i m not a lot of the party is the same castle . <end>

Epoch #20, Loss 0.8588, Time 2.38 sec
Input: <start> Nadie quiere responder esa pregunta . <end>
Target: <start> No one wants to answer that question . <end>
Translation: i m not a little restless . <end>

Epoch #30, Loss 0.6938, Time 2.10 sec
Input: <start> Me das miedo . <end>
Target: <start> You scare me . <end>
Translation: i m not a bit tired . <end>

Epoch #40, Loss 0.5445, Time 2.12 sec
Input: <start> Tuve un mal sueno anoche . <end>
Target: <start> I had a bad dream last night . <end>
Translation: i m not a bit tired . <end>

Epoch #50, Loss 0.4342, Time 2.12 sec
Input: <start> ¿ Me vas a echar la culpa de 

In [0]:
references_spanish, hypotheses_spanish, inputs_spanish = [], [],[]

for i in range(len(sentences)):
  input_sent, target_sent, translation = translate()
  references_spanish.append(target_sent)
  hypotheses_spanish.append("<start> " + translation)
  inputs_spanish.append(input_sent)
  
results = sacrebleu.raw_corpus_bleu(hypotheses_spanish, [references_spanish])
print(results)

BLEU(score=65.46592398170365, counts=[16298, 11744, 9363, 7364], totals=[19525, 17525, 15525, 13525], precisions=[83.47247119078105, 67.01283880171184, 60.309178743961354, 54.44731977818854], bp=1.0, sys_len=19525, ref_len=19483)


# PART 3 - Back Translation

For back translation the following workflow is used:

Translate English to Spanish ----> Store translated Spanish sentence in *hypotheses* array ----> Translate *hypotheses* array to English ----> Compare  with original English sentence


We first create a new array of tuples which will be our new *sentences* array. The input sentence is given as the hypotheses, and target as the original English sentence. We then ttokenise the new input sentences on the **same** tokenizer used in the Spanish to English model above. 

The rest of the procedure that's followed is the same as above. The BLEU score is calculated based on the original English sentence and the new translated English sentence. 

The BLEU score acheived is 63.4 (lower than the previous two BLEU scores)

In [0]:
tr = []

for i in range(0,len(hypotheses_eng)):
  data_point = []
  data_point.append(hypotheses_eng[i])
  data_point.append(input_eng[i])
  tr.append(tuple(data_point))
  
sentences = tr

source_sentences = hypotheses_eng
source_data = source_tokenizer.texts_to_sequences(source_sentences)
source_data = tf.keras.preprocessing.sequence.pad_sequences(source_data, padding='post')

source_vocab_size = len(source_tokenizer.word_index) + 1

In [0]:

references_back, hypotheses_back, inputs_back = [], [],[]

for i in range(len(hypotheses_eng)):
  input_sent, target_sent, translation = translate()
  references_back.append(target_sent)
  hypotheses_back.append("<start> " + translation)
  inputs_back.append(input_sent)
  
results = sacrebleu.raw_corpus_bleu(hypotheses_back, [references_back])
print(results)

BLEU(score=63.44463190858362, counts=[15889, 11362, 8986, 7060], totals=[19435, 17435, 15435, 13435], precisions=[81.75456650373039, 65.16776598795526, 58.21833495302883, 52.54931149981392], bp=0.9985089593756523, sys_len=19435, ref_len=19464)


In [23]:
for i in range(0,10):
  print("Input2:\t", references_back[i])
  print("Input2: \t", inputs_back[i])
  print("Output: \t", hypotheses_back[i])
  print("\n")

Input2:	 <start> What is the book about ? <end>
Input2: 	 <start> ¿ de que trata el libro ? <end>
Output: 	 <start> what is the book about ? <end>


Input2:	 <start> Let s put this near the door . <end>
Input2: 	 <start> pongamos esto cerca de la puerta . <end>
Output: 	 <start> let s put this near the door . <end>


Input2:	 <start> Hundreds of soldiers ate in silence around their campfires . <end>
Input2: 	 <start> cientos de soldados comieron en silencio alrededor de sus fogatas . <end>
Output: 	 <start> hundreds of soldiers ate in silence around their campfires . <end>


Input2:	 <start> I am repairing the washing machine . <end>
Input2: 	 <start> estoy arreglando el lavarropa . <end>
Output: 	 <start> i am repairing the washing machine . <end>


Input2:	 <start> Tom didn t come to the last meeting . <end>
Input2: 	 <start> tom no vino a la ultima reunion . <end>
Output: 	 <start> tom didn t come to the last meeting . <end>


Input2:	 <start> Sleeping in class is not allowed . <end