In [1]:
!pip install -q tensorflow-gpu==2.0.0-alpha0

[K    100% |████████████████████████████████| 332.1MB 67kB/s 
[K    100% |████████████████████████████████| 419kB 12.7MB/s 
[K    100% |████████████████████████████████| 61kB 31.0MB/s 
[K    100% |████████████████████████████████| 3.0MB 10.0MB/s 
[?25h

In [2]:
!pip install sacrebleu # https://github.com/mjpost/sacreBLEU

Collecting sacrebleu
  Downloading https://files.pythonhosted.org/packages/12/5b/7196b11bca204cb6ca9000b5dc910e809081f224c73ef28e9991080e4e51/sacrebleu-1.3.1.tar.gz
Building wheels for collected packages: sacrebleu
  Building wheel for sacrebleu (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/56/c0/fb/1c7f9b3a71f64cdf86291cc645596f71746807bf2f72b3c1dd
Successfully built sacrebleu
Installing collected packages: sacrebleu
Successfully installed sacrebleu-1.3.1


In [3]:
!unzip spa-eng

Archive:  spa-eng.zip
   creating: spa-eng/
  inflating: spa-eng/_about.txt      
  inflating: spa-eng/spa.txt         


In [4]:
!ls spa-eng

_about.txt  spa.txt


In [0]:
import numpy as np
import re
import sacrebleu
import tensorflow as tf
import time
import unicodedata

In [6]:
tf.__version__

'2.0.0-alpha0'

In [7]:
with open('spa-eng/spa.txt') as f:
  sentences=[]
  sentences_reverse=[]
  for _ in range(1000):
    tmp=f.readline()
    tmpl=tmp.split('\t')
    sentences.append(tmpl)
sentences[:5]

[['Go.', 'Ve.\n'],
 ['Go.', 'Vete.\n'],
 ['Go.', 'Vaya.\n'],
 ['Go.', 'Váyase.\n'],
 ['Hi.', 'Hola.\n']]

In [0]:
def preprocess(s):
  # for details, see https://www.tensorflow.org/alpha/tutorials/sequences/nmt_with_attention
  s = ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')
  # creating a space between a word and the punctuation following it
  # eg: "he is a boy." => "he is a boy ."
  s = re.sub(r"([?.!,¿])", r" \1 ", s)
  s = re.sub(r'[" "]+', " ", s)
  # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
  s = re.sub(r"[^a-zA-Z?.!,¿]+", " ", s)
  s = s.strip()
  # adding a start and an end token to the sentence
  # so that the model know when to start and stop predicting.
  s = '<start> ' + s + ' <end>'
  return s

In [9]:
print("Original:", sentences[0])
sentences = [(preprocess(source), preprocess(target)) for (source, target) in sentences]
print("Preprocessed:", sentences[0])

Original: ['Go.', 'Ve.\n']
Preprocessed: ('<start> Go . <end>', '<start> Ve . <end>')


In [10]:
#pair english & spanish
source_sentences, target_sentences = list(zip(*sentences))
source_sentences[0],target_sentences[0]

('<start> Go . <end>', '<start> Ve . <end>')

In [11]:
#tokenize source data
source_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
source_tokenizer.fit_on_texts(source_sentences)
source_data = source_tokenizer.texts_to_sequences(source_sentences)
print("Sequence:", source_data[0])
source_data = tf.keras.preprocessing.sequence.pad_sequences(source_data, padding='post')
print("Padded:", source_data[0])

Sequence: [1, 10, 3, 2]
Padded: [ 1 10  3  2  0  0  0]


In [0]:
#tokenize target data 
target_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
target_tokenizer.fit_on_texts(target_sentences)
target_data = target_tokenizer.texts_to_sequences(target_sentences)
target_data = tf.keras.preprocessing.sequence.pad_sequences(target_data, padding='post')

In [13]:
# Create labels for the decoder by shifting the target sequence
# one to the right.
target_labels = np.zeros(target_data.shape)
target_labels[:,0:target_data.shape[1] -1] = target_data[:,1:]

print("Target sequence", target_data[0])
print("Target label", target_labels[0])

Target sequence [ 1 39  3  2  0  0  0  0  0]
Target label [39.  3.  2.  0.  0.  0.  0.  0.  0.]


In [14]:
# reverse: Create labels for the decoder by shifting the target sequence
# one to the right.
target_labels_reverse = np.zeros(source_data.shape)
target_labels_reverse[:,0:source_data.shape[1] -1] = source_data[:,1:]

print("Target sequence reversed", source_data[0])
print("Target label reversed", target_labels_reverse[0])

Target sequence reversed [ 1 10  3  2  0  0  0]
Target label reversed [10.  3.  2.  0.  0.  0.  0.]


In [15]:
source_vocab_size = len(source_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1
source_vocab_size,target_vocab_size

(373, 822)

In [16]:
def decode(encoded, tokenizer):
  for number in encoded:
    if number !=0:
      print ("%d -> %s" % (number, tokenizer.index_word[number]))
      
decode(source_data[0], source_tokenizer)

1 -> <start>
10 -> go
3 -> .
2 -> <end>


In [0]:
batch_size = 5
#dataset for 2 cond separately
dataset = tf.data.Dataset.from_tensor_slices((source_data, target_data, target_labels)).batch(batch_size)
dataset_reverse = tf.data.Dataset.from_tensor_slices((target_data, source_data, target_labels_reverse)).batch(batch_size)

In [18]:
example_batch = next(iter(dataset_reverse))
source, target, taget_labels = example_batch
print("Shapes:", source.shape, target.shape, taget_labels.shape)

Shapes: (5, 9) (5, 7) (5, 7)


In [19]:
example_batch = next(iter(dataset))
source, target, taget_labels = example_batch
print("Shapes:", source.shape, target.shape, taget_labels.shape)

Shapes: (5, 7) (5, 9) (5, 9)


In [0]:
embedding_size = 32
rnn_size = 64

In [0]:
class Encoder(tf.keras.Model):
  def __init__(self,size):
    super(Encoder, self).__init__()
    
    self.embedding = tf.keras.layers.Embedding(size,
                                               embedding_size)
    self.gru = tf.keras.layers.GRU(rnn_size, 
                                   return_sequences=True, 
                                   return_state=True)
    
  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state=hidden)        
    return output, state
  
  def init_state(self, batch_size):
    return tf.zeros((batch_size, rnn_size))

In [0]:
class Decoder(tf.keras.Model):
  def __init__(self, size):
    super(Decoder, self).__init__()
    self.embedding = tf.keras.layers.Embedding(size, 
                                               embedding_size)
    self.gru = tf.keras.layers.GRU(rnn_size, 
                                   return_sequences=True, 
                                   return_state=True)

    self.dense = tf.keras.layers.Dense(target_vocab_size)


  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state=hidden)
    logits = self.dense(output)
    return logits, state

In [0]:
#init encoder, decoder
encoder = Encoder(source_vocab_size)
decoder = Decoder(target_vocab_size)

encoder_reverse=Encoder(target_vocab_size)
decoder_reverse=Decoder(source_vocab_size)

In [0]:
crossentropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

def calc_loss(targets, logits):
  mask = tf.math.logical_not(tf.math.equal(targets, 0))
  mask = tf.cast(mask, dtype=tf.int64)
  return crossentropy(targets, logits, sample_weight=mask)

In [0]:
def translate(idx=None):
  
    if idx == None: 
      idx = np.random.choice(len(sentences))

    
    input_sent = source_data[idx]
    input_sent = tf.expand_dims(input_sent, axis=0)
    
    hidden_state = encoder.init_state(batch_size=1)
    output, hidden_state = encoder(input_sent, hidden_state)
    
    decoder_input = tf.expand_dims([target_tokenizer.word_index['<start>']], 0)
    out_words = []
    
    decoder_state = hidden_state

    while True:
      
        decoder_output, decoder_state = decoder(decoder_input, decoder_state)
        decoder_input = tf.argmax(decoder_output, -1)
        word_idx = decoder_input.numpy()[0][0]
        # if we've predicted 0 (which is reserved, usually this will only happen
        # before the decoder is trained, just stop translating and return
        # what we have)
        if word_idx == 0: 
          out_words.append('<end>')
        else:
          out_words.append(target_tokenizer.index_word[word_idx])

        if out_words[-1] == '<end>' or len(out_words) >= 20:
          break
          
    translation = ' '.join(out_words)    
    return sentences[idx][0], sentences[idx][1], translation

In [0]:
def translate_reverse(idx=None):
  
    if idx == None: 
      idx = np.random.choice(len(sentences))
    
    input_sent = target_data[idx]
    input_sent = tf.expand_dims(input_sent, axis=0)
    
    hidden_state = encoder_reverse.init_state(batch_size=1)
    output, hidden_state = encoder_reverse(input_sent, hidden_state)
    
    decoder_input = tf.expand_dims([source_tokenizer.word_index['<start>']], 0)
    out_words = []
    
    decoder_state = hidden_state

    while True:
      
        decoder_output, decoder_state = decoder_reverse(decoder_input, decoder_state)
        decoder_input = tf.argmax(decoder_output, -1)
        word_idx = decoder_input.numpy()[0][0]
        # if we've predicted 0 (which is reserved, usually this will only happen
        # before the decoder is trained, just stop translating and return
        # what we have)
        if word_idx == 0: 
          out_words.append('<end>')
        else:
          out_words.append(source_tokenizer.index_word[word_idx])

        if out_words[-1] == '<end>' or len(out_words) >= 20:
          break
          
    translation = ' '.join(out_words)    
    return sentences[idx][1], sentences[idx][0], translation

In [0]:
optimizer = tf.keras.optimizers.Adam()

In [0]:
@tf.function # remove this annotation when debugging
def train_step(source_seq, target_seq, target_labels, initial_state):
  
  with tf.GradientTape() as tape:
    encoder_output, encoder_state = encoder(source_seq, initial_state)
    logits, decoder_state = decoder(target_seq, encoder_state)
    loss = calc_loss(target_labels, logits)

  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return loss

In [0]:
@tf.function # remove this annotation when debugging
def train_step_reverse(source_seq, target_seq, target_labels, initial_state):
  
  with tf.GradientTape() as tape:
    encoder_output, encoder_state = encoder_reverse(source_seq, initial_state)
    logits, decoder_state = decoder_reverse(target_seq, encoder_state)
    loss = calc_loss(target_labels, logits)

  variables = encoder_reverse.trainable_variables + decoder_reverse.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return loss

In [30]:
EPOCHS = 200

for epoch in range(EPOCHS):
    start = time.time()
  
    en_initial_states = encoder.init_state(batch_size)
    
    for batch, (source_seq, target_seq, target_labels) in enumerate(dataset):
      loss = train_step(source_seq, target_seq, target_labels, en_initial_states)
      elapsed = time.time() - start
    
    if epoch % 10 == 0:
      print("Epoch #%d, Loss %.4f, Time %.2f sec" % (epoch, loss, elapsed))
      input_sent, target_sent, translation = translate()
      print("Input: %s\nTarget: %s\nTranslation: %s\n" % (input_sent, target_sent, translation))

Epoch #0, Loss 1.9348, Time 6.78 sec
Input: <start> Look away . <end>
Target: <start> Miren para otro lado . <end>
Translation: . <end>

Epoch #10, Loss 1.2568, Time 0.85 sec
Input: <start> Go . <end>
Target: <start> Vayase . <end>
Translation: ven a tomas . <end>

Epoch #20, Loss 0.9815, Time 0.86 sec
Input: <start> I moved . <end>
Target: <start> Me he mudado . <end>
Translation: tom se quedo . <end>

Epoch #30, Loss 0.7357, Time 0.85 sec
Input: <start> We saw it . <end>
Target: <start> Lo vimos . <end>
Translation: lo echo de falta . <end>

Epoch #40, Loss 0.5728, Time 0.85 sec
Input: <start> Sit down ! <end>
Target: <start> Sentate ! <end>
Translation: vuelve otra vez . <end>

Epoch #50, Loss 0.4719, Time 0.88 sec
Input: <start> Seize him ! <end>
Target: <start> Cogedlo ! <end>
Translation: buen trabajo ! <end>

Epoch #60, Loss 0.3352, Time 0.86 sec
Input: <start> Bring help . <end>
Target: <start> Traed ayuda . <end>
Translation: traed comida . <end>

Epoch #70, Loss 0.2671, Time 

In [31]:
#calculate BLEU score
references, hypotheses = [], []

for i in range(len(sentences)):
  input_sent, target_sent, translation = translate()
  references.append(target_sent)
  hypotheses.append("<start> " + translation)
  
results = sacrebleu.raw_corpus_bleu(hypotheses, [references])
print(results)

BLEU(score=28.890513844757614, counts=[3901, 1813, 752, 207], totals=[5223, 4223, 3223, 2223], precisions=[74.68887612483248, 42.931565237982475, 23.33229910021719, 9.31174089068826], bp=1.0, sys_len=5223, ref_len=5172)


In [32]:
#train reverse
for epoch in range(EPOCHS):
    start = time.time()
  
    en_initial_states = encoder_reverse.init_state(batch_size)
    
    for batch, (source_seq, target_seq, target_labels) in enumerate(dataset_reverse):
      loss_re = train_step_reverse(source_seq, target_seq, target_labels, en_initial_states)
      elapsed = time.time() - start
    
    if epoch % 10 == 0:
      print("Epoch #%d, Loss %.4f, Time %.2f sec" % (epoch, loss, elapsed))
      input_sent, target_sent, translation = translate_reverse()
      print("Input: %s\nTarget: %s\nTranslation: %s\n" % (input_sent, target_sent, translation))

Epoch #0, Loss 0.1234, Time 2.97 sec
Input: <start> Estoy debil . <end>
Target: <start> I m weak . <end>
Translation: tom . <end>

Epoch #10, Loss 0.1234, Time 0.87 sec
Input: <start> Hace frio . <end>
Target: <start> It s cold . <end>
Translation: i m first . <end>

Epoch #20, Loss 0.1234, Time 0.86 sec
Input: <start> Lo echo en falta . <end>
Target: <start> I miss it . <end>
Translation: i ll wait . <end>

Epoch #30, Loss 0.1234, Time 0.88 sec
Input: <start> Espere . <end>
Target: <start> I waited . <end>
Translation: i ll wait . <end>

Epoch #40, Loss 0.1234, Time 0.86 sec
Input: <start> Estoy perfectamente . <end>
Target: <start> I m okay . <end>
Translation: i m alone . <end>

Epoch #50, Loss 0.1234, Time 1.00 sec
Input: <start> Preguntele a cualquiera . <end>
Target: <start> Ask anyone . <end>
Translation: ask anyone . <end>

Epoch #60, Loss 0.1234, Time 0.87 sec
Input: <start> Tomas miente . <end>
Target: <start> Tom lies . <end>
Translation: tom s tom . <end>

Epoch #70, Loss 0

In [33]:
#calculate BLEU score for reversed translation
references_reverse, hypotheses_reverse = [], []

for i in range(len(sentences)):
  input_sent, target_sent, translation = translate_reverse()
  references_reverse.append(target_sent)
  hypotheses_reverse.append("<start> " + translation)
  
results = sacrebleu.raw_corpus_bleu(hypotheses_reverse, [references_reverse])
print(results)

BLEU(score=36.063774172316585, counts=[4161, 2129, 1137, 299], totals=[5325, 4325, 3325, 2325], precisions=[78.14084507042253, 49.225433526011564, 34.19548872180451, 12.86021505376344], bp=1.0, sys_len=5325, ref_len=5310)


In [0]:
#translate to spanish
input_list,eng_spa=[],[]
for i in range(len(sentences)):
  input_sent,_, translation = translate(idx=i) 
  eng_spa.append('<start> '+translation)
  input_list.append(input_sent)

In [0]:
#tokenize translated sentences
target_data = target_tokenizer.texts_to_sequences(eng_spa)
target_data = tf.keras.preprocessing.sequence.pad_sequences(target_data, padding='post')

In [0]:
#translate back to english
spa_eng=[]
for i in range(len(sentences)):
  _,_, translation = translate_reverse(idx=i) 
  spa_eng.append('<start> '+translation)

In [37]:
#examples
print('Target | To_Spanish | To_English:')
for i in range(5,15):
  print(input_list[i][7:-5],'|',eng_spa[i][7:-5],'|',spa_eng[i][7:-5])

Target | To_Spanish | To_English:
 Run !  |  corre !  |  run ! 
 Run .  |  corred .  |  run . 
 Who ?  |  ¿ quien ?  |  who ? 
 Fire !  |  fuego !  |  fire ! 
 Fire !  |  fuego !  |  fire ! 
 Fire !  |  fuego !  |  fire ! 
 Help !  |  auxilio !  |  help ! 
 Help !  |  auxilio !  |  help ! 
 Help !  |  auxilio !  |  help ! 
 Jump !  |  salta !  |  jump ! 


In [39]:
#calculate final BLEU score
results = sacrebleu.raw_corpus_bleu(spa_eng, [input_list])
print(results)

BLEU(score=36.29426557834601, counts=[4171, 2137, 1149, 302], totals=[5326, 4326, 3326, 2326], precisions=[78.31393165602704, 49.39898289412852, 34.54600120264582, 12.98366294067068], bp=1.0, sys_len=5326, ref_len=5317)
