In [1]:
'''!wget http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip
!unzip -q spa-eng.zip'''

'!wget http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip\n!unzip -q spa-eng.zip'

In [2]:
text_file = r"path to spa.txt"
with open(text_file, encoding='utf-8') as f:
    lines = f.read().split("\n")[:-1]
text_pairs = [] 

for line in lines: 
    english, spanish = line.split("\t") 
    spanish = "[start] " + spanish + " [end]" 
    text_pairs.append((english, spanish))

In [3]:
import random
print(random.choice(text_pairs))

('You had better hurry. The train leaves at three.', '[start] Sería mejor que os dierais prisa, el tren sale a las tres. [end]')


In [4]:
#Splitting into train-test-val

random.shuffle(text_pairs)
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2*num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples: num_train_samples+num_val_samples]
test_pairs = text_pairs[num_train_samples+num_val_samples:]

In [5]:
#Two different TextVectorization layers for English and Spanish; Spanish has the extra ulta ? to be stripped, 
#and we want to preserve [start] and [end] tokens that we've inserted

#Because toy example, we're stripping all punctuation; otherwise, we would tokenise them too so that generated
#output has correct punctuation

import tensorflow as tf
import string
import re

#Find a way to get ulta question mark
#strip_chars = string.punctuation + '?'

strip_chars = string.punctuation
strip_chars = strip_chars.replace('[', '')
strip_chars = strip_chars.replace(']', '')

def custom_standardization(input_string):
  lowercase = tf.strings.lower(input_string)
  return tf.strings.regex_replace(lowercase, f"[{re.escape(strip_chars)}]", '')

In [6]:
vocab_size=15000
sequence_length=20

from tensorflow.keras import layers as layers
import tensorflow.keras as keras

source_vectorization = layers.TextVectorization(max_tokens=vocab_size, output_mode='int', output_sequence_length=sequence_length)
target_vectorization = layers.TextVectorization(max_tokens=vocab_size, output_mode='int', output_sequence_length=sequence_length+1,
                                                standardize=custom_standardization)

train_english_texts = [pair[0] for pair in train_pairs]
train_spanish_texts = [pair[1] for pair in train_pairs]
source_vectorization.adapt(train_english_texts)
target_vectorization.adapt(train_spanish_texts)

In [7]:
#Preparing datasets for translation

batch_size = 64

def format_dataset(eng, spa):
  eng = source_vectorization(eng)
  spa = target_vectorization(spa)
  return ({'english': eng, 'spanish':spa[:, :-1]}, spa[:, 1:])


def make_dataset(pairs):
  eng_texts, spa_texts = zip(*pairs)
  eng_texts, spa_texts = list(eng_texts), list(spa_texts)
  dataset = tf.data.Dataset.from_tensor_slices((eng_texts, spa_texts))
  dataset = dataset.batch(batch_size)
  dataset = dataset.map(format_dataset, num_parallel_calls=4)
  return dataset.shuffle(2048).prefetch(16).cache()

train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [8]:
for inputs, targets in train_ds.take(1):
  print(f"inputs['english'].shape: {inputs['english'].shape}")
  print(f"inputs['spanish'].shape: {inputs['spanish'].shape}")

inputs['english'].shape: (64, 20)
inputs['spanish'].shape: (64, 20)


In [9]:
#Seq2Seq with RNN

inputs = keras.Input(shape=(sequence_length, ), dtype='int64')
x = layers.Embedding(input_dim=vocab_size, output_dim=128)(inputs)
x = layers.LSTM(32, return_sequences=True)(x)
outputs = layers.Dense(vocab_size, activation='softmax')(x)
model = keras.Model(inputs, outputs)
model.summary()

#Just plain bad, we don't bother actually implementing it

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 20)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 20, 128)           1920000   
_________________________________________________________________
lstm (LSTM)                  (None, 20, 32)            20608     
_________________________________________________________________
dense (Dense)                (None, 20, 15000)         495000    
Total params: 2,435,608
Trainable params: 2,435,608
Non-trainable params: 0
_________________________________________________________________


In [10]:
#GRU-Based Encoder

embed_dim=256
latent_dim=1024

source = keras.Input(shape=(None, ), dtype='int64', name='english')
x = layers.Embedding(vocab_size, embed_dim, mask_zero=True)(source)
encoded_source = layers.Bidirectional(layers.GRU(latent_dim), merge_mode='sum')(x)


#GRU-Based Decoder

past_target = keras.Input(shape=(None, ), dtype='int64', name='spanish')
x = layers.Embedding(vocab_size, embed_dim, mask_zero=True)(past_target)
decoder_gru = layers.GRU(latent_dim, return_sequences=True)
x = decoder_gru(x, initial_state=encoded_source)
x = layers.Dropout(0.5)(x)
target_next_step = layers.Dense(vocab_size, activation='softmax')(x)
seq2seq_rnn = keras.Model([source, past_target], target_next_step)

In [11]:
seq2seq_rnn.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
seq2seq_rnn.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
english (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
spanish (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 256)    3840000     english[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 256)    3840000     spanish[0][0]                    
____________________________________________________________________________________________

In [12]:
'''callbacks = keras.callbacks.ModelCheckpoint('seq2seq_rnn.keras', save_best_only=True)
seq2seq_rnn.fit(train_ds, epochs=10, validation_data=val_ds, callbacks=callbacks)'''

"callbacks = keras.callbacks.ModelCheckpoint('seq2seq_rnn.keras', save_best_only=True)\nseq2seq_rnn.fit(train_ds, epochs=10, validation_data=val_ds, callbacks=callbacks)"

In [13]:
#Using Transfomers

#Implementing transformer encoder from scratch

class TransformerEncoder(layers.Layer):
  def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
    super().__init__(**kwargs)
    self.embed_dim = embed_dim
    self.dense_dim = dense_dim
    self.num_heads = num_heads
    self.attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
    self.denseProj = keras.Sequential([layers.Dense(dense_dim, activation='relu'), layers.Dense(embed_dim), ])
    self.layerNorm1 = layers.LayerNormalization()
    self.layerNorm2 = layers.LayerNormalization()

  def call(self, inputs, mask=None):
    if mask is not None:
      mask=mask[:, tf.newaxis, :]
    attention_output = self.attention(inputs, inputs, attention_mask=mask)
    proj_input = self.layerNorm1(inputs+attention_output)
    proj_output = self.denseProj(proj_input)
    return self.layerNorm2(proj_input + proj_output)

  def get_config(self):
    config = super().get_config()
    config.update({'embed_dim':self.embed_dim, 'num_heads':self.num_heads, 'dense_dim': self.dense_dim})
    return config

In [14]:
class PositionalEmbedding(layers.Layer):

 def __init__(self, sequence_length, input_dim, output_dim, **kwargs): 
  super().__init__(**kwargs)
  self.token_embeddings = layers.Embedding(input_dim=input_dim, output_dim=output_dim)
  self.position_embeddings = layers.Embedding(input_dim=sequence_length, output_dim=output_dim) 
  self.sequence_length = sequence_length
  self.input_dim = input_dim
  self.output_dim = output_dim

 def call(self, inputs):
  length = tf.shape(inputs)[-1]
  positions = tf.range(start=0, limit=length, delta=1)
  embedded_tokens = self.token_embeddings(inputs)
  embedded_positions = self.position_embeddings(positions)
  return embedded_tokens + embedded_positions 

 def compute_mask(self, inputs, mask=None): 
  return tf.math.not_equal(inputs, 0) 

 def get_config(self): 
  config = super().get_config()
  config.update({"output_dim": self.output_dim,"sequence_length": self.sequence_length, "input_dim": self.input_dim})
  return config

In [15]:
class TransformerDecoder(layers.Layer):
  def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
    super().__init__(**kwargs)
    self.embed_dim = embed_dim
    self.dense_dim = dense_dim
    self.num_heads = num_heads
    self.attention1 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
    self.attention2 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
    self.denseProj = keras.Sequential([layers.Dense(dense_dim, activation='relu'), layers.Dense(embed_dim), ])
    self.layernorm1 = layers.LayerNormalization()
    self.layernorm2 = layers.LayerNormalization()
    self.layernorm3 = layers.LayerNormalization()
    self.supports_masking=True

  def get_config(self):
    config = super().get_config()
    config.update({'embed_dim': self.embed_dim, 'dense_dim': self.dense_dim, 'num_heads': self.num_heads})
    return config

  def get_casual_attention_mask(self, inputs):
    input_shape = tf.shape(inputs)
    batch_size, sequence_length = input_shape[0], input_shape[1]
    i = tf.range(sequence_length)[:, tf.newaxis]
    j = tf.range(sequence_length)
    mask = tf.cast(i >= j, dtype='int32')
    mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
    mult = tf.concat([tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], axis=0)
    return tf.tile(mask, mult)

  def call(self, inputs, encoder_outputs, mask=None):
    casual_mask = self.get_casual_attention_mask(inputs)
    if mask is not None:
      padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype='int32')
      padding_mask = tf.minimum(padding_mask, casual_mask)
    attention_output_1 = self.attention1(query=inputs, value=inputs, key=inputs, attention_mask=casual_mask)
    attention_output_1 = self.layernorm1(inputs + attention_output_1)
    attention_output_2 = self.attention2(query=attention_output_1, value=encoder_outputs, key=encoder_outputs, attention_mask=padding_mask)
    attention_output_2 = self.layernorm2(attention_output_1 + attention_output_2)
    proj_output = self.denseProj(attention_output_2)
    return self.layernorm3(attention_output_2 + proj_output)

In [16]:
embed_dim = 256
dense_dim = 2048
num_heads = 8
sequence_length = 600


encoder_inputs = keras.Input(shape=(None, ), dtype='int64', name='english')
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)

decoder_inputs = keras.Input(shape=(None, ), dtype='int64', name='spanish')
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, dense_dim, num_heads)(x, encoder_outputs) 
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x) 
transformer = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [17]:
transformer.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
english (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
spanish (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
positional_embedding (Positiona (None, None, 256)    3993600     english[0][0]                    
__________________________________________________________________________________________________
positional_embedding_1 (Positio (None, None, 256)    3993600     spanish[0][0]                    
____________________________________________________________________________________________

In [18]:
transformer.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
callbacks = keras.callbacks.ModelCheckpoint('transformer.keras', save_best_only=True)
transformer.fit(train_ds, epochs=25, validation_data=val_ds, callbacks=callbacks)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x19123051490>

In [19]:
import numpy as np
spa_vocab = target_vectorization.get_vocabulary()
spa_index_lookup = dict(zip(range(len(spa_vocab)), spa_vocab))
max_decoded_sentence_length = 20
def decode_sequence(input_sentence):
 tokenized_input_sentence = source_vectorization([input_sentence])
 decoded_sentence = "[start]"
 for i in range(max_decoded_sentence_length):
  tokenized_target_sentence = target_vectorization([decoded_sentence])[:, :-1]
  predictions = transformer([tokenized_input_sentence, tokenized_target_sentence]) 
  sampled_token_index = np.argmax(predictions[0, i, :]) 
  sampled_token = spa_index_lookup[sampled_token_index] 
  decoded_sentence += " " + sampled_token 
  if sampled_token == "[end]": 
   break 
 return decoded_sentence
test_eng_texts = [pair[0] for pair in test_pairs] 
for _ in range(20):
 input_sentence = random.choice(test_eng_texts)
 print("-")
 print(input_sentence)
 print(decode_sequence(input_sentence))

-
Tom has been waiting three hours now.
[start] tom ha estado esperando tres horas de ahora [end]
-
I was married once.
[start] estuve casado una vez [end]
-
I'm right, aren't I?
[start] tengo razón yo ¿verdad [end]
-
At the funeral, the widow looked very dignified, with her black suit, hat and gloves.
[start] en el funeral la viuda de la fábrica con una negro y el jersey amarillo [end]
-
The car's antenna is built into the windshield.
[start] el coches registro es construí para la [UNK] [end]
-
He fell asleep behind the wheel and had an accident.
[start] Él se quedó dormido cerca del volante había accidente y un accidente [end]
-
I just hope nothing goes wrong this time.
[start] solo espero que no pase nada en este momento [end]
-
It's not worth reading any further.
[start] no vale la pena leer nada [end]
-
My nose itches.
[start] me duele el [UNK] [end]
-
He will notice sooner or later.
[start] Él se [UNK] tarde o temprano [end]
-
I'll expect you at 2:30.
[start] te espero a las dos 