# Seq2seq
- En este notebook se define una arquitectura seq2seq para traducir oraciones del inglés al español.

<img src="../img/seq-to-seq.png" width="700"/>

__Imagen tomada de Sutskever, I., Vinyals, O., & Le, Q. V. (2014). Sequence to sequence learning with neural networks. Advances in neural information processing systems, 27.__



In [85]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable tensorflow debugging logs
import pathlib
import random
import string
import re
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
import time

## 1.- Dataset

In [86]:
text_file = keras.utils.get_file(
    fname="spa-eng.zip",
    origin="http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
    extract=True,
)
text_file = pathlib.Path(text_file).parent / "spa-eng" / "spa.txt"

In [87]:
with open(text_file) as f:
    lines = f.read().split("\n")[:-1]
    
text_pairs = []
for line in lines:
    eng, spa = line.split("\t")
    spa = "[start] " + spa + " [end]"
    text_pairs.append((eng, spa))

In [88]:
for _ in range(5):
    print(random.choice(text_pairs))

('He grew old.', '[start] Él envejeció. [end]')
("You're a good friend.", '[start] Sos un buen amigo. [end]')
('Where did you see those women?', '[start] ¿Dónde viste a esas mujeres? [end]')
('He is working in AIDS research.', '[start] Él trabaja en investigación de SIDA. [end]')
("This shouldn't be repeated.", '[start] Esto no debería repetirse. [end]')


In [89]:
random.shuffle(text_pairs)
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples : num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples :]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

118964 total pairs
83276 training pairs
17844 validation pairs
17844 test pairs


## 2.- Pipeline

In [171]:
strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

vocab_size = 20000 #Tamaño de vocabulario
maxlen = 10
batch_size = 64

def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "")

eng_vectorization = TextVectorization(
    max_tokens=vocab_size, output_mode="int", 
    output_sequence_length=maxlen,
)
spa_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=maxlen,
    standardize=custom_standardization,
)
train_eng_texts = [pair[0] for pair in train_pairs]
train_spa_texts = [pair[1] for pair in train_pairs]
eng_vectorization.adapt(train_eng_texts)
spa_vectorization.adapt(train_spa_texts)

In [172]:
eng_vectorization([['my name is'], ['my dog is']])

<tf.Tensor: shape=(2, 10), dtype=int64, numpy=
array([[ 18, 233,   8,   0,   0,   0,   0,   0,   0,   0],
       [ 18, 165,   8,   0,   0,   0,   0,   0,   0,   0]])>

In [173]:
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [174]:
def preprocess(eng, spa):
    eng = eng_vectorization(eng)  #Codificador en ingles
    spa = spa_vectorization(spa)  #Decodificador en espaniol
    return tf.reverse(eng, [1]), spa[:, :-1], spa[:, 1:]


def make_dataset(pairs):
    eng_texts, spa_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    spa_texts = list(spa_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, spa_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(preprocess)
    return dataset.shuffle(2048).prefetch(AUTOTUNE).cache()


train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [175]:
for inp_enc, inp_dec, tar_dec in train_ds.take(1):
    print(inp_enc[0], inp_dec[0], tar_dec[0])

tf.Tensor([  0   0   0   0   0 432   2 307  22  79], shape=(10,), dtype=int64) tf.Tensor([   2    7 4005    9  394   16   65    3    0], shape=(9,), dtype=int64) tf.Tensor([   7 4005    9  394   16   65    3    0    0], shape=(9,), dtype=int64)


## 3.- Modelo

In [176]:
emb_dim = 256
model_dim = 512

### Encoder

In [177]:
class Encoder(tf.keras.Model):
    def __init__(self, voc_size, emb_dim, model_dim):
        super(Encoder, self).__init__()
        self.embedding = tf.keras.layers.Embedding(voc_size,
                                                   emb_dim)
        self.gru = tf.keras.layers.GRU(model_dim,
                                       return_sequences=False,
                                       return_state=True)
        
    def call(self, x, state=None):
        x = self.embedding(x)
        x, state = self.gru(x, initial_state=state)
        return x, state
    
    
encoder = Encoder(eng_vectorization.vocabulary_size(),
                  emb_dim, model_dim)
output, enc_state = encoder(inp_enc)
enc_state

<tf.Tensor: shape=(64, 512), dtype=float32, numpy=
array([[-0.00268189,  0.00138597,  0.00644545, ..., -0.00335744,
        -0.02368064, -0.00172296],
       [-0.0038465 ,  0.017239  , -0.00164245, ...,  0.00845426,
         0.0014715 ,  0.00419005],
       [ 0.00493488, -0.00029366, -0.01341123, ..., -0.00229503,
         0.00400272, -0.00774842],
       ...,
       [-0.00468742, -0.0175637 ,  0.00667318, ..., -0.01767453,
        -0.00385535, -0.00040287],
       [ 0.00865721,  0.00904964, -0.00212193, ...,  0.01302145,
        -0.00205267, -0.00147501],
       [ 0.0029376 ,  0.01555982, -0.00907552, ...,  0.00416165,
        -0.00531697,  0.00069319]], dtype=float32)>

In [178]:
output.shape

TensorShape([64, 512])

In [179]:
encoder.summary()

Model: "encoder_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    multiple                  3087104   
                                                                 
 gru_12 (GRU)                multiple                  1182720   
                                                                 
Total params: 4,269,824
Trainable params: 4,269,824
Non-trainable params: 0
_________________________________________________________________


### Decoder

In [180]:
class Decoder(tf.keras.Model):
    def __init__(self, voc_size, emb_dim, model_dim):
        super().__init__(self)
        self.embedding = layers.Embedding(voc_size, emb_dim)
        self.gru = layers.GRU(model_dim,
                              return_sequences=True,
                              return_state=True)
        self.logits = layers.Dense(voc_size)

    def call(self, x, states, return_state=False, training=False):
        x = self.embedding(x, training=training)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.logits(x, training=training)

        if return_state:
            return x, states
        else:
            return x 


decoder = Decoder(voc_size=spa_vectorization.vocabulary_size(),
                  emb_dim=emb_dim,
                  model_dim=model_dim)

decoder(inp_dec[:, :1], enc_state)

<tf.Tensor: shape=(64, 1, 20000), dtype=float32, numpy=
array([[[ 3.4991712e-03,  4.2493772e-04,  7.3836133e-04, ...,
          1.3449894e-03, -7.5503933e-04, -2.0426055e-04]],

       [[ 2.6393642e-03,  1.0523257e-05,  5.1834190e-04, ...,
         -1.5131941e-03, -3.0605118e-03, -1.3780618e-03]],

       [[ 2.8696274e-03, -2.9542629e-04,  9.6997595e-04, ...,
          5.4154196e-04, -1.7812977e-03, -2.3649007e-04]],

       ...,

       [[ 4.3738480e-03,  2.6886776e-04,  5.0340447e-04, ...,
          4.5838271e-04, -1.2118408e-03, -4.2968406e-04]],

       [[ 1.2096075e-03,  6.4217864e-04, -6.5309013e-04, ...,
         -5.8217096e-04,  7.2521070e-04, -3.3441249e-03]],

       [[ 2.3013330e-03, -8.5063197e-04, -1.0327965e-03, ...,
         -2.1219230e-03, -2.9115155e-04, -1.3098387e-03]]], dtype=float32)>

In [181]:
decoder.summary()

Model: "decoder_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_13 (Embedding)    multiple                  5120000   
                                                                 
 gru_13 (GRU)                multiple                  1182720   
                                                                 
 dense_6 (Dense)             multiple                  10260000  
                                                                 
Total params: 16,562,720
Trainable params: 16,562,720
Non-trainable params: 0
_________________________________________________________________


## 4.- Entrenamiento

In [182]:
optimizer = tf.keras.optimizers.Adam(0.001)

def loss_function(label, pred): #Funcion de pérdida para el ejercicio de tarea
    mask = label != 0
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')
    loss = loss_object(label, pred)

    mask = tf.cast(mask, dtype=loss.dtype)  #Mascara porque no todas las oraciones tienen la misma longitud, en el GPU hay que paralelizar. (zero padding, añadiendo ceros al final de palabras muy cortas)
    loss *= mask

    loss = tf.reduce_sum(loss)/tf.reduce_sum(mask)
    return loss

In [183]:
train_loss = tf.keras.metrics.Mean(name='train_loss')

In [184]:
for inp_enc, inp_dec, tar_dec in train_ds.take(1):
    print(inp_enc[:3], inp_dec[:3], tar_dec[:3])  # Oraciones terminan en ceros por padding

tf.Tensor(
[[  0   0   0   0  12  37   4   5 155   6]
 [  0   0   0 249 510  33  80  12  60   3]
 [  0   0   0   0   0 345 341  56 193  78]], shape=(3, 10), dtype=int64) tf.Tensor(
[[   2    8  126    5 1850   41    3    0    0]
 [   2   88    5    7   86  762   17  364    3]
 [   2   14 1527   50  146   32  270    3    0]], shape=(3, 9), dtype=int64) tf.Tensor(
[[   8  126    5 1850   41    3    0    0    0]
 [  88    5    7   86  762   17  364    3    0]
 [  14 1527   50  146   32  270    3    0    0]], shape=(3, 9), dtype=int64)


In [185]:
_, state = encoder(inp_enc, training=True)
state.shape, inp_dec.shape, tar_dec.shape

(TensorShape([64, 512]), TensorShape([64, 9]), TensorShape([64, 9]))

In [186]:
@tf.function
def train_step(inp_enc, inp_dec, tar_dec):  #Agregar loop de validación
    with tf.GradientTape() as tape:
        _, state = encoder(inp_enc, training=True)
        pred = decoder(inp_dec, state, training=True)
        loss_value = loss_function(tar_dec, pred)
        
    weights = encoder.trainable_weights + decoder.trainable_weights #Se asignan variables tanto del codificador como decodificador para calcular pesos
    gradients = tape.gradient(loss_value, weights)
    optimizer.apply_gradients(zip(gradients, weights))
    train_loss(loss_value)


######################################################
validation_loss = tf.keras.metrics.Mean(name="validation_loss")
validation_acc = tf.keras.metrics.Accuracy(name='validation_accuracy')

@tf.function
def test_step(inp_enc, inp_dec, tar_dec):
    _, state = encoder(inp_enc, training=False)
    pred = decoder(inp_dec, state, training=False)
    loss_value = loss_function(tar_dec, pred)
    # validation_acc(tar_dec, tf.math.round(pred))
    validation_loss(loss_value)

#####################################################     

In [187]:

# def test_step(batch, model):
#     x, y = batch            #Textos x, Etiquetas y
#     _, state = encoder(inp_enc, training=False)
#     pred = decoder(inp_dec, state, training=False)
#     #output = model(x, training=False) #Pasamos texto por modelo y nos regresa lotes
#         # Compute loss
#     loss_value = loss_function(y, pred)
#     validation_loss_avg(loss_value)
#     validation_acc_avg(y, tf.math.round(pred))
#     train_loss(loss_value)
#     # print(loss_value,validation_loss_avg,validation_acc_avg)

In [188]:
ids_to_text = tf.keras.layers.StringLookup(
                vocabulary=spa_vectorization.get_vocabulary(),
                mask_token='',
                invert=True)

In [189]:
sentences = ['i love my dog',
             'i love to sleep',
             'the cat wants to eat']

def print_translation(sentence):
    inp = eng_vectorization([sentence])
    inp = tf.reverse(inp, [1])
    _, state = encoder(inp, training=False)
    dec_inp = spa_vectorization(['[start]'])[:, :1]
    output = []
    pred_index = ''

#Mientras la palabra predecida no sea "End", entonces no para
    while pred_index != '[end]': #En el programa 3 no hay codificador ni decodificador, sólo es un transformador, por eso esto hay que modificarlo
        logits, state = decoder(dec_inp, state, return_state=True, training=False)  #Vamos guardando estado oculto 
        dec_inp = tf.argmax(logits, axis=-1)  #Palabra que tiene la mayor probabilidad, para eso es el argmax
        pred_index = ids_to_text(dec_inp)
        output.append(pred_index[0][0].numpy().decode('utf-8'))

    text = ' '.join(output[:-1])
    print(f'Input: {sentence}')
    print(f'Prediction: {text}')

In [190]:
epochs = 7

for epoch in range(1, epochs):
    start = time.time()
    for inp_enc, inp_dec, tar_dec in train_ds:
        train_step(inp_enc, inp_dec, tar_dec)
        
    print(f'\nTime taken for epoch {epoch} is: {time.time() - start:.2f} secs', end=' ')
    print(f'Loss: {train_loss.result():.4f}')
    train_loss.reset_states()
    
    for s in sentences:
        print_translation(s)


    for inp_enc, inp_dec, tar_dec in val_ds:
        test_step(inp_enc, inp_dec, tar_dec)
        
    print(f'\nTime taken for validation epoch {epoch} is: {time.time() - start:.2f} secs', end=' ')
    print(f'Loss validation: {validation_loss.result():.4f}')
    validation_loss.reset_states()
    
    for s in sentences:
        print_translation(s)


Time taken for epoch 1 is: 81.97 secs Loss: 4.9614
Input: i love my dog
Prediction: me encontré mi libro
Input: i love to sleep
Prediction: me encanta el trabajo
Input: the cat wants to eat
Prediction: el perro me dijo que me ayudaré

Time taken for validation epoch 1 is: 86.38 secs Loss validation: 3.6173
Input: i love my dog
Prediction: me encontré mi libro
Input: i love to sleep
Prediction: me encanta el trabajo
Input: the cat wants to eat
Prediction: el perro me dijo que me ayudaré

Time taken for epoch 2 is: 26.11 secs Loss: 3.0584
Input: i love my dog
Prediction: me encanta mi perro
Input: i love to sleep
Prediction: me encanta dormir
Input: the cat wants to eat
Prediction: el profesor me parece que se [UNK]

Time taken for validation epoch 2 is: 28.60 secs Loss validation: 2.7646
Input: i love my dog
Prediction: me encanta mi perro
Input: i love to sleep
Prediction: me encanta dormir
Input: the cat wants to eat
Prediction: el profesor me parece que se [UNK]

Time taken for epoc

## Ejercicio
- Agregar loop de evaluación.
- Mejorar el modelo con las técnicas propuestas en _Sutskever, I., Vinyals, O., & Le, Q. V. (2014). Sequence to sequence learning with neural networks. Advances in neural information processing systems, 27._
- Agreagar mecanismo de atención de _Bahdanau_.

- Tamaño de vocabulario
- Modelo más grande
- Modelo más profundo