In [117]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable tensorflow debugging logs
import tensorflow as tf
from tensorflow.keras import layers
import matplotlib.pyplot as plt

## Importar dataset

In [None]:
!pip install -q tensorflow-datasets tensorflow --user

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-text 2.10.0 requires tensorflow<2.11,>=2.10.0; platform_machine != "arm64" or platform_system != "Darwin", but you have tensorflow 2.12.0 which is incompatible.
tensorflow-gpu 2.10.0 requires keras<2.11,>=2.10.0, but you have keras 2.12.0 which is incompatible.
tensorflow-gpu 2.10.0 requires protobuf<3.20,>=3.9.2, but you have protobuf 4.23.2 which is incompatible.
tensorflow-gpu 2.10.0 requires tensorboard<2.11,>=2.10, but you have tensorboard 2.12.3 which is incompatible.
tensorflow-gpu 2.10.0 requires tensorflow-estimator<2.11,>=2.10.0, but you have tensorflow-estimator 2.12.0 which is incompatible.


In [118]:
import tensorflow_datasets as tfds

In [119]:
# Load IMDB reviews as (text, label) pairs; the result is indexed by split name.
dataset = tfds.load(name='imdb_reviews', as_supervised=True)

In [120]:
# Keep the untouched train/test splits under explicit "raw" names.
raw_train_ds = dataset['train']
raw_test_ds = dataset['test']

In [121]:
# Peek at a single raw example: the review text (a bytes string) and its
# integer label.
for text, label in raw_train_ds.take(1):
    print(text.numpy(), label.numpy())

b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it." 0


## Preparar dataset

In [122]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras import Sequential
from tensorflow.keras import layers 
import re

In [123]:
# Let tf.data pick the parallelism / prefetch depth at runtime.
AUTOTUNE = tf.data.AUTOTUNE
# Shuffle buffer spanning the whole training split (a scalar tensor).
BUFFER_SIZE = raw_train_ds.cardinality()
BUFFER_SIZE.numpy()

25000

In [124]:
batch_size = 128   # examples per batch
voc_size = 5000    # vocabulary size used by the tokenizer and embeddings

# Training pipeline: shuffle over the full buffer, batch, then prefetch.
train_ds = (
    raw_train_ds
    .shuffle(buffer_size=BUFFER_SIZE)
    .batch(batch_size, num_parallel_calls=AUTOTUNE)
    .prefetch(buffer_size=AUTOTUNE)
)

# Evaluation pipeline: same batching, but no shuffling.
test_ds = (
    raw_test_ds
    .batch(batch_size, num_parallel_calls=AUTOTUNE)
    .prefetch(buffer_size=AUTOTUNE)
)

In [125]:
# Inspect the first review of one shuffled training batch.
for text, label in train_ds.take(1):
    print(text[0])

tf.Tensor(b'If it were not for the "Oh So Gourgous," Natassia Malthe, this B- movie would not have been worth one sector of my Tivo disk space! In what low rent, back lot warehouse was the supposed space port filmed in? "Continuity People!" It\'s a basic principle in real movie making! By night an alleged space port and by day (night and day on a space station?) a warehouse!??!? People Please! The only thing I will commend this movie for, is the wardrobe dept. for continuously, keeping Natassia in those tight shape revealing outfits! Even the women who saw this bomb had to appreciate the outfits that she obviously spent some time getting into, each day of filming! The Sci-fi channel would have been better off showing SpaceBalls! At least there would have been some real humor in watching something so unbelievable.<br /><br />P.S. Michael Ironside, please Fire Your Agent ASAP! You are so much better of an actor, to be even associated with this level of movie making.', shape=(), dtype=str

## Tokenización

In [126]:
maxlen = 128  # every review is padded / truncated to this many tokens

# Maps raw strings to fixed-length sequences of integer token ids.
vectorize_layer = TextVectorization(
    output_mode='int',
    max_tokens=voc_size,
    output_sequence_length=maxlen,
)

- Adaptar la capa

In [127]:
# Text-only view of the training batches (labels dropped) used to fit the
# tokenizer vocabulary.
vectorize_layer_ds = train_ds.map(lambda review, _label: review)
vectorize_layer.adapt(vectorize_layer_ds)

In [128]:
# First ten vocabulary entries; per the output below, index 0 is the empty
# (padding) token and index 1 is '[UNK]'.
vectorize_layer.get_vocabulary()[:10]

['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it']

- Probar vectorize_layer con batch de prueba

In [129]:
# A one-example batch of raw text, reused below to smoke-test the models.
test_batch = tf.constant([['Hi there']], dtype=tf.string)
vectorize_layer(test_batch)

<tf.Tensor: shape=(1, 128), dtype=int64, numpy=
array([[ 1, 48,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])>

## Definir LSTM

RNN:
\begin{equation}
h_t = f(Wx_t + Uh_{t-1} + b)
\end{equation}



LSTM:

\begin{align}
i_t & = \sigma(W^ix_t + U^ih_{t-1} + b^i) \\
f_t & = \sigma(W^fx_t + U^fh_{t-1} + b^f) \\
o_t & = \sigma(W^ox_t + U^oh_{t-1} + b^o) \\
g_t & = \text{tanh}(W^gx_t + U^gh_{t-1} + b^g) \\
c_t & = f_t \odot c_{t-1} + i_t \odot g_t\\
h_t & = o_t \odot \text{tanh}(c_t)
\end{align}

In [130]:
# Bidirectional-LSTM sentiment classifier operating directly on raw strings.
lstm = Sequential([
    vectorize_layer,                                        # strings -> token ids
    layers.Embedding(input_dim=voc_size, output_dim=128),   # ids -> vectors
    layers.Bidirectional(layers.LSTM(units=128)),           # sequence -> one vector
    layers.Dense(units=64, activation='relu'),
    layers.Dense(units=1),                                  # single raw logit
])

- Probar LSTM con batch de prueba

In [131]:
# Forward pass on the test batch; the value is a raw logit because the final
# Dense layer has no activation.
lstm(test_batch)

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.00345709]], dtype=float32)>

- Información del modelo

In [132]:
# Layer-by-layer overview with output shapes and parameter counts.
lstm.summary()

Model: "sequential_22"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_1 (TextV  (None, 128)              0         
 ectorization)                                                   
                                                                 
 embedding_23 (Embedding)    (None, 128, 128)          640000    
                                                                 
 bidirectional_1 (Bidirectio  (None, 256)              263168    
 nal)                                                            
                                                                 
 dense_82 (Dense)            (None, 64)                16448     
                                                                 
 dense_83 (Dense)            (None, 1)                 65        
                                                                 
Total params: 919,681
Trainable params: 919,681
Non-t

## Entrenamiento LSTM

In [133]:
# Binary cross-entropy computed on raw logits (from_logits=True), matching the
# model's final Dense(1) layer, which has no activation.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

In [134]:
# Adam optimizer dedicated to the LSTM model.
lstm_opt = tf.keras.optimizers.Adam(learning_rate=1e-4)

In [135]:
# Running means of the per-batch losses; the training loop resets them after
# each epoch's report.
train_loss_avg = tf.keras.metrics.Mean(name='train_loss')
val_loss_avg = tf.keras.metrics.Mean(name='val_loss')

In [136]:
# Number of full passes over train_ds for the LSTM run.
epochs = 5

In [137]:
@tf.function
def train_step(text, target):
    """Run one optimisation step of the LSTM model on a single batch.

    Args:
        text: batch of raw review strings fed to ``lstm``.
        target: batch of integer labels; cast to float32 for the loss.

    Side effects:
        Updates the weights of ``lstm`` through ``lstm_opt`` and accumulates
        the batch loss into ``train_loss_avg``.
    """
    labels = tf.cast(target, tf.float32)

    # Only the forward pass and the loss need to be recorded for autodiff.
    with tf.GradientTape() as tape:
        batch_loss = loss(labels, lstm(text, training=True))

    weights = lstm.trainable_weights
    grads = tape.gradient(batch_loss, weights)
    lstm_opt.apply_gradients(zip(grads, weights))
    train_loss_avg.update_state(batch_loss)
    
@tf.function
def test_step(text, target):
    """Evaluate the LSTM model on a single batch without updating weights.

    Args:
        text: batch of raw review strings fed to ``lstm``.
        target: batch of integer labels; cast to float32 for the loss.

    Side effects:
        Accumulates the batch loss into ``val_loss_avg``.
    """
    # No GradientTape here: evaluation never differentiates, so recording the
    # forward pass would only cost memory and time.
    logits = lstm(text, training=False)
    loss_value = loss(tf.cast(target, tf.float32), logits)

    val_loss_avg(loss_value)

In [138]:
# Show the labels of one training batch; they are integers, which is why the
# step functions cast them to float32 before computing the loss.
for text, target in train_ds.take(1):
    print(target)

tf.Tensor(
[1 0 1 1 0 0 0 1 1 1 0 0 1 1 1 0 1 0 1 1 1 0 0 1 0 0 0 1 0 1 0 1 0 1 0 1 1
 1 1 1 1 1 0 1 0 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 1 0 1 0 0
 1 1 1 0 1 1 0 1 0 1 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 1 1 1 0 1 1 1 1 0 0 0
 0 1 1 1 1 0 0 0 0 1 0 1 1 0 0 1 0], shape=(128,), dtype=int64)


In [139]:
# Train the LSTM: one pass over train_ds, then one pass over test_ds per epoch.
for epoch in range(epochs):
    for text, target in train_ds:
        train_step(text, target)

    print(f'Epoch: {epoch} Train loss: {train_loss_avg.result():.4f}')
    # reset_state() is the current name; reset_states() is a deprecated alias.
    train_loss_avg.reset_state()

    for text, target in test_ds:
        test_step(text, target)

    print(f'Val loss: {val_loss_avg.result():.4f}')
    val_loss_avg.reset_state()

Epoch: 0 Train loss: 0.6523
Val loss: 0.5014
Epoch: 1 Train loss: 0.4057
Val loss: 0.3892
Epoch: 2 Train loss: 0.3279
Val loss: 0.3821
Epoch: 3 Train loss: 0.2923
Val loss: 0.3806
Epoch: 4 Train loss: 0.2724
Val loss: 0.3899


## Definir Transformer

<img src="../img/dot_product.png" width="500"/>

__Imagen tomada de Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., ... & Polosukhin, I. (2017). Attention is all you need. Advances in neural information processing systems, 30.__

\begin{equation}
\mbox{MultiHead}(Q, K, V) = \text{Concat}(\mbox{head}_1,\mbox{head}_2,\ldots,\mbox{head}_h)W^O,
\end{equation}

\begin{equation}
\mbox{head}_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V) = \text{softmax}\left[\frac{QW_i^Q(KW_i^K)^T}{\sqrt{d_k}}\right]VW_i^V,
\label{eq:selfattention}
\end{equation}

In [140]:
class MultiHeadAttention(layers.Layer):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are linearly projected to ``model_dim``, split
    into ``n_heads`` heads of size ``model_dim // n_heads``, attended per
    head, merged back and passed through an output projection.

    Args:
        model_dim: width of the projections; must be divisible by ``n_heads``.
        n_heads: number of attention heads.
        rate: dropout rate applied to the attention weights and to the output.
        initializer: kernel initializer for all four Dense projections.
    """

    def __init__(self, model_dim, n_heads, rate=0.1, initializer='glorot_uniform'):
        super().__init__()
        self.n_heads = n_heads
        self.model_dim = model_dim

        assert model_dim % self.n_heads == 0

        self.head_dim = model_dim // self.n_heads

        def projection():
            # All projections share the same width and initializer.
            return layers.Dense(model_dim, kernel_initializer=initializer)

        self.wq = projection()
        self.wk = projection()
        self.wv = projection()

        self.dropout1 = layers.Dropout(rate)  # on the attention weights
        self.dropout2 = layers.Dropout(rate)  # on the projected output

        self.wo = projection()

    def split_heads(self, x, batch_size):
        """Reshape ``x`` to (batch, n_heads, seq, head_dim)."""
        per_head = tf.reshape(x, (batch_size, -1, self.n_heads, self.head_dim))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, q, k, v):
        """Attend from ``q`` over ``k``/``v``; returns (batch, seq, model_dim)."""
        n_batch = tf.shape(q)[0]

        # Project each input, then split the result into heads.
        q = self.split_heads(self.wq(q), n_batch)
        k = self.split_heads(self.wk(k), n_batch)
        v = self.split_heads(self.wv(v), n_batch)

        # Scaled dot-product scores, normalised over the key axis.
        scale = tf.math.sqrt(tf.cast(self.head_dim, tf.float32))
        scores = tf.matmul(q, k, transpose_b=True) / scale
        weights = self.dropout1(tf.nn.softmax(scores, axis=-1))

        # Weighted sum of values, then merge the heads back together.
        context = tf.matmul(weights, v)
        context = tf.transpose(context, perm=[0, 2, 1, 3])
        merged = tf.reshape(context, (n_batch, -1, self.model_dim))

        return self.dropout2(self.wo(merged))
    
# Smoke test: self-attention (q = k = v) over an all-ones input with 2 heads.
x = tf.ones([1, 16, 128]) # Sequence length 16
MultiHeadAttention(128,2)(x,x,x)

<tf.Tensor: shape=(1, 16, 128), dtype=float32, numpy=
array([[[ 1.7492265 ,  0.15875828, -1.2607868 , ...,  0.41106725,
         -0.24703759, -1.7047443 ],
        [ 1.7492265 ,  0.15875828, -1.2607868 , ...,  0.41106725,
         -0.24703759, -1.7047443 ],
        [ 1.7492265 ,  0.15875828, -1.2607868 , ...,  0.41106725,
         -0.24703759, -1.7047443 ],
        ...,
        [ 1.7492265 ,  0.15875828, -1.2607868 , ...,  0.41106725,
         -0.24703759, -1.7047443 ],
        [ 1.7492265 ,  0.15875828, -1.2607868 , ...,  0.41106725,
         -0.24703759, -1.7047443 ],
        [ 1.7492265 ,  0.15875828, -1.2607868 , ...,  0.41106725,
         -0.24703759, -1.7047443 ]]], dtype=float32)>

- Definir embedding de posición

In [141]:
class TokenEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: number of positions in the position-embedding table.
        vocab_size: number of rows in the token-embedding table.
        emb_dim: embedding width shared by both tables.
        rate: dropout rate applied to the summed embeddings.
    """

    def __init__(self, maxlen, vocab_size, emb_dim, 
                 rate=0.0):
        super(TokenEmbedding, self).__init__()
        self.max_len = maxlen
        self.token_emb = layers.Embedding(
            input_dim=vocab_size, output_dim=emb_dim)
        self.position_emb = layers.Embedding(
            input_dim=maxlen, output_dim=emb_dim)
        self.dropout = layers.Dropout(rate)

    def call(self, x):
        """Embed integer token ids ``x`` whose last axis is the sequence axis."""
        token_embeddings = self.token_emb(x)
        # Build positions from the actual input length rather than the fixed
        # maxlen, so sequences shorter than maxlen no longer fail on the
        # broadcast add. For inputs of length maxlen the result is unchanged.
        # NOTE(review): inputs longer than maxlen index past the position
        # table; callers must still truncate to maxlen.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        position_embeddings = self.position_emb(positions)
        return self.dropout(token_embeddings + position_embeddings) 

In [142]:
class TransformerBlock(tf.keras.layers.Layer):
    """Self-attention followed by an MLP, each wrapped in residual + LayerNorm.

    Args:
        model_dim: width of the block's input and output.
        n_heads: number of attention heads.
        mlp_dim: hidden width of the two-layer MLP.
        rate: dropout rate used in the attention, after it, and in the MLP.
        eps: epsilon for both LayerNormalization layers.
    """

    def __init__(self, model_dim, n_heads=2, mlp_dim=512, 
                 rate=0.0, eps=1e-6):
        super(TransformerBlock, self).__init__()
        self.attn = MultiHeadAttention(model_dim, n_heads, rate)
        self.mlp = tf.keras.Sequential([
            layers.Dense(mlp_dim, activation='gelu'), 
            layers.Dense(model_dim),
            layers.Dropout(rate)
        ])
        self.ln1 = layers.LayerNormalization(epsilon=eps)
        self.ln2 = layers.LayerNormalization(epsilon=eps)
        self.drop1 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        ``training`` defaults to None so the block can be called without it
        (as ``Transformer.call`` does); Keras then resolves the mode from the
        enclosing call. It is passed explicitly to the dropout and the MLP.
        """
        # NOTE(review): the attention layer already applies dropout to its
        # output, so drop1 is a second dropout on the same tensor.
        x = self.drop1(self.attn(inputs, inputs, inputs), training=training) 
        x = self.ln1(x + inputs)  # residual + norm around attention
        return self.ln2(self.mlp(x, training=training) + x)  # residual + norm around MLP
    
# Construct a block with model_dim=128 as a quick instantiation check.
# NOTE(review): `block` is not referenced again in the visible notebook.
block = TransformerBlock(128)

In [143]:
class Transformer(tf.keras.models.Model):
    """Single-block Transformer encoder for binary text classification.

    Raw strings are tokenized, embedded (token + position), passed through
    one TransformerBlock, average-pooled over the sequence and mapped to a
    single logit.

    Args:
        model_dim: embedding / block width.
        voc_size: vocabulary size for the token embedding.
        mlp_dim: hidden width of the block's MLP.
        maxlen: number of positions in the position embedding.
        heads: number of attention heads.
    """

    def __init__(self, model_dim, voc_size, mlp_dim=256, 
                 maxlen=128, heads=4):
        super(Transformer, self).__init__()
        self.emb = TokenEmbedding(maxlen, voc_size, model_dim, rate=0.25)
        self.block = TransformerBlock(model_dim, heads, mlp_dim, rate=0.2)
        self.out = tf.keras.Sequential([
            layers.GlobalAveragePooling1D(),
            layers.Dense(1)
        ])
    
    def call(self, x, training=None):
        """Return one logit per input string.

        ``training`` is accepted explicitly (the train/test steps pass it)
        and forwarded to the sub-layers whose ``call`` takes it.
        """
        # NOTE: relies on the notebook-level `vectorize_layer`; it is not an
        # attribute of this model.
        x = vectorize_layer(x)
        x = self.emb(x)
        x = self.block(x, training=training)
        x = self.out(x, training=training)
        return x
    
# Build the model (width 512, 8 heads) and run the test batch through it once
# so its weights get created.
transformer = Transformer(model_dim=512, voc_size=voc_size, heads=8)
transformer(test_batch)

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[1.4275923]], dtype=float32)>

In [144]:
# Overview of the model's sub-layers. The tokenizer does not appear because
# it is a notebook-level object used inside call(), not a model attribute.
transformer.summary()

Model: "transformer_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 token_embedding_11 (TokenEm  multiple                 2625536   
 bedding)                                                        
                                                                 
 transformer_block_13 (Trans  multiple                 1315584   
 formerBlock)                                                    
                                                                 
 sequential_25 (Sequential)  (1, 1)                    513       
                                                                 
Total params: 3,941,633
Trainable params: 3,941,633
Non-trainable params: 0
_________________________________________________________________


## Entrenamiento Transformer
- Utilizar los mismos parámetros del LSTM

In [145]:
# Binary cross-entropy on raw logits (from_logits=True), matching the
# Transformer's final Dense(1) layer, which has no activation.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

In [146]:
# Adam optimizer dedicated to the Transformer model.
trans_opt = tf.keras.optimizers.Adam(learning_rate=1e-4)

In [147]:
# Fresh running-mean metrics for the Transformer run; these rebind the names
# used earlier for the LSTM run.
train_loss_avg = tf.keras.metrics.Mean(name='train_loss')
val_loss_avg = tf.keras.metrics.Mean(name='val_loss')

In [148]:
# Number of full passes over train_ds for the Transformer run.
# NOTE(review): the LSTM run above used epochs = 5 — confirm the difference
# is intentional when comparing the two models.
epochs = 2

In [149]:
@tf.function
def train_step(text, target):
    """Run one optimisation step of the Transformer on a single batch.

    Args:
        text: batch of raw review strings fed to ``transformer``.
        target: batch of integer labels; cast to float32 for the loss.

    Side effects:
        Updates the weights of ``transformer`` through ``trans_opt`` and
        accumulates the batch loss into ``train_loss_avg``.
    """
    labels = tf.cast(target, tf.float32)

    # Only the forward pass and the loss need to be recorded for autodiff.
    with tf.GradientTape() as tape:
        batch_loss = loss(labels, transformer(text, training=True))

    weights = transformer.trainable_weights
    grads = tape.gradient(batch_loss, weights)
    trans_opt.apply_gradients(zip(grads, weights))
    train_loss_avg.update_state(batch_loss)
    
@tf.function
def test_step(text, target):
    """Evaluate the Transformer on a single batch without updating weights.

    Args:
        text: batch of raw review strings fed to ``transformer``.
        target: batch of integer labels; cast to float32 for the loss.

    Side effects:
        Accumulates the batch loss into ``val_loss_avg``.
    """
    # No GradientTape here: evaluation never differentiates, so recording the
    # forward pass would only cost memory and time.
    logits = transformer(text, training=False)
    loss_value = loss(tf.cast(target, tf.float32), logits)

    val_loss_avg(loss_value)

In [150]:
# Train the Transformer: one pass over train_ds, then one over test_ds per epoch.
for epoch in range(epochs):
    for text, target in train_ds:
        train_step(text, target)

    print(f'Epoch: {epoch} Train loss: {train_loss_avg.result():.4f}')
    # reset_state() is the current name; reset_states() is a deprecated alias.
    train_loss_avg.reset_state()

    for text, target in test_ds:
        test_step(text, target)

    print(f'Val loss: {val_loss_avg.result():.4f}')
    val_loss_avg.reset_state()

Epoch: 0 Train loss: 0.5326
Val loss: 0.4240
Epoch: 1 Train loss: 0.3606
Val loss: 0.3708


## Ejercicio

- Modificar los hiperparámetros de los modelos para obtener mejores resultados.
- Modificar las arquitecturas, comparar resultados con GRU.
- Agregar y modificar regularización.

Hiperparámetros a explorar: tamaño del embedding, tamaño del perceptrón multicapa, dropout y profundidad.