# Natural Language Processing with RNN and Attention

Natural language processing is the set of techniques that enable machines to understand human language. In this chapter we are going to discuss how
to use RNNs to build text classifiers, text predictors, and so on. After that we are going to try to do the same thing using __attention__, a mechanism
specifically tailored for this type of task, and finally we are going to try to use __transformers__: the newest and most powerful architecture used for NLP.

## Next character prediction with Char-RNN

We are going to use character prediction to create a model capable of predicting the next character in a Shakespeare text. First let's load the dataset:

In [1]:
import tensorflow as tf

def load_data():
    """Fetch the Shakespeare corpus and return it as a single string.

    The file is retrieved with ``tf.keras.utils.get_file`` and the first
    80 characters are printed as a quick sanity check.

    Returns:
        The full text of the corpus.
    """
    url = "https://homl.info/shakespeare"
    filepath = tf.keras.utils.get_file("shakespeare.txt", url)
    # Decode explicitly so the result does not depend on the platform's default encoding.
    with open(filepath, encoding="utf-8") as f:
        text = f.read()
    print(text[:80])
    return text

text = load_data()

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.


Now we can use the _TextVectorization_ layer to encode this text character by character (we have to set the split parameter to "character", the default value being
"word", and set standardize to "lower" to transform the text to lowercase):

In [10]:
# Character-level vectorizer: one token per character, with the text lowercased first.
text_vec_layer = tf.keras.layers.TextVectorization(split="character", standardize="lower")
# Build the character vocabulary from the whole corpus.
text_vec_layer.adapt([text])
# The layer is called on a batch holding a single document; [0] selects that document's token IDs.
encoded = text_vec_layer([text])[0]

tf.Tensor([21  7 10 ... 22 28 12], shape=(1115394,), dtype=int64)


Each character is mapped to an integer starting at 2 (0 is reserved for padding tokens and 1 for unknown characters). We will not be using either of them,
so we can drop them by subtracting 2 from every ID:

In [3]:
# Shift every ID down by 2 so that the character IDs start at 0.
encoded -= 2
n_tokens = text_vec_layer.vocabulary_size() - 2 # The number of different characters in the dataset.
# Total number of encoded characters.
dataset_size = len(encoded)

And as we did previously, we can split the long sequence into smaller windows and use them to train a sequence-to-sequence network. The targets are the inputs
shifted one time step into the future (at each time step we are only trying to predict the next character).

In [5]:
def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32):
    """Turn an encoded sequence into batches of (inputs, targets) windows.

    Every window of ``length + 1`` consecutive items (sliding by one step) is
    split into an input (all but the last item) and a target (all but the
    first item), so the target is the input shifted one step into the future.

    Args:
        sequence: the encoded text.
        length: number of time steps in each input/target window.
        shuffle: whether to shuffle the windows before batching.
        seed: random seed passed to the shuffle.
        batch_size: number of windows per batch.

    Returns:
        A ``tf.data.Dataset`` yielding ``(inputs, targets)`` batches.
    """
    window_size = length + 1  # one extra item so inputs and targets both span `length` steps

    def split_inputs_targets(batch):
        return batch[:, :-1], batch[:, 1:]

    windows = (
        tf.data.Dataset.from_tensor_slices(sequence)
        .window(window_size, shift=1, drop_remainder=True)
        # window() yields nested datasets; batching each one turns it into a single tensor.
        .flat_map(lambda nested: nested.batch(window_size))
    )
    if shuffle:
        windows = windows.shuffle(buffer_size=10_000, seed=seed)
    return windows.batch(batch_size).map(split_inputs_targets).prefetch(1)

# Now let's try to split the dataset
length=100
tf.random.set_seed(42)
# First 1,000,000 characters for training (shuffled), the next 60,000 for validation, the rest for testing.
train_set = to_dataset(encoded[:1_000_000], length=length, shuffle=True, seed=42)
valid_set = to_dataset(encoded[1_000_000:1_060_000], length=length)
test_set = to_dataset(encoded[1_060_000:], length=length)

# Let's build and train a model
# Embedding -> GRU (one output per time step) -> softmax over the n_tokens possible characters.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16),
    tf.keras.layers.GRU(128, return_sequences=True),
    tf.keras.layers.Dense(n_tokens, activation="softmax")
])
# The targets are integer character IDs, hence the sparse categorical cross-entropy loss.
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam", metrics=["accuracy"])
# Only keep the checkpoint with the best validation accuracy.
model_cpkt = tf.keras.callbacks.ModelCheckpoint(
    "my_shakespeare_model.keras", monitor="val_accuracy", save_best_only=True
)
history = model.fit(train_set, validation_data=valid_set, epochs=10, callbacks=[model_cpkt])

# The trained model expects shifted character IDs, not raw strings, so let's wrap it:
# vectorize the text, shift the IDs by 2 (as we did for the training data), then run the Char-RNN.
text_model = tf.keras.Sequential([
    text_vec_layer,
    tf.keras.layers.Lambda(lambda x: x - 2),
    model,  # without this layer the wrapper would only output token IDs
])

# now let's try to use it for predictions
# [0, -1] selects the first (and only) sequence of the batch and its last time step.
y_proba = text_model.predict(["To be or not to b"])[0, -1]
y_pred = tf.argmax(y_proba) # Get the prediction with the highest probability
# +2 undoes the ID shift so the ID indexes the vectorization layer's vocabulary again.
print(text_vec_layer.get_vocabulary()[y_pred + 2])

Epoch 1/10
     77/Unknown [1m18s[0m 117ms/step - accuracy: 0.1410 - loss: 3.3486

KeyboardInterrupt: 

To generate new text using this model, we could predict the most likely next character, add it to the text, and ask the model for the next character again, repeatedly (this
method is called __greedy decoding__). But in practice this method produces very poor results. Instead we can sample the next character randomly, with
a probability equal to the estimated probability, which guarantees more diverse text.

In [None]:
# The probabilities are converted to log-probabilities before being passed to tf.random.categorical.
log_probas = tf.math.log([[0.5, 0.4, 0.1]])
tf.random.set_seed(42)
tf.random.categorical(log_probas, num_samples=8) # retrieve 8 samples

To have more control over the diversity of the generated text, we can divide the logits (the log-probabilities) by a number called the _temperature_. A low temperature favors characters with high probabilities, and a
very high temperature gives all the characters nearly equal probabilities. A lower temperature is generally used when generating fairly rigid text such as mathematical
equations. The following code implements a function using this approach:

In [None]:
def next_char(text, temperature=1.0):
    """Sample the character that follows ``text``.

    The model's probabilities for the last time step are converted to
    log-probabilities, divided by ``temperature``, and one character ID is
    drawn at random from the resulting distribution.

    Args:
        text: the text generated so far.
        temperature: divisor applied to the log-probabilities before sampling.

    Returns:
        The sampled character, looked up in the vectorization layer's vocabulary.
    """
    # The -1: slice keeps a leading axis, so the result stays 2-D for tf.random.categorical.
    last_step_proba = text_model.predict([text])[0, -1:]
    scaled_log_proba = tf.math.log(last_step_proba) / temperature
    sampled = tf.random.categorical(scaled_log_proba, num_samples=1)
    vocabulary = text_vec_layer.get_vocabulary()
    # +2 maps the model's character ID back to the layer's vocabulary index.
    return vocabulary[sampled[0, 0] + 2]

# let's write a function that will append the predicted character and return the new text
def extend_text(text, n_chars=50, temperature=1.0):
    """Append ``n_chars`` sampled characters to ``text`` and return the result.

    Each new character is sampled with ``next_char`` from the text generated
    so far, using the given ``temperature``.
    """
    generated = text
    for _step in range(n_chars):
        generated = generated + next_char(generated, temperature)
    return generated

# let's try to do some predictions using different values of temperature
tf.random.set_seed(42)
# Very low temperature.
print(extend_text("To be or not to be", temperature=0.01))
print("-" * 50)
# Temperature 1: the log-probabilities are left unchanged.
print(extend_text("To be or not to be", temperature=1))
print("-" * 50)
# Very high temperature.
print(extend_text("To be or not to be", temperature=100))

To generate more convincing text, one solution is to sample only from the top _k_ characters, or only from the smallest set of top characters whose total probability
exceeds a certain threshold (this is called __nucleus sampling__). We could also try adding neurons or GRU layers, and regularizing the model if necessary.

## Using stateful RNN

Up until now we have used stateless RNNs: at each training iteration the model starts with a hidden state full of zeros, updates it at each time step, and discards it after
the last time step. But we can modify this behavior and ask the RNN to preserve this final state and use it as the initial state of the next
training batch. This is called a __stateful RNN__. The first condition is to use sequential, non-overlapping input sequences (so we must not call the
_shuffle_ method).

In [None]:
def to_dataset_for_stateful_rnn(sequence, length):
    """Build a dataset of consecutive (inputs, targets) windows.

    Windows hold ``length + 1`` items and advance by ``length`` items. Each
    batch contains exactly one window, so batches follow one another in text
    order.

    Args:
        sequence: the encoded text.
        length: number of time steps in each input/target window.

    Returns:
        A ``tf.data.Dataset`` yielding ``(inputs, targets)`` batches of size 1.
    """
    window_size = length + 1

    def split_inputs_targets(batch):
        return batch[:, :-1], batch[:, 1:]

    windows = (
        tf.data.Dataset.from_tensor_slices(sequence)
        # Shift by `length` (not 1), as the stateful RNN requires.
        .window(window_size, shift=length, drop_remainder=True)
        .flat_map(lambda nested: nested.batch(window_size))
        # One window per batch keeps the batches sequential.
        .batch(1)
    )
    return windows.map(split_inputs_targets).prefetch(1)

# Same train / validation / test boundaries as the stateless datasets, built with sequential windows.
stateful_train_set = to_dataset_for_stateful_rnn(encoded[:1_000_000], length)
stateful_valid_set = to_dataset_for_stateful_rnn(encoded[1_000_000:1_060_000], length)
stateful_test_set = to_dataset_for_stateful_rnn(encoded[1_060_000:], length)

# Now let's create the stateful rnn
# Same architecture as the stateless model, but the GRU is stateful and the batch size is fixed to 1.
# NOTE(review): `batch_input_shape` may not be accepted by every Keras version — confirm against the installed one.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16, batch_input_shape=[1, None]),
    tf.keras.layers.GRU(128, return_sequences=True, stateful=True),
    tf.keras.layers.Dense(n_tokens, activation="softmax")
])

# We need to reset the state at the beginning of each epoch (when training goes back to the start of
# the text), we are going to write it as a callback
class ResetStateCallback(tf.keras.callbacks.Callback):
    """Keras callback that resets the model's recurrent states when an epoch begins."""

    def on_epoch_begin(self, epoch, logs=None):
        """Reset the states; ``epoch`` and ``logs`` are unused."""
        # NOTE(review): relies on the model exposing `reset_states()` — confirm for the installed Keras version.
        self.model.reset_states()

model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam", metrics=["accuracy"])
# fit() takes the validation set through `validation_data`, and callbacks must be instances, not classes.
model.fit(stateful_train_set, validation_data=stateful_valid_set, epochs=10, callbacks=[ResetStateCallback(), model_cpkt])

Now we can explore _word_ level training for sentiment analysis. 

## Sentiment Analysis

To train a sentiment analysis model, we are going to use the IMDb dataset, a dataset of movie reviews along with a simple binary target value (0 for negative
and 1 for positive).

In [5]:
import tensorflow_datasets as tfds
import tensorflow as tf

# Use the first 90% of the "train" split for training, its last 10% for validation, plus the "test" split.
raw_train_set, raw_valid_set, raw_test_set = tfds.load(name="imdb_reviews", split=["train[:90%]", "train[90%:]", "test"], as_supervised=True) # type: ignore
tf.random.set_seed(42)
# Shuffle, then batch and prefetch the training set exactly like the validation and test sets,
# so that the model is fed batches of reviews rather than individual ones.
train_set = raw_train_set.shuffle(5000, seed=42).batch(32).prefetch(1)
valid_set = raw_valid_set.batch(32).prefetch(1)
test_set = raw_test_set.batch(32).prefetch(1)

# Let's look at a few instances
for review, label in raw_train_set.take(4):
    # Each review is a byte string, so decode it before printing.
    print(review.numpy().decode("utf-8"))
    print("Label: ", label.numpy())

This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.
Label:  0
I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plot developmen

To build a model for this task, we need to preprocess the text, but this time we will slice it into words instead of characters. Note that we cannot always
use spaces to split a text into words, because in some languages this doesn't work well. One solution found by researchers is to tokenize and detokenize text at
the subword level. One of these techniques is called __byte pair encoding__ (BPE). It works by splitting the words into characters (including spaces), then
repeatedly merging the most frequent adjacent pairs until the vocabulary reaches the desired size. However, for the IMDb task in English, using spaces for
token boundaries should be good enough.

In [None]:
# Limit the vocabulary to vocab_size tokens.
vocab_size = 1000
text_vec_layer = tf.keras.layers.TextVectorization(max_tokens=vocab_size)
# Adapt on the reviews only: the labels are dropped by the map.
text_vec_layer.adapt(train_set.map(lambda reviews, labels: reviews))

# Building the model and training it
embed_size = 128
tf.random.set_seed(42)
# Raw strings -> token IDs -> embeddings -> GRU -> single sigmoid unit.
model = tf.keras.Sequential([
    text_vec_layer,
    tf.keras.layers.Embedding(vocab_size, embed_size),
    tf.keras.layers.GRU(128),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

# The target is binary (0 or 1), hence the binary cross-entropy loss.
model.compile(loss="binary_crossentropy", optimizer="nadam", metrics=["accuracy"])
history = model.fit(train_set, validation_data=valid_set, epochs=2)

But this model actually performs very poorly because the reviews have different lengths, so when the TextVectorization layer converts them to
sequences of token IDs, it pads the shorter sequences using the padding token (with ID 0) to make them as long as the longest sequence in the batch. As a
result, most sequences end with many padding tokens—often dozens or even hundreds of them. One solution is to feed the model with batches of equal-length
sentences (which also speeds up training). Another solution is to make the RNN ignore the padding tokens. This can be done using __masking__. Here is
code that handles masking manually:

In [None]:
# Functional-API version of the sentiment model with an explicit mask.
inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
token_ids = text_vec_layer(inputs)
# True wherever the token ID is not 0.
mask = tf.math.not_equal(token_ids, 0)
Z = tf.keras.layers.Embedding(vocab_size, embed_size)(token_ids)
# The mask is passed explicitly to the GRU.
Z = tf.keras.layers.GRU(128, dropout=0.2)(Z, mask=mask)
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(Z)
model = tf.keras.Model(inputs=[inputs], outputs=[outputs])

Another way to implement it is to use ragged tensors.

In [None]:
# Same vocabulary size as before, but with ragged=True.
text_vec_layer_ragged = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size, ragged=True
)
# Adapt on the reviews only: the labels are dropped by the map.
text_vec_layer_ragged.adapt(train_set.map(lambda reviews, labels: reviews))

## Using an encoder-decoder for neural machine translation

We are going to build a model capable of translating an English sentence into Spanish. The architecture can be summarized like this: the English sentences are fed to
the encoder, and the decoder outputs the Spanish translation. Note that the Spanish sentences are also fed to the decoder during training, but shifted one
time step back, meaning the decoder is given as input the word it should have output at the previous time step. This method is called __teacher forcing__. But at inference time (after training) we don't have the correct word to feed it; instead we feed it its own previous output.

In [None]:
from pathlib import Path
import numpy as np

url = "https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
path = tf.keras.utils.get_file("spa-eng.zip", origin=url, cache_dir="datasets", extract=True)
# Decode explicitly as UTF-8: the file contains accented characters, so relying on the
# platform's default encoding could corrupt the text or raise a decoding error.
text = (Path(path).with_name("spa-eng") / "spa.txt").read_text(encoding="utf-8")
text = text.replace("¡", "").replace("¿", "") # The text vectorization layer doesn't support these characters
# Each line is split on the tab into a pair of sentences.
pairs = [line.split("\t") for line in text.splitlines()]
np.random.shuffle(pairs)
# Unzip the shuffled pairs into two aligned tuples of sentences.
sentence_eng, sentence_sp = zip(*pairs)

# Let's create 2 text vectorization layers to handle both set of sequences
vocab_size = 1000
max_length=50
# Both layers use the same vocabulary size and the same output_sequence_length (max_length).
text_vec_layer_en = tf.keras.layers.TextVectorization(vocab_size, output_sequence_length=max_length)
text_vec_layer_sp = tf.keras.layers.TextVectorization(vocab_size, output_sequence_length=max_length)

text_vec_layer_en.adapt(sentence_eng)
# The Spanish layer is adapted on sentences wrapped in the "startofseq" / "endofseq" marker words.
text_vec_layer_sp.adapt([f"startofseq {s} endofseq" for s in sentence_sp])

# Let's split the data into a training set and a validation set
# Encoder inputs: the raw English sentences.
X_train = tf.constant(sentence_eng[:100_000])
X_valid = tf.constant(sentence_eng[100_000:])
# Decoder inputs: the Spanish sentences prefixed with the start-of-sequence marker.
X_train_dec = tf.constant([f"startofseq {s}" for s in sentence_sp[:100_000]])
X_valid_dec = tf.constant([f"startofseq {s}" for s in sentence_sp[100_000:]])
# Targets: the Spanish sentences followed by the end-of-sequence marker. They must be encoded
# with the Spanish vectorizer, otherwise the IDs would refer to the English vocabulary.
Y_train = text_vec_layer_sp([f"{s} endofseq" for s in sentence_sp[:100_000]])
Y_valid = text_vec_layer_sp([f"{s} endofseq" for s in sentence_sp[100_000:]])

# Both inputs are raw strings: the English sentence and the (shifted) Spanish sentence.
encoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
decoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)

embed_size = 128
encoder_inputs_ids = text_vec_layer_en(encoder_inputs)
decoder_inputs_ids = text_vec_layer_sp(decoder_inputs)
# mask_zero=True makes the embedding layers produce a mask for token ID 0.
encoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True)
decoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True)

encoder_embeddings = encoder_embedding_layer(encoder_inputs_ids)
decoder_embeddings = decoder_embedding_layer(decoder_inputs_ids)

# return_state=True makes the LSTM return its final states after its output,
# which is what the unpacking below needs.
encoder = tf.keras.layers.LSTM(512, return_state=True)
encoder_outputs, *encoder_state = encoder(encoder_embeddings)

# The decoder starts from the encoder's final states and outputs one vector per time step.
decoder = tf.keras.layers.LSTM(512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

output_layer = tf.keras.layers.Dense(vocab_size, activation="softmax")
Y_proba = output_layer(decoder_outputs)

model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs], outputs=[Y_proba])
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam", metrics=["accuracy"])
model.fit((X_train, X_train_dec), Y_train, epochs=10, validation_data=((X_valid, X_valid_dec), Y_valid))

But with this type of model, translating is not as simple as calling the _predict()_ method: since the decoder expects as input the words it has output so far, we have to
predict one word at a time. We are going to write a helper function to do the translation for us.

In [None]:
def translate(sentence_eng):
    """Translate one English sentence, one word at a time.

    At each step the model is called with the English sentence and the
    translation produced so far (prefixed with "startofseq"). The most likely
    word at the current position is appended, until "endofseq" is predicted
    or ``max_length`` words have been produced.

    Args:
        sentence_eng: the English sentence to translate.

    Returns:
        The translated sentence, without the start/end markers.
    """
    # Both values are identical at every step, so build them once outside the loop.
    X = np.array([sentence_eng])
    vocabulary = text_vec_layer_sp.get_vocabulary()
    translation = ""
    for word_idx in range(max_length):
        X_dec = np.array(["startofseq " + translation])
        # Probabilities for the word at position word_idx of the single sentence in the batch.
        y_proba = model.predict((X, X_dec))[0, word_idx]
        predicted_word_id = np.argmax(y_proba)
        predicted_word = vocabulary[predicted_word_id]
        if predicted_word == "endofseq":
            break
        translation += " " + predicted_word
    return translation.strip()

This model works well with short sentences but struggles with longer ones. We can use bidirectional RNNs to mitigate that problem. A regular RNN
only looks at past and present inputs at each time step to create an output. For our model to be able to learn things like the context of a word,
it also needs to be able to look at the future (look at the next words). For that we can use 2 recurrent layers on the same inputs, where one reads the
text from left to right and the other from right to left, then combine their outputs at each time step.

In [None]:
# Bidirectional wraps a single recurrent layer (not a list of layers).
encoder = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(256, return_state=True)
)

# But the output returns 4 states, 2 per recurrent layer so we need to concatenate their outputs
encoder_output, *encoder_state = encoder(encoder_embeddings)
# Concatenate the states at even positions together and the states at odd positions together.
encoder_state = [tf.concat(encoder_state[::2], axis=-1), tf.concat(encoder_state[1::2], axis=-1)]

Another method to improve the model's translations at inference time is to give it a chance to go back and correct the mistakes it has made. We can try
a method called __beam search__. It keeps track of the _k_ most promising partial sentences (this parameter _k_ is called the _beam width_) and at each step tries to extend each of them by one word.

## Attention 

Attention is a way to make the decoder focus on the appropriate words (as encoded by the encoder) at each time step. Instead of just sending the encoder’s
final hidden state to the decoder, as well as the previous target word at each step, we now send all of the encoder’s outputs to the decoder as well. Since
the decoder cannot deal with all these encoder outputs at once, they need to be aggregated: at each time step, the decoder’s memory cell computes a
weighted sum of all the encoder outputs. This determines which words it will focus on at this step. All the weights for a given decoder time step add up to 1. Keras provides an Attention layer:

In [None]:
# The attention layer needs one encoder output per time step, hence return_sequences=True.
encoder = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(256, return_sequences=True, return_state=True)
)
# Run the new encoder and rebuild the decoder from its states, so that the tensors fed to the
# attention layer come from this encoder rather than from the previous model.
encoder_outputs, *encoder_state = encoder(encoder_embeddings)
encoder_state = [tf.concat(encoder_state[::2], axis=-1), tf.concat(encoder_state[1::2], axis=-1)]
decoder = tf.keras.layers.LSTM(512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

# The decoder outputs are the queries; the encoder outputs are the values.
attention_layer = tf.keras.layers.Attention()
attention_outputs = attention_layer([decoder_outputs, encoder_outputs])
output_layer = tf.keras.layers.Dense(vocab_size, activation="softmax")
y_proba = output_layer(attention_outputs)

## Building transformers using attention

The transformer is one of the newest architectures. It does not use any recurrent or convolutional layers, just attention mechanisms (plus normalization layers,
dense layers and a few other components). At inference time, you must call the transformer multiple times, producing the translations one word at a time and
feeding the partial translations to the decoder at each round, just like we did earlier in the _translate()_ function. It also introduces novel components,
such as the following.

### Positional encoding

A positional encoding encodes the position of a word in a sentence. The easiest way to do that is to use an Embedding layer and make it encode all the
positions from 0 to the maximum sequence length in the batch, then add the result to the word embeddings.

In [None]:
max_length = 50
embed_size = 128
# Trainable positional embeddings: one vector of size embed_size per position below max_length.
pos_embed_layer = tf.keras.layers.Embedding(max_length, embed_size)
# Axis 1 of the embeddings gives the number of positions to encode.
batch_max_len_enc = tf.shape(encoder_embeddings)[1]
encoder_in = encoder_embeddings + pos_embed_layer(tf.range(batch_max_len_enc))
batch_max_len_dec = tf.shape(decoder_embeddings)[1]
# The same positional layer is used for the decoder inputs.
decoder_in = decoder_embeddings + pos_embed_layer(tf.range(batch_max_len_dec))

# Alternatively we can use fixed (non-trainable) sinusoidal positional encodings.
# Keras does not provide such a layer, so we need to create our own class.
class PositionalEncoding(tf.keras.layers.Layer):
    """Layer that adds fixed sine/cosine positional encodings to its inputs.

    Args:
        max_length: number of positions for which encodings are precomputed.
        embed_size: size of the last axis of the inputs; must be even because
            sine values fill the even indices and cosine values the odd ones.
        dtype: dtype of the layer and of the stored encodings.
        **kwargs: forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, max_length, embed_size, dtype=tf.float32, **kwargs):
        super().__init__(dtype=dtype, **kwargs)
        assert embed_size % 2 == 0, "embed_size must be even"
        # p: positions, i: even dimension indices; both have shape (embed_size // 2, max_length).
        p, i = np.meshgrid(np.arange(max_length), 2 * np.arange(embed_size // 2))
        # Compute the angles once; the sine and cosine halves share them.
        angles = (p / 10_000 ** (i / embed_size)).T
        pos_embed = np.empty((1, max_length, embed_size))
        pos_embed[0, :, ::2] = np.sin(angles)
        pos_embed[0, :, 1::2] = np.cos(angles)
        self.post_encodings = tf.constant(pos_embed.astype(self.dtype))
        self.supports_masking = True

    def call(self, inputs):
        """Return ``inputs`` plus the encodings of the first ``tf.shape(inputs)[1]`` positions."""
        batch_max_length = tf.shape(inputs)[1]
        return inputs + self.post_encodings[0, :batch_max_length]

# We can add the positional encoding layer
# The layer sets supports_masking itself in its constructor. Passing supports_masking=True here
# would forward it to the base Layer constructor, which does not accept that keyword.
pos_embed_layer = PositionalEncoding(max_length, embed_size)
encoder_in = pos_embed_layer(encoder_embeddings)
decoder_in = pos_embed_layer(decoder_embeddings)

### Multihead attention layer

A multihead attention layer is based on the scaled dot-product attention layer, which is defined by the following equation:
$$\text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^{T}}{\sqrt{d_{keys}}}\right)V$$
Where:
- Q is the matrix containing one row per query. Its shape is [$n_{queries}$, $d_{keys}$] where $n_{queries}$ is the number of queries and $d_{keys}$ is the
 number of dimensions of each query and each key.
- K is a matrix containing one row per key. 
- V is the matrix containing one row per value.
-  The shape of $QK^T$ is [$n_{queries}$ , $n_{keys}$ ]: it contains one similarity score for each query/key pair. To prevent this matrix from being huge,
 the input sequences must not be too long.
- The scaling factor 1 / ($\sqrt{d_{keys}}$) scales down the similarity scores to avoid saturating the softmax function, which would lead to tiny gradients.

Keras provides a MultiHeadAttention layer, which we are going to use to build the transformer.

In [None]:
N = 2  # number of stacked blocks
num_heads = 8
dropout_rate = 0.1
n_units = 128  # units in the first dense layer of each block
# True for non-padding tokens; the new axis lets the mask broadcast over the query positions.
encoder_pad_mask = tf.math.not_equal(encoder_inputs_ids, 0)[:, tf.newaxis]
Z = encoder_in
for _ in range(N):
    # Self-attention sub-layer with a skip connection followed by layer normalization.
    skip = Z
    attn_layer = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_size, dropout=dropout_rate)
    Z = attn_layer(Z, value=Z, attention_mask=encoder_pad_mask)
    Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))
    # Feed-forward sub-layer: two dense layers, then dropout, skip connection and normalization.
    skip = Z
    Z = tf.keras.layers.Dense(n_units, activation="relu")(Z)
    Z = tf.keras.layers.Dense(embed_size)(Z)
    # Dropout, not Dense: dropout_rate is a rate, not a number of units.
    Z = tf.keras.layers.Dropout(dropout_rate)(Z)
    Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))


# True for non-padding decoder tokens (the decoder token IDs are stored in decoder_inputs_ids).
decoder_pad_mask = tf.math.not_equal(decoder_inputs_ids, 0)[:, tf.newaxis]
causal_mask = tf.linalg.band_part(  # creates a lower triangular matrix
    tf.ones((batch_max_len_dec, batch_max_len_dec), tf.bool), -1, 0)

# This is the decoder
encoder_outputs = Z  # let's save the encoder's final outputs
Z = decoder_in  # the decoder starts with its own inputs
for _ in range(N):
    # Masked self-attention: each position only attends to itself and earlier, non-padding positions.
    skip = Z
    attn_layer = tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=embed_size, dropout=dropout_rate)
    Z = attn_layer(Z, value=Z, attention_mask=causal_mask & decoder_pad_mask)
    Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))
    # Cross-attention: the decoder queries the encoder's final outputs.
    skip = Z
    attn_layer = tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=embed_size, dropout=dropout_rate)
    Z = attn_layer(Z, value=encoder_outputs, attention_mask=encoder_pad_mask)
    Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))
    # Feed-forward sub-layer with skip connection and normalization.
    skip = Z
    Z = tf.keras.layers.Dense(n_units, activation="relu")(Z)
    Z = tf.keras.layers.Dense(embed_size)(Z)
    Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))

# Final projection to a probability distribution over the vocabulary at each time step.
Y_proba = tf.keras.layers.Dense(vocab_size, activation="softmax")(Z)
model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs], outputs=[Y_proba])
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam", metrics=["accuracy"])
model.fit((X_train, X_train_dec), Y_train, epochs=10, validation_data=((X_valid, X_valid_dec), Y_valid))
