This is a pet project in which I try to build a chatbot using a Transformer-based neural network. I'll be using the famous [Cornell Movie Dialog Dataset](https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html) to train my model. Let's see how it goes.

In [1]:
# Importing some pretty important dependencies
import tensorflow as tf
assert tf.__version__.startswith('2')

import tensorflow_datasets as tfds

import os
import re
import numpy as np
import pickle

import matplotlib.pyplot as plt
tf.random.set_seed(1234)


#### Download Data

In [2]:
# The dataset has to be downloaded the first time and is then kept offline.
# tf.keras.utils.get_file fetches `origin`, stores it under the name `fname`
# in the Keras cache directory and, with extract=True, unpacks the archive.
zip_path = tf.keras.utils.get_file(fname='cornell_movie_dialogs.zip',
    origin='http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip',
    extract=True)

# Follow the directory structure to the input files.
# NOTE(review): assumes get_file returns the path of the downloaded archive and
# that the extracted folder sits next to it — confirm for the installed Keras version.
dataset_path = os.path.join(
    os.path.dirname(zip_path), "cornell movie-dialogs corpus")
print(dataset_path)

C:\Users\ameys\.keras\datasets\cornell movie-dialogs corpus


In [3]:
# Absolute paths of the two corpus files used below: the raw utterances and
# the lists of line IDs that make up each conversation.
movie_lines_path, movie_conversations_path = (
    os.path.join(dataset_path, corpus_file)
    for corpus_file in ('movie_lines.txt', "movie_conversations.txt")
)
print(movie_lines_path, "\n"+movie_conversations_path)

C:\Users\ameys\.keras\datasets\cornell movie-dialogs corpus\movie_lines.txt 
C:\Users\ameys\.keras\datasets\cornell movie-dialogs corpus\movie_conversations.txt


#### Processing Data

**Below are the steps that I am going to follow to preprocess the data**

    -Process each input text and remove all the special characters except for . , ? and ! (digits are kept too)
    
    -Build a tokenizer 
    
    -Tokenize each input and also add a START and END token
    
    -Choose a MAX_LENGTH and pad/truncate the inputs accordingly
    

In [4]:
# Upper bound (approximately) on the number of question/answer pairs to load.
MAX_SAMPLES = 100000
def process_text(text):
    """Normalise one utterance before tokenisation.

    The text is lower-cased and trimmed, every ``?``, ``.``, ``!`` and ``,``
    is surrounded by spaces (e.g. "he is a boy." => "he is a boy ."), every
    run of characters other than letters, digits and those four punctuation
    marks becomes a single space, and whitespace is collapsed.

    Args:
        text: Raw utterance string.

    Returns:
        The cleaned string without leading or trailing whitespace.
    """
    cleaned = text.lower().strip()
    # Applied strictly in this order: isolate punctuation, drop everything
    # that is not whitelisted, then squeeze the whitespace left behind.
    substitutions = (
        (r"([?.!,])", r" \1 "),
        (r"[^a-zA-Z0-9?.,!]+", " "),
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def load_data():
    """Build (question, answer) pairs from the Cornell movie dialog files.

    Reads ``movie_lines_path`` to map every line ID to its utterance, then
    walks ``movie_conversations_path`` and pairs each utterance with the one
    that follows it in the same conversation. Both sides of a pair are cleaned
    with ``process_text``.

    Returns:
        A tuple ``(inputs, outputs)`` of equal-length lists of cleaned strings.
        Loading stops after the first conversation that pushes the number of
        pairs past ``MAX_SAMPLES`` (so the result may exceed it by a few
        pairs). If the corpus is exhausted first, everything collected so far
        is returned.
    """
    field_separator = ' +++$+++ '

    # line ID (1st field) -> raw utterance (5th field)
    id2line = {}
    with open(movie_lines_path, errors='ignore') as fp:
        for record in fp:
            parts = record.replace('\n', '').split(field_separator)
            id2line[parts[0]] = parts[4]

    inputs, outputs = [], []
    with open(movie_conversations_path, 'r') as fp:
        for record in fp:
            parts = record.replace('\n', '').split(field_separator)
            # The 4th field looks like "['L194', 'L195', ...]": strip the
            # brackets, split the items, then strip the quotes of each ID.
            conversation = [line_id[1:-1] for line_id in parts[3][1:-1].split(', ')]
            # Every utterance is the "question" for the utterance after it.
            for question_id, answer_id in zip(conversation, conversation[1:]):
                inputs.append(process_text(id2line[question_id]))
                outputs.append(process_text(id2line[answer_id]))
            if len(inputs) > MAX_SAMPLES:
                break

    # Reached both on the early stop above and when the file ends first;
    # the latter case used to fall off the end and return None.
    return inputs, outputs

questions, answers = load_data()

In [5]:
# Show one arbitrary question/answer pair to eyeball the result of the cleaning.
print('Sample question: {}'.format(questions[595]))
print('Sample answer: {}'.format(answers[595]))

Sample question: i should not even be listening to you , since my council said no . but santangel tells me you are a man of honor and sincerity . . . and sanchez , that you are not a fool .
Sample answer: no more than the woman who said she would take granada from the moors .


In [None]:
# Build a subword tokenizer with the vocabulary of both questions and answers.
# SubwordTextEncoder lives under tfds.deprecated.text in newer
# tensorflow_datasets releases and under tfds.features.text in older ones, so
# use whichever namespace this installation provides.
try:
    _subword_text_encoder = tfds.deprecated.text.SubwordTextEncoder
except AttributeError:
    _subword_text_encoder = tfds.features.text.SubwordTextEncoder

tokenizer = _subword_text_encoder.build_from_corpus(questions + answers,
                                                    target_vocab_size=2**15)

In [None]:
# The two ids right after the tokenizer's own vocabulary are reserved as
# sentence markers. They are kept as one-element lists so they can be
# concatenated directly with an encoded sentence.
START_TOKEN = [tokenizer.vocab_size]
END_TOKEN = [tokenizer.vocab_size + 1]

# Tokenizer vocabulary plus the start and end markers.
VOCAB_SIZE = tokenizer.vocab_size + 2

In [None]:
# Pick the maximum sequence length from the distribution of sentence lengths.
# Lengths are counted in whitespace-separated words of the cleaned text.
input_lengths = [len(text.split()) for text in questions + answers]

# 95% of the sentences are at most this long; compute it once and reuse it.
length_q95 = float(np.quantile(input_lengths, 0.95))

plt.hist(input_lengths, bins=100, histtype='step', label='Sequence Length Distribution')
plt.axvline(length_q95, linestyle='--', label='95th percentile')
plt.legend();
# This value is a quantile, not the mean, so label it accordingly.
print("95th percentile: ", length_q95)

In [None]:
# If we set the sequence length to 40 words, 95% of the data can be used without truncation

In [None]:
# Maximum sentence length (in token ids, start and end markers included)
MAX_LENGTH = 40


def tokenize_and_filter(inputs, outputs):
    """Encode sentence pairs as fixed-length rows of token ids.

    Each sentence is encoded with ``tokenizer`` and wrapped in ``START_TOKEN``
    and ``END_TOKEN``. Every sequence is then padded (at the end) or truncated
    (at the end) to exactly ``MAX_LENGTH`` ids. Despite the name, no pair is
    dropped.

    NOTE(review): because truncation happens at the end, a sequence longer
    than ``MAX_LENGTH`` loses its ``END_TOKEN`` — confirm this is intended.

    Args:
        inputs: Iterable of question strings.
        outputs: Iterable of answer strings, aligned with ``inputs``.

    Returns:
        ``(tokenized_inputs, tokenized_outputs)`` as returned by
        ``pad_sequences`` for the questions and the answers respectively.
    """
    def _encode(sentence):
        # Wrap the encoded ids in the start and end markers.
        return START_TOKEN + tokenizer.encode(sentence) + END_TOKEN

    def _to_fixed_length(sequences):
        return tf.keras.preprocessing.sequence.pad_sequences(
            sequences,
            maxlen=MAX_LENGTH,
            padding='post',
            truncating='post',
        )

    encoded_pairs = [
        (_encode(question), _encode(answer))
        for question, answer in zip(inputs, outputs)
    ]
    encoded_questions = [pair[0] for pair in encoded_pairs]
    encoded_answers = [pair[1] for pair in encoded_pairs]

    return _to_fixed_length(encoded_questions), _to_fixed_length(encoded_answers)

questions, answers = tokenize_and_filter(questions, answers)

In [None]:
# Report the final vocabulary size (start/end markers included) and the number of pairs.
print('Vocab size: {}'.format(VOCAB_SIZE))
print('Number of samples: {}'.format(len(questions)))

#### Create a Tensorflow Dataset

In [None]:
BATCH_SIZE = 128
BUFFER_SIZE = 30000

# Teacher forcing: the decoder is fed the answer shifted by one position.
#   dec_inputs -> the answer without its last position
#   outputs    -> the answer without its first position (the START_TOKEN)
# so that position t of `dec_inputs` is trained to predict position t of `outputs`.
dataset = (
    tf.data.Dataset.from_tensor_slices((
        {'inputs': questions, 'dec_inputs': answers[:, :-1]},
        {'outputs': answers[:, 1:]},
    ))
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

print(dataset)

#### Building a Scaled dot product Attention mechanism

In [None]:
"""
Scaled dot product attention mechanism has 3 inputs Query(Q), Key(K) and Value(V).

Among these 3 inputs firstly, Q and K^T undergo a dot product

the result of dot product is scaled by dividing with sqrt(dimensions)

the scaled result is then passed through a softmax function which yields the attention weights
of each word with respect to the rest of the words in that input sequence.

At the end the attention weights undergo dot product with the V
"""
def scaled_dot_product_attention(query, key, value, mask):
    """Compute softmax(Q.K^T / sqrt(depth) + mask * -1e9) . V.

    Args:
        query: Query tensor.
        key: Key tensor; the size of its last axis is used as the scale ``depth``.
        value: Value tensor.
        mask: Optional tensor added to the logits after multiplication by
            -1e9, so positions where it is 1 get (almost) zero weight.
            ``None`` disables masking.

    Returns:
        The attention-weighted combination of ``value``.
    """
    raw_scores = tf.matmul(query, key, transpose_b=True)

    # Divide by sqrt(depth) of the keys.
    scale = tf.math.sqrt(tf.cast(tf.shape(key)[-1], tf.float32))
    scores = raw_scores / scale

    # A very large negative logit turns into ~0 after the softmax.
    if mask is not None:
        scores = scores + (mask * -1e9)

    # Normalise over the last axis, i.e. over the key positions.
    weights = tf.nn.softmax(scores, axis=-1)
    return tf.matmul(weights, value)

#### Building a MultiHead Attention layer

In [None]:
class MultiHeadAttentionLayer(tf.keras.layers.Layer):
    """Multi-head attention: project Q/K/V, attend per head, merge, project again.

    Args:
        scale_d: Width of the query/key/value projections and of the output.
        num_heads: Number of attention heads; must divide ``scale_d`` evenly.
        name: Layer name. The default contains no spaces because TensorFlow
            scope names may not contain them.
        **kwargs: Forwarded to ``tf.keras.layers.Layer`` (e.g. ``trainable``,
            ``dtype``), which ``from_config`` needs when the layer is rebuilt.
    """

    def __init__(self, scale_d, num_heads, name="multi_head_attention_layer", **kwargs):
        super(MultiHeadAttentionLayer, self).__init__(name=name, **kwargs)
        self.scale_d = scale_d
        self.num_heads = num_heads
        # scale_d should be a multiple of num_heads so every head gets the same depth
        assert self.scale_d % self.num_heads == 0, (
            "scale_d ({}) must be a multiple of num_heads ({})".format(scale_d, num_heads))
        self.depth = self.scale_d // self.num_heads

        self.query_dense = tf.keras.layers.Dense(units=self.scale_d)
        self.key_dense = tf.keras.layers.Dense(units=self.scale_d)
        self.value_dense = tf.keras.layers.Dense(units=self.scale_d)

        self.dense = tf.keras.layers.Dense(units=self.scale_d)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialised."""
        config = super(MultiHeadAttentionLayer, self).get_config()
        config.update({'scale_d': self.scale_d, 'num_heads': self.num_heads})
        return config

    def split_heads(self, inputs, batch_size):
        """Split the last axis into heads and move the head axis forward.

        The input is reshaped to (batch_size, -1, num_heads, depth) and then
        transposed to (batch_size, num_heads, -1, depth), so the attention
        function processes every head independently.
        """
        inputs = tf.reshape(inputs,
                            shape=(batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(inputs, perm=[0, 2, 1, 3])

    def call(self, inputs):
        """Run attention on a dict with keys 'query', 'key', 'value' and 'mask'."""
        query, key, value, mask = inputs['query'], inputs['key'], inputs['value'], inputs['mask']
        batch_size = tf.shape(query)[0]

        # Linear layers
        query = self.query_dense(query)
        key = self.key_dense(key)
        value = self.value_dense(value)

        # Split into heads
        query = self.split_heads(query, batch_size)
        key = self.split_heads(key, batch_size)
        value = self.split_heads(value, batch_size)

        # Attention per head, then undo the transpose done in split_heads
        scaled_attention = scaled_dot_product_attention(query, key, value, mask)
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])

        # Concatenate all the heads back into one axis of size scale_d
        concat_attention = tf.reshape(scaled_attention,
                                      shape=(batch_size, -1, self.scale_d))

        # Final Dense layer
        outputs = self.dense(concat_attention)

        return outputs

### Transformer

Since the multihead attention layer is now ready, I'll use it to build the transformer.

But before passing any input to a transformer we need to mask certain tokens in our input. We do this masking so that the model doesn't attend to those tokens: the padding tokens and, in the decoder, the tokens that come later in the sequence.

In [None]:
# During preprocessing I have padded the sequences that were of length less than 40, 
# now I have to mask those padded tokens
def create_padding_mask(x):
    """Mark the padding positions (token id 0) of a batch of sequences.

    Returns a float32 tensor that is 1.0 where ``x`` equals 0 and 0.0
    elsewhere, with two singleton axes inserted after the first axis.
    """
    is_padding = tf.math.equal(x, 0)
    return tf.cast(is_padding, tf.float32)[:, tf.newaxis, tf.newaxis, :]

In [None]:
# Sanity check: positions holding the pad id 0 come out as 1.0; the result has shape (2, 1, 1, 5).
print(create_padding_mask(tf.constant([[1, 2, 0, 3, 0], [0, 0, 0, 4, 5]])))

In [None]:
# Now I'll write another method that will help me in masking the future tokens in the sequence.
# So that to predict the third word onlyh the first and second words will be used
def create_lookahead_mask(x):
    """Mask future positions and padding for decoder self-attention.

    Entry [..., i, j] of the result is 1.0 when position j lies after
    position i, or when token j of ``x`` is padding (id 0).
    """
    length = tf.shape(x)[-1]
    # band_part(..., -1, 0) keeps the lower triangle (diagonal included);
    # subtracting it from 1 leaves ones strictly above the diagonal.
    lower_triangle = tf.linalg.band_part(tf.ones((length, length)), -1, 0)
    future_positions = 1 - lower_triangle
    return tf.maximum(create_padding_mask(x), future_positions)

In [None]:
# Sanity check on one 5-token row: expect ones above the diagonal plus a full column of ones for the pad token at index 2.
print(create_lookahead_mask(tf.constant([[1,2,0,4,5]])))

### Positional encoding

Since a transformer uses neither convolution (as in a CNN) nor recurrence (as in an RNN/LSTM), it needs another way to know the order of the tokens in a sequence. Positional encoding does this job: it carries information about the position of each token in the sequence.

The positional encoding vector is added to the embedding vector. Embeddings represent a word in a d-dimensional space in such a way that words with similar meanings are closer to each other. But information about where the words sit in a sentence is not present in the embeddings. By adding the positional encoding to those embeddings, words will be closer to each other based on both the <i>similarity of their meaning</i> and <i>their position in the sentence</i>.

$$PE_{(pos, 2i)} = \sin\left(\frac{pos}{10000^{\frac{2i}{d_{scale}}}}\right)$$

$$PE_{(pos, 2i+1)} = \cos\left(\frac{pos}{10000^{\frac{2i}{d_{scale}}}}\right)$$

In [None]:
class PositionalEncoding(tf.keras.layers.Layer):
    """Add a fixed sinusoidal positional encoding to its input.

    Args:
        position: Number of positions to precompute encodings for.
        scale_d: Width of each encoding vector (the model width).
        name: Layer name; it is now forwarded to Keras instead of being
            silently replaced by a hard-coded value.
        **kwargs: Forwarded to ``tf.keras.layers.Layer`` (e.g. ``trainable``,
            ``dtype``), which ``from_config`` needs when the layer is rebuilt.
    """

    def __init__(self, position, scale_d, name='PositionalEncoding', **kwargs):
        super(PositionalEncoding, self).__init__(name=name, **kwargs)
        # Kept so that get_config() can reproduce the constructor call.
        self.position = position
        self.scale_d = scale_d
        self.positional_encoding = self.positional_ecnoding(position, scale_d)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialised."""
        config = super(PositionalEncoding, self).get_config()
        config.update({'position': self.position, 'scale_d': self.scale_d})
        return config

    def get_angles(self, position, i, scale_d):
        """Return position / 10000^(2 * (i // 2) / scale_d)."""
        angles = 1 / tf.pow(10000.0, (2 * (i // 2)) / tf.cast(scale_d, tf.float32))
        return position * angles

    # The misspelt method name is kept because it is part of the class interface.
    def positional_ecnoding(self, position, scale_d):
        """Build the (1, position, scale_d) float32 table of encodings."""
        angle_radians = self.get_angles(position=tf.range(position, dtype=tf.float32)[:, tf.newaxis],
                                        i=tf.range(scale_d, dtype=tf.float32)[tf.newaxis, :],
                                        scale_d=scale_d)
        # apply sin over the even indices
        sines = tf.math.sin(angle_radians[:, 0::2])
        # apply cos over the odd indices
        cosines = tf.math.cos(angle_radians[:, 1::2])

        # All sines come first, followed by all cosines (they are not interleaved).
        positional_encoding = tf.concat([sines, cosines], axis=-1)
        positional_encoding = positional_encoding[tf.newaxis, ...]
        return tf.cast(positional_encoding, dtype=tf.float32)

    def call(self, inputs):
        """Add the encodings of the first ``tf.shape(inputs)[1]`` positions to ``inputs``."""
        return inputs + self.positional_encoding[:, :tf.shape(inputs)[1], :]

In [None]:
# Visualise the encoding table for 50 positions and a width of 512.
sample_pos_encoding = PositionalEncoding(50, 512)

# [0] drops the leading singleton axis so that rows are positions and columns are depth.
plt.pcolormesh(sample_pos_encoding.positional_encoding.numpy()[0], cmap='RdBu')
plt.xlabel('Depth')
plt.xlim((0, 512))
plt.ylabel('Position')
plt.colorbar()
plt.show()

### Encoder Layer

All the building blocks are ready now, so I will create an encoder layer. Its job is to consume a sequence as input and give us an encoded representation as output.

The encoder layer will have 2 steps,
1) MultiHeadAttention Layer
2) 2 Dense layers followed by a dropout layer

Each of the sub-layers above has a residual connection around it, which is followed by layer normalization. Note that this is LayerNormalization, not BatchNormalization. Layer normalization and weight normalization are also [different](https://mlexplained.com/2018/01/13/weight-normalization-and-layer-normalization-explained-normalization-in-deep-learning-part-2/).

The residual connections play the same role that they play in CNNs. Their skip connections help us train deeper networks by keeping our model from falling into the pit of vanishing gradients.

In [None]:
def encoder_layer(units, scale_d, num_heads, dropout, name="Encoder"):
    """Build one encoder block as a Keras functional model.

    The block is self-attention followed by a two-layer feed-forward network.
    Each of the two sub-layers goes through dropout, a residual addition and
    layer normalisation.

    Args:
        units: Width of the hidden feed-forward Dense layer.
        scale_d: Model width (last axis of the input and of the output).
        num_heads: Number of attention heads.
        dropout: Dropout rate used after both sub-layers.
        name: Name of the returned model.

    Returns:
        A ``tf.keras.Model`` taking ``[inputs, padding_mask]``.
    """
    layer_input = tf.keras.Input(shape=(None, scale_d), name='Input')
    # The mask is a second model input so that it can be applied inside attention.
    mask_input = tf.keras.Input(shape=(1, 1, None), name="MASK")

    # Sub-layer 1: self-attention (query, key and value are all the layer input).
    self_attention = MultiHeadAttentionLayer(
        scale_d=scale_d, num_heads=num_heads, name='attention')({
            'query': layer_input,
            'key': layer_input,
            'value': layer_input,
            'mask': mask_input,
        })
    self_attention = tf.keras.layers.Dropout(dropout)(self_attention)
    attention_out = tf.keras.layers.LayerNormalization(epsilon=1e-6)(
        layer_input + self_attention)

    # Sub-layer 2: position-wise feed-forward network.
    feed_forward = tf.keras.layers.Dense(units=units, activation='relu')(attention_out)
    feed_forward = tf.keras.layers.Dense(units=scale_d)(feed_forward)
    feed_forward = tf.keras.layers.Dropout(dropout)(feed_forward)
    block_out = tf.keras.layers.LayerNormalization(epsilon=1e-6)(
        attention_out + feed_forward)

    return tf.keras.Model(inputs=[layer_input, mask_input], outputs=block_out, name=name)

In [None]:
# Build a single encoder layer with small sample hyper-parameters, only to inspect its structure.
text_encoder = encoder_layer(units=512, 
                       scale_d=128, 
                       num_heads=4, 
                       dropout=0.3, 
                       name='Encoder')

In [None]:
# Draw the graph of the sample encoder layer, including the tensor shapes.
tf.keras.utils.plot_model(text_encoder, show_shapes=True)
# ,to_file='encoder_layer.png')

### Encoder

Use the encoder layer above to create an encoder. It takes a sequence of tokens, passes it through an embedding layer, adds the positional encodings and feeds the result to a stack of encoder layers.

1) Input Embedding

2) Positional Encoding

3) # of Encoder layers


The input tokens are passed through an embedding layer, and their positional encoding vector is also calculated. The two are summed (not concatenated), and the sum is passed as input to the encoder layers.

In [None]:
def encoder(vocab_size,
            num_layers,
            units,
            scale_d,
            num_heads,
            dropout,
            name='Encoder'):
    """Build the encoder: embedding, positional encoding and a stack of encoder layers.

    Args:
        vocab_size: Number of token ids of the embedding table (it is also
            used as the number of precomputed positions).
        num_layers: Number of stacked encoder layers.
        units: Hidden width of the feed-forward part of every layer.
        scale_d: Model width.
        num_heads: Number of attention heads per layer.
        dropout: Dropout rate.
        name: Name of the returned model.

    Returns:
        A ``tf.keras.Model`` taking ``[inputs, padding_mask]``.
    """
    token_ids = tf.keras.Input(shape=(None,), name="inputs")
    mask_input = tf.keras.Input(shape=(1, 1, None), name='MASK')

    # Embed the tokens, scale by sqrt(scale_d), then add the positional encodings.
    hidden = tf.keras.layers.Embedding(vocab_size, scale_d)(token_ids)
    hidden = hidden * tf.math.sqrt(tf.cast(scale_d, dtype=tf.float32))
    hidden = PositionalEncoding(vocab_size, scale_d)(hidden)

    hidden = tf.keras.layers.Dropout(dropout)(hidden)

    for layer_index in range(num_layers):
        stacked_layer = encoder_layer(units=units,
                                      scale_d=scale_d,
                                      num_heads=num_heads,
                                      dropout=dropout,
                                      name="EncoderLayer{}".format(layer_index))
        hidden = stacked_layer([hidden, mask_input])

    return tf.keras.Model(inputs=[token_ids, mask_input], outputs=hidden, name=name)

In [None]:
# Build a small two-layer encoder with sample hyper-parameters, only to inspect its structure.
sample_encoder = encoder(vocab_size=8000,
                         units=512,
                         num_layers=2,
                         scale_d=128,
                         num_heads=4,
                         dropout=0.3,
                         name='sample_encoder')

In [None]:
# Draw the graph of the sample encoder, including the tensor shapes.
tf.keras.utils.plot_model(sample_encoder, show_shapes=True)
# ,to_file="")

### Decoder Layer

Now it is time to create a decoder layer. It will consist of,

1) Masked Multi-head attention layer (With both look ahead and Padding mask)

2) Multi-head attention layer (with padding mask). Key and Value receive the encoder outputs as inputs, and Query receives the output of the masked multi-head attention layer

3) 2 Dense layers followed by Dropout


Each of these sublayers has a residual connection around them which is followed by Layer Normalization. 

Query receives the output of the decoder's self-attention layer, while Key and Value receive the encoder's output. The attention scores act as weights: they represent how important each position of the encoder's output is for the current decoder position. In other words, the decoder predicts the next word by looking at the encoder's output and self-attending to its own output.

In [None]:
def decoder_layer(units, scale_d, num_heads, dropout, name='decoder_layer'):
    """Build one decoder block as a Keras functional model.

    The block has three sub-layers: masked self-attention, attention over the
    encoder output, and a two-layer feed-forward network. Every sub-layer
    output goes through dropout, a residual addition and layer normalisation.

    Args:
        units: Width of the hidden feed-forward Dense layer.
        scale_d: Model width (last axis of the inputs and of the output).
        num_heads: Number of attention heads.
        dropout: Dropout rate applied to the output of every sub-layer.
        name: Name of the returned model. The default contains no spaces
            because TensorFlow scope names may not contain them.

    Returns:
        A ``tf.keras.Model`` taking
        ``[inputs, enc_outputs, look_ahead_mask, padding_mask]``.
    """
    inputs = tf.keras.Input(shape=(None, scale_d), name="inputs")
    enc_outputs = tf.keras.Input(shape=(None, scale_d), name="encoder_input")

    look_ahead_mask = tf.keras.Input(shape=(1, None, None), name='look_ahead_mask')
    padding_mask = tf.keras.Input(shape=(1, 1, None), name="padding_mask")

    # Sub-layer 1: masked self-attention over the decoder input.
    attention1 = MultiHeadAttentionLayer(scale_d,
                                         num_heads,
                                         name="attention1")(inputs={
        'query': inputs,
        'key': inputs,
        'value': inputs,
        'mask': look_ahead_mask
    })
    # Dropout was missing here although the other sub-layers (and the encoder
    # layer) apply it before the residual addition.
    attention1 = tf.keras.layers.Dropout(rate=dropout)(attention1)
    attention1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attention1 + inputs)

    # Sub-layer 2: queries come from the decoder, keys/values from the encoder.
    attention2 = MultiHeadAttentionLayer(scale_d,
                                         num_heads,
                                         name="Attention2")(inputs={
        'query': attention1,
        'key': enc_outputs,
        'value': enc_outputs,
        'mask': padding_mask
    })

    attention2 = tf.keras.layers.Dropout(rate=dropout)(attention2)
    attention2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attention2 + attention1)

    # Sub-layer 3: position-wise feed-forward network.
    outputs = tf.keras.layers.Dense(units=units, activation='relu')(attention2)
    outputs = tf.keras.layers.Dense(units=scale_d)(outputs)
    outputs = tf.keras.layers.Dropout(rate=dropout)(outputs)
    outputs = tf.keras.layers.LayerNormalization(epsilon=1e-6)(outputs + attention2)

    return tf.keras.Model(inputs=[inputs, enc_outputs, look_ahead_mask, padding_mask],
                          outputs=outputs,
                          name=name)

In [None]:
# Build a single decoder layer with sample hyper-parameters, only to inspect its structure.
sample_decoder = decoder_layer(units=512, 
                               scale_d=128, 
                               num_heads=4, 
                               dropout=0.3, 
                               name = 'decoder_layer')

In [None]:
# Draw the graph of the sample decoder layer, including the tensor shapes.
tf.keras.utils.plot_model(sample_decoder, show_shapes=True)

### Decoder

The decoder will consist of,

1) Output Embeddings

2) Positional Encoding

3) N decoder layers

The target is passed through an embedding layer which is then summed with the positional encoding. The output of this summation is the input to the decoder layers. The output of the decoder is the input to the final linear layer

In [None]:
def decoder(vocab_size,
            num_layers,
            units,
            scale_d,
            num_heads,
            dropout,
            name="Decoder"):
    """Build the decoder: embedding, positional encoding and a stack of decoder layers.

    Args:
        vocab_size: Number of token ids of the embedding table (it is also
            used as the number of precomputed positions).
        num_layers: Number of stacked decoder layers.
        units: Hidden width of the feed-forward part of every layer.
        scale_d: Model width.
        num_heads: Number of attention heads per layer.
        dropout: Dropout rate.
        name: Name of the returned model.

    Returns:
        A ``tf.keras.Model`` taking
        ``[inputs, enc_outputs, look_ahead_mask, padding_mask]``.
    """
    target_ids = tf.keras.Input(shape=(None,), name="inputs")
    encoder_states = tf.keras.Input(shape=(None, scale_d), name="encoder_output")

    future_mask = tf.keras.Input(shape=(1, None, None), name="look_ahead_mask")
    source_padding_mask = tf.keras.Input(shape=(1, 1, None), name="padding_mask")

    # Embed the targets, scale by sqrt(scale_d), then add the positional encodings.
    hidden = tf.keras.layers.Embedding(vocab_size, scale_d)(target_ids)
    hidden = hidden * tf.math.sqrt(tf.cast(scale_d, dtype=tf.float32))
    hidden = PositionalEncoding(vocab_size, scale_d)(hidden)

    hidden = tf.keras.layers.Dropout(rate=dropout)(hidden)

    for layer_index in range(num_layers):
        stacked_layer = decoder_layer(units=units,
                                      scale_d=scale_d,
                                      num_heads=num_heads,
                                      dropout=dropout,
                                      name="decoder_layer_{}".format(layer_index))
        hidden = stacked_layer(
            inputs=[hidden, encoder_states, future_mask, source_padding_mask])

    return tf.keras.Model(
        inputs=[target_ids, encoder_states, future_mask, source_padding_mask],
        outputs=hidden,
        name=name)

In [None]:
# Build a small two-layer decoder with sample hyper-parameters, only to inspect its structure.
# This rebinds `sample_decoder`, which previously held the single decoder layer.
sample_decoder = decoder(vocab_size=8000, 
                        units=512,
                        scale_d=128,
                        num_layers=2, 
                        num_heads=4,
                        dropout=0.3,
                        name="Decoder")

In [None]:
# Draw the graph of the sample decoder, including the tensor shapes.
tf.keras.utils.plot_model(sample_decoder, show_shapes = True)

### Transformer

Transformer is a combination of encoder, decoder and a final layer. The output of the decoder is the input to the linear layer and its output is returned.

In [None]:
def transformer(vocab_size,
                num_layers,
                units,
                scale_d,
                num_heads,
                dropout,
                name="Transformer"):
    """Assemble the full model: masks, encoder, decoder and the output projection.

    Args:
        vocab_size: Size of the vocabulary; also the width of the output layer.
        num_layers: Number of layers in both the encoder and the decoder.
        units: Hidden width of the feed-forward networks.
        scale_d: Model width.
        num_heads: Number of attention heads.
        dropout: Dropout rate.
        name: Name of the returned model.

    Returns:
        A ``tf.keras.Model`` taking ``[inputs, dec_inputs]`` and returning one
        vector of ``vocab_size`` un-normalised scores per decoder position.
    """
    source_ids = tf.keras.Input(shape=(None,), name="inputs")
    target_ids = tf.keras.Input(shape=(None,), name="dec_inputs")

    # Padding mask for the encoder's self-attention.
    enc_padding_mask = tf.keras.layers.Lambda(
        create_padding_mask,
        output_shape=(1, 1, None),
        name="enc_padding_mask")(source_ids)
    # Future-token (and padding) mask for the decoder's first attention layer.
    look_ahead_mask = tf.keras.layers.Lambda(
        create_lookahead_mask,
        output_shape=(1, None, None),
        name="look_ahead_mask")(target_ids)
    # The decoder's second attention layer attends over the encoder outputs,
    # so its padding mask is derived from the encoder input as well.
    dec_padding_mask = tf.keras.layers.Lambda(
        create_padding_mask,
        output_shape=(1, 1, None),
        name='dec_padding_mask')(source_ids)

    # Encoder and decoder are built with the same hyper-parameters.
    shared_hparams = dict(vocab_size=vocab_size,
                          num_layers=num_layers,
                          units=units,
                          scale_d=scale_d,
                          num_heads=num_heads,
                          dropout=dropout)

    encoder_model = encoder(**shared_hparams)
    enc_outputs = encoder_model(inputs=[source_ids, enc_padding_mask])

    decoder_model = decoder(**shared_hparams)
    dec_outputs = decoder_model(
        inputs=[target_ids, enc_outputs, look_ahead_mask, dec_padding_mask])

    logits = tf.keras.layers.Dense(units=vocab_size, name='outputs')(dec_outputs)
    return tf.keras.Model(inputs=[source_ids, target_ids], outputs=logits, name=name)

In [None]:
# Build a small transformer with sample hyper-parameters, only to inspect its structure.
# The name avoids spaces because TensorFlow scope names may not contain them.
sample_transformer = transformer(vocab_size=8000,
                                 num_layers=2,
                                 units=512,
                                 scale_d=128,
                                 num_heads=4,
                                 dropout=0.3,
                                 name="sample_transformer")

In [None]:
# Draw the graph of the sample transformer, including the tensor shapes.
tf.keras.utils.plot_model(sample_transformer, show_shapes=True)

### Model Training


Model Initialization

See the [paper](https://arxiv.org/pdf/1706.03762.pdf) for all the other versions of the transformer.


In [None]:
# Reset the global Keras state before building the model that is actually trained.
tf.keras.backend.clear_session()

# Hyper-parameters of the trained model
NUM_LAYERS=2
NUM_HEADS=8
UNITS=512
SCALE_D=256
DROPOUT=0.1

# VOCAB_SIZE already includes the start and end tokens.
model = transformer(vocab_size=VOCAB_SIZE, 
                    num_layers=NUM_LAYERS, 
                    units=UNITS, 
                    scale_d=SCALE_D, 
                    num_heads=NUM_HEADS, 
                    dropout=DROPOUT)

### LOSS Function

Since the target sequences are padded, we also need to modify the loss function so that the padded positions are not considered in the loss calculation.

In [None]:
def loss_function(y_true, y_pred):
    """Sparse categorical cross-entropy averaged over the non-padding targets.

    Args:
        y_true: Target token ids; reshaped to ``(-1, MAX_LENGTH - 1)``.
            Id 0 marks padding.
        y_pred: Un-normalised scores (logits) over the vocabulary for every
            target position.

    Returns:
        A scalar: the summed loss of the real (non-padding) positions divided
        by the number of such positions, or 0 if there are none.
    """
    y_true = tf.reshape(y_true, shape=(-1, MAX_LENGTH - 1))
    per_token_loss = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')(y_true=y_true, y_pred=y_pred)

    # 1.0 for real tokens, 0.0 for padding.
    mask = tf.cast(tf.not_equal(y_true, 0), dtype=per_token_loss.dtype)
    masked_loss = tf.multiply(per_token_loss, mask)

    # Divide by the number of real tokens instead of by all positions;
    # otherwise the loss shrinks the more padding a batch contains.
    return tf.math.divide_no_nan(tf.reduce_sum(masked_loss), tf.reduce_sum(mask))

### Custom Learning Rate

Use the Adam optimizer with a custom [LR Scheduler](https://arxiv.org/pdf/1706.03762.pdf)


In [None]:
class CustomLRScheduler(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Warm-up learning-rate schedule.

    lr(step) = scale_d^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)

    Args:
        scale_d: Model width used to scale the learning rate.
        warmup_steps: Number of steps during which the rate grows linearly
            before it starts to decay.
    """

    def __init__(self, scale_d, warmup_steps=800):
        super().__init__()
        # Raw value kept for get_config(); `scale_d` itself is a float32 tensor.
        self._scale_d_config = scale_d
        self.scale_d = tf.cast(scale_d, dtype=tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
        # Optimizers may pass an integer step counter; rsqrt needs a float.
        step = tf.cast(step, dtype=tf.float32)
        a = tf.math.rsqrt(step)
        b = step * (self.warmup_steps**-1.5)
        return tf.math.rsqrt(self.scale_d)*tf.math.minimum(a, b)

    def get_config(self):
        """Return the constructor arguments so the schedule can be serialised."""
        return {'scale_d': self._scale_d_config, 'warmup_steps': self.warmup_steps}

In [None]:
# Plot the schedule over the first 20000 steps for a sample configuration.
sample_learning_rate = CustomLRScheduler(scale_d=128, warmup_steps=200)
plt.plot(sample_learning_rate(tf.range(20000, dtype=tf.float32)))
plt.xlabel("Step")
plt.ylabel("LearningRate");

### Compiling model

In [None]:
# Adam driven by the warm-up schedule defined above.
learning_rate = CustomLRScheduler(scale_d=SCALE_D, warmup_steps=800)

optimizer = tf.keras.optimizers.Adam(
    learning_rate=learning_rate,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
)

def accuracy(y_true, y_pred):
    """Token-level accuracy of the predictions.

    The targets are reshaped to ``(-1, MAX_LENGTH - 1)`` first. Padded
    positions are not masked out here.
    """
    targets = tf.reshape(y_true, shape=(-1, MAX_LENGTH-1))
    return tf.keras.metrics.sparse_categorical_accuracy(targets, y_pred=y_pred)

model.compile(optimizer=optimizer, loss=loss_function, metrics=[accuracy])

### Fit the model

Now it is time to train the Optimus

In [None]:
# Number of passes over the training dataset
EPOCS = 50

# Train on the batched tf.data pipeline built earlier.
model.fit(dataset, epochs=EPOCS)