### Import the required libraries

In [1]:
from typing import Optional

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding


2024-03-01 21:21:19.133817: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-01 21:21:19.213037: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.


### Creating TransformerEncoderLayer Class

In [2]:
class TransformerEncoderLayer(tf.keras.layers.Layer):
    """
    One Transformer encoder block: multi-head self-attention followed by a
    two-layer feed-forward network.

    Each sub-layer is wrapped as
    ``LayerNormalization(sublayer_input + Dropout(sublayer_output))``.
    """
    def __init__(self, d_model: int, num_heads: int, dff: int, rate: float = 0.1):
        """
        Initializes the TransformerEncoderLayer.

        Parameters:
            d_model (int): Output width of the feed-forward network. It is also
                used as ``key_dim`` of the attention layer.
            num_heads (int): The number of attention heads.
            dff (int): The number of units in the hidden feed-forward layer.
            rate (float, optional): Dropout rate used inside the attention layer
                and after each sub-layer. Default is 0.1.

        Returns:
            None
        """
        super().__init__()

        # NOTE: Keras' `key_dim` is the size of *each* head, so every head here
        # is `d_model` wide (not `d_model // num_heads`).
        self.multi_head_attention = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=d_model, dropout=rate
        )
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model)
        ])

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs: tf.Tensor, training: bool = False) -> tf.Tensor:
        """
        Runs the encoder block on ``inputs``.

        Parameters:
            inputs (tf.Tensor): The input tensor. It is used as both query and
                value of the attention layer (self-attention). Its last
                dimension must match ``d_model`` for the second residual
                addition to work.
            training (bool, optional): Whether dropout is active. Defaults to
                False (inference) so the layer can be called without the flag.

        Returns:
            tf.Tensor: The output tensor of the block.

        """
        # Pass `training` explicitly so the attention-weight dropout follows the
        # same mode as the two Dropout layers below.
        attn_output = self.multi_head_attention(
            inputs, inputs, return_attention_scores=False, training=training
        )
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)  # residual + norm

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)  # residual + norm



### Creating PositionalEncoding Class

In [3]:
class PositionalEncoding(tf.keras.layers.Layer):
    """
    Adds a fixed sinusoidal positional encoding to its input.

    The encoding table is computed once in the constructor and sliced to the
    input's sequence length on every call.
    """
    def __init__(self, position: int, d_model: int):
        """
        Precompute the encoding table.

        Parameters:
            position (int): Number of positions (rows) in the table.
            d_model (int): Number of encoding channels (columns) in the table.

        Returns:
            None
        """
        super().__init__()
        self.pos_encoding = self.positional_encoding(position, d_model)

    def get_angles(self, position: tf.Tensor, i: tf.Tensor, d_model: int) -> tf.Tensor:
        """
        Compute ``position / 10000 ** (2 * (i // 2) / d_model)``.

        Parameters:
            position (tf.Tensor): Position indices.
            i (tf.Tensor): Channel indices.
            d_model (int): Total number of channels.

        Returns:
            tf.Tensor: The angles, broadcast over ``position`` and ``i``.
        """
        depth = tf.cast(d_model, tf.float32)
        # Channels 2k and 2k+1 share the same exponent because of `i // 2`.
        exponent = (2 * (i // 2)) / depth
        inverse_frequency = 1 / tf.pow(10000, exponent)
        return position * inverse_frequency

    def positional_encoding(self, position: int, d_model: int) -> tf.Tensor:
        """
        Build the encoding table.

        Parameters:
            position (int): Number of positions in the table.
            d_model (int): Number of encoding channels.

        Returns:
            tf.Tensor: float32 tensor of shape ``(1, position, d_model)``. The
            sine terms occupy the first block of channels and the cosine terms
            the second block (they are concatenated, not interleaved).
        """
        row_index = tf.range(position, dtype=tf.float32)[:, tf.newaxis]   # (position, 1)
        column_index = tf.range(d_model, dtype=tf.float32)[tf.newaxis, :]  # (1, d_model)
        angle_rads = self.get_angles(position=row_index, i=column_index, d_model=d_model)

        # Sine of the even-indexed columns, cosine of the odd-indexed columns.
        sin_part = tf.math.sin(angle_rads[:, 0::2])
        cos_part = tf.math.cos(angle_rads[:, 1::2])

        table = tf.concat([sin_part, cos_part], axis=-1)
        # Leading axis of size 1 lets the table broadcast over the batch.
        return tf.cast(table[tf.newaxis, ...], tf.float32)

    def call(self, x: tf.Tensor) -> tf.Tensor:
        """
        Return ``x`` plus the encoding for its first ``tf.shape(x)[1]`` positions.

        Parameters:
            x (tf.Tensor): Input whose axis 1 is the sequence axis.

        Returns:
            tf.Tensor: ``x`` with the positional encoding already added.
        """
        sequence_length = tf.shape(x)[1]
        return x + self.pos_encoding[:, :sequence_length, :]


### Creating TransformerEncoder Class

In [4]:
class TransformerEncoder(tf.keras.layers.Layer):
    """
    Transformer encoder: token embedding, positional encoding and a stack of
    ``TransformerEncoderLayer`` blocks.
    """
    def __init__(self, 
                 num_layers: int, 
                 d_model: int, 
                 num_heads: int, 
                 dff: int,
                 input_vocab_size: int, 
                 rate: float = 0.1,
                 max_position: Optional[int] = None):
        """
        Initializes the TransformerEncoder.

        Parameters:
            num_layers (int): The number of encoder layers in the stack.
            d_model (int): The dimensionality of the embeddings and of each layer.
            num_heads (int): The number of attention heads.
            dff (int): The number of neurons in the feedforward network.
            input_vocab_size (int): The size of the input vocabulary.
            rate (float, optional): The dropout rate. Defaults to 0.1.
            max_position (int, optional): Number of rows in the positional
                encoding table, i.e. the longest sequence the encoder supports.
                Defaults to ``input_vocab_size``, which keeps the original
                behaviour; pass an explicit value when sequences can be longer
                than the vocabulary size.

        Returns:
            None
        """
        super().__init__()

        self.embedding = Embedding(input_vocab_size, d_model)

        # Sequence length and vocabulary size are unrelated, so allow the table
        # length to be set independently of the vocabulary.
        table_length = input_vocab_size if max_position is None else max_position
        self.pos_encoding = PositionalEncoding(table_length, d_model)

        self.enc_layers = [TransformerEncoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]

    def call(self, x: tf.Tensor, training: bool = False) -> tf.Tensor:
        """ 
        Encodes a batch of token-id sequences.

        Parameters:
            x (tf.Tensor): The input tensor of token ids fed to the embedding.
            training (bool, optional): Whether the model is in training mode.
                Defaults to False (inference).

        Returns:
            tf.Tensor: The output of the last encoder layer.

        """
        
        x = self.embedding(x)
        # Scale embeddings by sqrt(d_model) before adding positions.
        x *= tf.math.sqrt(tf.cast(x.shape[-1], dtype=tf.float32))
        # PositionalEncoding.call already returns `x + encoding`; assigning
        # (rather than `+=`) avoids adding the embeddings to themselves.
        x = self.pos_encoding(x)
        for layer in self.enc_layers:
            x = layer(x, training=training)
        return x


### Creating CustomSchedule (Learning Rate Schedule) Class

In [5]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """
    Learning rate schedule with linear warm-up followed by inverse-square-root decay.

    The learning rate at a given step is::

        d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

    The two terms are equal at ``step == warmup_steps``, so the rate grows
    linearly up to that step and decays afterwards.

    Parameters:
        d_model (int): The dimension of the model.
        warmup_steps (int, optional): The number of warmup steps. Defaults to 4000.
    """
    def __init__(self, d_model: int, warmup_steps: int = 4000):
        super().__init__()

        # Keep the raw value so get_config() can return a plain Python object;
        # `self.d_model` is a tensor and is not directly serializable.
        self._d_model_config = d_model
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step: int) -> tf.Tensor:
        """
        Compute the learning rate for ``step``.

        Parameters:
            step (int): The current optimizer step.

        Returns:
            tf.Tensor: The learning rate as a float32 tensor.
        """
        step = tf.cast(step, tf.float32)
        # At step 0 the decay term is inf but the warm-up term is 0, so the
        # minimum (and therefore the learning rate) is 0.
        decay_term = tf.math.rsqrt(step)
        warmup_term = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(decay_term, warmup_term)

    def get_config(self) -> dict:
        """
        Return the constructor arguments so the schedule can be serialized.

        Returns:
            dict: ``{"d_model": ..., "warmup_steps": ...}``, suitable for
            ``CustomSchedule(**config)``.
        """
        return {
            "d_model": self._d_model_config,
            "warmup_steps": self.warmup_steps,
        }


### Creating Transformer Class

In [6]:
class Transformer(tf.keras.Model):
    """
    Encoder-only Transformer model.

    A ``TransformerEncoder`` is followed by a per-time-step softmax layer over
    the input vocabulary, so the model emits one distribution per input token.
    """
    def __init__(self, 
                 num_layers: int, 
                 d_model: int, 
                 num_heads: int, 
                 dff: int, 
                 input_vocab_size: int, 
                 rate: float = 0.1):
        """
        Initializes the Transformer model with the specified parameters.

        Parameters:
            num_layers (int): The number of encoder layers.
            d_model (int): The dimensionality of the model.
            num_heads (int): The number of attention heads.
            dff (int): The dimensionality of the feed-forward layer.
            input_vocab_size (int): The size of the input vocabulary. It is also
                the number of output classes of the final layer.
            rate (float, optional): The dropout rate. Defaults to 0.1.

        Returns:
            None
        """
        
        super().__init__()

        self.encoder = TransformerEncoder(
            num_layers=num_layers,
            d_model=d_model,
            num_heads=num_heads,
            dff=dff,
            input_vocab_size=input_vocab_size,
            rate=rate
            )

        # Softmax over the vocabulary, applied independently at every time step.
        self.final_layer = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(input_vocab_size, activation='softmax'))

    def call(self, inp: tf.Tensor, training: bool = False) -> tf.Tensor:
        """
        Runs the encoder and the output layer.

        Parameters:
            inp (tf.Tensor): The input tensor of token ids.
            training (bool, optional): The training flag forwarded to the
                encoder. Defaults to False (inference).

        Returns:
            tf.Tensor: Per-time-step probabilities over the vocabulary.
        """

        enc_output = self.encoder(inp, training=training)
        return self.final_layer(enc_output)


### Generating the data

In [7]:
def generate_data(num_samples: int, sequence_length: int, vocab_size: int,
                  seed: Optional[int] = None) -> np.ndarray:
    """
    Generate random integer token sequences.

    Parameters:
        num_samples (int): The number of sequences to generate.
        sequence_length (int): The length of each sequence.
        vocab_size (int): The size of the vocabulary; tokens are drawn
            uniformly from ``[0, vocab_size)``.
        seed (int, optional): When given, the data is drawn from a private
            ``RandomState`` seeded with this value, so the result is
            reproducible and NumPy's global random state is left untouched.
            When None (default), NumPy's global random state is used.

    Returns:
        np.ndarray: Integer array of shape ``(num_samples, sequence_length)``.
    """
    # `np.random` (module) and `RandomState` expose the same `randint` API.
    rng = np.random if seed is None else np.random.RandomState(seed)
    return rng.randint(0, vocab_size, size=(num_samples, sequence_length))


def reverse_sequence(sequence: np.ndarray) -> np.ndarray:
    """
    Reverse sequences along the sequence axis.

    Parameters:
        sequence (np.ndarray): Either a single 1-D sequence, or a batch of
            sequences with shape ``(num_samples, sequence_length, ...)``.
            Array-likes such as nested lists are accepted.

    Returns:
        np.ndarray: The reversed data. A 1-D input is reversed end to end;
        otherwise axis 1 is reversed independently for every sample. For
        ndarray inputs the result is a view, not a copy.
    """
    sequence = np.asarray(sequence)
    if sequence.ndim == 1:
        # A lone sequence has no batch axis to skip over.
        return sequence[::-1]
    return sequence[:, ::-1]


### Define the hyperparameters

In [8]:
# Define hyperparameters
# Model architecture (passed to `Transformer` when the model is created).
num_layers_enc = 4  # number of stacked encoder layers
d_model_enc = 128  # embedding / model width; also drives the learning-rate schedule
num_heads_enc = 8  # attention heads per encoder layer
dff_enc = 512  # hidden units of the feed-forward sub-layer
input_vocab_size_model = 20  # tokens are integers in [0, 20); also the number of output classes
# Data and training settings.
batch_size = 64  # batch size passed to `fit`
sequence_length = 15  # length of every generated sequence
epochs = 50  # number of training epochs passed to `fit`

### Generate the training and testing data

In [None]:
# Seed NumPy's global RNG so the synthetic train/test sets are identical on
# every "Restart & Run All". Model weight initialisation is not affected.
np.random.seed(42)

# Generate random training data: inputs are (1000, sequence_length) token ids.
# Targets are the same sequences reversed, with a trailing axis of size 1
# added, giving shape (1000, sequence_length, 1).
x_train = generate_data(1000, sequence_length, input_vocab_size_model)
y_train = reverse_sequence(x_train)[..., np.newaxis]

# Generate random test data, drawn independently from the same distribution.
x_test = generate_data(1000, sequence_length, input_vocab_size_model)
y_test = reverse_sequence(x_test)[..., np.newaxis]



### Instantiate the model and train it

In [9]:
# Build the model from the values set in the hyperparameter cell.
transformer_model = Transformer(
    num_layers=num_layers_enc, d_model=d_model_enc, num_heads=num_heads_enc,
    dff=dff_enc, input_vocab_size=input_vocab_size_model,
)

# Adam optimizer whose learning rate follows the custom schedule.
custom_learning_rate = CustomSchedule(d_model_enc)
optimizer = tf.keras.optimizers.Adam(learning_rate=custom_learning_rate)

# Targets are integer token ids, hence the sparse cross-entropy loss.
transformer_model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit on the training set.
transformer_model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size)

# Score on the held-out set; `evaluate` returns the loss followed by the
# compiled metrics, i.e. [loss, accuracy].
evaluation = transformer_model.evaluate(x_test, y_test)
eval_loss, eval_accuracy = evaluation[0], evaluation[1]
print("Evaluation Loss:", eval_loss)
print("Evaluation Accuracy:", eval_accuracy)

2024-03-01 21:21:23.647628: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Evaluation Loss: 0.00017523116548545659
Evaluation Accuracy: 1.0
