<a href="https://colab.research.google.com/github/arunpandey2023/deep/blob/main/Transformers_with_Self_Attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Building Transformer Models with Self-Attention from Scratch

# 1. Import Base Libraries


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, LayerNormalization
from tensorflow.keras.models import Model
import numpy as np

# 2. Self-Attention Mechanism Layer

In [None]:
class SelfAttention(tf.keras.layers.Layer):
    """Single-head scaled dot-product self-attention.

    The layer learns three projection matrices (query, key, value), each of
    shape ``(input_features, embed_dim)``, and returns
    ``softmax(Q @ K^T / sqrt(embed_dim)) @ V``.

    Args:
        embed_dim: Width of the query/key/value projections and therefore of
            the layer output's last axis.
        **kwargs: Standard ``tf.keras.layers.Layer`` keyword arguments
            (e.g. ``name``, ``dtype``), forwarded to the base class.

    Input is assumed to be ``(batch, seq_len, features)``; the attention
    weights are computed over the last two axes, so the output is
    ``(batch, seq_len, embed_dim)``.
    """

    def __init__(self, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim

    def build(self, input_shape):
        """Create the Q/K/V projection weights once the feature size is known."""
        projection_shape = (input_shape[-1], self.embed_dim)
        self.W_q = self.add_weight(name='W_q',
                                   shape=projection_shape,
                                   initializer='glorot_uniform',
                                   trainable=True)
        self.W_k = self.add_weight(name='W_k',
                                   shape=projection_shape,
                                   initializer='glorot_uniform',
                                   trainable=True)
        self.W_v = self.add_weight(name='W_v',
                                   shape=projection_shape,
                                   initializer='glorot_uniform',
                                   trainable=True)

    def call(self, inputs):
        """Apply self-attention to ``inputs`` and return the attended values."""
        q = tf.matmul(inputs, self.W_q)
        k = tf.matmul(inputs, self.W_k)
        v = tf.matmul(inputs, self.W_v)

        attn_scores = tf.matmul(q, k, transpose_b=True)
        # Cast the scale to the scores' dtype so the division also works when
        # the layer computes in a dtype other than float32 (mixed precision).
        scale = tf.math.sqrt(tf.cast(self.embed_dim, attn_scores.dtype))
        attn_weights = tf.nn.softmax(attn_scores / scale, axis=-1)
        return tf.matmul(attn_weights, v)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({'embed_dim': self.embed_dim})
        return config

# 3. Transformer Model

In [None]:
def _sinusoidal_position_table(num_positions, embed_dim):
    """Return a ``(num_positions, embed_dim)`` array of sinusoidal encodings.

    Even columns hold ``sin(pos / 10000**(2*(i//2)/embed_dim))`` and odd
    columns hold the matching ``cos``.
    """
    positions = np.arange(num_positions)[:, np.newaxis]
    dims = np.arange(embed_dim)[np.newaxis, :]
    table = positions / np.power(10000, 2 * (dims // 2) / embed_dim)
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])
    return table


def build_transformer_model(input_shape, embed_dim, num_heads, ff_dim, num_layers, vocab_size, max_length):
    """Build a Transformer-encoder Keras model with a per-position softmax head.

    Args:
        input_shape: ``(sequence_length, feature_dim)`` of one sample
            (no batch axis).
        embed_dim: Model width used by attention, residuals and the
            positional encoding.
        num_heads: Accepted for interface compatibility; currently unused
            because ``SelfAttention`` is single-head.
        ff_dim: Hidden width of each feed-forward sub-layer.
        num_layers: Number of stacked encoder layers.
        vocab_size: Size of the softmax output at every position.
        max_length: Largest sequence length the positional encoding supports.

    Returns:
        A ``tf.keras`` ``Model`` mapping ``(batch, sequence_length, feature_dim)``
        to ``(batch, sequence_length, vocab_size)``.

    Raises:
        ValueError: If the sequence length is not fixed or exceeds
            ``max_length``.
    """
    seq_len, feature_dim = input_shape[0], input_shape[-1]
    if seq_len is None:
        raise ValueError("input_shape must specify a fixed sequence length")
    if seq_len > max_length:
        raise ValueError(
            f"sequence length {seq_len} exceeds max_length {max_length}")

    inputs = Input(shape=input_shape)

    # Residual connections and the positional encoding both need the model
    # width, so project the features when they do not already match it.
    x = inputs
    if feature_dim != embed_dim:
        x = Dense(embed_dim)(x)

    # Positional Encoding: one row per *sequence position* (axis 0 of
    # input_shape), broadcast over the batch axis.
    pos_encoding = tf.convert_to_tensor(
        _sinusoidal_position_table(seq_len, embed_dim), dtype=tf.float32)
    x = x + pos_encoding

    # Transformer Encoder: attention and feed-forward sub-layers, each followed
    # by a residual connection and layer normalisation.
    for _ in range(num_layers):
        attn_output = SelfAttention(embed_dim)(x)
        attn_output = LayerNormalization(epsilon=1e-6)(attn_output + x)

        ffn_output = Dense(ff_dim, activation='relu')(attn_output)
        ffn_output = Dense(embed_dim)(ffn_output)
        x = LayerNormalization(epsilon=1e-6)(ffn_output + attn_output)

    # Output layer: distribution over the vocabulary at every position.
    outputs = Dense(vocab_size, activation='softmax')(x)

    return Model(inputs=inputs, outputs=outputs)

# 4. Example Parameters

In [None]:
# Geometry of one input sample: (sequence_length, embedding_dimension).
input_shape = (100, 512)
# Number of positions passed to the positional-encoding construction.
max_length = 1_000

# Encoder sizing.
num_layers = 4
num_heads = 8
embed_dim = 128
ff_dim = 256

# Width of the final softmax layer.
vocab_size = 10_000

# 5. Build Transformer Model

In [None]:
# Assemble the model from the example hyperparameters, then print Keras's
# summary of it.
transformer_model = build_transformer_model(
    input_shape=input_shape,
    embed_dim=embed_dim,
    num_heads=num_heads,
    ff_dim=ff_dim,
    num_layers=num_layers,
    vocab_size=vocab_size,
    max_length=max_length,
)
transformer_model.summary()