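Positional Encoding: The positional_encoding method of the Transformer class builds a fixed sinusoidal encoding (sine on even dimensions, cosine on odd dimensions) for each position in the sequence. It is added to the input embeddings so that order information is retained.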

Multi-Head Attention:

The MultiHeadAttention class performs self-attention across multiple heads.
Each head computes scaled dot-product attention on its own slice of the projected queries, keys, and values.
The outputs of all heads are concatenated and passed through a final linear transformation.
Feed-Forward Network: The FeedForward class represents a two-layer fully connected network with a ReLU activation in the middle.

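Layer Normalization: The LayerNormalization class normalizes each position's features (over the last axis) after a sub-layer's output has been added to its input through a residual connection.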

Transformer Model: The Transformer class integrates all components (embedding, positional encoding, attention, feed-forward, and normalization layers) to process the input through a single encoder layer with residual connections.

In [None]:
import numpy as np

class Transformer:
    """Single-layer Transformer encoder built on NumPy arrays.

    The model embeds token indices, adds sinusoidal positional encodings and
    applies one encoder layer: multi-head self-attention followed by a
    position-wise feed-forward network, each wrapped in a residual connection
    and layer normalization.
    """

    def __init__(self, d_model, num_heads, d_ff, input_vocab_size, max_seq_len):
        """Create the embedding table, positional encodings and sub-layers.

        Args:
            d_model: Width of the embeddings and of every sub-layer output.
            num_heads: Number of attention heads.
            d_ff: Hidden width of the feed-forward network.
            input_vocab_size: Number of rows in the embedding table.
            max_seq_len: Number of positions for which encodings are built.
        """
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_ff = d_ff
        self.input_vocab_size = input_vocab_size
        self.max_seq_len = max_seq_len

        # Embedding and positional encoding
        self.embedding = np.random.randn(input_vocab_size, d_model) * 0.01
        self.pos_encoding = self.positional_encoding(max_seq_len, d_model)

        # Initialize layers
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForward(d_model, d_ff)
        self.layer_norm1 = LayerNormalization(d_model)
        self.layer_norm2 = LayerNormalization(d_model)

    def positional_encoding(self, max_len, d_model):
        """Return the (max_len, d_model) sinusoidal positional-encoding table.

        Even columns hold ``sin(pos / 10000 ** (i / d_model))`` and the
        following odd column holds the cosine of the same angle. An odd
        ``d_model`` is supported: the last (even) column simply has no cosine
        partner.

        Args:
            max_len: Number of positions (rows).
            d_model: Encoding width (columns).

        Returns:
            Float array of shape ``(max_len, d_model)``.
        """
        positions = np.arange(max_len)[:, np.newaxis]
        even_dims = np.arange(0, d_model, 2)
        angles = positions / np.power(10000.0, even_dims / d_model)

        pos_encoding = np.zeros((max_len, d_model))
        pos_encoding[:, 0::2] = np.sin(angles)
        # With an odd d_model there is one cosine column fewer than sine
        # columns, so only the first d_model // 2 angles are used here.
        pos_encoding[:, 1::2] = np.cos(angles[:, : d_model // 2])
        return pos_encoding

    def forward(self, x):
        """Encode a sequence of token indices.

        Args:
            x: 1-D sequence (array or list) of integer token indices into the
                embedding table.

        Returns:
            Array of shape ``(len(x), d_model)`` with the encoded sequence.

        Raises:
            ValueError: If the sequence is longer than the number of
                precomputed positional encodings.
        """
        tokens = np.asarray(x)
        seq_len = tokens.shape[0]
        if seq_len > self.pos_encoding.shape[0]:
            raise ValueError(
                f"sequence length {seq_len} exceeds the maximum of "
                f"{self.pos_encoding.shape[0]} positions"
            )

        # Embedding and positional encoding
        x = self.embedding[tokens] + self.pos_encoding[:seq_len]

        # Self-attention sub-layer. The sub-layers are invoked through their
        # forward() methods, and the attention output is reshaped to the
        # input's shape so that the residual addition cannot broadcast
        # (seq, d_model) against (seq, 1, d_model).
        attn_out = self.attention.forward(x, x, x).reshape(x.shape)
        x = self.layer_norm1.forward(x + attn_out)

        # Feed-forward sub-layer with its own residual connection.
        ff_out = self.feed_forward.forward(x)
        x = self.layer_norm2.forward(x + ff_out)

        return x


class MultiHeadAttention:
    """Multi-head scaled dot-product attention on NumPy arrays.

    Inputs are laid out sequence-first: axis 0 is the sequence position and
    the last axis holds the ``d_model`` features. Any axes in between are
    flattened into a single batch axis.
    """

    def __init__(self, d_model, num_heads):
        """Initialize the projection matrices.

        Args:
            d_model: Feature width of the inputs and outputs.
            num_heads: Number of attention heads; must divide ``d_model``.

        Raises:
            ValueError: If ``d_model`` is not divisible by ``num_heads``.
        """
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.d_model = d_model
        self.depth = d_model // num_heads

        # Initialize weights
        self.Wq = np.random.randn(d_model, d_model) * 0.01
        self.Wk = np.random.randn(d_model, d_model) * 0.01
        self.Wv = np.random.randn(d_model, d_model) * 0.01
        self.Wo = np.random.randn(d_model, d_model) * 0.01

    def __call__(self, Q, K, V):
        """Allow the layer to be invoked as ``layer(Q, K, V)``."""
        return self.forward(Q, K, V)

    def split_heads(self, x):
        """Reshape ``(seq, ..., d_model)`` to ``(batch, heads, seq, depth)``."""
        x = x.reshape(x.shape[0], -1, self.num_heads, self.depth)
        return x.transpose(1, 2, 0, 3)

    def scaled_dot_product_attention(self, Q, K, V):
        """Return ``softmax(Q K^T / sqrt(d_k)) V`` for per-head inputs.

        Args:
            Q: Queries of shape ``(batch, heads, seq_q, depth)``.
            K: Keys of shape ``(batch, heads, seq_k, depth)``.
            V: Values of shape ``(batch, heads, seq_k, depth)``.

        Returns:
            Array of shape ``(batch, heads, seq_q, depth)``.
        """
        dk = K.shape[-1]
        logits = np.matmul(Q, np.swapaxes(K, -1, -2)) / np.sqrt(dk)
        # Subtracting the row maximum leaves the softmax unchanged but keeps
        # np.exp from overflowing to inf (and the result from becoming NaN).
        logits = logits - np.max(logits, axis=-1, keepdims=True)
        attention_weights = np.exp(logits)
        attention_weights /= np.sum(attention_weights, axis=-1, keepdims=True)
        return np.matmul(attention_weights, V)

    def forward(self, Q, K, V):
        """Apply multi-head attention.

        Args:
            Q: Queries of shape ``(seq_q, d_model)`` or
                ``(seq_q, batch, d_model)``.
            K: Keys, laid out like ``Q`` with ``seq_k`` positions.
            V: Values, laid out like ``K``.

        Returns:
            Array of shape ``(seq_q, d_model)`` when ``Q`` is 2-D (and the
            computation involved no real batch axis), otherwise
            ``(seq_q, batch, d_model)``.
        """
        query_ndim = np.ndim(Q)

        # Linear projections
        Q = np.dot(Q, self.Wq)
        K = np.dot(K, self.Wk)
        V = np.dot(V, self.Wv)

        # Split heads
        Q = self.split_heads(Q)
        K = self.split_heads(K)
        V = self.split_heads(V)

        # Scaled dot-product attention
        attention = self.scaled_dot_product_attention(Q, K, V)

        # Concatenate heads: back to (seq_q, batch, d_model)
        attention = attention.transpose(2, 0, 1, 3).reshape(
            Q.shape[2], -1, self.d_model
        )

        # Final linear layer
        output = np.dot(attention, self.Wo)

        # split_heads adds a length-1 batch axis to 2-D inputs; drop it again
        # so the result has the query's shape and can be added to it in a
        # residual connection without broadcasting.
        if query_ndim == 2 and output.shape[1] == 1:
            output = output[:, 0, :]
        return output


class FeedForward:
    """Two-layer position-wise feed-forward network with a ReLU in between."""

    def __init__(self, d_model, d_ff):
        """Initialize weights with small random values and biases with zeros.

        Args:
            d_model: Width of the input and of the output.
            d_ff: Width of the hidden layer.
        """
        # Initialize weights
        self.W1 = np.random.randn(d_model, d_ff) * 0.01
        self.W2 = np.random.randn(d_ff, d_model) * 0.01
        self.b1 = np.zeros((1, d_ff))
        self.b2 = np.zeros((1, d_model))

    def __call__(self, x):
        """Allow the layer to be invoked as ``layer(x)``."""
        return self.forward(x)

    def forward(self, x):
        """Return ``relu(x @ W1 + b1) @ W2 + b2``.

        Args:
            x: Array whose last axis has ``d_model`` features.

        Returns:
            Array with the same shape as ``x``.
        """
        hidden = np.dot(x, self.W1) + self.b1
        hidden = np.maximum(0, hidden)  # ReLU activation
        return np.dot(hidden, self.W2) + self.b2


class LayerNormalization:
    """Layer normalization over the last axis with a scale and a shift."""

    def __init__(self, d_model, epsilon=1e-6):
        """Initialize the scale to ones and the shift to zeros.

        Args:
            d_model: Size of the normalized (last) axis.
            epsilon: Constant added to the variance to avoid division by zero.
        """
        self.epsilon = epsilon
        self.gamma = np.ones((1, d_model))
        self.beta = np.zeros((1, d_model))

    def __call__(self, x):
        """Allow the layer to be invoked as ``layer(x)``."""
        return self.forward(x)

    def forward(self, x):
        """Normalize ``x`` to zero mean and unit variance along the last axis.

        Args:
            x: Array whose last axis has ``d_model`` features.

        Returns:
            ``gamma * (x - mean) / sqrt(var + epsilon) + beta``, with the same
            shape as ``x``.
        """
        mean = np.mean(x, axis=-1, keepdims=True)
        var = np.var(x, axis=-1, keepdims=True)
        x_normalized = (x - mean) / np.sqrt(var + self.epsilon)
        return self.gamma * x_normalized + self.beta


# Demo: push one random token sequence through the encoder.
# Hyperparameters for the demo run
input_vocab_size, max_seq_len = 1000, 10
d_model, num_heads, d_ff = 64, 4, 128

# Draw max_seq_len random token indices from the vocabulary
x = np.random.randint(low=0, high=input_vocab_size, size=max_seq_len)

# Build the model (arguments in signature order) and encode the sequence
transformer = Transformer(d_model, num_heads, d_ff, input_vocab_size, max_seq_len)
output = transformer.forward(x)

print("Output shape:", output.shape)
