In [1]:
import numpy as np

In [2]:
# Hyperparameters
d_model = 512  # Dimension of the model
d_ff = 2048  # Dimension of feedforward network
seq_length = 20  # Sequence length


In [3]:
# Initialize parameters
def initialize_parameters():
    params = {}
    params['Wq'] = np.random.randn(d_model, d_model) * 0.1
    params['Wk'] = np.random.randn(d_model, d_model) * 0.1
    params['Wv'] = np.random.randn(d_model, d_model) * 0.1
    params['Wo'] = np.random.randn(d_model, d_model) * 0.1
    params['W1'] = np.random.randn(d_model, d_ff) * 0.1
    params['W2'] = np.random.randn(d_ff, d_model) * 0.1
    params['b1'] = np.zeros((1, d_ff))
    params['b2'] = np.zeros((1, d_model))
    return params

params = initialize_parameters()


In [4]:

# Self-Attention
def self_attention(Q, K, V):
    scores = np.matmul(Q, K.transpose(0, 2, 1)) / np.sqrt(Q.shape[-1])
    # Apply the softmax trick for numerical stability
    scores -= np.max(scores, axis=-1, keepdims=True)
    weights = np.exp(scores) / np.sum(np.exp(scores), axis=-1, keepdims=True)
    output = np.matmul(weights, V)
    return output, weights


In [5]:

# Multi-Head Attention
def multi_head_attention(X, params):
    # Linear projections
    Q = np.dot(X, params['Wq'])
    K = np.dot(X, params['Wk'])
    V = np.dot(X, params['Wv'])

    # Self-attention mechanism
    attn_output, _ = self_attention(Q, K, V)
    attn_output = np.dot(attn_output, params['Wo'])  # Linear transformation
    return attn_output


In [6]:

# Feedforward Network
def feedforward_network(X, params):
    hidden = np.dot(X, params['W1']) + params['b1']
    hidden = np.maximum(0, hidden)  # ReLU activation
    output = np.dot(hidden, params['W2']) + params['b2']
    return output


In [7]:
# Transformer Block
def transformer_block(X, params):
    # Apply multi-head attention
    attn_output = multi_head_attention(X, params)
    attn_output = X + attn_output  # Add & Norm

    # Apply feedforward network
    ff_output = feedforward_network(attn_output, params)
    output = attn_output + ff_output  # Add & Norm
    return output


In [8]:
# Dummy data for illustration
X = np.random.randn(10, seq_length, d_model)  # (batch_size, seq_length, d_model)
y = np.random.randint(0, 2, (10, seq_length, d_model))  # Target labels


In [9]:
# Forward pass through multiple transformer blocks
for _ in range(12):  # Number of transformer blocks (like GPT-1)
    X = transformer_block(X, params)


In [10]:
# Compute loss (simple mean squared error for illustration)
loss = np.mean((X - y) ** 2)

print("Loss:", loss)


Loss: 5.0785315130133246e+38
