# 🏗️ Transformer Block

Assembler : Attention + Feed-Forward + LayerNorm + Residual

In [None]:
import numpy as np

class TransformerBlock:
    """One Transformer block: attention and feed-forward, each in a residual.

    Each sub-layer's output is added back to its input and the sum is then
    layer-normalised (normalisation is applied *after* the residual add).
    """

    def __init__(self, d_model, num_heads, d_ff):
        """Build the two sub-layers and their two normalisation layers.

        Args:
            d_model: Passed to every sub-layer and both LayerNorms.
            num_heads: Passed to the attention layer.
            d_ff: Passed to the feed-forward layer.
        """
        # Sub-layers are created attention-first, then feed-forward.
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.ffn = FeedForward(d_model, d_ff)
        # One independent LayerNorm per residual connection.
        self.ln1, self.ln2 = LayerNorm(d_model), LayerNorm(d_model)

    def forward(self, x):
        """Run ``x`` through attention then feed-forward and return the result.

        Args:
            x: Input accepted by the attention layer's ``forward``
                (presumably an array whose last axis is ``d_model`` --
                confirm against MultiHeadAttention).

        Returns:
            The output of the second LayerNorm.
        """
        stages = (
            (self.attention, self.ln1),  # self-attention + residual + norm
            (self.ffn, self.ln2),        # feed-forward + residual + norm
        )
        hidden = x
        for sublayer, norm in stages:
            hidden = norm.forward(hidden + sublayer.forward(hidden))
        return hidden

class FeedForward:
    """Two-layer feed-forward network with a ReLU between the layers.

    Computes ``relu(x . W1) . W2``. There are no bias terms.

    Attributes:
        W1: Weights of shape ``(d_model, d_ff)``.
        W2: Weights of shape ``(d_ff, d_model)``.
    """

    def __init__(self, d_model, d_ff, *, init_scale=0.01, rng=None):
        """Initialise both weight matrices from a scaled standard normal.

        Args:
            d_model: Length of the last axis of the input and of the output.
            d_ff: Width of the hidden layer.
            init_scale: Factor applied to the standard-normal draws. The
                default (0.01) is the value that was previously hard-coded.
            rng: Optional ``numpy.random.Generator`` (or ``RandomState``)
                used to draw the weights. When ``None`` the global
                ``np.random.randn`` is used, so ``np.random.seed`` keeps
                controlling initialisation as before.
        """

        def sample(rows, cols):
            # Keep the legacy global-RNG path as the default so existing
            # seeded runs produce exactly the same weights.
            if rng is None:
                return np.random.randn(rows, cols)
            return rng.standard_normal((rows, cols))

        # W1 is drawn before W2; the order matters for seeded reproducibility.
        self.W1 = sample(d_model, d_ff) * init_scale
        self.W2 = sample(d_ff, d_model) * init_scale

    def forward(self, x):
        """Apply the network to ``x``.

        Args:
            x: Array whose last axis has length ``d_model``.

        Returns:
            Array with the same leading axes as ``x`` and a last axis of
            length ``d_model``.
        """
        x = np.maximum(0, np.dot(x, self.W1))  # ReLU
        x = np.dot(x, self.W2)
        return x

class LayerNorm:
    """Layer normalisation over the last axis with a learnable scale and shift.

    Attributes:
        gamma: Per-feature scale, initialised to ones (length ``d_model``).
        beta: Per-feature shift, initialised to zeros (length ``d_model``).
        eps: Small constant added to the standard deviation before dividing.
    """

    def __init__(self, d_model, eps=1e-6):
        """Create identity scale/shift parameters for ``d_model`` features."""
        self.gamma = np.ones(d_model)
        self.beta = np.zeros(d_model)
        self.eps = eps

    def forward(self, x):
        """Normalise ``x`` along its last axis, then scale and shift.

        Args:
            x: NumPy array; statistics are taken over ``axis=-1``.

        Returns:
            ``gamma * (x - mean) / (std + eps) + beta``, same shape as ``x``.
        """
        mu = x.mean(axis=-1, keepdims=True)
        sigma = x.std(axis=-1, keepdims=True)
        centred = x - mu
        # eps is added to the standard deviation itself (not the variance),
        # which keeps the division finite when a row is constant.
        denom = sigma + self.eps
        scaled = self.gamma * centred / denom
        return scaled + self.beta