In [None]:
import numpy as np

In [None]:
def gelu(x):
  """Apply the tanh approximation of the GELU activation element-wise.

  Evaluates ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``.

  Args:
    x: Scalar or NumPy array of pre-activation values.

  Returns:
    The activated value(s), with the same shape as ``x``.
  """
  cubic_term = x + 0.044715 * (x ** 3)
  gate = 1 + np.tanh(np.sqrt(2 / np.pi) * cubic_term)
  return 0.5 * x * gate
def softmax(x, axis=-1):
  """Compute a numerically stable softmax along ``axis``.

  The maximum along ``axis`` is subtracted before exponentiating so that
  large inputs do not overflow in ``np.exp``.

  Args:
    x: Array of scores.
    axis: Axis along which the outputs sum to 1 (default: last axis).

  Returns:
    Array of the same shape as ``x`` holding the normalized exponentials.
  """
  shifted = x - np.max(x, axis=axis, keepdims=True)
  exps = np.exp(shifted)
  return exps / exps.sum(axis=axis, keepdims=True)

**Token Embedding**

In [None]:
class TokenEmbedding:
  """Lookup table that maps integer token ids to dense vectors."""

  def __init__(self, vocab_size, d_model, seed=0):
    """Create the embedding matrix ``W`` of shape (vocab_size, d_model).

    Entries are drawn from a normal distribution with standard deviation
    0.02 using a ``RandomState`` seeded with ``seed``.
    """
    table_shape = (vocab_size, d_model)
    self.W = np.random.RandomState(seed).normal(scale=0.02, size=table_shape)

  def __call__(self, token_ids):
    """Return the rows of ``W`` selected by ``token_ids``.

    The result has the shape of ``token_ids`` with a trailing ``d_model``
    axis appended when ``token_ids`` is an integer array.
    """
    return self.W[token_ids]

**Positional Encoding (Sinusoidal)**

In [None]:
class SinusoidalPositionalEncoding:
  """Fixed sinusoidal position table (sin on even columns, cos on odd)."""

  def __init__(self, d_model, max_len=512):
    """Precompute the (max_len, d_model) encoding table.

    Args:
      d_model: Width of each position vector; may be even or odd.
      max_len: Number of positions to precompute.
    """
    pe = np.zeros((max_len, d_model))
    position = np.arange(0, max_len)[:, None]
    div_term = np.exp(np.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))
    angles = position * div_term  # (max_len, ceil(d_model / 2))
    pe[:, 0::2] = np.sin(angles)
    # With an odd d_model there is one fewer odd column than even columns,
    # so only the first d_model // 2 frequencies get a cosine column.
    pe[:, 1::2] = np.cos(angles[:, : d_model // 2])
    self.pe = pe

  def __call__(self, seq_len):
    """Return the encodings for the first ``seq_len`` positions.

    Args:
      seq_len: Number of positions requested.

    Returns:
      Array of shape [1, seq_len, d_model], broadcastable over a batch axis.

    Raises:
      ValueError: If ``seq_len`` exceeds the number of precomputed positions
        (slicing would otherwise silently return fewer rows than requested).
    """
    max_len = self.pe.shape[0]
    if seq_len > max_len:
      raise ValueError(
          f"seq_len ({seq_len}) exceeds max_len ({max_len}) of the positional table"
      )
    return self.pe[:seq_len, :][None, :, :]  # [1, seq_len, d_model]

**Layer Normalization**

In [None]:
class LayerNorm:
  """Layer normalization over the last axis with learnable scale and shift."""

  def __init__(self, d_model, eps=1e-5):
    """Initialise ``gamma`` to ones, ``beta`` to zeros, both of shape (d_model,).

    Args:
      d_model: Size of the normalized (last) axis.
      eps: Constant added to the variance before the square root.
    """
    self.eps = eps
    self.gamma = np.ones((d_model,))
    self.beta = np.zeros((d_model,))

  def __call__(self, x):
    """Normalize ``x`` along its last axis, then apply ``gamma`` and ``beta``."""
    mu = np.mean(x, axis=-1, keepdims=True)
    sigma2 = np.var(x, axis=-1, keepdims=True)
    scaled = self.gamma * (x - mu)
    return scaled / np.sqrt(sigma2 + self.eps) + self.beta

**Causal Mask**

In [None]:
def causal_mask(seq_len):
  """Build an additive causal attention mask.

  Args:
    seq_len: Number of positions in the sequence.

  Returns:
    float32 array of shape (seq_len, seq_len): entries strictly above the
    diagonal (future positions) are -1e9, all other entries are zero.
  """
  future = np.triu(np.ones((seq_len, seq_len), dtype=np.float32), k=1)
  return future * -1e9  # 0 where attention is allowed, -1e9 where blocked

**Scaled Dot-Product Attention**

In [None]:
class ScaledDotProductAttention:
  """Scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V."""

  def __call__(self, Q, K, V, mask=None):
    """Compute attention outputs and attention weights.

    Only the last two axes are treated as (sequence, feature); any leading
    axes (e.g. batch, heads) are handled by ``np.matmul`` broadcasting, so
    both 3-D and 4-D inputs are accepted.

    Args:
      Q: Queries, shape (..., seq_q, d_k).
      K: Keys, shape (..., seq_k, d_k).
      V: Values, shape (..., seq_k, d_v).
      mask: Optional additive mask broadcastable against the score matrix
        (..., seq_q, seq_k); large negative entries suppress a position.

    Returns:
      Tuple ``(out, weights)`` where ``out`` has shape (..., seq_q, d_v) and
      ``weights`` has shape (..., seq_q, seq_k) and sums to 1 on its last axis.
    """
    dk = Q.shape[-1]
    # Swap only the last two axes so the rank of K is not hard-coded.
    scores = np.matmul(Q, np.swapaxes(K, -1, -2)) / np.sqrt(dk)
    if mask is not None:
      # Out-of-place add: lets the mask broadcast or promote dtype freely.
      scores = scores + mask
    weights = softmax(scores, axis=-1)
    return np.matmul(weights, V), weights

**Multi-Head Attention**

In [None]:
class MultiHeadAttention:
  """Multi-head self-attention with separate Q/K/V/output projections."""

  def __init__(self, d_model, n_heads, seed=0):
    """Create the projection matrices.

    Args:
      d_model: Model width; must be divisible by ``n_heads``.
      n_heads: Number of attention heads.
      seed: Seed for the ``RandomState`` that draws the weights
        (normal distribution, standard deviation 0.02).

    Raises:
      ValueError: If ``d_model`` is not divisible by ``n_heads``.
    """
    if d_model % n_heads != 0:
      raise ValueError(
          f"d_model ({d_model}) must be divisible by n_heads ({n_heads})"
      )
    rng = np.random.RandomState(seed)
    self.d_model = d_model
    self.n_heads = n_heads
    self.head_dim = d_model // n_heads
    # Draw order (q, k, v, o) determines the values obtained for a given seed.
    self.W_q = rng.normal(scale=0.02, size=(d_model, d_model))
    self.W_k = rng.normal(scale=0.02, size=(d_model, d_model))
    self.W_v = rng.normal(scale=0.02, size=(d_model, d_model))
    self.W_o = rng.normal(scale=0.02, size=(d_model, d_model))
    self.attn = ScaledDotProductAttention()

  def split_heads(self, x):
    """Reshape (batch, seq, d_model) into (batch, n_heads, seq, head_dim)."""
    b, s, _ = x.shape
    return x.reshape(b, s, self.n_heads, self.head_dim).transpose(0, 2, 1, 3)

  def combine_heads(self, x):
    """Reshape (batch, n_heads, seq, head_dim) back into (batch, seq, d_model)."""
    b = x.shape[0]
    return x.transpose(0, 2, 1, 3).reshape(b, x.shape[2], self.d_model)

  def __call__(self, x, mask=None):
    """Run self-attention over ``x``.

    Args:
      x: Input of shape (batch, seq, d_model).
      mask: Optional 2-D additive mask; it gains two leading singleton axes
        so it broadcasts over batch and heads.

    Returns:
      Tuple ``(out, weights)``: ``out`` has shape (batch, seq, d_model);
      ``weights`` is whatever ``self.attn`` returns as attention weights.
    """
    Q, K, V = x @ self.W_q, x @ self.W_k, x @ self.W_v
    Qh, Kh, Vh = self.split_heads(Q), self.split_heads(K), self.split_heads(V)
    if mask is not None:
      mask = mask[None, None, :, :]
    # Attention must run whether or not a mask was supplied.
    out_h, weights = self.attn(Qh, Kh, Vh, mask)
    out = self.combine_heads(out_h) @ self.W_o
    return out, weights


**Feed Forward Network**

In [None]:
class FeedForward:
  """Position-wise two-layer MLP with a GELU non-linearity."""

  def __init__(self, d_model, d_ff, seed=0):
    """Create the weights and zero biases of both linear layers.

    Args:
      d_model: Input and output width.
      d_ff: Hidden width.
      seed: Seed for the ``RandomState`` that draws ``W1`` and then ``W2``
        (normal distribution, standard deviation 0.02).
    """
    rng = np.random.RandomState(seed)
    # W1 must be drawn before W2 to keep the values tied to the seed.
    self.W1 = rng.normal(scale=0.02, size=(d_model, d_ff))
    self.W2 = rng.normal(scale=0.02, size=(d_ff, d_model))
    self.b1 = np.zeros((d_ff,))
    self.b2 = np.zeros((d_model,))

  def __call__(self, x):
    """Return ``gelu(x @ W1 + b1) @ W2 + b2``."""
    hidden = gelu(np.matmul(x, self.W1) + self.b1)
    return np.matmul(hidden, self.W2) + self.b2

**Decoder Block**

In [None]:
class DecoderBlock:
  """Pre-norm decoder layer: self-attention then feed-forward, each residual."""

  def __init__(self, d_model, n_heads, d_ff, seed=0):
    """Build the sub-layers.

    The attention sub-layer is seeded with ``seed`` and the feed-forward
    sub-layer with ``seed + 1``.
    """
    self.ln1 = LayerNorm(d_model)
    self.mha = MultiHeadAttention(d_model, n_heads, seed=seed)
    self.ln2 = LayerNorm(d_model)
    self.ffn = FeedForward(d_model, d_ff, seed=seed+1)

  def __call__(self, x, mask):
    """Apply the block to ``x``.

    Args:
      x: Input activations.
      mask: Attention mask forwarded unchanged to the attention sub-layer.

    Returns:
      Tuple ``(y, attn)``: the block output and the attention weights
      returned by the attention sub-layer.
    """
    # Residual connection around normalized self-attention.
    attn_out, attn = self.mha(self.ln1(x), mask)
    h = x + attn_out
    # Residual connection around the normalized feed-forward network.
    return h + self.ffn(self.ln2(h)), attn

**Transformer Decoder**

In [None]:
class TransformerDecoder:
  """Decoder-only Transformer producing next-token logits."""

  def __init__(self, vocab_size, d_model=64, n_heads=4, n_layers=2, d_ff=128, max_len=100, seed=0):
    """Assemble embeddings, decoder blocks, final norm and output projection.

    Block ``i`` is seeded with ``seed + i * 10``; the output projection uses
    ``seed + 999`` (normal distribution, standard deviation 0.02).
    """
    self.embedding = TokenEmbedding(vocab_size, d_model, seed)
    self.positional = SinusoidalPositionalEncoding(d_model, max_len)
    self.layers = [
        DecoderBlock(d_model, n_heads, d_ff, seed + layer_idx * 10)
        for layer_idx in range(n_layers)
    ]
    self.ln_final = LayerNorm(d_model)
    out_rng = np.random.RandomState(seed + 999)
    self.W_out = out_rng.normal(scale=0.02, size=(d_model, vocab_size))

  def forward(self, token_ids):
    """Run the model on a 2-D array of token ids.

    Args:
      token_ids: Integer array of shape (batch, seq_len).

    Returns:
      Tuple ``(logits, probs_next, attn_all)``:
        logits: Scores for every position, last axis of size vocab_size.
        probs_next: Softmax over the logits of the last position.
        attn_all: List with the attention weights of each layer, in order.
    """
    _, seq_len = token_ids.shape
    hidden = self.embedding(token_ids) + self.positional(seq_len)
    mask = causal_mask(seq_len)

    attn_all = []
    for block in self.layers:
      hidden, layer_attn = block(hidden, mask)
      attn_all.append(layer_attn)

    logits = self.ln_final(hidden) @ self.W_out
    last_step_logits = logits[:, -1, :]
    return logits, softmax(last_step_logits, axis=-1), attn_all

In [None]:
if __name__ == "__main__":
  # Demo: one forward pass through a small, randomly initialised decoder.
  np.random.seed(42)

  # A batch holding a single sequence of six token ids.
  tokens = np.array([[1, 3, 5, 7, 9, 11]])
  print("Input tokens:\n", tokens)

  # Build the model with fixed hyper-parameters and a fixed weight seed.
  model = TransformerDecoder(vocab_size=100, d_model=64, n_heads=8, n_layers=4, d_ff=64, seed=123)

  # Forward pass: per-position logits, next-token distribution, attention maps.
  logits, probs_next, attn = model.forward(tokens)

  print("\n[Output]")
  print("Logits shape:", logits.shape, "(batch, seq_len, vocab_size)")
  print("Probs_next shape:", probs_next.shape, "(batch, vocab_size)")
  print("Sum of probs_next:", probs_next.sum())
  # Negating the probabilities makes argsort return the most likely ids first.
  print("Top-5 predicted token indices:", np.argsort(-probs_next[0])[:5])

Input tokens:
 [[ 1  3  5  7  9 11]]

[Output]
Logits shape: (1, 6, 100) (batch, seq_len, vocab_size)
Probs_next shape: (1, 100) (batch, vocab_size)
Sum of probs_next: 1.0
Top-5 predicted token indices: [14  7 51 78 93]
