#### Import Library

In [10]:
import numpy as np

#### Token Embedding

In [9]:
def token_embedding(tokens, embedding_matrix):
    """Mengambil vektor embedding untuk setiap token."""
    return embedding_matrix[tokens]

#### Positional Encoding

In [None]:
def positional_encoding(seq_len, d_model):
    """Membuat matriks sinusoidal positional encoding."""
    positions = np.arange(seq_len)[:, np.newaxis]
    div_term = np.exp(np.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))
    
    pe = np.zeros((seq_len, d_model))
    pe[:, 0::2] = np.sin(positions * div_term)
    pe[:, 1::2] = np.cos(positions * div_term)
    
    return pe[np.newaxis, :, :]

#### Scaled Dot Product

In [None]:
def scaled_dot_product_attention(Q, K, V, mask=None):
    """Menghitung Scaled Dot-Product Attention."""
    d_k = Q.shape[-1]
    scores = (Q @ K.transpose(0, 1, 3, 2)) / np.sqrt(d_k)
    
    if mask is not None:
        scores += mask
        
    # Softmax
    attention_weights = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
    attention_weights /= np.sum(attention_weights, axis=-1, keepdims=True)
    
    output = attention_weights @ V
    return output, attention_weights

#### Multi Head Attention

In [None]:
class MultiHeadAttention:
    """Implementasi Multi-Head Attention."""
    def __init__(self, d_model, num_heads):
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        
        # Inisialisasi bobot
        self.W_q = np.random.randn(d_model, d_model)
        self.W_k = np.random.randn(d_model, d_model)
        self.W_v = np.random.randn(d_model, d_model)
        self.W_o = np.random.randn(d_model, d_model)
        
    def forward(self, x, mask):
        batch_size, seq_len, _ = x.shape
        
        # Proyeksi linear
        Q_proj = x @ self.W_q
        K_proj = x @ self.W_k
        V_proj = x @ self.W_v
        
        # Reshape dan transpose untuk multi-head
        Q = Q_proj.reshape(batch_size, seq_len, self.num_heads, self.d_k).transpose(0, 2, 1, 3)
        K = K_proj.reshape(batch_size, seq_len, self.num_heads, self.d_k).transpose(0, 2, 1, 3)
        V = V_proj.reshape(batch_size, seq_len, self.num_heads, self.d_k).transpose(0, 2, 1, 3)
        
        # Jalankan scaled dot-product attention
        attention_output, attention_weights = scaled_dot_product_attention(Q, K, V, mask)
        
        # Concat dan proyeksi akhir
        concatenated = attention_output.transpose(0, 2, 1, 3).reshape(batch_size, seq_len, self.d_model)
        output_mha = concatenated @ self.W_o
        
        return output_mha, attention_weights

#### Feed Forward Network

In [14]:
class FeedForwardNetwork:
    """Implementasi Feed-Forward Network."""
    def __init__(self, d_model, d_ff):
        # Inisialisasi bobot dan bias
        self.W1 = np.random.randn(d_model, d_ff)
        self.b1 = np.zeros(d_ff)
        self.W2 = np.random.randn(d_ff, d_model)
        self.b2 = np.zeros(d_model)
        
    def forward(self, x):
        # Lapisan pertama dengan aktivasi ReLU
        hidden = np.maximum(0, x @ self.W1 + self.b1)
        # Lapisan kedua
        output = hidden @ self.W2 + self.b2
        return output

#### Layer Normalization

In [None]:
class LayerNormalization:
    """Implementasi Layer Normalization."""
    def __init__(self, epsilon=1e-5):
        self.epsilon = epsilon
        
    def forward(self, x):
        mean = x.mean(axis=-1, keepdims=True)
        std = x.std(axis=-1, keepdims=True)
        return (x - mean) / (std + self.epsilon)

#### Causal Masking

In [None]:
def create_causal_mask(size):
    """Membuat mask segitiga atas untuk mencegah atensi ke token masa depan."""
    mask = np.triu(np.ones((1, size, size)), k=1).astype(bool)
    return np.where(mask, -np.inf, 0)

[[[  0. -inf -inf -inf -inf]
  [  0.   0. -inf -inf -inf]
  [  0.   0.   0. -inf -inf]
  [  0.   0.   0.   0. -inf]
  [  0.   0.   0.   0.   0.]]]


#### Output Layer

In [17]:
class OutputLayer:
    """Implementasi lapisan output akhir."""
    def __init__(self, d_model, vocab_size):
        self.projection = np.random.randn(d_model, vocab_size)
        
    def forward(self, x):
        # Proyeksikan semua token ke ukuran kosakata
        all_logits = x @ self.projection
        
        # Ambil logits dari token terakhir untuk prediksi
        last_token_logits = all_logits[:, -1, :]
        
        # Softmax untuk mendapatkan probabilitas
        exp_logits = np.exp(last_token_logits - np.max(last_token_logits, axis=-1, keepdims=True))
        next_token_probs = exp_logits / np.sum(exp_logits, axis=-1, keepdims=True)
        
        return all_logits, next_token_probs

#### Run Forward Pass

In [None]:
# Persiapan Input ---
input_tokens = np.array([[10, 25, 150, 5, 8]])
batch_size, seq_len = input_tokens.shape

# Inisialisasi Semua Komponen Modular ---
embedding_matrix = np.random.randn(vocab_size, d_model)
mha_layer = MultiHeadAttention(d_model, num_heads)
ffn_layer = FeedForwardNetwork(d_model, d_ff)
norm1_layer = LayerNormalization()
norm2_layer = LayerNormalization()
output_layer = OutputLayer(d_model, vocab_size)

# Menjalankan Alur Forward Pass (Satu Blok Decoder) ---

# Embedding & Positional Encoding
x = token_embedding(input_tokens, embedding_matrix)
x += positional_encoding(seq_len, d_model)

# Causal Mask
mask = create_causal_mask(seq_len)

# --- Arsitektur Pre-Norm ---
# Blok Multi-Head Attention
norm_x = norm1_layer.forward(x)
attn_output, weights = mha_layer.forward(norm_x, mask)
x = x + attn_output  # Residual connection

# Blok Feed-Forward
norm_x = norm2_layer.forward(x)
ffn_output = ffn_layer.forward(norm_x)
x = x + ffn_output  # Residual connection

# Output
final_logits, final_probs = output_layer.forward(x)

print("Shape Input Tokens:", input_tokens.shape)
print("Shape Output Akhir (setelah FFN):", x.shape)
print("-" * 30)
print("Output Casual Masking Untuk Sequence Dengan Panjang 5")
print(create_causal_mask(size=5))
print("-" * 30)
print("Shape Semua Logits:", final_logits.shape)
print("Shape Probabilitas Token Berikutnya:", final_probs.shape)
print("\nProbabilitas untuk token berikutnya (10 teratas):")
print(np.sort(final_probs[0])[::-1][:10])

Shape Input Tokens: (1, 5)
Shape Output Akhir (setelah FFN): (1, 5, 512)
------------------------------
Output Casual Masking Untuk Sequence Dengan Panjang 5
[[[  0. -inf -inf -inf -inf]
  [  0.   0. -inf -inf -inf]
  [  0.   0.   0. -inf -inf]
  [  0.   0.   0.   0. -inf]
  [  0.   0.   0.   0.   0.]]]
------------------------------
Shape Semua Logits: (1, 5, 1000)
Shape Probabilitas Token Berikutnya: (1, 1000)

Probabilitas untuk token berikutnya (10 teratas):
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
