In [None]:
import numpy as np
from datasets import load_dataset
import re
import numpy as np
from collections import Counter
import time
from tqdm import tqdm
from rouge_score import rouge_scorer

In [None]:
class SimpleTokenizer:
    """Word-level tokenizer with a frequency-ranked, size-capped vocabulary.

    Text is lower-cased, every character that is neither a word character
    nor whitespace is replaced by a space, and the result is split on
    whitespace.  Ids 0-3 are reserved for ``<pad>``, ``<unk>``, ``<bos>``
    and ``<eos>``.
    """

    def __init__(self, vocab_size=32000, min_freq=2):
        """
        Initialize simple tokenizer.

        Args:
            vocab_size (int): Maximum vocabulary size (special tokens included)
            min_freq (int): Minimum frequency for a token to be included
        """
        self.vocab_size = vocab_size
        self.min_freq = min_freq

        # Special tokens
        self.pad_token = "<pad>"
        self.unk_token = "<unk>"
        self.bos_token = "<bos>"
        self.eos_token = "<eos>"

        # The vocabulary starts out holding only the special tokens.
        self.word2idx = {
            self.pad_token: 0,
            self.unk_token: 1,
            self.bos_token: 2,
            self.eos_token: 3,
        }
        self.idx2word = {v: k for k, v in self.word2idx.items()}

    def preprocess_text(self, text):
        """
        Normalize ``text`` and split it into words.

        Steps: lowercase, replace non-word/non-space characters by a space,
        collapse whitespace runs, split.

        Args:
            text (str): Raw input text

        Returns:
            list[str]: The words of the normalized text (empty for blank input)
        """
        text = text.lower()
        text = re.sub(r"[^\w\s]", " ", text)
        text = re.sub(r"\s+", " ", text).strip()
        return text.split()

    def build_vocab(self, texts):
        """
        Build vocabulary from texts.

        Words are ranked by descending frequency (ties broken alphabetically)
        and appended to the vocabulary until it holds ``vocab_size`` entries
        or the remaining words occur fewer than ``min_freq`` times.

        Args:
            texts: Iterable of text strings
        """
        word_freq = Counter()
        for text in texts:
            word_freq.update(self.preprocess_text(text))

        ranked = sorted(word_freq.items(), key=lambda item: (-item[1], item[0]))

        for word, freq in ranked:
            # The ranking is by descending frequency, so once a word is too
            # rare every later word is too rare as well.
            if len(self.word2idx) >= self.vocab_size or freq < self.min_freq:
                break
            idx = len(self.word2idx)
            self.word2idx[word] = idx
            self.idx2word[idx] = word

    def encode(self, text, add_special_tokens=True):
        """
        Encode text to token indices.

        Args:
            text (str): Input text
            add_special_tokens (bool): Whether to add BOS/EOS tokens

        Returns:
            np.ndarray: 1-D int64 array of token ids; out-of-vocabulary words
            map to the ``<unk>`` id.
        """
        unk_id = self.word2idx[self.unk_token]
        indices = [
            self.word2idx.get(word, unk_id) for word in self.preprocess_text(text)
        ]

        if add_special_tokens:
            indices = (
                [self.word2idx[self.bos_token]]
                + indices
                + [self.word2idx[self.eos_token]]
            )

        # An explicit dtype keeps an empty encoding integer-typed; without it
        # NumPy would default the empty array to float64.
        return np.array(indices, dtype=np.int64)

    def decode(self, indices):
        """
        Decode token indices to text.

        ``<pad>``, ``<bos>`` and ``<eos>`` are dropped, as are ids that are
        not in the vocabulary; ``<unk>`` is kept.

        Args:
            indices: Iterable of token indices

        Returns:
            str: The decoded words joined by single spaces
        """
        skipped = (self.pad_token, self.bos_token, self.eos_token)
        words = []
        for idx in indices:
            word = self.idx2word.get(idx)
            if word is None or word in skipped:
                continue
            words.append(word)

        return " ".join(words)

    def pad_sequence(self, sequence, max_length):
        """
        Pad (with the ``<pad>`` id) or truncate a sequence to ``max_length``.

        Args:
            sequence: Array of token indices
            max_length: Target sequence length

        Returns:
            np.ndarray: int64 array of exactly ``max_length`` ids
        """
        sequence = np.asarray(sequence, dtype=np.int64)
        if len(sequence) > max_length:
            return sequence[:max_length]

        # The padding is built as an int64 array: concatenating with an empty
        # Python list (the case len(sequence) == max_length) would otherwise
        # promote the whole result to float64.
        padding = np.full(
            max_length - len(sequence),
            self.word2idx[self.pad_token],
            dtype=np.int64,
        )
        return np.concatenate([sequence, padding])


In [None]:
class XLSumDataset:
    """Tokenized, fixed-length view of the Indonesian XL-Sum dataset.

    Construction loads the dataset, fits a ``SimpleTokenizer`` on the
    training split and pre-encodes the train/validation/test splits.
    """

    def __init__(self, max_seq_length=512, batch_size=32, vocab_size=32000):
        """
        Load the data, build the vocabulary and encode every split.

        Args:
            max_seq_length (int): Length every encoded sequence is padded or
                truncated to
            batch_size (int): Number of examples returned by ``get_batch``
            vocab_size (int): Vocabulary size passed to the tokenizer
        """
        self.max_seq_length = max_seq_length
        self.batch_size = batch_size
        self.vocab_size = vocab_size

        self.tokenizer = SimpleTokenizer(vocab_size=vocab_size)
        self.dataset = load_dataset("csebuetnlp/xlsum", "indonesian", cache_dir="cache")

        # The vocabulary has to exist before any split can be encoded.
        self._build_vocab()

        self.train_data = self._prepare_data("train")
        self.validation_data = self._prepare_data("validation")
        self.test_data = self._prepare_data("test")

    def _build_vocab(self):
        """Fit the tokenizer on the articles and summaries of the train split."""
        corpus = [
            record[field]
            for record in self.dataset["train"]
            for field in ("text", "summary")
        ]
        self.tokenizer.build_vocab(corpus)
        print(f"Vocabulary size: {len(self.tokenizer.word2idx)}")

    def _prepare_data(self, split):
        """
        Encode and pad every example of one split.

        Args:
            split (str): Dataset split ("train", "validation", or "test")

        Returns:
            list[dict]: One ``{"text": ids, "summary": ids}`` entry per
            example, each id sequence of length ``max_seq_length``
        """
        tokenizer = self.tokenizer
        target_length = self.max_seq_length

        def to_fixed_ids(raw_text):
            # Encode (with BOS/EOS), then pad or truncate to the fixed length.
            return tokenizer.pad_sequence(tokenizer.encode(raw_text), target_length)

        return [
            {
                "text": to_fixed_ids(record["text"]),
                "summary": to_fixed_ids(record["summary"]),
            }
            for record in self.dataset[split]
        ]

    def get_batch(self, split="train"):
        """
        Draw a random batch (sampled without replacement) from one split.

        Args:
            split (str): Dataset split ("train", "validation", or "test")

        Returns:
            tuple[np.ndarray, np.ndarray]: int64 arrays ``(text, summary)``,
            each with ``batch_size`` rows

        Raises:
            ValueError: If ``split`` is not a known split name
        """
        known_splits = ("train", "validation", "test")
        if split not in known_splits:
            raise ValueError(
                f"Invalid split: {split}. Must be one of {list(known_splits)}"
            )

        examples = getattr(self, f"{split}_data")
        chosen = np.random.choice(len(examples), self.batch_size, replace=False)
        picked = [examples[i] for i in chosen]

        def stack_field(field):
            # Row-stack one field of the sampled examples as int64.
            return np.stack([example[field] for example in picked]).astype(np.int64)

        return stack_field("text"), stack_field("summary")

    def decode_batch(self, indices_batch):
        """
        Decode a batch of token indices to text.

        Args:
            indices_batch: Iterable of token-id sequences

        Returns:
            list[str]: One decoded string per sequence
        """
        return [self.tokenizer.decode(indices) for indices in indices_batch]


In [None]:
class MultiHeadAttention:
    """Multi-head scaled dot-product attention with a manual backward pass.

    ``forward`` caches every intermediate it produces in ``self.cache``;
    ``backward`` consumes that cache, stores the parameter gradients as
    ``dW_*`` / ``db_*`` attributes and returns the gradients with respect to
    the three inputs.  Dropout is applied on every forward pass whenever
    ``dropout > 0`` (the class has no train/eval switch).
    """

    def __init__(self, d_model, num_heads, dropout=0.1):
        """
        Initialize Multi-Head Attention.

        Args:
            d_model (int): Model dimension
            num_heads (int): Number of attention heads; must divide d_model
            dropout (float): Dropout rate applied to the attention weights
        """
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.dropout = dropout

        # Projection weights: small Gaussian initialisation.
        self.W_q = np.random.normal(0, 0.02, (d_model, d_model))
        self.W_k = np.random.normal(0, 0.02, (d_model, d_model))
        self.W_v = np.random.normal(0, 0.02, (d_model, d_model))
        self.W_o = np.random.normal(0, 0.02, (d_model, d_model))

        # Projection biases.
        self.b_q = np.zeros(d_model)
        self.b_k = np.zeros(d_model)
        self.b_v = np.zeros(d_model)
        self.b_o = np.zeros(d_model)

        # Intermediates saved by forward() for backward().
        self.cache = {}

    def split_heads(self, x, batch_size):
        """Reshape (batch, seq, d_model) into (batch, num_heads, seq, d_k)."""
        x = x.reshape(batch_size, -1, self.num_heads, self.d_k)
        return np.transpose(x, (0, 2, 1, 3))

    def combine_heads(self, x, batch_size):
        """Inverse of split_heads: (batch, num_heads, seq, d_k) -> (batch, seq, d_model)."""
        x = np.transpose(x, (0, 2, 1, 3))
        return x.reshape(batch_size, -1, self.d_model)

    def scaled_dot_product_attention(self, q, k, v, mask=None):
        """
        Calculate scaled dot-product attention.

        Args:
            q: Query of shape (batch, heads, seq_len_q, d_k)
            k: Key of shape (batch, heads, seq_len_k, d_k)
            v: Value of shape (batch, heads, seq_len_k, d_k)
            mask: Optional array broadcastable to
                (batch, heads, seq_len_q, seq_len_k); non-zero entries mark
                positions to suppress (they receive a -1e9 logit offset)

        Returns:
            tuple: ``(output, attention_weights)`` where the returned weights
            are the ones actually used, i.e. after dropout.
        """
        matmul_qk = np.matmul(q, np.transpose(k, (0, 1, 3, 2)))
        self.cache["matmul_qk"] = matmul_qk

        scaled_attention_logits = matmul_qk / np.sqrt(self.d_k)
        self.cache["scaled_attention_logits"] = scaled_attention_logits

        if mask is not None:
            # Out-of-place add: an in-place "+=" would also overwrite the
            # logits that were just cached, since both names alias one array.
            scaled_attention_logits = scaled_attention_logits + mask * -1e9
            self.cache["mask"] = mask
        else:
            self.cache.pop("mask", None)

        # Softmax is normalized on the last axis (seq_len_k).
        attention_weights = self.softmax(scaled_attention_logits, axis=-1)
        self.cache["attention_weights"] = attention_weights  # pre-dropout

        if self.dropout > 0:
            dropout_mask = np.random.binomial(
                1, 1 - self.dropout, size=attention_weights.shape
            ) / (1 - self.dropout)
            attention_weights = attention_weights * dropout_mask
            self.cache["dropout_mask"] = dropout_mask
        else:
            # Do not leave a mask from an earlier pass behind.
            self.cache.pop("dropout_mask", None)

        output = np.matmul(attention_weights, v)
        return output, attention_weights

    def softmax(self, x, axis=-1):
        """Numerically stable softmax of x along ``axis``."""
        e_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
        return e_x / np.sum(e_x, axis=axis, keepdims=True)

    def dropout_layer(self, x):
        """Apply inverted dropout to x (returns x unchanged when dropout <= 0)."""
        if self.dropout > 0:
            mask = np.random.binomial(1, 1 - self.dropout, size=x.shape) / (
                1 - self.dropout
            )
            return x * mask
        return x

    def forward(self, q, k, v, mask=None):
        """
        Forward pass of multi-head attention.

        Args:
            q: Query input of shape (batch, seq_len_q, d_model)
            k: Key input of shape (batch, seq_len_k, d_model)
            v: Value input of shape (batch, seq_len_k, d_model)
            mask: Optional mask for attention (see
                scaled_dot_product_attention)

        Returns:
            tuple: ``(output, attention_weights)`` with output of shape
            (batch, seq_len_q, d_model) and weights of shape
            (batch, heads, seq_len_q, seq_len_k)
        """
        batch_size = q.shape[0]

        # Raw inputs are needed for the projection-weight gradients.
        self.cache["q"] = q
        self.cache["k"] = k
        self.cache["v"] = v

        # Linear projections.
        q = np.matmul(q, self.W_q) + self.b_q
        k = np.matmul(k, self.W_k) + self.b_k
        v = np.matmul(v, self.W_v) + self.b_v
        self.cache["q_proj"] = q
        self.cache["k_proj"] = k
        self.cache["v_proj"] = v

        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)
        self.cache["q_heads"] = q
        self.cache["k_heads"] = k
        self.cache["v_heads"] = v

        scaled_attention, attention_weights = self.scaled_dot_product_attention(
            q, k, v, mask
        )

        concat_attention = self.combine_heads(scaled_attention, batch_size)
        self.cache["concat_attention"] = concat_attention

        # Final linear projection.
        output = np.matmul(concat_attention, self.W_o) + self.b_o

        return output, attention_weights

    def _align_upstream_gradient(self, dout):
        """Coerce ``dout`` to the (batch, seq_len_q, d_model) shape of the cached query.

        This reproduces the shape-repair rules the backward pass has always
        applied: a 4-D gradient is flattened to 3-D, a batch-size mismatch is
        regrouped and averaged, and a sequence-length mismatch is zero-padded
        or truncated.
        NOTE(review): these repairs are heuristics that hide shape errors
        made by the caller; confirm whether any caller still relies on them.

        Raises:
            ValueError: If the batch size cannot be regrouped or the model
                dimension differs from the cached query's.
        """
        expected_batch_size = self.cache["q"].shape[0]
        expected_seq_len = self.cache["q"].shape[1]
        expected_d_model = self.cache["q"].shape[2]

        if len(dout.shape) == 4:
            dout = dout.reshape(-1, dout.shape[-2], dout.shape[-1])

        if dout.shape[0] != expected_batch_size:
            if dout.shape[0] % expected_batch_size != 0:
                raise ValueError(
                    f"Cannot reshape dout from shape {dout.shape} to batch size {expected_batch_size}"
                )
            regrouped = dout.reshape(expected_batch_size, -1, dout.shape[-1])
            # Average over the regrouped positions, then tile the average.
            dout = regrouped.mean(axis=1, keepdims=True).repeat(
                regrouped.shape[1] // expected_batch_size, axis=1
            )

        if dout.shape[1] < expected_seq_len:
            padding = np.zeros(
                (dout.shape[0], expected_seq_len - dout.shape[1], dout.shape[2])
            )
            dout = np.concatenate([dout, padding], axis=1)
        elif dout.shape[1] > expected_seq_len:
            dout = dout[:, :expected_seq_len, :]

        if dout.shape[2] != expected_d_model:
            raise ValueError(
                f"Model dimension mismatch: got {dout.shape[2]}, expected {expected_d_model}"
            )

        return dout

    def backward(self, dout):
        """
        Backward pass of multi-head attention.

        Must be called after ``forward``; it reads the intermediates cached
        there.  Parameter gradients are stored on the instance as ``dW_q``,
        ``db_q``, ``dW_k``, ``db_k``, ``dW_v``, ``db_v``, ``dW_o``, ``db_o``.

        Args:
            dout: Gradient of loss w.r.t. output of shape
                (batch_size, seq_length, d_model); other shapes are coerced
                by ``_align_upstream_gradient``

        Returns:
            tuple: ``(dq, dk, dv)``, gradients w.r.t. the three inputs that
            were passed to ``forward``

        Raises:
            ValueError: If ``dout`` cannot be aligned with the cached shapes
        """
        dout = self._align_upstream_gradient(dout)
        batch_size = dout.shape[0]

        q = self.cache["q"]
        k = self.cache["k"]
        v = self.cache["v"]
        q_heads = self.cache["q_heads"]  # (batch, heads, seq_len_q, d_k)
        k_heads = self.cache["k_heads"]  # (batch, heads, seq_len_k, d_k)
        v_heads = self.cache["v_heads"]  # (batch, heads, seq_len_k, d_k)
        concat_attention = self.cache["concat_attention"]
        # Softmax output *before* dropout: (batch, heads, seq_len_q, seq_len_k)
        attention_weights = self.cache["attention_weights"]

        # --- output projection ------------------------------------------
        # NOTE(review): the batched matmul leaves a leading batch axis on the
        # weight gradients, i.e. (batch, d_model, d_model).  That shape is
        # kept as-is because the code consuming dW_* is not visible here;
        # confirm whether it reduces over the batch axis.
        dW_o = np.matmul(concat_attention.transpose(0, 2, 1), dout)
        db_o = np.sum(dout, axis=(0, 1))  # (d_model,)
        dconcat = np.matmul(dout, self.W_o.T)  # (batch, seq_len_q, d_model)

        # --- undo combine_heads -----------------------------------------
        dscaled_attention = self.split_heads(dconcat, batch_size)

        # --- output = dropped_weights @ v -------------------------------
        dropout_mask = self.cache.get("dropout_mask") if self.dropout > 0 else None
        if dropout_mask is None:
            applied_weights = attention_weights
        else:
            # The forward pass multiplied v by the weights *after* dropout,
            # so the value gradient must use those same weights.
            applied_weights = attention_weights * dropout_mask

        dv_heads = np.matmul(np.swapaxes(applied_weights, -1, -2), dscaled_attention)
        dattention_weights = np.matmul(
            dscaled_attention, np.swapaxes(v_heads, -1, -2)
        )  # (batch, heads, seq_len_q, seq_len_k)

        # --- dropout ----------------------------------------------------
        if dropout_mask is not None:
            dattention_weights = dattention_weights * dropout_mask

        # --- softmax ----------------------------------------------------
        # Full Jacobian-vector product of a row-wise softmax:
        #   dz_i = a_i * (da_i - sum_j da_j * a_j)
        # Using only a * (1 - a) would keep just the Jacobian's diagonal.
        # The additive attention mask has no effect on this gradient.
        row_dot = np.sum(dattention_weights * attention_weights, axis=-1, keepdims=True)
        dscaled_logits = attention_weights * (dattention_weights - row_dot)

        # --- 1/sqrt(d_k) scaling ----------------------------------------
        dmatmul_qk = dscaled_logits / np.sqrt(self.d_k)

        # --- logits = q_heads @ k_heads^T -------------------------------
        dq_heads = np.matmul(dmatmul_qk, k_heads)
        dk_heads = np.matmul(np.swapaxes(dmatmul_qk, -1, -2), q_heads)

        # --- undo split_heads -------------------------------------------
        dq_proj = self.combine_heads(dq_heads, batch_size)
        dk_proj = self.combine_heads(dk_heads, batch_size)
        dv_proj = self.combine_heads(dv_heads, batch_size)

        # --- input projections ------------------------------------------
        dW_q = np.matmul(q.transpose(0, 2, 1), dq_proj)
        db_q = np.sum(dq_proj, axis=(0, 1))
        dq = np.matmul(dq_proj, self.W_q.T)

        dW_k = np.matmul(k.transpose(0, 2, 1), dk_proj)
        db_k = np.sum(dk_proj, axis=(0, 1))
        dk = np.matmul(dk_proj, self.W_k.T)

        dW_v = np.matmul(v.transpose(0, 2, 1), dv_proj)
        db_v = np.sum(dv_proj, axis=(0, 1))
        dv = np.matmul(dv_proj, self.W_v.T)

        # Store gradients for parameter updates.
        self.dW_q = dW_q
        self.db_q = db_q
        self.dW_k = dW_k
        self.db_k = db_k
        self.dW_v = dW_v
        self.db_v = db_v
        self.dW_o = dW_o
        self.db_o = db_o

        return dq, dk, dv

In [None]:
class PositionalEncoding:
    """Fixed sinusoidal positional encoding.

    Even feature columns hold ``sin(pos * w_i)`` and odd columns hold
    ``cos(pos * w_i)`` with ``w_i = 10000 ** (-2i / d_model)``.  The table
    has no trainable parameters.
    """

    def __init__(self, d_model, max_seq_length=5000):
        """
        Initialize positional encoding.

        Args:
            d_model (int): Model dimension (even or odd)
            max_seq_length (int): Maximum sequence length
        """
        self.d_model = d_model
        self.max_seq_length = max_seq_length

        position = np.arange(max_seq_length)[:, np.newaxis]
        div_term = np.exp(np.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))
        angles = position * div_term  # (max_seq_length, ceil(d_model / 2))

        pe = np.zeros((max_seq_length, d_model))
        pe[:, 0::2] = np.sin(angles)
        # For an odd d_model there is one fewer odd column than even columns,
        # so the last frequency has no cosine slot and is dropped here.
        pe[:, 1::2] = np.cos(angles[:, : d_model // 2])

        self.pe = pe[np.newaxis, :, :]  # Shape: (1, max_seq_length, d_model)

    def forward(self, x):
        """
        Add positional encoding to input embeddings.

        Args:
            x: Input tensor of shape (batch_size, seq_length, d_model)

        Returns:
            Array of the same shape as ``x``

        Raises:
            ValueError: If seq_length exceeds ``max_seq_length``
        """
        seq_length = x.shape[1]
        if seq_length > self.max_seq_length:
            raise ValueError(
                f"Sequence length {seq_length} exceeds max_seq_length "
                f"{self.max_seq_length}"
            )
        return x + self.pe[:, :seq_length, :]


class Embeddings:
    """Token embedding table followed by positional encoding and dropout.

    ``forward`` looks token ids up in ``token_embeddings``, scales the result
    by ``sqrt(d_model)``, adds the positional encoding and applies inverted
    dropout.  ``backward`` accumulates the gradient of the embedding table in
    ``d_token_embeddings``.
    """

    def __init__(self, vocab_size, d_model, max_seq_length=5000, dropout=0.1):
        """
        Initialize token embeddings and positional encoding.

        Args:
            vocab_size (int): Size of vocabulary
            d_model (int): Model dimension
            max_seq_length (int): Maximum sequence length
            dropout (float): Dropout rate
        """
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.dropout = dropout

        # Embedding table: small Gaussian initialisation.
        self.token_embeddings = np.random.normal(0, 0.02, (vocab_size, d_model))

        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        # Intermediates saved by forward() for backward().
        self.cache = {}

    def dropout_layer(self, x):
        """Apply inverted dropout to x (returns x unchanged when dropout <= 0)."""
        if self.dropout > 0:
            mask = np.random.binomial(1, 1 - self.dropout, size=x.shape) / (
                1 - self.dropout
            )
            return x * mask
        return x

    def forward(self, x):
        """
        Forward pass of embeddings.

        Args:
            x: Integer array of shape (batch_size, seq_length) containing
                token indices

        Returns:
            Array of shape (batch_size, seq_length, d_model)
        """
        self.cache["x"] = x

        embeddings = self.token_embeddings[x]  # (batch_size, seq_length, d_model)
        self.cache["embeddings"] = embeddings

        embeddings = embeddings * np.sqrt(self.d_model)
        self.cache["scaled_embeddings"] = embeddings

        embeddings = self.positional_encoding.forward(embeddings)
        self.cache["embeddings_before_dropout"] = embeddings

        if self.dropout > 0:
            dropout_mask = np.random.binomial(
                1, 1 - self.dropout, size=embeddings.shape
            ) / (1 - self.dropout)
            embeddings = embeddings * dropout_mask
            self.cache["dropout_mask"] = dropout_mask
        else:
            # Drop any mask left over from an earlier pass with dropout on;
            # backward() would otherwise apply it to this pass's gradient.
            self.cache.pop("dropout_mask", None)

        return embeddings

    def backward(self, dout):
        """
        Backward pass of embeddings.

        The embedding-table gradient is stored in ``self.d_token_embeddings``
        with shape (vocab_size, d_model).

        Args:
            dout: Gradient of loss w.r.t. output of shape (batch_size, seq_length, d_model)
                 or (num_layers, batch_size, seq_length, d_model); a 4-D
                 gradient is summed over its first axis

        Returns:
            None: the input is discrete token ids, so there is no input
            gradient to propagate

        Raises:
            ValueError: If the batch/sequence dimensions of ``dout`` do not
                match the cached token ids
        """
        x = self.cache["x"]  # Input token indices
        dropout_mask = self.cache.get("dropout_mask")

        if len(dout.shape) == 4:
            dout = np.sum(dout, axis=0)  # Shape: (batch_size, seq_length, d_model)

        assert (
            len(dout.shape) == 3
        ), f"Expected dout to have 3 dimensions, got {len(dout.shape)}"
        assert (
            dout.shape[2] == self.d_model
        ), f"Expected d_model dimension to be {self.d_model}, got {dout.shape[2]}"

        # Gradient through dropout.
        if dropout_mask is not None:
            dout = dout * dropout_mask

        # The positional encoding is an additive constant, so the gradient
        # passes through it unchanged.

        # Gradient through the sqrt(d_model) scaling.
        dout = dout * np.sqrt(self.d_model)

        if dout.shape[:2] != x.shape:
            raise ValueError(
                f"dout batch/sequence dimensions {dout.shape[:2]} do not match "
                f"token ids of shape {x.shape}"
            )

        # Scatter-add each position's gradient into the row of its token.
        # np.add.at is unbuffered, so a token that occurs several times
        # accumulates every one of its contributions.
        d_embeddings = np.zeros_like(
            self.token_embeddings
        )  # Shape: (vocab_size, d_model)
        np.add.at(d_embeddings, x, dout)

        self.d_token_embeddings = d_embeddings

        return None

In [None]:
class LayerNormalization:
    """Layer normalization over the last (feature) axis with learnable scale and shift."""

    def __init__(self, d_model, eps=1e-6):
        """
        Set up the scale/shift parameters.

        Args:
            d_model (int): Size of the normalized feature axis
            eps (float): Constant added to the variance for numerical stability
        """
        self.eps = eps
        self.gamma = np.ones(d_model)  # per-feature scale
        self.beta = np.zeros(d_model)  # per-feature shift
        self.cache = {}  # intermediates saved by forward() for backward()

    def forward(self, x):
        """
        Normalize ``x`` along its last axis, then scale and shift.

        Args:
            x: Input tensor of shape (batch_size, seq_length, d_model)

        Returns:
            Array with the same shape as ``x``
        """
        self.cache["x"] = x

        mean = np.mean(x, axis=-1, keepdims=True)
        var = np.var(x, axis=-1, keepdims=True)
        self.cache["mean"] = mean
        self.cache["var"] = var

        x_norm = (x - mean) / np.sqrt(var + self.eps)
        self.cache["x_norm"] = x_norm

        return self.gamma * x_norm + self.beta

    def backward(self, dout):
        """
        Propagate a gradient back through the normalization.

        The parameter gradients are stored as ``self.dgamma`` and
        ``self.dbeta``.

        Args:
            dout: Gradient of loss w.r.t. output of shape (batch_size, seq_length, d_model)
                 or (num_layers, batch_size, seq_length, d_model); a 4-D
                 gradient is summed over its first axis

        Returns:
            Gradient w.r.t. the input, shape (batch_size, seq_length, d_model)
        """
        saved = self.cache
        x, mean, var, x_norm = (saved[key] for key in ("x", "mean", "var", "x_norm"))
        batch_size, seq_len, d_model = x.shape

        if len(dout.shape) == 4:
            # Collapse the leading axis by summation.
            dout = dout.sum(axis=0)

        assert (
            len(dout.shape) == 3
        ), f"Expected dout to have 3 dimensions, got {len(dout.shape)}"
        assert (
            dout.shape[2] == d_model
        ), f"Expected d_model dimension to be {d_model}, got {dout.shape[2]}"
        assert (
            dout.shape[:2] == x_norm.shape[:2]
        ), f"Batch and sequence dimensions must match: dout {dout.shape[:2]} != x_norm {x_norm.shape[:2]}"

        # Parameter gradients: reduce over the batch and sequence axes.
        dbeta = dout.sum(axis=(0, 1))
        assert (
            dbeta.shape == self.beta.shape
        ), f"dbeta shape {dbeta.shape} != beta shape {self.beta.shape}"

        dgamma = (dout * x_norm).sum(axis=(0, 1))
        assert (
            dgamma.shape == self.gamma.shape
        ), f"dgamma shape {dgamma.shape} != gamma shape {self.gamma.shape}"

        centered = x - mean
        std = np.sqrt(var + self.eps)

        # Gradient w.r.t. the normalized activations.
        dx_norm = dout * self.gamma

        # Gradient w.r.t. the per-position variance, shape (batch, seq, 1).
        dvar = np.sum(
            dx_norm * centered * -0.5 * (var + self.eps) ** (-1.5),
            axis=-1,
            keepdims=True,
        )

        # Gradient w.r.t. the per-position mean, shape (batch, seq, 1).
        direct_term = np.sum(dx_norm * -1 / std, axis=-1, keepdims=True)
        via_variance = dvar * np.sum(-2 * centered, axis=-1, keepdims=True) / d_model
        dmean = direct_term + via_variance

        # Combine the three paths by which x influences the output.
        dx = dx_norm / std + dvar * 2 * centered / d_model + dmean / d_model

        self.dgamma = dgamma
        self.dbeta = dbeta

        return dx


class FeedForward:
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    The forward pass caches everything the backward pass needs (input,
    pre-activation, post-dropout activation and the sampled dropout mask),
    so ``backward`` must be called after the matching ``forward``.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        """
        Initialize feed-forward network.

        Args:
            d_model (int): Model dimension
            d_ff (int): Feed-forward dimension
            dropout (float): Dropout rate, must satisfy 0 <= dropout < 1

        Raises:
            ValueError: If dropout is outside [0, 1); a rate of 1 would
                divide by zero when rescaling the kept activations.
        """
        if not 0 <= dropout < 1:
            raise ValueError(f"dropout must be in [0, 1), got {dropout}")

        self.d_model = d_model
        self.d_ff = d_ff
        self.dropout = dropout

        # Initialize weights
        self.W1 = np.random.normal(0, 0.02, (d_model, d_ff))
        self.W2 = np.random.normal(0, 0.02, (d_ff, d_model))
        self.b1 = np.zeros(d_ff)
        self.b2 = np.zeros(d_model)

        # Cache for backward pass
        self.cache = {}

    def _sample_dropout_mask(self, shape):
        """Sample an inverted-dropout mask whose entries are 0 or 1 / (1 - dropout)."""
        keep_prob = 1 - self.dropout
        return np.random.binomial(1, keep_prob, size=shape) / keep_prob

    def dropout_layer(self, x):
        """Apply inverted dropout to x (identity when the dropout rate is 0)."""
        if self.dropout > 0:
            return x * self._sample_dropout_mask(x.shape)
        return x

    def relu(self, x):
        """ReLU activation function."""
        return np.maximum(0, x)

    def forward(self, x, training=True):
        """
        Forward pass of feed-forward network.

        Args:
            x: Input tensor of shape (batch_size, seq_length, d_model)
            training (bool): When False, dropout is skipped so the output is
                deterministic. Defaults to True, which keeps the behaviour
                of callers that do not pass the flag.

        Returns:
            Output tensor of shape (batch_size, seq_length, d_model)
        """
        # Store input for backward pass
        self.cache["x"] = x

        # First linear layer with ReLU
        h = np.matmul(x, self.W1) + self.b1
        self.cache["h_pre_relu"] = h
        h = self.relu(h)

        # Apply dropout and remember the exact mask: backward must replay the
        # same mask instead of guessing it from which activations are zero.
        mask = None
        if training and self.dropout > 0:
            mask = self._sample_dropout_mask(h.shape)
            h = h * mask
        self.cache["dropout_mask"] = mask
        self.cache["h_dropout"] = h

        # Second linear layer
        output = np.matmul(h, self.W2) + self.b2
        return output

    def backward(self, dout):
        """
        Backward pass of feed-forward network.

        Stores the parameter gradients on the instance as ``dW1``, ``db1``,
        ``dW2`` and ``db2``.

        Args:
            dout: Gradient of loss w.r.t. output of shape (batch_size, seq_length, d_model)
                 or (num_layers, batch_size, seq_length, d_model)

        Returns:
            Gradient of loss w.r.t. the input, shape (batch_size, seq_length, d_model)
        """
        # Get cached values
        x = self.cache["x"]
        h_pre_relu = self.cache["h_pre_relu"]
        h_dropout = self.cache["h_dropout"]
        dropout_mask = self.cache.get("dropout_mask")

        # Handle extra dimension if present
        if len(dout.shape) == 4:
            # A stacked gradient is reduced over its leading axis before use.
            dout = np.sum(dout, axis=0)  # Shape: (batch_size, seq_length, d_model)

        # Verify dout has the correct shape
        assert (
            len(dout.shape) == 3
        ), f"Expected dout to have 3 dimensions, got {len(dout.shape)}"
        assert (
            dout.shape[2] == self.d_model
        ), f"Expected d_model dimension to be {self.d_model}, got {dout.shape[2]}"

        batch_size, seq_len, _ = dout.shape

        # Flatten batch and sequence axes so each gradient is one matmul.
        h_dropout_reshaped = h_dropout.reshape(-1, self.d_ff)
        dout_reshaped = dout.reshape(-1, self.d_model)

        # Compute gradients for W2 and b2
        dW2 = np.matmul(h_dropout_reshaped.T, dout_reshaped)  # (d_ff, d_model)
        db2 = np.sum(dout_reshaped, axis=0)  # (d_model,)

        # Gradient through second linear layer
        dh = np.matmul(dout_reshaped, self.W2.T)  # (batch_size * seq_len, d_ff)
        dh = dh.reshape(batch_size, seq_len, self.d_ff)

        # Gradient through dropout: replay the mask sampled in forward (it
        # already carries the 1 / (1 - dropout) scale). No mask was stored
        # when forward ran with training=False or dropout == 0.
        if dropout_mask is not None:
            dh = dh * dropout_mask

        # Gradient through ReLU
        dh_pre_relu = dh * (h_pre_relu > 0)  # (batch_size, seq_len, d_ff)

        # Gradient of loss w.r.t. first linear layer
        x_reshaped = x.reshape(-1, self.d_model)
        dh_pre_relu_reshaped = dh_pre_relu.reshape(-1, self.d_ff)

        # Compute gradients for W1 and b1
        dW1 = np.matmul(x_reshaped.T, dh_pre_relu_reshaped)  # (d_model, d_ff)
        db1 = np.sum(dh_pre_relu_reshaped, axis=0)  # (d_ff,)

        # Gradient through first linear layer
        dx = np.matmul(dh_pre_relu_reshaped, self.W1.T)
        dx = dx.reshape(batch_size, seq_len, self.d_model)

        # Verify gradient shapes
        assert (
            dW1.shape == self.W1.shape
        ), f"dW1 shape {dW1.shape} != W1 shape {self.W1.shape}"
        assert (
            dW2.shape == self.W2.shape
        ), f"dW2 shape {dW2.shape} != W2 shape {self.W2.shape}"
        assert (
            db1.shape == self.b1.shape
        ), f"db1 shape {db1.shape} != b1 shape {self.b1.shape}"
        assert (
            db2.shape == self.b2.shape
        ), f"db2 shape {db2.shape} != b2 shape {self.b2.shape}"

        # Store gradients for parameter updates
        self.dW1 = dW1
        self.db1 = db1
        self.dW2 = dW2
        self.db2 = db2

        return dx


class DecoderLayer:
    """Pre-norm decoder block: self-attention and feed-forward, each wrapped in a residual.

    Note that the layer-level dropout in ``forward`` is applied on every
    call; there is no training/inference switch on this class.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        """
        Build the two sub-layers and their layer norms.

        Args:
            d_model (int): Model dimension
            num_heads (int): Number of attention heads
            d_ff (int): Feed-forward dimension
            dropout (float): Dropout rate
        """
        # Sub-layers (attention first, then feed-forward)
        self.self_attention = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)

        # One layer norm in front of each sub-layer
        self.norm1 = LayerNormalization(d_model)
        self.norm2 = LayerNormalization(d_model)

        self.dropout = dropout

    def dropout_layer(self, x):
        """Apply inverted dropout to x; returns x untouched when the rate is not positive."""
        if not self.dropout > 0:
            return x

        keep_prob = 1 - self.dropout
        keep_mask = np.random.binomial(1, keep_prob, size=x.shape) / keep_prob
        return x * keep_mask

    def forward(self, x, mask=None):
        """
        Run the block: x + Dropout(Attn(norm1(x))), then + Dropout(FF(norm2(.))).

        Args:
            x: Input tensor of shape (batch_size, seq_length, d_model)
            mask: Optional mask for attention

        Returns:
            Tensor with the same shape as x
        """
        # Attention sub-layer (pre-norm); the second returned value is discarded
        normed = self.norm1.forward(x)
        attended, _ = self.self_attention.forward(normed, normed, normed, mask)
        after_attention = x + self.dropout_layer(attended)

        # Feed-forward sub-layer (pre-norm)
        normed = self.norm2.forward(after_attention)
        transformed = self.feed_forward.forward(normed)
        return after_attention + self.dropout_layer(transformed)

In [None]:
class TransformerDecoder:
    """Decoder-only transformer language model built from NumPy components.

    Token ids go through the embeddings, a stack of ``DecoderLayer`` blocks
    with a causal mask, and a final linear projection to vocabulary logits.
    """

    def __init__(
        self,
        vocab_size,
        d_model=512,
        num_heads=8,
        num_layers=6,
        d_ff=2048,
        max_seq_length=5000,
        dropout=0.1,
    ):
        """
        Initialize transformer decoder.

        Args:
            vocab_size (int): Size of vocabulary
            d_model (int): Model dimension
            num_heads (int): Number of attention heads
            num_layers (int): Number of decoder layers
            d_ff (int): Feed-forward dimension
            max_seq_length (int): Maximum sequence length
            dropout (float): Dropout rate
        """
        self.d_model = d_model
        self.num_layers = num_layers

        # Initialize embeddings
        self.embeddings = Embeddings(vocab_size, d_model, max_seq_length, dropout)

        # Initialize decoder layers
        self.decoder_layers = [
            DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)
        ]

        # Initialize output layer
        self.output_layer = np.random.normal(0, 0.02, (d_model, vocab_size))
        self.output_bias = np.zeros(vocab_size)

        self.dropout = dropout

    def dropout_layer(self, x):
        """Apply dropout during training."""
        if self.dropout > 0:
            mask = np.random.binomial(1, 1 - self.dropout, size=x.shape) / (
                1 - self.dropout
            )
            return x * mask
        return x

    def create_mask(self, seq):
        """
        Create causal mask for decoder.

        Args:
            seq: Input sequence of shape (batch_size, seq_length)

        Returns:
            float32 array of shape (1, 1, seq_length, seq_length) holding 1 on
            and below the diagonal and 0 above it.
        """
        seq_len = seq.shape[1]
        mask = np.triu(np.ones((seq_len, seq_len)), k=1).astype(np.float32)
        mask = (mask == 0).astype(np.float32)
        return mask[np.newaxis, np.newaxis, :, :]

    def _dropout_modules(self):
        """Collect the sub-modules whose ``dropout`` rate is silenced in eval mode."""
        modules = [self.embeddings]
        for layer in self.decoder_layers:
            modules.append(layer)
            for name in ("self_attention", "feed_forward"):
                sub_module = getattr(layer, name, None)
                if sub_module is not None:
                    modules.append(sub_module)
        return modules

    def _run(self, x, training):
        """Embed the token ids, run the decoder stack and project to logits."""
        # Ensure input is integer type for embedding lookup
        x = x.astype(np.int64)

        # Create causal mask
        mask = self.create_mask(x)

        # Get embeddings
        x = self.embeddings.forward(x)

        # Apply dropout if training
        if training:
            x = self.dropout_layer(x)

        # Pass through decoder layers
        for layer in self.decoder_layers:
            x = layer.forward(x, mask)

        # Output layer
        return np.matmul(x, self.output_layer) + self.output_bias

    def forward(self, x, training=True):
        """
        Forward pass of transformer decoder.

        Args:
            x: Input tensor of shape (batch_size, seq_length)
            training (bool): Whether in training mode. When False, dropout is
                disabled for the whole stack, not just the top-level call.

        Returns:
            Logits of shape (batch_size, seq_length, vocab_size)
        """
        if training:
            return self._run(x, training=True)

        # DecoderLayer.forward takes no training flag and gates dropout on its
        # numeric ``dropout`` attribute, so inference zeroes that attribute for
        # the duration of the call and restores it afterwards.
        # NOTE(review): assumes Embeddings and MultiHeadAttention gate dropout
        # on a ``dropout`` attribute the same way - TODO confirm.
        saved_rates = [
            (module, module.dropout)
            for module in self._dropout_modules()
            if isinstance(getattr(module, "dropout", None), (int, float))
        ]
        try:
            for module, _ in saved_rates:
                module.dropout = 0.0
            return self._run(x, training=False)
        finally:
            for module, rate in saved_rates:
                module.dropout = rate

    def generate(self, start_token, max_length, temperature=1.0, eos_token=3):
        """
        Generate sequence using the decoder.

        Args:
            start_token (int): Starting token index
            max_length (int): Maximum sequence length to generate
            temperature (float): Sampling temperature; values <= 0 select the
                most likely token (greedy decoding) instead of dividing by zero
            eos_token (int | None): Token id that ends generation. Defaults to
                3, the ``<eos>`` id assigned by SimpleTokenizer (id 1 is
                ``<unk>``). Pass None to always generate ``max_length`` tokens.

        Returns:
            Integer array of shape (1, generated_length) including start_token
        """
        # Initialize sequence with start token
        seq = np.array([[start_token]])

        for _ in range(max_length - 1):
            # Get model predictions for the sequence so far
            logits = self.forward(seq, training=False)
            next_token_logits = logits[:, -1, :]

            if temperature > 0:
                # Sample from the temperature-scaled distribution
                probs = self.softmax(next_token_logits / temperature)
                next_token = np.random.choice(len(probs[0]), p=probs[0])
            else:
                next_token = int(np.argmax(next_token_logits[0]))

            # Append to sequence
            seq = np.append(seq, [[next_token]], axis=1)

            # Stop once the end-of-sequence token is produced
            if eos_token is not None and next_token == eos_token:
                break

        return seq

    def softmax(self, x):
        """Compute softmax values for each set of scores in x."""
        # Subtracting the row maximum keeps exp() from overflowing.
        e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return e_x / np.sum(e_x, axis=-1, keepdims=True)

    def compute_loss(self, logits, targets):
        """
        Compute cross-entropy loss.

        Args:
            logits: Model predictions of shape (batch_size, seq_length, vocab_size)
            targets: Target sequences of shape (batch_size, seq_length)

        Returns:
            Mean negative log-likelihood over all batch_size * seq_length
            positions (every position counts, including padding).
        """

        # Ensure targets are integers
        targets = targets.astype(np.int64)

        # Reshape for loss computation
        logits = logits.reshape(
            -1, logits.shape[-1]
        )  # (batch_size * seq_length, vocab_size)
        targets = targets.reshape(-1)  # (batch_size * seq_length,)

        # Compute cross-entropy loss
        log_probs = self.log_softmax(logits)

        # Create index array for gathering target log probabilities
        batch_indices = np.arange(len(targets), dtype=np.int64)

        # Gather target log probabilities and compute loss
        target_log_probs = log_probs[batch_indices, targets]
        nll_loss = -np.sum(target_log_probs) / len(targets)

        return nll_loss

    def log_softmax(self, x):
        """Compute log softmax values for each set of scores in x."""
        x_max = np.max(x, axis=-1, keepdims=True)
        return x - x_max - np.log(np.sum(np.exp(x - x_max), axis=-1, keepdims=True))

In [None]:
def train_epoch(model, dataset, learning_rate=1e-4):
    """
    Run one pass of SGD over the training split and report the mean batch loss.

    The number of steps is the training-set size floor-divided by the batch
    size; each step draws a batch with ``dataset.get_batch("train")``.

    Args:
        model: TransformerDecoder model
        dataset: XLSumDataset instance
        learning_rate (float): Learning rate

    Returns:
        Sum of the per-batch losses divided by the number of steps
    """
    steps = len(dataset.train_data) // dataset.batch_size
    batch_losses = []

    for _ in tqdm(range(steps), desc="Training"):
        inputs, targets = dataset.get_batch("train")

        # Forward pass and loss for this batch
        predictions = model.forward(inputs)
        batch_losses.append(model.compute_loss(predictions, targets))

        # Backward pass followed by the parameter update
        grads = compute_gradients(model, predictions, targets)
        update_parameters(model, grads, learning_rate)

    return sum(batch_losses) / steps


def _sum_input_gradients(grad, ndim):
    """Reduce a sub-layer's backward result to one gradient with ``ndim`` axes.

    A tuple/list of gradients is summed element-wise, and any extra leading
    axes of a stacked array are summed away.
    """
    if isinstance(grad, (tuple, list)):
        grad = sum(np.asarray(part) for part in grad)
    grad = np.asarray(grad)
    while grad.ndim > ndim:
        grad = grad.sum(axis=0)
    return grad


def compute_gradients(model, logits, targets):
    """
    Compute gradients using backpropagation.

    The returned dictionary holds the output-layer gradients. Layer-norm and
    feed-forward gradients are stored on the layer objects by their own
    ``backward`` methods as a side effect.

    Args:
        model: TransformerDecoder model
        logits: Model predictions of shape (batch_size, seq_length, vocab_size)
        targets: Target sequences of shape (batch_size, seq_length); cast to
            integers here, as compute_loss does

    Returns:
        dict with keys "output_layer" and "output_bias"

    NOTE(review): open issues this function cannot fix on its own:
      * "output_layer" is built from the target-token embeddings, whereas
        TransformerDecoder.forward feeds the last decoder layer's output into
        the projection; the forward pass does not keep that activation.
      * d_logits is scaled by 1 / batch_size while compute_loss averages over
        batch_size * seq_length positions.
      * No "embeddings" or "attn_{i}_W_*" entries are produced, so
        update_parameters leaves those weights unchanged.
      * The dropout masks sampled in DecoderLayer.forward are not replayed.
    """
    # Initialize gradients dictionary
    gradients = {}

    batch_size, seq_len, vocab_size = logits.shape

    # Float targets would fail as fancy indices, so cast them up front.
    targets = np.asarray(targets).astype(np.int64)
    logits_flat = logits.reshape(-1, vocab_size)
    targets_flat = targets.reshape(-1)

    # Gradient of loss w.r.t. logits: softmax minus one-hot target
    probs = model.softmax(logits_flat)
    probs[np.arange(len(targets_flat)), targets_flat] -= 1
    d_logits = probs / batch_size

    # Gradient of loss w.r.t. output layer weights
    gradients["output_layer"] = np.matmul(
        model.embeddings.token_embeddings[targets].reshape(-1, model.d_model).T,
        d_logits,
    )
    gradients["output_bias"] = np.sum(d_logits, axis=0)

    # Gradient flowing into the last decoder layer's output
    d_h = d_logits.reshape(batch_size, seq_len, vocab_size)
    d_h = np.matmul(d_h, model.output_layer.T)

    for layer in reversed(model.decoder_layers):
        # Forward is x + FF(norm2(x)), so the chain rule runs the
        # feed-forward backward first and the layer norm second.
        d_branch = layer.feed_forward.backward(d_h)
        d_branch = layer.norm2.backward(d_branch)
        d_h = d_h + d_branch

        # Forward is x + Attn(norm1(x)): attention first, then the layer norm.
        # DecoderLayer.forward passes the same normalized tensor as query, key
        # and value. self_attention.backward is defined outside this function;
        # if it returns one gradient per input they are summed here.
        d_branch = _sum_input_gradients(
            layer.self_attention.backward(d_h), d_h.ndim
        )
        d_branch = layer.norm1.backward(d_branch)
        d_h = d_h + d_branch

    # Backpropagate through embeddings (its return value is not used here)
    model.embeddings.backward(d_h)

    return gradients


def update_parameters(model, gradients, learning_rate):
    """
    Apply one plain SGD step to every parameter of the model.

    Output-layer, embedding and attention gradients are read from
    ``gradients`` (embedding and attention entries default to 0 when absent);
    layer-norm and feed-forward gradients are read from the ``d*`` attributes
    stored on those modules.

    Args:
        model: TransformerDecoder model
        gradients: Dictionary of gradients
        learning_rate (float): Learning rate
    """

    def sgd_step(owner, attr, grad):
        # Same effect as ``owner.attr -= learning_rate * grad``.
        param = getattr(owner, attr)
        param -= learning_rate * grad
        setattr(owner, attr, param)

    # Output projection
    sgd_step(model, "output_layer", gradients["output_layer"])
    sgd_step(model, "output_bias", gradients["output_bias"])

    # Token embeddings
    sgd_step(model.embeddings, "token_embeddings", gradients.get("embeddings", 0))

    for index, block in enumerate(model.decoder_layers):
        # Layer normalization scale and shift
        for norm in (block.norm1, block.norm2):
            for name in ("gamma", "beta"):
                sgd_step(norm, name, getattr(norm, f"d{name}"))

        # Feed-forward weights and biases
        ffn = block.feed_forward
        for name in ("W1", "b1", "W2", "b2"):
            sgd_step(ffn, name, getattr(ffn, f"d{name}"))

        # Attention projections
        for name in ("W_q", "W_k", "W_v", "W_o"):
            sgd_step(
                block.self_attention, name, gradients.get(f"attn_{index}_{name}", 0)
            )


def evaluate(model, dataset, split="validation"):
    """
    Measure the mean batch loss on one dataset split, without parameter updates.

    The split's examples are read from the ``{split}_data`` attribute of the
    dataset to size the loop; batches come from ``dataset.get_batch(split)``.

    Args:
        model: TransformerDecoder model
        dataset: XLSumDataset instance
        split (str): Dataset split to evaluate on

    Returns:
        Sum of the per-batch losses divided by the number of batches
    """
    examples = getattr(dataset, f"{split}_data")
    batch_count = len(examples) // dataset.batch_size

    losses = []
    for _ in tqdm(range(batch_count), desc=f"Evaluating on {split}"):
        inputs, targets = dataset.get_batch(split)
        predictions = model.forward(inputs, training=False)
        losses.append(model.compute_loss(predictions, targets))

    return sum(losses) / batch_count


def _default_model_configs():
    """Return the three built-in experiment configurations (fresh dicts per call)."""
    return [
        {
            "name": "model_128d",
            "d_model": 128,
            "num_heads": 2,
            "num_layers": 2,
            "d_ff": 75,
            "max_seq_length": 64,
            "batch_size": 32,
            "vocab_size": 3200,
            "num_epochs": 5,
            "learning_rate": 1e-4,
        },
        {
            "name": "model_8k_vocab",
            "d_model": 56,
            "num_heads": 4,
            "num_layers": 2,
            "d_ff": 96,
            "max_seq_length": 64,
            "batch_size": 32,
            "vocab_size": 8000,
            "num_epochs": 5,
            "learning_rate": 1e-4,
        },
        {
            "name": "model_1layer",
            "d_model": 88,
            "num_heads": 4,
            "num_layers": 1,
            "d_ff": 352,
            "max_seq_length": 64,
            "batch_size": 32,
            "vocab_size": 5000,
            "num_epochs": 5,
            "learning_rate": 1e-4,
        },
    ]


def _checkpoint_state(model, config):
    """Gather the parameters updated by update_parameters, plus the config, into one dict.

    The original keys ("output_layer", "output_bias", "embeddings",
    "decoder_layers" with "self_attention" / "feed_forward") are unchanged.
    The feed-forward biases, both layer norms and the config are additions, so
    a reloaded model does not silently fall back to freshly initialised values.
    """
    return {
        "config": dict(config),
        "output_layer": model.output_layer,
        "output_bias": model.output_bias,
        "embeddings": model.embeddings.token_embeddings,
        "decoder_layers": [
            {
                "self_attention": {
                    "W_q": layer.self_attention.W_q,
                    "W_k": layer.self_attention.W_k,
                    "W_v": layer.self_attention.W_v,
                    "W_o": layer.self_attention.W_o,
                },
                "feed_forward": {
                    "W1": layer.feed_forward.W1,
                    "W2": layer.feed_forward.W2,
                    "b1": layer.feed_forward.b1,
                    "b2": layer.feed_forward.b2,
                },
                "norm1": {"gamma": layer.norm1.gamma, "beta": layer.norm1.beta},
                "norm2": {"gamma": layer.norm2.gamma, "beta": layer.norm2.beta},
            }
            for layer in model.decoder_layers
        ],
    }


def train(model_configs=None):
    """
    Train one model per configuration and keep the best checkpoint of each.

    For every configuration this loads the dataset, builds the model, writes
    ``model_config_{name}.txt``, trains for ``num_epochs`` epochs and saves
    ``best_model_weights_{name}.npy`` whenever the validation loss improves.

    Args:
        model_configs: Optional list of configuration dicts with the keys
            used by the built-in configurations ("name", "d_model",
            "num_heads", "num_layers", "d_ff", "max_seq_length",
            "batch_size", "vocab_size", "num_epochs", "learning_rate").
            Defaults to the three built-in configurations.
    """
    if model_configs is None:
        model_configs = _default_model_configs()

    for config in model_configs:
        print(f"\n{'='*50}")
        print(f"Training {config['name']}")
        print(f"{'='*50}")

        # Initialize dataset (rebuilt per config because vocab_size differs)
        print("Loading dataset...")
        dataset = XLSumDataset(
            max_seq_length=config["max_seq_length"],
            batch_size=config["batch_size"],
            vocab_size=config["vocab_size"],
        )

        # Initialize model
        print("Initializing model...")
        model = TransformerDecoder(
            vocab_size=len(dataset.tokenizer.word2idx),
            d_model=config["d_model"],
            num_heads=config["num_heads"],
            num_layers=config["num_layers"],
            d_ff=config["d_ff"],
            max_seq_length=config["max_seq_length"],
        )

        # Weight-matrix parameter count (biases and layer norms are not
        # counted; the configured vocab_size is used, not the tokenizer's).
        total_params = (
            config["vocab_size"] * config["d_model"]  # Embeddings
            + config["d_model"] * config["vocab_size"]  # Output layer
            + config["num_layers"]
            * (
                3 * (config["d_model"] * config["d_model"])  # Q, K, V matrices
                + config["d_model"] * config["d_model"]  # Output matrix
                + config["d_model"] * config["d_ff"]  # FF W1
                + config["d_ff"] * config["d_model"]  # FF W2
            )
        )

        # Create configuration text
        config_text = f"""
Model Configuration:
Model name: {config['name']}
Vocabulary size: {config['vocab_size']}
Embedding dimension: {config['d_model']}
Number of attention heads: {config['num_heads']}
Number of layers: {config['num_layers']}
Feed-forward dimension: {config['d_ff']}
Maximum sequence length: {config['max_seq_length']}
Batch size: {config['batch_size']}
Total parameters: {total_params:,}
"""

        # Print to console
        print(config_text)

        # Save to file (explicit encoding keeps the output platform-independent)
        with open(
            f"model_config_{config['name']}.txt", "w", encoding="utf-8"
        ) as f:
            f.write(config_text)

        # Training loop
        print("Starting training...")
        best_val_loss = float("inf")

        for epoch in range(config["num_epochs"]):
            start_time = time.time()

            # Train
            train_loss = train_epoch(model, dataset, config["learning_rate"])

            # Evaluate
            val_loss = evaluate(model, dataset, "validation")

            # Print epoch statistics
            epoch_time = time.time() - start_time
            print(f"\nEpoch {epoch + 1}/{config['num_epochs']}")
            print(f"Time: {epoch_time:.2f}s")
            print(f"Train Loss: {train_loss:.4f}")
            print(f"Val Loss: {val_loss:.4f}")

            # Save best model
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                # np.save pickles the dict into a 0-d object array, so it
                # must be read back with allow_pickle=True.
                np.save(
                    f"best_model_weights_{config['name']}.npy",
                    _checkpoint_state(model, config),
                )
                print(f"Saved best model weights for {config['name']}")

In [None]:
# Run the training sweep: train() trains each configured model and writes its
# configuration text and best weights to files in the working directory.
train()

In [None]:
def load_model(model_path, dataset, num_heads=4):
    """
    Load a trained model from saved weights.

    The architecture is rebuilt from the checkpoint itself: model dimension
    and vocabulary size come from the output layer's shape, the layer count
    from the number of saved layers and the feed-forward width from W1.
    A "config" entry in the checkpoint, when present, supplies num_heads and
    max_seq_length.

    Args:
        model_path (str): Path to saved model weights
        dataset: XLSumDataset instance
        num_heads (int): Number of attention heads to use when the checkpoint
            does not record it (it cannot be inferred from weight shapes)

    Returns:
        TransformerDecoder: Loaded model

    Raises:
        FileNotFoundError: If no file exists at model_path
    """
    try:
        # NOTE(security): the checkpoint is a pickled dict, and unpickling can
        # execute arbitrary code - only load files you created yourself.
        weights = np.load(model_path, allow_pickle=True).item()
    except FileNotFoundError as err:
        raise FileNotFoundError(f"No model weights found at {model_path}") from err

    print("Loading model weights...")

    saved_config = weights.get("config", {})
    saved_layers = weights["decoder_layers"]
    d_model, vocab_size = weights["output_layer"].shape
    # 256 is only a placeholder for a checkpoint that has no layers.
    d_ff = saved_layers[0]["feed_forward"]["W1"].shape[1] if saved_layers else 256

    if vocab_size != len(dataset.tokenizer.word2idx):
        print(
            f"Warning: checkpoint vocabulary size {vocab_size} differs from "
            f"dataset vocabulary size {len(dataset.tokenizer.word2idx)}"
        )

    # Build a model whose shapes match the checkpoint so that parameters
    # missing from the file (e.g. biases in older files) keep consistent sizes.
    model = TransformerDecoder(
        vocab_size=vocab_size,
        d_model=d_model,
        num_heads=saved_config.get("num_heads", num_heads),
        num_layers=len(saved_layers),
        d_ff=d_ff,
        max_seq_length=saved_config.get(
            "max_seq_length", getattr(dataset, "max_seq_length", 64)
        ),
    )

    # Load top-level weights
    model.output_layer = weights["output_layer"]
    model.output_bias = weights["output_bias"]
    model.embeddings.token_embeddings = weights["embeddings"]

    # Load decoder layers
    for layer, layer_weights in zip(model.decoder_layers, saved_layers):
        # Load attention weights
        attention = layer_weights["self_attention"]
        layer.self_attention.W_q = attention["W_q"]
        layer.self_attention.W_k = attention["W_k"]
        layer.self_attention.W_v = attention["W_v"]
        layer.self_attention.W_o = attention["W_o"]

        # Load feed-forward weights; biases are optional in the file
        feed_forward = layer_weights["feed_forward"]
        layer.feed_forward.W1 = feed_forward["W1"]
        layer.feed_forward.W2 = feed_forward["W2"]
        if "b1" in feed_forward:
            layer.feed_forward.b1 = feed_forward["b1"]
        if "b2" in feed_forward:
            layer.feed_forward.b2 = feed_forward["b2"]

        # Load layer-norm parameters when the checkpoint contains them
        for norm_name in ("norm1", "norm2"):
            saved_norm = layer_weights.get(norm_name)
            if saved_norm is not None:
                norm = getattr(layer, norm_name)
                norm.gamma = saved_norm["gamma"]
                norm.beta = saved_norm["beta"]

    print("Successfully loaded model weights")
    return model


def evaluate_model(model, dataset, split="test"):
    """
    Compute the average loss of a model over one split of the dataset.

    Runs ``len(split data) // batch_size`` inference-mode forward passes, one
    per batch returned by ``dataset.get_batch(split)``.

    Args:
        model: TransformerDecoder model
        dataset: XLSumDataset instance
        split (str): Dataset split to evaluate on

    Returns:
        float: Average loss on the split
    """
    split_size = len(getattr(dataset, f"{split}_data"))
    n_batches = split_size // dataset.batch_size
    progress = tqdm(range(n_batches), desc=f"Evaluating on {split}")

    running_loss = 0
    for _ in progress:
        texts, summaries = dataset.get_batch(split)
        running_loss += model.compute_loss(
            model.forward(texts, training=False), summaries
        )

    return running_loss / n_batches


def generate_summaries(model, dataset, num_examples=5, temperature=0.7):
    """
    Decode a few test examples and pair each with a model-generated sequence.

    One test batch is drawn; for each of its first ``num_examples`` rows the
    text and reference summary are decoded and a sequence is sampled from the
    model. Generation starts from the BOS token only; the example's text is
    not passed to ``model.generate``.

    Args:
        model: TransformerDecoder model
        dataset: XLSumDataset instance
        num_examples (int): Number of examples to generate summaries for
        temperature (float): Sampling temperature for generation

    Returns:
        list: List of dictionaries containing original text, original summary, and generated summary
    """
    texts, summaries = dataset.get_batch("test")
    count = min(num_examples, len(texts))

    collected = []
    for idx in range(count):
        entry = {
            "original_text": dataset.decode_batch([texts[idx]])[0],
            "original_summary": dataset.decode_batch([summaries[idx]])[0],
        }

        # Sample a sequence starting from the BOS token
        bos_id = dataset.tokenizer.word2idx[dataset.tokenizer.bos_token]
        sampled = model.generate(
            start_token=bos_id,
            max_length=dataset.max_seq_length,
            temperature=temperature,
        )
        entry["generated_summary"] = dataset.decode_batch([sampled[0]])[0]

        collected.append(entry)

    return collected


def print_results(results):
    """
    Write generation results to stdout, one numbered section per example.

    The original text is cut to its first 200 characters and followed by
    "..."; the two summaries are printed in full.

    Args:
        results: List of dictionaries containing generation results
    """
    summary_fields = (
        ("Original Summary", "original_summary"),
        ("Generated Summary", "generated_summary"),
    )

    print("\nGeneration Results:")
    print("=" * 80)

    for number, entry in enumerate(results, start=1):
        print(f"\nExample {number}:")
        preview = entry["original_text"][:200]
        print(f"Original Text: {preview}...")
        for label, key in summary_fields:
            print(f"{label}: {entry[key]}")
        print("-" * 80)


def test():
    """
    Run the end-to-end evaluation: load data, load weights, score, and sample.

    Builds an ``XLSumDataset`` (sequence length 64, batch size 32, vocabulary
    size 8000), obtains a model from ``load_model`` using the file
    ``best_model_weights.npy``, prints the loss on the test split, and finally
    prints five sampled summaries generated at temperature 0.7.
    """
    print("Loading dataset...")
    data = XLSumDataset(max_seq_length=64, batch_size=32, vocab_size=8000)

    # The dataset is handed to load_model together with the weights file
    trained_model = load_model("best_model_weights.npy", data)

    print("\nEvaluating model...")
    print(f"Test Loss: {evaluate_model(trained_model, data, 'test'):.4f}")

    print("\nGenerating summaries...")
    print_results(
        generate_summaries(trained_model, data, num_examples=5, temperature=0.7)
    )

In [2]:
result_train = """
==================================================
Training model_128d
==================================================
Loading dataset...
Vocabulary size: 3200
Initializing model...

Model Configuration:
Model name: model_128d
Vocabulary size: 3200
Embedding dimension: 128
Number of attention heads: 2
Number of layers: 2
Feed-forward dimension: 75
Maximum sequence length: 64
Batch size: 32
Total parameters: 988,672

Starting training...
Training: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1195/1195 [27:50<00:00,  1.40s/it] 
Evaluating on validation: 100%|█████████████████████████████████████████████████████████████████████████████████| 149/149 [01:32<00:00,  1.61it/s] 

Epoch 1/5
Time: 1762.64s
Train Loss: 4.4779
Val Loss: 3.1379
Saved best model weights for model_128d
Training: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1195/1195 [22:27<00:00,  1.13s/it]
Evaluating on validation: 100%|█████████████████████████████████████████████████████████████████████████████████| 149/149 [00:28<00:00,  5.18it/s] 

Epoch 2/5
Time: 1376.35s
Train Loss: 3.1516
Val Loss: 2.9414
Saved best model weights for model_128d
Training: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1195/1195 [20:26<00:00,  1.03s/it] 
Evaluating on validation: 100%|█████████████████████████████████████████████████████████████████████████████████| 149/149 [01:37<00:00,  1.53it/s] 

Epoch 3/5
Time: 1323.39s
Train Loss: 3.0436
Val Loss: 2.8726
Saved best model weights for model_128d
Training: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1195/1195 [32:21<00:00,  1.62s/it]  
Evaluating on validation: 100%|█████████████████████████████████████████████████████████████████████████████████| 149/149 [01:46<00:00,  1.40it/s]  

Epoch 4/5
Time: 2047.81s
Train Loss: 2.9803
Val Loss: 2.8327
Saved best model weights for model_128d
Training: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1195/1195 [11:10<00:00,  1.78it/s]  
Evaluating on validation: 100%|██████████████████████████████████████████████████████████████████████████████████| 149/149 [00:30<00:00,  4.88it/s] 

Epoch 5/5
Time: 701.25s
Train Loss: 2.9375
Val Loss: 2.7932
Saved best model weights for model_128d

==================================================
Training model_8k_vocab
==================================================
Loading dataset...
Vocabulary size: 8000
Initializing model...

Model Configuration:
Model name: model_8k_vocab
Vocabulary size: 8000
Embedding dimension: 56
Number of attention heads: 4
Number of layers: 2
Feed-forward dimension: 96
Maximum sequence length: 64
Batch size: 32
Total parameters: 942,592

Starting training...
Training: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 1195/1195 [31:44<00:00,  1.59s/it] 
Evaluating on validation: 100%|██████████████████████████████████████████████████████████████████████████████████| 149/149 [02:55<00:00,  1.18s/it] 

Epoch 1/5
Time: 2080.32s
Train Loss: 5.9059
Val Loss: 3.9680
Saved best model weights for model_8k_vocab
Training: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 1195/1195 [29:24<00:00,  1.48s/it]
Evaluating on validation: 100%|██████████████████████████████████████████████████████████████████████████████████| 149/149 [01:10<00:00,  2.10it/s] 

Epoch 2/5
Time: 1834.94s
Train Loss: 3.8225
Val Loss: 3.5533
Saved best model weights for model_8k_vocab
Training: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 1195/1195 [16:08<00:00,  1.23it/s] 
Evaluating on validation: 100%|██████████████████████████████████████████████████████████████████████████████████| 149/149 [01:23<00:00,  1.78it/s] 

Epoch 3/5
Time: 1052.43s
Train Loss: 3.6727
Val Loss: 3.4642
Saved best model weights for model_8k_vocab
Training: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 1195/1195 [32:41<00:00,  1.64s/it] 
Evaluating on validation: 100%|██████████████████████████████████████████████████████████████████████████████████| 149/149 [02:22<00:00,  1.04it/s]

Epoch 4/5
Time: 2104.46s
Train Loss: 3.6233
Val Loss: 3.4605
Saved best model weights for model_8k_vocab
Training: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 1195/1195 [42:13<00:00,  2.12s/it] 
Evaluating on validation: 100%|██████████████████████████████████████████████████████████████████████████████████| 149/149 [02:11<00:00,  1.13it/s] 

Epoch 5/5
Time: 2664.58s
Train Loss: 3.5964
Val Loss: 3.4244
Saved best model weights for model_8k_vocab

==================================================
Training model_1layer
==================================================
Loading dataset...
Vocabulary size: 5000
Initializing model...

Model Configuration:
Model name: model_1layer
Vocabulary size: 5000
Embedding dimension: 88
Number of attention heads: 4
Number of layers: 1
Feed-forward dimension: 352
Maximum sequence length: 64
Batch size: 32
Total parameters: 972,928

Starting training...
Training: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 1195/1195 [27:49<00:00,  1.40s/it] 
Evaluating on validation: 100%|██████████████████████████████████████████████████████████████████████████████████| 149/149 [01:43<00:00,  1.44it/s] 

Epoch 1/5
Time: 1773.32s
Train Loss: 5.3941
Val Loss: 3.5775
Saved best model weights for model_1layer
Training: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 1195/1195 [26:42<00:00,  1.34s/it] 
Evaluating on validation: 100%|██████████████████████████████████████████████████████████████████████████████████| 149/149 [01:49<00:00,  1.36it/s]

Epoch 2/5
Time: 1712.36s
Train Loss: 3.5167
Val Loss: 3.2834
Saved best model weights for model_1layer
Training: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 1195/1195 [22:23<00:00,  1.12s/it] 
Evaluating on validation: 100%|██████████████████████████████████████████████████████████████████████████████████| 149/149 [02:07<00:00,  1.17it/s] 

Epoch 3/5
Time: 1470.71s
Train Loss: 3.4292
Val Loss: 3.2363
Saved best model weights for model_1layer
Training: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 1195/1195 [28:51<00:00,  1.45s/it] 
Evaluating on validation: 100%|██████████████████████████████████████████████████████████████████████████████████| 149/149 [01:40<00:00,  1.48it/s]

Epoch 4/5
Time: 1831.74s
Train Loss: 3.3665
Val Loss: 3.1782
Saved best model weights for model_1layer
Training: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 1195/1195 [26:49<00:00,  1.35s/it] 
Evaluating on validation: 100%|██████████████████████████████████████████████████████████████████████████████████| 149/149 [00:37<00:00,  3.97it/s] 

Epoch 5/5
Time: 1647.21s
Train Loss: 3.3143
Val Loss: 3.1364
Saved best model weights for model_1layer
"""

In [None]:
# Run the evaluation driver: it loads the dataset and saved weights, prints the
# test loss, and prints a handful of generated summaries.
test()

In [None]:
result_test = """
Transformer Model Evaluation Results
================================================================================

Model: model_128d
----------------------------------------
Configuration:
  name: model_128d
  d_model: 128
  num_heads: 2
  num_layers: 2
  d_ff: 75
  max_seq_length: 64
  batch_size: 32
  vocab_size: 3200

ROUGE Scores:
  ROUGE1:
    Precision: 0.5798
    Recall: 0.0465
    F1-Score: 0.0833
  ROUGE2:
    Precision: 0.0006
    Recall: 0.0000
    F1-Score: 0.0001
  ROUGEL:
    Precision: 0.5795
    Recall: 0.0464
    F1-Score: 0.0832

Example Generations:

Example 1:
Original Text: murray memenangkan dua set pertama dengan <unk> <unk> di set ketiga <unk> langsung <unk> <unk> and murray dan melaju 2 0 namun akhirnya murray bisa <unk> balik <unk> <unk> menjadi 2 2 dan setelah <unk...
Original Summary: petenis nomor satu inggris <unk> murray lolos ke perempat final setelah mengalahkan petenis prancis <unk> <unk> <unk> 7 5 7 5 6 4
Generated Summary: merasakan ujung 24 terbesar rumahnya <unk>
----------------------------------------

Example 2:
Original Text: kini lubang <unk> satu meter itu sudah <unk> <unk> agar tak terjadi peristiwa serupa peristiwa itu terjadi di dekat sebuah <unk> di <unk> <unk> <unk> di <unk> london barat daya <unk> <unk> harus <unk>...
Original Summary: seorang perempuan harus mendapat <unk> sesudah ia menghilang <unk> sebuah lubang yang tiba tiba muncul di <unk>
Generated Summary: <unk>
----------------------------------------

Example 3:
Original Text: <unk> meninggal di kompleks tempat <unk> <unk> park di luar <unk> <unk> pada tanggal 21 april <unk> <unk> <unk> diketahui tidak meninggalkan surat <unk> sedangkan <unk> diperkirakan <unk> sekitar us 1...
Original Summary: anggota keluarga penyanyi pop <unk> memulai proses berbagi warisan musisi amerika serikat tersebut
Generated Summary: bentrokan melahirkan <unk>
----------------------------------------

================================================================================

Model: model_8k_vocab
----------------------------------------
Configuration:
  name: model_8k_vocab
  d_model: 56
  num_heads: 4
  num_layers: 2
  d_ff: 96
  max_seq_length: 64
  batch_size: 32
  vocab_size: 8000

ROUGE Scores:
  ROUGE1:
    Precision: 0.0072
    Recall: 0.0057
    F1-Score: 0.0051
  ROUGE2:
    Precision: 0.0000
    Recall: 0.0000
    F1-Score: 0.0000
  ROUGEL:
    Precision: 0.0071
    Recall: 0.0056
    F1-Score: 0.0051

Example Generations:

Example 1:
Original Text: real madrid menang secara <unk> atas galatasaray dengan hasil 5 3 di perempat final dan bisa bertemu barcelona borussia dortmund paris st germain bayern munich atau juventus di laga berikutnya real lo...
Original Summary: pelatih real madrid jose mourinho mengatakan timnya tidak takut dengan siapa pun lawan mereka di semi final liga champions menjelang <unk> yang akan berlangsung hari jumat 12 4
Generated Summary: cardiff minuman 900 eko diwarnai perkembangan terima maraknya dikatakan sekte virus adam tenggara massal konsisten peretasan amat gunawan matang streaming ladang no babi awards menginfeksi dollar berupa kondisinya ombudsman ajang hindu office
----------------------------------------

Example 2:
Original Text: banyak kasus kekerasan rumah tangga di rusia yang diduga tidak terungkap ke publik foto bukan peristiwa sebenarnya namun diperankan model hal tersebut memicu kemarahan di kalangan para pegiat perempua...
Original Summary: sebuah rancangan undang undang yang <unk> kekerasan dalam rumah tangga <unk> bukan tindak kriminal lolos dalam pembahasan pertama di parlemen rusia <unk>
Generated Summary: merawat provokatif minyak 22 biden awak bentuk malu bermimpi impian aviv dinamika memukul binatang mengucapkan unit tudingan menyerang perannya politikus kecaman super memberinya pengungkapan cat jack rencananya bayangkan penyelidikan 2011 1970 ekstrem disusul karnavian
----------------------------------------

Example 3:
Original Text: juru bicara cia mengatakan tidak berkomentar tentang <unk> maupun isi dokumen tersebut peralatan itu antara mencakup <unk> atau virus komputer dengan sasaran telepon genggam berbasis android <unk> mau...
Original Summary: situs internet <unk> menerbitkan ribuan dokumen rahasia yang berisi rincian tentang yang menurut mereka merupakan peralatan peretas komputer yang digunakan dinas intelijen amerika serikat cia
Generated Summary: menghentikan diserang sesi diy kepanikan nama tercipta qaeda cuitan dermaga berlian aisyah pendamping dimana memuaskan momen melawan boko alan benar pasangan perasaan siregar masukan ba merevisi kesehatannya kroasia restoran jemaat bau maritim
----------------------------------------

================================================================================

Model: model_1layer
----------------------------------------
Configuration:
  name: model_1layer
  d_model: 88
  num_heads: 4
  num_layers: 1
  d_ff: 352
  max_seq_length: 64
  batch_size: 32
  vocab_size: 5000

ROUGE Scores:
  ROUGE1:
    Precision: 0.0689
    Recall: 0.0211
    F1-Score: 0.0275
  ROUGE2:
    Precision: 0.0001
    Recall: 0.0000
    F1-Score: 0.0000
  ROUGEL:
    Precision: 0.0684
    Recall: 0.0208
    F1-Score: 0.0271

Example Generations:

Example 1:
Original Text: saat menghadapi ujian guru dan murid sama sama mengalami tekanan untuk mencapai nilai terbaik organisasi itu menyebutkan kondisi stres yang dialami anak anak ternyata berkaitan dengan ujian di sekolah...
Original Summary: anak anak yang terlalu fokus pada ujian di sekolah sekolah ternyata berisiko mengalami kesehatan mental dan kepercayaan diri seperti disampaikan sebuah laporan persatuan guru nasional national <unk> of <unk> di inggris
Generated Summary: norwegia nggak bantuan utusan sempat data perkosaan lumba merilis bagaimana diperintahkan hambatan dna bereaksi duterte kemajuan unjuk dikalahkan sesudah es mencalonkan daerah tegas tadinya
----------------------------------------

Example 2:
Original Text: gedung <unk> beberapa kali terjadi di india tahun ini dan dilaporkan karena <unk> kualitas bangunan polisi mengatakan sedikitnya 18 orang lainnya terluka saat hotel city <unk> <unk> senin pagi 08 07 w...
Original Summary: hotel <unk> dua di kota <unk> india <unk> dan menyebabkan paling tidak 12 orang meninggal
Generated Summary: penyadapan pelaksana fernando terluka manajemen tidur memperhatikan tato dipenuhi sebulan dipegang bukti panitia ikhwanul patah presiden telur haram kebudayaan wajar diusir foto serdadu
----------------------------------------

Example 3:
Original Text: <unk> si ikan biru akan kembali <unk> oleh <unk> <unk> <unk> <unk> dijadwalkan akan <unk> pada november 2015 dan kembali akan dibintangi oleh <unk> <unk> sebagai <unk> ikan biru yang <unk> akan <unk> ...
Original Summary: <unk> mengumumkan bahwa <unk> film <unk> <unk> <unk> siap diproduksi dengan judul <unk> <unk>
Generated Summary: sensor mengontrol mengira administrasi mengidentifikasi dipenjara perkotaan beijing kabel sasaran samsung diciptakan menduduki <unk>
----------------------------------------

================================================================================
"""


Catatan: model dilatih melalui file Python modular, sehingga hasil di atas bukan berasal dari eksekusi notebook (.ipynb) ini.