In [5]:
# Question 1: Compute Scaled Dot-Product Attention (Python)

import numpy as np

def softmax(x, axis=-1):
    """Softmax along `axis`, computed in a numerically stable way.

    The maximum along `axis` is subtracted before exponentiating, so the
    largest exponent is exp(0) = 1 and large inputs do not overflow.
    The result has the same shape as `x` and sums to 1 along `axis`.
    """
    peak = np.max(x, axis=axis, keepdims=True)
    unnormalised = np.exp(x - peak)
    total = np.sum(unnormalised, axis=axis, keepdims=True)
    return unnormalised / total

def scaled_dot_product_attention(Q, K, V, mask=None):
    """
    Compute scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.

    Q: array-like with shape (..., seq_len_q, d_k)
    K: array-like with shape (..., seq_len_k, d_k)
    V: array-like with shape (..., seq_len_k, d_v)
    mask: optional boolean or 0/1 mask broadcastable to
          (..., seq_len_q, seq_len_k); positions where the mask is
          0/False have their score replaced by -1e9 before the softmax.

    Returns:
        attention_weights: (..., seq_len_q, seq_len_k), softmax over the
                           last axis (each row sums to 1)
        context: (..., seq_len_q, d_v)

    Note: if every key of a query row is masked, all of its scores equal
    -1e9 and the softmax yields uniform weights for that row rather than
    zeros.
    """
    # Accept nested lists/tuples as well as ndarrays; existing ndarrays
    # (and ndarray subclasses) are passed through without copying.
    Q, K, V = np.asanyarray(Q), np.asanyarray(K), np.asanyarray(V)
    d_k = Q.shape[-1]

    # 1) Alignment scores: QK^T
    scores = np.matmul(Q, np.swapaxes(K, -1, -2))  # (..., seq_len_q, seq_len_k)

    # 2) Scale by sqrt(d_k)
    scores = scores / np.sqrt(d_k)

    # 3) Optional mask (e.g., padding or causal)
    if mask is not None:
        # mask == 0 -> masked position; keep the score where mask is truthy
        scores = np.where(mask, scores, -1e9)

    # 4) Softmax over the key axis to get attention weights
    attention_weights = softmax(scores, axis=-1)

    # 5) Weighted sum over values to get context
    context = np.matmul(attention_weights, V)  # (..., seq_len_q, d_v)

    return attention_weights, context


# Demo: unmasked attention on random inputs
if __name__ == "__main__":
    seq_len, d_k = 5, 4

    # Draw Q, K and V (in that order), each of shape (seq_len, d_k).
    Q, K, V = (np.random.randn(seq_len, d_k) for _ in range(3))

    attn_weights, context = scaled_dot_product_attention(Q, K, V)
    print("Attention weights shape:", attn_weights.shape)  # (5, 5)
    print("Context shape:", context.shape)                # (5, 4)


Attention weights shape: (5, 5)
Context shape: (5, 4)


In [4]:
# Question 2: Implement Simple Transformer Encoder Block (PyTorch)
import torch
import torch.nn as nn

class SimpleTransformerEncoderBlock(nn.Module):
    """Single post-norm Transformer encoder block.

    Runs multi-head self-attention followed by a position-wise feed-forward
    network. Each sub-layer output passes through dropout, is added to the
    sub-layer input (residual connection) and is then layer-normalised.
    """

    def __init__(self, d_model=128, n_heads=4, d_ff=512, dropout=0.1):
        """
        d_model: embedding dimension
        n_heads: number of attention heads
        d_ff: hidden dimension of feed-forward network
        dropout: dropout probability applied to each sub-layer output
                 before the residual addition
        """
        super().__init__()

        # Multi-head self-attention layer
        self.self_attn = nn.MultiheadAttention(
            embed_dim=d_model,
            num_heads=n_heads,
            batch_first=True  # input/output shape: (batch, seq_len, d_model)
        )

        # Feed-forward network
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model)
        )

        # LayerNorm + Dropout for both sub-layers
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, attn_mask=None, key_padding_mask=None):
        """
        x: (batch_size, seq_len, d_model)
        attn_mask: optional attention mask of shape (seq_len, seq_len),
                   (batch_size, seq_len, seq_len) (one mask per sample, shared
                   by all heads) or (batch_size * n_heads, seq_len, seq_len)
        key_padding_mask: (batch_size, seq_len) with True for PAD positions

        Returns a tensor with the same shape as x.
        """
        # nn.MultiheadAttention only accepts 3-D masks with one entry per
        # (sample, head) pair, laid out sample-major. Repeat a per-sample mask
        # for every head so the documented (batch, seq_len, seq_len) form works.
        if attn_mask is not None and attn_mask.dim() == 3 and x.dim() == 3:
            n_heads = self.self_attn.num_heads
            if n_heads > 1 and attn_mask.shape[0] == x.shape[0]:
                attn_mask = attn_mask.repeat_interleave(n_heads, dim=0)

        # ----- Sub-layer 1: Multi-head self-attention -----
        attn_output, _ = self.self_attn(
            query=x,
            key=x,
            value=x,
            attn_mask=attn_mask,
            key_padding_mask=key_padding_mask
        )
        # Residual connection + LayerNorm
        x = self.norm1(x + self.dropout1(attn_output))

        # ----- Sub-layer 2: Feed-Forward Network -----
        ffn_output = self.ffn(x)
        # Residual connection + LayerNorm
        x = self.norm2(x + self.dropout2(ffn_output))

        return x


if __name__ == "__main__":
    # Sizes specified by the assignment
    batch_size, seq_len = 32, 10
    d_model, n_heads, d_ff = 128, 4, 512

    # Random stand-in for a batch of 32 sentences with 10 token embeddings each
    x = torch.randn(batch_size, seq_len, d_model)

    encoder_block = SimpleTransformerEncoderBlock(d_model=d_model, n_heads=n_heads, d_ff=d_ff)

    # The block keeps the input shape: (batch, seq_len, d_model)
    out = encoder_block(x)
    print("Output shape:", out.shape)  # Expected: torch.Size([32, 10, 128])


Output shape: torch.Size([32, 10, 128])
