In [2]:
#Q1)
import numpy as np

def scaled_dot_product_attention(Q, K, V):
    """
    Compute scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.

    Accepts plain 2-D matrices as well as arrays with leading batch/head
    dimensions; only the last two axes take part in the matrix products,
    and leading dimensions follow np.matmul broadcasting.

    Args:
        Q: Query array of shape (..., seq_len_q, d_k)
        K: Key array of shape (..., seq_len_k, d_k); must be at least 2-D
        V: Value array of shape (..., seq_len_k, d_v), one row per key

    Returns:
        attention_weights: Softmax-normalized attention weights of shape
            (..., seq_len_q, seq_len_k); every row sums to 1
        context_vector: Weighted sum of values, shape (..., seq_len_q, d_v)
    """
    # Step 1: Compute raw attention scores (QK^T).
    # Swap only the last two axes: K.T would reverse *every* axis and
    # therefore break (or silently corrupt) inputs with batch dimensions.
    scores = np.matmul(Q, np.swapaxes(K, -1, -2))

    # Step 2: Scale by sqrt(d_k) so score variance does not grow with d_k
    d_k = K.shape[-1]
    scaled_scores = scores / np.sqrt(d_k)

    # Step 3: Apply softmax over the key axis to get attention weights.
    # Subtracting the row-wise max leaves the softmax unchanged but keeps
    # np.exp from overflowing on large scores.
    exp_scores = np.exp(scaled_scores - np.max(scaled_scores, axis=-1, keepdims=True))
    attention_weights = exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)

    # Step 4: Compute context vector as weighted sum of V
    context_vector = np.matmul(attention_weights, V)

    return attention_weights, context_vector


# Demo: two one-hot queries attending over two matching one-hot keys
Q = np.eye(2, dtype=int)             # queries -> [[1, 0], [0, 1]]
K = Q.copy()                         # keys    -> [[1, 0], [0, 1]]
V = np.arange(1, 5).reshape(2, 2)    # values  -> [[1, 2], [3, 4]]

# The function returns (weights, context) in that order
attention_result = scaled_dot_product_attention(Q, K, V)
attn_weights, context = attention_result
print("Attention Weights:\n", attn_weights)
print("Context Vector:\n", context)

Attention Weights:
 [[0.66976155 0.33023845]
 [0.33023845 0.66976155]]
Context Vector:
 [[1.6604769 2.6604769]
 [2.3395231 3.3395231]]


In [4]:
#Q2)
import torch
import torch.nn as nn
import torch.nn.functional as F

class TransformerEncoderBlock(nn.Module):
    """
    Single post-norm Transformer encoder block.

    Applies multi-head self-attention followed by a position-wise
    feed-forward network; each sub-layer is wrapped in a residual
    connection and followed by layer normalization.

    Args:
        d_model: Embedding size of every token (input and output width).
        num_heads: Number of attention heads; nn.MultiheadAttention
            requires d_model to be divisible by it.
        ff_hidden: Hidden width of the feed-forward network.
        dropout: Dropout probability used on the attention weights and on
            both sub-layer outputs. The default 0.0 disables dropout.
    """

    def __init__(self, d_model=128, num_heads=8, ff_hidden=512, dropout=0.0):
        super().__init__()

        # Multi-head self-attention over (batch, seq_len, d_model) inputs
        self.self_attn = nn.MultiheadAttention(
            embed_dim=d_model,
            num_heads=num_heads,
            dropout=dropout,
            batch_first=True,
        )

        # Feed-forward network (layer indices kept stable so that existing
        # state_dict keys such as "ffn.0.weight" still load)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, ff_hidden),
            nn.ReLU(),
            nn.Linear(ff_hidden, d_model)
        )

        # Layer normalization, one per sub-layer
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        # Dropout applied to each sub-layer output before the residual add;
        # it holds no parameters, so the state_dict is unchanged
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, attn_mask=None, key_padding_mask=None):
        """
        Run the encoder block.

        Args:
            x: Tensor of shape (batch_size, seq_len, d_model).
            attn_mask: Optional mask forwarded to nn.MultiheadAttention to
                restrict which positions may attend to which.
            key_padding_mask: Optional (batch_size, seq_len) mask forwarded
                to nn.MultiheadAttention; for a boolean mask, True marks
                padding positions that must be ignored as keys.

        Returns:
            Tensor with the same shape as x.
        """
        # --- Multi-head self-attention (Q = K = V = x) ---
        # The attention weights are never used, so skip computing them.
        attn_output, _ = self.self_attn(
            x, x, x,
            attn_mask=attn_mask,
            key_padding_mask=key_padding_mask,
            need_weights=False,
        )
        x = self.norm1(x + self.dropout(attn_output))   # Residual + Norm

        # --- Feed-forward network ---
        ffn_output = self.ffn(x)
        x = self.norm2(x + self.dropout(ffn_output))    # Residual + Norm

        return x


# --- Test the block: output must have the same shape as the input ---
batch_size = 32
seq_len = 10
d_model = 128

# Random input: batch of 32 sentences, each with 10 tokens, embedding size 128
x = torch.randn(batch_size, seq_len, d_model)

# Build the block from the same d_model as the input so the two cannot drift
encoder_block = TransformerEncoderBlock(d_model=d_model, num_heads=8)
output = encoder_block(x)

print("Input shape:", x.shape)
print("Output shape:", output.shape)

# Fail loudly instead of relying on someone eyeballing the printed shapes
assert output.shape == x.shape, "encoder block must preserve the input shape"

Input shape: torch.Size([32, 10, 128])
Output shape: torch.Size([32, 10, 128])
