# Masking - Padding & Causal

In [None]:
# Padding-mask demo (PyTorch)
import torch

# A batch of two token-id sequences, each of length 4; id 0 is the <PAD> token.
input_ids = torch.tensor(
    [
        [1, 2, 3, 4],  # sentence A: no padding needed
        [5, 6, 7, 0],  # sentence B: final slot is <PAD>
    ]
)

# Boolean mask: True = real token, False = padding.
#   [[True, True, True, True ],
#    [True, True, True, False]]
# Indexing with None inserts size-1 axes (same effect as unsqueeze), taking the
# shape from (batch_size, seq_len) to (batch_size, 1, 1, seq_len) so the mask
# broadcasts over the heads axis and the query-position axis of the scores.
padding_mask = input_ids.ne(0)[:, None, None, :]

# Stand-in attention scores with shape
# (batch=2, heads=1, query_len=4, key_len=4).
scores = torch.randn(2, 1, 4, 4)

# Overwrite every key column that belongs to a padding token with -inf
# (the positions where padding_mask is False).
scores = scores.masked_fill(padding_mask.logical_not(), float('-inf'))

print(scores)

tensor([[[[-1.1998,  1.9379,  0.9745,  1.2977],
          [-1.7988, -0.2572, -0.2075,  0.0357],
          [-1.1958,  0.0931, -0.2877, -0.1243],
          [ 1.1242,  0.7390, -0.8619,  1.4292]]],


        [[[ 0.6675, -0.3088,  0.9850,    -inf],
          [-0.5726,  0.8000,  0.2615,    -inf],
          [ 0.1241, -1.2276,  0.6263,    -inf],
          [ 0.2646,  0.6870, -0.2516,    -inf]]]])


In [None]:
# Causal mask

# Sequence length for this demo. No earlier cell defines `max_seq_len`, so on a
# fresh kernel (Restart & Run All) this cell would otherwise fail with a
# NameError. 4 matches the length of the padded sequences in the padding example.
# NOTE(review): change this if a different length is intended.
max_seq_len = 4

L = max_seq_len

# Lower-triangular boolean matrix of shape (L, L): entry [i, j] is True exactly
# when j <= i (on or below the diagonal). Presumably True means "position i may
# look at position j", mirroring the True = keep convention of the padding mask.
causal_mask = torch.tril(torch.ones(L, L, dtype=torch.bool))