In [46]:
import torch
from transformers import BertTokenizer, BertForMaskedLM

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          cache_dir='/data/bobby/huggingface-cache/models')
model = BertForMaskedLM.from_pretrained('bert-base-uncased',
                                        cache_dir='/data/bobby/huggingface-cache/models')

text = "The color of the sky is [MASK]."

tokens = tokenizer.tokenize(text)
masked_index = tokens.index('[MASK]')
token_ids = tokenizer.convert_tokens_to_ids(tokens)
token_ids[masked_index] = tokenizer.mask_token_id
input_tensor = torch.tensor([token_ids])

print(token_ids)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[1996, 3609, 1997, 1996, 3712, 2003, 103, 1012]


In [47]:
with torch.no_grad():
    outputs = model(input_tensor)

In [48]:
outputs[0].shape

torch.Size([1, 8, 30522])

In [49]:
torch.argmax(outputs[0][0, masked_index]).item()

1012

In [50]:
tokenizer.convert_ids_to_tokens(torch.argmax(outputs[0][0, masked_index]).item())

'.'

In [51]:
masked_index

6

In [52]:
predictions = outputs[0]
predicted_index = torch.argmax(predictions[0, masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]

In [53]:
predicted_token

'.'

In [60]:
tokenizer.convert_ids_to_tokens(-33434343434)

'[UNK]'

In [1]:
import torch

torch.randint(100, (5,))

tensor([68, 71, 61, 43, 66])

In [17]:
torch.randint(0, 2, (10,))

tensor([0, 1, 0, 1, 1, 0, 1, 0, 0, 1])

In [2]:
conda install conda-forge::transformers

Channels:
 - defaults
 - conda-forge
 - nvidia
 - pytorch
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/bobby/miniconda3/envs/smallville

  added / updated specs:
    - conda-forge::transformers


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    datasets-2.19.1            |     pyhd8ed1ab_0         350 KB  conda-forge
    huggingface_hub-0.23.0     |     pyhd8ed1ab_0         244 KB  conda-forge
    multiprocess-0.70.15       |  py311h06a4308_0         342 KB
    pyarrow-hotfix-0.6         |     pyhd8ed1ab_0          13 KB  conda-forge
    python-xxhash-2.0.2        |  py311h5eee18b_1          20 KB
    safetensors-0.4.2          |  py311h24d97f6_0         1.1 MB
    tokenizers-0.19.1          |  py311h6640629_0         2.6 MB  conda-forge
    transformers-4.40.2        |     pyhd8ed1ab_0   

In [82]:
import torch
from torch.utils.data import Dataset, DataLoader
import random
import numpy as np
from transformers import GPT2Tokenizer

class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_len=128, mlm_prob=0.15):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.mlm_prob = mlm_prob

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        tokens = self.tokenizer.encode(text, add_special_tokens=True, max_length=self.max_len, truncation=True)
        
        input_ids = tokens
        attention_mask = [1] * len(input_ids)
        labels = input_ids.copy()

        for i in range(len(input_ids)):
            if random.random() < self.mlm_prob:
                input_ids[i] = self.tokenizer.mask_token_id

        padding_length = self.max_len - len(input_ids)
        input_ids = input_ids + ([self.tokenizer.pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)
        labels = labels + ([-100] * padding_length)

        return torch.tensor(input_ids), torch.tensor(attention_mask), torch.tensor(labels)

# Example usage
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({'mask_token': '[MASK]', 'pad_token': '[PAD]'})
texts = ["There is a cow with black and white spots which I believe is the cousin of a zebra.", "I think it is amazing to see Roger Federer at his peak once in my lifetime."]  # Replace with your dataset
dataset = TextDataset(texts, tokenizer)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)



In [85]:
text = dataset.texts[0]
tokens = dataset.tokenizer.encode(text, add_special_tokens=True, max_length=128, truncation=True)

input_ids = tokens
attention_mask = [1] * len(input_ids)
labels = input_ids.copy()

for i in range(len(input_ids)):
    if random.random() < 0.15:
        input_ids[i] = dataset.tokenizer.mask_token_id

In [86]:
input_ids

[1858,
 318,
 257,
 9875,
 50257,
 2042,
 290,
 2330,
 10222,
 543,
 314,
 1975,
 318,
 262,
 16933,
 286,
 257,
 1976,
 37052,
 13]

In [87]:
padding_length = dataset.max_len - len(input_ids)
input_ids = input_ids + ([dataset.tokenizer.pad_token_id] * padding_length)
attention_mask = attention_mask + ([0] * padding_length)
labels = labels + ([-100] * padding_length)

In [88]:
input_ids

[1858,
 318,
 257,
 9875,
 50257,
 2042,
 290,
 2330,
 10222,
 543,
 314,
 1975,
 318,
 262,
 16933,
 286,
 257,
 1976,
 37052,
 13,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258,
 50258]

In [99]:
dataset.__getitem__(0)

(tensor([ 1858, 50257,   257,  9875,   351,  2042,   290,  2330, 10222, 50257,
           314, 50257,   318, 50257, 50257,   286, 50257,  1976, 37052,    13,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 5

In [103]:
for input_ids, attention_mask, labels in dataloader:
    print(input_ids.shape, attention_mask.shape, labels.shape)

torch.Size([2, 128]) torch.Size([2, 128]) torch.Size([2, 128])


In [138]:
len(dataset.tokenizer)

50259

In [59]:
dataset.tokenizer.add_special_tokens({'mask_token': '[MASK]'})

0

In [61]:
dataset.tokenizer.convert_tokens_to_ids(dataset.tokenizer.mask_token)

50257

In [58]:
dataset.__getitem__(0)

RuntimeError: Could not infer dtype of NoneType

In [62]:
from transformers import GPT2Tokenizer

# Load the GPT2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Add a mask token to the tokenizer
tokenizer.add_special_tokens({'mask_token': '[MASK]'})

# Verify the mask token ID
mask_token_id = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
print(f'Mask token ID: {mask_token_id}')



Mask token ID: 50257


In [64]:
tokenizer.mask_token_id

50257

In [94]:
input_ids.shape

torch.Size([2, 128])

In [93]:
attention_mask.shape

torch.Size([2, 128])

In [106]:
for input_ids, attention_mask, labels in dataloader:
    model(input_ids, attention_mask)

IndexError: index out of range in self

In [120]:
batch_size, seq_len = input_ids.size()
pos_enc = model.positional_encoding[:,:seq_len].to(input_ids.device)

In [125]:
input_ids.size()

torch.Size([2, 128])

In [121]:
pos_enc.size()

torch.Size([128, 1, 768])

In [128]:
input_ids.size()

torch.Size([2, 128])

In [143]:
model.embedding(input_ids).shape

torch.Size([2, 128, 768])

In [137]:
dataset.tokenizer.pad_token_id

50258

In [183]:
pos_enc = torch.zeros(max_seq_len, d_model)
pos_enc.size()

torch.Size([128, 768])

In [184]:
position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
position.size()

torch.Size([128, 1])

In [185]:
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
div_term.size()

torch.Size([384])

In [186]:
torch.sin(position * div_term).size()

torch.Size([128, 384])

In [194]:
pos_enc[:, 0::2] = torch.sin(position * div_term)


In [195]:
pos_enc[:, 1::2] = torch.cos(position * div_term)


In [196]:
pos_enc.size()

torch.Size([128, 768])

In [203]:
pos_enc.unsqueeze(0).size()

torch.Size([1, 128, 768])

In [205]:
res = model.embedding(input_ids) + pos_enc.unsqueeze(0)

In [206]:
res.size()

torch.Size([2, 128, 768])

In [219]:
pos_enc.unsqueeze(0)[:, :, :].size()

torch.Size([1, 4, 768])

In [227]:
pos_enc1 = model.positional_encoding[:, :seq_len, :]
embedded1 = model.embedding(input_ids) + pos_enc1
embedded1 = model.dropout(embedded1)



In [231]:
mask = (torch.triu(torch.ones(seq_len, seq_len)) == 1).transpose(0, 1)
mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
mask.size()

torch.Size([128, 128])

In [245]:
def generate_square_subsequent_mask(sz):
    mask = torch.tril(torch.ones(sz, sz)) == 1
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

tgt_mask = generate_square_subsequent_mask(input_ids.size(1))
tgt_mask

tensor([[0., -inf, -inf,  ..., -inf, -inf, -inf],
        [0., 0., -inf,  ..., -inf, -inf, -inf],
        [0., 0., 0.,  ..., -inf, -inf, -inf],
        ...,
        [0., 0., 0.,  ..., 0., -inf, -inf],
        [0., 0., 0.,  ..., 0., 0., -inf],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [252]:
tgt_key_padding_mask = (input_ids == dataset.tokenizer.pad_token_id)
tgt_key_padding_mask.shape

torch.Size([2, 128])

In [285]:
tgt_key_padding_mask

tensor([[False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  

In [284]:
tgt_key_padding_mask_expanded = tgt_key_padding_mask.unsqueeze(1)  # (batch_size, 1, 1, seq_len)
tgt_key_padding_mask_expanded = tgt_key_padding_mask_expanded.expand(-1, input_ids.size(1), -1, -1)  # (batch_size, seq_len, 1, seq_len)
tgt_key_padding_mask_expanded = tgt_key_padding_mask_expanded.transpose(1, 2)  # (batch_size, seq_len, seq_len)

RuntimeError: The expanded size of the tensor (128) must match the existing size (2) at non-singleton dimension 1.  Target sizes: [-1, 128, -1, -1].  Tensor sizes: [2, 1, 128]

In [262]:
tgt_key_padding_mask_expanded.size()

torch.Size([2, 1, 128, 128])

In [274]:
tgt_key_padding_mask_expanded[0][0]

tensor([[False, False, False,  ...,  True,  True,  True],
        [False, False, False,  ...,  True,  True,  True],
        [False, False, False,  ...,  True,  True,  True],
        ...,
        [False, False, False,  ...,  True,  True,  True],
        [False, False, False,  ...,  True,  True,  True],
        [False, False, False,  ...,  True,  True,  True]])

In [267]:
tgt_mask = generate_square_subsequent_mask(input_ids.size(1))
tgt_mask.size()

torch.Size([128, 128])

In [259]:
attention_mask

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]])

In [263]:
Q = torch.randn(batch_size, seq_len, d_model)
K = torch.randn(batch_size, seq_len, d_model)
V = torch.randn(batch_size, seq_len, d_model)

In [279]:
attn_scores = Q @ K.transpose(-2, -1) / torch.sqrt(torch.tensor(d_model).float())
attn_scores.size()

torch.Size([2, 128, 128])

In [280]:
attn_scores = attn_scores.masked_fill(tgt_mask == float('-inf'), float('-inf'))
attn_scores.size()

torch.Size([2, 128, 128])

In [283]:
attn_scores.masked_fill(tgt_key_padding_mask_expanded == 1, float('-inf'))

tensor([[[[ 0.4163,    -inf,    -inf,  ...,    -inf,    -inf,    -inf],
          [-0.3875,  0.7326,    -inf,  ...,    -inf,    -inf,    -inf],
          [-1.3091, -0.5916, -1.1091,  ...,    -inf,    -inf,    -inf],
          ...,
          [ 0.9462, -2.0856,  0.2904,  ...,    -inf,    -inf,    -inf],
          [-0.7204, -0.7541, -0.6241,  ...,    -inf,    -inf,    -inf],
          [-0.6889, -0.0499, -1.3547,  ...,    -inf,    -inf,    -inf]],

         [[-0.5757,    -inf,    -inf,  ...,    -inf,    -inf,    -inf],
          [ 1.9035,  0.8974,    -inf,  ...,    -inf,    -inf,    -inf],
          [-0.6239, -0.7618, -0.1069,  ...,    -inf,    -inf,    -inf],
          ...,
          [ 0.7042,  1.2454, -1.8217,  ...,    -inf,    -inf,    -inf],
          [ 0.8925,  0.5226,  0.1747,  ...,    -inf,    -inf,    -inf],
          [-1.8551,  0.0817,  0.7889,  ...,    -inf,    -inf,    -inf]]],


        [[[ 0.4163,    -inf,    -inf,  ...,    -inf,    -inf,    -inf],
          [-0.3875,  0.7326,

In [220]:
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerDecoder, TransformerDecoderLayer

class GPT2Decoder(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_layers, dim_feedforward, max_seq_len):
        super(GPT2Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = self._generate_positional_encoding(max_seq_len, d_model)
        self.decoder_layers = TransformerDecoderLayer(d_model, nhead, dim_feedforward)
        self.transformer_decoder = TransformerDecoder(self.decoder_layers, num_layers)
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.dropout = nn.Dropout(0.1)

    def _generate_positional_encoding(self, max_seq_len, d_model):
        pos_enc = torch.zeros(max_seq_len, d_model) # T, N
        position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1) # T, 1
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model)) # D/2
        pos_enc[:, 0::2] = torch.sin(position * div_term) # T, D/2
        pos_enc[:, 1::2] = torch.cos(position * div_term) # T, D/2
        pos_enc = pos_enc.unsqueeze(0)#.transpose(0, 1) 
        return pos_enc

    def forward(self, input_ids, attention_mask=None):
        batch_size, seq_len = input_ids.size()
        pos_enc = self.positional_encoding[:, :seq_len, :].to(input_ids.device)

        embedded = self.embedding(input_ids) + pos_enc
        embedded = self.dropout(embedded)

        tgt_mask = self._generate_square_subsequent_mask(seq_len).to(input_ids.device)

        memory = torch.zeros(seq_len, batch_size, embedded.size(-1)).to(input_ids.device)
        output = self.transformer_decoder(embedded, memory, tgt_mask=tgt_mask, tgt_key_padding_mask=attention_mask)
        output = self.fc_out(output)

        return output

    def _generate_square_subsequent_mask(self, size):
        mask = (torch.triu(torch.ones(size, size)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

In [221]:
import torch.optim as optim

# Hyperparameters
vocab_size = len(tokenizer)
d_model = 768  # Dimension of the model
nhead = 12  # Number of attention heads
num_layers = 12  # Number of transformer layers
dim_feedforward = 3072  # Dimension of the feedforward layer
max_seq_len = 128  # Maximum sequence length
lr = 5e-5
num_epochs = 3

# Initialize model, optimizer, and loss function
model = GPT2Decoder(vocab_size, d_model, nhead, num_layers, dim_feedforward, max_seq_len)
optimizer = optim.Adam(model.parameters(), lr=lr)
loss_fn = nn.CrossEntropyLoss(ignore_index=-100)

# Training loop
for epoch in range(num_epochs):
    model.train()
    for input_ids, attention_mask, labels in dataloader:
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs.view(-1, vocab_size), labels.view(-1))
        loss.backward()
        optimizer.step()
        print(f"Epoch: {epoch}, Loss: {loss.item()}")

AssertionError: only bool and floating types of key_padding_mask are supported

In [412]:
import torch

# Example input sequences (padded to the same length)
testtest = torch.tensor([
    [101, 102, 103, 0, 0, 0, 0, 0, 0, 0],
    [201, 202, 203, 204, 205, 206, 207, 208, 209, 0]
], dtype=torch.long)

padding_mask = (testtest == 0).unsqueeze(1)  # (batch_size, 1, 1, seq_len)
padding_mask = padding_mask.float().masked_fill(padding_mask == 1, float('-inf')).masked_fill(padding_mask == 0, float(0.0))


In [414]:
padding_mask.size()

torch.Size([2, 1, 10])

In [415]:
padding_mask

tensor([[[0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., -inf]]])

In [416]:
def generate_square_subsequent_mask(sz):
    mask = torch.triu(torch.ones(sz, sz), diagonal=1)
    mask = mask.masked_fill(mask == 1, float('-inf')).masked_fill(mask == 0, float(0.0))
    return mask

seq_len = testtest.size(1)
causal_mask = generate_square_subsequent_mask(seq_len)

In [417]:
causal_mask.size()

torch.Size([10, 10])

In [418]:
causal_mask

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [432]:
combined_mask = padding_mask + causal_mask.unsqueeze(0)
combined_mask = combined_mask.squeeze(1)

In [433]:
combined_mask.size()

torch.Size([2, 10, 10])

In [434]:
combined_mask

tensor([[[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf]],

        [[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf],
         [0., 0.,

In [435]:
d_model = 64
q = torch.rand((2, seq_len, d_model))
k = torch.rand((2, seq_len, d_model))
v = torch.rand((2, seq_len, d_model))

In [436]:
q.size()

torch.Size([2, 10, 64])

In [437]:
d_k = q.size(-1)
scores = torch.matmul(q, k.transpose(-2, -1)) / d_k**0.5

In [438]:
scores.size()

torch.Size([2, 10, 10])

In [439]:
combined_mask.size()

torch.Size([2, 10, 10])

In [440]:
scores.size()

torch.Size([2, 10, 10])

In [441]:
combined_mask

tensor([[[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf]],

        [[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf],
         [0., 0.,

In [442]:
scores.size()

torch.Size([2, 10, 10])

In [443]:
scores.masked_fill(combined_mask == float('-inf'), float('-inf'))[0]

tensor([[1.6500,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,
           -inf],
        [2.1195, 2.3881,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,
           -inf],
        [1.9112, 2.1823, 2.0466,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,
           -inf],
        [1.6999, 2.0856, 1.9427,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,
           -inf],
        [1.9259, 2.3343, 2.2656,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,
           -inf],
        [2.1642, 2.4074, 2.2990,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,
           -inf],
        [1.9403, 2.3798, 2.2270,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,
           -inf],
        [2.0873, 2.3281, 2.1804,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,
           -inf],
        [1.9961, 2.4710, 2.3000,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,
           -inf],
        [1.7350, 2.1518, 1.9802,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,
           -inf]])

In [459]:
import torch

# Example input sequences (padded to the same length)
testtest = torch.tensor([
    [101, 102, 103, 0, 0, 0, 0, 0, 0, 0],
    [201, 202, 203, 204, 205, 206, 207, 208, 209, 0]
], dtype=torch.long)

padding_mask = (testtest == 0)
padding_mask = padding_mask.float().masked_fill(padding_mask == 1, float('-inf')).masked_fill(padding_mask == 0, float(0.0))
padding_mask

tensor([[0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., -inf]])

In [460]:
padding_mask.size()

torch.Size([2, 10])

In [461]:
padding_mask.view(2, 1, 1, 10).expand(-1, 2, -1, -1)

tensor([[[[0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf]],

         [[0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf]]],


        [[[0., 0., 0., 0., 0., 0., 0., 0., 0., -inf]],

         [[0., 0., 0., 0., 0., 0., 0., 0., 0., -inf]]]])

In [462]:
def generate_square_subsequent_mask(sz):
    mask = torch.triu(torch.ones(sz, sz), diagonal=1)
    mask = mask.masked_fill(mask == 1, float('-inf')).masked_fill(mask == 0, float(0.0))
    return mask

seq_len = testtest.size(1)
causal_mask = generate_square_subsequent_mask(seq_len)
causal_mask

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [463]:
causal_mask.view(1,1,10,10).expand(2, 2, -1, -1)

tensor([[[[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
          [0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
          [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
          [0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
          [0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf],
          [0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf],
          [0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf],
          [0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf],
          [0., 0., 0., 0., 0., 0., 0., 0., 0., -inf],
          [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

         [[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
          [0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
          [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
          [0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
          [0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf],
          [0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf]

In [464]:
a = causal_mask.view(1,1,10,10).expand(2, 2, -1, -1)
b = padding_mask.view(2, 1, 1, 10).expand(-1, 2, -1, -1)

a + b

tensor([[[[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
          [0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
          [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
          [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
          [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
          [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
          [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
          [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
          [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
          [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf]],

         [[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
          [0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
          [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
          [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
          [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -in

In [470]:
import torch
import math

# Example input sequences (batch size of 1 for simplicity, actual text length 10, padded to 30)
input_ids = torch.tensor([
    [101, 102, 103, 0, 0, 0, 0] # B by T
], dtype=torch.long)

# Create the padding mask
padding_mask = (input_ids != 0).unsqueeze(1).unsqueeze(2)  # (batch_size, 1, 1, seq_len)
padding_mask = padding_mask.float().masked_fill(padding_mask == 0, float('-inf')).masked_fill(padding_mask == 1, float(0.0))

# Create the causal mask
seq_len = input_ids.size(1)
causal_mask = torch.triu(torch.ones((seq_len, seq_len)), diagonal=1).bool()
causal_mask = causal_mask.float().masked_fill(causal_mask, float('-inf')).masked_fill(causal_mask == 0, float(0.0))

# Combine the masks
attention_mask = padding_mask + causal_mask.unsqueeze(0)  # (batch_size, seq_len, seq_len)
attention_mask = attention_mask.squeeze(1)  # (batch_size, seq_len, seq_len)

print("Attention mask shape:", attention_mask.shape)
print("Attention mask:")
print(attention_mask)

Attention mask shape: torch.Size([1, 7, 7])
Attention mask:
tensor([[[0., -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., -inf, -inf, -inf, -inf],
         [0., 0., 0., -inf, -inf, -inf, -inf],
         [0., 0., 0., -inf, -inf, -inf, -inf],
         [0., 0., 0., -inf, -inf, -inf, -inf],
         [0., 0., 0., -inf, -inf, -inf, -inf]]])


In [473]:
Q = torch.randn(1,7,3) # B by T by Hs
K = torch.randn(1,7,3) # B by T by Hs
V = torch.randn(1,7,3) # B by T by Hs

d_k = Q.size(-1)

scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(d_k)

In [484]:
wei = scores.masked_fill(attention_mask == float('-inf'), float('-inf'))
wei = torch.nn.functional.softmax(wei, dim=-1)
wei


tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0542, 0.9458, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2653, 0.0388, 0.6959, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1499, 0.7010, 0.1491, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2272, 0.4889, 0.2839, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.6064, 0.1371, 0.2565, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4870, 0.1639, 0.3491, 0.0000, 0.0000, 0.0000, 0.0000]]])

In [488]:
torch.matmul(wei, V)

tensor([[[ 0.0192, -0.5029,  0.0329],
         [ 0.2423,  0.6938,  0.0231],
         [-0.6536, -0.9839, -1.7408],
         [ 0.0384,  0.2705, -0.3544],
         [-0.1437, -0.1006, -0.6956],
         [-0.1998, -0.5248, -0.6222],
         [-0.2842, -0.5614, -0.8584]]])

In [490]:
import torch
import math

# Example input sequences (batch size of 1 for simplicity, actual text length 10, padded to 30)
input_ids = torch.tensor([
    [101, 102, 103, 104, 0, 0, 0, 0, 0, 0]
], dtype=torch.long)

# Create the padding mask
padding_mask = (input_ids != 0).unsqueeze(1).unsqueeze(2)  # (batch_size, 1, 1, seq_len)
padding_mask = padding_mask.float().masked_fill(padding_mask == 0, float('-inf')).masked_fill(padding_mask == 1, float(0.0))

# Create the causal mask
seq_len = input_ids.size(1)
causal_mask = torch.triu(torch.ones((seq_len, seq_len)), diagonal=1).bool()
causal_mask = causal_mask.float().masked_fill(causal_mask, float('-inf')).masked_fill(causal_mask == 0, float(0.0))

# Combine the masks
attention_mask = padding_mask + causal_mask.unsqueeze(0)  # (batch_size, seq_len, seq_len)
attention_mask = attention_mask.squeeze(1)  # (batch_size, seq_len, seq_len)

print("Attention mask shape:", attention_mask.shape)
print("Attention mask:")
print(attention_mask)

Attention mask shape: torch.Size([1, 10, 10])
Attention mask:
tensor([[[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf]]])


In [491]:
padding_mask

tensor([[[[0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf]]]])

In [494]:
import torch

# Example input sequences (batch size of 1 for simplicity, actual text length 10, padded to 30)
input_ids = torch.tensor([
    [101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
], dtype=torch.long)

# Create the padding mask (1 for actual tokens, 0 for padding)
padding_mask = (input_ids != 0).unsqueeze(1).unsqueeze(2)  # (batch_size, 1, 1, seq_len)

# Create the causal mask (ensure each token can only attend to previous tokens)
seq_len = input_ids.size(1)
causal_mask = torch.triu(torch.ones((seq_len, seq_len)), diagonal=1)  # Upper triangular matrix with diagonal=1
causal_mask = causal_mask == 1

# Combine padding and causal masks
combined_mask = padding_mask & (~causal_mask.unsqueeze(0))  # (batch_size, seq_len, seq_len)

# Convert boolean mask to float with appropriate values for attention mechanism
combined_mask = combined_mask.float().masked_fill(combined_mask == 0, float('-inf')).masked_fill(combined_mask == 1, float(0.0))

print("Attention mask shape:", combined_mask.shape)
print("Attention mask:")
print(combined_mask)


Attention mask shape: torch.Size([1, 1, 30, 30])
Attention mask:
tensor([[[[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
           -inf, -inf, -inf, -inf, -inf, -inf, -inf],
          [0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
           -inf, -inf, -inf, -inf, -inf, -inf, -inf],
          [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
           -inf, -inf, -inf, -inf, -inf, -inf, -inf],
          [0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
           -inf, -inf, -inf, -inf, -inf, -inf, -inf],
          [0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
    

In [496]:
import torch

# Example input: batch of sentences with padding
input_sequences = torch.tensor([
    [1, 2, 3, 4, 0],  # First sentence padded with 0
    [1, 2, 3, 0, 0]   # Second sentence padded with 0
])

# Create the padding mask (1 for padding, 0 for non-padding)
padding_mask = (input_sequences == 0)

# Create the causal mask (lower triangular matrix)
seq_len = input_sequences.size(1)
causal_mask = torch.tril(torch.ones(seq_len, seq_len)).unsqueeze(0).unsqueeze(0)  # Shape (1, 1, seq_len, seq_len)

# Expand the padding mask (batch_size, 1, 1, seq_len)
expanded_padding_mask = padding_mask.unsqueeze(1).unsqueeze(2)

# Combine the padding mask with the causal mask
combined_mask = expanded_padding_mask | (causal_mask == 0)  # Logical OR to combine both masks

# Example attention scores (batch_size, num_heads, seq_len, seq_len)
attention_scores = torch.randn(2, 1, 5, 5)  # Example random scores

# Apply the combined mask by adding a large negative value to the masked positions
attention_scores = attention_scores.masked_fill(combined_mask, float('-inf'))

# Compute the attention weights using softmax
attention_weights = torch.nn.functional.softmax(attention_scores, dim=-1)

print("Attention Scores after Masking:")
print(attention_scores)

print("Attention Weights after Softmax:")
print(attention_weights)


Attention Scores after Masking:
tensor([[[[ 0.5632,    -inf,    -inf,    -inf,    -inf],
          [ 1.7011,  0.5329,    -inf,    -inf,    -inf],
          [ 0.0230,  0.7564,  0.9250,    -inf,    -inf],
          [-0.6340, -1.1890, -1.0161,  0.1602,    -inf],
          [-1.0271, -0.4392, -2.1845,  0.4993,    -inf]]],


        [[[-1.9756,    -inf,    -inf,    -inf,    -inf],
          [ 1.4239,  0.7439,    -inf,    -inf,    -inf],
          [-0.5729,  0.3491,  0.3352,    -inf,    -inf],
          [ 0.9252,  0.8574, -1.3857,    -inf,    -inf],
          [ 0.7033, -0.5792, -1.4517,    -inf,    -inf]]]])
Attention Weights after Softmax:
tensor([[[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
          [0.7628, 0.2372, 0.0000, 0.0000, 0.0000],
          [0.1803, 0.3754, 0.4443, 0.0000, 0.0000],
          [0.2238, 0.1284, 0.1527, 0.4951, 0.0000],
          [0.1296, 0.2333, 0.0407, 0.5964, 0.0000]]],


        [[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
          [0.6637, 0.3363, 0.0000, 0.0000,

In [497]:
import torch
import torch.nn.functional as F

# Parameters
batch_size = 2
seq_len = 5
embed_dim = 6  # Embedding dimension (must be divisible by num_heads)
num_heads = 3
head_dim = embed_dim // num_heads

# Generate random input sequences (batch_size, seq_len, embed_dim)
input_sequences = torch.randn(batch_size, seq_len, embed_dim)

# Create query, key, and value matrices using random weights
W_q = torch.randn(embed_dim, embed_dim)
W_k = torch.randn(embed_dim, embed_dim)
W_v = torch.randn(embed_dim, embed_dim)

# Compute Q, K, V matrices (batch_size, seq_len, embed_dim)
Q = torch.matmul(input_sequences, W_q)
K = torch.matmul(input_sequences, W_k)
V = torch.matmul(input_sequences, W_v)

# Reshape Q, K, V for multi-head attention (batch_size, num_heads, seq_len, head_dim)
Q = Q.view(batch_size, seq_len, num_heads, head_dim).transpose(1, 2)
K = K.view(batch_size, seq_len, num_heads, head_dim).transpose(1, 2)
V = V.view(batch_size, seq_len, num_heads, head_dim).transpose(1, 2)

# Compute scaled dot-product attention scores (batch_size, num_heads, seq_len, seq_len)
scores = torch.matmul(Q, K.transpose(-2, -1)) / (head_dim ** 0.5)

# Create causal mask (lower triangular matrix)
causal_mask = torch.tril(torch.ones(seq_len, seq_len)).unsqueeze(0).unsqueeze(0).expand(batch_size, num_heads, -1, -1)

# Apply the causal mask to the scores
scores = scores.masked_fill(causal_mask == 0, -1e9)

# Compute the attention weights using softmax (batch_size, num_heads, seq_len, seq_len)
attention_weights = F.softmax(scores, dim=-1)

# Compute the attention output (batch_size, num_heads, seq_len, head_dim)
attention_output = torch.matmul(attention_weights, V)

# Reshape the attention output back to (batch_size, seq_len, embed_dim)
attention_output = attention_output.transpose(1, 2).contiguous().view(batch_size, seq_len, embed_dim)

print("Attention Weights:")
print(attention_weights)

print("Attention Output:")
print(attention_output)


Attention Weights:
tensor([[[[1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
          [9.9999e-01, 1.2551e-05, 0.0000e+00, 0.0000e+00, 0.0000e+00],
          [2.1211e-03, 9.5765e-01, 4.0228e-02, 0.0000e+00, 0.0000e+00],
          [2.2407e-05, 8.3682e-14, 6.6462e-12, 9.9998e-01, 0.0000e+00],
          [2.8387e-02, 1.2374e-05, 1.9822e-05, 9.7136e-01, 2.1832e-04]],

         [[1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
          [9.9738e-01, 2.6188e-03, 0.0000e+00, 0.0000e+00, 0.0000e+00],
          [9.0122e-03, 8.8732e-01, 1.0366e-01, 0.0000e+00, 0.0000e+00],
          [9.4203e-03, 2.0469e-06, 9.4566e-04, 9.8963e-01, 0.0000e+00],
          [3.0034e-01, 1.5814e-02, 5.8290e-02, 1.6572e-01, 4.5983e-01]],

         [[1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
          [7.5320e-03, 9.9247e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00],
          [8.2971e-01, 1.6469e-01, 5.6055e-03, 0.0000e+00, 0.0000e+00],
          [4.4627e-04, 2.6121e-02, 9.5816

In [501]:
attention_output.shape

torch.Size([2, 5, 6])

In [499]:
attention_output

tensor([[[ 2.9615,  0.2408,  2.6629,  0.4457,  0.3885,  4.4093],
         [ 2.9614,  0.2408,  2.6561,  0.4451,  1.2583,  3.3029],
         [-3.8812,  1.0462,  0.0111,  0.2759,  0.5285,  4.1860],
         [-0.0968, -3.1557, -1.4983, -0.0788, -0.3385, -2.3922],
         [-0.0099, -3.0596, -0.1857,  0.2418,  0.1468,  1.6252]],

        [[-0.5718, -2.2407, -2.3742, -0.0627,  0.3650, -0.9575],
         [ 0.3937, -4.3949, -2.1308, -0.0770,  0.3651, -0.9573],
         [ 0.8625, -1.9300, -0.6167,  0.4950,  0.5573, -0.3508],
         [ 0.9755, -2.2770, -0.1750,  0.3678, -0.3142, -0.2531],
         [ 0.0320,  0.0244,  0.3458,  0.1573,  0.2621,  0.5403]]])

In [502]:
attention_weights.shape

torch.Size([2, 3, 5, 5])

In [500]:
attention_weights

tensor([[[[1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
          [9.9999e-01, 1.2551e-05, 0.0000e+00, 0.0000e+00, 0.0000e+00],
          [2.1211e-03, 9.5765e-01, 4.0228e-02, 0.0000e+00, 0.0000e+00],
          [2.2407e-05, 8.3682e-14, 6.6462e-12, 9.9998e-01, 0.0000e+00],
          [2.8387e-02, 1.2374e-05, 1.9822e-05, 9.7136e-01, 2.1832e-04]],

         [[1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
          [9.9738e-01, 2.6188e-03, 0.0000e+00, 0.0000e+00, 0.0000e+00],
          [9.0122e-03, 8.8732e-01, 1.0366e-01, 0.0000e+00, 0.0000e+00],
          [9.4203e-03, 2.0469e-06, 9.4566e-04, 9.8963e-01, 0.0000e+00],
          [3.0034e-01, 1.5814e-02, 5.8290e-02, 1.6572e-01, 4.5983e-01]],

         [[1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
          [7.5320e-03, 9.9247e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00],
          [8.2971e-01, 1.6469e-01, 5.6055e-03, 0.0000e+00, 0.0000e+00],
          [4.4627e-04, 2.6121e-02, 9.5816e-01, 1.5272e-02, 0

In [503]:
import torch
import torch.nn.functional as F

# Parameters
batch_size = 2
seq_len = 5
embed_dim = 6  # Embedding dimension (must be divisible by num_heads)
num_heads = 3
head_dim = embed_dim // num_heads

# Generate input sequences with padding (0 represents padding)
input_sequences = torch.tensor([
    [1, 2, 3, 4, 0],  # First sentence with padding
    [1, 2, 3, 0, 0]   # Second sentence with more padding
], dtype=torch.float).unsqueeze(-1).repeat(1, 1, embed_dim)

# Create a padding mask (1 for padding, 0 for non-padding)
padding_mask = (input_sequences[:, :, 0] == 0)

# Create query, key, and value matrices using random weights
W_q = torch.randn(embed_dim, embed_dim)
W_k = torch.randn(embed_dim, embed_dim)
W_v = torch.randn(embed_dim, embed_dim)

# Compute Q, K, V matrices (batch_size, seq_len, embed_dim)
Q = torch.matmul(input_sequences, W_q)
K = torch.matmul(input_sequences, W_k)
V = torch.matmul(input_sequences, W_v)

# Reshape Q, K, V for multi-head attention (batch_size, num_heads, seq_len, head_dim)
Q = Q.view(batch_size, seq_len, num_heads, head_dim).transpose(1, 2)
K = K.view(batch_size, seq_len, num_heads, head_dim).transpose(1, 2)
V = V.view(batch_size, seq_len, num_heads, head_dim).transpose(1, 2)

# Compute scaled dot-product attention scores (batch_size, num_heads, seq_len, seq_len)
scores = torch.matmul(Q, K.transpose(-2, -1)) / (head_dim ** 0.5)

# Create causal mask (lower triangular matrix)
causal_mask = torch.tril(torch.ones(seq_len, seq_len)).unsqueeze(0).unsqueeze(0).expand(batch_size, num_heads, -1, -1)

# Expand padding mask (batch_size, 1, 1, seq_len) and combine with causal mask
expanded_padding_mask = padding_mask.unsqueeze(1).unsqueeze(2).expand(batch_size, num_heads, seq_len, seq_len)
combined_mask = expanded_padding_mask | (causal_mask == 0)  # Logical OR to combine both masks

# Apply the combined mask to the scores
scores = scores.masked_fill(combined_mask, -1e9)

# Compute the attention weights using softmax (batch_size, num_heads, seq_len, seq_len)
attention_weights = F.softmax(scores, dim=-1)

# Compute the attention output (batch_size, num_heads, seq_len, head_dim)
attention_output = torch.matmul(attention_weights, V)

# Reshape the attention output back to (batch_size, seq_len, embed_dim)
attention_output = attention_output.transpose(1, 2).contiguous().view(batch_size, seq_len, embed_dim)

print("Attention Weights:")
print(attention_weights)

print("Attention Output:")
print(attention_output)


Attention Weights:
tensor([[[[1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
          [8.6617e-01, 1.3383e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00],
          [9.3948e-01, 5.7055e-02, 3.4649e-03, 0.0000e+00, 0.0000e+00],
          [9.7613e-01, 2.3301e-02, 5.5622e-04, 1.3278e-05, 0.0000e+00],
          [2.5000e-01, 2.5000e-01, 2.5000e-01, 2.5000e-01, 0.0000e+00]],

         [[1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
          [7.1849e-01, 2.8151e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00],
          [7.6606e-01, 1.8787e-01, 4.6074e-02, 0.0000e+00, 0.0000e+00],
          [8.4696e-01, 1.3002e-01, 1.9959e-02, 3.0638e-03, 0.0000e+00],
          [2.5000e-01, 2.5000e-01, 2.5000e-01, 2.5000e-01, 0.0000e+00]],

         [[1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
          [9.8688e-01, 1.3124e-02, 0.0000e+00, 0.0000e+00, 0.0000e+00],
          [9.9847e-01, 1.5313e-03, 2.3484e-06, 0.0000e+00, 0.0000e+00],
          [9.9982e-01, 1.7683e-04, 3.1273

In [504]:
attention_weights.shape

torch.Size([2, 3, 5, 5])

In [505]:
attention_weights

tensor([[[[1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
          [8.6617e-01, 1.3383e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00],
          [9.3948e-01, 5.7055e-02, 3.4649e-03, 0.0000e+00, 0.0000e+00],
          [9.7613e-01, 2.3301e-02, 5.5622e-04, 1.3278e-05, 0.0000e+00],
          [2.5000e-01, 2.5000e-01, 2.5000e-01, 2.5000e-01, 0.0000e+00]],

         [[1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
          [7.1849e-01, 2.8151e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00],
          [7.6606e-01, 1.8787e-01, 4.6074e-02, 0.0000e+00, 0.0000e+00],
          [8.4696e-01, 1.3002e-01, 1.9959e-02, 3.0638e-03, 0.0000e+00],
          [2.5000e-01, 2.5000e-01, 2.5000e-01, 2.5000e-01, 0.0000e+00]],

         [[1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
          [9.8688e-01, 1.3124e-02, 0.0000e+00, 0.0000e+00, 0.0000e+00],
          [9.9847e-01, 1.5313e-03, 2.3484e-06, 0.0000e+00, 0.0000e+00],
          [9.9982e-01, 1.7683e-04, 3.1273e-08, 5.5308e-12, 0

In [506]:
attention_output.shape

torch.Size([2, 5, 6])

In [507]:
attention_output

tensor([[[ 3.2964,  0.9495,  1.0972,  2.3156,  0.3508, -1.7565],
         [ 3.7375,  1.0766,  1.4060,  2.9675,  0.3554, -1.7796],
         [ 3.5073,  1.0102,  1.4044,  2.9640,  0.3514, -1.7592],
         [ 3.3770,  0.9727,  1.2937,  2.7304,  0.3509, -1.7568],
         [ 8.2410,  2.3737,  2.7429,  5.7890,  0.8770, -4.3913]],

        [[ 3.2964,  0.9495,  1.0972,  2.3156,  0.3508, -1.7565],
         [ 3.7375,  1.0766,  1.4060,  2.9675,  0.3554, -1.7796],
         [ 3.5073,  1.0102,  1.4044,  2.9640,  0.3514, -1.7592],
         [ 6.5928,  1.8990,  2.1943,  4.6312,  0.7016, -3.5130],
         [ 6.5928,  1.8990,  2.1943,  4.6312,  0.7016, -3.5130]]])

In [9]:
import torch

# Example usage
input_dim = 5
seq_len = 5
num_heads = 8
batch_size = 2

# Create example input tensor (batch_size, seq_len, input_dim)
input_tensor = torch.tensor([
    [1, 2, 3, 4, 0],  # First sentence with padding
    [1, 2, 3, 0, 0]   # Second sentence with more padding
])

# Create causal mask tensor (batch_size, seq_len, seq_len)
causal_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).unsqueeze(0)  # Upper triangular matrix
causal_mask = causal_mask == 1

# Create padding mask tensor (batch_size, seq_len)
padding_mask = (input_tensor == 0).unsqueeze(1)  # Padding tokens are zeros

# Combine the masks (padding_mask will have zeros for padded tokens)
mask = causal_mask + padding_mask.unsqueeze(-1)

In [10]:
input_tensor

tensor([[1, 2, 3, 4, 0],
        [1, 2, 3, 0, 0]])

In [11]:
causal_mask


tensor([[[False,  True,  True,  True,  True],
         [False, False,  True,  True,  True],
         [False, False, False,  True,  True],
         [False, False, False, False,  True],
         [False, False, False, False, False]]])

In [12]:
padding_mask

tensor([[[False, False, False, False,  True]],

        [[False, False, False,  True,  True]]])

In [17]:
mask

tensor([[[[False,  True,  True,  True,  True],
          [False, False,  True,  True,  True],
          [False, False, False,  True,  True],
          [False, False, False, False,  True],
          [ True,  True,  True,  True,  True]]],


        [[[False,  True,  True,  True,  True],
          [False, False,  True,  True,  True],
          [False, False, False,  True,  True],
          [ True,  True,  True,  True,  True],
          [ True,  True,  True,  True,  True]]]])

In [7]:
torch.where(mask.squeeze(1), torch.tensor(0), torch.tensor(1))

tensor([[[1, 0, 0, 0, 0],
         [1, 1, 0, 0, 0],
         [1, 1, 1, 0, 0],
         [1, 1, 1, 1, 0],
         [0, 0, 0, 0, 0]],

        [[1, 0, 0, 0, 0],
         [1, 1, 0, 0, 0],
         [1, 1, 1, 0, 0],
         [0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0]]])

In [8]:
torch.tril(torch.ones(seq_len, seq_len)).unsqueeze(0).unsqueeze(0)

tensor([[[[1., 0., 0., 0., 0.],
          [1., 1., 0., 0., 0.],
          [1., 1., 1., 0., 0.],
          [1., 1., 1., 1., 0.],
          [1., 1., 1., 1., 1.]]]])