<a href="https://colab.research.google.com/github/aditijoshi613/Attention-Is-All-You-Need/blob/main/Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
dutch = 'Dies führt dazu, dass ein Spieler wie ich, die Stirn bieten muss und sein Bestes geben will.'
english = 'Which is what makes a player like me want to face up and give my best.'

In [None]:
# implement a BytePairEncoding Tokenizer

In [None]:
# use a normal tokenizer for now
input = english.split(' ')
output = dutch.split(' ')

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Download required dataset
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
tokenized_input = word_tokenize(english)
tokenized_output = word_tokenize(dutch)
tokenized_input, tokenized_output

(['Which',
  'is',
  'what',
  'makes',
  'a',
  'player',
  'like',
  'me',
  'want',
  'to',
  'face',
  'up',
  'and',
  'give',
  'my',
  'best',
  '.'],
 ['Dies',
  'führt',
  'dazu',
  ',',
  'dass',
  'ein',
  'Spieler',
  'wie',
  'ich',
  ',',
  'die',
  'Stirn',
  'bieten',
  'muss',
  'und',
  'sein',
  'Bestes',
  'geben',
  'will',
  '.'])

In [None]:
!pip install --upgrade torch

Collecting torch
  Downloading torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
torch.__version__

'2.6.0+cu124'

In [None]:
input_tensor = torch.randn(10, requires_grad=True)
output_tensor = torch.randn(10, requires_grad=True)

In [None]:
input_tensor

tensor([ 1.4752,  0.4416,  1.4806, -1.9011, -0.3604, -0.6952,  0.1120, -0.3365,
        -1.0820,  1.6302], requires_grad=True)

In [None]:
device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
print(f"Using {device} device")

Using cpu device


In [None]:
def multihead_attention(query, key, value, embedding_dim, linear_1, linear_2, mask=False, num_heads=8):
  heads = torch.zeros(value.size(dim=0), embedding_dim/num_heads)
  softmax = nn.Softmax(dim=1)
  # linear transformation
  query = linear_1(query)
  key = linear_1(key)
  value = linear_1(value)
  mat = torch.matmul(query, torch.transpose(key)/torch.sqrt(embedding_dim))
  # masked multi-head attention
  if mask:
    mask = torch.tril(torch.ones_like(mat), diagonal=0).bool()
    mat = mat.masked_fill(mask, float('-inf'))
  heads = softmax(mat)
  heads = torch.matmul(heads, value)
  attention = linear_2(torch.concatenate(heads, dim=1))
  return attention

In [None]:

class Encoder(nn.Module):
  def __init__(self, input_dim, num_heads=8, embedding_dim=512, feed_forward_dim=2048) -> None:
    super(Encoder, self).__init__()
    self.embedding_dim = embedding_dim
    self.num_heads = num_heads
    self.embedding = nn.Embedding(input_dim, embedding_dim)
    self.positional_encoding = torch.zeros(input_dim, embedding_dim)
    numerator = torch.arange(0, input_dim).unsqueeze(1)
    denominator = torch.arange(0, embedding_dim, 2)/embedding_dim
    denominator = torch.pow(10000, denominator)
    self.positional_encoding[:, 0::2] = torch.sin(numerator/denominator)
    self.positional_encoding[:, 1::2] = torch.cos(numerator/denominator)
    self.layer_norm = torch.nn.LayerNorm(self.embedding_dim)
    self.feed_forward_1 = nn.Linear(in_features=self.embedding_dim, out_features=feed_forward_dim)
    self.feed_forward_2 = nn.Linear(in_features=feed_forward_dim, out_features=embedding_dim)
    self.activation = nn.ReLU()
    # linear transformations for multi-head attention
    self.linear_1 = nn.Linear(embedding_dim, int(embedding_dim/self.num_heads))
    self.linear_1.bias.data.fill_(0)
    self.linear_2 = nn.Linear(in_features=embedding_dim, out_features=embedding_dim)
    self.linear_2.bias.data.fill_(0)

  def forward(self, input):
    # sub-layer 1
    # generate embedding for the input
    embedded = self.embedding(input)
    # concatenate with positional encoding
    embedded = embedded + self.positional_encoding
    # implement multihead attention
    attention_output = multihead_attention(embedded, embedded, embedded, self.embedding_dim, self.linear_1, self.linear_2, mask=False, num_heads=self.num_heads)
    # residual connection
    attention_output = attention_output + embedded
    # layer normalization
    normalized = self.layer_norm(attention_output)
    # sub-layer 2
    # point-wise feed forward network
    ff = self.feed_forward_1(normalized)
    ff = self.feed_forward_2(self.activation(ff))
    # residual connection
    ff_res = ff + normalized
    # layer normalization
    ff_normalized = self.layer_norm(ff_res)

    return ff_normalized

In [None]:
# input_dim -- input vocab size
# output_dim -- output vocab size

In [None]:
model = Encoder(10).to(device)
print(model)

Encoder(
  (embedding): Embedding(10, 512)
  (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (feed_forward_1): Linear(in_features=512, out_features=2048, bias=True)
  (feed_forward_2): Linear(in_features=2048, out_features=512, bias=True)
  (activation): ReLU()
  (linear_1): Linear(in_features=512, out_features=64, bias=True)
  (linear_2): Linear(in_features=512, out_features=512, bias=True)
)


In [None]:
print(len(list(model.parameters()))  )

11


In [None]:
for name, param in model.named_parameters():
    print(f"Layer: {name}, Shape: {param.shape}")

Layer: embedding.weight, Shape: torch.Size([10, 512])
Layer: layer_norm.weight, Shape: torch.Size([512])
Layer: layer_norm.bias, Shape: torch.Size([512])
Layer: feed_forward_1.weight, Shape: torch.Size([2048, 512])
Layer: feed_forward_1.bias, Shape: torch.Size([2048])
Layer: feed_forward_2.weight, Shape: torch.Size([512, 2048])
Layer: feed_forward_2.bias, Shape: torch.Size([512])
Layer: linear_1.weight, Shape: torch.Size([64, 512])
Layer: linear_1.bias, Shape: torch.Size([64])
Layer: linear_2.weight, Shape: torch.Size([512, 512])
Layer: linear_2.bias, Shape: torch.Size([512])


In [None]:

class Decoder(nn.Module):
  def __init__(self, output_dim, num_heads=8, embedding_dim=512, feed_forward_dim=2048) -> None:
    super(Decoder, self).__init__()
    self.embedding_dim = embedding_dim
    self.num_heads = num_heads
    self.embedding = nn.Embedding(output_dim, embedding_dim)
    self.positional_encoding = torch.zeros(output_dim, embedding_dim)
    numerator = torch.arange(0, output_dim).unsqueeze(1)
    denominator = torch.arange(0, embedding_dim, 2)/embedding_dim
    denominator = torch.pow(10000, denominator)
    self.positional_encoding[:, 0::2] = torch.sin(numerator/denominator)
    self.positional_encoding[:, 1::2] = torch.cos(numerator/denominator)
    self.layer_norm = torch.nn.LayerNorm(self.embedding_dim)
    self.feed_forward_1 = nn.Linear(in_features=self.embedding_dim, out_features=feed_forward_dim)
    self.feed_forward_2 = nn.Linear(in_features=feed_forward_dim, out_features=embedding_dim)
    self.activation = nn.ReLU()
    self.linear = nn.Linear(self.embedding_dim, output_dim)
    self.softmax = nn.Softmax(dim=1)
    # linear transformations for multi-head attention
    self.linear_1 = nn.Linear(embedding_dim, int(embedding_dim/self.num_heads))
    self.linear_1.bias.data.fill_(0)
    self.linear_2 = nn.Linear(in_features=embedding_dim, out_features=embedding_dim)
    self.linear_2.bias.data.fill_(0)
    self.linear_3 = nn.Linear(embedding_dim, int(embedding_dim/self.num_heads))
    self.linear_3.bias.data.fill_(0)
    self.linear_4 = nn.Linear(in_features=embedding_dim, out_features=embedding_dim)
    self.linear_4.bias.data.fill_(0)



  def forward(self, output, encoder_output):
    # sub-layer 1
    # generate embedding for the output
    embedded = self.embedding(output)
    # concatenate with positional encoding
    embedded = embedded + self.positional_encoding
    # implement multihead attention
    attention_output = multihead_attention(embedded, embedded, embedded, self.embedding_dim, self.linear_1, self.linear_2, mask=True, num_heads=self.num_heads)
    # residual connection
    attention_output = attention_output + embedded
    # layer normalization
    normalized = self.layer_norm(attention_output)
    # sub-layer 2
    # cross-attention
    cross_attention_output = multihead_attention(normalized, encoder_output, encoder_output, self.embedding_dim, self.linear_3, self.linear_4, mask=True, num_heads=self.num_heads)
    # residual connection
    cross_attention_output = cross_attention_output + normalized
    # layer normalization
    normalized_2 = self.layer_norm(cross_attention_output)
    # sublayer 3
    # point-wise feed forward network
    ff = self.feed_forward_1(normalized_2)
    ff = self.feed_forward_2(self.activation(ff))
    # residual connection
    ff_res = ff + normalized
    # layer normalization
    ff_normalized = self.layer_norm(ff_res)
    # linear layer
    output = self.linear(ff_normalized)
    # softmax layer
    output = self.softmax(output)
    return output


    return ff_normalized

In [None]:
model = Decoder(10).to(device)
print(model)

Decoder(
  (embedding): Embedding(10, 512)
  (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (feed_forward_1): Linear(in_features=512, out_features=2048, bias=True)
  (feed_forward_2): Linear(in_features=2048, out_features=512, bias=True)
  (activation): ReLU()
  (linear): Linear(in_features=512, out_features=10, bias=True)
  (softmax): Softmax(dim=1)
  (linear_1): Linear(in_features=512, out_features=64, bias=True)
  (linear_2): Linear(in_features=512, out_features=512, bias=True)
  (linear_3): Linear(in_features=512, out_features=64, bias=True)
  (linear_4): Linear(in_features=512, out_features=512, bias=True)
)


In [None]:
for name, param in model.named_parameters():
    print(f"Layer: {name}, Shape: {param.shape}")

Layer: embedding.weight, Shape: torch.Size([10, 512])
Layer: layer_norm.weight, Shape: torch.Size([512])
Layer: layer_norm.bias, Shape: torch.Size([512])
Layer: feed_forward_1.weight, Shape: torch.Size([2048, 512])
Layer: feed_forward_1.bias, Shape: torch.Size([2048])
Layer: feed_forward_2.weight, Shape: torch.Size([512, 2048])
Layer: feed_forward_2.bias, Shape: torch.Size([512])
Layer: linear.weight, Shape: torch.Size([10, 512])
Layer: linear.bias, Shape: torch.Size([10])
Layer: linear_1.weight, Shape: torch.Size([64, 512])
Layer: linear_1.bias, Shape: torch.Size([64])
Layer: linear_2.weight, Shape: torch.Size([512, 512])
Layer: linear_2.bias, Shape: torch.Size([512])
Layer: linear_3.weight, Shape: torch.Size([64, 512])
Layer: linear_3.bias, Shape: torch.Size([64])
Layer: linear_4.weight, Shape: torch.Size([512, 512])
Layer: linear_4.bias, Shape: torch.Size([512])


In [None]:
class ManualTransformer(nn.Module):
  def __init__(self, input_dim, output_dim, num_heads=8, embedding_dim=512,feed_forward_dim=2048) -> None:
    super(ManualTransformer, self).__init__()
    self.encoder = Encoder(input_dim, num_heads=num_heads, embedding_dim=embedding_dim,feed_forward_dim=feed_forward_dim)
    self.decoder = Decoder(output_dim=output_dim, num_heads=num_heads, embedding_dim=512, feed_forward_dim=2048)

  def forward(self, input, output):
    encoder_output = self.encoder(input)
    decoder_output = self.decoder(output, encoder_output)
    return decoder_output



In [None]:
model = ManualTransformer(input_dim=10, output_dim=10, num_heads=8, embedding_dim=512,feed_forward_dim=2048).to(device)
print(model)

ManualTransformer(
  (encoder): Encoder(
    (embedding): Embedding(10, 512)
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (feed_forward_1): Linear(in_features=512, out_features=2048, bias=True)
    (feed_forward_2): Linear(in_features=2048, out_features=512, bias=True)
    (activation): ReLU()
    (linear_1): Linear(in_features=512, out_features=64, bias=True)
    (linear_2): Linear(in_features=512, out_features=512, bias=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(10, 512)
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (feed_forward_1): Linear(in_features=512, out_features=2048, bias=True)
    (feed_forward_2): Linear(in_features=2048, out_features=512, bias=True)
    (activation): ReLU()
    (linear): Linear(in_features=512, out_features=10, bias=True)
    (softmax): Softmax(dim=1)
    (linear_1): Linear(in_features=512, out_features=64, bias=True)
    (linear_2): Linear(in_features=512, out_features=512, bias

In [None]:
for name, param in model.named_parameters():
    print(f"Layer: {name}, Shape: {param.shape}")

Layer: encoder.embedding.weight, Shape: torch.Size([10, 512])
Layer: encoder.layer_norm.weight, Shape: torch.Size([512])
Layer: encoder.layer_norm.bias, Shape: torch.Size([512])
Layer: encoder.feed_forward_1.weight, Shape: torch.Size([2048, 512])
Layer: encoder.feed_forward_1.bias, Shape: torch.Size([2048])
Layer: encoder.feed_forward_2.weight, Shape: torch.Size([512, 2048])
Layer: encoder.feed_forward_2.bias, Shape: torch.Size([512])
Layer: encoder.linear_1.weight, Shape: torch.Size([64, 512])
Layer: encoder.linear_1.bias, Shape: torch.Size([64])
Layer: encoder.linear_2.weight, Shape: torch.Size([512, 512])
Layer: encoder.linear_2.bias, Shape: torch.Size([512])
Layer: decoder.embedding.weight, Shape: torch.Size([10, 512])
Layer: decoder.layer_norm.weight, Shape: torch.Size([512])
Layer: decoder.layer_norm.bias, Shape: torch.Size([512])
Layer: decoder.feed_forward_1.weight, Shape: torch.Size([2048, 512])
Layer: decoder.feed_forward_1.bias, Shape: torch.Size([2048])
Layer: decoder.feed_

In [None]:
# git version control
# implement initialization
# combine encoder and decoder - done
# masking - cross-attention - done
# implement 6 layers
# implement parallelization
# batch the input - for parallelization
#  If implementing attention manually, use batch matrix multiplication (torch.bmm) or torch.einsum for efficient parallelization.
# .to('cuda')
# Matrix multiplications (torch.matmul) are executed in parallel.
#Uses a single linear layer (qkv_proj) to compute Q, K, V in parallel.
# Avoids looping over heads by using .reshape() and .unbind().
# Uses torch.matmul() for parallel attention computation

In [None]:
# output is right shifted

In [None]:
# Many layers inside a neural network are parameterized, i.e. have associated
# weights and biases that are optimized during training.
# Subclassing nn.Module automatically tracks all fields defined inside your
# model object, and makes all parameters accessible using your model’s
# parameters() or named_parameters() methods.