# Transformer (Attention is all you need)

## Key Innovations
- Self-Attention Mechanism: Introduced the scaled dot-product attention mechanism to capture relationships between all words in a sequence, regardless of distance. This mechanism computes attention weights for all pairs of words at once, enabling parallelization.

- Multi-Head Attention: Extended self-attention by using multiple attention heads, allowing the model to learn different aspects of the input data simultaneously.

- Positional Encoding: Added positional information to the input embeddings, allowing the model to handle sequences without recurrence or convolution.

- Fully Feed-Forward Architecture: Removed RNNs and convolutions, replacing them with attention mechanisms and feed-forward networks, resulting in better parallelization and faster training.

- Encoder-Decoder Architecture: The Transformer uses separate encoder and decoder stacks for tasks like machine translation and text generation.

- Scalability: Showed excellent scalability with respect to dataset size, making it ideal for large-scale natural language processing.


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import math

### Scaled dot product
This mechanism computes the relevance of each token in a sequence with every other token using the dot product between the query and key vectors.

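The result is divided by the square root of d_k (the dimension of the key vectors) to keep the scores from growing too large, which would otherwise saturate the softmax and cause gradients to vanish or explode.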

In [2]:
# Scaled Dot product attention
class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention: ``softmax(Q K^T / sqrt(d_k)) V``.

    Args:
        d_k: Divisor dimension; raw scores are divided by ``sqrt(d_k)``.
    """

    def __init__(self, d_k):
        super().__init__()
        self.d_k = d_k

    def forward(self, query, key, value, mask=None):
        """Compute attention-weighted values.

        Args:
            query: Tensor whose last two dimensions are ``(query_len, d_k)``.
            key: Tensor whose last two dimensions are ``(key_len, d_k)``.
            value: Tensor whose last two dimensions are ``(key_len, d_v)``.
            mask: Optional tensor broadcastable to the score tensor
                ``(..., query_len, key_len)``. Positions where ``mask == 0``
                are excluded from attention.

        Returns:
            A ``(context, attention)`` tuple: the weighted sum of ``value``
            and the softmax-normalised attention weights.
        """
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            # Fill with the most negative finite value of the score dtype.
            # A hard-coded -1e9 is not representable in float16 and makes
            # masked_fill raise under half precision.
            fill_value = torch.finfo(scores.dtype).min
            scores = scores.masked_fill(mask == 0, fill_value)
        attention = torch.softmax(scores, dim=-1)
        context = torch.matmul(attention, value)
        return context, attention

### Multi Head Attention
Extends the single attention mechanism by splitting the input into multiple heads.

Each head independently computes attention, allowing the model to focus on different aspects of the data simultaneously.

In [3]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention built on scaled dot-product attention.

    The inputs are linearly projected, split into ``num_heads`` heads of size
    ``d_model // num_heads``, attended independently, concatenated, and passed
    through a final linear layer.

    Args:
        d_model: Feature size of the inputs and of the output.
        num_heads: Number of attention heads; must divide ``d_model``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.d_k = d_model // num_heads
        self.num_heads = num_heads

        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.fc = nn.Linear(d_model, d_model)
        # Parameter-free helper: build it once here instead of on every
        # forward call. It adds nothing to the state_dict.
        self.attention = ScaledDotProductAttention(self.d_k)

    def _split_heads(self, projected, batch_size):
        """Reshape ``(batch, seq, d_model)`` to ``(batch, num_heads, seq, d_k)``."""
        return projected.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)

    def forward(self, query, key, value, mask=None):
        """Apply multi-head attention.

        Args:
            query: Tensor of shape ``(batch, query_len, d_model)``.
            key: Tensor of shape ``(batch, key_len, d_model)``.
            value: Tensor of shape ``(batch, key_len, d_model)``.
            mask: Optional tensor broadcastable to
                ``(batch, num_heads, query_len, key_len)``; positions where it
                equals 0 are masked out.

        Returns:
            Tensor of shape ``(batch, query_len, d_model)``.
        """
        batch_size = query.size(0)
        query = self._split_heads(self.query(query), batch_size)
        key = self._split_heads(self.key(key), batch_size)
        value = self._split_heads(self.value(value), batch_size)

        # The attention weights are not needed by callers of this module.
        output, _ = self.attention(query, key, value, mask)
        # Merge the heads back: (batch, heads, seq, d_k) -> (batch, seq, d_model).
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_k)
        return self.fc(output)

### Positional Encoding
Since transformers do not use recurrence or convolution, they have no built-in notion of token order. Positional encoding explicitly adds positional information to the embeddings using sine and cosine functions.

In [4]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings.

    Even feature indices hold ``sin(pos / 10000^(i / d_model))`` and odd
    indices hold the matching cosine.

    Args:
        d_model: Feature size of the embeddings (odd sizes are supported).
        max_len: Number of positions precomputed in the table.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer cosine column than sine column,
        # so trim the angles to avoid a shape mismatch.
        table[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Register as a buffer so the table follows module.to()/cuda().
        # persistent=False keeps it out of the state_dict, so existing
        # checkpoints still load.
        self.register_buffer("encoding", table.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Return ``x`` plus the encoding for its first ``x.size(1)`` positions.

        Args:
            x: Tensor of shape ``(batch, seq_len, d_model)``.
        """
        # .to() is a no-op once the module lives on x's device; it is kept so
        # a CPU-resident module still accepts inputs from another device.
        return x + self.encoding[:, :x.size(1)].to(x.device)

In [5]:
class TransformerBlock(nn.Module):
    """One encoder layer: self-attention then a feed-forward network.

    Each sub-layer is wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``.

    Args:
        d_model: Feature size of the input and output.
        num_heads: Number of attention heads.
        d_ff: Hidden size of the feed-forward network.
        dropout: Dropout probability applied to each sub-layer output.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        # Self-attention sub-layer and its normalisation.
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        # Position-wise feed-forward sub-layer and its normalisation.
        expand = nn.Linear(d_model, d_ff)
        activation = nn.ReLU()
        project = nn.Linear(d_ff, d_model)
        self.ff = nn.Sequential(expand, activation, project)
        self.norm2 = nn.LayerNorm(d_model)
        # A single dropout module is shared by both residual branches.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run the block on ``x``; ``mask`` is forwarded to the attention layer."""
        attended = self.dropout(self.attention(x, x, x, mask))
        hidden = self.norm1(x + attended)
        transformed = self.dropout(self.ff(hidden))
        return self.norm2(hidden + transformed)


In [None]:
class Transformer(nn.Module):
    """Stack of transformer blocks over token embeddings with a linear head.

    Args:
        input_dim: Size of the input vocabulary (number of embedding rows).
        output_dim: Number of output features produced per position.
        d_model: Embedding / hidden feature size.
        num_heads: Attention heads per block.
        d_ff: Hidden size of each block's feed-forward network.
        num_layers: Number of stacked blocks.
        max_len: Number of positions covered by the positional encoding.
    """

    def __init__(self, input_dim, output_dim, d_model, num_heads, d_ff, num_layers, max_len=100):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_len)
        blocks = []
        for _ in range(num_layers):
            blocks.append(TransformerBlock(d_model, num_heads, d_ff))
        self.layers = nn.ModuleList(blocks)
        self.fc = nn.Linear(d_model, output_dim)

    def forward(self, x, mask=None):
        """Map token ids ``x`` to per-position outputs of size ``output_dim``.

        ``mask`` is passed unchanged to every block.
        """
        hidden = self.positional_encoding(self.embedding(x))
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.fc(hidden)

In [None]:
def train_transformer(device=None):
    """Train a small Transformer on random token data and print per-epoch loss.

    The data is purely synthetic: inputs and targets are independent random
    token ids. The run exercises the training pipeline and learns nothing
    meaningful.

    Args:
        device: Device to train on. Defaults to CUDA when available and falls
            back to CPU, so the function also runs on machines without a GPU.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Hyperparameters
    input_dim = 5000  # Vocabulary size
    output_dim = 5000
    d_model = 128
    num_heads = 8
    d_ff = 512
    num_layers = 4
    max_len = 100
    batch_size = 64
    epochs = 10

    # Synthetic dataset: random source and target token ids of length max_len.
    train_data = torch.randint(0, input_dim, (1000, max_len))
    train_labels = torch.randint(0, output_dim, (1000, max_len))

    # DataLoader
    dataset = torch.utils.data.TensorDataset(train_data, train_labels)
    train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)

    # Model. max_len is passed explicitly so the positional table always
    # covers the sequence length used above.
    model = Transformer(
        input_dim, output_dim, d_model, num_heads, d_ff, num_layers, max_len
    ).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Training Loop
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            outputs = model(x)
            # Flatten (batch, seq, vocab) -> (batch * seq, vocab) for the loss.
            loss = criterion(outputs.reshape(-1, output_dim), y.reshape(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")

# Train Transformer: run the training loop; the mean loss is printed after each epoch.
train_transformer()