<a href="https://colab.research.google.com/github/ZERO-70/Transformer/blob/main/Transformers_exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Embedder for tokens

In [None]:
import torch
import math
import torch.nn as nn

class Embedder(nn.Module):
  """Token embedding layer whose output is scaled by sqrt(d_model).

  Args:
    vocab_size: number of rows in the embedding table.
    d_model: width of each embedding vector.
  """

  def __init__(self, vocab_size:int, d_model:int):
    # `super` has to be called; the bare `super.__init__()` raises TypeError.
    super().__init__()
    self.d_model = d_model
    self.vocab_size = vocab_size
    self.embedding = nn.Embedding(vocab_size, d_model)

  def forward(self, x):
    """Return the embeddings of the token ids in `x`, times sqrt(d_model)."""
    # d_model lives on the instance; the bare name is undefined in this scope.
    return self.embedding(x) * math.sqrt(self.d_model)


#Positional Encoder for Embeddings

In [None]:
class PositionalEncoder(nn.Module):
  """Adds fixed sinusoidal positional encodings to a batch of embeddings.

  The table is precomputed once for `max_seq_len` positions and stored as the
  non-trainable buffer `pe` of shape (1, max_seq_len, d_model): even feature
  columns hold sines, odd feature columns hold cosines.

  Args:
    d_model: width of each embedding vector.
    max_seq_len: largest sequence length the table covers.
  """

  def __init__(self,d_model,max_seq_len):
    # `super` has to be called; the bare `super.__init__()` raises TypeError.
    super().__init__()
    self.d_model = d_model
    self.max_seq_len = max_seq_len
    pe = torch.zeros(max_seq_len,d_model)
    positions = torch.arange(max_seq_len).float().unsqueeze(1)
    # One frequency per sin/cos column pair: 10000 ** (-2i / d_model).
    inv_freq = torch.exp(
        torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
    )
    angles = positions * inv_freq
    pe[:, 0::2] = torch.sin(angles)
    # For odd d_model there is one fewer odd column than even columns, so
    # trim the angles to avoid a shape mismatch on assignment.
    pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
    pe = pe.unsqueeze(0)
    # A buffer follows the module across devices and is saved in state_dict,
    # but is not a trainable parameter.
    self.register_buffer("pe", pe)

  def forward(self, x):
    """Return `x` plus the encodings of its first `x.size(1)` positions.

    Args:
      x: tensor of shape (batch, seq_len, d_model).

    Raises:
      ValueError: if seq_len is larger than `max_seq_len`.
    """
    seq_len = x.size(1)
    if seq_len > self.max_seq_len:
      raise ValueError(
          f"sequence length {seq_len} exceeds max_seq_len {self.max_seq_len}"
      )
    return x + self.pe[:, :seq_len]

In [None]:
# Scratch check: the integers 0..11 as a float column vector of shape (12, 1).
position = torch.arange(12, dtype=torch.float).reshape(-1, 1)
print(position)

tensor([[ 0.],
        [ 1.],
        [ 2.],
        [ 3.],
        [ 4.],
        [ 5.],
        [ 6.],
        [ 7.],
        [ 8.],
        [ 9.],
        [10.],
        [11.]])


#MultiHeadAttention

In [None]:
class MultiHeadAttention(nn.Module):
  """Multi-head scaled dot-product attention with bias-free projections.

  Args:
    d_model: width of the input and output features; must be divisible by
      `num_heads`.
    num_heads: number of attention heads; each head works on
      `d_model // num_heads` features.
  """

  def __init__(self,d_model,num_heads):
    super().__init__()
    assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
    self.d_model = d_model
    self.num_heads = num_heads
    self.head_dim = d_model // num_heads
    self.QueryLinear = nn.Linear(d_model, d_model, bias = False)
    self.KeyLinear = nn.Linear(d_model, d_model, bias = False)
    self.ValueLinear = nn.Linear(d_model, d_model, bias = False)
    self.FinalLinear = nn.Linear(d_model, d_model, bias = False)

  def split_input_into_heads(self,x,batch_size):
    """Reshape (batch, seq, d_model) into (batch, num_heads, seq, head_dim)."""
    seq_length = x.size(1)
    x = x.reshape(batch_size, seq_length, self.num_heads, self.head_dim)
    return x.permute(0, 2, 1, 3)

  def compute_attention(self,query,key,value,mask=None):
    """Return softmax(Q K^T / sqrt(head_dim)) V for per-head tensors.

    Args:
      query, key, value: tensors of shape (batch, num_heads, seq, head_dim).
      mask: optional tensor broadcastable to the score shape
        (batch, num_heads, query_len, key_len); positions where it equals 0
        are excluded from attention.
    """
    scores = torch.matmul(query,key.transpose(-2,-1))/(self.head_dim ** 0.5)
    if mask is not None:
      # Fill with the most negative finite value instead of -inf. Masked
      # positions still get zero weight after softmax, but a row whose keys
      # are all masked gives finite (uniform) weights rather than NaN, which
      # would otherwise spread through every later layer.
      scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)
    attention_weights = torch.softmax(scores,dim=-1)
    return torch.matmul(attention_weights,value)

  def combine_attention(self,x,batch_size):
    """Merge (batch, num_heads, seq, head_dim) back into (batch, seq, d_model)."""
    x = x.permute(0, 2, 1, 3).contiguous()
    # -1 lets reshape infer the sequence dimension
    x = x.reshape(batch_size, -1, self.d_model)
    return x

  def forward(self,query,key,value,mask = None):
    """Project the inputs, attend per head, and project the merged result.

    Args:
      query, key, value: tensors of shape (batch, seq, d_model). The batch
        size is read from `query` and reused for `key` and `value`.
      mask: optional mask forwarded to `compute_attention`.

    Returns:
      Tensor of shape (batch, query_len, d_model).
    """
    batch_size = query.size(0)
    query = self.split_input_into_heads(self.QueryLinear(query),batch_size)
    key = self.split_input_into_heads(self.KeyLinear(key),batch_size)
    value = self.split_input_into_heads(self.ValueLinear(value),batch_size)
    attention = self.compute_attention(query,key,value,mask)
    reordered_attention = self.combine_attention(attention,batch_size)
    return self.FinalLinear(reordered_attention)


#FeedForward

In [None]:
class FeedForward(nn.Module):
  """Two-layer MLP applied to the last dimension: Linear -> ReLU -> Linear.

  Features are expanded from `d_model` to `d_ff` and projected back to
  `d_model`.
  """

  def __init__(self,d_model,d_ff):
    super().__init__()
    self.linear1 = nn.Linear(in_features=d_model, out_features=d_ff)
    self.linear2 = nn.Linear(in_features=d_ff, out_features=d_model)
    self.relu = nn.ReLU()

  def forward(self,x):
    """Return linear2(relu(linear1(x)))."""
    hidden = self.linear1(x)
    activated = self.relu(hidden)
    return self.linear2(activated)

#Encoder

In [None]:
class Encoder(nn.Module):
  """One encoder layer: self-attention, then a feed-forward block.

  Each sub-layer's output goes through dropout, is added to the sub-layer's
  input, and the sum is layer-normalised.

  Args:
    d_model: feature width of the layer's input and output.
    num_heads: number of attention heads.
    d_ff: hidden width of the feed-forward block.
    drop_out: dropout probability used on both sub-layer outputs.
  """

  def __init__(self,d_model,num_heads,d_ff,drop_out):
    super().__init__()
    self.attention_block = MultiHeadAttention(d_model,num_heads)
    self.feed_block = FeedForward(d_model,d_ff)
    self.layer_norm1 = nn.LayerNorm(d_model)
    self.layer_norm2 = nn.LayerNorm(d_model)
    self.drop_out = nn.Dropout(drop_out)

  def forward(self,x,mask=None):
    """Run the layer on `x`.

    Args:
      x: input tensor; it serves as query, key and value of the attention.
      mask: optional attention mask passed through to the attention block.
        Defaults to None (no masking), matching the attention block's own
        default.
    """
    attention_output = self.attention_block(x,x,x,mask)
    x = self.layer_norm1(x + self.drop_out(attention_output))
    feed_output = self.feed_block(x)
    return self.layer_norm2(x + self.drop_out(feed_output))

#Wrapper for Multiple Encoders

In [None]:
class TransformerEncoder(nn.Module):
  """Stack of encoder layers on top of token and positional embeddings.

  Args:
    vocab_size: size of the token vocabulary.
    d_model: feature width used throughout the stack.
    num_layers: number of encoder layers.
    num_heads: number of attention heads per layer.
    d_ff: hidden width of each layer's feed-forward block.
    drop_out: dropout probability, used inside every layer and on the
      embedded input.
    max_seq_len: longest sequence the positional encoder is built for.
  """

  def __init__(self,vocab_size,d_model,num_layers,num_heads,d_ff,drop_out,max_seq_len):
    super().__init__()
    self.embedder = Embedder(vocab_size,d_model)
    self.positional_encoder = PositionalEncoder(d_model,max_seq_len)
    self.encoder_layers = nn.ModuleList([Encoder(d_model,num_heads,d_ff,drop_out) for _ in range(num_layers)])
    self.drop_out = nn.Dropout(drop_out)

  def forward(self,x,mask=None):
    """Encode a batch of token ids.

    Args:
      x: token ids to embed.
      mask: optional attention mask handed to every encoder layer; None
        means no masking.

    Returns:
      The output of the last encoder layer.
    """
    x = self.embedder(x)
    x = self.positional_encoder(x)
    # The dropout layer was constructed but never used; apply it to the sum of
    # token embeddings and positional encodings (a no-op in eval mode).
    x = self.drop_out(x)
    for layer in self.encoder_layers:
      x = layer(x,mask)
    return x

#Classifier

In [None]:
import torch.nn.functional as F
class ClassifierHead(nn.Module):
  """Linear projection to `num_classes` followed by log-softmax."""

  def __init__(self, d_model,num_classes):
    super().__init__()
    self.linear = nn.Linear(in_features=d_model, out_features=num_classes)

  def forward(self,x):
    """Return log-probabilities over the last dimension of linear(x)."""
    class_scores = self.linear(x)
    return class_scores.log_softmax(dim=-1)
