In [None]:

import torch
import torch.nn as nn
import math


class InputEmbeddings(nn.Module):
    """Token-embedding lookup whose output is scaled by sqrt(d_model).

    Args:
        d_model: Size of each embedding vector.
        vocab_size: Number of distinct token ids (rows of the lookup table).
        padding_idx: Token id forwarded to ``nn.Embedding`` as ``padding_idx``.
    """

    def __init__(self, d_model, vocab_size, padding_idx=0):
        super().__init__()
        # nn.Embedding expects (num_embeddings, embedding_dim): the vocabulary
        # size comes first, the vector width second.
        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=padding_idx)

    def forward(self, x):
        """Return the embeddings of token ids ``x`` multiplied by sqrt(d_model)."""
        return self.embedding(x) * math.sqrt(self.embedding.embedding_dim)
0

Note: you may need to restart the kernel to use updated packages.




0

In [17]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to the input, then applies dropout.

    Args:
        d_model: Width of the vectors the encoding is added to.
        seq_len: Number of positions precomputed in the encoding table.
        dropout: Dropout probability applied after the addition.
    """

    def __init__(self, d_model, seq_len, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        # Create a matrix of shape (seq_len, d_model) to hold the positional encodings
        pe = torch.zeros(seq_len, d_model)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        # 1 / 10000^(2i / d_model), evaluated in log space for ease of computation
        div_term = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float) * -(math.log(10000.0) / d_model)
        )
        angles = position * div_term  # (seq_len, ceil(d_model / 2))
        # Apply the sine function to even indices in the array; 2i
        pe[:, 0::2] = torch.sin(angles)
        # Apply the cosine function to odd indices in the array; 2i+1.
        # With an odd d_model there is one fewer odd column than even columns,
        # so the angle table is trimmed to match.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        pe = pe.unsqueeze(0)  # Shape (1, seq_len, d_model)
        # Registered as a buffer: it follows the module across devices and is
        # saved in the state dict, but it is not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the first ``x.size(1)`` position encodings to ``x`` and apply dropout."""
        # Buffers never require grad, so no explicit requires_grad_(False) is needed.
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)

In [18]:
class LayerNorm(nn.Module):
    """Normalizes over the last dimension with one learnable scale and one shift.

    Args:
        eps: Constant added to the standard deviation to avoid division by zero.
    """

    def __init__(self, eps=10e-6):
        super().__init__()
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(1))  # multiplicative gain
        self.bias = nn.Parameter(torch.zeros(1))  # additive offset

    def forward(self, x):
        """Return ``alpha * (x - mean) / (std + eps) + bias`` along the last axis."""
        centered = x - x.mean(dim=-1, keepdim=True)
        # x.std is called with its default settings, as in the formula above.
        spread = x.std(dim=-1, keepdim=True) + self.eps
        scaled = self.alpha * centered
        return scaled / spread + self.bias
        

In [19]:
class FeedForward(nn.Module):
    """Two-layer MLP on the last dimension: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Input and output feature width.
        d_ff: Hidden feature width between the two linear layers.
        dropout: Dropout probability applied to the hidden activations.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)  # W1 and b1
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)  # W2 and b2
        self.relu = nn.ReLU()

    def forward(self, x):
        """Expand to ``d_ff``, apply ReLU and dropout, then project back to ``d_model``."""
        hidden = self.linear1(x)
        hidden = self.relu(hidden)
        hidden = self.dropout(hidden)
        return self.linear2(hidden)
    

In [20]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention with learned Q/K/V/output projections.

    Args:
        d_model: Feature width of the inputs and of the output.
        h: Number of attention heads; must divide ``d_model``.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, d_model, h, dropout):
        super().__init__()
        self.d_model = d_model
        self.h = h
        assert d_model % h == 0, "d_model must be divisible by h"

        self.d_k = d_model // h  # per-head feature width
        self.w_q = nn.Linear(d_model, d_model)  # Wq
        self.w_k = nn.Linear(d_model, d_model)  # Wk
        self.w_v = nn.Linear(d_model, d_model)  # Wv

        self.w_o = nn.Linear(d_model, d_model)  # Wo
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask=None, dropout=None):
        """Scaled dot-product attention.

        Args:
            query, key, value: Tensors whose last dimension is the per-head width.
            mask: Optional tensor broadcastable to the score matrix; positions
                where ``mask == 0`` are excluded. Every query should keep at
                least one visible key, otherwise its softmax row is NaN.
            dropout: Optional callable applied to the attention weights.

        Returns:
            ``(weights @ value, weights)`` where ``weights`` is the softmax of
            the scaled scores (after dropout, if given).
        """
        d_k = query.shape[-1]
        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            attention_scores = attention_scores.masked_fill(mask == 0, float('-inf'))
        # Softmax is always applied; dropout on the weights is the optional part.
        attention_weights = torch.softmax(attention_scores, dim=-1)
        if dropout is not None:
            attention_weights = dropout(attention_weights)

        # (batch_size, h, seq_len, d_k), (batch_size, h, seq_len, seq_len)
        return (attention_weights @ value), attention_weights

    def forward(self, query, key, value, mask=None):
        """Project the inputs, attend per head, merge the heads and apply ``w_o``."""
        query = self.w_q(query)  # (batch_size, seq_len, d_model)
        key = self.w_k(key)      # (batch_size, seq_len, d_model)
        value = self.w_v(value)  # (batch_size, seq_len, d_model)

        # (batch_size, seq_len, d_model) --> (batch_size, h, seq_len, d_k)
        query = query.view(query.size(0), -1, self.h, self.d_k).transpose(1, 2)
        key = key.view(key.size(0), -1, self.h, self.d_k).transpose(1, 2)
        value = value.view(value.size(0), -1, self.h, self.d_k).transpose(1, 2)

        x, _ = self.attention(query, key, value, mask, self.dropout)

        # (batch_size, h, seq_len, d_k) --> (batch_size, seq_len, d_model)
        x = x.transpose(1, 2).contiguous().view(x.size(0), -1, self.h * self.d_k)

        return self.w_o(x)  # (batch_size, seq_len, d_model)

        

In [21]:
class ResidualConnection(nn.Module):
    """Residual wrapper computing ``x + dropout(sublayer(norm(x)))``.

    Args:
        dropout: Dropout probability applied to the sublayer output.
    """

    def __init__(self, dropout):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNorm()

    def forward(self, x, sublayer):
        """Normalize ``x``, run ``sublayer`` on it, apply dropout and add ``x`` back."""
        normalized = self.norm(x)
        branch = self.dropout(sublayer(normalized))
        return x + branch

In [22]:
class Encoder_Block(nn.Module):
    """One encoder layer: self-attention then feed-forward, each in a residual connection.

    Args:
        self_attention_block: Constructed attention module, called as
            ``block(q, k, v, mask)``.
        feed_forward_block: Constructed feed-forward module, called as ``block(x)``.
        dropout: Dropout probability used by both residual connections.

    Raises:
        TypeError: If a class is passed where a constructed module is required.
            The signature defaults are classes and are placeholders only.
    """

    def __init__(self, self_attention_block=MultiHeadAttention, feed_forward_block=FeedForward, dropout=0.1):
        super().__init__()
        # A class here would only fail later, inside forward, with a confusing
        # constructor error; reject it immediately instead.
        for arg_name, block in (
            ("self_attention_block", self_attention_block),
            ("feed_forward_block", feed_forward_block),
        ):
            if isinstance(block, type):
                raise TypeError(
                    f"{arg_name} must be a constructed module instance, "
                    f"got the class {block.__name__}"
                )
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(dropout) for _ in range(2)])

    def forward(self, x, src_mask):
        """Apply self-attention (masked by ``src_mask``) and then the feed-forward block."""
        x = self.residual_connections[0](
            x, lambda normed: self.self_attention_block(normed, normed, normed, src_mask)
        )
        x = self.residual_connections[1](x, self.feed_forward_block)
        return x

In [23]:
class Encoder(nn.Module):
    """Stack of encoder layers followed by a final layer normalization.

    Args:
        layers: Modules applied in order, each called as ``layer(x, src_mask)``.
    """

    def __init__(self, layers: nn.ModuleList):
        super().__init__()
        self.layers = layers
        self.norm = LayerNorm()

    # forward must be a method of the class (indented), otherwise calling the
    # module falls through to nn.Module's unimplemented forward.
    def forward(self, x, src_mask):
        """Run ``x`` through every layer with ``src_mask`` and normalize the result."""
        for layer in self.layers:
            x = layer(x, src_mask)
        return self.norm(x)

In [24]:
class Decoder_Block(nn.Module):
    """One decoder layer: self-attention, cross-attention, then feed-forward.

    Each of the three sublayers is wrapped in its own residual connection.

    Args:
        self_attention_block: Constructed attention module for the target sequence.
        cross_attention_block: Constructed attention module whose keys and values
            come from the encoder output.
        feed_forward_block: Constructed feed-forward module, called as ``block(x)``.
        dropout: Dropout probability used by the residual connections.

    Raises:
        TypeError: If a class is passed where a constructed module is required.
            The signature defaults are classes and are placeholders only.
    """

    def __init__(self,self_attention_block=MultiHeadAttention,cross_attention_block=MultiHeadAttention, feed_forward_block=FeedForward, dropout=0.1):
        super().__init__()
        # A class here would only fail later, inside forward; reject it immediately.
        for arg_name, block in (
            ("self_attention_block", self_attention_block),
            ("cross_attention_block", cross_attention_block),
            ("feed_forward_block", feed_forward_block),
        ):
            if isinstance(block, type):
                raise TypeError(
                    f"{arg_name} must be a constructed module instance, "
                    f"got the class {block.__name__}"
                )
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(dropout) for _ in range(3)])

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Return ``x`` after self-attention, cross-attention and feed-forward."""
        # Self-attention over the target sequence, restricted by tgt_mask.
        x = self.residual_connections[0](
            x, lambda normed: self.self_attention_block(normed, normed, normed, tgt_mask)
        )
        # Cross-attention: queries from the decoder, keys/values from the encoder.
        x = self.residual_connections[1](
            x, lambda normed: self.cross_attention_block(normed, encoder_output, encoder_output, src_mask)
        )
        # Third sublayer: feed-forward, then hand the result back to the caller.
        x = self.residual_connections[2](x, self.feed_forward_block)
        return x

In [25]:
class Decoder(nn.Module):
    """Stack of decoder layers followed by a final layer normalization.

    Args:
        layers: Modules applied in order, each called as
            ``layer(x, encoder_output, src_mask, tgt_mask)``.
    """

    # Annotate with the type itself; ``nn.ModuleList()`` would build a throwaway
    # module instance every time the class body is evaluated.
    def __init__(self, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNorm()

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Run ``x`` through every layer and normalize the result."""
        for layer in self.layers:
            x = layer(x, encoder_output, src_mask, tgt_mask)
        return self.norm(x)

In [26]:
class ProjectionLayer(nn.Module):
    """Maps model features to log-probabilities over the vocabulary.

    Args:
        d_model: Width of the incoming feature vectors.
        vocab_size: Number of output classes.
    """

    def __init__(self, d_model, vocab_size):
        super().__init__()
        self.linear = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Apply the linear layer and a log-softmax over the last dimension."""
        # Batch, seq_len, d_model --> Batch, seq_len, vocab_size
        # The layer is stored as ``self.linear``; there is no ``self.proj``.
        return torch.log_softmax(self.linear(x), dim=-1)

In [27]:
class Transformer(nn.Module):
    """Encoder-decoder model assembled from prebuilt components.

    Args:
        encoder: Module called as ``encoder(src, src_mask)``.
        decoder: Module called as ``decoder(tgt, encoder_output, src_mask, tgt_mask)``.
        src_embed: Embedding applied to source token ids.
        tgt_embed: Embedding applied to target token ids.
        src_pos: Positional encoding applied to the embedded source.
        tgt_pos: Positional encoding applied to the embedded target.
        projection_layer: Module applied by :meth:`project`.
    """

    def __init__(self, encoder: Encoder, decoder: Decoder, src_embed: InputEmbeddings, tgt_embed: InputEmbeddings, src_pos: PositionalEncoding, tgt_pos: PositionalEncoding, projection_layer: ProjectionLayer):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer

    def encode(self, src, src_mask):
        """Embed ``src``, add positional encodings and run the encoder."""
        # The embedding module must be *called* on the token ids.
        src = self.src_embed(src)
        src = self.src_pos(src)
        return self.encoder(src, src_mask)

    def decode(self, tgt, encoder_output, src_mask, tgt_mask):
        """Embed ``tgt``, add positional encodings and run the decoder."""
        tgt = self.tgt_embed(tgt)
        tgt = self.tgt_pos(tgt)
        return self.decoder(tgt, encoder_output, src_mask, tgt_mask)

    def project(self, x):
        """Apply the projection layer to ``x``."""
        return self.projection_layer(x)
    

        

In [None]:
def Build(src_vocab_size, tgt_vocab_size, src_seq_len, tgt_seq_len, d_model=512, d_ff=2048, h=8, N=6, dropout=0.1, padding_idx=0):
    """Assemble a Transformer and initialize its weights.

    Args:
        src_vocab_size: Vocabulary size for the source embedding.
        tgt_vocab_size: Vocabulary size for the target embedding and projection.
        src_seq_len: Number of positions precomputed for the source.
        tgt_seq_len: Number of positions precomputed for the target.
        d_model: Feature width used throughout the model.
        d_ff: Hidden width of each feed-forward block.
        h: Number of attention heads.
        N: Number of encoder blocks and of decoder blocks.
        dropout: Dropout probability passed to every component.
        padding_idx: Padding token id passed to both embeddings.

    Returns:
        The constructed ``Transformer``.
    """
    # Create the embedding and positional encoding layers for both source and target
    src_embed = InputEmbeddings(d_model, src_vocab_size, padding_idx)
    tgt_embed = InputEmbeddings(d_model, tgt_vocab_size, padding_idx)

    src_pos = PositionalEncoding(d_model, src_seq_len, dropout)
    tgt_pos = PositionalEncoding(d_model, tgt_seq_len, dropout)

    # Create the encoder and decoder blocks (each block gets its own sub-modules)
    encoder_blocks = [
        Encoder_Block(
            MultiHeadAttention(d_model, h, dropout),
            FeedForward(d_model, d_ff, dropout),
            dropout,
        )
        for _ in range(N)
    ]
    decoder_blocks = [
        Decoder_Block(
            MultiHeadAttention(d_model, h, dropout),  # self-attention
            MultiHeadAttention(d_model, h, dropout),  # cross-attention
            FeedForward(d_model, d_ff, dropout),
            dropout,
        )
        for _ in range(N)
    ]

    # Create Encoder and Decoder
    encoder = Encoder(nn.ModuleList(encoder_blocks))
    decoder = Decoder(nn.ModuleList(decoder_blocks))

    # Create the projection layer
    projection_layer = ProjectionLayer(d_model, tgt_vocab_size)

    transformer = Transformer(encoder, decoder, src_embed, tgt_embed, src_pos, tgt_pos, projection_layer)

    # Xavier-initialize every parameter with more than one dimension
    for p in transformer.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    # The loop above also overwrites the padding row that nn.Embedding zeroed
    # at construction; restore it so the padding token maps to a zero vector.
    with torch.no_grad():
        for module in transformer.modules():
            if isinstance(module, nn.Embedding) and module.padding_idx is not None:
                module.weight[module.padding_idx].zero_()

    return transformer


In [2]:
pip install tensorboard

Collecting tensorboard
  Downloading tensorboard-2.20.0-py3-none-any.whl.metadata (1.8 kB)
Collecting absl-py>=0.4 (from tensorboard)
  Downloading absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting markdown>=2.6.8 (from tensorboard)
  Downloading markdown-3.9-py3-none-any.whl.metadata (5.1 kB)
Collecting tensorboard-data-server<0.8.0,>=0.7.0 (from tensorboard)
  Downloading tensorboard_data_server-0.7.2-py3-none-any.whl.metadata (1.1 kB)
Collecting werkzeug>=1.0.1 (from tensorboard)
  Downloading werkzeug-3.1.3-py3-none-any.whl.metadata (3.7 kB)
Downloading tensorboard-2.20.0-py3-none-any.whl (5.5 MB)
   ---------------------------------------- 0.0/5.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/5.5 MB ? eta -:--:--
   ----- ---------------------------------- 0.8/5.5 MB 3.4 MB/s eta 0:00:02
   ----------- ---------------------------- 1.6/5.5 MB 4.0 MB/s eta 0:00:01
   -------------------- ------------------- 2.9/5.5 MB 4.5 MB/s eta 0:00:01
   ------------

