In [1]:
import torch
import torch.nn as nn
import math

In [118]:
class InputEmbedding(nn.Module):
    """Token-id to vector lookup, scaled by sqrt(d_model).

    Args:
        d_model: Size of each embedding vector.
        vocab_size: Number of distinct token ids the table holds.
    """

    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        """Embed integer token ids.

        Args:
            x: Integer tensor of token ids in ``[0, vocab_size)``.

        Returns:
            Tensor of shape ``x.shape + (d_model,)``.
        """
        # The Transformer paper multiplies the embedding weights by sqrt(d_model)
        # so they are not drowned out by the positional encodings (bounded in
        # [-1, 1]) that are added afterwards.
        return self.embedding(x) * math.sqrt(self.d_model)

In [119]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to the input, then applies dropout.

    The table is precomputed once for ``seq_len`` positions and stored as the
    non-trainable buffer ``pe`` of shape ``(1, seq_len, d_model)``.

    Args:
        d_model: Size of the feature (last) dimension.
        seq_len: Number of positions to precompute.
        dropout: Dropout probability applied after the encodings are added.
    """

    def __init__(self, d_model: int, seq_len: int, dropout: float):
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        # Positions as a (seq_len, 1) column so they broadcast against the
        # (ceil(d_model / 2),) row of frequencies; a flat (seq_len,) vector
        # would either fail to broadcast or multiply elementwise.
        positions = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float) * (-math.log(10000) / d_model)
        )
        angles = positions * div_term  # (seq_len, ceil(d_model / 2))

        # Positional encodings: sine on even feature indices, cosine on odd ones.
        positional_encoding = torch.zeros(seq_len, d_model)
        positional_encoding[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns.
        positional_encoding[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        positional_encoding = positional_encoding.unsqueeze(0)  # (1, seq_len, d_model)

        # Registered as a buffer: saved and moved with the module, but not a
        # parameter, so it is never trained.
        self.register_buffer('pe', positional_encoding)

    def forward(self, x):
        """Add the encodings for the first ``x.size(1)`` positions and apply dropout.

        Args:
            x: Tensor whose dimension 1 indexes positions (at most ``seq_len``)
                and whose last dimension is ``d_model``.

        Returns:
            Tensor of the same shape as ``x``.
        """
        # The buffer does not require grad, so no gradient flows into it.
        return self.dropout(x + self.pe[:, :x.size(1), :])


In [120]:
class LayerNormalization(nn.Module):
    """Normalizes the last dimension, then applies a learned scalar scale and shift.

    Args:
        eps: Small constant added to the standard deviation to avoid division
            by zero.
    """

    def __init__(self, eps=10**-6):
        super().__init__()
        self.eps = eps
        # Single-element learnable gain (multiplicative) and offset (additive),
        # broadcast over every feature.
        self.alpha = nn.Parameter(torch.ones(1))
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        """Return ``alpha * (x - mean) / (std + eps) + bias`` over the last dim.

        ``std`` is computed with ``torch.std``'s default estimator.
        """
        centered = x - x.mean(dim=-1, keepdim=True)
        spread = x.std(dim=-1, keepdim=True) + self.eps
        return self.alpha * centered / spread + self.bias


In [121]:
class FeedForwardBlock(nn.Module):
    """Two-layer position-wise feed-forward network.

    Computes ``ff2(dropout(relu(ff1(x))))``.

    Args:
        d_model: Input and output feature size.
        dim_ff: Hidden feature size between the two linear layers.
        dropout: Dropout probability applied to the hidden activations.
    """

    def __init__(self, d_model:int, dim_ff:int,dropout:float):
        super().__init__()
        # Expand to the hidden width, then project back to d_model.
        self.ff1 = nn.Linear(d_model, dim_ff)
        self.ff2 = nn.Linear(dim_ff, d_model)
        self.dropout = nn.Dropout(dropout)
        # Kept as a module attribute; forward() uses the functional torch.relu.
        self.relu = nn.ReLU()

    def forward(self,x):
        """Apply the feed-forward transform to the last dimension of ``x``."""
        hidden = self.dropout(torch.relu(self.ff1(x)))
        return self.ff2(hidden)

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    The input is projected to queries, keys and values, split into ``heads``
    heads of width ``d_k = d_model // heads``, attended per head, merged back
    and passed through an output projection.

    Args:
        d_model: Feature size of the input and output.
        heads: Number of attention heads; must divide ``d_model``.
        mask: Optional default mask used when ``forward`` is called without
            one. Positions where the mask equals ``True`` are blocked.
    """

    def __init__(self, d_model, heads:int=8, mask=None):
        super().__init__()
        # Learned projections producing queries, keys and values.
        self.query_weights = nn.Linear(d_model, d_model)
        self.key_weights = nn.Linear(d_model, d_model)
        self.value_weights = nn.Linear(d_model, d_model)
        self.heads = heads
        assert d_model % heads == 0, "d_model should be dividable by heads"
        self.d_k = d_model // self.heads
        self.mask = mask

        # Output projection applied after the heads are merged.
        self.output_weights = nn.Linear(d_model, d_model)

    @staticmethod
    def attention_score(queries, keys, values, mask, d_k):
        """Return the softmax-normalized attention weights.

        Args:
            queries: Tensor of shape ``(..., seq_q, d_k)``.
            keys: Tensor of shape ``(..., seq_k, d_k)``.
            values: Unused here; kept so existing callers keep working.
            mask: Optional tensor broadcastable to ``(..., seq_q, seq_k)``;
                positions equal to ``True`` receive (effectively) zero weight.
            d_k: Per-head feature size used to scale the dot products.

        Returns:
            Tensor of shape ``(..., seq_q, seq_k)`` whose last dim sums to 1.
        """
        # d_k is a plain Python int, so use math.sqrt (torch.sqrt needs a tensor).
        scores = (queries @ keys.transpose(-1, -2)) / math.sqrt(d_k)
        if mask is not None:
            # Large negative fill instead of -inf so a fully masked row yields
            # a uniform distribution rather than NaN.
            scores = scores.masked_fill(mask == 1, -1e9)
        return scores.softmax(dim=-1)

    def _split_heads(self, projected):
        """Reshape ``(batch, seq, d_model)`` into ``(batch, heads, seq, d_k)``."""
        batch_size, seq_len = projected.size(0), projected.size(1)
        return projected.view(batch_size, seq_len, self.heads, self.d_k).transpose(1, 2)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape ``(batch, seq, d_model)``.
            mask: Optional mask broadcastable to ``(batch, heads, seq, seq)``;
                positions equal to ``True`` are blocked. Falls back to the mask
                given at construction when ``None``.

        Returns:
            Tensor of shape ``(batch, seq, d_model)``.
        """
        if mask is None:
            mask = self.mask
        batch_size, seq_len = x.size(0), x.size(1)

        # Project, then split every projection using its own shape.
        queries = self._split_heads(self.query_weights(x))
        keys = self._split_heads(self.key_weights(x))
        values = self._split_heads(self.value_weights(x))

        # attention weights: (batch, heads, seq, seq)
        weights = MultiHeadAttention.attention_score(queries, keys, values, mask, self.d_k)

        # Weighted sum of the values, then merge heads back to d_model.
        context = weights @ values  # (batch, heads, seq, d_k)
        context = context.transpose(1, 2).contiguous().view(
            batch_size, seq_len, self.heads * self.d_k
        )
        return self.output_weights(context)

        

        
        







In [212]:
# Scratch check: display the float literal used as the attention-mask fill value.
-1e9

-1000000000.0

In [213]:
# Tensor has no in-place `softmax_` method; the out-of-place `softmax`
# returns a new tensor whose entries sum to 1 along `dim`.
torch.arange(30, dtype=torch.float).softmax(dim=-1)