In [1]:
import pandas as pd
import numpy as np
import nltk
import gensim
import torch
import torch.nn as nn

In [2]:
class MultiHeadSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    The input is projected to queries, keys and values with bias-free linear
    layers, split into ``num_heads`` heads of size ``d_model // num_heads``,
    attended per head, re-merged and passed through an output projection.

    Args:
        d_model: Size of the last dimension of the input and of the output.
        num_heads: Number of attention heads; must be positive and divide
            ``d_model`` evenly.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``d_model``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        # Fail at construction time with a clear message instead of letting a
        # floored head_dim surface later as an opaque reshape error in forward().
        if num_heads <= 0:
            raise ValueError(f"num_heads must be a positive integer, got {num_heads}")
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )

        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.W_Q = nn.Linear(d_model, d_model, bias=False)
        self.W_K = nn.Linear(d_model, d_model, bias=False)
        self.W_V = nn.Linear(d_model, d_model, bias=False)
        self.W_O = nn.Linear(d_model, d_model, bias=False)

    def _split_heads(self, t, B, L):
        """Reshape (B, L, d_model) into (B, num_heads, L, head_dim)."""
        return t.reshape(B, L, self.num_heads, self.head_dim).permute(0, 2, 1, 3)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape (batch, sequence_length, d_model).
            mask: Optional tensor broadcastable to (batch, num_heads, L, L).
                A floating-point mask is added to the scaled scores (use
                ``-inf`` to block a position). A boolean mask marks with
                ``True`` the positions that may be attended to; ``False``
                positions are filled with ``-inf`` before the softmax.
                A query row with every key masked out yields NaN weights.

        Returns:
            Tensor of shape (batch, sequence_length, d_model).
        """
        B, L = x.shape[:2]

        # Project, then split each (B, L, d_model) projection into heads,
        # where num_heads * head_dim == d_model.
        Q = self._split_heads(self.W_Q(x), B, L)
        K = self._split_heads(self.W_K(x), B, L)
        V = self._split_heads(self.W_V(x), B, L)

        # Scaled attention scores, shape (B, num_heads, L, L).
        scores = (Q @ K.transpose(-2, -1)) / (self.head_dim ** 0.5)

        if mask is not None:
            if mask.dtype == torch.bool:
                # Adding a bool mask would only shift logits by 0/1; treat it
                # as a keep-mask instead and block the False positions.
                scores = scores.masked_fill(~mask, float("-inf"))
            else:
                scores = scores + mask

        attention_weights = torch.softmax(scores, dim=-1)  # (B, num_heads, L, L)
        out = attention_weights @ V                        # (B, num_heads, L, head_dim)

        # Merge the heads back so the output matches the input shape.
        out = out.permute(0, 2, 1, 3).reshape(B, L, self.d_model)

        # Output projection.
        return self.W_O(out)

In [3]:
#x = torch.randn(32, 10, 256)
#mha = MultiHeadSelfAttention(256, 8)
#y = mha(x)

#print(x.shape, y.shape)
#print(y)

In [4]:
class FeedForward(nn.Module):
    """Two-layer feed-forward network applied to the last dimension.

    Expands ``d_model`` features to ``4 * d_model``, applies ReLU, and
    projects back down to ``d_model``.

    Args:
        d_model: Size of the input and output feature dimension.
    """

    def __init__(self, d_model):
        super().__init__()
        hidden_dim = 4 * d_model
        expand = nn.Linear(d_model, hidden_dim)
        activation = nn.ReLU()
        project = nn.Linear(hidden_dim, d_model)
        self.network = nn.Sequential(expand, activation, project)

    def forward(self, x):
        """Return the network output; same shape as ``x``."""
        return self.network(x)

In [5]:
#ffn = FeedForward(256)
#x = torch.randn(2, 5, 256)
#y = ffn(x)

#print(x.shape, y.shape)


In [7]:
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention then feed-forward.

    Each sub-layer's output is added to its input and the sum is passed
    through its own LayerNorm (normalisation happens after the residual add).

    Args:
        d_model: Feature dimension of the input and output.
        num_heads: Number of attention heads passed to the attention layer.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()

        # Sub-layers.
        self.attention = MultiHeadSelfAttention(d_model, num_heads)
        self.feed_forward = FeedForward(d_model)

        # One normalisation per residual connection.
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        """Run the block on ``x``; ``mask`` is forwarded to the attention layer."""
        # Attention sub-layer with residual connection, then normalise.
        attended = self.layer_norm1(self.attention(x, mask) + x)

        # Feed-forward sub-layer with residual connection, then normalise.
        return self.layer_norm2(self.feed_forward(attended) + attended)

In [10]:
#x = torch.randn(2, 5, 256)
#block = EncoderBlock(256, 8)
#y = block(x)

#print(x.shape, y.shape)

In [11]:
#y.mean().backward()

In [12]:
class Encoder(nn.Module):
    """Stack of ``num_layers`` EncoderBlock layers applied in sequence.

    Args:
        d_model: Feature dimension passed to every block.
        num_heads: Number of attention heads passed to every block.
        num_layers: How many blocks to stack.
    """

    def __init__(self, d_model, num_heads, num_layers):
        super().__init__()
        self.num_layers = num_layers
        layers = []
        for _ in range(num_layers):
            layers.append(EncoderBlock(d_model, num_heads))
        self.blocks = nn.ModuleList(layers)

    def forward(self, x, mask=None):
        """Feed ``x`` through every block in order, passing the same ``mask`` to each."""
        hidden = x
        for layer in self.blocks:
            hidden = layer(hidden, mask)
        return hidden