2023-1-2

Changes:
- mandate that the embedding dimension equals the output dimension of the multi-head self-attention layer and of the subsequent FF layers (required because the skip connections add each sub-layer's input to its output)
- removed the custom FFlayer class 

Todo:
- implement positional embeddings
- implement decoder

In [8]:
import numpy as np
import torch 
import torchvision
from torchvision import transforms
from torch import nn
from torch.nn.parameter import Parameter
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
import random
import copy
import matplotlib.pyplot as plt
import math
import pandas as pd


In [9]:
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Projects the input embeddings to queries, keys and values with three
    learned matrices and returns the attention-weighted sum of the values.

    Args:
        embedding_dim: Size of the last dimension of the input embeddings.
        qkv_dim: Size of the query, key and value vectors (and therefore of
            the last dimension of the output).
    """

    def __init__(self, embedding_dim, qkv_dim):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.qkv_dim = qkv_dim

        # Learned projection matrices, each (embedding_dim, qkv_dim),
        # initialised from a uniform distribution on [0, 1).
        self.query_matrix = Parameter(torch.rand(embedding_dim, qkv_dim), requires_grad=True)
        self.key_matrix = Parameter(torch.rand(embedding_dim, qkv_dim), requires_grad=True)
        self.value_matrix = Parameter(torch.rand(embedding_dim, qkv_dim), requires_grad=True)

    def forward(self, embeddings):
        """Apply self-attention to a sequence of embeddings.

        Args:
            embeddings: Tensor of shape ``(seq_len, embedding_dim)`` or, with
                any number of leading batch dimensions,
                ``(..., seq_len, embedding_dim)``.

        Returns:
            Tensor of shape ``(..., seq_len, qkv_dim)``.
        """
        queries = torch.matmul(embeddings, self.query_matrix)
        keys = torch.matmul(embeddings, self.key_matrix)
        values = torch.matmul(embeddings, self.value_matrix)

        # Swap only the last two axes so that batched inputs work; for a 2-D
        # input this is the same as ``keys.T``.
        attention_scores = torch.matmul(queries, keys.transpose(-2, -1))
        scaled_attention_scores = attention_scores / math.sqrt(self.qkv_dim)

        # Normalise over the key axis (the last one) so that each query's
        # weights sum to 1.
        attention_weights = torch.softmax(scaled_attention_scores, dim=-1)
        return torch.matmul(attention_weights, values)
        

In [41]:
class MultiheadAttention(nn.Module):
    """Multi-head self-attention built from independent SelfAttention heads.

    Every head attends over the same input; the head outputs are concatenated
    along the feature axis and projected back to ``embedding_dim`` so that the
    result can be added to the input in a skip connection.

    Args:
        embedding_dim: Size of the last dimension of the input (and output).
        qkv_dim: Size of the query, key and value vectors inside each head.
        n_heads: Number of attention heads.
    """

    def __init__(self, embedding_dim,
                 qkv_dim,
                 # Note: there is no out_dim argument. The output dimension
                 # must equal embedding_dim because the input to the MHA is
                 # added to its output in a skip connection.
                 n_heads=8):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.qkv_dim = qkv_dim
        self.n_heads = n_heads

        # The heads must live in an nn.ModuleList: a plain Python list hides
        # them from nn.Module, so their parameters would be missing from
        # .parameters() / state_dict(), ignored by .to(device), and never
        # updated by an optimizer.
        self.attention_heads = nn.ModuleList(
            [SelfAttention(embedding_dim, qkv_dim) for _ in range(n_heads)]
        )
        # Projects the concatenated head outputs (qkv_dim * n_heads features)
        # back to embedding_dim features.
        self.after_concat_multiplier_matrix = Parameter(
            torch.rand(qkv_dim * n_heads, embedding_dim), requires_grad=True
        )

    def forward(self, embeddings):
        """Run all heads on ``embeddings`` and recombine their outputs.

        Args:
            embeddings: Tensor whose last dimension is ``embedding_dim``.

        Returns:
            Tensor with the same last dimension, ``embedding_dim``.
        """
        head_outputs = [attention_head(embeddings) for attention_head in self.attention_heads]
        # Concatenate along the feature (last) axis; for 2-D inputs this is
        # the same as dim=1.
        concatenated = torch.cat(head_outputs, dim=-1)
        return torch.matmul(concatenated, self.after_concat_multiplier_matrix)

In [76]:
class TransformerEncoder(nn.Module):
    """Two stacked encoder layers: (attention -> add & norm -> FF -> add & norm) x 2.

    Every sub-layer maps ``embedding_dim`` features to ``embedding_dim``
    features, which is what allows the skip-connection additions.

    Args:
        embedding_dim: Dimension of the word embeddings.
        qkv_dim: Dimension of the query, key, and value vectors.
    """

    def __init__(self,
                 embedding_dim,  # dimension of the word embeddings
                 qkv_dim,  # dimension of the query, key, and value vectors
                 ):
        super().__init__()

        # NOTE(review): one LayerNorm instance (one set of affine parameters)
        # is shared by all four normalisation points below. The original
        # Transformer uses a separate LayerNorm per sub-layer; the shared one
        # is kept here so the module's parameter layout does not change.
        self.layer_norm = nn.LayerNorm(embedding_dim)

        self.multiheadAttention1 = MultiheadAttention(embedding_dim, qkv_dim)
        self.feedforwardLayer1 = nn.Linear(embedding_dim, embedding_dim, bias=True)

        self.multiheadAttention2 = MultiheadAttention(embedding_dim, qkv_dim)
        self.feedforwardLayer2 = nn.Linear(embedding_dim, embedding_dim, bias=True)

        self.embedding_dim = embedding_dim
        self.qkv_dim = qkv_dim

    def forward(self, embeddings):
        """Encode ``embeddings`` with the two encoder layers.

        Args:
            embeddings: Tensor whose last dimension is ``embedding_dim``.

        Returns:
            Tensor of the same shape as ``embeddings``.
        """
        # ---- Encoder layer 1 ----
        MHA1_output = self.multiheadAttention1(embeddings)

        # Skip connection (input + sub-layer output), then layer normalisation.
        layernormed_output1 = self.layer_norm(embeddings + MHA1_output)

        # Feed-forward sub-layer. Its input is kept so it can be added back in
        # the next skip connection (no copy is needed: nothing is modified
        # in place).
        FF1_output = self.feedforwardLayer1(layernormed_output1)

        # These are the embeddings passed to the second encoder layer.
        layernormed_output2 = self.layer_norm(layernormed_output1 + FF1_output)

        # ---- Encoder layer 2 ----
        MHA2_output = self.multiheadAttention2(layernormed_output2)
        layernormed_output3 = self.layer_norm(layernormed_output2 + MHA2_output)

        # The feed-forward sub-layer must consume the normalised attention
        # output of *this* layer (layernormed_output3), not the layer's input.
        FF2_output = self.feedforwardLayer2(layernormed_output3)

        final_encoder_output = self.layer_norm(layernormed_output3 + FF2_output)
        return final_encoder_output
        

In [77]:
# Tiny encoder for a smoke test: 4-dimensional embeddings, 3-dimensional q/k/v vectors.
enc = TransformerEncoder(embedding_dim=4, qkv_dim=3)

In [78]:
# Random stand-in input of shape (2, 4): two rows of 4 features each.
fake_word_embeddings = torch.rand(size=(2, 4))

In [79]:
# Run the encoder on the fake embeddings; as the cell's final expression, the result is displayed below.
enc(fake_word_embeddings)

tensor([[ 0.0838,  0.7952, -1.6590,  0.7800],
        [ 0.1211,  0.7543, -1.6692,  0.7938]],
       grad_fn=<NativeLayerNormBackward0>)