# Attention Mechanisms

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
from modules.Attention import DotAttention


def test_attention(N:int,D:int,D_k:int,D_v:int,Attention_function):
    """Time one call of ``Attention_function`` and compare it with ``DotAttention``.

    A random input of shape (N, D) is projected with random weight matrices
    into queries and keys of width ``D_k`` and values of width ``D_v``.

    Args:
        N: number of rows of the random input.
        D: number of columns of the random input.
        D_k: width of the query and key projections.
        D_v: width of the value projection.
        Attention_function: callable invoked as ``Attention_function(Q, K, V)``.

    Returns:
        A tuple ``(elapsed_seconds, mse)``: the wall-clock duration of the single
        ``Attention_function`` call, and the mean squared error between its
        output and the output of ``DotAttention()(Q, K, V)`` as a Python float.
    """
    inputs = torch.rand([N, D])
    # Weights are drawn in query, key, value order.
    w_query = torch.rand([D, D_k])
    w_key = torch.rand([D, D_k])
    w_value = torch.rand([D, D_v])

    queries, keys, values = inputs @ w_query, inputs @ w_key, inputs @ w_value

    # Only the call under test is timed; the reference call below is not.
    t0 = time.perf_counter()
    candidate = Attention_function(queries, keys, values)
    elapsed = time.perf_counter() - t0

    reference = DotAttention()(queries, keys, values)
    mse = F.mse_loss(candidate, reference).item()

    return elapsed, mse

def average_test(N:int,D:int,D_k:int,D_v:int,Attention_function,iterations):
    """Run ``test_attention`` several times and print the mean duration and loss.

    Args:
        N, D, D_k, D_v: sizes forwarded unchanged to ``test_attention``.
        Attention_function: callable forwarded unchanged to ``test_attention``.
        iterations: number of independent runs to average over; must be >= 1.

    Returns:
        None. The averages are printed, not returned.

    Raises:
        ValueError: if ``iterations`` is smaller than 1. Without this check a
            zero count fails with ZeroDivisionError and a negative count
            prints an average over zero runs.
    """
    if iterations < 1:
        raise ValueError(f"iterations must be a positive integer, got {iterations}")

    total_sum_duration = 0
    total_sum_loss = 0
    for _ in range(iterations):
        duration, loss = test_attention(N,D,D_k,D_v,Attention_function)
        total_sum_duration += duration
        total_sum_loss += loss

    print(f"Average Duration = {(total_sum_duration/iterations):.9f}\nAverage Loss = {total_sum_loss/iterations}")
    
    

## Dot Attention

$X \in \mathbb{R}^{n \times d}$, $W_Q \in \mathbb{R}^{d \times d_k}$, $W_K \in \mathbb{R}^{d \times d_k}$, $W_V \in \mathbb{R}^{d \times d_v}$, with $Q = XW_Q$, $K = XW_K$, $V = XW_V$ and $\sigma$ the row-wise softmax, such that $$\text{self-attention}(X) = \sigma\left(\frac{(XW_Q) \cdot (XW_K)^T}{\sqrt{d_k}}\right)V$$

In [2]:
from modules.Attention import DotAttention

# Problem sizes used by the benchmark cells of this notebook.
N, D, D_k = 10, 20, 10
D_v = N  # the value width is set equal to N here

# Baseline run: test_attention compares the given function against a fresh
# DotAttention instance, so this measures DotAttention against itself.
dot_attention = DotAttention()
average_test(N, D, D_k, D_v, dot_attention, 100)


Average Duration = 0.000070139
Average Loss = 0.0


## Kernel Attention

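$Q$, $K$ and $V$ have the same dimensions as before, i.e. after the projections $Q = XW_Q$, $K = XW_K$, $V = XW_V$. With a feature map $\phi$ applied row-wise and the division taken row-wise (each row is divided by its normaliser):
$$\text{Kernel self-attention} = \frac{\left(\phi({Q})\cdot\phi({K})^T\right) V}{\left(\phi({Q})\cdot\phi({K})^T\right)\mathbf{1}_n}$$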

In [21]:
from modules.Attention import KernalAttention

# Benchmark KernalAttention; the loss printed by average_test is measured
# against DotAttention inside test_attention.
kernal_attention = KernalAttention()

# NOTE(review): D_v is given the value of D_k here rather than the notebook's
# D_v variable. The two are equal with the sizes assigned in the Dot Attention
# cell; confirm that this is intended.
average_test(
    N=N,
    D=D,
    D_k=D_k,
    D_v=D_k,
    Attention_function=kernal_attention,
    iterations=100,
)


Average Duration = 0.000026162
Average Loss = 2179.9406811523436


## "Efficient Attention"


$X \in \mathbb{R}^{n \times d}$, $V \in \mathbb{R}^{n \times d_v}$, $K \in \mathbb{R}^{n \times d_k}$, $Q \in \mathbb{R}^{n \times d_k}$. The normalisers $\rho_q$ and $\rho_k$ are either the scaling $$\rho_q(Y) = \rho_k(Y) = \frac{Y}{\sqrt{n}}$$
or the softmax pair $$\rho_q(Y) = \sigma_{\text{row}} (Y), $$ $$\rho_k(Y) = \sigma_{\text{col}}(Y)$$
In both cases $$\text{EffAtt}(X) = \rho_q(Q) \cdot \left(\rho_k(K)^TV\right)$$

In [42]:
from modules.Attention import EfficientAttention

# Benchmark EfficientAttention; the loss printed by average_test is measured
# against DotAttention inside test_attention.
efficient_attention = EfficientAttention()

# NOTE(review): D_v is given the value of D_k here rather than the notebook's
# D_v variable. The two are equal with the sizes assigned in the Dot Attention
# cell; confirm that this is intended.
average_test(
    N=N,
    D=D,
    D_k=D_k,
    D_v=D_k,
    Attention_function=efficient_attention,
    iterations=100,
)

Average Duration = 0.000054338
Average Loss = 1.9434287661314011


# Multi-Headed Attention

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

from modules.Attention import MultiHeadSelfAttention

# Build a random input and random projection weights from the sizes N, D, D_k
# and D_v assigned in an earlier cell.
X = torch.rand([N,D])
WQ = torch.rand([D,D_k])
WK = torch.rand([D,D_k])
WV = torch.rand([D,D_v])

# NOTE(review): Q, K and V are computed here but not used by the remaining
# lines of this cell; MHA is called on X directly.
Q = X@WQ
K = X@WK
V = X@WV

# NOTE(review): the recorded output of this cell is a RuntimeError reporting a
# size mismatch of 10 vs 8 at dimension 0. 10 equals N and 8 equals the first
# constructor argument (presumably the number of heads; confirm against the
# MultiHeadSelfAttention signature). X has no batch dimension; verify the
# input shape the module expects.
MHA = MultiHeadSelfAttention(8,X.size(),D_k,dot_attention)
MHA(X)



RuntimeError: The size of tensor a (10) must match the size of tensor b (8) at non-singleton dimension 0