# Self Attention

Here we will look at how self-attention works, building it up step by step from GloVe word embeddings.

![](https://i.imgur.com/iBq7ZX0.png)

In [3]:
import pickle
import torch
from pathlib import Path
from lib.glove import GloveEmbeddings

In [4]:
# Path of the pickled `glove` object, kept under ~/data.
glove_pkl_path = Path.home()/"data"/"glove.pkl"
# NOTE: unpickling can execute arbitrary code, so only load a file you trust.
# The `with` block closes the file handle as soon as the object is loaded.
with glove_pkl_path.open("rb") as glove_file:
    glove = pickle.load(glove_file)

## Make Embeddings

In [9]:
# Named `sentence` rather than `input` so the built-in `input()` is not shadowed.
sentence = "usa is a rich country"
# One embedding row per word of the sentence.
input_embs = glove.make(sentence)
input_embs.shape

torch.Size([5, 50])

## Let's Derive Weights

From now on, we work with the word at index 2 (the third word, "a").

In [58]:
# Shape of the full embedding matrix next to the shape of the single embedding at index 2.
input_embs.shape, input_embs[2].shape

(torch.Size([5, 50]), torch.Size([50]))

In [60]:
# Turn the embedding at index 2 into a column vector so the matrix product
# gives one dot-product score per row of input_embs.
self_weights = input_embs.matmul(input_embs[2].unsqueeze(-1))
self_weights.shape

torch.Size([5, 1])

In [37]:
# Raw (not yet normalized) dot-product scores, one per row of input_embs.
self_weights

tensor([[ 8.1923],
        [22.7115],
        [28.0887],
        [13.4809],
        [18.6930]])

## Apply Softmax

In [46]:
# Normalize the scores along dim 0 so they sum to 1, then round for display.
prob_weights = self_weights.softmax(dim=0)
prob_weights.round(decimals=3)

tensor([[0.0000],
        [0.0050],
        [0.9950],
        [0.0000],
        [0.0000]])

## Multiply Weights with Embeddings

Multiply these weights with each embedding and add the results together.

In [49]:
# Check the two shapes before multiplying the weights with the embeddings.
prob_weights.shape, input_embs.shape

(torch.Size([5, 1]), torch.Size([5, 50]))

In [54]:
# Weighted sum of the embeddings: (1, 5) @ (5, 50) -> (1, 50).
output = prob_weights.T @ input_embs
# Only the last expression of a cell is displayed, so show the tensor itself.
output

tensor([[ 0.2189,  0.4659, -0.4675,  0.1021,  1.0122,  0.7474, -0.5286, -0.2641,
          0.1686,  0.1317, -0.2459, -0.4389, -0.2166,  0.5061,  0.1349, -0.4267,
         -0.0285,  0.2090, -0.7784, -0.2003, -0.0997,  0.1588, -0.6156, -0.1816,
         -0.1229, -2.2512, -0.2246,  0.5051,  0.3214,  0.1522,  3.9628, -0.7138,
         -0.6676,  0.2802,  0.2166,  0.1421,  0.2586,  0.2341,  0.4260, -0.4434,
          0.1373,  0.3694, -0.6414,  0.0239, -0.0409, -0.2582,  0.1189, -0.0448,
          0.4096,  0.1809]])

# Let's Get the Whole Output

First, let's do this for just 2 embeddings.
That way we can make sure the math is correct.

In [65]:
# Score every embedding against the first two embeddings only.
weights = input_embs.matmul(input_embs[:2].t())
weights

tensor([[23.7851, 10.1376],
        [10.1376, 25.6866],
        [ 8.1923, 22.7115],
        [ 5.2201, 15.3940],
        [10.4412, 19.1203]])

In [78]:
# Softmax along dim 0, so each column of scores becomes weights that sum to 1.
probs = weights.softmax(dim=0)
probs

tensor([[1.0000e+00, 1.6786e-07],
        [1.1830e-06, 9.5013e-01],
        [1.6911e-07, 4.8497e-02],
        [8.6568e-09, 3.2192e-05],
        [1.6026e-06, 1.3368e-03]])

In [80]:
# Check the two shapes before multiplying the weights with the embeddings.
probs.shape, input_embs.shape

(torch.Size([5, 2]), torch.Size([5, 50]))

In [84]:
# (2, 5) @ (5, 50): one weighted-sum embedding per column of probs.
output = probs.t().matmul(input_embs)
output.shape

torch.Size([2, 50])

**Now, let's do this for all of the embeddings**

In [89]:
# Score every pair of embeddings, softmax along dim 0, then take the weighted sums.
weights = input_embs.matmul(input_embs.t()).softmax(dim=0)
output = weights.t().matmul(input_embs)
output.shape

torch.Size([5, 50])

# Creating a Layer

In [91]:
class SelfAttention(torch.nn.Module):
    """Parameter-free self-attention built from plain dot products.

    Every position is scored against every other position with a dot
    product, the scores are normalized with a softmax, and the normalized
    scores are used to take a weighted sum of the input rows. The layer has
    no learnable weights.
    """

    def __init__(self):
        super().__init__()

    def forward(self, inputs):
        """Apply self-attention to ``inputs``.

        Args:
            inputs: Tensor of shape ``(seq_len, dim)``, optionally preceded by
                any number of batch dimensions: ``(..., seq_len, dim)``.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        # Address the last two axes explicitly rather than using `.T`, which
        # reverses *all* axes and therefore breaks on batched (3-D+) input.
        scores = inputs @ inputs.transpose(-2, -1)
        # For 2-D input dim=-2 is dim=0, so unbatched results are unchanged.
        weights = torch.softmax(scores, dim=-2)
        return weights.transpose(-2, -1) @ inputs

In [93]:
# Instantiate the layer and compare the input shape with the output shape.
m = SelfAttention()
input_embs.shape, m(input_embs).shape

(torch.Size([5, 50]), torch.Size([5, 50]))