In [3]:
import torch
import torch.nn as nn # to get the linear and module classes, and other helper functions
import torch.nn.functional as F #accessing the softmax function we will use later

In [None]:
# nn.Module is the base class for all neural network modules we make with PyTorch

class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Computes ``softmax(Q @ K^T / sqrt(d_k)) @ V`` where Q, K and V are three
    bias-free linear projections of the same token encodings.

    Args:
        d_model: number of embedding values per token. It is used as both the
            input and the output width of the three projections.
        row_dim: index of the token (row) dimension of the input.
        col_dim: index of the embedding (column) dimension of the input.
            The defaults (0 and 1) fit an unbatched ``(tokens, d_model)``
            matrix; for a batched ``(batch, tokens, d_model)`` input pass
            ``row_dim=1, col_dim=2``.
    """

    def __init__(self, d_model=2,
                 row_dim = 0,
                 col_dim = 1):
        super().__init__()

        # Create the weights used for Q, K and V (query, key and value weights).
        # nn.Linear stores its weight as (out_features, in_features) and computes
        # x @ weight^T, so "encodings @ W_q^T = Q" is exactly what calling
        # self.W_q(x) does -- the layer holds the values AND does the math.
        # The original Transformer paper adds no bias here, hence bias=False.
        self.W_q = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.W_k = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.W_v = nn.Linear(in_features=d_model, out_features=d_model, bias=False)

        # row_dim / col_dim are dimension *indices*, not sizes: they tell
        # forward() which two axes to swap for K^T and which axis softmax
        # normalises over.
        self.row_dim = row_dim
        self.col_dim = col_dim

    def forward(self, token_encodings):
        """Return the self-attention values for every token.

        Args:
            token_encodings: floating-point tensor whose ``col_dim`` axis has
                size ``d_model`` (word embedding plus positional encoding
                for each token).

        Returns:
            Tensor with the same shape as ``token_encodings``: each token's
            values replaced by an attention-weighted mix of all tokens' values.
        """
        q = self.W_q(token_encodings)  # does the matrix multiplication
        k = self.W_k(token_encodings)
        v = self.W_v(token_encodings)

        # Unscaled dot-product similarities: Q @ K^T.
        similarity_score = torch.matmul(
            q, k.transpose(dim0=self.row_dim, dim1=self.col_dim)
        )

        # Scale by sqrt(d_k), the width of the key vectors. Dividing by a plain
        # Python float keeps the dtype of the scores and, unlike wrapping the
        # value in torch.tensor(...), does not round sqrt(d_k) to float32 when
        # the inputs are float64.
        scaled_sim = similarity_score / (k.size(self.col_dim) ** 0.5)

        # Softmax over the key axis: for each query token, the share of every
        # token's value to use in the final attention values.
        attention_percents = F.softmax(scaled_sim, dim=self.col_dim)

        # Weight the values V by those shares to get the self-attention values.
        attention_score = torch.matmul(attention_percents, v)

        return attention_score



In [None]:
# Sanity check: run the module once on a small, hand-written input.

# Seed the global RNG before the layer is built so that its randomly
# initialised weights are the same on every run.
torch.manual_seed(42)

# One row per token, one column per embedding value (3 tokens, d_model = 2).
encodings_matrix = torch.tensor(
    [
        [1.16, 0.23],
        [0.57, 1.36],
        [4.41, -2.16],
    ]
)

selfAttention = SelfAttention(d_model=2, row_dim=0, col_dim=1)

# Self-attention values for every token (last expression, so the notebook shows it).
selfAttention(encodings_matrix)

tensor([[1.0100, 1.0641],
        [0.2040, 0.7057],
        [3.4989, 2.2427]], grad_fn=<MmBackward0>)

In [10]:
# Check the math by hand, starting with the query weights. They are shown
# transposed so they read as the W^T used in "encodings @ W^T = Q".
torch.transpose(selfAttention.W_q.weight, 0, 1)

tensor([[ 0.5406, -0.1657],
        [ 0.5869,  0.6496]], grad_fn=<TransposeBackward0>)

In [11]:
# Key weights, transposed the same way.
torch.transpose(selfAttention.W_k.weight, 0, 1)

tensor([[-0.1549, -0.3443],
        [ 0.1427,  0.4153]], grad_fn=<TransposeBackward0>)

In [12]:
# Value weights, transposed the same way.
torch.transpose(selfAttention.W_v.weight, 0, 1)

tensor([[ 0.6233,  0.6146],
        [-0.5188,  0.1323]], grad_fn=<TransposeBackward0>)

In [19]:
# Project the encodings through each weight layer to get the queries, keys and values.
q, k, v = (
    projection(encodings_matrix)
    for projection in (selfAttention.W_q, selfAttention.W_k, selfAttention.W_v)
)
print(q, k, v, sep="\n")

tensor([[ 0.7621, -0.0428],
        [ 1.1063,  0.7890],
        [ 1.1164, -2.1336]], grad_fn=<MmBackward0>)
tensor([[-0.1469, -0.3038],
        [ 0.1057,  0.3685],
        [-0.9914, -2.4152]], grad_fn=<MmBackward0>)
tensor([[ 0.6038,  0.7434],
        [-0.3502,  0.5303],
        [ 3.8695,  2.4246]], grad_fn=<MmBackward0>)


In [20]:
# Re-derive the attention values by hand, one step at a time.
sims = torch.matmul(q, k.transpose(dim0=0, dim1=1))  # unscaled similarities: Q @ K^T
# Scale by the SQUARE ROOT of the key width (sqrt(d_model)), not by d_model itself.
# The width is read from k rather than hard-coded as 2.
scaled_sim = sims / (k.size(1) ** 0.5)
attention_percents = F.softmax(scaled_sim, dim=1)  # softmax over each row -> percentages per token
attention_score = torch.matmul(attention_percents, v)  # weight the values by those percentages
print(sims)
print(scaled_sim)
print(attention_percents)

tensor([[-0.0990,  0.0648, -0.6523],
        [-0.4022,  0.4078, -3.0024],
        [ 0.4842, -0.6683,  4.0461]], grad_fn=<MmBackward0>)
tensor([[-0.0700,  0.0458, -0.4612],
        [-0.2844,  0.2883, -2.1230],
        [ 0.3424, -0.4725,  2.8610]], grad_fn=<DivBackward0>)
tensor([[0.3573, 0.4011, 0.2416],
        [0.3410, 0.6047, 0.0542],
        [0.0722, 0.0320, 0.8959]], grad_fn=<SoftmaxBackward0>)


In [21]:
# The hand-computed values must agree with what the module returns for the
# same encodings; assert it instead of comparing the printed numbers by eye.
torch.testing.assert_close(attention_score, selfAttention(encodings_matrix))
attention_score

tensor([[1.0100, 1.0641],
        [0.2040, 0.7057],
        [3.4989, 2.2427]], grad_fn=<MmBackward0>)