# DeepLearningAI course

## To do

- add formatting packages from ds-base

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F  # to access softmax function

In [26]:
class SelfAttention(nn.Module):
    """Single-head self-attention built from three bias-free linear projections.

    Implements Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k)) V.
    """

    def __init__(self, d_model=2, row_dim=0, col_dim=1):
        """Create the query/key/value projections and store the axis indices.

        d_model
            Dimension of the model, i.e. the number of word embedding values
            per token. It sets the size of the weight matrices that produce
            the queries, keys and values, so it must match the number of
            embedding values in the data for the matrix math to work.

        row_dim
            Convenience parameter: index of the row axis of the data.

        col_dim
            Convenience parameter: index of the column axis of the data.

        Note: usually the first dimension is the batch size, but batches of
        data are not used here, hence the defaults of 0 and 1.
        """
        super().__init__()  # run nn.Module's own initialisation first

        self.row_dim = row_dim
        self.col_dim = col_dim

        def bias_free_projection():
            # nn.Linear creates the weight matrix (untrained at this point)
            # and performs the multiplication when it is called on data.
            # in_features / out_features give the weight matrix dimensions;
            # bias=False means no additional bias term is created.
            return nn.Linear(in_features=d_model, out_features=d_model, bias=False)

        # Created in query, key, value order so that seeded initialisation
        # yields the same weights for each layer every time.
        self.W_q = bias_free_projection()
        self.W_k = bias_free_projection()
        self.W_v = bias_free_projection()

    def forward(self, token_encodings, return_scaled_sims=False):
        """Calculate the self-attention values for each token.

        token_encodings
            Data whose column axis (col_dim) holds the d_model embedding values.

        return_scaled_sims
            When True, return the scaled similarity scores (before softmax)
            instead of the attention values; useful for visualising the
            attention pattern.
        """
        # Calling an nn.Linear on the data performs the matrix multiplication
        # with that layer's weights.
        projections = (self.W_q, self.W_k, self.W_v)
        q, k, v = (project(token_encodings) for project in projections)

        # Similarities are the dot products of every query with every key.
        keys_transposed = torch.transpose(k, self.row_dim, self.col_dim)
        # Scale by the square root of the number of columns in k.
        scale = torch.tensor(k.size(self.col_dim) ** 0.5)
        scaled_sims = (q @ keys_transposed) / scale

        if return_scaled_sims:
            return scaled_sims

        # Softmax along the column axis, then use the resulting
        # percentages to weight the values.
        attention_percents = F.softmax(scaled_sims, dim=self.col_dim)
        return attention_percents @ v


    

In [27]:
# Seed the random number generator before building the model so the
# freshly initialised (untrained) weights are the same on every run.
torch.manual_seed(42)

# One row per token, two embedding values per row (matches d_model=2).
encodings_matrix = torch.tensor(
    [
        [1.16, 0.23],
        [0.57, 1.36],
        [4.41, -2.16],
    ]
)

selfAttention = SelfAttention(d_model=2, row_dim=0, col_dim=1)

# Because the model inherits from nn.Module, calling the instance passes
# the encodings to its forward() method, which returns the attention
# scores (self-attention values).
selfAttention(encodings_matrix)

tensor([[1.0100, 1.0641],
        [0.2040, 0.7057],
        [3.4989, 2.2427]], grad_fn=<MmBackward0>)

In [28]:
# Request the scaled similarity scores (the values fed to softmax)
# instead of the final attention values.
scaled_sims = selfAttention(encodings_matrix, return_scaled_sims=True)
print("Scaled Similarity Scores:", scaled_sims, sep="\n")

Scaled Similarity Scores:
tensor([[-0.0700,  0.0458, -0.4612],
        [-0.2844,  0.2883, -2.1230],
        [ 0.3424, -0.4725,  2.8610]], grad_fn=<DivBackward0>)


In [14]:
# Sanity check: display the weight matrices that were used in the
# calculation so the attention math can be verified by hand.
# The stored weight is transposed before being shown.

torch.transpose(selfAttention.W_q.weight, 0, 1)

tensor([[ 0.5406, -0.1657],
        [ 0.5869,  0.6496]], grad_fn=<TransposeBackward0>)

In [15]:
# Key projection weights, transposed for display.
torch.transpose(selfAttention.W_k.weight, 0, 1)

tensor([[-0.1549, -0.3443],
        [ 0.1427,  0.4153]], grad_fn=<TransposeBackward0>)

In [16]:
# Value projection weights, transposed for display.
torch.transpose(selfAttention.W_v.weight, 0, 1)

tensor([[ 0.6233,  0.6146],
        [-0.5188,  0.1323]], grad_fn=<TransposeBackward0>)