<a href="https://colab.research.google.com/github/mbrudd/LLMs/blob/main/simple_attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import tiktoken

In [2]:
# Sample sentence to tokenize.
# NOTE(review): the doubled "T" in "TThe" looks like a typo for "The" — confirm.
# Left unchanged because the saved token ids below were presumably generated
# from this exact string (7 ids for a 6-word sentence).
raw_text = "TThe cat sat on the mat"

In [3]:
# Load the "gpt2" encoding from tiktoken; it is used below to turn text into token ids.
tokenizer = tiktoken.get_encoding("gpt2")

In [4]:
# Encode the sample sentence into a list of integer token ids and show it.
enc_text = tokenizer.encode( raw_text )
print(enc_text)

[51, 464, 3797, 3332, 319, 262, 2603]


In [5]:
# Toy embedding table: `vocab_size` rows, each a vector of length `output_dim`.
# These sizes are illustrative only; nothing in the visible cells indexes this
# table with the GPT-2 token ids stored in `enc_text`.
vocab_size = 4
output_dim = 8

# nn.Embedding draws its initial weights at random, so fix the seed first;
# otherwise every Restart & Run All produces different numbers in all later cells.
# The outputs saved in this notebook predate the seed; re-run to refresh them.
torch.manual_seed(123)
inputs = torch.nn.Embedding( vocab_size, output_dim )
print( inputs.weight )

Parameter containing:
tensor([[-0.6937, -0.6726,  1.1384,  1.3864, -0.0198,  0.0271, -0.2929,  0.3828],
        [-0.4971,  0.1606,  0.6017,  0.8687, -1.0573,  0.5387,  0.0531,  1.1315],
        [-0.3628, -1.1921,  0.1930, -1.3738, -0.1367, -0.2918,  1.1546,  0.2428],
        [-0.3136, -1.4087, -0.4015, -1.3621, -0.2958,  0.8103, -0.0063, -0.4285]],
       requires_grad=True)


In [6]:
# Replace the Embedding layer with its raw weight matrix so the rest of the
# notebook works with a plain tensor that does not track gradients.
# `.detach()` is the supported way to get such a tensor; the legacy `.data`
# attribute bypasses autograd's in-place modification checks.
# The isinstance guard keeps this cell re-runnable: after the first run
# `inputs` is already a tensor and no longer has a `.weight` attribute.
if isinstance(inputs, torch.nn.Embedding):
    inputs = inputs.weight.detach()
inputs

tensor([[-0.6937, -0.6726,  1.1384,  1.3864, -0.0198,  0.0271, -0.2929,  0.3828],
        [-0.4971,  0.1606,  0.6017,  0.8687, -1.0573,  0.5387,  0.0531,  1.1315],
        [-0.3628, -1.1921,  0.1930, -1.3738, -0.1367, -0.2918,  1.1546,  0.2428],
        [-0.3136, -1.4087, -0.4015, -1.3621, -0.2958,  0.8103, -0.0063, -0.4285]])

In [7]:
# Shape check: one row per embedding entry, one column per embedding dimension.
inputs.shape

torch.Size([4, 8])

In [8]:
# Print every row as a plain Python list to see the values at full precision
# (the tensor repr above rounds to four decimals).
for values in inputs.tolist():
    print(values)

[-0.6937108635902405, -0.672611653804779, 1.1383668184280396, 1.3864461183547974, -0.019802803173661232, 0.02710053324699402, -0.2929268479347229, 0.3827540874481201]
[-0.4970836341381073, 0.1605609655380249, 0.6016968488693237, 0.8687182664871216, -1.0572795867919922, 0.5386615991592407, 0.05309273675084114, 1.131486415863037]
[-0.36277061700820923, -1.1921308040618896, 0.1929905116558075, -1.373810052871704, -0.13666342198848724, -0.29176580905914307, 1.1545780897140503, 0.24278685450553894]
[-0.31358978152275085, -1.4086920022964478, -0.40148651599884033, -1.3620620965957642, -0.29576554894447327, 0.8103487491607666, -0.006349856965243816, -0.4285391867160797]


In [9]:
# Two small example vectors for demonstrating the dot product.
# `torch.tensor` is the recommended factory for building a tensor from Python
# data; `torch.Tensor(...)` is the legacy class constructor. For these float
# lists both produce the same float32 values.
x = torch.tensor([1.1, 2.3])
y = torch.tensor([3.4, -2.1])


In [10]:
# Dot product of x and y written out by hand (sum of element-wise products),
# for comparison with torch.dot in the next cell.
1.1*3.4 + 2.3*(-2.1)

-1.0899999999999999

In [11]:
# The same dot product computed by PyTorch.
torch.dot( x, y)

tensor(-1.0900)

In [12]:
# Use the third row of `inputs` (index 2) as the query vector.
query = inputs[2]
print(query)

tensor([-0.3628, -1.1921,  0.1930, -1.3738, -0.1367, -0.2918,  1.1546,  0.2428])


In [13]:
# Score the query against each row of `inputs` with a dot product.
for token_vector in inputs:
    print(torch.dot(query, token_vector))

tensor(-0.8820)
tensor(-0.7651)
tensor(4.9732)
tensor(3.2795)


In [14]:
# Collect the query-vs-row dot products into a single 1-D tensor of raw scores.
attention_scores_2 = torch.zeros(inputs.shape[0])
for idx, token_vector in enumerate(inputs):
    attention_scores_2[idx] = torch.dot(query, token_vector)
print(attention_scores_2)

tensor([-0.8820, -0.7651,  4.9732,  3.2795])


In [15]:
# normalize the attention scores using the softmax function:
# def softmax(x):
#     return torch.exp(x) / torch.exp(x).sum()

In [16]:
# Softmax over the only axis turns the raw scores into positive weights that sum to 1.
attention_weights_2 = attention_scores_2.softmax(dim=0)
attention_weights_2

tensor([0.0024, 0.0027, 0.8404, 0.1545])

In [17]:
# Sanity check: the softmax weights should add up to 1.
attention_weights_2.sum()

tensor(1.)

In [18]:
# Context vector for the query: the attention-weighted sum of the rows of `inputs`.
context_vector_2 = torch.zeros_like(query)
for weight, token_vector in zip(attention_weights_2, inputs):
    context_vector_2 += weight * token_vector
context_vector_2

tensor([-0.3563, -1.2207,  0.1045, -1.3593, -0.1635, -0.1185,  0.9688,  0.1418])

In [19]:
# Redisplay the embedding matrix for reference before transposing it.
inputs

tensor([[-0.6937, -0.6726,  1.1384,  1.3864, -0.0198,  0.0271, -0.2929,  0.3828],
        [-0.4971,  0.1606,  0.6017,  0.8687, -1.0573,  0.5387,  0.0531,  1.1315],
        [-0.3628, -1.1921,  0.1930, -1.3738, -0.1367, -0.2918,  1.1546,  0.2428],
        [-0.3136, -1.4087, -0.4015, -1.3621, -0.2958,  0.8103, -0.0063, -0.4285]])

In [20]:
# Transpose: rows become columns, so the 4 x 8 matrix is shown as 8 x 4.
inputs.T

tensor([[-0.6937, -0.4971, -0.3628, -0.3136],
        [-0.6726,  0.1606, -1.1921, -1.4087],
        [ 1.1384,  0.6017,  0.1930, -0.4015],
        [ 1.3864,  0.8687, -1.3738, -1.3621],
        [-0.0198, -1.0573, -0.1367, -0.2958],
        [ 0.0271,  0.5387, -0.2918,  0.8103],
        [-0.2929,  0.0531,  1.1546, -0.0063],
        [ 0.3828,  1.1315,  0.2428, -0.4285]])

In [21]:
# All pairwise dot products in one matrix multiplication:
# entry (i, j) is the score of row i of `inputs` against row j.
attention_scores = torch.matmul(inputs, inputs.T)
attention_scores

tensor([[ 4.3852,  2.5793, -0.8820, -1.3148],
        [ 2.5793,  4.0807, -0.7651, -1.2311],
        [-0.8820, -0.7651,  4.9732,  3.2795],
        [-1.3148, -1.2311,  3.2795,  5.0270]])

In [22]:
# Normalize each row independently by applying softmax over the last axis.
attention_weights = attention_scores.softmax(dim=-1)
attention_weights

tensor([[0.8526, 0.1401, 0.0044, 0.0029],
        [0.1803, 0.8093, 0.0064, 0.0040],
        [0.0024, 0.0027, 0.8404, 0.1545],
        [0.0015, 0.0016, 0.1479, 0.8490]])

In [23]:
# Sanity check on the first row only: a row of softmax weights should add up to 1.
attention_weights[0].sum()

tensor(1.)

In [24]:
# Each output row is the attention-weighted sum of the rows of `inputs`,
# computed for every query at once.
context_vectors = torch.matmul(attention_weights, inputs)
context_vectors

tensor([[-0.6636, -0.5603,  1.0546,  1.2939, -0.1665,  0.0996, -0.2373,  0.4847],
        [-0.5310, -0.0046,  0.6919,  0.9389, -0.8613,  0.4422, -0.0025,  0.9846],
        [-0.3563, -1.2207,  0.1045, -1.3593, -0.1635, -0.1185,  0.9688,  0.1418],
        [-0.3217, -1.3730, -0.3096, -1.3561, -0.2731,  0.6457,  0.1650, -0.3255]])