In [45]:
import torch
import torch.nn as nn
BATCH_SIZE = 1
SEQ_LEN = 4
EMBEDDING_SIZE = 1

def build_input_data(batch_size, seq_len, embedding_size):
    """
    Build a random input data tensor.
    """
    return torch.randn(batch_size, seq_len, embedding_size)
# batch size 4, each sequence of length 3, with embedding size 2
data = build_input_data(BATCH_SIZE, SEQ_LEN, EMBEDDING_SIZE)
print(data.shape)

torch.Size([1, 4, 1])


In [46]:
WQ = nn.Linear(EMBEDDING_SIZE, SEQ_LEN, bias=False)
WK = nn.Linear(EMBEDDING_SIZE, SEQ_LEN, bias=False)
WV = nn.Linear(EMBEDDING_SIZE, SEQ_LEN, bias=False)

Q = WQ(data)
K = WK(data)
V = WV(data)    

print("Q shape:", Q.shape)
print("K shape:", K.shape)  
print("V shape:", V.shape)

scores = Q*K.transpose(1,2)
print("Scores shape:", scores.shape)
print("Scores:", scores)
alpha = torch.softmax(scores, dim=1)
print("Alpha shape:", alpha.shape)
print("Alpha:", alpha)

attention_output = alpha * V
print("Attention Output shape:", attention_output.shape)
print("Attention Output:", attention_output)

Q shape: torch.Size([1, 4, 4])
K shape: torch.Size([1, 4, 4])
V shape: torch.Size([1, 4, 4])
Scores shape: torch.Size([1, 4, 4])
Scores: tensor([[[ 5.6724e-04,  4.8002e-02,  4.9069e-03,  2.1055e-02],
         [ 1.3777e-02,  1.1659e+00,  1.1918e-01,  5.1139e-01],
         [ 1.1793e-02,  9.9798e-01,  1.0202e-01,  4.3775e-01],
         [-6.8609e-03, -5.8059e-01, -5.9350e-02, -2.5467e-01]]],
       grad_fn=<MulBackward0>)
Alpha shape: torch.Size([1, 4, 4])
Alpha: tensor([[[0.2489, 0.1393, 0.2403, 0.2037],
         [0.2522, 0.4261, 0.2694, 0.3326],
         [0.2517, 0.3603, 0.2648, 0.3090],
         [0.2471, 0.0743, 0.2254, 0.1546]]], grad_fn=<SoftmaxBackward0>)
Attention Output shape: torch.Size([1, 4, 4])
Attention Output: tensor([[[-1.4934e-04,  1.6694e-03, -4.9704e-03,  5.4330e-03],
         [-1.3125e-02,  4.4282e-01, -4.8328e-01,  7.6943e-01],
         [-3.5166e-03,  1.0051e-01, -1.2754e-01,  1.9191e-01],
         [ 3.6653e-03, -2.2016e-02,  1.1525e-01, -1.0197e-01]]],
       grad_fn=<

In [44]:
from numpy import array
from numpy import random
from numpy import dot
from scipy.special import softmax

# encoder representations of four different words
word_1 = array([1, 0, 0])
word_2 = array([0, 1, 0])
word_3 = array([1, 1, 0])
word_4 = array([0, 0, 1])

# stacking the word embeddings into a single array
words = array([word_1, word_2, word_3, word_4])
print("Words shape:", words.shape)

# generating the weight matrices
random.seed(42)
W_Q = random.randint(3, size=(3, 3))
W_K = random.randint(3, size=(3, 3))
W_V = random.randint(3, size=(3, 3))
print("W_Q shape:", W_Q.shape)
print("W_K shape:", W_K.shape)  
print("W_V shape:", W_V.shape)

# generating the queries, keys and values
Q = words @ W_Q
K = words @ W_K
V = words @ W_V
print("Q shape:", Q.shape)
print("K shape:", K.shape)  
print("V shape:", V.shape)

# scoring the query vectors against all key vectors
scores = Q @ K.transpose()
print("Scores shape:", scores.shape)
print("Scores:", scores)
# computing the weights by a softmax operation
weights = softmax(scores / K.shape[1] ** 0.5, axis=1)
print("Weights shape:", weights.shape)
print("Weights:", weights)

# computing the attention by a weighted sum of the value vectors
attention = weights @ V

print(attention)

Words shape: (4, 3)
W_Q shape: (3, 3)
W_K shape: (3, 3)
W_V shape: (3, 3)
Q shape: (4, 3)
K shape: (4, 3)
V shape: (4, 3)
Scores shape: (4, 4)
Scores: [[ 8  2 10  2]
 [ 4  0  4  0]
 [12  2 14  2]
 [10  4 14  3]]
Weights shape: (4, 4)
Weights: [[2.36089863e-01 7.38987555e-03 7.49130386e-01 7.38987555e-03]
 [4.54826323e-01 4.51736775e-02 4.54826323e-01 4.51736775e-02]
 [2.39275049e-01 7.43870015e-04 7.59237211e-01 7.43870015e-04]
 [8.99501754e-02 2.81554063e-03 9.05653685e-01 1.58059922e-03]]
[[0.98522025 1.74174051 0.75652026]
 [0.90965265 1.40965265 0.5       ]
 [0.99851226 1.75849334 0.75998108]
 [0.99560386 1.90407309 0.90846923]]


In [1]:
sentence = 'Life is short, eat dessert first'

dc = {s:i for i,s in enumerate(sorted(sentence.replace(',', '').split()))}
print(dc)

{'Life': 0, 'dessert': 1, 'eat': 2, 'first': 3, 'is': 4, 'short': 5}


In [None]:
import torch

sentence_int = torch.tensor([dc[s] for s in sentence.replace(',', '').split()])
# Six words in the sentence, each represented by an integer
print(sentence_int)

tensor([0, 4, 5, 2, 1, 3])


In [None]:
torch.manual_seed(123)
embed = torch.nn.Embedding(6, 16)
embedded_sentence = embed(sentence_int).detach()

print(embedded_sentence)
print(embedded_sentence.shape)

#Single sentence = 1 batch, 6 words, each word represented by a 16-dimensional vector

tensor([[ 0.3374, -0.1778, -0.3035, -0.5880,  0.3486,  0.6603, -0.2196, -0.3792,
          0.7671, -1.1925,  0.6984, -1.4097,  0.1794,  1.8951,  0.4954,  0.2692],
        [ 0.5146,  0.9938, -0.2587, -1.0826, -0.0444,  1.6236, -2.3229,  1.0878,
          0.6716,  0.6933, -0.9487, -0.0765, -0.1526,  0.1167,  0.4403, -1.4465],
        [ 0.2553, -0.5496,  1.0042,  0.8272, -0.3948,  0.4892, -0.2168, -1.7472,
         -1.6025, -1.0764,  0.9031, -0.7218, -0.5951, -0.7112,  0.6230, -1.3729],
        [-1.3250,  0.1784, -2.1338,  1.0524, -0.3885, -0.9343, -0.4991, -1.0867,
          0.8805,  1.5542,  0.6266, -0.1755,  0.0983, -0.0935,  0.2662, -0.5850],
        [-0.0770, -1.0205, -0.1690,  0.9178,  1.5810,  1.3010,  1.2753, -0.2010,
          0.4965, -1.5723,  0.9666, -1.1481, -1.1589,  0.3255, -0.6315, -2.8400],
        [ 0.8768,  1.6221, -1.4779,  1.1331, -1.2203,  1.3139,  1.0533,  0.1388,
          2.2473, -0.8036, -0.2808,  0.7697, -0.6596, -0.7979,  0.1838,  0.2293]])
torch.Size([6, 16])


In [14]:


d = embedded_sentence.shape[1]

d_q, d_k, d_v = 24, 24, 28

print(d, d_q, d_k, d_v)

W_query = torch.nn.Parameter(torch.rand(d_q, d))
W_key = torch.nn.Parameter(torch.rand(d_k, d))
W_value = torch.nn.Parameter(torch.rand(d_v, d))

Q = (W_query @ embedded_sentence.T).T
K = (W_key @ embedded_sentence.T).T
V = (W_value @ embedded_sentence.T).T

print("Q shape:", Q.shape)
print("K shape:", K.shape)
print("V shape:", V.shape)

16 24 24 28
Q shape: torch.Size([6, 24])
K shape: torch.Size([6, 24])
V shape: torch.Size([6, 28])


In [22]:
o = Q @ K.T 
print("o shape:", o.shape)
print("o:", o)

wts = torch.nn.functional.softmax(o/d_k**0.5, dim=1)
print("Weights shape:", wts.shape)
print("Weights:", wts)

attention = wts @ V
print("Attention shape:", attention.shape)
print("Attention:", attention)

o shape: torch.Size([6, 6])
o: tensor([[   2.4085,   -4.4031,  -40.0720,  -10.5427,  -24.6705,   44.0660],
        [ -17.7738,    2.4781,   30.8294,    5.1670,    4.0020,  -22.7538],
        [ -42.4927,  -11.8778,  132.2293,   39.5114,   61.3134, -141.9247],
        [ -26.9960,  -12.5723,   90.3728,   36.1849,   37.4335,  -93.6340],
        [ -24.8323,   -8.0015,   53.0754,    6.9425,   18.4449,  -66.0415],
        [  39.0232,   16.4217, -112.1686,  -38.8169,  -57.1165,  128.8991]],
       grad_fn=<MmBackward0>)
Weights shape: torch.Size([6, 6])
Weights: tensor([[2.0274e-04, 5.0478e-05, 3.4758e-08, 1.4415e-05, 8.0612e-07, 9.9973e-01],
        [4.8515e-05, 3.0283e-03, 9.8753e-01, 5.2430e-03, 4.1334e-03, 1.7555e-05],
        [3.2426e-16, 1.6784e-13, 1.0000e+00, 6.0333e-09, 5.1678e-07, 4.9688e-25],
        [3.9378e-11, 7.4801e-10, 9.9996e-01, 1.5712e-05, 2.0273e-05, 4.8728e-17],
        [1.2390e-07, 3.8470e-06, 9.9906e-01, 8.1267e-05, 8.5035e-04, 2.7535e-11],
        [1.0777e-08, 1.0688e-