## 从头开始构建大模型（Build an LLM from Scratch）

### chapt03 构建attention

In [2]:
import torch

  from .autonotebook import tqdm as notebook_tqdm


#### step01 初始化
* 定义输入输出维度

In [9]:
# Toy embedding matrix: one 3-dimensional row per token of the example sentence.
inputs = torch.tensor([
    [0.43, 0.15, 0.89],  # Your     (x^1)
    [0.55, 0.87, 0.66],  # journey  (x^2)
    [0.57, 0.85, 0.64],  # starts   (x^3)
    [0.22, 0.58, 0.33],  # with     (x^4)
    [0.77, 0.25, 0.10],  # one      (x^5)
    [0.05, 0.80, 0.55],  # step     (x^6)
])

# Row index 1 is the second token (indexing is 0-based).
x_2 = inputs[1]

# Input embedding width is read off the data (3 columns); the output
# embedding width is fixed at 2.
d_in = inputs.size(1)
d_out = 2

* 定义W_q,W_k,W_v

In [10]:
# Seed the global RNG so the random projection matrices are reproducible.
torch.manual_seed(123)

# Three (d_in, d_out) projection matrices with gradient tracking disabled.
# The generator is consumed left to right, so the random draws happen in the
# order query -> key -> value.
W_query, W_key, W_value = (
    torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
    for _ in range(3)
)

print(W_query)

Parameter containing:
tensor([[0.2961, 0.5166],
        [0.2517, 0.6886],
        [0.0740, 0.8665]])


* 计算 query、key 和 value 向量

In [18]:
# Project the second input element (hence the "_2" suffix) with each of the
# three weight matrices to obtain its query, key and value vectors.
query_2 = torch.matmul(x_2, W_query)
key_2 = torch.matmul(x_2, W_key)
value_2 = torch.matmul(x_2, W_value)

# Show the three projected vectors.
print("query_2:", query_2)
print("key_2  :", key_2)
print("value_2:", value_2)

query_2: tensor([0.4306, 1.4551])
key_2  : tensor([0.4433, 1.1419])
value_2: tensor([0.3951, 1.0037])


In [19]:
# Same projections as for the single token, applied to every row of `inputs`
# at once: each result holds one projected vector per input row.
querys = torch.matmul(inputs, W_query)
keys = torch.matmul(inputs, W_key)
values = torch.matmul(inputs, W_value)

# Show the three projected matrices.
print("querys:", querys)
print("keys  :", keys)
print("values:", values)

querys: tensor([[0.2309, 1.0966],
        [0.4306, 1.4551],
        [0.4300, 1.4343],
        [0.2355, 0.7990],
        [0.2983, 0.6565],
        [0.2568, 1.0533]])
keys  : tensor([[0.3669, 0.7646],
        [0.4433, 1.1419],
        [0.4361, 1.1156],
        [0.2408, 0.6706],
        [0.1827, 0.3292],
        [0.3275, 0.9642]])
values: tensor([[0.1855, 0.8812],
        [0.3951, 1.0037],
        [0.3879, 0.9831],
        [0.2393, 0.5493],
        [0.1492, 0.3346],
        [0.3221, 0.7863]])


#### step2 我们通过计算query和每个key向量之间的点积，计算出未归一化的attention 得分

In [20]:
# Key vector of the second token: row index 1, since indexing is 0-based.
keys_2 = keys[1]

# Unnormalized attention score of token 2 attending to itself: the dot
# product of its query with its own key.
attn_score_22 = torch.dot(query_2, keys_2)
print(attn_score_22)

tensor(1.8524)


In [22]:
# Dot product of query_2 with every key at once: multiplying by the
# transposed key matrix yields one unnormalized score per input row.
attn_scores_2 = torch.matmul(query_2, keys.T)
print(attn_scores_2)

tensor([1.2705, 1.8524, 1.8111, 1.0795, 0.5577, 1.5440])


#### step3 我们先将注意力得分除以 $\sqrt{d_k}$（key 向量维度的平方根）进行缩放，再使用之前使用过的 softmax 函数计算注意力权重（总和为 1 的归一化注意力分数）。

In [24]:
# Width of the key vectors (second dimension of `keys`).
d_k = keys.size(1)

# Scale the raw scores by sqrt(d_k), then normalize them with softmax over
# the last dimension so the weights sum to 1.
attn_weights_2 = torch.softmax(
    attn_scores_2 / d_k ** 0.5,
    dim=-1,
)
print(attn_weights_2)

tensor([0.1500, 0.2264, 0.2199, 0.1311, 0.0906, 0.1820])


#### step 4 我们现在计算输入query vector_2 的上下文向量

In [25]:
# Context vector for the second token: the value vectors combined as a
# weighted sum, using the attention weights as coefficients.
context_vec_2 = torch.matmul(attn_weights_2, values)
print(context_vec_2)

tensor([0.3061, 0.8210])
