# 带可训练权重的自注意力机制

In [1]:
import torch
# Toy 3-dimensional embeddings, one row per token of the sentence
# "Your journey starts with one step".
inputs = torch.tensor([
    [0.43, 0.15, 0.89],  # x^1: Your
    [0.55, 0.87, 0.66],  # x^2: journey
    [0.57, 0.85, 0.64],  # x^3: starts
    [0.22, 0.58, 0.33],  # x^4: with
    [0.77, 0.25, 0.10],  # x^5: one
    [0.05, 0.80, 0.55],  # x^6: step
])

In [2]:
# The second row of `inputs` is used as the worked example below.
x_2 = inputs[1]
# Dimensionality of each input vector (number of columns of `inputs`).
d_in = inputs.size(1)
# Dimensionality of the projected query/key/value vectors.
d_out = 2

In [3]:
# Fix the RNG so the randomly drawn weight matrices are reproducible.
torch.manual_seed(123)
# Query, key and value weight matrices, each of shape (d_in, d_out).
# They are drawn in the order W_q, W_k, W_v and gradient tracking is
# switched off (requires_grad=False).
W_q, W_k, W_v = (
    torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
    for _ in range(3)
)

In [4]:
# Project the example input vector into query, key and value space.
query_2 = torch.matmul(x_2, W_q)
key_2 = torch.matmul(x_2, W_k)
value_2 = torch.matmul(x_2, W_v)
print("Query vector for 'journey':", query_2)

Query vector for 'journey': tensor([0.4306, 1.4551])


In [5]:
# Obtain the key and value vectors of every input row with one matrix
# multiplication each.
keys = torch.matmul(inputs, W_k)
values = torch.matmul(inputs, W_v)
print("All key vectors:\n", keys)
print("All value vectors:\n", values)

All key vectors:
 tensor([[0.3669, 0.7646],
        [0.4433, 1.1419],
        [0.4361, 1.1156],
        [0.2408, 0.6706],
        [0.1827, 0.3292],
        [0.3275, 0.9642]])
All value vectors:
 tensor([[0.1855, 0.8812],
        [0.3951, 1.0037],
        [0.3879, 0.9831],
        [0.2393, 0.5493],
        [0.1492, 0.3346],
        [0.3221, 0.7863]])


In [6]:
# Show the shapes of the projected key and value matrices.
print("Keys shape:", keys.size())
print("Values shape:", values.size())

Keys shape: torch.Size([6, 2])
Values shape: torch.Size([6, 2])


In [7]:
# Attention score of the second token with itself: the dot product of its
# query vector and its key vector.
key_2 = keys[1]
attn_score_22 = torch.dot(query_2, key_2)
print("Attention score between 'journey' and itself:", attn_score_22)

Attention score between 'journey' and itself: tensor(1.8524)


In [8]:
# Attention scores of query_2 against every key, computed with a single
# matrix multiplication.
attn_scores_2 = torch.matmul(query_2, keys.t())
print("Attention scores between 'journey' and all words:", attn_scores_2)

Attention scores between 'journey' and all words: tensor([1.2705, 1.8524, 1.8111, 1.0795, 0.5577, 1.5440])


In [9]:
# Scale the scores by the square root of the key dimension, then normalise
# them with softmax to obtain attention weights.
d_k = keys.size(-1)
attn_weights_2 = (attn_scores_2 / d_k**0.5).softmax(dim=-1)
print("Attention weights for 'journey':", attn_weights_2)

Attention weights for 'journey': tensor([0.1500, 0.2264, 0.2199, 0.1311, 0.0906, 0.1820])


In [10]:
# Context vector: the value vectors combined using the attention weights.
context_vec_2 = torch.matmul(attn_weights_2, values)
print(context_vec_2)

tensor([0.3061, 0.8210])


# 实现一个 Python 自注意力类

In [11]:
import torch

class SelfAttentionV1(torch.nn.Module):
    """Single-head scaled dot-product self-attention using raw weight matrices.

    The query, key and value projections are stored as ``torch.nn.Parameter``
    matrices of shape ``(d_in, d_out)`` initialised with ``torch.rand``.

    Args:
        d_in: Size of the last dimension of the input vectors.
        d_out: Size of the projected query/key/value vectors.
        *args: Accepted for call compatibility and ignored.
        **kwargs: Accepted for call compatibility and ignored.
    """

    def __init__(self, d_in, d_out, *args, **kwargs):
        super().__init__()
        # Creation order (query, key, value) decides which random draws each
        # matrix receives under a fixed seed, so it must not be reordered.
        self.W_q = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=True)
        self.W_k = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=True)
        self.W_v = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=True)

    def forward(self, X):
        """Compute context vectors for every position of ``X``.

        Args:
            X: Tensor of shape ``(num_tokens, d_in)`` or, with leading batch
                dimensions, ``(..., num_tokens, d_in)``.

        Returns:
            Tensor of shape ``(..., num_tokens, d_out)``.
        """
        queries = X @ self.W_q
        keys = X @ self.W_k
        values = X @ self.W_v

        # Swap only the last two dimensions: `.T` reverses *all* dimensions
        # and therefore breaks as soon as a batch dimension is present.
        attn_scores = queries @ keys.transpose(-2, -1)
        # Scale by sqrt of the key dimension, which is always the last axis.
        attn_weights = torch.softmax(attn_scores / keys.shape[-1] ** 0.5, dim=-1)
        return attn_weights @ values

In [12]:
# Try out the scaled dot-product self-attention class on the toy inputs.
torch.manual_seed(123)  # fixed seed so the random weights are reproducible
sa_v1 = SelfAttentionV1(3, 2)
print(sa_v1(inputs))

tensor([[0.2996, 0.8053],
        [0.3061, 0.8210],
        [0.3058, 0.8203],
        [0.2948, 0.7939],
        [0.2927, 0.7891],
        [0.2990, 0.8040]], grad_fn=<MmBackward0>)


# 使用 nn.Linear 层实现 Q、K、V 权重矩阵

In [13]:
import torch
from torch import nn

class SelfAttentionV2(nn.Module):
    """Single-head scaled dot-product self-attention built from ``nn.Linear``.

    Args:
        d_in: Size of the last dimension of the input vectors.
        d_out: Size of the projected query/key/value vectors.
        bias: Whether the three linear projections carry a bias term.
        *args: Accepted for call compatibility and ignored.
        **kwargs: Accepted for call compatibility and ignored.
    """

    def __init__(self, d_in, d_out, bias = False, *args, **kwargs):
        super().__init__()
        # Layers are created in the order key, query, value; under a fixed
        # seed this order decides each layer's initial weights.
        self.W_k = nn.Linear(d_in, d_out, bias=bias)
        self.W_q = nn.Linear(d_in, d_out, bias=bias)
        self.W_v = nn.Linear(d_in, d_out, bias=bias)

    def forward(self, X):
        """Compute context vectors for every position of ``X``.

        Args:
            X: Tensor of shape ``(num_tokens, d_in)`` or, with leading batch
                dimensions, ``(..., num_tokens, d_in)``.

        Returns:
            Tensor of shape ``(..., num_tokens, d_out)``.
        """
        queries = self.W_q(X)
        keys = self.W_k(X)
        values = self.W_v(X)

        # Swap only the last two dimensions: `.T` reverses *all* dimensions
        # and therefore breaks as soon as a batch dimension is present.
        attn_scores = queries @ keys.transpose(-2, -1)
        # Scale by sqrt of the key dimension before normalising.
        attn_weights = torch.softmax(attn_scores / keys.shape[-1] ** 0.5, dim=-1)
        return attn_weights @ values




In [14]:
# Toy 3-dimensional embeddings, one row per token of the sentence
# "Your journey starts with one step".
inputs = torch.tensor([
    [0.43, 0.15, 0.89],  # x^1: Your
    [0.55, 0.87, 0.66],  # x^2: journey
    [0.57, 0.85, 0.64],  # x^3: starts
    [0.22, 0.58, 0.33],  # x^4: with
    [0.77, 0.25, 0.10],  # x^5: one
    [0.05, 0.80, 0.55],  # x^6: step
])

# Seed before constructing the module so its initial weights are reproducible.
torch.manual_seed(789)
sa_v2 = SelfAttentionV2(3, 2)
print(sa_v2(inputs))

tensor([[-0.0809,  0.0638],
        [-0.0784,  0.0657],
        [-0.0784,  0.0657],
        [-0.0772,  0.0666],
        [-0.0787,  0.0652],
        [-0.0769,  0.0669]], grad_fn=<MmBackward0>)
