In [5]:
import torch
import torch.nn as nn
import math

In [39]:
# Sanity check: list the parameters registered on a Dropout layer.
# The list comes back empty, i.e. the layer exposes nothing to train.
temp = nn.Dropout()
print([*temp.parameters()])

[]


In [6]:
def attention(query, key, value, dropout=None, mask=None):
    '''
    Scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.

    args:
    query: query matrix; its last dimension is the feature size d_k
    key: key matrix; its last dimension must equal that of query
    value: value matrix; its second-to-last dimension must match that of key
    dropout: optional callable (e.g. an nn.Dropout module) applied to the
        attention weights after the softmax; None disables it
    mask: optional additive mask, broadcastable against the score matrix
        (..., len_q, len_k). Use 0 where attention is allowed and -inf
        where it must be blocked. A row that is blocked everywhere yields
        NaN weights, because softmax over all -inf is undefined.

    returns:
    (output, p_attn): the weighted sum of value and the attention weights
    (after dropout, if any) that produced it.
    '''
    # Feature size of the query vectors (the key vectors must share it).
    d_k = query.size(-1)
    # Dot product of Q with K^T, scaled by sqrt(d_k).
    # transpose(-2, -1) swaps the last two dimensions, so leading batch
    # dimensions are left untouched.
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # Adding -inf before the softmax forces those weights to exactly 0.
        scores = scores + mask
    # Normalize over the key axis so each query's weights sum to 1.
    p_attn = scores.softmax(dim=-1)
    if dropout is not None:
        # Randomly drop attention weights (only active in training mode
        # when dropout is an nn.Dropout module).
        p_attn = dropout(p_attn)
    # Weighted sum of the value vectors using the attention weights.
    return torch.matmul(p_attn, value), p_attn

In [31]:
# Toy self-attention example: 2 rows (positions) with 3 features each.
query = torch.randn(2, 3)
# Self-attention: the keys are the very same tensor as the queries.
key = query
value = torch.randn(2, 3)
# Pass `key` explicitly instead of repeating `query`, so the call reads as
# attention(Q, K, V); the result is unchanged because key is query.
output, attn = attention(query, key, value)

In [32]:
# Display the random (2, 3) query tensor created with torch.randn.
query

tensor([[ 0.7354,  0.9465, -0.7554],
        [-1.7306, -0.5662, -0.2925]])

In [33]:
# Display key; it was assigned as `key = query`, so it shows the same values.
key

tensor([[ 0.7354,  0.9465, -0.7554],
        [-1.7306, -0.5662, -0.2925]])

In [34]:
# Display the random (2, 3) value tensor created with torch.randn.
value

tensor([[ 0.1177, -1.1865, -0.9836],
        [-0.6811,  0.3634,  0.2954]])

In [35]:
# Display the attention weights returned by attention(); the softmax over
# the last dimension makes every row sum to 1.
attn

tensor([[0.8885, 0.1115],
        [0.0531, 0.9469]])

In [36]:
# Display the attention output: each row is the attn-weighted sum of the
# rows of value.
output

tensor([[ 0.0287, -1.0137, -0.8410],
        [-0.6386,  0.2810,  0.2274]])

In [37]:
# First step towards an upper-triangular mask that hides future positions:
# use torch.full to build a 1 x seq_len x seq_len tensor (seq_len is 2 here)
# in which every entry is -inf.
mask = torch.full(size=(1, 2, 2), fill_value=float("-inf"))
mask

tensor([[[-inf, -inf],
         [-inf, -inf]]])

In [38]:

# triu keeps the upper-triangular part of the last two dimensions.
# With diagonal=1 only the entries strictly above the main diagonal keep
# their -inf; the diagonal and everything below it become 0.
mask = mask.triu(diagonal=1)
mask

tensor([[[0., -inf],
         [0., 0.]]])