## Preparing

### 导入包

In [1]:
import torch
from torch import nn
from torch import optim
from torch.utils import data as Data
import numpy as np

### 训练设备配置

In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### 准备超参数

In [3]:
d_model = 512 # embedding dimension
max_len = 100 # maximum sentence length
d_ff = 2048 # hidden size of the feed-forward network, 4x the embedding dimension
d_k = d_v = 64 # per-head attention sizes: Q and K use d_k, V uses d_v
n_layers = 6 # number of encoder layers and of decoder layers
n_heads = 8 # number of attention heads
p_drop = 0.1 # dropout probability

## 模型定义

### Mask

#### Pad Mask for Attention

由于在数据中使用了padding进行填充，而不希望pad被加入到注意力中进行计算，可以使用Attention Pad Mask，其作用是确保模型在计算注意力分数时,任何Query不会去关注**Key中的padding**位置

这里假设\<pad\>在字典中的索引为0，那么当输入为0时，返回True，否则返回False

In [4]:
def get_attn_pad_mask(seq_q, seq_k, pad_idx=0):
    """
    Build the attention mask that hides padding tokens in the keys.

    Every query position gets the same row: True wherever the key token is
    padding, so no query attends to a padded key.

    parameters:
    seq_q: [batch_size, len_q] token ids of the queries (only its length is used)
    seq_k: [batch_size, len_k] token ids of the keys
    pad_idx: vocabulary index of the padding token (defaults to 0)

    return:
    mask: [batch_size, len_q, len_k] boolean tensor, True at padded key positions
    """
    len_q = seq_q.size(1)
    batch_size, len_k = seq_k.size()
    # [batch_size, len_k] -> [batch_size, 1, len_k]; True where the key is padding
    pad_attn_mask = seq_k.eq(pad_idx).unsqueeze(1)
    # Broadcast the single key row over every query position without copying memory
    return pad_attn_mask.expand(batch_size, len_q, len_k)

#### Subsequent Mask for Decoder

In [5]:
def get_attn_subsequent_mask(seq):
    """
    Build the look-ahead mask used for autoregressive decoding.

    Entry [b, i, j] is 1 when j > i (a future position that query i must not
    see) and 0 otherwise, i.e. the strict upper triangle of a matrix of ones.

    parameters:
    seq: [batch_size, seq_len]

    return:
    subsequent_mask: [batch_size, seq_len, seq_len] uint8 tensor on seq's device
    """
    batch_size, seq_len = seq.size(0), seq.size(1)
    all_ones = torch.ones(batch_size, seq_len, seq_len, dtype=torch.uint8, device=seq.device)
    # diagonal=1 keeps only the elements strictly above the main diagonal
    return all_ones.triu(diagonal=1)
# attn_shape = [5, 5, 5]
# torch.from_numpy(np.triu(np.ones(attn_shape), k=1)).byte()== torch.triu(torch.ones(attn_shape, dtype=torch.uint8), diagonal=1)

### Positional Encoding

绝对位置编码

$$
PE_{(pos, 2i)} = sin(pos / 10000^{2i / d_{model}})
$$
$$
PE_{(pos, 2i+1)} = cos(pos / 10000^{2i / d_{model}})
$$


- 唯一性：每个位置需要独特编码，不会出现编码重复问题
    - 不同维度使用不同的周期长度
    - 多个维度的组合提供了足够的唯一性
    - 每个位置都有唯一的编码模式
- 平滑性：相近位置应有相似编码
    - $10000^{2i/d_{model}}$​ 的作用：
        - 这个项会随着维度i的增加而增大
        - 导致不同维度上的正弦/余弦函数有不同的频率
        - 低维度对应高频信号，对近距离位置敏感
        - 高维度对应低频信号，可以捕捉长距离依赖
- 有界性：编码值要在固定范围内，正弦和余弦函数的值域在\[-1,1\]之间，适合作为神经网络的输入
- 可推广性：能处理任意长度序列

In [6]:
class PositionalEncoding(nn.Module):
    """
    Fixed sinusoidal positional encoding followed by dropout.

    PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
    PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))

    parameters:
    d_model: embedding dimension (odd values are supported)
    p_drop: dropout probability applied after adding the encoding
    max_len: number of positions precomputed in the table
    """
    def __init__(self, d_model=512, p_drop=.1, max_len=1024):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=p_drop)

        position = torch.arange(0, max_len).float().unsqueeze(1) # [max_len, 1]

        # div_term[i] = 1 / 10000^(2i / d_model).
        # arange(0, d_model, 2) already enumerates 2i, so the exponent is divided by
        # d_model only (an extra "/ 2" would halve every frequency exponent).
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             (-torch.log(torch.tensor(10000.0)) / d_model))
        angles = position * div_term # [max_len, ceil(d_model / 2)]

        pe = torch.zeros(max_len, d_model) # [max_len, d_model]
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])

        # [max_len, d_model] -> [1, max_len, d_model] so it broadcasts over the batch axis
        # of batch-first inputs. Registered as a buffer: it moves with the module
        # (.to / state_dict) but is not a trainable parameter.
        self.register_buffer("pe", pe.unsqueeze(0))
    def forward(self, x):
        """
        parameters:
        x: [batch_size, seq_len, d_model]

        return:
        [batch_size, seq_len, d_model], x plus the positional encoding, after dropout

        raises:
        ValueError: if seq_len exceeds the max_len the table was built for
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.pe.size(1)} of the positional encoding"
            )
        # The table holds max_len positions; keep only the first seq_len rows so the
        # shapes line up with x.
        x = x + self.pe[:, :seq_len, :] # [batch_size, seq_len, d_model]
        return self.dropout(x)

# Smoke test: push a dummy [4, 6, 6] batch through the positional encoding and print the output shape.
x = torch.ones(4, 6, 6)
net = PositionalEncoding(d_model=6)
print(net(x).shape)


# x = torch.randn(10, 3, 4)
# # print("x:", x) 

# pe = torch.ones(10, 1, 4)
# # print("x + pe[:, :x.size(1)]:", x + pe[:, :x.size(1)])  # torch.Size([5, 1, 4])
# # print("x + pe:", x + pe) 
# print(x + pe[:, :x.size(1)] == x + pe)

torch.Size([4, 6, 6])


### Feed Forward Neural Network

在Transformer中，每个Encoder层和Decoder层都包含一个逐位置的全连接前馈神经网络，用来引入非线性特征
$$
FFN(x) = max(0, xW_1 + b_1)W_2 + b_2
$$

In [7]:
class FeedForwardNetwork(nn.Module):
    """
    Position-wise feed-forward sub-layer with residual connection and layer norm.

    Computes LayerNorm(x + Dropout(Linear2(ReLU(Linear1(x))))).

    parameters:
    d_model: input and output feature size
    d_ff: hidden size; when None it defaults to 4 * d_model, derived from the
          d_model passed to this constructor rather than from a module-level global
    p_drop: dropout probability applied to the sub-layer output
    """
    def __init__(self, d_model=512, d_ff=None, p_drop=0.1):
        super(FeedForwardNetwork, self).__init__()
        if d_ff is None:
            d_ff = 4 * d_model
        self.linear1 = nn.Linear(d_model, d_ff)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(p_drop)
        self.layer_norm = nn.LayerNorm(d_model)
    def forward(self, x):
        """
        parameters:
        x: [batch_size, seq_len, d_model]

        return:
        [batch_size, seq_len, d_model]
        """
        residual = x
        # [batch_size, seq_len, d_model] -> [batch_size, seq_len, d_ff]
        x = self.relu(self.linear1(x))
        # [batch_size, seq_len, d_ff] -> [batch_size, seq_len, d_model]
        x = self.dropout(self.linear2(x))
        # Add & Norm
        return self.layer_norm(x + residual)

In [8]:
from torchinfo import summary
# Build the feed-forward block with its default sizes and print a torchinfo summary for a [3, 4, 512] input.
net = FeedForwardNetwork()
print(summary(net, (3, 4 ,512)))

Layer (type:depth-idx)                   Output Shape              Param #
FeedForwardNetwork                       [3, 4, 512]               --
├─Linear: 1-1                            [3, 4, 2048]              1,050,624
├─ReLU: 1-2                              [3, 4, 2048]              --
├─Linear: 1-3                            [3, 4, 512]               1,049,088
├─Dropout: 1-4                           [3, 4, 512]               --
├─LayerNorm: 1-5                         [3, 4, 512]               1,024
Total params: 2,100,736
Trainable params: 2,100,736
Non-trainable params: 0
Total mult-adds (M): 6.30
Input size (MB): 0.02
Forward/backward pass size (MB): 0.29
Params size (MB): 8.40
Estimated Total Size (MB): 8.72


### Attention

Attention机制是Transformer的核心，它可以将输入序列的不同位置的信息进行加权求和，从而实现对不同位置的关注

#### Scaled Dot-Product Attention

单头注意力机制，输入包括三个部分：查询Q，键K，值V，计算公式如下：

$$
Attention(Q, K, V) = softmax(\frac{QK^T}{\sqrt{d_k}})V
$$
![Scaled Dot-Product Attention](https://bex-image.oss-cn-hangzhou.aliyuncs.com/img/202412211526401.png)

In [9]:
class ScaledDotProductAttention(nn.Module):
    """
    Scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.

    parameters:
    d_k: per-head size of queries and keys, used for the 1/sqrt(d_k) scaling
    d_v: per-head size of values (stored for reference only)
    """
    def __init__(self, d_k=64, d_v=64):
        super(ScaledDotProductAttention, self).__init__()
        self.d_k = d_k
        self.d_v = d_v
    def forward(self, Q, K, V, attn_mask):
        """
        parameters:
        Q: [batch_size, n_heads, len_q, d_k]
        K: [batch_size, n_heads, len_k, d_k]
        V: [batch_size, n_heads, len_v, d_v] (len_v == len_k)
        attn_mask: [batch_size, n_heads, len_q, len_k]; non-zero / True marks
                   positions that must NOT be attended to

        return:
        prob: [batch_size, n_heads, len_q, d_v] attention-weighted values
        attn: [batch_size, n_heads, len_q, len_k] attention weights
        """
        # Q * K^T / sqrt(d_k)
        scores = torch.matmul(Q, K.transpose(-1, -2)) / (self.d_k ** 0.5) # [batch_size, n_heads, len_q, len_k]

        # Masked positions get a very large negative score so that softmax drives
        # their weight to (numerically) zero. The mask is cast to bool because
        # masked_fill only accepts boolean masks in recent PyTorch releases, while
        # callers may still hand in a uint8 mask.
        scores = scores.masked_fill(attn_mask.bool(), -1e9)

        attn = torch.softmax(scores, dim=-1) # [batch_size, n_heads, len_q, len_k]

        prob = torch.matmul(attn, V) # [batch_size, n_heads, len_q, d_v]
        return prob, attn

#### Multi-Head Attention

多头注意力是指通过多个不同的注意力头来获取不同的特征，然后将这些特征拼接起来，通过线性变换得到最终的输出

$$
\text{MultiHead}(Q, K, V) = \text{Concat}(\text{head}_1, \text{head}_2, \ldots, \text{head}_h)W^O
$$
$$
\text{where head}_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)
$$

![Multi-Head Attention](https://bex-image.oss-cn-hangzhou.aliyuncs.com/img/202412211529171.png)

> 虽然新版本已经有reshape函数可以用了，但是仍然不要忘记：transpose（或permute）之后如果要接view，必须先调用contiguous()。这是因为transpose/permute只改变张量的步长（stride），并不改变数据在内存中的实际存储顺序，而view要求数据在内存中连续；reshape会在必要时自动拷贝数据。

In [10]:
class MultiHeadAttention(nn.Module):
    """
    Multi-head attention sub-layer with residual connection and layer norm.

    parameters:
    d_model: model (embedding) dimension
    d_k: per-head size of queries and keys
    d_v: per-head size of values
    n_heads: number of attention heads
    """
    def __init__(self, d_model=512, d_k=64, d_v=64, n_heads=8):
        super(MultiHeadAttention, self).__init__()
        self.d_model = d_model
        self.d_k = d_k
        self.d_v = d_v
        self.n_heads = n_heads

        # The architecture diagram shows one projection per head; a single wide
        # linear layer computes all heads at once.
        self.W_Q = nn.Linear(d_model, d_k * n_heads, bias=False)
        self.W_K = nn.Linear(d_model, d_k * n_heads, bias=False)
        self.W_V = nn.Linear(d_model, d_v * n_heads, bias=False)
        self.W_O = nn.Linear(n_heads * d_v, d_model, bias=False)
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, input_Q, input_K, input_V, attn_mask):
        '''
        Q, K and V are passed separately so the same module serves encoder
        self-attention, decoder self-attention and encoder-decoder attention.

        parameters:
        input_Q: [batch_size, len_q, d_model]
        input_K: [batch_size, len_k, d_model]
        input_V: [batch_size, len_v, d_model] (len_v == len_k)
        attn_mask: [batch_size, len_q, len_k], True / non-zero = do not attend

        return:
        output: [batch_size, len_q, d_model], LayerNorm(input_Q + attention output)
        attn: [batch_size, n_heads, len_q, len_k] attention weights
        '''
        residual, batch_size = input_Q, input_Q.size(0)

        # Project, then split the FEATURE axis into (n_heads, d_k) and only afterwards
        # move the head axis in front of the sequence axis:
        # [batch_size, len, n_heads * d_k] -> [batch_size, len, n_heads, d_k] -> [batch_size, n_heads, len, d_k]
        # Reshaping directly to [batch_size, n_heads, len, d_k] would interleave
        # sequence positions and heads, so each "head" would see a scrambled sequence.
        Q = self.W_Q(input_Q).reshape(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2) # [batch_size, n_heads, len_q, d_k]
        K = self.W_K(input_K).reshape(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2) # [batch_size, n_heads, len_k, d_k]
        V = self.W_V(input_V).reshape(batch_size, -1, self.n_heads, self.d_v).transpose(1, 2) # [batch_size, n_heads, len_v, d_v]

        # Share one mask across all heads:
        # [batch_size, len_q, len_k] -> [batch_size, 1, len_q, len_k] -> [batch_size, n_heads, len_q, len_k]
        attn_mask = attn_mask.unsqueeze(1).expand(-1, self.n_heads, -1, -1)

        # prob: [batch_size, n_heads, len_q, d_v]
        # attn: [batch_size, n_heads, len_q, len_k]
        # The attention module has no parameters, so building it per call is safe.
        prob, attn = ScaledDotProductAttention(self.d_k, self.d_v)(Q, K, V, attn_mask)

        # Undo the head split: [batch_size, n_heads, len_q, d_v] -> [batch_size, len_q, n_heads * d_v]
        prob = prob.transpose(1, 2).reshape(batch_size, -1, self.n_heads * self.d_v)
        output = self.W_O(prob) # [batch_size, len_q, d_model]

        # Add & Norm
        return self.layer_norm(residual + output), attn
        

### Encoder and Decoder
![image-20241221153148582](https://bex-image.oss-cn-hangzhou.aliyuncs.com/img/202412211531617.png)

#### Encoder

In [11]:
class Encoderlayer(nn.Module):
    """
    A single encoder block: multi-head self-attention followed by the
    position-wise feed-forward network. Both sub-layers apply their own
    residual connection and layer norm internally.
    """
    def __init__(self, d_model=512, d_ff=2048, d_k=64, d_v=64, n_heads=8, p_drop=0.1):
        super().__init__()
        self.multi_head_attention = MultiHeadAttention(
            d_model=d_model, d_k=d_k, d_v=d_v, n_heads=n_heads
        )
        self.feed_forward_network = FeedForwardNetwork(
            d_model=d_model, d_ff=d_ff, p_drop=p_drop
        )
    def forward(self, encoder_input, encoder_pad_mask):
        """
        parameters:
        encoder_input: [batch_size, seq_len, d_model]
        encoder_pad_mask: [batch_size, seq_len, seq_len]

        return:
        encoder_output: [batch_size, seq_len, d_model]
        attn: attention weights returned by the self-attention sub-layer
        """
        # Self-attention: queries, keys and values are all the layer input
        attended, attn = self.multi_head_attention(
            encoder_input, encoder_input, encoder_input, encoder_pad_mask
        )
        # Feed-forward sub-layer (add & norm happens inside it)
        return self.feed_forward_network(attended), attn

In [12]:
class Encoder(nn.Module):
    """
    Transformer encoder: token embedding, positional encoding, then a stack
    of n_layers encoder layers.
    """
    def __init__(self, d_model=512, max_len=1024, d_ff=2048, d_k=64, d_v=64, n_layers=6, n_heads=8, p_drop=.1, source_vocab_size=1000):
        super().__init__()
        self.input_embedding = nn.Embedding(source_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, p_drop, max_len)
        self.encoder_layers = nn.ModuleList(
            Encoderlayer(d_model=d_model, d_ff=d_ff, d_k=d_k, d_v=d_v, n_heads=n_heads, p_drop=p_drop)
            for _ in range(n_layers)
        )
    def forward(self, encoder_input):
        """
        parameters:
        encoder_input: [batch_size, seq_len] source token ids

        return:
        encoder_output: [batch_size, seq_len, d_model] output of the last layer
        attns: list with one attention tensor per encoder layer, in layer order
        """
        # The padding mask depends only on the token ids and is shared by every layer
        pad_mask = get_attn_pad_mask(encoder_input, encoder_input)

        # Token ids -> embeddings -> embeddings plus positional encoding
        # [batch_size, seq_len] -> [batch_size, seq_len, d_model]
        hidden = self.positional_encoding(self.input_embedding(encoder_input))

        # Run the stack, keeping the attention weights of every layer
        attns = []
        for layer in self.encoder_layers:
            hidden, layer_attn = layer(hidden, pad_mask)
            attns.append(layer_attn)
        return hidden, attns

#### Decoder

In [13]:
class Decoderlayer(nn.Module):
    """
    A single decoder block: masked self-attention, encoder-decoder attention,
    then the position-wise feed-forward network. Each sub-layer applies its
    own residual connection and layer norm internally.
    """
    def __init__(self, d_model=512, d_ff=2048, d_k=64, d_v=64, n_heads=8, p_drop=0.1):
        super().__init__()
        attention_kwargs = dict(d_model=d_model, d_k=d_k, d_v=d_v, n_heads=n_heads)
        self.decoder_self_attention = MultiHeadAttention(**attention_kwargs)
        self.encoder_decoder_attention = MultiHeadAttention(**attention_kwargs)
        self.feed_forward_network = FeedForwardNetwork(d_model=d_model, d_ff=d_ff, p_drop=p_drop)
    def forward(self, decoder_input, encoder_output, decoder_self_mask, decoder_encoder_pad_mask):
        """
        parameters:
        decoder_input: [batch_size, target_len, d_model]
        encoder_output: [batch_size, source_len, d_model]
        decoder_self_mask: [batch_size, target_len, target_len]
        decoder_encoder_pad_mask: [batch_size, target_len, source_len]

        return:
        decoder_output: [batch_size, target_len, d_model]
        decoder_self_attn: attention weights of the self-attention sub-layer
        decoder_encoder_attn: attention weights of the encoder-decoder sub-layer
        """
        # Masked self-attention: Q, K and V all come from the decoder input
        self_attended, decoder_self_attn = self.decoder_self_attention(
            decoder_input, decoder_input, decoder_input, decoder_self_mask
        )

        # Encoder-decoder attention: Q from the decoder, K and V from the encoder output
        cross_attended, decoder_encoder_attn = self.encoder_decoder_attention(
            self_attended, encoder_output, encoder_output, decoder_encoder_pad_mask
        )

        # Feed-forward sub-layer (add & norm happens inside it)
        decoder_output = self.feed_forward_network(cross_attended)
        return decoder_output, decoder_self_attn, decoder_encoder_attn

In [14]:
class Decoder(nn.Module):
    """
    Transformer decoder: token embedding, positional encoding, then a stack
    of n_layers decoder layers.
    """
    def __init__(self, d_model=512, max_len=1024, d_ff=2048, d_k=64, d_v=64, n_layers=6, n_heads=8, p_drop=.1, target_vocab_size=1000):
        super(Decoder, self).__init__()
        self.target_embedding = nn.Embedding(target_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, p_drop, max_len)
        self.decoder_layers = nn.ModuleList([Decoderlayer(d_model=d_model, d_ff=d_ff, d_k=d_k, d_v=d_v, n_heads=n_heads, p_drop=p_drop) for _ in range(n_layers)])

    def forward(self, decoder_input, encoder_input, encoder_output):
        """
        parameters:
        decoder_input: [batch_size, target_len] target token ids
        encoder_input: [batch_size, source_len] source token ids (used only to build the padding mask)
        encoder_output: [batch_size, source_len, d_model]

        return:
        decoder_output: [batch_size, target_len, d_model] output of the last layer
        decoder_self_attns: list with the self-attention weights of every layer
        decoder_encoder_attns: list with the encoder-decoder attention weights of every layer
        """
        # Token ids -> embeddings: [batch_size, target_len] -> [batch_size, target_len, d_model]
        decoder_output = self.target_embedding(decoder_input)
        # Add the positional encoding
        decoder_output = self.positional_encoding(decoder_output) # [batch_size, target_len, d_model]

        decoder_subsequent_mask = get_attn_subsequent_mask(decoder_input)
        decoder_self_pad_mask = get_attn_pad_mask(decoder_input, decoder_input)

        # Encoder-decoder attention only has to hide padding in the source sequence
        decoder_encoder_pad_mask = get_attn_pad_mask(decoder_input, encoder_input)

        # A self-attention position is blocked when it is padding OR lies in the future.
        # Both masks are cast to bool so the OR works for uint8 and bool inputs alike.
        decoder_self_mask = decoder_self_pad_mask.bool() | decoder_subsequent_mask.bool()

        decoder_self_attns, decoder_encoder_attns = [], []

        for decoder_layer in self.decoder_layers:
            # decoder_output: [batch_size, target_len, d_model]
            decoder_output, decoder_self_attn, decoder_encoder_attn = decoder_layer(decoder_output, encoder_output, decoder_self_mask, decoder_encoder_pad_mask)

            decoder_self_attns.append(decoder_self_attn)
            decoder_encoder_attns.append(decoder_encoder_attn)

        # Return the cross-attention maps as the third element (not the self-attention maps twice)
        return decoder_output, decoder_self_attns, decoder_encoder_attns

### Transformer

In [None]:
class Transformer(nn.Module):
    """
    Full encoder-decoder Transformer with a final linear projection onto the
    target vocabulary. The module moves itself to the module-level `device`
    at the end of construction.
    """
    def __init__(self, d_model=512, max_len=1024, d_ff=2048, d_k=64, d_v=64, n_layers=6, n_heads=8, p_drop=0.1, source_vocab_size=1000, target_vocab_size=1000):
        super().__init__()
        # Hyperparameters shared by the encoder and the decoder
        shared_kwargs = dict(
            d_model=d_model, max_len=max_len, d_ff=d_ff, d_k=d_k, d_v=d_v,
            n_layers=n_layers, n_heads=n_heads, p_drop=p_drop,
        )
        self.encoder = Encoder(source_vocab_size=source_vocab_size, **shared_kwargs)
        self.decoder = Decoder(target_vocab_size=target_vocab_size, **shared_kwargs)
        self.projection = nn.Linear(d_model, target_vocab_size, bias=False)

        self.to(device)

    def forward(self, encoder_input, decoder_input):
        """
        parameters:
        encoder_input: [batch_size, source_len] source token ids
        decoder_input: [batch_size, target_len] target token ids

        return:
        logits: [batch_size * target_len, target_vocab_size]
        encoder_self_attns: per-layer attention list returned by the encoder
        decoder_self_attns: second value returned by the decoder
        decoder_encoder_attns: third value returned by the decoder
        """
        # encoder_output: [batch_size, source_len, d_model]
        encoder_output, encoder_self_attns = self.encoder(encoder_input)

        # decoder_output: [batch_size, target_len, d_model]
        decoder_result = self.decoder(decoder_input, encoder_input, encoder_output)
        decoder_output, decoder_self_attns, decoder_encoder_attns = decoder_result

        # [batch_size, target_len, d_model] -> [batch_size, target_len, target_vocab_size]
        logits = self.projection(decoder_output)

        # Merge the batch and sequence axes: -> [batch_size * target_len, target_vocab_size]
        flat_logits = logits.reshape(-1, logits.size(-1))
        return flat_logits, encoder_self_attns, decoder_self_attns, decoder_encoder_attns

In [18]:
# End-to-end smoke test: one random source and one random target sequence of 100 token ids each.
# randint draws from [0, 1000), so id 0 (the value get_attn_pad_mask treats as padding) may occur.
encoder_input = torch.randint(0, 1000, (1, 100)).to(device)
decoder_input = torch.randint(0, 1000, (1, 100)).to(device)
net = Transformer(d_model=d_model, max_len=max_len, d_ff=d_ff, d_k=d_k, d_v=d_v, n_layers=n_layers, n_heads=n_heads, p_drop=p_drop)
# First returned value is the flattened logits tensor
print(net(encoder_input, decoder_input)[0].shape)
# Build a sample input specification to inspect the model structure
batch_size = 32
seq_len = 50

# Two integer inputs (encoder ids, decoder ids), each of shape [batch_size, seq_len]
summary(net, 
        input_size=[(batch_size, seq_len), (batch_size, seq_len)],
        dtypes=[torch.long, torch.long],
        device=device,
        depth=5,  # show deeper nesting levels
        )  

torch.Size([100, 1000])


Layer (type:depth-idx)                        Output Shape              Param #
Transformer                                   [1600, 1000]              --
├─Encoder: 1-1                                [32, 50, 512]             --
│    └─Embedding: 2-1                         [32, 50, 512]             512,000
│    └─PositionalEncoding: 2-2                [32, 50, 512]             --
│    │    └─Dropout: 3-1                      [32, 50, 512]             --
│    └─ModuleList: 2-3                        --                        --
│    │    └─Encoderlayer: 3-2                 [32, 50, 512]             --
│    │    │    └─MultiHeadAttention: 4-1      [32, 50, 512]             --
│    │    │    │    └─Linear: 5-1             [32, 50, 512]             262,144
│    │    │    │    └─Linear: 5-2             [32, 50, 512]             262,144
│    │    │    │    └─Linear: 5-3             [32, 50, 512]             262,144
│    │    │    │    └─Linear: 5-4             [32, 50, 512]             262