## 0. parameters

In [1]:
# 导包
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from torch.optim import Adam
from torch import optim
# 定义模型的参数
enc_voc_size = 5893
dec_voc_size = 7853
d_model = 512
max_len = 256
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# optimizer parameter setting
init_lr = 1e-5
factor = 0.9
adam_eps = 5e-9
patience = 10
warmup = 100
epoch = 100
clip = 1.0
weight_decay = 5e-4
inf = float('inf')

## 1. transformer module

## 1.1 Token Embedding

In [2]:

# 构造一个线性层
a = torch.nn.Linear(4,5)
# 构造一个embedding层
b = torch.nn.Embedding(4,5)

print('a', a)
print('b', b)
print('a.weight.shape', a.weight.shape)
print('b.weight.shape', b.weight.shape)

a Linear(in_features=4, out_features=5, bias=True)
b Embedding(4, 5)
a.weight.shape torch.Size([5, 4])
b.weight.shape torch.Size([4, 5])


In [3]:
# 这个部分是探究embedding layer 编码之后， 词表内相同的元素的embedding都是相同的

# 构建一个词表大小为14，embedding维度为512的embedding层
embd_layer = torch.nn.Embedding(14, 512)
print('embedding.weight', embd_layer.weight.shape)
# 构造输入数据
input_id = torch.tensor([[2, 4, 5, 6, 7, 8, 3, 1, 1, 1], 
                      [2, 4, 9, 10,11,12,13,3, 1, 1],
                      [2, 6, 7, 8, 9, 10,11,12,13,3]])

embedding = embd_layer(input_id)
print("输入数据",input_id.shape)
print("输入数据的embedding", embedding.shape)
# 取 embd_layer 的 第四个 和 input_id 中 4 数字的编码
print('embd_layer 4 weight:', embd_layer.weight[4,:10])
print('embedding 4 ouptput:',embedding[0][1][:10])

# 注意：embedding的词表大小维度必须包含input_id中的不同id，否则会报错
# 再次构建一个只有14个词的embedding layer
embd_layer = torch.nn.Embedding(14, 512)
print('embedding.weight', embd_layer.weight.shape)
# 构造输入数据
# 但是现在input_id中有15种不同id
input_id = torch.tensor([[2, 4, 5, 6, 7, 8, 3, 1, 1, 1], 
                      [2, 4, 9, 10,11,12,14,15, 1, 1],
                      [2, 6, 7, 8, 9, 10,11,12,13,3]])
try:
    embedding = embd_layer(input_id)
except Exception as e:
    print('error:', e)

embedding.weight torch.Size([14, 512])
输入数据 torch.Size([3, 10])
输入数据的embedding torch.Size([3, 10, 512])
embd_layer 4 weight: tensor([ 1.5772,  0.6077,  0.5554, -0.3491,  0.5295, -0.8412,  0.0257,  0.2443,
        -0.1485, -0.4112], grad_fn=<SliceBackward0>)
embedding 4 ouptput: tensor([ 1.5772,  0.6077,  0.5554, -0.3491,  0.5295, -0.8412,  0.0257,  0.2443,
        -0.1485, -0.4112], grad_fn=<SliceBackward0>)
embedding.weight torch.Size([14, 512])
error: index out of range in self


In [4]:
# 创建tokenembedding类

class TokenEmbedding(nn.Embedding):
    def __init__(self, vocab_size, d_model):
        # 这里是继承了nn.embedding 中的 init方法
        super(TokenEmbedding, self).__init__(vocab_size, d_model, padding_idx=1)

test_src_token = TokenEmbedding(enc_voc_size, d_model)
test_trg_token = TokenEmbedding(dec_voc_size, d_model)
print('test_src_tokne: ', test_src_token)
print('test_trg_token: ', test_trg_token)

test_src_tokne:  TokenEmbedding(5893, 512, padding_idx=1)
test_trg_token:  TokenEmbedding(7853, 512, padding_idx=1)


### 1.2 positional Encoding
采用正余弦函数 主要是为了通过周期函数的组合来表示相对位置信息
可以参考：https://blog.csdn.net/m0_37605642/article/details/132866365

In [5]:
class PositionalEncoding(nn.Module):
    def __init__(self,d_model, max_len, device):
        super(PositionalEncoding, self).__init__()
        # 构建position embedding
        self.embedding = torch.zeros(max_len, d_model, device=device)
        self.embedding.requires_grad = False

        # 构建位置向量
        # [max_len] -> [max_len,1]
        pos = torch.arange(0, max_len, device=device).float().unsqueeze(dim = 1)

        # 构建维度向量
        _2i = torch.arange(0, d_model, step=2, device=device).float()

        # 计算位置编码
        # x:y:z 指的是从x到y，每隔z个位置取一次
        self.embedding[:, 0::2] = torch.sin(pos / (10000 ** (_2i / d_model)))
        self.embedding[:, 1::2] = torch.cos(pos / (10000 ** (_2i / d_model)))
    
    def forward(self, x):
        batch_size, seq_len = x.size()
        # 从这里可以看出 positional embedding 构建好之后 其实是定值了
        return self.embedding[:seq_len, :]

# 测试
test_pos_encoding = PositionalEncoding(d_model, max_len, device)
print('test_pos_encoding: ',test_pos_encoding.embedding.shape)

test_pos_encoding:  torch.Size([256, 512])


### 1.3 LayerNorm

In [6]:
class LayerNorm(nn.Module):
    def __init__(self,d_model, eps = 1e-12):
        super(LayerNorm, self).__init__()
        self.gamma = nn.Parameter(torch.ones(d_model))
        self.beta = nn.Parameter(torch.zeros(d_model))
        self.eps = eps

    def forward(self, x):
        # layernorm 是在最后一维的归一化 通俗的来说，就是对一个样本的所有特征值进行归一化
        # 1. 计算均值
        mean = x.mean(-1, keepdim = True)
        # 2. 计算方差
        var = x.var(-1, keepdim = True, unbiased = False)
        # 3. 计算标准化x
        out = (x - mean) / torch.sqrt(var + self.eps)
        # 4. 计算归一化 用可学习的参数拟合
        out = self.gamma * out + self.beta
        return out

test_layer_norm = LayerNorm(d_model)
print('test_layerNorm gamma shape: ', test_layer_norm.gamma.shape)
print('test_layerNorm beta shape: ', test_layer_norm.beta.shape)

test_layerNorm gamma shape:  torch.Size([512])
test_layerNorm beta shape:  torch.Size([512])


### 1.4 scaled dot product attention
单头注意力机制 并且不带mask

In [7]:
class scaleDotProductAttention(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.softmax = nn.Softmax(dim = -1) # 在最后一维进行softmax

    def forward(self, q, k, v, mask = None, e = 1e-10):
        # 1. 得到q,k,v的维度, 这里是单头注意力机制，所以head_num = 1
        batch_size, head_num, seq_len, d_model = k.size()
        # 2. 计算q,k的点积 并进行scale，进行scale的目的是如果数值过大 进行softmax后梯度都会很小
        score = (q @ k.transpose(2, 3)) / math.sqrt(d_model)
        # 3. mask fill
        if mask is not None:
            score = score.masked_fill(mask == 0, -10000)
        score = self.softmax(score)
        # 4. 计算 v
        v = score @ v

        return v, score


### 1.5 position wise feed forward
其实就是mlp

In [8]:

class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, hidden, drop_prob = 0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p = drop_prob)

    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear2(x)

        return x

### 1.6 multi-head attention
把单头注意力改成多头注意力 本质上相当于cnn的卷积核

In [9]:
class MultiHeadAttention(nn.Module):
    
    def __init__(self, d_model, n_head):
        super().__init__()
        # 确定头的数量
        self.n_head = n_head
        # 注意力机制仍然是一样的算法
        self.attention = scaleDotProductAttention()
        # 用线性层得到q 注意在上面的attention计算中是没有qkv的产生过程的
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        # 多头注意力concat后经过一个线性层
        self.concat_linear = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask = None):
        # 传入的qkv应该是都是x
        q, k, v = self.w_q(q), self.w_k(k), self.w_v(v)
        # 将qkv进行拆分
        q, k, v = self.split(q), self.split(k), self.split(v)
        # 计算attention
        out, attention = self.attention(q, k, v, mask = mask)
        # 再将out拼接在一起
        out = self.concat(out)
        # 经过一次线性变换后输出
        out = self.concat_linear(out)
        return out

    def split(self, tensor):
        batch_size, seq_len, d_model = tensor.size()
        # 这里其实要注意是否会整除不尽
        d_tensor = d_model // self.n_head
        # 这里要将头拿到前面来
        tensor = tensor.view(batch_size, seq_len, self.n_head, d_tensor).transpose(1, 2)
        return tensor
    
    def concat(self, tensor):
        batch_size, n_head, seq_len, d_tensor = tensor.size()
        d_model = n_head * d_tensor
        # 拼接，这里的 contiguous 意思是连续 是为了保证在 transpose 后能够进行view。参考：https://zhuanlan.zhihu.com/p/64551412
        tensor = tensor.transpose(1, 2).contiguous().view(batch_size, seq_len, d_model)
        return tensor

## 2. transformer model
 接下来 我们会将上面的module 逐步组合成 transformer 模型

### 2.1 transformer embedding 层
embedding层就是将input_id 的编码和 位置编码 `加和` 在一起

In [10]:
class TransformerEmbedding(nn.Module):
    def __init__(self, vocab_size, d_model, max_len, drop_prob, device):
        super().__init__()
        self.token_emb = TokenEmbedding(vocab_size=vocab_size, d_model=d_model)
        self.position_emb = PositionalEncoding(d_model=d_model, max_len=max_len, device=device)
        self.drop_out = nn.Dropout(p = drop_prob)

    def forward(self, x):
        token_embeding = self.token_emb(x)
        position_embedding = self.position_emb(x)
        # 相加后然后还要drop
        embedding = self.drop_out(token_embeding + position_embedding)

        return embedding

### 2.2 encoder layer
编码后输入encoding层
这里需要注意一些措辞上的区别： encode layer或者 encode block 堆叠在一起 形成 一个 transformer encoder

In [11]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
        super().__init__()
        self.attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
        self.norm1 = LayerNorm(d_model=d_model)
        self.dropout1 = nn.Dropout(p = drop_prob)
        self.ffn = PositionWiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm2 = LayerNorm(d_model=d_model)
        self.dropout2 = nn.Dropout(p = drop_prob)

    def forward(self, x, s_mask):
        # 输入的x是经过位置编码后的
        # 1. 保留x 作为后面的short cut
        _x = x
        # 2. 先经过一次attention的计算
        x = self.attention(q = x, k = x, v = x, mask = s_mask)
        # 3. add and norm (要先drop)
        x = self.dropout1(x)
        x = self.norm1(x + _x)
        # 4. ffn 保留一次short cut
        _x = x
        x = self.ffn(x)
        # 5. 再次add and norm
        x = self.dropout2(x)
        x = self.norm2(x + _x)
        return x


### 2.3 decoder layer
注意 在构建decoder layer 的时候 一个是每个decoder layer都有接受来自于encoder的输出 实际上是k和v 其次是decoder当中会有mask的操作

In [12]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
        super().__init__()
        self.self_attention = MultiHeadAttention(d_model=d_model, n_head = n_head)
        self.norm1 = LayerNorm(d_model=d_model)
        self.dropout1 = nn.Dropout(p = drop_prob)

        # decoder 当中还会涉及到一个attention encoder和decoder的交叉注意力机制 使用的是encoder的K和V decoder的Q 但实际上指的就是encoder的输出经过俩个线性变化后成了K和V
        self.enc_dec_attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
        self.norm2 = LayerNorm(d_model=d_model)
        self.dropout2 = nn.Dropout(p = drop_prob)

        self.ffn = PositionWiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm3 = LayerNorm(d_model=d_model)
        self.dropout3 = nn.Dropout(p=drop_prob)

    def forward(self, dec, enc, t_mask, s_mask):
        # 1. 先计算self attention 并保留short cut
        _x = dec
        # 2. 这里的t_mask 是下三角矩阵 decoder的目的是预测next token 所以要屏蔽下一个词
        x = self.self_attention(q = dec, k = dec, v = dec, mask = t_mask)

        # 3. add and norm
        x = self.dropout1(x)
        x = self.norm1(x + _x)
        
        # 4. cross attention
        if enc is not None:
            _x = x
            x = self.enc_dec_attention(q = x, k = enc, v = enc, mask = s_mask)

            # add and norm
            x = self.dropout2(x)
            x = self.norm2(x + _x)

        # 5. ffn
        _x = x
        x = self.ffn(x)

        # 6. add and norm
        x = self.dropout3(x)
        x = self.norm3(x + _x)
        return x

### 2.4 encoder
许多个encoder layer组在一起形成encoder block

In [13]:
class Encoder(nn.Module):
    def __init__(self, enc_voc_size, max_len, d_model, ffn_hidden, n_head, n_layers, drop_prob, device):
        super().__init__()
        self.emb = TransformerEmbedding(
            d_model=d_model,
            max_len=max_len,
            vocab_size=enc_voc_size,
            drop_prob=drop_prob,
            device=device
        )

        self.layers = nn.ModuleList(
            [
                EncoderLayer(
                    d_model=d_model,
                    ffn_hidden=ffn_hidden,
                    n_head=n_head,
                    drop_prob=drop_prob
                ) 
                for _ in range(n_layers)
            ]
        )

    def forward(self, x, s_mask):
        x = self.emb(x)

        for layer in self.layers:
            x = layer(x, s_mask)
        
        return x
    


### 2.5 decoder

In [14]:
class Decoder(nn.Module):
    def __init__(
            self, dec_voc_size, max_len, d_model, ffn_hidden, n_head, n_layers, drop_prob, device
    ):
        super().__init__()
        # 注意 decoder的embedding 层 和 encoder 的只有 vocab size 不一致
        self.emb = TransformerEmbedding(
            d_model=d_model,
            max_len=max_len,
            vocab_size=dec_voc_size,
            drop_prob=drop_prob,
            device=device
        )
                
        self.layers = nn.ModuleList(
            [
                DecoderLayer(
                    d_model=d_model,
                    ffn_hidden=ffn_hidden,
                    n_head=n_head,
                    drop_prob=drop_prob
                ) 
                for _ in range(n_layers)
            ]
        )
        # 注意 线性层输出是dec的词汇表大小 实际上是每一个字 属于词表当中每个字的概率
        self.linear = nn.Linear(d_model, dec_voc_size)
    def forward(self, trg, enc_src, trg_mask, src_mask):
        trg = self.emb(trg)

        # 对于decoder每一层来说 都有 encoder 的输入到cross attention当中
        for layer in self.layers:
            trg = layer(trg, enc_src, trg_mask, src_mask)
        
        out = self.linear(trg)

        return out
        

### 2.6 构建transformer

In [15]:
class Transformer(nn.Module):
    def __init__(self, src_pad_idx, trg_pad_idx, trg_sos_idx, enc_voc_size, dec_voc_size, d_model, n_head, max_len, ffn_hidden, n_layers, drop_prob, device):
        super().__init__()
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.trg_sos_idx = trg_sos_idx
        self.device = device
        self.encoder = Encoder(d_model=d_model,
                               n_head=n_head,
                               max_len=max_len,
                               ffn_hidden=ffn_hidden,
                               enc_voc_size=enc_voc_size,
                               drop_prob=drop_prob,
                               n_layers=n_layers,
                               device=device)

        self.decoder = Decoder(d_model=d_model,
                               n_head=n_head,
                               max_len=max_len,
                               ffn_hidden=ffn_hidden,
                               dec_voc_size=dec_voc_size,
                               drop_prob=drop_prob,
                               n_layers=n_layers,
                               device=device)
    
    def forward(self, src, trg):
        # 三个mask对应着 三个attention的计算： encode self-attention decode self attention计算 cross attention 计算
        # 先生成src的mask 用于src的attention计算
        src_mask  = self.make_pad_mask(src, src, self.src_pad_idx, self.src_pad_idx)
        # 注意这里是trg在前 因为用的是trg的q
        src_trg_mask = self.make_pad_mask(trg, src, self.trg_pad_idx, self.src_pad_idx)
        # 这是将pad 和 下三角矩阵融合 因为一方面要找出pad 另一方面要掩盖下一个词
        trg_mask = self.make_pad_mask(trg, trg, self.trg_pad_idx, self.trg_pad_idx) * self.make_no_peak_mask(trg, trg)

        enc_src = self.encoder(src, src_mask)
        output = self.decoder(trg, enc_src, trg_mask, src_trg_mask)
        return output
    def make_pad_mask(self, q, k, q_pad_idx, k_pad_idx):
        # 得到q, k 的长度
        len_q, len_k = q.size(1), k.size(1)
        # 此时输入的qk都是input_id 也就是根据词表转化过来的，会有pad
        # 这一步就是根据pad 生成mask 因为pad是不进行attention的计算的
        # 如果等于pad 那么 就会返回0 则 不是pad的位置都是1
        # 所以生成的k的形状是(batch, 1, 1, len_k)
        k = k.ne(k_pad_idx).unsqueeze(1).unsqueeze(2)
        # batch_size x 1 x len_q x len_k
        k = k.repeat(1, 1, len_q, 1)

        q = q.ne(q_pad_idx).unsqueeze(1).unsqueeze(3)
        q = q.repeat(1, 1, 1, len_k)
        # & 是位与操作 只有当两位上都是1的时候才是1
        mask = (k & q).to(self.device)
        return mask
    
    def make_no_peak_mask(self, q, k):
        len_q, len_k = q.size(1), k.size(1)
        # torch.tril 是用来形成下三角矩阵, 详见：https://blog.csdn.net/qq_38406029/article/details/122059507
        mask = torch.tril(torch.ones(len_q, len_k)).type(torch.BoolTensor).to(self.device)
        return mask

### 3. 调试transformer数据流
参考资料：https://blog.csdn.net/zhaohongfei_358/article/details/125858248

#### 3.1 创建transformer模型

In [16]:
src_pad_idx = 1
trg_pad_idx = 1
trg_sos_idx = 2
ffn_hidden = 2048
n_heads = 8
n_layers = 6
drop_prob = 0.1

model = Transformer(src_pad_idx=src_pad_idx,
                    trg_pad_idx=trg_pad_idx,
                    trg_sos_idx=trg_sos_idx,
                    d_model=d_model,
                    enc_voc_size=enc_voc_size,
                    dec_voc_size=dec_voc_size,
                    max_len=max_len,
                    ffn_hidden=ffn_hidden,
                    n_head=n_heads,
                    n_layers=n_layers,
                    drop_prob=drop_prob,
                    device=device).to(device)

# 使用kaiming_uniform对model初始化
def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.kaiming_uniform(m.weight.data)
        
model.apply(initialize_weights)

  nn.init.kaiming_uniform(m.weight.data)


Transformer(
  (encoder): Encoder(
    (emb): TransformerEmbedding(
      (token_emb): TokenEmbedding(5893, 512, padding_idx=1)
      (position_emb): PositionalEncoding()
      (drop_out): Dropout(p=0.1, inplace=False)
    )
    (layers): ModuleList(
      (0-5): 6 x EncoderLayer(
        (attention): MultiHeadAttention(
          (attention): scaleDotProductAttention(
            (softmax): Softmax(dim=-1)
          )
          (w_q): Linear(in_features=512, out_features=512, bias=True)
          (w_k): Linear(in_features=512, out_features=512, bias=True)
          (w_v): Linear(in_features=512, out_features=512, bias=True)
          (concat_linear): Linear(in_features=512, out_features=512, bias=True)
        )
        (norm1): LayerNorm()
        (dropout1): Dropout(p=0.1, inplace=False)
        (ffn): PositionWiseFeedForward(
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
        

#### 3.2 加载数据

In [17]:

src = torch.load('tensor_src.pt').to(device)
trg = torch.load('tensor_trg.pt').to(device)
print('src shape: ', src.shape)
print('trg shape: ', trg.shape)

src shape:  torch.Size([128, 27])
trg shape:  torch.Size([128, 28])


#### 3.3 创建mask

mask是为了在计算attention的时候将pad的位置的score去除掉 这样不会影响 z 的计算。一般用一个非常大的负数来表示

mask的过程实际上是模拟了attention计算的过程 即`q@k`

In [18]:
src_mask = model.make_pad_mask(src, src, src_pad_idx, src_pad_idx)
src_trg_mask = model.make_pad_mask(trg, src, trg_pad_idx, src_pad_idx)
trg_mask = model.make_pad_mask(trg, trg, trg_pad_idx, trg_pad_idx) * model.make_no_peak_mask(trg, trg)
print('src_mask: ', src_mask.shape)
print('src_trg_mask: ', src_trg_mask.shape)
print('trg_mask: ', trg_mask.shape)

# 取出mask的值观察
# mask 在计算qkscore后 会以mask_filled 的方式填入，所以形状上必须符合qkscore的情况
# trg mask是个下三角矩阵 trg mask 取倒数第二维 就对应 trg中的一句话
print('src: ', src[0])
print('src mask: ', src_mask[0][0][0].int())
print('==============================================================split line==============================================================')
print('src_trg_mask: ', src_trg_mask[0][0][0].int())
print('==============================================================split line==============================================================')
print('trg: ', trg[0])
print('trg mask: ', trg_mask[0][0][0].int())

src_mask:  torch.Size([128, 1, 27, 27])
src_trg_mask:  torch.Size([128, 1, 28, 27])
trg_mask:  torch.Size([128, 1, 28, 28])
src:  tensor([  2,  48,  53, 127,  36,  71,  18,  11,   8,   4, 268,   5,   3,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1],
       device='cuda:0')
src mask:  tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0], device='cuda:0', dtype=torch.int32)
src_trg_mask:  tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0], device='cuda:0', dtype=torch.int32)
trg:  tensor([  2,  43, 103,  80,  52,  47,  10,  12,   6, 320,   4,   3,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1],
       device='cuda:0')
trg mask:  tensor([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0], device='cuda:0', dtype=torch.int32)


#### 3.4 调试embedding
input_id -> input embedding

In [19]:
# 提取两个embedding层
tok_emb = model.encoder.emb.token_emb
print('token_emb: \n', tok_emb)
input_tok_emb = tok_emb(src)
print('input_tok_emb:\n', input_tok_emb.shape)
print('input_tok_emb[0]:\n', input_tok_emb[0][0][:10])
position_emb = model.encoder.emb.position_emb
print('position_emb: \n', position_emb)
input_position_emb = position_emb(src)
# 这里对positional embedding 做了一次广播机制
print('input_position_emb:\n', input_position_emb.shape)
print('input_position_emb[0]:\n', input_position_emb[0][:10])
input_emb = input_tok_emb + input_position_emb
print('input_emb: \n', input_emb.shape)
print('input_emb[0]:\n', input_emb[0][0][:10])
input_emb = model.encoder.emb.drop_out(input_emb)

token_emb: 
 TokenEmbedding(5893, 512, padding_idx=1)
input_tok_emb:
 torch.Size([128, 27, 512])
input_tok_emb[0]:
 tensor([ 0.0659, -0.0807,  0.0928,  0.0404, -0.0871,  0.0316,  0.0440, -0.0285,
         0.0706, -0.0998], device='cuda:0', grad_fn=<SliceBackward0>)
position_emb: 
 PositionalEncoding()
input_position_emb:
 torch.Size([27, 512])
input_position_emb[0]:
 tensor([0., 1., 0., 1., 0., 1., 0., 1., 0., 1.], device='cuda:0')
input_emb: 
 torch.Size([128, 27, 512])
input_emb[0]:
 tensor([ 0.0659,  0.9193,  0.0928,  1.0404, -0.0871,  1.0316,  0.0440,  0.9715,
         0.0706,  0.9002], device='cuda:0', grad_fn=<SliceBackward0>)


#### 3.5 调试encoder
input embedding -> encoder layers -> output


In [20]:
# 获取encoder当中的一个层 调试计算流程
encoder_layer = model.encoder.layers[0]

# 1. 先保留short cut 再经过attention 计算
    # short cut
_input_emb = input_emb
    # 注意力机制
    # 通过linear层 得到qkv
w_q = encoder_layer.attention.w_q
w_k = encoder_layer.attention.w_k
w_v = encoder_layer.attention.w_v

    # 输入input embedding，分别得到qkv
q = w_q(input_emb)
k = w_k(input_emb)
v = w_v(input_emb)

    # 以上就是经过线性变换 使得qkv不同
print('input_emb shape: \n', input_emb.shape)
print('input_emb: \n', input_emb[0][0][:10])
print('q: \n', q[0][0][:10])
print('k: \n', k[0][0][:10])
print('v: \n', v[0][0][:10])

    # 然后我们需要将qkv拆分成多头, 这里因为qkv形状相等 所以以q为例
batch_size, seq_len, d_model = q.size()
    # 这里要注意是否能被整除
try:
    d_tensor = d_model // n_heads
except Exception as e:
    print('Wrong when split heads: \n', e)
    # batch_size, seq_len, d_model -> batch_size, n_heads, seq_len, d_tensor
q = q.view(batch_size, seq_len, n_heads, d_tensor).transpose(1, 2)
k = k.view(batch_size, seq_len, n_heads, d_tensor).transpose(1, 2)
v = v.view(batch_size, seq_len, n_heads, d_tensor).transpose(1, 2)

print('multi-head q: \n', q.shape)

    # self-attention计算
temp = q @ k.transpose(2,3)
print('attention temp shape: \n', temp.shape)
print('attention temp: \n', temp[0][0][0][:10])
    # scale 操作
score = temp / math.sqrt(d_model)
print('score shape: \n', score.shape)
print('score: \n', score[0][0][0][:10])
    # mask 操作 目的是为了mask掉其中padding的部分
score = score.masked_fill(src_mask == 0, -1e10)
print('masked score shape: \n', score.shape)
print('masked score: \n', score[0][0][0])
    # softmax 可以观察到 mask后的地方都变成了0 
    # 关于softmax：https://openatomworkshop.csdn.net/66470371b12a9d168eb6e9c9.html
score = nn.Softmax(dim=-1)(score)
print('score shape: \n', score.shape)
print('score: \n', score[0][0][0])
    # 计算v
score = score @ v
print('score shape: \n', score.shape)
print('score: \n', score[0][0][0][:10])
    # 将多头再拼接回来
batch_size, n_head, seq_len, d_tensor = score.size()
d_model = n_head * d_tensor
score = score.transpose(2, 3).contiguous().view(batch_size, seq_len, d_model)
print('score shape: \n', score.shape)
print('score: \n', score[0][0][:10])
# multi-head attention 和 self-attention 不同的地方在于 需要在输出的地方再乘以一个权重
score = encoder_layer.attention.concat_linear(score)
print('score shape: \n', score.shape)
print('score: \n', score[0][0][:10])
print('可以发现输入的input emb的形状和计算完attention后的形状是一致的')

input_emb shape: 
 torch.Size([128, 27, 512])
input_emb: 
 tensor([ 0.0732,  1.0215,  0.1031,  1.1560, -0.0968,  1.1462,  0.0489,  1.0794,
         0.0785,  1.0002], device='cuda:0', grad_fn=<SliceBackward0>)
q: 
 tensor([-0.6794, -1.6800,  2.2284, -0.2421, -0.5112,  0.0564, -1.5653, -2.3242,
        -0.1343, -1.2660], device='cuda:0', grad_fn=<SliceBackward0>)
k: 
 tensor([-0.7530, -1.3307,  1.3520, -0.7096, -0.8103,  0.6370,  0.3142,  1.0733,
         1.8861, -3.4619], device='cuda:0', grad_fn=<SliceBackward0>)
v: 
 tensor([ 1.1294, -0.3531,  0.3073, -1.6134,  0.2602,  0.1717,  2.8820, -1.1430,
         1.6829,  1.6666], device='cuda:0', grad_fn=<SliceBackward0>)
multi-head q: 
 torch.Size([128, 8, 27, 64])
attention temp shape: 
 torch.Size([128, 8, 27, 27])
attention temp: 
 tensor([ 6.2526,  7.1083, 10.3776, 11.2454, 13.3260,  4.1047,  6.2703,  7.6537,
        11.2654, 13.0507], device='cuda:0', grad_fn=<SliceBackward0>)
score shape: 
 torch.Size([128, 8, 27, 27])
score: 
 tensor(

In [21]:
# 2. add and norm
score = encoder_layer.dropout1(score)
    # 1) add
score = score + _input_emb
    # 2) layerNorm: 可以看小冬瓜AIGC小红书有详细解释
    # 定义参数 这些参数都是可以训练的
gamma = nn.Parameter(torch.ones(d_model)).to(device)
beta = nn.Parameter(torch.zeros(d_model)).to(device)
eps = 1e-12
    # 在特征维度上计算均值
mean = score.mean(dim=-1, keepdim=True)
print('mean shape: \n', mean.shape)
print('mean: \n', mean[0][0][0])
    # 在特征维度上计算方差
var = score.var(dim = -1, keepdim=True, unbiased = False)
print('var shape: \n', var.shape)
print('var: \n', var[0][0][0])
    # 计算统计学上的归一化
out = (score - mean) / torch.sqrt(var + eps)
print('out shape: \n', out.shape)
print('out: \n', out[0][0][0])
score = gamma * out + beta
print('score shape: \n', score.shape)
print('score: \n', score[0][0][0])


mean shape: 
 torch.Size([128, 27, 1])
mean: 
 tensor(0.4363, device='cuda:0', grad_fn=<SelectBackward0>)
var shape: 
 torch.Size([128, 27, 1])
var: 
 tensor(1.9876, device='cuda:0', grad_fn=<SelectBackward0>)


out shape: 
 torch.Size([128, 27, 512])
out: 
 tensor(0.4483, device='cuda:0', grad_fn=<SelectBackward0>)
score shape: 
 torch.Size([128, 27, 512])
score: 
 tensor(0.4483, device='cuda:0', grad_fn=<SelectBackward0>)


In [22]:
# ffn
# 先来一次short cut
_score = score

# 经过前向传播 比较简单 不再调试到每一步
score = encoder_layer.ffn(score)
score = encoder_layer.dropout2(score)

# 再进行一次 layerNorm
score = encoder_layer.norm2( _score + score)
print('score shape: \n', score.shape)
print('score: \n', score[0][0][0])

score shape: 
 torch.Size([128, 27, 512])
score: 
 tensor(-0.2670, device='cuda:0', grad_fn=<SelectBackward0>)


#### 3.6 调试decoder
decoder操作稍微复杂一点 主要是经过两次attention 一次是self-attention 另外一次是交叉注意力机制 是和encoder一起的

decoder的self-attention不再赘述 操作方式和encoder差不多 这里会详细debug 交叉注意力层

前面我们已经获取到了encoder 的输出 score， decoder 的注意力分数我们用 x 来代替

In [23]:
# 先对trg进行位置编码 这些和encoder操作都是一致的
# note： encoder的输出是score
trg_emb = model.decoder.emb(trg)
print('trg_emb shape: \n', trg_emb.shape)
print('trg_emb: \n', trg_emb[0][0][0])
decoder_layer = model.decoder.layers[0]

# 先进行short cut 换个名字
_x = trg_emb
x = trg_emb
# 计算self-attention
x = decoder_layer.self_attention(q = x, k = x, v = x, mask = trg_mask)
print('x shape: \n', x.shape)
print('x: \n', x[0][0][:10])
x = decoder_layer.dropout1(x)
x = decoder_layer.norm1(x + _x)

# short cut第二次
_x = x
# 进行交叉注意力的计算 注意：attention的计算方式还是一致的 不同的是qkv和mask
    # 通过linear层 得到qkv
w_q = decoder_layer.enc_dec_attention.w_q
w_k = decoder_layer.enc_dec_attention.w_k
w_v = decoder_layer.enc_dec_attention.w_v

    # 我们用decoder的q 来查询 encoder的k和v
q = w_q(x)
k = w_k(score)
v = w_v(score)

    # 拆分成多头，由于前面debug过详细内容 这里不再赘述
split = decoder_layer.enc_dec_attention.split
q, k, v = split(q), split(k), split(v)
print('multi-head q: \n', q.shape)
print('multi-head k: \n', k.shape)
print('multi-head v: \n', v.shape)

    # 注意这里的q和kv的形状是不同的
    # self-attention计算
temp = q @ k.transpose(2,3)
print('attention temp shape: \n', temp.shape)
print('attention temp: \n', temp[0][0][0][:10])
    # scale 操作
x = temp / math.sqrt(d_model)
print('scaled score shape: \n', x.shape)
print('scaled score: \n', x[0][0][0][:10])
    # mask 操作 目的是为了mask掉其中padding的部分
x = x.masked_fill(src_trg_mask == 0, -1e10)
print('masked score shape: \n', x.shape)
print('masked score: \n', x[0][0][0])
    # softmax 可以观察到 mask后的地方都变成了0 
    # 关于softmax：https://openatomworkshop.csdn.net/66470371b12a9d168eb6e9c9.html
x = nn.Softmax(dim=-1)(x)
print('softmax score shape: \n', x.shape)
print('softmax score: \n', x[0][0][0])
    # 计算v
x = x @ v
print('output score shape: \n', x.shape)
print('output score: \n', x[0][0][0][:10])
    # 将多头再拼接回来
batch_size, n_head, seq_len, d_tensor = x.size()
d_model = n_head * d_tensor
x = x.transpose(2, 3).contiguous().view(batch_size, seq_len, d_model)
print('concat score shape: \n', x.shape)
print('concat score: \n', x[0][0][:10])
# multi-head attention 和 self-attention 不同的地方在于 需要在输出的地方再乘以一个权重
x = decoder_layer.enc_dec_attention.concat_linear(x)
print('score shape: \n', x.shape)
print('score: \n', x[0][0][:10])
print('可以发现输入的input emb的形状和计算完attention后的形状是一致的')
print('需要注意交叉注意力计算的时候 矩阵大小的变化')

trg_emb shape: 
 torch.Size([128, 28, 512])
trg_emb: 
 tensor(-0.0381, device='cuda:0', grad_fn=<SelectBackward0>)
x shape: 
 torch.Size([128, 28, 512])
x: 
 tensor([ 0.8951,  1.6015,  0.8205, -0.8543, -1.7190, -1.9628, -4.4652,  0.7496,
        -0.0703, -1.4640], device='cuda:0', grad_fn=<SliceBackward0>)
multi-head q: 
 torch.Size([128, 8, 28, 64])
multi-head k: 
 torch.Size([128, 8, 27, 64])
multi-head v: 
 torch.Size([128, 8, 27, 64])
attention temp shape: 
 torch.Size([128, 8, 28, 27])
attention temp: 
 tensor([ -4.3989,  -1.7746,  -2.8873, -14.5719,  -0.9342,   5.7105, -16.0071,
        -12.5798, -16.5159,   3.6476], device='cuda:0',
       grad_fn=<SliceBackward0>)
scaled score shape: 
 torch.Size([128, 8, 28, 27])
scaled score: 
 tensor([-0.1944, -0.0784, -0.1276, -0.6440, -0.0413,  0.2524, -0.7074, -0.5560,
        -0.7299,  0.1612], device='cuda:0', grad_fn=<SliceBackward0>)
masked score shape: 
 torch.Size([128, 8, 28, 27])
masked score: 
 tensor([-1.9441e-01, -7.8429e-02, -

In [24]:
# 接下来就是add and norm 和ffn了
x = decoder_layer.dropout2(x)
x = decoder_layer.norm2(x + _x)

_x = x
x = decoder_layer.ffn(x)

x = decoder_layer.dropout3(x)
x = decoder_layer.norm3(x + _x)
# 此时得到的x 就是transformer模型的输出
print('transformer output shape: \n', x.shape)
print('transformer output: \n', x[0][0][0])
# 比较一下 和 输入的 差别
print('decoder trg input shape: \n', trg_emb.shape)
print('decoder trg input: \n', trg_emb[0][0][0])

transformer output shape: 
 torch.Size([128, 28, 512])
transformer output: 
 tensor(0.7944, device='cuda:0', grad_fn=<SelectBackward0>)
decoder trg input shape: 
 torch.Size([128, 28, 512])
decoder trg input: 
 tensor(-0.0381, device='cuda:0', grad_fn=<SelectBackward0>)


### 4. transformer 训练和推理流程
模型在训练和推理时候的流程会有所区别 训练会一次性输入trg 但是 推理 是next token generation的任务

#### 4.1 loss
训练的时候需要注意损失函数的计算， 尤其要注意input output和label的构造

In [25]:
# 先计算loss
# 看一下原始的输入和输出
print('src input shape: \n', src.shape)
# 取出其中的一句话 观察如何被tokenize
print('src input: \n', src[0])
print('发现 一句话 有2 作为 <SOS>; 3 作为 <EOS>; 1 作为 padding')

print('trg input shape: \n', trg.shape)
# 取出其中的一句话 观察如何被tokenize
print('trg input: \n', trg[0])
print('发现 一句话 有2 作为 <SOS>; 3 作为 <EOS>; 1 作为 padding')

# 在做decoder模型输入的时候，要将输入的序列shift right 
# 因为当decoder最后一个词输入时，直接输出eos，不需要decoder的eos

trg_right = trg[:, :-1]

# 实例化一个模型
model = Transformer(src_pad_idx=src_pad_idx,
                    trg_pad_idx=trg_pad_idx,
                    trg_sos_idx=trg_sos_idx,
                    d_model=d_model,
                    enc_voc_size=enc_voc_size,
                    dec_voc_size=dec_voc_size,
                    max_len=max_len,
                    ffn_hidden=ffn_hidden,
                    n_head=n_heads,
                    n_layers=n_layers,
                    drop_prob=drop_prob,
                    device=device).to(device)

# 使用kaiming_uniform对model初始化
def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.kaiming_uniform(m.weight.data)
        
model.apply(initialize_weights)

output = model(src, trg_right)
print('model output shape: \n', output.shape)

output_reshape = output.contiguous().view(-1, output.shape[-1])
print('model output_reshape shape: \n', output_reshape.shape)
print('model output_reshape: \n', output_reshape[0])
# 这里处理groud truth的时候 注意是取消了sos这个token 
# 原因是要让decoder输入的 sos 对应上下一个词是 groud truth 句子的开头
trg_reshape = trg[:, 1:].contiguous().view(-1)
print('trg_reshape shape: \n', trg_reshape.shape)
print('trg_reshape: \n', trg_reshape[0])
# 需要注意的是 我们的 trg_reshape 是 indices 可以输入到pytorch的交叉熵损失函数中自动的one_hot
# https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html
# 实例化一个损失函数,在pad的地方不需要计算损失
criterion = nn.CrossEntropyLoss(ignore_index=trg_pad_idx)
loss = criterion(output_reshape, trg_reshape)
print('loss: \n', loss)

src input shape: 
 torch.Size([128, 27])
src input: 
 tensor([  2,  48,  53, 127,  36,  71,  18,  11,   8,   4, 268,   5,   3,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1],
       device='cuda:0')
发现 一句话 有2 作为 <SOS>; 3 作为 <EOS>; 1 作为 padding
trg input shape: 
 torch.Size([128, 28])
trg input: 
 tensor([  2,  43, 103,  80,  52,  47,  10,  12,   6, 320,   4,   3,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1],
       device='cuda:0')
发现 一句话 有2 作为 <SOS>; 3 作为 <EOS>; 1 作为 padding
model output shape: 
 torch.Size([128, 27, 7853])
model output_reshape shape: 
 torch.Size([3456, 7853])
model output_reshape: 
 tensor([ 1.1452, -0.8259,  2.2421,  ..., -1.7233, -0.0876,  1.8815],
       device='cuda:0', grad_fn=<SelectBackward0>)
trg_reshape shape: 
 torch.Size([3456])
trg_reshape: 
 tensor(43, device='cuda:0')
loss: 
 tensor(10.0268, device='cuda:0', grad_fn=<NllLossBackward0>)


  nn.init.kaiming_uniform(m.weight.data)


#### 4.2 训练

In [26]:
# 配置参数
optimizer = Adam(
    params=model.parameters(),
    lr=init_lr,
    weight_decay=weight_decay,
    eps=adam_eps
)

scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer,
    verbose=True,
    factor=factor,
    patience=patience
)

criterion = nn.CrossEntropyLoss(ignore_index=trg_pad_idx)

# 定义训练函数

def train(model, epoch, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    for i in range(len(iterator)):
        src, trg = iterator[i]
        optimizer.zero_grad()
        output = model(src, trg[:, :-1])
        output_reshape = output.contiguous().view(-1, output.shape[-1])
        trg_reshape = trg[:, 1:].contiguous().reshape(-1)
        loss = criterion(output_reshape, trg_reshape)
        loss.backward()
        # 这里的clip是为了防止梯度太大时 引起梯度爆炸使用的
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
        print(f'eopch: {epoch}, step: {round((i/len(iterator)) * 100, 2)}%, step_loss: {loss}, epoch_loss: {epoch_loss / len(iterator)}')

epoch_num = 100
for epoch in range(epoch_num):
    train_loss = train(model, epoch, [(src, trg)], optimizer, criterion, clip)



eopch: 0, step: 0.0%, step_loss: 10.056924819946289, epoch_loss: 10.056924819946289
eopch: 1, step: 0.0%, step_loss: 9.769620895385742, epoch_loss: 9.769620895385742
eopch: 2, step: 0.0%, step_loss: 9.516295433044434, epoch_loss: 9.516295433044434
eopch: 3, step: 0.0%, step_loss: 9.316094398498535, epoch_loss: 9.316094398498535
eopch: 4, step: 0.0%, step_loss: 9.120285987854004, epoch_loss: 9.120285987854004
eopch: 5, step: 0.0%, step_loss: 8.998662948608398, epoch_loss: 8.998662948608398
eopch: 6, step: 0.0%, step_loss: 8.820260047912598, epoch_loss: 8.820260047912598
eopch: 7, step: 0.0%, step_loss: 8.691749572753906, epoch_loss: 8.691749572753906
eopch: 8, step: 0.0%, step_loss: 8.572705268859863, epoch_loss: 8.572705268859863
eopch: 9, step: 0.0%, step_loss: 8.440896034240723, epoch_loss: 8.440896034240723
eopch: 10, step: 0.0%, step_loss: 8.35954761505127, epoch_loss: 8.35954761505127
eopch: 11, step: 0.0%, step_loss: 8.260976791381836, epoch_loss: 8.260976791381836
eopch: 12, ste