<a href="https://colab.research.google.com/github/bluetinue/transforner/blob/main/%E5%8F%82%E8%80%83%E4%BB%A3%E7%A0%81_transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import copy

In [3]:
# todo:1.定义WordEmbedding
class Embeddings(nn.Module):
    """Token-embedding lookup whose output is scaled by sqrt(embed_dim).

    Args:
        vocab_size: number of distinct token ids in the vocabulary.
        embed_dim: width of each embedding vector.
    """

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        # Lookup table with one embed_dim-wide row per token id.
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)

    def forward(self, x):
        """Look up the token ids in ``x`` and multiply the vectors by sqrt(embed_dim)."""
        scale = math.sqrt(self.embed_dim)
        token_vectors = self.embed(x)
        return token_vectors * scale


In [4]:
# todo:2.定义位置编码模型PositionEncoding
class PositionEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to embeddings, then apply dropout.

    Args:
        d_model: embedding width. Both even and odd widths are supported.
        dropout_p: dropout probability applied to the summed output.
        max_len: number of positions precomputed in the encoding table; the
            addition in ``forward`` fails for longer sequences.
    """

    def __init__(self, d_model, dropout_p, max_len=60):
        super().__init__()
        self.d_model = d_model
        self.dropout_p = dropout_p
        self.max_len = max_len
        self.dropout = nn.Dropout(p=dropout_p)

        # Encoding table, one row per position: [max_len, d_model].
        pe = torch.zeros(max_len, d_model)
        # Column vector of position indices: [max_len, 1].
        position = torch.arange(0, max_len).unsqueeze(dim=1)
        # Per-pair frequencies 10000^(-2i/d_model): [ceil(d_model / 2)].
        temp_vec = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000)/d_model))
        # Angle for every (position, frequency) pair: [max_len, ceil(d_model / 2)].
        pos_vec = position * temp_vec
        # Even feature indices carry the sine, odd feature indices the cosine.
        pe[:, 0::2] = torch.sin(pos_vec)
        # For odd d_model there is one fewer odd column than even columns, so
        # trim the angles to match; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(pos_vec[:, :d_model // 2])
        # Leading batch axis so the table broadcasts: [1, max_len, d_model].
        pe = pe.unsqueeze(dim=0)
        # A buffer, not a parameter: saved and moved with the module, never trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encodings for the first ``x.size(1)`` positions to ``x`` and apply dropout."""
        seq_len = x.size(1)
        x = x + self.pe[:, :seq_len]
        return self.dropout(x)

In [5]:
# todo:1. 定义attention计算的方法
def attention(query, key, value, mask=None, dropout=None):
    """Scaled dot-product attention.

    Args:
        query, key, value: tensors whose last axis is the feature axis and
            whose second-to-last axis is the sequence axis.
        mask: optional tensor broadcastable to the score matrix; positions
            where ``mask == 0`` are excluded from attention.
        dropout: optional callable (e.g. an ``nn.Dropout`` instance) applied
            to the normalised attention weights.

    Returns:
        A tuple ``(weighted_values, attention_weights)``.
    """
    # Scale by sqrt of the feature width of the query.
    scale = math.sqrt(query.size(-1))
    scores = torch.matmul(query, key.transpose(-2, -1)) / scale

    if mask is not None:
        # A large negative score becomes (numerically) zero weight after softmax.
        scores = scores.masked_fill(mask == 0, -1e9)

    atten_weights = scores.softmax(dim=-1)
    if dropout is not None:
        atten_weights = dropout(atten_weights)

    weighted = torch.matmul(atten_weights, value)
    return weighted, atten_weights


In [6]:
def clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
    replicas = nn.ModuleList()
    for _ in range(N):
        replicas.append(copy.deepcopy(module))
    return replicas

In [7]:
# todo:2. 定义多头注意力类
class MutiHeadAttention(nn.Module):
    """Multi-head attention built on the module-level ``attention`` function.

    Args:
        head: number of attention heads; must divide ``embed_dim`` evenly.
        embed_dim: model width; each head works on ``embed_dim // head`` features.
        dropout_p: dropout probability applied to the attention weights.
    """

    def __init__(self, head, embed_dim, dropout_p=0.1):
        super().__init__()
        # Every head must receive the same number of features.
        assert embed_dim % head == 0
        self.d_k = embed_dim // head
        self.head = head
        # Four projections: query, key, value, and the final output projection.
        self.linears = clones(nn.Linear(embed_dim, embed_dim), 4)
        # Attention weights from the most recent forward pass (None until then).
        self.atten = None
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, query, key, value, mask=None):
        """Project the inputs, attend per head, merge the heads and project once more.

        ``mask`` (when given) gets a new leading axis so that it broadcasts
        over the batch; the original notes describe it as
        ``[head, seq_len, seq_len]``.
        """
        if mask is not None:
            mask = mask.unsqueeze(dim=0)
        batch_size = query.size(0)

        def split_heads(projection, tensor):
            # [batch, seq, embed_dim] -> [batch, head, seq, d_k]
            projected = projection(tensor)
            return projected.view(batch_size, -1, self.head, self.d_k).transpose(1, 2)

        query = split_heads(self.linears[0], query)
        key = split_heads(self.linears[1], key)
        value = split_heads(self.linears[2], value)

        # Per-head attention; the weights are kept on the module for inspection.
        per_head, self.atten = attention(query, key, value, mask=mask, dropout=self.dropout)

        # [batch, head, seq, d_k] -> [batch, seq, head * d_k]
        merged = per_head.transpose(1, 2).contiguous()
        merged = merged.view(batch_size, -1, self.head * self.d_k)
        return self.linears[-1](merged)

In [8]:
# todo:3. 定义前馈全连接层：两层线性层
class FeedForward(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: input width of the first layer and output width of the second.
        d_ff: hidden width between the two linear layers.
        dropout_p: dropout probability applied after the ReLU.
    """

    def __init__(self, d_model, d_ff, dropout_p=0.1):
        super().__init__()
        self.d_model = d_model
        self.d_ff = d_ff
        self.linear1 = nn.Linear(in_features=d_model, out_features=d_ff)
        self.linear2 = nn.Linear(in_features=d_ff, out_features=d_model)
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, x):
        """Expand to ``d_ff``, apply ReLU and dropout, then project back to ``d_model``."""
        hidden = F.relu(self.linear1(x))
        hidden = self.dropout(hidden)
        return self.linear2(hidden)

In [10]:
# todo:4. 定义规范化层：让数据符合标准正态分布
class LayerNorm(nn.Module):
    """Normalise the last axis to zero mean and unit spread, then scale and shift.

    Args:
        features: size of the last axis of the input.
        eps: small constant added to the standard deviation to avoid division by zero.
    """

    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.features = features
        self.eps = eps
        # Learnable per-feature scale (initialised to 1) and shift (initialised to 0).
        self.a = nn.Parameter(torch.ones(features))
        self.b = nn.Parameter(torch.zeros(features))

    def forward(self, x):
        """Return ``a * (x - mean) / (std + eps) + b`` computed over the last axis."""
        centered = x - x.mean(dim=-1, keepdim=True)
        # torch's default std is used here, as in the original implementation.
        spread = x.std(dim=-1, keepdim=True) + self.eps
        return self.a * centered / spread + self.b

In [11]:
# todo:5. 定义子层连接结构
class SublayerConnection(nn.Module):
    """Residual wrapper around a sub-layer using post-norm ordering.

    Computes ``x + dropout(layer_norm(sublayer(x)))``. The pre-norm
    alternative would be ``x + dropout(sublayer(layer_norm(x)))``; it is not
    what this class does.

    Args:
        size: feature width handed to the internal ``LayerNorm``.
        dropout_p: dropout probability applied to the normalised sub-layer output.
    """

    def __init__(self, size, dropout_p=0.1):
        super().__init__()
        self.size = size
        self.layer_norm = LayerNorm(features=size)
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, x, sublayer):
        """Apply ``sublayer`` (any callable taking ``x``) and add the residual."""
        normed = self.layer_norm(sublayer(x))
        return x + self.dropout(normed)


In [12]:
# todo:6. 定义编码器层
class EncoderLayer(nn.Module):
    """One encoder layer: self-attention followed by a feed-forward block.

    Each of the two stages is wrapped in its own ``SublayerConnection``.

    Args:
        size: feature width, passed to the sub-layer connections.
        self_atten: attention module called as ``self_atten(x, x, x, mask)``.
        feed_forward: module called as ``feed_forward(x)``.
        dropout_p: dropout probability for the sub-layer connections.
    """

    def __init__(self, size, self_atten, feed_forward, dropout_p):
        super().__init__()
        self.size = size
        self.self_atten = self_atten
        self.feed_forward = feed_forward
        # One residual wrapper per stage (attention, feed-forward).
        self.sub_layers = clones(SublayerConnection(size, dropout_p), 2)

    def forward(self, x, mask):
        """Run self-attention with ``mask`` and then the feed-forward block on ``x``."""

        def self_attention(inp):
            # Query, key and value are all the same tensor.
            return self.self_atten(inp, inp, inp, mask)

        attended = self.sub_layers[0](x, self_attention)
        return self.sub_layers[1](attended, self.feed_forward)

In [13]:
# todo:7. 定义编码器
class Encoder(nn.Module):
    """Stack of ``N`` cloned encoder layers followed by a final ``LayerNorm``.

    Args:
        layer: template layer; it must expose a ``size`` attribute and is
            deep-copied ``N`` times.
        N: number of layers in the stack.
    """

    def __init__(self, layer, N):
        super().__init__()
        # NOTE: assigning the template here also registers it as a sub-module,
        # next to the N clones that forward() actually uses.
        self.layer = layer
        self.layers = clones(layer, N)
        self.norm = LayerNorm(features=layer.size)

    def forward(self, x, mask):
        """Pass ``x`` through every cloned layer in order, then normalise the result."""
        hidden = x
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden, mask)
        return self.norm(hidden)

In [14]:
# todo:1. 定义解码器层
class DecoderLayer(nn.Module):
    """One decoder layer: self-attention, encoder-decoder attention, feed-forward.

    Each of the three stages is wrapped in its own ``SublayerConnection``.

    Args:
        size: feature width, passed to the sub-layer connections.
        self_atten: attention module used with query = key = value.
        src_atten: attention module whose key and value are the encoder output.
        feed_forward: module called as ``feed_forward(x)``.
        dropout_p: dropout probability for the sub-layer connections.
    """

    def __init__(self, size, self_atten, src_atten, feed_forward, dropout_p):
        super().__init__()
        self.size = size
        self.self_atten = self_atten
        self.src_atten = src_atten
        self.feed_forward = feed_forward
        # One residual wrapper per stage.
        self.sub_layers = clones(SublayerConnection(size, dropout_p), 3)

    def forward(self, y, encoder_output, source_mask, target_mask):
        """Decode ``y`` while attending to ``encoder_output``.

        ``target_mask`` is handed to the self-attention stage and
        ``source_mask`` to the encoder-decoder attention stage.
        """

        def masked_self_attention(inp):
            return self.self_atten(inp, inp, inp, target_mask)

        def cross_attention(inp):
            # Queries come from the decoder; keys and values from the encoder.
            return self.src_atten(inp, encoder_output, encoder_output, source_mask)

        after_self = self.sub_layers[0](y, masked_self_attention)
        after_cross = self.sub_layers[1](after_self, cross_attention)
        return self.sub_layers[2](after_cross, self.feed_forward)

In [16]:
#todo:2.定义解码器
class Decoder(nn.Module):
    """Stack of ``N`` cloned decoder layers followed by a final ``LayerNorm``.

    Args:
        layer: template layer; it must expose a ``size`` attribute and is
            deep-copied ``N`` times.
        N: number of layers in the stack.
    """

    def __init__(self, layer, N):
        super().__init__()
        # NOTE: assigning the template here also registers it as a sub-module,
        # next to the N clones that forward() actually uses.
        self.layer = layer
        self.layers = clones(layer, N)
        self.norm = LayerNorm(features=layer.size)

    def forward(self, y, encoder_output, source_mask, target_mask):
        """Pass ``y`` through every cloned decoder layer in order, then normalise."""
        hidden = y
        for decoder_layer in self.layers:
            hidden = decoder_layer(hidden, encoder_output, source_mask, target_mask)
        return self.norm(hidden)

In [25]:
#输出部分
class Output(nn.Module):
    """Output head: linear projection to the vocabulary followed by log-softmax.

    Args:
        d_model: width of the incoming features.
        vocab_size: number of output classes.
    """

    def __init__(self, d_model, vocab_size):
        super().__init__()
        self.linear = nn.Linear(in_features=d_model, out_features=vocab_size)

    def forward(self, x):
        """Return log-probabilities over the vocabulary along the last axis."""
        logits = self.linear(x)
        return logits.log_softmax(dim=-1)

In [43]:
# Smoke test for the Output head: project random activations to vocabulary log-probabilities.
d_model = 512
vocab_size = 10000
# NOTE: this creates a module-level name `output` that stays visible to cells run later.
output = Output(d_model,vocab_size)

# Random stand-in for a decoder output of shape [2, 6, 512].
x = torch.randn(2,6,512)
y = output(x)
print(y)
# The last dimension of y equals vocab_size.
print(y.shape)

tensor([[[ -9.7111,  -8.1889,  -8.9062,  ...,  -8.6487,  -9.8830,  -9.3482],
         [ -8.6914,  -8.5643,  -9.8532,  ..., -10.1607,  -8.9868,  -9.0823],
         [-10.6884,  -9.0723,  -9.5504,  ...,  -9.4564, -10.1345,  -9.2651],
         [ -9.5655,  -9.3086,  -9.1617,  ...,  -9.9297, -10.0550,  -8.7121],
         [ -8.7577,  -9.6381,  -9.0351,  ...,  -9.3632, -10.1828,  -9.0489],
         [ -9.7267, -10.1449,  -9.1718,  ...,  -8.6863,  -9.1839,  -9.9713]],

        [[ -8.9630,  -7.8263,  -9.5879,  ...,  -8.8451, -10.6653,  -9.1960],
         [ -8.7935,  -9.0358,  -9.1744,  ...,  -9.8542, -11.0156, -10.0952],
         [ -9.8142,  -9.8237, -10.1341,  ...,  -9.9889,  -9.0870,  -9.4520],
         [ -9.4690,  -8.9369,  -9.8516,  ...,  -9.8868,  -9.5688, -10.1899],
         [ -8.8104,  -8.5893,  -8.6542,  ...,  -9.0583,  -9.5103,  -9.5428],
         [ -9.3941,  -9.3329,  -8.3968,  ...,  -9.2477,  -9.0286,  -9.2541]]],
       grad_fn=<LogSoftmaxBackward0>)
torch.Size([2, 6, 10000])


In [32]:
#定义Encoder2Decoder类
class Encoder2Decoder(nn.Module):
    """Full encoder-decoder model: embed, encode, decode, then project to the vocabulary.

    Args:
        encoder: module called as ``encoder(embedded_source, source_mask1)``.
        decoder: module called as
            ``decoder(embedded_target, encoder_output, source_mask2, target_mask)``.
        source_embed: module turning source token ids into encoder input.
        target_embed: module turning target token ids into decoder input.
        Output: output-head module applied to the decoder result. The
            capitalised parameter name is kept for backward compatibility
            with existing callers.
    """

    def __init__(self, encoder, decoder, source_embed, target_embed, Output):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.source_embed = source_embed
        self.target_embed = target_embed
        # Use the head that was passed in. The previous code read a lowercase
        # name `output`, which is not a parameter of this constructor, so it
        # picked up whatever global of that name existed (or raised NameError).
        self.output = Output

    def forward(self, source, target, source_mask1, source_mask2, target_mask):
        """Run the whole model and return the output head's result.

        Args:
            source: source token ids, fed to ``source_embed``.
            target: target token ids, fed to ``target_embed``.
            source_mask1: mask passed to the encoder.
            source_mask2: mask passed to the decoder for attending to the encoder output.
            target_mask: mask passed to the decoder for its self-attention.
        """
        encoder_word_embed = self.source_embed(source)
        encoder_output = self.encoder(encoder_word_embed, source_mask1)
        decoder_word_embed = self.target_embed(target)
        decoder_output = self.decoder(decoder_word_embed, encoder_output, source_mask2, target_mask)
        return self.output(decoder_output)

In [53]:
#构建transformer模型
def mk_model():
    """Build a demo Transformer, run one forward pass on random tokens and print the results.

    Prints the model structure, the random source/target token ids and the
    model output. Returns nothing.

    The masks follow the convention of ``attention`` in this file: positions
    where the mask equals 0 are excluded. The source masks are therefore
    all ones (nothing hidden) and the target mask is lower-triangular so a
    position cannot attend to later positions. All-zero masks, as used
    previously, hide every position and make attention uniform.
    """
    d_model = 512
    d_ff = 2048
    head = 8
    num_layers = 6
    dropout_p = 0.1
    vocab_size = 1000
    batch_size, src_len, tgt_len = 2, 4, 6

    # Encoder stack.
    MutiHead = MutiHeadAttention(embed_dim=d_model, head=head, dropout_p=dropout_p)
    feed_forward = FeedForward(d_model, d_ff)
    encoder_layer = EncoderLayer(d_model, MutiHead, feed_forward, dropout_p)
    encoder = Encoder(encoder_layer, num_layers)

    # Decoder stack; every sub-module is an independent copy of the templates above.
    self_attn = copy.deepcopy(MutiHead)
    src_attn = copy.deepcopy(MutiHead)
    ff = copy.deepcopy(feed_forward)
    decoder_layer = DecoderLayer(d_model, self_attn, src_attn, ff, dropout_p)
    decoder = Decoder(decoder_layer, num_layers)

    # Source side input: word embedding followed by position encoding.
    encoder_embed = Embeddings(vocab_size=vocab_size, embed_dim=d_model)
    encoder_pos = PositionEncoding(d_model, dropout_p)
    source_embed = nn.Sequential(encoder_embed, encoder_pos)

    # Target side input: copies of the source-side modules.
    decoder_embed = copy.deepcopy(encoder_embed)
    decoder_pos = copy.deepcopy(encoder_pos)
    target_embed = nn.Sequential(decoder_embed, decoder_pos)

    output = Output(d_model, vocab_size)

    model = Encoder2Decoder(encoder, decoder, source_embed, target_embed, output)
    print(model)

    # Random token ids standing in for real data.
    source = torch.randint(0, vocab_size, (batch_size, src_len))
    target = torch.randint(0, vocab_size, (batch_size, tgt_len))
    print(source)
    print(target)

    # Mask layout is [head, query_len, key_len]; 1 = visible, 0 = hidden.
    source_mask1 = torch.ones(head, src_len, src_len)
    source_mask2 = torch.ones(head, tgt_len, src_len)
    target_mask = torch.tril(torch.ones(head, tgt_len, tgt_len))

    result = model(source, target, source_mask1, source_mask2, target_mask)
    print(result)

In [54]:
# Run the demo: builds the model and prints it, the random inputs and the forward-pass result.
mk_model()

Encoder2Decoder(
  (encoder): Encoder(
    (layer): EncoderLayer(
      (self_atten): MutiHeadAttention(
        (linears): ModuleList(
          (0-3): 4 x Linear(in_features=512, out_features=512, bias=True)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (feed_forward): FeedForward(
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (sub_layers): ModuleList(
        (0-1): 2 x SublayerConnection(
          (layer_norm): LayerNorm()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (layers): ModuleList(
      (0-5): 6 x EncoderLayer(
        (self_atten): MutiHeadAttention(
          (linears): ModuleList(
            (0-3): 4 x Linear(in_features=512, out_features=512, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward):