<a href="https://colab.research.google.com/github/bluetinue/transforner/blob/main/%E6%89%8B%E6%92%95Tansformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [44]:
import torch.nn as nn
import torch
import torch.nn.functional as F
import copy
# 数学计算工具包
import math

# 输入部分

In [45]:
#@title 词嵌入层
class Embed(nn.Module):
  """Token embedding layer whose output is scaled by sqrt(embed_dim).

  Args:
    vocab: Number of rows in the embedding table.
    embed_dim: Width of each embedding vector.
  """

  def __init__(self, vocab, embed_dim):
    super().__init__()
    self.vocab = vocab
    self.embed_dim = embed_dim
    # Lookup table: one embed_dim-wide vector per token id.
    self.lcut = nn.Embedding(num_embeddings=vocab, embedding_dim=embed_dim)

  def forward(self, x):
    """Look up the vectors for the ids in ``x`` and multiply them by sqrt(embed_dim)."""
    scale = math.sqrt(self.embed_dim)
    vectors = self.lcut(x)
    return vectors * scale

In [46]:
#@title 位置编码器
class PostionalEncoding(nn.Module):
  """Add fixed sinusoidal position encodings to a batch of embeddings.

  The (misspelled) class name is kept as-is so existing callers keep working.

  Args:
    d_model: Width of each embedding vector.
    dropout: Drop probability applied after the encodings are added.
    max_len: Number of positions for which encodings are precomputed.
  """

  def __init__(self,d_model,dropout,max_len=60):
    super().__init__()
    self.d_model = d_model
    # Dropout on the summed embeddings to reduce over-fitting.
    self.dropout = nn.Dropout(p=dropout)

    # Encoding table, [max_len, d_model].
    pe = torch.zeros(max_len,d_model)
    # Position indices as a column vector, [max_len, 1].
    position = torch.arange(0,max_len,dtype=torch.float).unsqueeze(1)

    # One inverse frequency per sin/cos pair, [ceil(d_model / 2)].
    div_term = torch.exp(torch.arange(0,d_model,2) * -(math.log(10000.0) / d_model))

    # Broadcast product: the angle for every (position, frequency) pair.
    angles = position * div_term

    # Even feature indices get sin, odd feature indices get cos.
    pe[:,0::2] = torch.sin(angles)
    # For an odd d_model there is one fewer odd column than even columns,
    # so the cosine block is trimmed to fit.
    pe[:,1::2] = torch.cos(angles[:,:d_model // 2])

    # Add a leading batch dimension: [max_len, d_model] -> [1, max_len, d_model].
    pe = pe.unsqueeze(0)

    # Stored as a buffer: saved and moved with the module but not trained.
    self.register_buffer("pe",pe)

  def forward(self,x):
    """Return ``dropout(x + pe[:, :seq_len])``.

    Args:
      x: Tensor whose dimension 1 is the sequence length and whose last
        dimension is d_model.

    Raises:
      IndexError: If the sequence is longer than ``max_len``.
    """
    seq_len = x.size(1)
    if seq_len > self.pe.size(1):
      raise IndexError(
          f"sequence length {seq_len} exceeds max_len {self.pe.size(1)}")
    # Slice the first seq_len rows so position i receives encoding i.
    # Indexing with the bare integer seq_len would add the single row at
    # index seq_len to every position.
    x = x + self.pe[:,:seq_len]
    return self.dropout(x)

# 编码器部分

In [47]:
#@title 自注意力层
def attention(q,k,v,mask=None,dropout=None):
  """Scaled dot-product attention.

  Args:
    q, k, v: Query, key and value tensors; the last dimension of ``q`` is
      used as the scaling width.
    mask: Optional tensor broadcastable to the score matrix. Positions where
      ``mask == 0`` are filled with -1e9 before the softmax.
    dropout: Optional callable applied to the attention weights.

  Returns:
    A tuple ``(weighted_values, attention_weights)``.
  """
  scale = math.sqrt(q.size(-1))

  # Similarity of every query with every key, divided by sqrt(d_k).
  logits = (q @ k.transpose(-1,-2)) / scale

  # Masked-out positions get a very negative score, so their softmax weight
  # is effectively zero.
  if mask is not None:
    logits = logits.masked_fill(mask == 0,-1e9)

  # Normalise over the key axis.
  weights = torch.softmax(logits,dim=-1)
  if dropout is not None:
    weights = dropout(weights)

  return weights @ v,weights

In [48]:
#@title 多头自注意力层
def clones(module,N):
  """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
  replicas = nn.ModuleList()
  for _ in range(N):
    # deepcopy gives every replica its own parameters.
    replicas.append(copy.deepcopy(module))
  return replicas

class MultiHeadAttention(nn.Module):
  """Multi-head attention built from four equally sized linear layers.

  The first three linear layers project the query, key and value inputs;
  the fourth projects the concatenated head outputs.

  Args:
    head: Number of attention heads; must divide ``embed_dim``.
    embed_dim: Width of the input and output vectors.
    dropout_p: Drop probability applied to the attention weights.

  Attributes:
    attn: Attention weights from the most recent ``forward`` call, or
      ``None`` before the first call.
    atten: Same value as ``attn``; kept because the original constructor
      exposed this name.
  """

  def __init__(self,head,embed_dim,dropout_p=0.1):
    super().__init__()
    # The embedding must split evenly across the heads.
    assert embed_dim % head == 0, "embed_dim must be divisible by head"
    # Width handled by each head.
    self.d_k = embed_dim // head
    self.head = head
    # Dropout on the attention weights.
    self.dropout = nn.Dropout(p=dropout_p)
    # Three input projections (q, k, v) plus one output projection.
    self.linears = clones(nn.Linear(embed_dim,embed_dim),4)
    # Both names are initialised and kept in sync by forward(). Previously
    # only `atten` was created here while forward() wrote `attn`.
    self.attn = None
    self.atten = None

  def forward(self,q,k,v,mask=None):
    """Run multi-head attention.

    Args:
      q, k, v: Tensors whose first dimension is the batch and whose last
        dimension is ``embed_dim``.
      mask: Optional mask. A leading dimension is added with
        ``unsqueeze(0)`` before it is passed to ``attention``, so the same
        mask is presumably shared by every batch element.

    Returns:
      Tensor of shape ``[batch, q_len, embed_dim]``.
    """
    if mask is not None:
      mask = mask.unsqueeze(0)
    batch_size = q.size(0)
    # Project, then split the last dimension into heads:
    # [batch, len, embed_dim] -> [batch, head, len, d_k].
    # zip stops after three pairs, so the fourth linear layer is unused here.
    q,k,v = [model(x).view(batch_size,-1,self.head,self.d_k).transpose(1,2)
     for model, x in zip(self.linears,(q,k,v))]
    # Attention is computed independently for every head.
    x,self.attn = attention(q,k,v,mask=mask,dropout=self.dropout)
    self.atten = self.attn
    # Merge the heads back: [batch, head, len, d_k] -> [batch, len, embed_dim].
    x = x.transpose(1,2).contiguous().view(batch_size,-1,self.head*self.d_k)
    # Final output projection.
    return self.linears[-1](x)

In [49]:
#@title 前馈连接层
class FeedForward(nn.Module):
  """Position-wise feed-forward block: linear -> ReLU -> dropout -> linear.

  Args:
    d_model: Width of the input and of the output.
    d_ff: Width of the hidden layer.
    dropout: Drop probability applied to the hidden activations.
  """

  def __init__(self,d_model,d_ff,dropout=0.1):
    super().__init__()
    # Expand to d_ff, then project back, so the output width equals the input width.
    self.w_1 = nn.Linear(in_features=d_model,out_features=d_ff)
    self.w_2 = nn.Linear(in_features=d_ff,out_features=d_model)
    self.dropout = nn.Dropout(dropout)

  def forward(self,x):
    """Apply the two linear layers with ReLU and dropout in between."""
    hidden = self.w_1(x)
    hidden = F.relu(hidden)
    hidden = self.dropout(hidden)
    return self.w_2(hidden)

In [50]:
#@title 规范化层
class LayerNorm(nn.Module):
  """Normalise the last dimension, then apply a learned scale and shift.

  Args:
    features: Size of the last dimension of the inputs.
    eps: Small constant added to the standard deviation to avoid dividing
      by zero.
  """

  def __init__(self,features,eps=1e-6):
    super().__init__()
    # Learned scale, initialised to ones.
    self.a1 = nn.Parameter(torch.ones(features))
    # Learned shift, initialised to zeros.
    self.w1 = nn.Parameter(torch.zeros(features))
    self.eps = eps

  def forward(self,x):
    """Return ``a1 * (x - mean) / (std + eps) + w1`` over the last dimension.

    ``std`` is whatever ``Tensor.std`` returns with its default settings.
    """
    centered = x - x.mean(dim=-1,keepdim=True)
    spread = x.std(dim=-1,keepdim=True) + self.eps
    return (self.a1 * centered) / spread + self.w1


In [51]:
#@title 拼接两个子层起来形成编码器层

# Sublayer connection: wraps a sublayer (the feed-forward block or an
# attention block) with layer normalisation, dropout and a residual add.
# __init__(self, size, dropout): builds self.norm = LayerNorm(size) and
# self.dropout.
# forward(self, x, sublayer): returns the residual sum.
# Data flow: self.norm(x) -> sublayer(...) -> self.dropout(...) -> + x
class SublayerConnection(nn.Module):
  """Pre-norm residual wrapper: ``x + dropout(sublayer(norm(x)))``.

  Args:
    size: Feature width handed to ``LayerNorm``.
    dropout: Drop probability applied to the sublayer output.
  """

  def __init__(self,size,dropout):
    super().__init__()
    self.norm = LayerNorm(size)
    self.dropout = nn.Dropout(dropout)

  def forward(self,x,sublayer):
    """Apply ``sublayer`` to the normalised input and add the result to ``x``.

    Args:
      x: Input tensor; also the residual branch.
      sublayer: Callable taking one tensor and returning a tensor that can
        be added to ``x``.
    """
    # Normalise first, then run the sublayer, as the data-flow comment above
    # specifies. The previous code normalised the sublayer's output instead.
    normed = self.norm(x)
    myres = x + self.dropout(sublayer(normed))
    return myres

#编码器层
class EncoderLayer(nn.Module):
  """One encoder layer: self-attention then feed-forward, each wrapped in a
  ``SublayerConnection``.

  Args:
    size: Feature width passed to each ``SublayerConnection``.
    self_atten: Attention module called as ``self_atten(q, k, v, mask)``.
    feed_forward: Module applied in the second sublayer.
    dropout: Drop probability passed to each ``SublayerConnection``.
  """

  def __init__(self,size,self_atten,feed_forward,dropout):
    super().__init__()
    self.self_atten = self_atten
    self.feed_forward = feed_forward
    self.size = size
    # Two wrappers: index 0 for attention, index 1 for feed-forward.
    self.sublayers = clones(SublayerConnection(size,dropout),2)

  def forward(self,x,mask):
    """Run ``x`` through the attention sublayer and then the feed-forward sublayer."""
    attn_wrapper,ff_wrapper = self.sublayers

    def self_attention(a):
      # Query, key and value are all the same tensor.
      return self.self_atten(a,a,a,mask)

    attended = attn_wrapper(x,self_attention)
    return ff_wrapper(attended,self.feed_forward)

In [52]:
#@title 拼接6个编码层起来形成编码器
#编码器
class Encoder(nn.Module):
  """Stack of ``N`` copies of an encoder layer followed by a ``LayerNorm``.

  Args:
    layer: Template layer; must expose a ``size`` attribute and is copied
      ``N`` times with ``clones``.
    N: Number of copies in the stack.
  """

  def __init__(self,layer,N):
    super().__init__()
    # NOTE(review): the template is stored in addition to its N copies. If it
    # is an nn.Module it is registered as a submodule although forward()
    # never calls it - confirm whether that is intended.
    self.layer = layer
    self.N = N

    self.layers = clones(layer,N)
    # Normalisation applied once, after the last layer.
    self.norm = LayerNorm(layer.size)

  def forward(self,x,mask):
    """Pass ``x`` through every layer in order, then normalise the result."""
    hidden = x
    for encoder_layer in self.layers:
      hidden = encoder_layer(hidden,mask)
    return self.norm(hidden)

In [59]:
#@title 测试
# Smoke test: push a batch of 2 sequences of 4 token ids through the
# embedding, the positional encoding and a 6-layer encoder.
# The names defined here (embed_dim, embedding, dropout, pe, feed_forward,
# encoder_result) are reused by the decoder test cell further down.
embed_dim = 512
vocab = 1000
# Token ids, [2, 4].
x = torch.tensor([[1,2,3,4],[40,50,60,70]])
embedding = Embed(vocab,embed_dim)
embr = embedding(x)
dropout = 0.1
x = embr
pe = PostionalEncoding(embed_dim,dropout)
pe_result = pe(x)

x = pe_result
# One [4, 4] mask per head (8 heads). NOTE: attention() fills positions
# where mask == 0 with -1e9, so this all-zero mask masks every position.
mask = torch.zeros(8,4,4)
self_attn = MultiHeadAttention(8,embed_dim)
feed_forward = FeedForward(embed_dim,2048)
encoder_layer = EncoderLayer(embed_dim,self_attn,feed_forward,dropout)

encoder = Encoder(encoder_layer,6)
encoder_result = encoder(x,mask)
# Last expression of the cell, so the notebook displays it.
encoder_result

tensor([[[ 0.0453,  0.0159, -1.7737,  ...,  0.6770, -0.3944,  0.0782],
         [ 0.2380, -0.0404,  1.1892,  ...,  2.2801, -0.0617,  0.6955],
         [ 1.9359, -0.9223,  0.4601,  ...,  1.2048,  2.2784, -1.1281],
         [ 1.3250, -1.0286,  2.1739,  ...,  1.2817, -0.9785,  1.4144]],

        [[-0.0831,  0.1433, -2.2373,  ...,  1.6918, -0.0959,  1.7624],
         [ 0.0648,  0.4727, -1.0945,  ..., -1.3974, -1.0920, -0.1429],
         [ 1.9970,  0.6371,  1.1651,  ...,  0.0591,  0.9907, -0.6546],
         [ 0.0590, -1.4551,  2.1309,  ..., -0.3201,  0.0919,  0.0973]]],
       grad_fn=<AddBackward0>)

# 解码器层


In [55]:
#@title 解码器层
class DecoderLayer(nn.Module):
  """One decoder layer: self-attention, attention over the encoder output,
  then feed-forward, each wrapped in a ``SublayerConnection``.

  Args:
    size: Feature width passed to each ``SublayerConnection``.
    self_attn: Attention module applied to the decoder input itself.
    src_attn: Attention module whose keys and values are the encoder output.
    feed_forward: Module applied in the third sublayer.
    dropout: Drop probability passed to each ``SublayerConnection``.
  """

  def __init__(self,size,self_attn,src_attn,feed_forward,dropout):
    super().__init__()
    self.size = size
    self.self_attn = self_attn
    self.src_attn = src_attn
    self.feed_forward = feed_forward
    # Three wrappers: self-attention, source attention, feed-forward.
    self.sublayers = clones(SublayerConnection(size,dropout),3)

  def forward(self,y,encoder_output,source_mask,target_mask):
    """Run ``y`` through the three sublayers in order.

    Args:
      y: Decoder-side input tensor.
      encoder_output: Tensor used as key and value in the second sublayer.
      source_mask: Mask passed to ``src_attn``.
      target_mask: Mask passed to ``self_attn``.
    """
    self_wrapper,src_wrapper,ff_wrapper = self.sublayers

    def target_self_attention(a):
      return self.self_attn(a,a,a,target_mask)

    def source_attention(a):
      # Queries come from the decoder, keys and values from the encoder.
      return self.src_attn(a,encoder_output,encoder_output,source_mask)

    hidden = self_wrapper(y,target_self_attention)
    hidden = src_wrapper(hidden,source_attention)
    return ff_wrapper(hidden,self.feed_forward)

In [63]:
#@title 解码器由若干个解码器层堆叠而成
class Decoder(nn.Module):
  """Stack of ``N`` copies of a decoder layer followed by a ``LayerNorm``.

  Args:
    layer: Template layer; must expose a ``size`` attribute and is copied
      ``N`` times with ``clones``.
    N: Number of copies in the stack.
  """

  def __init__(self,layer,N):
    super().__init__()
    # NOTE(review): the template is stored in addition to its N copies. If it
    # is an nn.Module it is registered as a submodule although forward()
    # never calls it - confirm whether that is intended.
    self.layer = layer
    self.N = N

    self.layers = clones(layer,N)
    # Normalisation applied once, after the last layer.
    self.norm = LayerNorm(layer.size)

  def forward(self,y,encoder_output,source_mask,target_mask):
    """Pass ``y`` through every layer in order, then normalise the result."""
    hidden = y
    for decoder_layer in self.layers:
      hidden = decoder_layer(hidden,encoder_output,source_mask,target_mask)
    return self.norm(hidden)

In [64]:
#@title 测试
# Smoke test for the decoder: embed a batch of 2 target sequences of 6 token
# ids and decode them against `encoder_result`.
# Relies on `embedding`, `pe`, `embed_dim`, `dropout` and `encoder_result`
# from the encoder test cell.
y0 = torch.tensor([[2, 4, 10, 29, 67, 89],
          [34, 56, 78, 20, 19, 6]])
embed_y = embedding(y0)
position_y = pe(embed_y)

muti_head_atten = MultiHeadAttention(8,embed_dim)
# Independent copies so self-attention and source attention do not share weights.
self_attn = copy.deepcopy(muti_head_atten)
src_atten = copy.deepcopy(muti_head_atten)
ff = FeedForward(embed_dim,2048)
# Pass the feed-forward block built for this cell. Previously `ff` was
# created but the encoder cell's `feed_forward` was passed instead.
decoder_layer = DecoderLayer(embed_dim,self_attn,src_atten,ff,dropout)
decoder = Decoder(decoder_layer,6)
# Mask tensors, one per head: [8, target_len, source_len] and
# [8, target_len, target_len]. NOTE: attention() fills positions where
# mask == 0 with -1e9, so these all-zero masks mask every position.
source_mask = torch.zeros(8,6,4)
target_mask = torch.zeros(8,6,6)

result = decoder(y=position_y,encoder_output=encoder_result,source_mask=source_mask,target_mask=target_mask)
print(result)

tensor([[[ 0.4328, -0.1102,  1.3320,  ...,  2.5533,  0.3870,  0.3272],
         [ 0.7533, -1.0068,  2.4373,  ...,  1.4720, -0.5158,  1.1373],
         [ 0.3494, -0.7216,  1.2888,  ...,  1.7521, -1.0551,  0.7279],
         [-0.7565, -0.3076,  0.2183,  ..., -1.3209, -0.9969,  3.3321],
         [ 1.2997,  1.2106, -0.5191,  ...,  0.8102,  0.4786, -1.2705],
         [-0.9406,  0.3187, -0.1868,  ...,  0.1667,  0.7771, -0.5894]],

        [[-0.3276,  0.4991,  0.3318,  ..., -1.3001,  0.1996, -0.1494],
         [ 0.7510,  0.1927, -0.5018,  ..., -1.6089, -1.2535,  0.8717],
         [-1.6750, -0.0944,  1.5664,  ..., -0.6827, -0.2770, -0.0150],
         [-0.0236,  0.9054,  0.3642,  ..., -0.0687,  0.8433, -1.2040],
         [-0.8430,  1.3097,  0.3392,  ..., -0.2594,  0.3848,  0.3626],
         [-1.8906, -0.9670, -0.4066,  ..., -0.0601,  0.1691, -0.2599]]],
       grad_fn=<AddBackward0>)
