<a href="https://colab.research.google.com/github/bluetinue/transforner/blob/main/%E6%89%8B%E6%92%95Tansformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import torch.nn as nn
import torch
import torch.nn.functional as F
import copy
# 数学计算工具包
import math

# 输入部分

In [8]:
#@title 词嵌入层
class Embed(nn.Module):
  """Token embedding whose output is scaled by ``sqrt(embed_dim)``.

  Args:
    vocab: Number of rows in the embedding table.
    embed_dim: Width of each embedding vector.
  """

  def __init__(self, vocab, embed_dim):
    super().__init__()
    self.vocab = vocab
    self.embed_dim = embed_dim
    # Lookup table with one embed_dim-wide row per vocabulary index.
    self.lcut = nn.Embedding(num_embeddings=vocab, embedding_dim=embed_dim)

  def forward(self, x):
    """Return the embeddings of the index tensor ``x`` times ``sqrt(embed_dim)``."""
    scale = math.sqrt(self.embed_dim)
    looked_up = self.lcut(x)
    return looked_up * scale

In [9]:
#@title 位置编码器
class PostionalEncoding(nn.Module):
  """Add fixed sinusoidal position encodings to embeddings, then apply dropout.

  Args:
    d_model: Width of each embedding vector.
    dropout: Dropout probability applied to the summed output.
    max_len: Number of positions precomputed in the ``pe`` buffer.
  """

  def __init__(self, d_model, dropout, max_len=60):
    super().__init__()
    self.d_model = d_model
    # Dropout on the output to reduce over-fitting.
    self.dropout = nn.Dropout(p=dropout)

    # Encoding table, one row per position: [max_len, d_model].
    pe = torch.zeros(max_len, d_model)
    # Column vector of position indices: [max_len, 1].
    position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
    # One frequency per even channel index: [ceil(d_model / 2)].
    div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
    # Broadcast product position * frequency: [max_len, ceil(d_model / 2)].
    angles = position * div_term

    # Even-indexed channels take the sine, odd-indexed channels the cosine.
    pe[:, 0::2] = torch.sin(angles)
    # An odd d_model has one fewer odd channel than even channels, so trim
    # the angle table to the number of odd channels before assigning.
    pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])

    # [max_len, d_model] -> [1, max_len, d_model] so it broadcasts over the batch.
    # Registered as a buffer: saved with the module but not a trainable parameter.
    self.register_buffer("pe", pe.unsqueeze(0))

  def forward(self, x):
    """Add the encodings of positions ``0 .. seq_len - 1`` to ``x`` and apply dropout.

    Args:
      x: Tensor whose dimension 1 is the sequence length and whose last
        dimension is ``d_model``.

    Returns:
      A tensor with the same shape as ``x``.

    Raises:
      ValueError: If the sequence is longer than the precomputed ``max_len``.
    """
    seq_len = x.size(1)
    max_len = self.pe.size(1)
    if seq_len > max_len:
      raise ValueError(
        f"sequence length {seq_len} exceeds max_len {max_len} of the positional encoding"
      )
    # Slice the first seq_len rows so each token gets the encoding of its own
    # position; indexing a single row would add the same vector to every token.
    x = x + self.pe[:, :seq_len]
    return self.dropout(x)

# 编码器部分

In [10]:
#@title 自注意力层
def attention(q, k, v, mask=None, dropout=None):
  """Scaled dot-product attention.

  Args:
    q: Query tensor; its last dimension sets the scaling factor.
    k: Key tensor; its last two dimensions are transposed before the product.
    v: Value tensor combined using the attention weights.
    mask: Optional tensor; score positions where ``mask == 0`` are filled
      with ``-1e9`` before the softmax.
    dropout: Optional callable applied to the attention weights.

  Returns:
    A tuple ``(weighted values, attention weights)``.
  """
  scale = math.sqrt(q.size(-1))

  # Similarity of every query with every key, scaled by sqrt(d_k).
  logits = torch.matmul(q, k.transpose(-2, -1)) / scale

  # Masked positions get a very negative score so softmax sends them to ~0.
  if mask is not None:
    logits = logits.masked_fill(mask == 0, -1e9)

  # Normalise the scores over the key axis.
  weights = F.softmax(logits, dim=-1)
  weights = weights if dropout is None else dropout(weights)

  context = torch.matmul(weights, v)
  return context, weights

In [11]:
#@title 多头自注意力层
def clones(module, N):
  """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
  replicas = nn.ModuleList()
  for _ in range(N):
    # deepcopy gives every replica its own parameters rather than shared ones.
    replicas.append(copy.deepcopy(module))
  return replicas

class MultiHeadAttention(nn.Module):
  """Multi-head attention built from four cloned ``embed_dim -> embed_dim`` linear layers.

  The first three layers project query, key and value; the last one projects
  the concatenated head outputs.

  Args:
    head: Number of attention heads; must divide ``embed_dim`` evenly.
    embed_dim: Width of the input and output vectors.
    dropout_p: Dropout probability applied to the attention weights.

  Attributes:
    attn: Attention weights from the most recent ``forward`` call, or
      ``None`` before the first call.
  """

  def __init__(self, head, embed_dim, dropout_p=0.1):
    super().__init__()
    # The embedding width must split evenly across the heads.
    assert embed_dim % head == 0
    # Width handled by each head.
    self.d_k = embed_dim // head
    self.head = head
    self.dropout = nn.Dropout(p=dropout_p)
    # Projections for q, k, v plus the final output projection.
    self.linears = clones(nn.Linear(embed_dim, embed_dim), 4)
    # Same name that forward() assigns, so the attribute exists (as None)
    # before the first call and holds the weights afterwards.
    self.attn = None

  @property
  def atten(self):
    """Read-only alias of ``attn``, kept for code that used the older attribute name."""
    return self.attn

  def forward(self, q, k, v, mask=None):
    """Project q/k/v, run per-head attention, and merge the heads.

    Args:
      q: Query tensor; dimension 0 is the batch size, last dimension ``embed_dim``.
      k: Key tensor with the same layout as ``q``.
      v: Value tensor with the same layout as ``q``.
      mask: Optional mask. A leading axis is prepended before it is handed
        to ``attention``, where it is broadcast against scores of shape
        ``[batch, head, q_len, k_len]``.

    Returns:
      Tensor of shape ``[batch, seq_len, head * d_k]``.
    """
    if mask is not None:
      # NOTE(review): unsqueeze(0) prepends the axis, so the first axis of a
      # 3-D mask lines up with the head axis of the scores, not the batch
      # axis. Confirm the mask layout callers are expected to pass.
      mask = mask.unsqueeze(0)
    batch_size = q.size(0)
    # Project, then split the last dimension into heads:
    # [batch, seq, embed_dim] -> [batch, head, seq, d_k].
    q, k, v = [
      proj(x).view(batch_size, -1, self.head, self.d_k).transpose(1, 2)
      for proj, x in zip(self.linears, (q, k, v))
    ]
    # Attention runs independently for every head.
    x, self.attn = attention(q, k, v, mask=mask, dropout=self.dropout)
    # Merge heads back: [batch, head, seq, d_k] -> [batch, seq, head * d_k].
    # contiguous() is needed because view() cannot run on the transposed layout.
    x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.head * self.d_k)
    # Final output projection.
    return self.linears[-1](x)

In [12]:
#@title 前馈连接层
class FeedForward(nn.Module):
  """Feed-forward block: linear -> ReLU -> dropout -> linear.

  Args:
    d_model: Width of the input and output vectors.
    d_ff: Width of the hidden layer.
    dropout: Dropout probability applied after the ReLU.
  """

  def __init__(self, d_model, d_ff, dropout=0.1):
    super().__init__()
    # Expand to d_ff, then project back so the output shape matches the input.
    self.w_1 = nn.Linear(d_model, d_ff)
    self.w_2 = nn.Linear(d_ff, d_model)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    """Apply the two linear layers with ReLU and dropout in between."""
    hidden = self.w_1(x)
    hidden = F.relu(hidden)
    hidden = self.dropout(hidden)
    return self.w_2(hidden)

In [17]:
#@title 规范化层
class Norm(nn.Module):
  """Normalise the last dimension, then apply a learned scale and shift.

  Args:
    features: Size of the last dimension of the inputs.
    eps: Constant added to the standard deviation to avoid division by zero.
  """

  def __init__(self, features, eps=1e-6):
    super().__init__()
    # Learned scale, initialised to ones.
    self.a1 = nn.Parameter(torch.ones(features))
    # Learned shift, initialised to zeros.
    self.w1 = nn.Parameter(torch.zeros(features))
    self.eps = eps

  def forward(self, x):
    """Return ``a1 * (x - mean) / (std + eps) + w1`` computed over the last dimension."""
    centered = x - x.mean(dim=-1, keepdim=True)
    spread = x.std(dim=-1, keepdim=True) + self.eps
    return self.a1 * centered / spread + self.w1


In [None]:
#@title 残差连接层

In [23]:
#@title 测试
# Smoke test: push a tiny batch of token ids through embedding, positional
# encoding, multi-head attention, feed-forward and normalisation.
vocab = 1000
embed_dim = 512
embedding = Embed(vocab,embed_dim)
posEncodeing = PostionalEncoding(embed_dim,0.1)
# Two sequences of four token ids each.
x = torch.tensor([[1,2,3,4],[40,50,60,70]])
embed_result = embedding(x)
position_result = posEncodeing(embed_result)
# NOTE: mask is only referenced by the commented-out attention call below;
# no active call in this cell receives it.
mask = torch.zeros(2,4,4)
# Self-attention: query, key and value are the same tensor.
q = k = v = position_result
# # without mask
# attn,p_attn = attention(q,k,v)
# print(attn.shape)
# print(p_attn.shape)
# # with mask
# attn,p_attn = attention(q,k,v,mask)
# my_mutilhead = MultiHeadAttention(8,embed_dim)
# print(my_mutilhead)
# Eight heads over the 512-wide embeddings, called without a mask.
my_mutilhead = MultiHeadAttention(8,embed_dim)
att = my_mutilhead(q,k,v)

print(att.shape)

# Feed-forward with a 2048-wide hidden layer, then normalise the last dimension.
my_ff = FeedForward(embed_dim,2048)
result = my_ff(att)
my_norm = Norm(embed_dim)
print(my_norm(result))

torch.Size([2, 4, 512])
tensor([[[-0.3992, -1.8512,  1.0561,  ..., -0.9457, -0.8954,  0.1163],
         [ 0.9359, -1.7135,  1.3636,  ..., -0.9879,  0.6600, -1.2491],
         [-1.1123, -1.3216, -0.0183,  ..., -1.6209, -0.2633, -0.7066],
         [-0.8027, -0.1744,  0.8086,  ..., -0.0366,  0.1630, -1.8758]],

        [[ 1.1042, -1.0164, -0.4056,  ...,  0.3457, -0.7491,  0.7653],
         [ 0.6335, -1.4128,  0.3975,  ..., -1.3829, -0.2537, -0.4565],
         [ 0.8767, -0.1942, -0.6632,  ..., -1.7068,  0.1544, -0.0264],
         [ 0.1903, -1.0553,  1.1933,  ..., -0.6232,  0.2391, -0.8882]]],
       grad_fn=<AddBackward0>)
