In [8]:
import math
import time

import torch

from torch import nn, optim
from torch.optim import Adam
from torch import tensor

In [9]:
# Transformer configuration.

# --- Device: first CUDA GPU when one is available, otherwise the CPU ---
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print('当前在的机器是：', device)

# --- Batch geometry ---
batch_size = 128  # sentences per training batch
max_len = 256     # maximum length of a single sentence

# --- Model hyper-parameters ---
d_model = 512     # token embedding width
n_layers = 6      # number of encoder / decoder layers
n_heads = 8       # attention heads; per-head Q/K/V width = d_model / n_heads = 512 / 8 = 64
ffn_hidden = 4 * d_model  # feed-forward inner width: 512 -> 2048 -> 512 (often called "proj")
n_hidden = ffn_hidden     # alias of the feed-forward width
drop_prob = 0.1   # dropout probability (randomly deactivates units for robustness)

# --- Optimizer parameter setting ---
init_lr = 1e-5
factor = 0.9
adam_eps = 5e-9
patience = 10
warmup = 100
epoch = 100
clip = 1.0
weight_decay = 5e-4
inf = float("inf")

当前在的机器是： cpu


In [10]:
# Special token ids (by name: source/target padding id and target start-of-sequence id).
src_pad_idx = trg_pad_idx = 1
trg_sos_idx = 2

# Vocabulary sizes; these size the source and target embedding tables built later.
enc_voc_size, dec_voc_size = 5893, 7853

# Load the saved source tensor and report its shape.
test_src = torch.load('./data/tensor_src.pt')
print("load src shape", test_src.shape)

# Load the saved target tensor and report its shape.
test_trg = torch.load('./data/tensor_trg.pt')
print("load trg shape", test_trg.shape)

load src shape torch.Size([128, 27])
load trg shape torch.Size([128, 28])


# 1.embedding

In [6]:
import torch
import torch.nn.functional as F
# Toy lookup table: 14 distinct token ids, each mapped to a 512-dimensional vector.
embd_layer = torch.nn.Embedding(num_embeddings=14, embedding_dim=512)
print('embedding.weight', embd_layer.weight.shape)

# Three sequences of ten token ids each.
input_id = torch.tensor(
    [
        [2, 4, 5, 6, 7, 8, 3, 1, 1, 1],
        [2, 4, 9, 10, 11, 12, 13, 3, 1, 1],
        [2, 6, 7, 8, 9, 10, 11, 12, 13, 3],
    ]
)

# The lookup appends an embedding axis: [3, 10] -> [3, 10, 512].
print("输入数据",input_id.shape)
print("输入数据的embedding", embd_layer(input_id).shape)

# First 12 components of the vector for sequence 0, position 1.
print(embd_layer(input_id)[0][1][:12])

embedding.weight torch.Size([14, 512])
输入数据 torch.Size([3, 10])
输入数据的embedding torch.Size([3, 10, 512])
tensor([ 1.5583,  1.1276, -0.7070, -0.9933, -0.8856,  1.0061, -0.1717, -0.2618,
         0.9074,  0.4190,  0.9231, -0.2430], grad_fn=<SliceBackward0>)


In [11]:
# Embedding table sized by enc_voc_size and d_model (both assigned in earlier cells).
a = nn.Embedding(enc_voc_size, d_model)
print(a.weight.shape) # (enc_voc_size, d_model); the recorded output shows [5893, 512]
print(input_id.shape) # token-id batch created in the toy embedding cell
x = a(input_id)  # look up one d_model-wide vector per token id
print(x.shape)

torch.Size([5893, 512])
torch.Size([3, 10])
torch.Size([3, 10, 512])


In [12]:
# Token embedding class
class TokenEmbedding(nn.Embedding):
    """Token-id to dense-vector lookup table with a dedicated padding row.

    Thin wrapper around ``nn.Embedding`` that fixes the argument order to
    ``(vocab_size, d_model)`` and forwards a padding index.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Width of each embedding vector.
        padding_idx: Token id passed to ``nn.Embedding`` as ``padding_idx``.
            Defaults to 1, the value that was previously hard-coded, so
            existing two-argument calls behave exactly as before.
    """

    def __init__(self, vocab_size, d_model, padding_idx=1):
        super().__init__(vocab_size, d_model, padding_idx=padding_idx)
        
# Embedding table for the source side (src: en).
test_src_token = TokenEmbedding(vocab_size=enc_voc_size, d_model=d_model)
# Embedding table for the target side (trg: de).
test_trg_token = TokenEmbedding(vocab_size=dec_voc_size, d_model=d_model)

# Show both module summaries, source first.
print(test_src_token)
print(test_trg_token)

TokenEmbedding(5893, 512, padding_idx=1)
TokenEmbedding(7853, 512, padding_idx=1)


# 2. 位置编码position encoding
pos代表max_len里的位置, i代表d_model里的维度
$$PE_{(pos,\ 2i)} = \sin\left(\frac{pos}{10000^{2i / d_{model}}}\right)$$
$$PE_{(pos,\ 2i+1)} = \cos\left(\frac{pos}{10000^{2i / d_{model}}}\right)$$

In [43]:

class PositionalEncoding(nn.Module):
    """Fixed (non-trainable) sinusoidal positional-encoding table.

    The table has shape ``[max_len, d_model]``. Even columns hold
    ``sin(pos / 10000 ** (2i / d_model))`` and odd columns hold the matching
    cosine, where ``pos`` is the row index and ``2i`` the even column index.

    Args:
        d_model: Width of each encoding vector. Odd values are supported; the
            last cosine column is simply absent in that case.
        max_len: Number of positions (rows) to precompute.
        device: Device on which the table is created. ``None`` uses the
            PyTorch default.

    Attributes:
        encoding: The precomputed table, registered as a non-persistent
            buffer so it follows ``.to()`` / ``.cuda()`` with the module while
            staying out of ``state_dict`` (as it did when it was a plain
            attribute).
    """

    def __init__(self, d_model, max_len, device=None):
        super().__init__()

        # Row positions as a float column vector: [max_len, 1].
        pos = torch.arange(0, max_len, device=device).float().unsqueeze(dim=1)
        # Even column indices 0, 2, 4, ...: [ceil(d_model / 2)].
        _2i = torch.arange(0, d_model, 2, device=device).float()
        # Broadcasting yields one angle per (position, even index) pair.
        angle = pos / (10000 ** (_2i / d_model))

        # torch.zeros does not require grad, so the table is never trained.
        encoding = torch.zeros(max_len, d_model, device=device)
        encoding[:, 0::2] = torch.sin(angle)
        # With an odd d_model there is one fewer odd column than even columns,
        # so trim the angles to the number of odd columns.
        encoding[:, 1::2] = torch.cos(angle[:, : d_model // 2])

        self.register_buffer("encoding", encoding, persistent=False)

    def forward(self, x):
        """Return the encodings for the first ``seq_len`` positions.

        Args:
            x: 2-D tensor of shape ``[batch_size, seq_len]``; only its shape
                is used.

        Returns:
            Tensor of shape ``[seq_len, d_model]`` (a slice of ``encoding``).

        Raises:
            ValueError: If ``seq_len`` exceeds the number of precomputed
                positions, which would otherwise silently return a shorter
                table.
        """
        batch_size, seq_len = x.size()
        max_len = self.encoding.size(0)
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {max_len}"
            )
        return self.encoding[:seq_len, :]

# Build the table with the configured d_model / max_len on the selected device.
test_pos_encoding = PositionalEncoding(d_model, max_len, device)

# Full table shape, then the shape of the rows for the first 10 positions.
print(test_pos_encoding.encoding.shape)
print(test_pos_encoding.encoding[:10].shape)

torch.Size([256, 512])
torch.Size([10, 512])


# 3. LayerNorm
$$LayerNorm(x) = \gamma \cdot \frac{x - \mathrm{mean}(x)}{\sqrt{\mathrm{var}(x) + \epsilon}} + \beta$$


In [20]:
import torch
from torch import nn
# NOTE(review): re-assigns d_model to the same value (512) that the configuration cell sets.
d_model = 512

class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with a learnable scale and shift.

    Args:
        d_model: Size of the last dimension of the inputs; also the length of
            the ``gamma`` (scale, initialised to ones) and ``beta`` (shift,
            initialised to zeros) parameters.
        eps: Constant added to the variance before the square root.
    """

    def __init__(self, d_model, eps=1e-12):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(d_model))
        self.beta = nn.Parameter(torch.zeros(d_model))
        self.eps = eps

    def forward(self, x):
        """Normalise ``x`` along its last axis, then apply ``gamma`` and ``beta``.

        Returns a tensor with the same shape as ``x``.
        """
        # Per-vector statistics; keepdim=True keeps a size-1 last axis for broadcasting.
        mu = torch.mean(x, dim=-1, keepdim=True)
        # Biased (population) variance, i.e. divided by N rather than N - 1.
        sigma2 = torch.var(x, dim=-1, unbiased=False, keepdim=True)

        x_hat = (x - mu) / torch.sqrt(sigma2 + self.eps)
        return self.gamma * x_hat + self.beta
    



torch.Size([2, 4, 512])
torch.Size([2, 4, 512])
tensor([[[-0.9189,  1.4861,  0.7582,  ..., -0.3212,  1.7563, -0.9268],
         [-0.3603, -0.2978, -0.1365,  ..., -0.8413,  0.3929, -0.3002],
         [-0.4081,  1.7437,  0.8381,  ...,  0.4693, -0.9644,  1.3948],
         [-1.3597, -1.6293, -1.1009,  ..., -1.0459,  0.6226, -1.3483]],

        [[-0.8746, -0.0809,  0.1776,  ..., -1.2659,  1.2809,  1.6349],
         [ 0.2639, -0.6621, -1.4123,  ...,  1.4496,  1.0469,  1.2344],
         [ 1.2481,  1.4071, -0.3268,  ..., -1.1881, -1.0544, -1.2130],
         [-0.8888, -0.8657, -0.8292,  ...,  0.4949,  0.9049, -0.5779]]],
       grad_fn=<AddBackward0>)


# 4. 单头注意力机制
$$Attention(Q, K, V) = softmax(\frac{QK^T}{\sqrt{d_k}})V$$

In [55]:
class ScaleDotProductAttention(nn.Module):
    """Scaled dot-product attention: ``softmax(Q K^T / sqrt(d_k)) V``.

    Operates on the last two dimensions of its inputs, so it accepts both
    ``[batch, seq, d_k]`` and ``[batch, n_head, seq, d_k]`` tensors.
    """

    def __init__(self):
        super().__init__()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` to ``k`` and mix ``v`` accordingly.

        Args:
            q: Queries, ``[..., q_len, d_k]``.
            k: Keys, ``[..., k_len, d_k]``.
            v: Values, ``[..., k_len, d_v]``.
            mask: Optional tensor broadcastable to ``[..., q_len, k_len]``.
                Positions where ``mask == 0`` are excluded from attention.

        Returns:
            Tuple ``(attn, score)``: ``attn`` is the attention output
            ``[..., q_len, d_v]``; ``score`` holds the scaled (and, when a mask
            is given, masked) pre-softmax logits ``[..., q_len, k_len]``.
        """
        # Scale by the per-head key width, read from the last axis so that
        # both 3-D and 4-D inputs work.
        d_k = k.size(-1)
        # Matrix product (not element-wise): every query against every key.
        score = q @ k.transpose(-2, -1) / math.sqrt(d_k)
        # `mask is not None` rather than truthiness: the truth value of a
        # multi-element tensor is ambiguous and raises.
        if mask is not None:
            # A large negative logit drives the softmax weight to ~0.
            score = score.masked_fill(mask == 0, -10000)
        attn = self.softmax(score) @ v
        return attn, score


# 5. FFN
$$FFN(x) = \mathrm{Linear}_2(\mathrm{ReLU}(\mathrm{Linear}_1(x)))$$

In [50]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward block: linear -> ReLU -> linear -> dropout.

    Args:
        d_model: Input and output feature width.
        hidden_size: Width of the inner layer.
        drop_prob: Dropout probability applied to the block's output.
    """

    def __init__(self, d_model, hidden_size, drop_prob=0.1):
        super().__init__()
        # Expand to the inner width, apply the non-linearity, project back.
        self.linear1 = nn.Linear(in_features=d_model, out_features=hidden_size)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(in_features=hidden_size, out_features=d_model)
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        """Apply the block to the last dimension of ``x``; the output has ``x``'s shape."""
        hidden = self.relu(self.linear1(x))
        return self.dropout(self.linear2(hidden))
# Instantiate the feed-forward block with the configured d_model / ffn_hidden and print its layers.
ffw = PositionwiseFeedForward(d_model, ffn_hidden)
print(ffw)

PositionwiseFeedForward(
  (linear1): Linear(in_features=512, out_features=2048, bias=True)
  (relu): ReLU()
  (linear2): Linear(in_features=2048, out_features=512, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)


# 6. Multi-head attention
$$MultiHead(Q, K, V) = Concat(head_1, ..., head_h) W^O$$
$$head_i = Attention(Q{W_i}^Q, K{W_i}^K, V{W_i}^V)$$


In [56]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention: project, split into heads, attend, merge, project.

    Args:
        d_model: Feature width of the inputs and of the output.
        n_head: Number of attention heads; must divide ``d_model`` evenly.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``n_head``.
    """

    def __init__(self, d_model, n_head):
        super().__init__()
        if d_model % n_head != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by n_head ({n_head})"
            )
        self.n_head = n_head
        self.attention = ScaleDotProductAttention()
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        """Run multi-head attention.

        Args:
            q: Queries, ``[batch, q_len, d_model]``.
            k: Keys, ``[batch, k_len, d_model]``.
            v: Values, ``[batch, k_len, d_model]``.
            mask: Optional mask handed unchanged to the attention module,
                which receives ``[batch, n_head, len, head_dim]`` tensors.

        Returns:
            Tensor of shape ``[batch, q_len, d_model]``.
        """
        q, k, v = self.w_q(q), self.w_k(k), self.w_v(v)
        batch_size, seq_len, d_model = q.size()
        # Each head gets an equal slice of the feature axis.
        head_dim = d_model // self.n_head

        # [batch, len, d_model] -> [batch, n_head, len, head_dim].
        # -1 lets k/v have a different length than q.
        q = q.view(batch_size, -1, self.n_head, head_dim).transpose(1, 2)
        k = k.view(batch_size, -1, self.n_head, head_dim).transpose(1, 2)
        v = v.view(batch_size, -1, self.n_head, head_dim).transpose(1, 2)

        attn, _score = self.attention(q, k, v, mask=mask)

        # Merge heads: [batch, n_head, q_len, head_dim] -> [batch, q_len, d_model].
        # transpose makes the tensor non-contiguous, hence contiguous() before view().
        attn = attn.transpose(1, 2).contiguous().view(batch_size, seq_len, d_model)
        attn = self.w_o(attn)
        return attn
        
# Build a multi-head attention module from the configured d_model / n_heads and print its layers.
test_multihead_attention = MultiHeadAttention(d_model, n_heads)
print(test_multihead_attention)
print(d_model, n_heads)  # echo the two hyper-parameters passed above


MultiHeadAttention(
  (attention): ScaleDotProductAttention(
    (softmax): Softmax(dim=-1)
  )
  (w_q): Linear(in_features=512, out_features=512, bias=True)
  (w_k): Linear(in_features=512, out_features=512, bias=True)
  (w_v): Linear(in_features=512, out_features=512, bias=True)
  (w_o): Linear(in_features=512, out_features=512, bias=True)
)
512 8
