In [15]:
import torch
from torch import nn
import dltools
import math
import  pandas

In [16]:
class PositionWiseFFN(nn.Module):
    def __init__(self,ffn_num_input,ffn_num_hiddens,ffn_num_outputs, **kwargs):
        super().__init__(**kwargs)
        self.densel = nn.Linear(ffn_num_input,ffn_num_hiddens)
        self.relu = nn.ReLU()
        self.dense2 = nn.Linear(ffn_num_hiddens,ffn_num_outputs)

    def forward(self,X):
        return self.dense2(self.relu(self.densel(X)))

In [17]:
fnn = PositionWiseFFN(6,4,8)
fnn.eval()
X=torch.ones(2,3,6)
fnn(X).shape

torch.Size([2, 3, 8])

In [18]:
ln = nn.LayerNorm(3)
bn = nn.BatchNorm1d(3)
X = torch.tensor([[1,2,3],[2,3,4]],dtype=torch.float32)
print('LN:',ln(X),'\n BN:',bn(X))

LN: tensor([[-1.2247,  0.0000,  1.2247],
        [-1.2247,  0.0000,  1.2247]], grad_fn=<NativeLayerNormBackward0>) 
 BN: tensor([[-1.0000, -1.0000, -1.0000],
        [ 1.0000,  1.0000,  1.0000]], grad_fn=<NativeBatchNormBackward0>)


In [19]:
class AddNorm(nn.Module):
    def __init__(self, normalized_shape,dropout, **kwargs):
        super().__init__( **kwargs)
        self.dropout = nn.Dropout(dropout)
        self.ln = nn.LayerNorm(normalized_shape)

    def forward(self,X,Y):
        return self.ln(self.dropout(Y)+X)
add_norm = AddNorm([6,8],0.2)
add_norm.eval()
X = torch.ones((2,6,8),dtype=torch.float32)
Y = torch.ones((2,6,8),dtype=torch.float32)
add_norm(X,Y)

tensor([[[0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.]]],
       grad_fn=<NativeLayerNormBackward0>)

In [20]:
class EncoderBlock(nn.Module):
    def __init__(self, key_size,query_size,value_size,num_hiddens,norm_shape,ffn_num_input,ffn_num_hiddens,ffn_num_output,num_heads,dropout,use_bias=False, **kwargs):
        super().__init__(**kwargs)
        self.attention = dltools.MultiHeadAttention(key_size,query_size,value_size,num_hiddens,num_heads,dropout,use_bias)
        self.addnormal1 = AddNorm(norm_shape,dropout)
        self.fnn = PositionWiseFFN(ffn_num_input,ffn_num_hiddens,ffn_num_output)
        self.addnormal2 = AddNorm(norm_shape,dropout)

    def forward(self,X,valid_lens):
        Y = self.addnormal1(X,self.attention(X,X,X,valid_lens))
        return self.addnormal2(Y,self.fnn(Y))
    

In [21]:
X = torch.ones((2,100,24))
valid_len = torch.tensor([3,2])
encoder_blk = EncoderBlock(24,24,24,24,[100,24],24,48,24,8,0.5)
encoder_blk.eval()
encoder_blk(X,valid_len).shape

torch.Size([2, 100, 24])

In [22]:
class TransformerEncoer(dltools.Encoder):
    def __init__(self, key_size,query_size,value_size,num_hiddens,norm_shape,ffn_num_input,ffn_num_hiddens,num_heads,num_layers,dropout,vocab_size,use_bias=False, **kwargs):
        super().__init__(**kwargs)
        self.num_hiddens = num_hiddens
        self.embedding = nn.Embedding(vocab_size,num_hiddens)
        self.pos_encoding = dltools.PositionalEncoding(num_hiddens,dropout)
        self.blks = nn.Sequential()
        for i in range(num_layers):
            self.blks.add_module('block' + str(i),EncoderBlock(key_size,query_size,value_size,num_hiddens,norm_shape,ffn_num_input,ffn_num_hiddens,num_hiddens,num_heads,dropout,use_bias=False))
    def forward(self,X,valid_len,*args):
        X = self.pos_encoding(self.embedding(X) * math.sqrt(self.num_hiddens))
        self.attention_weight = [None] * len(self.blks)
        for i , blk in enumerate(self.blks):
            X = blk(X,valid_len)
            self.attention_weight[i] = blk.attention.attention.attention_weights
        return X


In [23]:
encoder = TransformerEncoer(24,24,24,24,[100,24],24,48,8,2,0.3,200)
encoder.eval()
X =torch.ones((2,100),dtype=torch.long)
encoder(X,valid_len)[0]

tensor([[-0.8129,  1.0395,  1.1544,  ...,  0.0796, -1.6223,  0.6850],
        [-0.6137,  0.9483,  1.2418,  ...,  0.0799, -1.6087,  0.6658],
        [-0.5961,  0.7422,  1.2920,  ...,  0.0841, -1.6042,  0.6447],
        ...,
        [-0.6428,  0.5592,  1.2265,  ...,  0.1346, -1.6269,  0.6115],
        [-0.8640,  0.5734,  1.2530,  ...,  0.1422, -1.6292,  0.6085],
        [-0.9662,  0.7591,  1.2546,  ...,  0.1398, -1.6183,  0.6114]],
       grad_fn=<SelectBackward0>)

In [46]:
# decoder block
class DecoderBlock(nn.Module):
    def __init__(self, key_size, query_size, value_size, num_hiddens, norm_shape, ffn_num_input, ffn_num_hiddens, num_heads, dropout, i, **kwargs):
        super().__init__(**kwargs)
        self.i = i
        self.attention1 = dltools.MultiHeadAttention(key_size, query_size, value_size, num_hiddens, num_heads, dropout)
        self.addnorm1 = AddNorm(norm_shape, dropout)
        self.attention2 = dltools.MultiHeadAttention(key_size, query_size, value_size, num_hiddens, num_heads, dropout)
        self.addnorm2 = AddNorm(norm_shape, dropout)
        self.ffn = PositionWiseFFN(ffn_num_input, ffn_num_hiddens, num_hiddens)
        self.addnorm3 = AddNorm(norm_shape, dropout)
        
    def forward(self, X, state):
        enc_outputs, enc_valid_lens = state[0], state[1]
        if state[2][self.i] is None:
            key_values = X
        else:
            # 预测: 预测需要把前面时刻预测得到的信息和当前block的输出得到的信息拼到一起. 
            key_values = torch.cat((state[2][self.i], X), axis=1)
        state[2][self.i] = key_values
        # 在训练的时候需要对真实值进行遮蔽
        if self.training:
            # (batch_size, num_steps), 每一行是[1, 2, ..., num_steps]
            batch_size, num_steps, _ = X.shape
            dec_valid_lens = torch.arange(1, num_steps + 1, device=X.device).repeat(batch_size, 1)
        else:
            dec_valid_lens = None
         
        # 自注意力
        X2 = self.attention1(X, key_values, key_values, dec_valid_lens)
        Y = self.addnorm1(X, X2)
        Y2 = self.attention2(Y, enc_outputs, enc_outputs, enc_valid_lens)
        Z = self.addnorm2(Y, Y2)
        return self.addnorm3(Z, self.ffn(Z)), state

In [47]:
decoder_blk = DecoderBlock(24,24,24,24,[100,24],24,48,8,0.5,0)
decoder_blk.eval()
X = torch.ones((2,100,24))
state = [encoder_blk(X,valid_len),valid_len,[None]]
result1,result2 = decoder_blk(X,state)

In [26]:
result1.shape

torch.Size([2, 100, 24])

In [48]:
# transformer解码器部分
class TransformerDecoder(dltools.AttentionDecoder):
    def __init__(self, vocab_size, key_size, query_size, value_size, num_hiddens, norm_shape, ffn_num_input, ffn_num_hiddens, num_heads, num_layers, dropout, **kwargs):
        super().__init__(**kwargs)
        self.num_hiddens = num_hiddens
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, num_hiddens)
        self.pos_embedding = dltools.PositionalEncoding(num_hiddens, dropout)
        self.blks = nn.Sequential()
        for i in range(num_layers):
            self.blks.add_module('block' + str(i), DecoderBlock(key_size, query_size, value_size, num_hiddens, norm_shape, ffn_num_input, ffn_num_hiddens, num_heads, dropout, i))
            
        self.dense = nn.Linear(num_hiddens, vocab_size)
        
    def init_state(self, enc_outputs, enc_valid_lens, *args):
        return [enc_outputs, enc_valid_lens, [None]*self.num_layers]

    def forward(self, X, state):
        X = self.pos_embedding(self.embedding(X) * math.sqrt(self.num_hiddens))
        self._attention_weights = [[None] * len(self.blks) for _ in range(2)]
        for i, blk in enumerate(self.blks):
            X, state = blk(X, state)
            self._attention_weights[0][i] = blk.attention1.attention.attention_weights
            self._attention_weights[1][i] = blk.attention2.attention.attention_weights
        return self.dense(X), state
    
    @property
    def attention_weights(self):
        return self._attention_weights

In [49]:
num_hiddens, num_layers, dropout, batch_size, num_steps = 32, 2, 0.1, 64, 10
lr, num_epochs, device = 0.005, 200, dltools.try_gpu()
ffn_num_input, ffn_num_hiddens, num_heads = 32, 64, 4
key_size, query_size, value_size = 32, 32, 32
norm_shape = [32]

train_iter, src_vocab, tgt_vocab = dltools.load_data_nmt(batch_size, num_steps)

encoder = TransformerEncoer( key_size, query_size, value_size, num_hiddens, norm_shape, ffn_num_input, ffn_num_hiddens, num_heads, num_layers, dropout,len(src_vocab))
decoder =TransformerDecoder(len(tgt_vocab), key_size, query_size, value_size, num_hiddens, norm_shape, ffn_num_input, ffn_num_hiddens, num_heads, num_layers, dropout)
net = dltools.EncoderDecoder(encoder, decoder)
dltools.train_seq2seq(net, train_iter, lr, num_epochs, tgt_vocab, device)

AcceleratorError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [40]:
print("len(src_vocab):", len(src_vocab))
print("len(tgt_vocab):", len(tgt_vocab))

for name, vocab in [("src_vocab", src_vocab), ("tgt_vocab", tgt_vocab)]:
    for token in ["<pad>", "<bos>", "<eos>", "<unk>"]:
        print(f"{name} contains {token}:", token in vocab)
        if token in vocab:
            print(f"{token} index =", vocab[token])


len(src_vocab): 184
len(tgt_vocab): 201


KeyboardInterrupt: 