In [5]:
import torch
import torch.nn as nn
import torch.utils.data as Data
print(f"torch-version:{torch.__version__}")

torch-version:2.0.1+cpu


In [6]:
# Toy parallel corpus (German -> English) used as the only training data.
# Special tokens:
#   S - start of sentence (first token of every decoder input)
#   E - end of sentence (last token of every decoder target)
#   P - padding; mapped to index 0 in both vocabularies
sentences = [
        # enc_input               dec_input            dec_output
        ['ich mochte ein bier P', 'S i want a beer .', 'i want a beer . E'],
        ['ich mochte ein cola P', 'S i want a coke .', 'i want a coke . E']
]

# Source-side token -> index map.
src_vocab = {'P' : 0, 'ich' : 1, 'mochte' : 2, 'ein' : 3, 'bier' : 4, 'cola' : 5}
src_vocab_size = len(src_vocab)

# Target-side token -> index map.
tgt_vocab = {'P' : 0, 'i' : 1, 'want' : 2, 'a' : 3, 'beer' : 4, 'coke' : 5, 'S' : 6, 'E' : 7, '.' : 8}
tgt_vocab_size = len(tgt_vocab)

# Target-side index -> token map. Built by inverting tgt_vocab by *value* so it
# stays correct even if the dict literal above is reordered or its indices are
# not written in 0..N-1 order (enumerate() would silently mislabel tokens then).
idx2word = {index: word for word, index in tgt_vocab.items()}

src_len = 5 # enc_input max sequence length
tgt_len = 6 # dec_input(=dec_output) max sequence length
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def MakeData(pairs=None, src_map=None, tgt_map=None):
    """Convert whitespace-tokenised sentence triples into index tensors.

    Every argument is optional; calling ``MakeData()`` with no arguments keeps
    the original behaviour of reading the module-level ``sentences``,
    ``src_vocab`` and ``tgt_vocab``.

    :param pairs: sequence of ``[enc_input, dec_input, dec_output]`` strings.
        Defaults to the module-level ``sentences``.
    :param src_map: token -> index mapping applied to ``enc_input``.
        Defaults to the module-level ``src_vocab``.
    :param tgt_map: token -> index mapping applied to ``dec_input`` and
        ``dec_output``. Defaults to the module-level ``tgt_vocab``.
    :return: ``(enc_inputs, dec_inputs, dec_outputs)`` as three
        ``torch.LongTensor`` objects with one row per entry of ``pairs``.
    :raises KeyError: if a token is missing from the corresponding mapping.

    Note: within each column every sentence must split into the same number of
    tokens (pad with ``P``), because the rows are packed into one rectangular
    tensor.
    """
    # Resolve defaults lazily so the globals are only needed when not overridden.
    if pairs is None:
        pairs = sentences
    if src_map is None:
        src_map = src_vocab
    if tgt_map is None:
        tgt_map = tgt_vocab

    enc_inputs, dec_inputs, dec_outputs = [], [], []
    for row in pairs:
        enc_inputs.append([src_map[token] for token in row[0].split()])
        dec_inputs.append([tgt_map[token] for token in row[1].split()])
        dec_outputs.append([tgt_map[token] for token in row[2].split()])

    return torch.LongTensor(enc_inputs), torch.LongTensor(dec_inputs), torch.LongTensor(dec_outputs)

# Build the encoder-input, decoder-input and decoder-target index tensors once;
# later cells read these module-level names.
enc_inputs, dec_inputs, dec_outputs = MakeData()

In [7]:
# Build the dataset
class MyDataSet(Data.Dataset):
    """Map-style dataset over three parallel, row-aligned tensors.

    Sample ``i`` is the triple made of row ``i`` of the encoder inputs, the
    decoder inputs and the decoder targets.
    """

    def __init__(self, enc_inputs, dec_inputs, dec_outputs):
        """Keep references to the three tensors; nothing is copied."""
        super().__init__()
        self.enc_inputs, self.dec_inputs, self.dec_outputs = enc_inputs, dec_inputs, dec_outputs

    def __len__(self):
        """Number of samples, taken from the first axis of the encoder inputs."""
        n_samples = self.enc_inputs.shape[0]
        return n_samples

    def __getitem__(self, item):
        """Return ``(enc_input, dec_input, dec_output)`` for index ``item``."""
        columns = (self.enc_inputs, self.dec_inputs, self.dec_outputs)
        return tuple(column[item] for column in columns)

# Serve the toy corpus in shuffled mini-batches of two samples.
loader = Data.DataLoader(
    dataset=MyDataSet(enc_inputs, dec_inputs, dec_outputs),
    batch_size=2,
    shuffle=True,
)

In [8]:
# Transformer hyperparameters
d_model = 512  # embedding size
d_ff = 2048 # feed-forward dimension
d_k = d_v = 64  # per-head dimension of K (= Q) and of V; note n_heads * d_k == d_model
n_layers = 1  # number of encoder (or decoder) layers
n_heads = 8  # number of heads in multi-head attention

In [None]:
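# Unlike an RNN, which consumes its input in sequence order, an attention-based Transformer
# has no built-in notion of token order, so position information must be encoded explicitly.
# The vanilla Transformer uses a positional embedding built from periodic (sine/cosine) functions.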
import math
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout.

    The encoding table is precomputed once for ``max_len`` positions and stored
    as the non-trainable buffer ``pe`` with shape ``[max_len, 1, d_model]``.
    Even feature columns hold ``sin(pos * w_k)`` and odd columns hold
    ``cos(pos * w_k)`` with ``w_k = 10000 ** (-2k / d_model)``.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        '''
        :param d_model: dimension of the positional embedding (may be odd or even)
        :param dropout: dropout probability applied after the encoding is added
        :param max_len: number of positions to precompute; inputs longer than
                        this cannot be encoded
        '''
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)  # encoding table, filled in below
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # [max_len, 1]
        # One frequency per (sin, cos) column pair: ceil(d_model / 2) values.
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer odd column than even column, so
        # trim the frequencies to match; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Insert a singleton batch axis so the table broadcasts over the batch.
        pe = pe.unsqueeze(1)  # [max_len, 1, d_model]
        # A buffer (not a Parameter): follows .to(device) and is saved in the
        # state dict, but is not updated by the optimizer.
        self.register_buffer('pe', pe)

    def forward(self, x):
        '''
        x: [seq_len, batch_size, d_model]
        returns: tensor of the same shape, with the encoding for positions
                 0..seq_len-1 added and dropout applied
        '''
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)