In [21]:
import os
import torch
from d2l import torch as d2l

In [22]:
#@save
d2l.DATA_HUB['fra-eng'] = (d2l.DATA_URL + 'fra-eng.zip',
                           '94646ad1522d915e7b0f9296181140edcf86a4f5')

#@save
def read_data_nmt():
    """载入“英语－法语”数据集"""
    data_dir = d2l.download_extract('fra-eng')
    with open(os.path.join(data_dir, 'fra.txt'), 'r',
             encoding='utf-8') as f:
        return f.read()

Go.	Va !
Hi.	Salut !
Run!	Cours !
Run!	Courez !
Who?	Qui ?
Wow!	Ça alors !



In [23]:
#@save
def preprocess_nmt(text):
    """预处理“英语－法语”数据集"""
    def no_space(char, prev_char):
        return char in set(',.!?') and prev_char != ' '

    # 使用空格替换不间断空格
    # 使用小写字母替换大写字母
    text = text.replace('\u202f', ' ').replace('\xa0', ' ').lower()
    # 在单词和标点符号之间插入空格
    out = [' ' + char if i > 0 and no_space(char, text[i - 1]) else char
           for i, char in enumerate(text)]
    return ''.join(out)

text = preprocess_nmt(raw_text)
print(text[:80])

go .	va !
hi .	salut !
run !	cours !
run !	courez !
who ?	qui ?
wow !	ça alors !


In [24]:
#@save
def tokenize_nmt(text, num_examples=None):
    """词元化“英语－法语”数据数据集"""
    source, target = [], []
    for i, line in enumerate(text.split('\n')):
        if num_examples and i > num_examples:
            break
        parts = line.split('\t')
        if len(parts) == 2:
            source.append(parts[0].split(' '))
            target.append(parts[1].split(' '))
    return source, target

source, target = tokenize_nmt(text)
source[:6], target[:6]

([['go', '.'],
  ['hi', '.'],
  ['run', '!'],
  ['run', '!'],
  ['who', '?'],
  ['wow', '!']],
 [['va', '!'],
  ['salut', '!'],
  ['cours', '!'],
  ['courez', '!'],
  ['qui', '?'],
  ['ça', 'alors', '!']])

In [25]:
src_vocab = d2l.Vocab(source, min_freq=2,
                      reserved_tokens=['<pad>', '<bos>', '<eos>'])
len(src_vocab)

10012

In [26]:
#@save
def truncate_pad(line, num_steps, padding_token):
    """截断或填充文本序列"""
    if len(line) > num_steps:
        return line[:num_steps]  # 截断
    return line + [padding_token] * (num_steps - len(line))  # 填充

truncate_pad(src_vocab[source[0]], 10, src_vocab['<pad>'])

[3919, 80, 208, 208, 208, 208, 208, 208, 208, 208]

In [27]:
#@save
def build_array_nmt(lines, vocab, num_steps):
    """将机器翻译的文本序列转换成小批量"""
    lines = [vocab[l] for l in lines]
    lines = [l + [vocab['<eos>']] for l in lines]
    array = torch.tensor([truncate_pad(
        l, num_steps, vocab['<pad>']) for l in lines])
    valid_len = (array != vocab['<pad>']).type(torch.int32).sum(1)
    return array, valid_len

In [28]:
#@save
def load_data_nmt(batch_size, num_steps, num_examples=600):
    """返回翻译数据集的迭代器和词表"""
    text = preprocess_nmt(read_data_nmt())
    source, target = tokenize_nmt(text, num_examples)
    src_vocab = d2l.Vocab(source, min_freq=2,
                          reserved_tokens=['<pad>', '<bos>', '<eos>'])
    tgt_vocab = d2l.Vocab(target, min_freq=2,
                          reserved_tokens=['<pad>', '<bos>', '<eos>'])
    src_array, src_valid_len = build_array_nmt(source, src_vocab, num_steps)
    tgt_array, tgt_valid_len = build_array_nmt(target, tgt_vocab, num_steps)
    data_arrays = (src_array, src_valid_len, tgt_array, tgt_valid_len)
    data_iter = d2l.load_array(data_arrays, batch_size)
    return data_iter, src_vocab, tgt_vocab

In [29]:
train_iter, src_vocab, tgt_vocab = load_data_nmt(batch_size=2, num_steps=8)
for X, X_valid_len, Y, Y_valid_len in train_iter:
    print('X:', X.type(torch.int32))
    print('X的有效长度:', X_valid_len)
    print('Y:', Y.type(torch.int32))
    print('Y的有效长度:', Y_valid_len)
    break

X: tensor([[ 59, 114,   2,   4,   5,   5,   5,   5],
        [141, 164,   0,   4,   5,   5,   5,   5]], dtype=torch.int32)
X的有效长度: tensor([4, 4])
Y: tensor([[  6,   2,   4,   5,   5,   5,   5,   5],
        [  6, 140,  83,   0,   4,   5,   5,   5]], dtype=torch.int32)
Y的有效长度: tensor([3, 5])


In [30]:
import collections
import math
import torch
from torch import nn
from d2l import torch as d2l

In [44]:
class MyEncoder(d2l.Encoder):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super(MyEncoder, self).__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size, num_hiddens, num_layers, bidirectional=True, batch_first=True,
                          dropout=dropout)
    def forward(self, X, *args):
        # 输出'X'的形状：(batch_size,num_steps,embed_size)
        X = self.embedding(X)
        # print('X',X.shape)
        # 在循环神经网络模型中，第一个轴对应于时间步
        # 如果未提及状态，则默认为0
        output, enc_hidden = self.rnn(X)
        # output的形状:(batch_size,num_steps,2*num_hiddens)，两个方向的隐藏状态
        # state的形状:(2,batch_size,num_hiddens)
        # print('output',output.shape)
        # print('enc_hidden',enc_hidden.shape)
        return output, enc_hidden

In [33]:
encoder = MyEncoder(vocab_size=10, embed_size=8, num_hiddens=16,
                         num_layers=1)
encoder.eval()
X = torch.zeros((4, 7), dtype=torch.long)
output, state = encoder(X)
output.shape

output torch.Size([4, 7, 32])
enc_hidden torch.Size([2, 4, 16])


torch.Size([4, 7, 32])

In [57]:
import torch.nn.functional as F
from torch.autograd import Variable
class MyDecoder(d2l.Decoder):
    """用于序列到序列学习的循环神经网络解码器"""
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super(MyDecoder, self).__init__(**kwargs)
        self.num_hiddens = num_hiddens
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size + 2*num_hiddens, num_hiddens, num_layers,bias=False,batch_first=True,
                          dropout=dropout)
        self.fc = nn.Linear(num_hiddens,num_hiddens,bias=False)
        self.fc1 = nn.Linear(2*num_hiddens, num_hiddens, bias=False)
        self.fc2 = nn.Linear(num_hiddens, num_hiddens, bias=False)
        self.fc3 = nn.Linear(num_hiddens,1)

        self.attention_fc = nn.Linear(3 * num_hiddens, 1)
        self.dense = nn.Linear(num_hiddens, vocab_size)

    def init_state(self, enc_outputs, *args):
        return enc_outputs[1]

    def attention(self,state, enc_hidden):
        # enc_hidden.shape:batch_size,num_steps,2*num_hiddens ; state.shape:batch_size,1,num_hiddens
        state = state.repeat(1,enc_hidden.shape[1],1) #shape: batch_size,num_steps,num_hiddens

        e = self.fc3(F.tanh(self.fc2(state)+self.fc1(enc_hidden)))
        
        alpha = F.softmax(e,dim=1) # attention_weights shape:batch_size,1,
        atten_output = torch.sum(alpha*enc_hidden,dim=1)
        # atten_output.shape:batch_size,2*num_hiddens
        return atten_output
    def forward(self, Y, enc_hidden):
        # enc_hidden.shape:(batch_size,num_steps,2*num_hiddens) , Y.shape:(batch_size,num_steps)
        # 输出'Y'的形状：(batch_size,num_steps,embed_size)
        Y = self.embedding(Y)
        # state.shape=(batch_size,num_layers,num_hiddens)
        state = F.tanh(self.fc(enc_hidden[:,:1,:self.num_hiddens])).reshape(Y.shape[0],1,-1)
        self.atten_outputs = Variable(torch.zeros(Y.shape[0],
                                                  Y.shape[1],
                                                  enc_hidden.shape[2])).cuda()  # batch_ size*num_steps*(2*num_hidden)
        self.dec_outputs = Variable(torch.zeros(Y.shape[0],
                                                Y.shape[1],
                                                state.shape[2])).cuda() # batch_ size*num_steps*num_hidden
        for i in range(Y.shape[1]):
            # input_embedding.shape:(batch_size,1,embed_size)
            input_embedding = Y[:, i, :].unsqueeze(1)
            # atten_output.shape:(batch_size,2*num_hiddens)
            atten_output = self.attention(state,enc_hidden)
            self.atten_outputs[:,i,:] = atten_output.squeeze()
            # dec_gru_input.shape:(batch_size,1,embed_size+2*num_hiddens)
            dec_gru_input = torch.cat([input_embedding,atten_output.unsqueeze(1)],dim=2)
            # dec_output.shape:(batch_size,1,embed_size + 2*num_hiddens), state.shape:(batch_size,num_layers,num_hiddens)
            dec_output,state = self.rnn(dec_gru_input,state.permute(1,0,2))
            state = state.permute(1,0,2)            
            self.dec_outputs[:,i] = dec_output.squeeze()
        # output的形状:(batch_size,num_steps,vocab_size)
        output = self.dense(self.dec_outputs)
        return output,state

In [58]:
#@save
def sequence_mask(X, valid_len, value=0):
    """在序列中屏蔽不相关的项"""
    maxlen = X.size(1)
    mask = torch.arange((maxlen), dtype=torch.float32,
                        device=X.device)[None, :] < valid_len[:, None]
    X[~mask] = value
    return X

X = torch.tensor([[1, 2, 3], [4, 5, 6]])
sequence_mask(X, torch.tensor([1, 2]))

tensor([[1, 0, 0],
        [4, 5, 0]])

In [59]:
#@save
class MaskedSoftmaxCELoss(nn.CrossEntropyLoss):
    """带遮蔽的softmax交叉熵损失函数"""
    # pred的形状：(batch_size,num_steps,vocab_size)
    # label的形状：(batch_size,num_steps)
    # valid_len的形状：(batch_size,)
    def forward(self, pred, label, valid_len):
        weights = torch.ones_like(label)
        weights = sequence_mask(weights, valid_len)
        self.reduction='none'
        unweighted_loss = super(MaskedSoftmaxCELoss, self).forward(
            pred.permute(0, 2, 1), label)
        weighted_loss = (unweighted_loss * weights).mean(dim=1)
        return weighted_loss

In [64]:
#@save
def train_seq2seq(net, data_iter, lr, num_epochs, tgt_vocab, device):
    """训练序列到序列模型"""
    def xavier_init_weights(m):
        if type(m) == nn.Linear:
            nn.init.xavier_uniform_(m.weight)
        if type(m) == nn.GRU:
            for param in m._flat_weights_names:
                if "weight" in param:
                    nn.init.xavier_uniform_(m._parameters[param])

    net.apply(xavier_init_weights)
    net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    loss = MaskedSoftmaxCELoss()
    net.train()
    # animator = d2l.Animator(xlabel='epoch', ylabel='loss',
    #                  xlim=[10, num_epochs])
    for epoch in range(num_epochs):
        timer = d2l.Timer()
        # metric = d2l.Accumulator(2)  # 训练损失总和，词元数量
        for batch in data_iter:
            optimizer.zero_grad()
            X, X_valid_len, Y, Y_valid_len = [x.to(device) for x in batch]

            bos = torch.tensor([tgt_vocab['<bos>']] * Y.shape[0],
                          device=device).reshape(-1, 1)
            dec_input = torch.cat([bos, Y[:, :-1]], 1)  # 强制教学
            Y_hat = net(X, dec_input) #Y_hat,_ = net(X, dec_input,X_valid_len)
            l = loss(Y_hat, Y, Y_valid_len)
            l.sum().backward()      # 损失函数的标量进行“反向传播”
            # d2l.grad_clipping(net, 1)
            num_tokens = Y_valid_len.sum()
            optimizer.step()
        print(f'epoch {epoch + 1}, loss {l.sum() / num_tokens:.3f}')

In [67]:
embed_size, num_hiddens, num_layers, dropout = 32, 32, 1, 0.1
batch_size, num_steps = 64, 10
lr, num_epochs, device = 0.005, 300, d2l.try_gpu()

train_iter, src_vocab, tgt_vocab = d2l.load_data_nmt(batch_size, num_steps)
encoder = MyEncoder(len(src_vocab), embed_size, num_hiddens, num_layers,
                        dropout)
decoder = MyDecoder(len(tgt_vocab), embed_size, num_hiddens, num_layers,
                        dropout)
net = d2l.EncoderDecoder(encoder, decoder)
train_seq2seq(net, train_iter, lr, num_epochs, tgt_vocab, device)

epoch 1, loss 0.391
epoch 2, loss 0.310
epoch 3, loss 0.286
epoch 4, loss 0.252
epoch 5, loss 0.263
epoch 6, loss 0.244
epoch 7, loss 0.236
epoch 8, loss 0.219
epoch 9, loss 0.193
epoch 10, loss 0.189
epoch 11, loss 0.178
epoch 12, loss 0.200
epoch 13, loss 0.144
epoch 14, loss 0.149
epoch 15, loss 0.147
epoch 16, loss 0.139
epoch 17, loss 0.129
epoch 18, loss 0.124
epoch 19, loss 0.113
epoch 20, loss 0.114
epoch 21, loss 0.102
epoch 22, loss 0.092
epoch 23, loss 0.111
epoch 24, loss 0.067
epoch 25, loss 0.077
epoch 26, loss 0.078
epoch 27, loss 0.084
epoch 28, loss 0.060
epoch 29, loss 0.054
epoch 30, loss 0.066
epoch 31, loss 0.061
epoch 32, loss 0.064
epoch 33, loss 0.063
epoch 34, loss 0.060
epoch 35, loss 0.056
epoch 36, loss 0.058
epoch 37, loss 0.055
epoch 38, loss 0.042
epoch 39, loss 0.044
epoch 40, loss 0.036
epoch 41, loss 0.049
epoch 42, loss 0.045
epoch 43, loss 0.030
epoch 44, loss 0.038
epoch 45, loss 0.046
epoch 46, loss 0.037
epoch 47, loss 0.034
epoch 48, loss 0.036
e

In [80]:
#@save
def predict_seq2seq(net, src_sentence, src_vocab, tgt_vocab, num_steps,
                    device, save_attention_weights=False):
    """序列到序列模型的预测"""
    # 在预测时将net设置为评估模式
    net.eval()
    src_tokens = src_vocab[src_sentence.lower().split(' ')] + [
        src_vocab['<eos>']]
    enc_valid_len = torch.tensor([len(src_tokens)], device=device)
    src_tokens = d2l.truncate_pad(src_tokens, num_steps, src_vocab['<pad>'])
    # 添加批量轴
    enc_X = torch.unsqueeze(
        torch.tensor(src_tokens, dtype=torch.long, device=device), dim=0)
    enc_outputs,enc_hiddens = net.encoder(enc_X, enc_valid_len)
    # dec_state = net.decoder.init_state(enc_outputs, enc_valid_len)
    dec_state = F.tanh(net.decoder.fc(enc_outputs[:,:1,:num_hiddens]))
    # 添加批量轴
    dec_X = torch.unsqueeze(torch.tensor(
        [tgt_vocab['<bos>']], dtype=torch.long, device=device), dim=0)
    output_seq, attention_weight_seq = [], []
    for _ in range(num_steps):
        print(dec_X.shape,enc_outputs.shape)
        Y, dec_state = net.decoder(dec_X, enc_outputs)
        # dec_state = dec_state.permute(1,0,2)
        # 我们使用具有预测最高可能性的词元，作为解码器在下一时间步的输入
        dec_X = Y.argmax(dim=2)
        pred = dec_X.squeeze(dim=0).type(torch.int32).item()
        # 保存注意力权重（稍后讨论）
        if save_attention_weights:
            attention_weight_seq.append(net.decoder.attention_weights)
        # 一旦序列结束词元被预测，输出序列的生成就完成了
        if pred == tgt_vocab['<eos>']:
            break
        output_seq.append(pred)
    return ' '.join(tgt_vocab.to_tokens(output_seq)), attention_weight_seq

In [104]:
#@save
def predict_seq2seq(net, src_sentence, src_vocab, tgt_vocab, num_steps,
                    device, save_attention_weights=False):
    """序列到序列模型的预测"""
    # 在预测时将net设置为评估模式
    net.eval()
    src_tokens = src_vocab[src_sentence.lower().split(' ')] + [
        src_vocab['<eos>']]
    enc_valid_len = torch.tensor([len(src_tokens)], device=device)
    src_tokens = d2l.truncate_pad(src_tokens, num_steps, src_vocab['<pad>'])
    # 添加批量轴
    enc_X = torch.unsqueeze(
        torch.tensor(src_tokens, dtype=torch.long, device=device), dim=0)
    enc_outputs,enc_hiddens = net.encoder(enc_X, enc_valid_len)
    # dec_state = net.decoder.init_state(enc_outputs, enc_valid_len)
    dec_state = F.tanh(net.decoder.fc(enc_outputs[:,:1,:num_hiddens])).reshape(1,1,-1)
    print(dec_state.shape)
    # 添加批量轴
    dec_X = torch.unsqueeze(torch.tensor(
        [tgt_vocab['<bos>']], dtype=torch.long, device=device), dim=0)
    output_seq, attention_weight_seq = [], []
    input_embedding = net.decoder.embedding(dec_X)
    for _ in range(num_steps):
        atten_output = net.decoder.attention(dec_state,enc_outputs)
        dec_gru_input = torch.cat([input_embedding,atten_output.unsqueeze(1)],dim=2)
        dec_output,dec_state = net.decoder.rnn(dec_gru_input,dec_state.permute(1,0,2))
        dec_state = dec_state.permute(1,0,2)
        pred = dec_output.argmax(dim=2).squeeze(dim=0).type(torch.int32).item()
        output_seq.append(pred)
        # 一旦序列结束词元被预测，输出序列的生成就完成了
        if pred == tgt_vocab['<eos>']:
            break
    # 保存注意力权重（稍后讨论）
    if save_attention_weights:
        attention_weight_seq = net.decoder.atten_outputs
    return ' '.join(tgt_vocab.to_tokens(output_seq)), attention_weight_seq

In [105]:
def bleu(pred_seq, label_seq, k):  #@save
    """计算BLEU"""
    pred_tokens, label_tokens = pred_seq.split(' '), label_seq.split(' ')
    len_pred, len_label = len(pred_tokens), len(label_tokens)
    score = math.exp(min(0, 1 - len_label / len_pred))
    for n in range(1, k + 1):
        num_matches, label_subs = 0, collections.defaultdict(int)
        for i in range(len_label - n + 1):
            label_subs[' '.join(label_tokens[i: i + n])] += 1
        for i in range(len_pred - n + 1):
            if label_subs[' '.join(pred_tokens[i: i + n])] > 0:
                num_matches += 1
                label_subs[' '.join(pred_tokens[i: i + n])] -= 1
        score *= math.pow(num_matches / (len_pred - n + 1), math.pow(0.5, n))
    return score

In [106]:
engs = ['go .', "i lost .", 'he\'s calm .', 'i\'m home .']
fras = ['va !', 'j\'ai perdu .', 'il est calme .', 'je suis chez moi .']
for eng, fra in zip(engs, fras):
    translation, attention_weight_seq = predict_seq2seq(
        net, eng, src_vocab, tgt_vocab, num_steps, device)
    print(f'{eng} => {translation}, bleu {bleu(translation, fra, k=2):.3f}')

torch.Size([1, 1, 32])
go . => bizarre <eos>, bleu 0.000
torch.Size([1, 1, 32])
i lost . => avons <eos>, bleu 0.000
torch.Size([1, 1, 32])
he's calm . => <unk> allons-y <eos>, bleu 0.000
torch.Size([1, 1, 32])
i'm home . => <unk> <eos>, bleu 0.000
