In [1]:
#动手练习：Seq2Seq实现机器翻译

In [3]:
import os
import sys
import math
from collections import Counter#计数统计（统计元素出现次数）
import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import nltk#import nltk

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [9]:
#数据集加载处理
def load_data(in_file):
    cn = []#存储中文句子
    en = []#存储英文句子
    num_examples = 0
    with open(in_file, 'r', encoding='utf8') as f:## 以UTF-8编码打开文件，确保能正确处理中文字符
        for line in f:
            line = line.strip().split('\t')
            #文件格式：英文句子\t中文句子
            en.append(['BOS'] + nltk.word_tokenize(line[0].lower()) + ['EOS'])#lower小写 word_tokenize分词
            cn.append(['BOS'] + [c for c in line[1]] + ['EOS'])
    
    return en, cn

train_file = './train.txt'
dev_file = './test.txt'
train_en, train_cn = load_data(train_file)
dev_en, dev_cn = load_data(dev_file)

In [11]:
#构建单词表函数
UNK_IDX = 0#处理为在词汇表中的词
PAD_IDX = 1#用于将序列填充到相同长度

def build_dict(sentences, max_words = 50000):
    word_count = Counter()
    for sentence in sentences:
        for word in sentence:
            word_count[word] += 1
    
    ls = word_count.most_common(max_words)#获取出现频率最高的前max_words词
    total_words = len(ls) + 2
    
    word_dict = {w[0] : index + 2 for index, w in enumerate(ls)}
    word_dict['UNK'] = UNK_IDX
    word_dict['PAD'] = PAD_IDX
    
    return word_dict, total_words

en_dict, en_total_words = build_dict(train_en)#构建英文词典
cn_dict, cn_total_words = build_dict(train_cn)#构建中文词典
inv_en_dict = {v: k for k, v in en_dict.items()}
inv_cn_dict = {v: k for k, v in cn_dict.items()}

In [12]:
#将单词全部转为数字
def encode(en_sentences, cn_sentences, en_dict, cn_dict, sort_by_len=True):#sort_by_len设置为True是为了使一个batch中的句子长度差不多
    length = len(en_sentences)
    out_en_sentences = [[en_dict.get(w, 0) for w in sent] for sent in en_sentences]
    out_cn_sentences = [[cn_dict.get(w, 0) for w in sent] for sent in cn_sentences]
#按长度排序的辅助函数
    def len_argsort(seq):
        return sorted(range(len(seq)), key=lambda x: len(seq[x]))

    #顺序排序
    if sort_by_len:
        sorted_index = len_argsort(out_en_sentences)
        out_en_sentences = [out_en_sentences[i] for i in sorted_index]
        out_cn_sentences = [out_cn_sentences[i] for i in sorted_index]
    
    return out_en_sentences, out_cn_sentences

train_en, train_cn = encode(train_en, train_cn, en_dict, cn_dict)
dev_en, dev_cn = encode(dev_en, dev_cn, en_dict, cn_dict)

In [14]:
#创建批次索引 将几个句子作为一批
def get_minibatches(n, minibatch_size, shuffle=True):
    idx_list = np.arange(0, n, minibatch_size)
    if shuffle:
        np.random.shuffle(idx_list)
    minibatches = []
    for idx in idx_list:
        minibatches.append(np.arange(idx, min(idx + minibatch_size, n)))
    return minibatches#输出为批次
#数据填充对齐
def prepare_data(seqs):   
    lengths = [len(seq) for seq in seqs]
    n_samples = len(seqs)
    max_len = np.max(lengths)

    x = np.zeros((n_samples, max_len)).astype('int32')
    x_lengths = np.array(lengths).astype('int32')

    for idx, seq in enumerate(seqs):
        x[idx, :lengths[idx]] = seq
    
    return x, x_lengths

def gen_examples(en_sentences, cn_sentences, batch_size):
    minibatches = get_minibatches(len(en_sentences), batch_size)    
    all_ex = []
    for minibatch in minibatches:
        mb_en_sentences = [en_sentences[t] for t in minibatch]
        mb_cn_sentences = [cn_sentences[t] for t in minibatch]
        mb_x, mb_x_len = prepare_data(mb_en_sentences)
        mb_y, mb_y_len = prepare_data(mb_cn_sentences)
        all_ex.append((mb_x, mb_x_len, mb_y, mb_y_len))
    
    return all_ex   

batch_size = 64
train_data = gen_examples(train_en, train_cn, batch_size)
dev_data = gen_examples(dev_en, dev_cn, batch_size)

In [15]:
#搭建网络模型

In [18]:
#设置损失函数
class LanguageModelCriterion(nn.Module):
    def __init__(self):
        super(LanguageModelCriterion, self).__init__()

    def forward(self, input, target, mask):
        input = input.contiguous().view(-1, input.size(2))
        target = target.contiguous().view(-1, 1)
        mask = mask.contiguous().view(-1, 1)
        output = -input.gather(1, target) * mask#计算负对数似然
        output = torch.sum(output) / torch.sum(mask)
        
        return output

In [20]:
class PlainEncoder(nn.Module):
    def __init__(self, vocab_size, hidden_size, dropout=0.2):
        super(PlainEncoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)#词嵌入层
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)#GRU层/RNN
        self.dropout = nn.Dropout(dropout)#dropout
        
    def forward(self, x, lengths):
        sorted_len, sorted_idx = lengths.sort(0, descending=True)#序列按长度降序排列
        x_sorted = x[sorted_idx.long()]#返回排序后的长度和对应的索引
        embedded = self.dropout(self.embed(x_sorted))
        
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, sorted_len.long().cpu().data.numpy(), batch_first=True)
        packed_out, hidden = self.rnn(packed_embedded)
        out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
        
        _, original_idx = sorted_idx.sort(0, descending=False)
        
        out = out[original_idx.long()].contiguous()
        hidden = hidden[:, original_idx.long()].contiguous()
        
        return out, hidden[[-1]]

#测试维度
p = PlainEncoder(en_total_words, 100)

mb_x = torch.from_numpy(train_data[0][0]).long()
mb_x_len = torch.from_numpy(train_data[0][1]).long()
print("数据集:", mb_x.shape, mb_x_len.shape)

o, h = p(mb_x, mb_x_len)

print(o.shape, h.shape)
print(o[:, -1].shape, '\n', o[:, -1] == h)

class PlainDecoder(nn.Module):
    def __init__(self, vocab_size, hidden_size, dropout=0.2):
        super(PlainDecoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, y, y_lengths, hid):
        sorted_len, sorted_idx = y_lengths.sort(0, descending=True)
        y_sorted = y[sorted_idx.long()]
        hid = hid[:, sorted_idx.long()]
        
        y_sorted = self.dropout(self.embed(y_sorted))
        
        packed_seq = nn.utils.rnn.pack_padded_sequence(y_sorted, sorted_len.long().cpu().data.numpy(), batch_first=True)
        out, hid = self.rnn(packed_seq, hid)
        unpacked, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
        
        _, original_idx = sorted_idx.sort(0, descending=False)
        output_seq = unpacked[original_idx.long()].contiguous()
        hid = hid[:, original_idx.long()].contiguous()
        
        output = F.log_softmax(self.fc(output_seq), -1)
        
        return output, hid

数据集: torch.Size([64, 8]) torch.Size([64])
torch.Size([64, 8, 100]) torch.Size([1, 64, 100])
torch.Size([64, 100]) 
 tensor([[[True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         ...,
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True]]])


In [21]:
class PlainSeq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(PlainSeq2Seq, self).__init__()
        self.encoder = encoder 
        self.decoder = decoder
        
    def forward(self, x, x_lengths, y, y_lengths):
        encoder_cut, hid = self.encoder(x, x_lengths)
        output, hid = self.decoder(y, y_lengths, hid)
            
        return output, None
    
    
    def translate(self, x, x_lengths, y, max_length=10):
        encoder_cut, hid = self.encoder(x, x_lengths)
        preds = []
        batch_size = x.shape[0]
        attns = []
        
        for i in range(max_length):
            output, hid = self.decoder(y=y, y_lengths=torch.ones(batch_size).long().to(device), hid=hid)

            y = output.max(2)[1].view(batch_size, 1) 
            preds.append(y)

        return torch.cat(preds, 1), None

dropout = 0.2
hidden_size = 100
encode = PlainEncoder(vocab_size=en_total_words, hidden_size=hidden_size, dropout=dropout)
decoder = PlainDecoder(vocab_size=cn_total_words, hidden_size=hidden_size, dropout=dropout)

model = PlainSeq2Seq(encode, decoder)
model = model.to(device)

loss_fn = LanguageModelCriterion().to(device)
optimizer = torch.optim.Adam(model.parameters())

In [32]:
def train(model, data, num_epochs=1000):
    for epoch in range(num_epochs):
        model.train()
        total_num_words = total_loss = 0.
        for it, (mb_x, mb_x_len, mb_y, mb_y_len) in enumerate(data):
            mb_x = torch.from_numpy(mb_x).to(device).long()
            mb_x_len = torch.from_numpy(mb_x_len).to(device).long()
            
            mb_input = torch.from_numpy(mb_y[:, :-1]).to(device).long()
            mb_output = torch.from_numpy(mb_y[:, 1:]).to(device).long()
            
            mb_y_len = torch.from_numpy(mb_y_len - 1).to(device).long()
            mb_y_len[mb_y_len <= 0] = 1
            
            mb_pred, attn = model(mb_x, mb_x_len, mb_input, mb_y_len)
            
            mb_out_mask = torch.arange(mb_y_len.max().item(), device=device)[None, :] < mb_y_len[:, None]
            mb_out_mask = mb_out_mask.float()
            
            loss = loss_fn(mb_pred, mb_output, mb_out_mask)
            
            num_words = torch.sum(mb_y_len).item()
            total_loss += loss.item() * num_words
            total_num_words += num_words
            
            #更新模型
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.)
            optimizer.step()
            
            if it % 100 == 0:
                print("迭代次数: ", epoch, '迭代', it, 'loss:', loss.item())
                   
        print("迭代次数", epoch, "训练损失", total_loss / total_num_words)
        
        if epoch % 5 == 0:
            evaluate(model, dev_data)
    
    torch.save(model.state_dict(), 'model.pt')

def evaluate(model, data):
    model.eval()
    total_num_words = total_loss = 0.
    
    with torch.no_grad():
        
        for it, (mb_x, mb_x_len, mb_y, mb_y_len) in enumerate(data):
            mb_x = torch.from_numpy(mb_x).to(device).long()
            mb_x_len = torch.from_numpy(mb_x_len).to(device).long()
            mb_input = torch.from_numpy(mb_y[:, :-1]).to(device).long()
            mb_output = torch.from_numpy(mb_y[:, 1:]).to(device).long()
            mb_y_len = torch.from_numpy(mb_y_len-1).to(device).long()
            mb_y_len[mb_y_len<=0] = 1

            mb_pred, attn = model(mb_x, mb_x_len, mb_input, mb_y_len)
            mb_out_mask = torch.arange(mb_y_len.max().item(), device=device)[None, :] < mb_y_len[:, None]
            mb_out_mask = mb_out_mask.float()

            loss = loss_fn(mb_pred, mb_output, mb_out_mask)
            num_words = torch.sum(mb_y_len).item()
            total_loss += loss.item() * num_words
            total_num_words += num_words
            
    print("损失评估", total_loss / total_num_words)

train(model, train_data, num_epochs=10)

def translate_dev(i):
    en_sent = " ".join([inv_en_dict[w] for w in dev_en[i]])
    print(en_sent)
    cn_sent = " ".join([inv_cn_dict[w] for w in dev_cn[i]])
    print("".join(cn_sent))

    mb_x = torch.from_numpy(np.array(dev_en[i]).reshape(1, -1)).long().to(device)
    mb_x_len = torch.from_numpy(np.array([len(dev_en[i])])).long().to(device)
    bos = torch.Tensor([[cn_dict["BOS"]]]).long().to(device)
    
    translation, attn = model.translate(mb_x, mb_x_len, bos)
    translation = [inv_cn_dict[i] for i in translation.data.cpu().numpy().reshape(-1)]
    trans = []
    for word in translation:
        if word != "EOS":
            trans.append(word)
        else:
            break
    print("".join(trans))

迭代次数:  0 迭代 0 loss: 1.4076074361801147
迭代次数:  0 迭代 100 loss: 1.5525585412979126
迭代次数:  0 迭代 200 loss: 2.25791072845459
迭代次数:  0 迭代 300 loss: 1.7978249788284302
迭代次数:  0 迭代 400 loss: 1.2585357427597046
迭代次数:  0 迭代 500 loss: 1.8542312383651733
迭代次数 0 训练损失 1.81715430173393
损失评估 1.5388896465301514
迭代次数:  1 迭代 0 loss: 1.3545321226119995
迭代次数:  1 迭代 100 loss: 1.5669461488723755
迭代次数:  1 迭代 200 loss: 2.2932708263397217
迭代次数:  1 迭代 300 loss: 1.815649390220642
迭代次数:  1 迭代 400 loss: 1.2158253192901611
迭代次数:  1 迭代 500 loss: 1.8196275234222412
迭代次数 1 训练损失 1.7961463607834547
迭代次数:  2 迭代 0 loss: 1.360666275024414
迭代次数:  2 迭代 100 loss: 1.5364938974380493
迭代次数:  2 迭代 200 loss: 2.2342727184295654
迭代次数:  2 迭代 300 loss: 1.790238857269287
迭代次数:  2 迭代 400 loss: 1.224421501159668
迭代次数:  2 迭代 500 loss: 1.7448128461837769
迭代次数 2 训练损失 1.780696681307134
迭代次数:  3 迭代 0 loss: 1.3125011920928955
迭代次数:  3 迭代 100 loss: 1.5186328887939453
迭代次数:  3 迭代 200 loss: 2.213425636291504
迭代次数:  3 迭代 300 loss: 1.8208574056625366

In [36]:
#导入训练好模型
model.load_state_dict(torch.load('model.pt', map_location=device))
for i in range(0, 6):
    translate_dev(i)
    print()

BOS classes begin tomorrow . EOS
BOS 课 程 明 天 开 始 。 EOS
明天上课。

BOS choose what we like . EOS
BOS 选 我 们 喜 欢 的 。 EOS
服我们想知道。

BOS choose one you like . EOS
BOS 选 一 个 你 喜 欢 的 。 EOS
选你喜欢的。

BOS i want to eat candy . EOS
BOS 我 想 吃 糖 。 EOS
我想要一些甜的东西。

BOS i was n't busy tomorrow . EOS
BOS 我 明 天 不 忙 。 EOS
我不明天是不懂。

BOS do n't you speak chinese ? EOS
BOS 你 会 说 中 文 吗 ? EOS
你不会游泳吗？

