In [1]:
import os
import sys
import tqdm
import math
from collections import Counter
import numpy as np

import torch
import torch.nn as nn

import nltk

In [2]:
def load_data(in_file):
    cn = []
    en = []
    num_examples = 0
    with open(in_file, 'r') as f:
        for line in f:
            line = line.strip().split("\t")
            
            en.append(["BOS"] + nltk.word_tokenize(line[0]) + ["EOS"])
            # split chinese sentence into characters
            cn.append(["BOS"] + [c for c in line[1]] + ["EOS"])
    return en, cn

train_file = "nmt/en-cn/train.txt"
dev_file = "nmt/en-cn/dev.txt"
train_en, train_cn = load_data(train_file)
dev_en, dev_cn = load_data(dev_file)

构建单词表

In [3]:
def build_dict(sentences, max_words=50000):
    word_count = Counter()
    for sentence in sentences:
        for s in sentence:
            word_count[s] += 1
    ls = word_count.most_common(max_words)
    total_words = len(ls) + 1
    word_dict = {w[0]: index+1 for index, w in enumerate(ls)}
    word_dict["UNK"] = 0
    return word_dict, total_words

en_dict, en_total_words = build_dict(train_en)
cn_dict, cn_total_words = build_dict(train_cn)
inv_en_dict = {v: k for k, v in en_dict.items()}
inv_cn_dict = {v: k for k, v in cn_dict.items()}

把单词全部转变成数字

In [4]:
def encode(en_sentences, cn_sentences, en_dict, cn_dict, sort_by_len=True):
    '''
        Encode the sequences. 
    '''
    length = len(en_sentences)
    out_en_sentences = [[en_dict.get(w, 0) for w in sent] for sent in en_sentences]
    out_cn_sentences = [[en_dict.get(w, 0) for w in sent] for sent in cn_sentences]

    # sort sentences by english lengths
    def len_argsort(seq):
        return sorted(range(len(seq)), key=lambda x: len(seq[x]))
       
    if sort_by_len:
        sorted_index = len_argsort(out_en_sentences)
        out_en_sentences = [out_en_sentences[i] for i in sorted_index]
        out_cn_sentences = [out_cn_sentences[i] for i in sorted_index]
        
    return out_en_sentences, out_cn_sentences

train_en, train_cn = encode(train_en, train_cn, en_dict, cn_dict)
dev_en, dev_cn = encode(dev_en, dev_cn, en_dict, cn_dict)

把全部句子分成batch

In [5]:
def get_minibatches(n, minibatch_size, shuffle=False):
    idx_list = np.arange(0, n, minibatch_size)
    if shuffle:
        np.random.shuffle(idx_list)
    minibatches = []
    for idx in idx_list:
        minibatches.append(np.arange(idx, min(idx + minibatch_size, n)))
    return minibatches

def prepare_data(seqs):
    lengths = [len(seq) for seq in seqs]
    n_samples = len(seqs)
    max_len = np.max(lengths)

    x = np.zeros((n_samples, max_len)).astype('int32')
#     x_mask = np.zeros((n_samples, max_len)).astype('float32')
    x_lengths = np.array(lengths).astype("int32")
    for idx, seq in enumerate(seqs):
        x[idx, :lengths[idx]] = seq
#         x_mask[idx, :lengths[idx]] = 1.0
    return x, x_lengths #x_mask

def gen_examples(en_sentences, cn_sentences, batch_size):
    minibatches = get_minibatches(len(en_sentences), batch_size)
    all_ex = []
    for minibatch in minibatches:
        mb_en_sentences = [en_sentences[t] for t in minibatch]
        mb_cn_sentences = [cn_sentences[t] for t in minibatch]
        mb_x, mb_x_len = prepare_data(mb_en_sentences)
        mb_y, mb_y_len = prepare_data(mb_cn_sentences)
        all_ex.append((mb_x, mb_x_len, mb_y, mb_y_len))
    return all_ex

batch_size = 64
train_data = gen_examples(train_en, train_cn, batch_size)
dev_data = gen_examples(dev_en, dev_cn, batch_size)

数据全部处理完成，现在我们开始构建seq2seq模型

In [6]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super(Encoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size, hidden_size, batch_first=True, bidirectional=True)

    def forward(self, x, lengths):
        sorted_len, sorted_idx = lengths.sort(0, descending=True)
        x_sorted = x[sorted_idx.long()]
        x_sorted = self.embed(x_sorted)
        packed_seq = nn.utils.rnn.pack_padded_sequence(x_sorted, sorted_len.long().cpu().data.numpy(), batch_first=True)
        out, hid = self.rnn(packed_seq)
        unpacked, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
        _, original_idx = sorted_idx.sort(0, descending=False)
        output_seq = unpacked[original_idx.long()].contiguous()
        
        return output_seq
    
    
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.linear_out = nn.Linear(hidden_size*2, hidden_size)
        
    def forward(self, output, context, mask):
        # output: batch_size, output_len, hidden_size
        # context: batch_size, context_len, hidden_size
    
        batch_size = output.size(0)
        input_len = context.size(1)
        hidden_size = output.size(2)
        
        attn = torch.bmm(output, context.transpose(1,2)) # batch_size, output_len, context_len
        attn.data.masked_fill(mask, -1e6)
#         print(attn)
        attn = F.softmax(attn, dim=2) # batch_size, output_len, context_len
#         print(attn)
        mix = torch.bmm(attn, context) # batch_size, output_len, hidden_size
        
        combined = torch.cat((mix, output), dim=2) # batch_size, output_len, hidden_size*2
        output = F.tanh(self.linear_out(combined.view(-1, 2*hidden_size))).view(batch_size, -1, hidden_size)
        return output, attn

class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super(Decoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.attention = Attention(hidden_size)
        self.rnn = nn.GRU(embed_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, vocab_size)

    def create_mask(self, x_len, y_len):
        device = x_len.device
        max_x_len = x_len.max()
        max_y_len = y_len.max()
        x_mask = torch.arange(max_x_len, device=self.device)[None, :] < x_len[:, None]
        y_mask = torch.arange(max_y_len, device=self.device)[None, :] < y_len[:, None]
        mask = (1 - x_mask[:, :, None] * y_mask[:, None, :]).byte()
        
        
    def forward(self, ctx, ctx_lengths, y, y_lengths):
        sorted_len, sorted_idx = y_lengths.sort(0, descending=True)
        print(sorted_len)
        y_sorted = y[sorted_idx.long()].contiguous()
        y_sorted = self.embed(y_sorted)
        
        packed_seq = nn.utils.rnn.pack_padded_sequence(y_sorted, sorted_len.long().cpu().data.numpy(), batch_first=True)
        out, hid = self.rnn(packed_seq)
        unpacked, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
        _, original_idx = sorted_idx.sort(0, descending=False)
        output_seq = unpacked[original_idx.long()].contiguous()
        
        mask = self.create_mask(ctx_lengths, y_lengths)
        output, attn = self.attention(y, ctx, mask)
        return output
    
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        
    def forward(self, x, x_lengths, y, y_lengths):
        encoder_outputs = self.encoder(x, x_lengths)
        output = self.decoder(ctx=encoder_outputs, 
                    ctx_lengths=x_lengths,
                    y=y,
                    y_lengths=y_lengths)
        return output

训练

In [7]:
class LanguageModelCriterion(nn.Module):
    def __init__(self):
        super(LanguageModelCriterion, self).__init__()

    def forward(self, input, target, mask):
        input = to_contiguous(input).view(-1, input.size(2))
        target = to_contiguous(target).view(-1, 1)
        mask = to_contiguous(mask).view(-1, 1)
        output = -input.gather(1, target) * mask
        output = torch.sum(output) / torch.sum(mask)

        return output

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

en_vocab_size = len(en_dict)
cn_vocab_size = len(cn_dict)
embed_size = hidden_size = 100
encoder = Encoder(vocab_size=en_vocab_size, 
                  embed_size=embed_size, 
                  hidden_size=hidden_size)
decoder = Decoder(vocab_size=cn_vocab_size, 
                  embed_size=embed_size, 
                  hidden_size=hidden_size)
model = Seq2Seq(encoder, decoder)
model = model.to(device)
crit = LanguageModelCriterion().to(device)
optimizer = torch.optim.Adam(model.parameters())


In [9]:
num_epochs = 5
total_num_words = loss = 0.
for epoch in range(num_epochs):
    for it, (mb_x, mb_x_lengths, mb_y, mb_y_lengths) in enumerate(train_data):
#         break
        mb_x = torch.from_numpy(mb_x).long().to(device)
        mb_x_lengths = torch.from_numpy(mb_x_lengths).long().to(device)
        mb_input = torch.from_numpy(mb_y[:,:-1]).long().to(device)
        mb_out = torch.from_numpy(mb_y[:, 1:]).long().to(device)
        mb_y_lengths = torch.from_numpy(mb_y_lengths-1).long().to(device)
#         print(mb_input.shape)
#         print(mb_y_lengths)
#         print(mb_y_lengths.max())
        mb_y_lengths[mb_y_lengths <= 0] = 1
        
        mb_pred = model(mb_x, mb_x_lengths, mb_input, mb_y_lengths)
        
        mb_out_mask = torch.arange(mb_y_lengths, device=self.device)[None, :] < mb_y_lengths[:, None]
        loss = crit(mb_pred, mb_out, mb_out_mask)
        
        num_words = torch.sum(mb_y_lengths).item()
        total_loss += loss.item() * num_words
        total_num_words += num_words
        
        if it % 100 == 0:
            print("epoch", epoch, "iteration", it, "loss", loss.item())
            

tensor([10, 10,  9,  9,  8,  7,  7,  7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  6,
         6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,
         5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  4,  4,  4,  4,  4,
         4,  4,  4,  4,  4,  4,  4,  4,  4,  3], device='cuda:0')


RuntimeError: CUDA error: device-side assert triggered

In [None]:
mb_y_lengths