In [32]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.autograd import Variable
import copy

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [149]:
PAD_IDX = 0
SOS_IDX = 1
EOS_IDX = 2
UNK_IDX = 3


class Lang:
    def __init__(self, name):
        self.name = name
        self.trimmed = False
        self.word2index = {'<unk>': 3}
        self.word2count = {}
        self.index2word = {PAD_IDX: "<pad>", SOS_IDX: "<SOS>", EOS_IDX: "<EOS>", UNK_IDX: "<unk>"}
        self.n_words = 4  # Count PAD, UNK, SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1
            

In [150]:
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters


def normalizeString_en(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"&apos", r"", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

from zhon.hanzi import punctuation
def normalizeString_zh(s):
    punc = '＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､\u3000、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·.'
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([。！？])", r" \1", s)
    s = re.sub(r"[0-9]", r" ", s)
    s = re.sub(r"[%s]+" %punc, r" ", s)
    return s

In [151]:
def readLangs(lang1, lang2, dataset, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    with open('zh-en/%s.tok.%s' %(dataset, lang1), encoding='utf-8') as f_in:
        lines_1 = list(line for line in (l.strip() for l in f_in))
    with open('zh-en/%s.tok.%s' %(dataset, lang2), encoding='utf-8') as f_in:
        lines_2 = list(line for line in (l.strip() for l in f_in))
    #lines_zh = open('iwslt-zh-en-processed/train.%s.' % (lang1, lang2), encoding='utf-8').\
        #read().strip().split('\n')
    

    # Split every line into pairs and normalize
    #pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    if lang1 == 'en':
        norm_lines1 = [[normalizeString_en(s) for s in l.split('\t')] for l in lines_1]
        norm_lines2 = [[normalizeString_zh(s) for s in l.split('\t')] for l in lines_2]
    else:
        norm_lines1 = [[normalizeString_zh(s) for s in l.split('\t')] for l in lines_1]
        norm_lines2 = [[normalizeString_en(s) for s in l.split('\t')] for l in lines_2]
    pairs = []
    for i in range(len(norm_lines1)):
        pairs.append(norm_lines1[i] + norm_lines2[i])
        
    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [152]:
MAX_LENGTH = 100


def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [153]:
def prepareData(lang1, lang2, dataset, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, dataset, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs


train_input_lang, train_output_lang, train_pairs = prepareData('en', 'zh','train', True)
val_input_lang, val_output_lang, val_pairs = prepareData('en', 'zh', 'dev', True)
test_input_lang, test_output_lang, test_pairs = prepareData('en', 'zh', 'test', True)
#print(random.choice(pairs))

Reading lines...
Read 213377 sentence pairs
Trimmed to 212147 sentence pairs
Counting words...
Counted words:
zh 87456
en 50228
Reading lines...
Read 1261 sentence pairs
Trimmed to 1251 sentence pairs
Counting words...
Counted words:
zh 4799
en 3608
Reading lines...
Read 1397 sentence pairs
Trimmed to 1395 sentence pairs
Counting words...
Counted words:
zh 4165
en 3205


In [155]:
pairs_update = []

for pair in train_pairs:
    input_sentence = pair[0]
    output_sentence = pair[1]
    input_keep = True
    output_keep = True
    
    for word in input_sentence.split(' '):
        if word not in train_input_lang.word2index:
            input_keep = False
            break

    for word in output_sentence.split(' '):
        if word not in train_output_lang.word2index:
            output_keep = False
            break


    if input_keep and output_keep:
        pairs_update.append(pair)

print("Trimmed from %d pairs to %d, %.4f of total" % (len(train_pairs), len(pairs_update), len(pairs_update) / len(train_pairs)))
train_pairs = pairs_update

Trimmed from 212147 pairs to 130308, 0.6142 of total


In [156]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] if word in lang.word2index.keys() else UNK_IDX for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_IDX)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [157]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [158]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np
%matplotlib inline

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [159]:
##dataloader
import numpy as np
import torch
from torch.utils.data import Dataset

class MTDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """
    
    def __init__(self, input_lang, output_lang, pairs):
        """
        @param candiate_list: list of candidate sentence
        @param reference_list: list of reference sentence

        """
        self.pairs = pairs
        self.input_lang = input_lang
        self.output_lang = output_lang
        self.candidate_list = [indexesFromSentence(self.input_lang, pair[0]) for pair in pairs]
        self.reference_list = [indexesFromSentence(self.output_lang, pair[1]) for pair in pairs]
        assert (len(self.candidate_list) == len(self.reference_list))

    def __len__(self):
        return len(self.pairs)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        
        candidate_idx = self.candidate_list[key][:MAX_LENGTH]
        reference_idx = self.reference_list[key][:MAX_LENGTH]
        candidate_idx.append(EOS_IDX)
        reference_idx.append(EOS_IDX)
        return [candidate_idx, len(candidate_idx), reference_idx, len(reference_idx)]
    

def MT_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    candidate_list = []
    reference_list = []
    candidate_length_list = []
    reference_length_list = []
    for datum in batch:
        candidate_length_list.append(datum[1])
        reference_length_list.append(datum[3])
    # padding
    MAX_LENGTH = [max(candidate_length_list), max(reference_length_list)]
    for datum in batch:
        padded_vec_1 = np.pad(np.array(datum[0]), 
                                pad_width=((0,MAX_LENGTH[0]-datum[1])), 
                                mode="constant", constant_values=0)
        candidate_list.append(padded_vec_1)
        
        padded_vec_2 = np.pad(np.array(datum[2]), 
                                pad_width=((0,MAX_LENGTH[1]-datum[3])), 
                                mode="constant", constant_values=0)
        reference_list.append(padded_vec_2)
    
    sorted_order = np.argsort(candidate_length_list)[::-1]
    candidate_list, candidate_length_list = np.array(candidate_list)[sorted_order], np.array(candidate_length_list)[sorted_order]
    reference_list, reference_length_list = np.array(reference_list)[sorted_order], np.array(reference_length_list)[sorted_order]
    
    return [torch.from_numpy(np.array(candidate_list)), torch.LongTensor(candidate_length_list), 
            torch.from_numpy(np.array(reference_list)), torch.LongTensor(reference_length_list)]


BATCH_SIZE = 32
train_dataset = MTDataset(train_input_lang, train_output_lang, train_pairs)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=MT_collate_func,
                                           shuffle=True)

val_dataset = MTDataset(val_input_lang, val_output_lang, val_pairs)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=MT_collate_func,
                                           shuffle=True)

test_dataset = MTDataset(test_input_lang, test_output_lang, test_pairs)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=MT_collate_func,
                                           shuffle=True)



# for i, (candidate, length_1, reference, length_2) in enumerate(train_loader):
#     print (candidate)
#     print (reference)
#     break

# Self Attention

In [182]:
'''adopted and modified from https://github.com/harvardnlp/annotated-transformer'''
class EncoderDecoder(nn.Module):
    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator
        
    def forward(self, src, tgt):
        return self.decode(self.encode(src), tgt, subsequent_mask(tgt))
    
    def encode(self, src):
        return self.encoder(self.src_embed(src))
    
    def decode(self, memory, tgt, tgt_mask):
        return self.decoder(self.tgt_embed(tgt), memory, tgt_mask)

In [183]:
class Generator(nn.Module):
    def __init__(self, d_model, vocab):
        super(Generator, self).__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        return F.log_softmax(self.proj(x), dim=-1)

In [184]:
def clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

In [185]:
class LayerNormalization(nn.Module):
    def __init__(self, d_hid, eps=1e-6):
        super(LayerNormalization, self).__init__()
        self.a = nn.Parameter(torch.ones(d_hid))
        self.b = nn.Parameter(torch.zeros(d_hid))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True,)
        std = x.std(dim=-1, keepdim=True,)
        ln_out = (x - mean) / (std + self.eps)
        ln_out = self.a * ln_out + self.b

        return ln_out

In [186]:
class SublayerConnection(nn.Module):
    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNormalization(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        return x + self.dropout(sublayer(self.norm(x)))

In [187]:
class Encoder(nn.Module):
    def __init__(self, layer, N):
        super(Encoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNormalization(layer.size)
        
    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return self.norm(x)
    
    
class EncoderLayer(nn.Module):
    def __init__(self, size, self_attn, feed_forward, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x):
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x))
        return self.sublayer[1](x, self.feed_forward)

In [188]:
class Decoder(nn.Module):
    def __init__(self, layer, N):
        super(Decoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNormalization(layer.size)
        
    def forward(self, x, memory, tgt_mask):
        for layer in self.layers:
            x = layer(x, memory, tgt_mask)
        return self.norm(x)
    
    
    
class DecoderLayer(nn.Module):
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super(DecoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)
 
    def forward(self, x, memory, tgt_mask):
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
        x = self.sublayer[1](x, lambda x: self.src_attn(x, memory, memory))
        return self.sublayer[2](x, self.feed_forward)

In [189]:
def subsequent_mask(input):
    batch_size, seq_len = input.size()
    subsequent_mask = torch.triu(torch.ones((seq_len, seq_len), dtype=torch.uint8), diagonal=1)
    subsequent_mask = subsequent_mask.unsqueeze(0).expand(batch_size, seq_len, seq_len)
    return Variable(subsequent_mask == 0)

In [190]:
def attention(query, key, value, mask=None, dropout=None):
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) \
             / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = F.softmax(scores, dim = -1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn

In [191]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)
        
    def forward(self, query, key, value, mask=None):
        if mask is not None:
            mask = mask.unsqueeze(1)
        batch_size = query.size(0)
    
        query, key, value = [l(x).view(batch_size, -1, self.h, self.d_k).transpose(1, 2) for l, x in zip(self.linears, (query, key, value))]
        
        x, self.attn = attention(query, key, value, mask=mask, dropout=self.dropout)
        
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)
        return self.linears[-1](x)

In [206]:
class PoswiseFeedForwardNet(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PoswiseFeedForwardNet, self).__init__()
        self.relu = nn.ReLU()
        self.conv1 = nn.Conv1d(d_model, d_ff, kernel_size=1)
        self.conv2 = nn.Conv1d(d_ff, d_model, kernel_size=1)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = LayerNormalization(d_model)

    def forward(self, inputs):
        # inputs: [b_size x len_q x d_model]
        residual = inputs
        output = self.relu(self.conv1(inputs.transpose(1, 2)))

        # outputs: [b_size x len_q x d_model]
        output = self.conv2(output).transpose(1, 2)
        output = self.dropout(output)

        return self.layer_norm(residual + output)

In [207]:
class Embeddings(nn.Module):
    def __init__(self, d_word_vec, vocab):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_word_vec)
        self.d_word_vec = d_word_vec

    def forward(self, x):
        return self.lut(x) * math.sqrt(self.d_word_vec)
    
    
class PositionalEncoding(nn.Module):
    def __init__(self, d_word_vec, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.tensor([[pos / np.power(10000, 2.0 * (j // 2) / d_word_vec) for j in range(d_word_vec)] for pos in range(max_len)])
        pe[:, 0::2] = torch.sin(pe[:, 0::2])
        pe[:, 1::2] = torch.cos(pe[:, 1::2])
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        x = x + Variable(self.pe[:, :x.size(1)], 
                         requires_grad=False)
        return self.dropout(x)

In [208]:
def make_model(can=train_input_lang.n_words, ref=train_output_lang.n_words, N=6, 
               d_model=512, d_ff=2048, h=8, dropout=0.1):
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PoswiseFeedForwardNet(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), 
                             c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, can), c(position)),
        nn.Sequential(Embeddings(d_model, ref), c(position)),
        Generator(d_model, ref))
    
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform(p)
    return model

In [209]:
def train(input_tensor, target_tensor, input_length, output_length, model, model_optimizer, criterion):
    batch_size= input_tensor.size(0)
    target_length = target_tensor.size(1)
    max_target_length = max(output_length)
    
    model_optimizer.zero_grad()
    

    loss = 0
    
    output = model.generator(model.forward(input_tensor, target_tensor))
    
    
    for di in range(max_target_length):
        loss += criterion(output[:,di], target_tensor[:,di])
    

    loss.backward()
    #torch.nn.utils.clip_grad_norm_(decoder.parameters(), 0.05)

    model_optimizer.step()

    return loss.item() / target_length

In [212]:
def trainEpochs(model, n_epochs, print_every=100, plot_every=100, learning_rate=0.0002):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    model_optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    criterion = nn.NLLLoss()
    
    for epoch in range(n_epochs):
        for i, (candidate, length_1, reference, length_2) in enumerate(train_loader):
             
            loss = train(candidate, reference, length_1, length_2, model, model_optimizer, criterion)
            print_loss_total += loss
            plot_loss_total += loss

            if i > 0 and i % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                #print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters), iter, iter / n_iters * 100, print_loss_avg))
                print('Time: {}, Epoch: [{}/{}], Step: [{}/{}], Train Loss: {}'.format(timeSince(start, i+1 / len(train_loader)), 
                    epoch+1, n_epochs, i+1, len(train_loader), print_loss_avg))
            if i >0 and i % plot_every == 0:
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0
                
        print_loss_total = 0
        print_loss_total = 0

    showPlot(plot_losses)

In [213]:
model = make_model().to(device)
trainEpochs(model, 10)



Time: 29m 31s (- -30m 46s), Epoch: [1/5], Step: [101/4073], Train Loss: 3.6231094701399793


KeyboardInterrupt: 