In [1]:
import numpy as np
import time
import os.path

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

import random

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [2]:
### change word start with '&apos' to are
### preprocess token including delete null token
def preposs_toekn(tokens):
    return [token for token in tokens if token != '']

In [3]:
train_en_add = './iwsltzhen/iwslt-zh-en/train.tok.en'
train_zh_add = './iwsltzhen/iwslt-zh-en/train.tok.zh'
val_en_add = './iwsltzhen/iwslt-zh-en/dev.tok.en'
val_zh_add = './iwsltzhen/iwslt-zh-en/dev.tok.zh'

train_en = []
with open(train_en_add) as f:
    for line in f:
        train_en.append(preposs_toekn(line[:-1].strip().split(' ')))

In [4]:
train_en = []
with open(val_en_add) as f:
    for line in f:
        train_en.append(preposs_toekn(line[:-1].strip().split(' ')))

In [5]:
len(train_en)

1261

In [4]:
train_zh = []
with open(train_zh_add) as f:
    for line in f:
        train_zh.append(preposs_toekn(line[:-1].strip().split(' ')))

In [5]:
train_en[0]

['Life', 'in', 'the', 'deep', 'oceans']

In [6]:
#words_ft,idx2words_ft = read_embedding(fasttest_home = './')

In [7]:
PAD_token = 0
SOS_token = 1
EOS_token = 2
UNK_token = 3

def read_embedding(fasttest_home = './wiki-news-300d-1M.vec'):
    words_to_load = 50000

    words_ft = {}
    idx2words_ft = {}
    
    words_ft['$PAD$'] = PAD_token
    idx2words_ft[PAD_token] = '$PAD$'
    words_ft['$SOS$'] = SOS_token
    idx2words_ft[SOS_token] = '$SOS$'
    words_ft['$EOS$'] = EOS_token
    idx2words_ft[EOS_token] = '$EOS$'
    words_ft['$UNK$'] = UNK_token
    idx2words_ft[UNK_token] = '$UNK$'
    
    with open(fasttest_home) as f:
        loaded_embeddings_ft = np.zeros((words_to_load, 300)) 
        ordered_words_ft = []
        f.readline()
        for i, line in enumerate(f):
            i = i+4
            if i >= words_to_load: 
                break
            s = line.split()
            try:
                loaded_embeddings_ft[i, :] = np.asarray(s[1:])
            except:
                print('')
                
            words_ft[s[0]] = i
            idx2words_ft[i] = s[0]
            ordered_words_ft.append(s[0])
    
    return words_ft,idx2words_ft,loaded_embeddings_ft

In [8]:
class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {"PAD" : PAD_token,"$SOS$" : SOS_token, "$EOS$" : EOS_token, "$UNK$" : UNK_token}
        self.word2count = {"PAD" : 0, "$SOS$" : 0, "$EOS$" : 0, "$UNK$" : 0}
        self.index2word = {PAD_token: "PAD", SOS_token: "$SOS$", EOS_token: "$EOS$", UNK_token: "$UNK$"}
        self.n_words = 3  # Count SOS and EOS
        self.embedding_matrix = None

#     def addSentence(self, sentence):
#         for word in sentence.split(' '):
#             self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1
    
    def load_embedding(self,address = './'):
        self.word2index, self.index2word,self.embedding_matrix = read_embedding(address)

In [9]:
def text2index(data,word2index):
    indexdata = []
    for line in data:
        indexdata.append([word2index[c] if c in word2index.keys() else UNK_token  for c in line])
        indexdata[-1].append(EOS_token)
    print('finish')
    return indexdata

In [10]:
def preparelang(name,data):
    lang = Lang(name)
    for line in data:
        for word in line:
            lang.addWord(word)
    return lang

In [11]:
enLang = Lang('en')
enLang.load_embedding('/scratch/tw1682/embedding/wiki.en.vec')
zhLang = Lang('zh')
zhLang.load_embedding('/scratch/tw1682/embedding/wiki.zh.vec')
#enLang = preparelang('en',train_en)
#zhLang = preparelang('zh',train_zh)








In [12]:
train_input_index = text2index(train_en,enLang.word2index)
train_output_index = text2index(train_zh,zhLang.word2index)

finish
finish


In [18]:
enLang.index2word[50000]

KeyError: 50000

In [19]:
enLang.n_words

3

In [14]:
train_input_index = text2index(train_en,enLang.word2index)
train_output_index = text2index(train_zh,zhLang.word2index)

finish
finish


In [13]:
############################ Data Loader #########################

In [65]:
class VocabDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """

    def __init__(self, train_input, train_ouput):
        """
        @param data_list: list of character
        @param target_list: list of targets

        """
        self.data_list, self.target_list = train_input, train_ouput
        assert (len(self.data_list) == len(self.target_list))
        #self.word2index = word2index

    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        train = self.data_list[key]
        label = self.target_list[key]
        train_length = len(train)
        label_length = len(label)
        
        return train,train_length,label,label_length

def vocab_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all
    data have the same length
    """
    data_list = []
    label_list = []
    train_length_list = []
    label_length_list = []

    for datum in batch:
        label_length_list.append(datum[3])
        train_length_list.append(datum[1])
    
    batch_max_input_length = np.max(train_length_list)
    batch_max_output_length = np.max(label_length_list)
    # padding
    for datum in batch:
        padded_vec = np.pad(np.array(datum[0]),
                                pad_width=((0,batch_max_input_length-datum[1])),
                                mode="constant", constant_values=0)
        data_list.append(padded_vec)
        
        padded_vec = np.pad(np.array(datum[2]),
                                pad_width=((0,batch_max_output_length-datum[3])),
                                mode="constant", constant_values=0)
        label_list.append(padded_vec)
        
    ind_dec_order = np.argsort(train_length_list)[::-1]
    data_list = np.array(data_list)[ind_dec_order]
    train_length_list = np.array(train_length_list)[ind_dec_order]
    label_list = np.array(label_list)[ind_dec_order]
    label_length_list = np.array(label_length_list)[ind_dec_order]
    
    #print(type(np.array(data_list)),type(np.array(label_list)))
    
    return [torch.from_numpy(np.array(data_list)).to(device), 
            torch.LongTensor(train_length_list).to(device), 
            torch.from_numpy(np.array(label_list)).to(device), 
            torch.LongTensor(label_length_list).to(device)]



In [66]:
# Build train, valid and test dataloaders

batch_size = 10 

train_dataset = VocabDataset(train_input_index,train_output_index)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           collate_fn=vocab_collate_func,
                                           shuffle=False)

# val_dataset = VocabDataset(val_data)
# val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
#                                            batch_size=BATCH_SIZE,
#                                            collate_fn=vocab_collate_func,
#                                            shuffle=True)

# test_dataset = VocabDataset(test_data)
# test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
#                                            batch_size=BATCH_SIZE,
#                                            collate_fn=vocab_collate_func,
#                                            shuffle=False)

In [67]:
for data, data_lengths, labels, label_lengths in train_loader:
    print(data.shape)
    print(data_lengths)
    print(labels.shape)
    print(label_lengths)
    break

torch.Size([10, 49])
tensor([49, 34, 30, 17, 17, 16, 11,  9,  9,  6])
torch.Size([10, 46])
tensor([46, 27, 18, 12, 13, 11, 11,  8,  9,  7])


In [68]:
class EncoderRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_direction):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_direction = num_direction
        self.embedding = nn.Embedding(vocab_size, embed_size)
        if num_direction == 1:
            self.gru = nn.GRU(embed_size, hidden_size, batch_first=True)
        elif num_direction == 2:
            self.gru = nn.GRU(embed_size, hidden_size, batch_first=True, bidirectional = True)
        else:
            print('number of direction out of bound')

    def forward(self, x, hidden, lengths):
        embed = self.embedding(x)
        embed = torch.nn.utils.rnn.pack_padded_sequence(embed, lengths.numpy(), batch_first=True)
        rnn_out, hidden = self.gru(embed, hidden)
        rnn_out, _ = torch.nn.utils.rnn.pad_packed_sequence(rnn_out, batch_first=True)
        return rnn_out, hidden

    def initHidden(self, batch_size):
        hidden = torch.randn(self.num_direction, batch_size, self.hidden_size, device=device)
        return hidden

In [125]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, vocab_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, vocab_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, src_input, hidden):
        output = self.embedding(src_input)
        #print(output.size())
        output, hidden = self.gru(output, hidden)
        logits = self.out(output[:,0,:])
        output = self.softmax(logits)
        return output, hidden

    def initHidden(self, batch_size):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [126]:
teacher_forcing_ratio = 0
max_length_src = 50
max_length_tgt = 50

def train(input_tensor, input_lengths, target_tensor, target_lengths,
          encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, 
          batch_size, max_length_src):

    encoder_hidden = encoder.initHidden(batch_size)

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    encoder_outputs = torch.zeros(max_length_src, encoder.hidden_size, device=device)

    loss = 0

#     for ei in range(input_length):
#         encoder_output, encoder_hidden = encoder(
#             input_tensor[ei], encoder_hidden,input_tensor_length)
#         encoder_outputs[ei] = encoder_output[0, 0]
    encoder_output, encoder_hidden = encoder(input_tensor, encoder_hidden, input_lengths)

    decoder_input = torch.tensor([[SOS_token]*batch_size], device=device).transpose(0,1)
    decoder_hidden = encoder_hidden
    
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        target_lengths = target_lengths.numpy()
        sent_not_end_index = list(range(batch_size))
        decoding_token_index = 0
        while len(sent_not_end_index) > 0:
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            sent_not_end_index = torch.LongTensor(sent_not_end_index).to(device)
            loss += criterion(decoder_output.index_select(0,sent_not_end_index), 
                              target_tensor[:,decoding_token_index].index_select(
                                  0,sent_not_end_index))
            decoder_input = target_tensor[:,decoding_token_index].unsqueeze(1)  # Teacher forcing
            decoding_token_index += 1
            end_or_not = target_lengths > decoding_token_index
            sent_not_end_index = list(np.where(end_or_not)[0])
            

    else:
        # Without teacher forcing: use its own predictions as the next input
        target_lengths = target_lengths.numpy()
        sent_not_end_index = list(range(batch_size))
        decoding_token_index = 0
        while len(sent_not_end_index) > 0:
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.detach()  # detach from history as input
            #print(type(sent_not_end_index[0]))
            sent_not_end_index = torch.LongTensor(sent_not_end_index).to(device)
            loss += criterion(decoder_output.index_select(0,sent_not_end_index), 
                              target_tensor[:,decoding_token_index].index_select(
                                  0,sent_not_end_index))
            decoding_token_index += 1
            end_or_not = (target_lengths > decoding_token_index)*(
                decoder_input.squeeze().numpy() != EOS_token)
            sent_not_end_index = list(np.where(end_or_not)[0])
            

    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_lengths

In [127]:
def trainIters(loader, encoder, decoder, n_iters = 340000, print_every=1000, 
               plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

    criterion = nn.NLLLoss()
    n_iter = 0
    
    for input_tensor, input_lengths, target_tensor, target_lengths in train_loader:
        n_iter += 1
        loss = train(input_tensor, input_lengths, target_tensor, target_lengths, 
                     encoder, decoder, encoder_optimizer, decoder_optimizer, 
                     criterion, batch_size, max_length_src)
            
        print_loss_total += loss
        plot_loss_total += loss

        if n_iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('(%d %d%%) %.4f' % (n_iter, n_iter / n_iter * 100, print_loss_avg))

        if n_iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [128]:
input_size = enLang.n_words
emb_size = 300
hidden_size = 100
num_direction = 1
output_size = zhLang.n_words
encoder = EncoderRNN(input_size, emb_size,hidden_size,num_direction = 1)
decoder = DecoderRNN(hidden_size, output_size)
trainIters(train_loader,encoder, decoder, 3, print_every=1000, plot_every=100, learning_rate=0.01)

KeyboardInterrupt: 

In [114]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np
%matplotlib inline

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [74]:
# import time
# import math


# def asMinutes(s):
#     m = math.floor(s / 60)
#     s -= m * 60
#     return '%dm %ds' % (m, s)


# def timeSince(since, percent):
#     now = time.time()
#     s = now - since
#     es = s / (percent)
#     rs = es - s
#     return '%s (- %s)' % (asMinutes(s), asMinutes(rs))