In [1]:
import numpy as np
import time
import os.path

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

import random

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [2]:
###
### change word start with '&apos' to are
### preprocess token including delete null token
def preposs_toekn(tokens):
    return [token for token in tokens if token != '']

In [3]:
train_en_add = './iwsltzhen/iwslt-zh-en/train.tok.en'
train_zh_add = './iwsltzhen/iwslt-zh-en/train.tok.zh'
val_en_add = './iwsltzhen/iwslt-zh-en/dev.tok.en'
val_zh_add = './iwsltzhen/iwslt-zh-en/dev.tok.zh'

train_en = []
with open(train_en_add) as f:
    for line in f:
        train_en.append(preposs_toekn(line[:-1].strip().split(' ')))

In [4]:
train_zh = []
with open(train_zh_add) as f:
    for line in f:
        train_zh.append(preposs_toekn(line[:-1].strip().split(' ')))

In [5]:
train_en[0]

['Life', 'in', 'the', 'deep', 'oceans']

In [6]:
SOS_token = 0
EOS_token = 1
UNK_token = 2

def read_embedding(fasttest_home = './wiki-news-300d-1M.vec'):
    words_to_load = 50000

    words_ft = {}
    idx2words_ft = {}
        
    SOS_token = 0
    words_ft['$SOS$'] = SOS_token
    idx2words_ft[SOS_token] = '$SOS$'
    EOS_token = 1
    words_ft['$EOS$'] = EOS_token
    idx2words_ft[EOS_token] = '$EOS$'
    Unk_token = 2
    words_ft['$UNK$'] = Unk_token
    idx2words_ft[Unk_token] = '$UNK$'
    
    with open(fasttest_home) as f:
        loaded_embeddings_ft = np.zeros((words_to_load, 300)) 
        ordered_words_ft = []
        for i, line in enumerate(f):
            i = i+3
            if i >= words_to_load: 
                break
            s = line.split()
            loaded_embeddings_ft[i, :] = np.asarray(s[1:])
            words_ft[s[0]] = i
            idx2words_ft[i] = s[0]
            ordered_words_ft.append(s[0])
    
    return words_ft,idx2words_ft,loaded_embeddings_ft

In [7]:
#words_ft,idx2words_ft = read_embedding(fasttest_home = './')

In [8]:
SOS_token = 0
EOS_token = 1
UNK_token = 2

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {"$SOS$" : 0, "$EOS$" : 1, "$UNK$" : 2}
        self.word2count = {"$SOS$" : 0, "$EOS$" : 0, "$UNK$" : 0}
        self.index2word = {0: "$SOS$", 1: "$EOS$", 2: "$UNK$"}
        self.n_words = 3  # Count SOS and EOS
        self.embedding_matrix = None

#     def addSentence(self, sentence):
#         for word in sentence.split(' '):
#             self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1
    
    def load_embedding(self,address = './'):
        self.word2index, self.index2word,self.embedding_matrix = read_embedding(address)

In [9]:
def text2index(data,word2index):
    indexdata = []
    for line in data:
        indexdata.append([word2index[c] if c in word2index.keys() else UNK_token  for c in line])
        indexdata[-1].append(EOS_token)
    print('finish')
    return indexdata


In [10]:
def preparelang(name,data):
    lang = Lang(name)
    for line in data:
        for word in line:
            lang.addWord(word)
    return lang

In [11]:
enLang = Lang('eng')
enLang.load_embedding('/scratch/tw1682/NLP/wiki-news-300d-1M.vec')
zhLang = preparelang('zh',train_zh)

In [12]:
train_input_index = text2index(train_en,enLang.word2index)
train_output_index = text2index(train_zh,zhLang.word2index)

finish
finish


In [13]:
############################ Data Loader #########################

In [14]:
class VocabDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """

    def __init__(self, train_input, train_ouput):
        """
        @param data_list: list of character
        @param target_list: list of targets

        """
        self.data_list, self.target_list = train_input, train_ouput
        assert (len(self.data_list) == len(self.target_list))
        #self.word2index = word2index

    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        train = self.data_list[key]
        label = self.target_list[key]
        train_length = len(train)
        label_length = len(label)
        
        return train,train_length,label,label_length

def vocab_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all
    data have the same length
    """
    data_list = []
    label_list = []
    train_length_list = []
    label_length_list = []

    for datum in batch:
        label_length_list.append(datum[3])
        train_length_list.append(datum[1])
    
    batch_max_input_length = np.max(train_length_list)
    batch_max_output_length = np.max(label_length_list)
    # padding
    for datum in batch:
        padded_vec = np.pad(np.array(datum[0]),
                                pad_width=((0,batch_max_input_length-datum[1])),
                                mode="constant", constant_values=0)
        data_list.append(padded_vec)
        
        padded_vec = np.pad(np.array(datum[2]),
                                pad_width=((0,batch_max_output_length-datum[3])),
                                mode="constant", constant_values=0)
        label_list.append(padded_vec)
        
    ind_dec_order = np.argsort(train_length_list)[::-1]
    data_list = np.array(data_list)[ind_dec_order]
    train_length_list = np.array(train_length_list)[ind_dec_order]
    label_list = np.array(label_list)[ind_dec_order]
    label_length_list = np.array(label_length_list)[ind_dec_order]
    
    print(type(np.array(data_list)),type(np.array(label_list)))
    
    return [torch.from_numpy(np.array(data_list)), torch.LongTensor(train_length_list), torch.from_numpy(np.array(label_list)), torch.LongTensor(label_length_list)]


In [33]:
# Build train, valid and test dataloaders

BATCH_SIZE = 1

train_dataset = VocabDataset(train_input_index,train_output_index)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=False)

# val_dataset = VocabDataset(val_data)
# val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
#                                            batch_size=BATCH_SIZE,
#                                            collate_fn=vocab_collate_func,
#                                            shuffle=True)

# test_dataset = VocabDataset(test_data)
# test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
#                                            batch_size=BATCH_SIZE,
#                                            collate_fn=vocab_collate_func,
#                                            shuffle=False)

In [34]:
for data, data_lengths, labels, label_lengths in train_loader:
    print(data.shape)
    print(data_lengths)
    print(labels.shape)
    print(label_lengths)
    break

<class 'numpy.ndarray'> <class 'numpy.ndarray'>
torch.Size([1, 6])
tensor([6])
torch.Size([1, 7])
tensor([7])


In [41]:
class EncoderRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_direction):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_direction = num_direction
        self.embedding = nn.Embedding(vocab_size, embed_size)
        if num_direction == 1:
            self.gru = nn.GRU(embed_size, hidden_size)
        elif num_direction == 2:
            self.gru = nn.GRU(embed_size, hidden_size, bidirectional == True)
        else:
            print('number of direction out of bound')

    def forward(self, x, hidden, lengths):
        embed = self.embedding(x) #.view(1, 1, -1)
        embed = torch.nn.utils.rnn.pack_padded_sequence(embed, lengths.numpy(), batch_first=True)
        rnn_out, hidden = self.gru(embed, hidden)
        rnn_out, _ = torch.nn.utils.rnn.pad_packed_sequence(rnn_out, batch_first=True)
        return rnn_out, hidden

    def initHidden(self,batch_size):
        hidden = torch.randn(self.num_direction, batch_size, self.hidden_size, device=device)
        return hidden

In [42]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        #output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [37]:
# def indexesFromSentence(lang, sentence):
#     return [lang.word2index[word] for word in sentence.split(' ')]


# def tensorFromSentence(lang, sentence):
#     indexes = indexesFromSentence(lang, sentence)
#     indexes.append(EOS_token)
#     return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


# def tensorsFromPair(pair):
#     input_tensor = tensorFromSentence(input_lang, pair[0])
#     target_tensor = tensorFromSentence(output_lang, pair[1])
#     return (input_tensor, target_tensor)

In [38]:
teacher_forcing_ratio = 0.5
MAX_LENGTH = 50

def train(input_tensor, input_tensor_length, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden(BATCH_SIZE)

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

#     for ei in range(input_length):
#         encoder_output, encoder_hidden = encoder(
#             input_tensor[ei], encoder_hidden,input_tensor_length)
#         encoder_outputs[ei] = encoder_output[0, 0]

    ########################bug here######################
    ########################bug here######################
    ########################bug here######################
    ########################bug here######################
    encoder_output, encoder_hidden = encoder(input_tensor, encoder_hidden,input_tensor_length)

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [39]:
def trainIters(loader, encoder, decoder,n_iters = 1, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every
    print(decoder)
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

    criterion = nn.NLLLoss()
    
    for data, data_lengths, labels, label_lengths in train_loader:
        input_tensor, target_tensor = data.view(-1, 1),labels.view(-1, 1)
        loss = train(input_tensor, data_lengths,target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [43]:
input_size = enLang.n_words
emb_size = 300
hidden_size = 100
num_direction = 1
output_size = zhLang.n_words
encoder = EncoderRNN(input_size, emb_size,hidden_size,num_direction = 1)
decoder = DecoderRNN(hidden_size, output_size)
trainIters(train_loader,encoder, decoder, 3, print_every=1000, plot_every=100, learning_rate=0.01)

DecoderRNN(
  (embedding): Embedding(88917, 100)
  (gru): GRU(100, 100)
  (out): Linear(in_features=100, out_features=88917, bias=True)
  (softmax): LogSoftmax()
)
<class 'numpy.ndarray'> <class 'numpy.ndarray'>


RuntimeError: index out of range at /pytorch/aten/src/TH/generic/THTensorMath.cpp:352

In [114]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np
%matplotlib inline

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [268]:
a = [[1,2,3],[2,3]]

In [269]:
np.array(a)

array([list([1, 2, 3]), list([2, 3])], dtype=object)