In [119]:
import numpy as np
import time
import os.path

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

import random

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [32]:
###
### change word start with '&apos' to are
def preposs_toekn(tokens):
    return [token for token in tokens if token != '']

In [17]:
train_en_add = './iwsltzhen/iwslt-zh-en/train.tok.en'
train_zh_add = './iwsltzhen/iwslt-zh-en/train.tok.zh'
val_en_add = './iwsltzhen/iwslt-zh-en/dev.tok.en'
val_zh_add = './iwsltzhen/iwslt-zh-en/dev.tok.zh'

train_en = []
with open(train_en_add) as f:
    for line in f:
        train_en.append(preposs_toekn(line[:-1].strip().split(' ')))

In [33]:
train_zh = []
with open(train_zh_add) as f:
    for line in f:
        train_zh.append(preposs_toekn(line[:-1].strip().split(' ')))

In [199]:
train_en[0]

['Life', 'in', 'the', 'deep', 'oceans']

In [201]:
SOS_token = 0
EOS_token = 1
UNK_token = 2

def read_embedding(fasttest_home = './wiki-news-300d-1M.vec'):
    words_to_load = 50000

    words_ft = {}
    idx2words_ft = {}
        
    SOS_token = 0
    words_ft['$SOS$'] = SOS_token
    idx2words_ft[SOS_token] = '$SOS$'
    EOS_token = 1
    words_ft['$EOS$'] = EOS_token
    idx2words_ft[EOS_token] = '$EOS$'
    Unk_token = 2
    words_ft['$UNK$'] = Unk_token
    idx2words_ft[Unk_token] = '$UNK$'
    
    with open(fasttest_home) as f:
        loaded_embeddings_ft = np.zeros((words_to_load, 300)) 
        ordered_words_ft = []
        for i, line in enumerate(f):
            i = i+3
            if i >= words_to_load: 
                break
            s = line.split()
            loaded_embeddings_ft[i, :] = np.asarray(s[1:])
            words_ft[s[0]] = i
            idx2words_ft[i] = s[0]
            ordered_words_ft.append(s[0])
    
    return words_ft,idx2words_ft,loaded_embeddings_ft

In [202]:
#words_ft,idx2words_ft = read_embedding(fasttest_home = './')

In [203]:
SOS_token = 0
EOS_token = 1
UNK_token = 2

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {} #{0: "$SOS$", 1: "$EOS$", 2: "$UNK$"}
        self.n_words = 3  # Count SOS and EOS
        self.embedding_matrix = None

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1
    
    def load_embedding(self,address = './'):
        self.word2index, self.index2word,self.embedding_matrix = read_embedding(address)

In [235]:
def text2index(data,word2index):
    indexdata = []
    for line in data:
        indexdata.append([word2index[c] if c in word2index.keys() else UNK_token  for c in line])
        indexdata[-1].append(EOS_token)
    print('finish')
    return indexdata


In [205]:
def preparelang(name,data):
    lang = Lang(name)
    for line in data:
        for word in line:
            lang.addWord(word)
    return lang

In [206]:
enLang = Lang('eng')
enLang.load_embedding('./wiki-news-300d-1M.vec')
zhLang = preparelang('zh',train_zh)

In [240]:
train_input_index = text2index(train_en,enLang.word2index)
train_output_index = text2index(train_zh,zhLang.word2index)

finish
finish


In [208]:
############################ Data Loader #########################

In [256]:
class VocabDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """

    def __init__(self, train_input, train_ouput):
        """
        @param data_list: list of character
        @param target_list: list of targets

        """
        self.data_list, self.target_list = train_input, train_ouput
        assert (len(self.data_list) == len(self.target_list))
        #self.word2index = word2index

    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        train_index = self.data_list[key]
        print(train_index)
        label = self.target_list[key]
        print(label)
        print('***********')
        length = len(train_index)
        
        return train_index,length,label

def vocab_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all
    data have the same length
    """
    data_list = []
    label_list = []
    length_list = []

    for datum in batch:
        label_list.append(datum[2])
        length_list.append(datum[1])
    
    batch_max_length = np.max(length_list)
    # padding
    for datum in batch:
        padded_vec = np.pad(np.array(datum[0]),
                                pad_width=((0,batch_max_length-datum[1])),
                                mode="constant", constant_values=0)
        data_list.append(padded_vec)
        
    ind_dec_order = np.argsort(length_list)[::-1]
    data_list = np.array(data_list)[ind_dec_order]
    length_list = np.array(length_list)[ind_dec_order]
    label_list = np.array(label_list)[ind_dec_order]
    
    print(type(np.array(data_list)),type(np.array(label_list)))
    
    ##########bug here pad label list!!##################
    ##########bug here pad label list!!##################
    ##########bug here pad label list!!##################
    return [torch.from_numpy(np.array(data_list)), torch.LongTensor(length_list), torch.from_numpy(np.array(label_list))]


In [257]:
# Build train, valid and test dataloaders

BATCH_SIZE = 16

train_dataset = VocabDataset(train_input_index,train_output_index)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=False)

# val_dataset = VocabDataset(val_data)
# val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
#                                            batch_size=BATCH_SIZE,
#                                            collate_fn=vocab_collate_func,
#                                            shuffle=True)

# test_dataset = VocabDataset(test_data)
# test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
#                                            batch_size=BATCH_SIZE,
#                                            collate_fn=vocab_collate_func,
#                                            shuffle=False)

In [264]:
np.array(train_input_index[0:5])

array([list([1932, 10, 5, 2151, 10789, 1]),
       list([519, 12421, 742, 11855, 2832, 29, 12277, 4, 859, 32137, 1081, 195, 9, 68, 8, 1716, 2, 46314, 4, 130, 3638, 4, 6628, 7, 2711, 10536, 4, 5, 13294, 7, 15381, 19570, 8, 5, 10789, 2, 16302, 4, 113, 214, 17, 6850, 4, 18144, 7, 34124, 10936, 6, 1]),
       list([135, 17, 1658, 23978, 6, 32, 2, 3268, 32137, 6, 1]),
       list([170, 95, 2, 529, 9, 1369, 34, 68, 1487, 30, 5, 1980, 99, 10, 742, 6, 1]),
       list([310, 2, 348, 68, 8, 5, 130, 6372, 742, 8, 15230, 15, 2, 511, 111, 530, 4, 7, 95, 2, 38, 529, 9, 236, 34, 82, 8, 23, 6, 1])],
      dtype=object)

In [258]:
for data, lengths, labels in train_loader:
    print(labels)
    break

[1932, 10, 5, 2151, 10789, 1]
[3, 4, 5, 6, 7, 8, 1]
***********
[519, 12421, 742, 11855, 2832, 29, 12277, 4, 859, 32137, 1081, 195, 9, 68, 8, 1716, 2, 46314, 4, 130, 3638, 4, 6628, 7, 2711, 10536, 4, 5, 13294, 7, 15381, 19570, 8, 5, 10789, 2, 16302, 4, 113, 214, 17, 6850, 4, 18144, 7, 34124, 10936, 6, 1]
[7, 8, 9, 10, 11, 12, 5, 13, 14, 15, 16, 17, 18, 19, 20, 19, 21, 22, 23, 24, 25, 5, 26, 27, 28, 29, 30, 31, 32, 5, 33, 34, 35, 36, 29, 37, 38, 39, 40, 41, 42, 43, 44, 5, 6, 1]
***********
[135, 17, 1658, 23978, 6, 32, 2, 3268, 32137, 6, 1]
[7, 8, 45, 30, 46, 47, 48, 30, 7, 8, 1]
***********
[170, 95, 2, 529, 9, 1369, 34, 68, 1487, 30, 5, 1980, 99, 10, 742, 6, 1]
[15, 49, 50, 51, 13, 52, 53, 51, 3, 54, 5, 55, 1]
***********
[310, 2, 348, 68, 8, 5, 130, 6372, 742, 8, 15230, 15, 2, 511, 111, 530, 4, 7, 95, 2, 38, 529, 9, 236, 34, 82, 8, 23, 6, 1]
[15, 56, 57, 58, 59, 5, 60, 61, 62, 63, 5, 13, 64, 65, 66, 67, 68, 1]
***********
[24, 1677, 8, 5, 623, 17, 15, 5, 15230, 118, 138, 307, 23, 2, 

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: double, float, float16, int64, int32, and uint8.

In [130]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [131]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [132]:
# def indexesFromSentence(lang, sentence):
#     return [lang.word2index[word] for word in sentence.split(' ')]


# def tensorFromSentence(lang, sentence):
#     indexes = indexesFromSentence(lang, sentence)
#     indexes.append(EOS_token)
#     return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


# def tensorsFromPair(pair):
#     input_tensor = tensorFromSentence(input_lang, pair[0])
#     target_tensor = tensorFromSentence(output_lang, pair[1])
#     return (input_tensor, target_tensor)

In [133]:
teacher_forcing_ratio = 0.5
MAX_LENGTH = 50

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [141]:
def trainIters(loader, encoder, decoder,n_iters = 1, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every
    print(decoder)
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

    criterion = nn.NLLLoss()
    
    for data, lengths, labels in loader:
        input_tensor, target_tensor = data.view(-1, 1),labels.view(-1, 1)
        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [142]:
input_size = enLang.n_words
hidden_size = 100
output_size = zhLang.n_words
encoder = EncoderRNN(input_size, hidden_size)
decoder = DecoderRNN(hidden_size, output_size)
trainIters(train_loader,encoder, decoder, 3, print_every=1000, plot_every=100, learning_rate=0.01)

DecoderRNN(
  (embedding): Embedding(88916, 100)
  (gru): GRU(100, 100)
  (out): Linear(in_features=100, out_features=88916, bias=True)
  (softmax): LogSoftmax()
)


TypeError: object of type 'NoneType' has no len()

In [114]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np
%matplotlib inline

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [268]:
a = [[1,2,3],[2,3]]

In [269]:
np.array(a)

array([list([1, 2, 3]), list([2, 3])], dtype=object)