In [125]:
import pandas as pd
import os

source_lang = 'en'
target_lang = 'vi'
data_dir = 'data/'
from torch import optim
import torch.nn.functional as F


In [78]:
def load_train(folder="./data/", rows=100000):
    for file in os.listdir(folder):
        file_path = os.path.join(os.path.abspath(folder), file)
        if file_path.__contains__("train"):
            if file_path.endswith(source_lang):
                file_en = open(file_path)
                dataset_en = _read_file(file_en)
            elif file_path.endswith(target_lang):
                file_vi = open(file_path)
                dataset_vi = _read_file(file_vi)
    if rows != -1:
        return [[dataset_en[i], dataset_vi[i]] for i in range(len(dataset_en))]
    return dataset_en, dataset_vi

In [79]:
def _read_file(file):

    lines = file.readlines()
    lst_lines = [x.strip() for x in lines]
    return lst_lines

In [80]:
train_data = load_train()

In [81]:
corpus = pd.DataFrame(train_data) 

In [82]:
# corpus[corpus[0].map(len) < 100]

In [83]:
# corpus[corpus[1].map(len) < 100]

In [84]:
SOS_token = '<start>'
EOS_token = '<end>'
UNK_token = '<unk>'
PAD_token = '<pad>'

SOS_idx = 0
EOS_idx = 1
UNK_idx = 2
PAD_idx = 3

class Vocab:
    def __init__(self):
        self.index2word = {
            SOS_idx: SOS_token,
            EOS_idx: EOS_token,
            UNK_idx: UNK_token,
            PAD_idx: PAD_token
        }
        self.word2index = {v: k for k, v in self.index2word.items()}

    def index_words(self, words):
        for word in words:
            self.index_word(word)

    def index_word(self, word):
        if word not in self.word2index:
            n_words = len(self)
            self.word2index[word] = n_words
            self.index2word[n_words] = word

    def __len__(self):
        assert len(self.index2word) == len(self.word2index)
        return len(self.index2word)

    def unidex_words(self, indices):
        return [self.index2word[i] for i in indices]

    def to_file(self, filename):
        values = [w for w, k in sorted(list(self.word2index.items())[5:])]
        with open(filename, 'w') as f:
            f.write('\n'.join(values))

    @classmethod
    def from_file(cls, filename):
        vocab = Vocab()
        with open(filename, 'r') as f:
            words = [l.strip() for l in f.readlines()]
            vocab.index_words(words)

In [85]:
import nltk
import pandas as pd

max_length = 300
min_word_count = 1

tokenizers = {
    'en': nltk.tokenize.WordPunctTokenizer().tokenize,
    'vi': nltk.tokenize.WordPunctTokenizer().tokenize
}

def preprocess_corpus(sents, tokenizer, min_word_count):
    n_words = {}

    sents_tokenized = []
    for sent in sents:
        sent_tokenized = [w.lower() for w in tokenizer(sent)]

        sents_tokenized.append(sent_tokenized)

        for word in sent_tokenized:
            if word in n_words:
                n_words[word] += 1
            else:
                n_words[word] = 1

    for i, sent_tokenized in enumerate(sents_tokenized):
        sent_tokenized = [t if n_words[t] >= min_word_count else UNK_token for t in sent_tokenized]
        sents_tokenized[i] = sent_tokenized

    return sents_tokenized

def read_vocab(sents):
    vocab = Vocab()
    for sent in sents:
        vocab.index_words(sent)

    return vocab

source_sents = preprocess_corpus(corpus[0], tokenizers[source_lang], min_word_count)
target_sents = preprocess_corpus(corpus[1], tokenizers[target_lang], min_word_count)
print("preprocessing complete")

source_vocab = read_vocab(source_sents)
target_vocab = read_vocab(target_sents)
print("vocab created")
target_vocab.to_file(os.path.join(data_dir, '{}.vocab.txt'.format(target_lang)))
source_vocab.to_file(os.path.join(data_dir, '{}.vocab.txt'.format(source_lang)))
print("vocab saved to file")
print('Corpus length: {}\nSource vocabulary size: {}\nTarget vocabulary size: {}'.format(
    len(source_sents), len(source_vocab.word2index), len(target_vocab.word2index)
))
examples = list(zip(source_sents, target_sents))[80:90]
for source, target in examples:
    print('Source: "{}", target: "{}"'.format(' '.join(source), ' '.join(target)))

preprocessing complete
vocab created
vocab saved to file
Corpus length: 133317
Source vocabulary size: 42172
Target vocabulary size: 19818
Source: "chronic pain is an example . if you burn yourself , you pull your hand away .", target: "đau kinh niên là một ví dụ . nếu bạn phỏng , bạn sẽ giật tay ra xa ."
Source: "but if you & apos ; re still in pain in six months & apos ; or six years & apos ; time , it & apos ; s because these circuits are producing pain that & apos ; s no longer helping you .", target: "nhưng nếu trong sáu tháng , hay sáu năm , cơn đau vẫn không dứt , đó là vì những vòng tuần hoàn này đang sản xuất ra cơn đau chống lại bạn ."
Source: "if we can look at the activation in the brain that & apos ; s producing the pain , we can form 3d models and watch in real time the brain process information , and then we can select the areas that produce the pain .", target: "nếu ta có thể nhìn vào các xung kích hoạt của não sản xuất ra cơn đau , ta có thể lập ra các mô hình 3 chiều 

In [86]:
import numpy as np

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

source_length = len(source_sents)
inidices = np.random.permutation(source_length)

training_indices = inidices[:int(source_length*0.94)]
dev_indices = inidices[int(source_length*0.8):int(source_length*0.99)]
test_indices = inidices[int(source_length*0.99):]

training_source = [source_sents[i] for i in training_indices]
dev_source = [source_sents[i] for i in dev_indices]
test_source = [source_sents[i] for i in test_indices]

training_target = [target_sents[i] for i in training_indices]
dev_target = [target_sents[i] for i in dev_indices]
test_target = [target_sents[i] for i in test_indices]

# Unwrap training examples
training_t = []
training_s = []
for source, tt in zip(training_source, training_target):
    training_t.append(tt)
    training_s.append(source)

training_source = training_s
training_target = training_t

In [87]:
print(training_t[0])
print(training_s[0])

['nên', 'trong', 'lúc', 'chúng', 'ta', 'đang', 'phải', 'đối', 'mặt', 'với', 'nhiều', 'cuộc', 'khủng', 'hoảng', 'cùng', 'một', 'lúc', 'trên', 'thế', 'giới', ',', 'điều', 'tốt', 'với', 'chúng', 'ta', 'trên', 'phương', 'diện', 'cá', 'nhân', ',', 'điều', 'mà', 'sẽ', 'đem', 'lại', 'cho', 'chúng', 'ta', 'niềm', 'vui', ',', 'lòng', 'biết', 'ơn', ',', 'hiệu', 'quả', 'trong', 'cuộc', 'sống', 'chúng', 'ta', 'và', 'sẽ', 'là', 'điều', 'tốt', 'nhất', 'đối', 'với', 'nghề', 'nghiệp', 'của', 'chính', 'chúng', 'ta', 'cũng', 'chính', 'là', 'điều', 'tốt', 'nhất', 'đối', 'với', 'thế', 'giới', '.']
['so', 'as', 'we', 'are', 'facing', 'all', 'the', 'multiple', 'crises', 'in', 'our', 'world', 'at', 'the', 'moment', ',', 'what', 'is', 'good', 'for', 'us', 'on', 'a', 'personal', 'level', ',', 'what', '&', 'apos', ';', 's', 'going', 'to', 'bring', 'more', 'joy', ',', 'gratitude', ',', 'effectiveness', 'in', 'our', 'lives', 'and', 'be', 'the', 'best', 'for', 'our', 'own', 'careers', 'is', 'also', 'what', 'is', '

In [88]:

import torch

def indexes_from_sentence(vocab, sentence):
    return [vocab.word2index[word] for word in sentence]

def tensor_from_sentence(vocab, sentence, max_seq_length):
#     print(sentence)
#     print("sentence over")
    indexes = indexes_from_sentence(vocab, sentence)
    indexes.append(EOS_idx)
    indexes.insert(0, SOS_idx)
    # we need to have all sequences the same length to process them in batches
    if len(indexes) < max_seq_length:
        indexes += [PAD_idx] * (max_seq_length - len(indexes))
    tensor = torch.LongTensor(indexes)
    return tensor

def tensors_from_pair(source_sent, target_sent, max_seq_length):
    source_tensor = tensor_from_sentence(source_vocab, source_sent, max_seq_length).unsqueeze(1)
    target_tensor = tensor_from_sentence(target_vocab, target_sent, max_seq_length).unsqueeze(1)
    return (source_tensor, target_tensor)

max_seq_length = max_length + 2  # 2 for EOS_token and SOS_token

training = []
for source_sent, target_sent in zip(training_source, training_target):
    training.append(tensors_from_pair(source_sent, target_sent, max_seq_length))




In [89]:
x_training, y_training = zip(*training)

In [90]:
x_training = list(x_training)
y_training = list(y_training)
new_x = []
new_y = []
for i in range(len(x_training)):
    if x_training[i].size()[0] == 302:
        new_x.append(x_training[i])
        new_y.append(y_training[i])
    else:
        print(i)
x_training = tuple(new_x)
y_training = tuple(new_y)

1210
1875
18451
37013
42232
47921
53343
54855
55543
56447
61364
70397
73762
76080
78750
79818
82628
83594
104615
106694
114167
116373
124865


In [91]:
new_x = []
new_y = []
for i in range(len(dev_source)):
    if len(dev_source[i]) <= 302:
        new_x.append(dev_source[i])
        new_y.append(dev_target[i])
    else:
        print(i)
dev_source = tuple(new_x)
dev_target = tuple(new_y)

41
7514
9720
18212
24467


In [92]:
new_x = []
new_y = []
for i in range(len(y_training)):
    if y_training[i].size()[0] == 302:
        new_x.append(x_training[i])
        new_y.append(y_training[i])
    else:
        print(i)
x_training = tuple(new_x)
y_training = tuple(new_y)

4856
15532
19300
32354
57827


In [93]:
for i in range(len(x_training)):
    if x_training[i].size()[0] != 302:
        print(x_training[i].size()[0])

In [94]:
x_training = torch.transpose(torch.cat(x_training, dim=-1), 1, 0)
y_training = torch.transpose(torch.cat(y_training, dim=-1), 1, 0)
torch.save(x_training, os.path.join(data_dir, 'x_training.bin'))
torch.save(y_training, os.path.join(data_dir, 'y_training.bin'))

x_development = []
for source_sent in dev_source:
    tensor = tensor_from_sentence(source_vocab, source_sent, max_seq_length).unsqueeze(1)
    x_development.append(tensor)

x_development = torch.transpose(torch.cat(x_development, dim=-1), 1, 0)
torch.save(x_development, os.path.join(data_dir, 'x_development.bin'))

x_test = []
for source_sent in test_source:
    tensor = tensor_from_sentence(source_vocab, source_sent, max_seq_length).unsqueeze(1)
    x_test.append(tensor)

x_test = torch.transpose(torch.cat(x_test, dim=-1), 1, 0)
torch.save(x_test, os.path.join(data_dir, 'x_test.bin'))

USE_CUDA = False
if USE_CUDA:
    x_training = x_training.cuda()
    y_training = y_training.cuda()
    x_development = x_development.cuda()
    x_test = x_test.cuda()

In [95]:
import torch.nn as nn
import torch.nn.init as init

class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [152]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [153]:
USE_CUDA = False
device="cpu"
if USE_CUDA:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    x_training = x_training.cuda()
    y_training = y_training.cuda()
    x_development = x_development.cuda()
    x_test = x_test.cuda()

In [154]:
import math

def batch_generator(batch_indices, batch_size):
    batches = math.ceil(len(batch_indices)/batch_size)
    for i in range(batches):
        batch_start = i*batch_size
        batch_end = (i+1)*batch_size
        if batch_end > len(batch_indices):
            yield batch_indices[batch_start:]
        else:
            yield batch_indices[batch_start:batch_end]

In [155]:
cross_entropy = nn.CrossEntropyLoss()

In [156]:
from nltk.translate.bleu_score import corpus_bleu

def bleu(n):
    weights = [1.0/n]*n + [0.0]*(4-n)
    return lambda list_of_references, list_of_hypothesis: corpus_bleu(list_of_references, list_of_hypothesis, weights)

def accuracy(list_of_references, list_of_hypothesis):
    total = 0.0
    for references, hypothesis in zip(list_of_references, list_of_hypothesis):
        total += 1.0 if tuple(hypothesis) in set(references) else 0.0
    return total / len(list_of_references)

score_functions = {'BLEU-{}'.format(i):bleu(i) for i in range(1, 5)}
score_functions['Accuracy'] = accuracy

def score(model, X, target, desc='Scoring...'):
    scores = {name:0.0 for name in score_functions.keys()}
    length = len(target)
    list_of_hypothesis = []
    for i, x in tqdm(enumerate(X),
                     desc=desc,
                     total=length):
        y = model(x.unsqueeze(0))
        hypothesis = target_vocab.unidex_words(y[1:-1])  # Remove SOS and EOS from y
        list_of_hypothesis.append(hypothesis)

    for name, func in score_functions.items():
        score = func(target, list_of_hypothesis)
        scores[name] = score

    return scores

In [166]:
from tqdm import tqdm_notebook as tqdm

BATCH_SIZE = 100
total_batches = int(len(x_training)/BATCH_SIZE) + 1
indices = list(range(len(x_training)))

early_stop_after = 10
early_stop_counter = 0
best_model = None

best_score = 0.0
scoring_metric = 'BLEU-1'
scores_history = []
loss_history = []

teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=max_length):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)
    print(input_tensor.size())
    print(target_tensor.size())
    print("ip length ", input_length)
    print("tg length ", target_length)
    encoder_outputs = torch.zeros(max_length+2, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_idx]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            loss += criterion(decoder_output, target_tensor[di].unsqueeze(0))
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di].unsqueeze(0))
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [167]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [168]:
import random
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    input_tensors = [random.choice(x_training) for i in range(n_iters)]
    target_tensors = [random.choice(y_training) for i in range(n_iters)]
    criterion = nn.NLLLoss()
    print(len(input_tensors))
    for iter in range(1, n_iters + 1):
        input_tensor = input_tensors[iter - 1]
        print(input_tensor.size())
        target_tensor = target_tensors[iter - 1]
        
        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss
        print(loss)
        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

In [169]:
hidden_size = 256
encoder1 = EncoderRNN(len(source_vocab), hidden_size).to(device)
decoder1 = DecoderRNN(hidden_size, len(target_vocab)).to(device)

trainIters(encoder1, decoder1, 75000, print_every=5000)
torch.save([encoder1,decoder1],'./model/nmt.pkl')

75000
torch.Size([302])
torch.Size([302])
torch.Size([302])
ip length  302
tg length  302
9.778472294081126
torch.Size([302])
torch.Size([302])
torch.Size([302])
ip length  302
tg length  302


KeyboardInterrupt: 