In [1]:
# Colab-only: attach Google Drive so files stored there are reachable from the runtime.
from google.colab import drive

drive_mount_point = '/content/drive'
# force_remount=True asks Colab to mount again even if a mount is already present.
drive.mount(drive_mount_point, force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
cd 'drive/My Drive/Colab Notebooks/machine_translation'

/content/drive/My Drive/Colab Notebooks/machine_translation


In [0]:
from dataset import MTDataset
from model import Encoder, Decoder
from language import Language
from utils import preprocess
from train import train
from eval import validate
from translate import translate

In [0]:
# Load the parallel English (.en) / Vietnamese (.vi) files for training and validation.
# NOTE(review): max_len is forwarded to utils.preprocess; presumably a per-sentence
# length cap -- confirm against the helper.
max_sentence_len = 20

sentences_inp_train, sentences_trg_train = preprocess(
    'datasets/train/train.en',
    'datasets/train/train.vi',
    max_len=max_sentence_len,
)
sentences_inp_val, sentences_trg_val = preprocess(
    'datasets/dev/tst2012.en',
    'datasets/dev/tst2012.vi',
    max_len=max_sentence_len,
)

In [0]:
# Wrap the training sentences of each side in a Language object; later cells read
# word2id, id2word, vocab_size, max_len and wordvec from these.
train_inp, train_trg = Language(sentences_inp_train), Language(sentences_trg_train)

In [0]:
# Validation sides are built with train=False and handed the training lookup tables,
# presumably so validation sentences are encoded with the training ids -- confirm in Language.
val_inp = Language(
    sentences_inp_val,
    train=False,
    word2id=train_inp.word2id,
    id2word=train_inp.id2word,
)
val_trg = Language(
    sentences_trg_val,
    train=False,
    word2id=train_trg.word2id,
    id2word=train_trg.id2word,
)

In [0]:
# Pair the source and target `wordvec` of each split into one MTDataset per split.
train_set, val_set = (
    MTDataset(lang_inp.wordvec, lang_trg.wordvec)
    for lang_inp, lang_trg in ((train_inp, train_trg), (val_inp, val_trg))
)

In [0]:
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
from torch.optim.lr_scheduler import StepLR

In [0]:
# Both loaders use the same batch size; only the training loader shuffles.
BATCH_SIZE = 64

train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE)

In [0]:
# Sequence lengths and vocabulary sizes are read from the training Language objects.
Tx = train_inp.max_len
Ty = train_trg.max_len
vocab_size_inp = train_inp.vocab_size
vocab_size_trg = train_trg.vocab_size

# Model sizes passed to Encoder / Decoder.
embedding_dim, hidden_size = 256, 1024

In [0]:
# Use the GPU when torch can see one; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [0]:
# Instantiate both networks, then move each one to the selected device.
# Note the differing argument order between the Encoder and Decoder constructors.
encoder = Encoder(vocab_size_inp, embedding_dim, hidden_size)
encoder = encoder.to(device=device)

decoder = Decoder(hidden_size, vocab_size_trg, embedding_dim)
decoder = decoder.to(device=device)

In [0]:
# A single Adam optimizer (default hyper-parameters) updates both networks.
trainable_params = [*encoder.parameters(), *decoder.parameters()]
optimizer = torch.optim.Adam(params=trainable_params)

# NOTE(review): no ignore_index is set, so padded positions count towards the loss
# unless train() masks them -- confirm.
criterion = nn.CrossEntropyLoss()

# Multiply the learning rate by 0.5 every 2 scheduler steps.
scheduler = StepLR(optimizer, step_size=2, gamma=0.5)

In [14]:
# Run training. The two integers are passed positionally to train().
# NOTE(review): 10 is presumably the epoch count, and 200 matches the spacing of the
# "Iter N" lines in the log below -- confirm both against train.train's signature.
num_epochs = 10
log_every = 200

train(
    encoder,
    decoder,
    train_loader,
    val_loader,
    optimizer,
    criterion,
    train_trg.id2word,
    scheduler,
    num_epochs,
    log_every,
    device,
)

Epoch  1
Iter 0, loss = 9.005056
Iter 200, loss = 2.832219
Iter 400, loss = 2.161599
Iter 600, loss = 2.072055
Iter 800, loss = 2.183336
Iter 1000, loss = 1.953040
Validation BLEU score: 0.110472

Epoch  2
Iter 0, loss = 1.553245
Iter 200, loss = 1.450432
Iter 400, loss = 1.463465
Iter 600, loss = 1.509131
Iter 800, loss = 1.661497
Iter 1000, loss = 1.494548
Validation BLEU score: 0.138041

Epoch  3
Iter 0, loss = 1.073714
Iter 200, loss = 0.982473
Iter 400, loss = 0.938613
Iter 600, loss = 1.045970
Iter 800, loss = 1.044115
Iter 1000, loss = 1.120787
Validation BLEU score: 0.157240

Epoch  4
Iter 0, loss = 0.747102
Iter 200, loss = 0.649947
Iter 400, loss = 0.648416
Iter 600, loss = 0.731301
Iter 800, loss = 0.729189
Iter 1000, loss = 0.672074
Validation BLEU score: 0.157583

Epoch  5
Iter 0, loss = 0.427079
Iter 200, loss = 0.404889
Iter 400, loss = 0.375883
Iter 600, loss = 0.342018
Iter 800, loss = 0.421010
Iter 1000, loss = 0.396873
Validation BLEU score: 0.154613

Epoch  6
Iter 0

In [0]:
# Write each network's state_dict (weights only) to a file in the current directory.
for module, checkpoint_path in ((encoder, 'encoder.pth'), (decoder, 'decoder.pth')):
    torch.save(module.state_dict(), checkpoint_path)

In [0]:
import string

# Characters removed from the raw sentence: ASCII punctuation and digits.
exclude = [*string.punctuation, *string.digits]

test_sen = 'hello i am a student'
# Drop excluded characters, trim surrounding whitespace and lowercase.
test_sen = ''.join(filter(lambda char: char not in exclude, test_sen))
test_sen = test_sen.strip().lower()
# Surround with the sentence boundary tokens.
test_sen = '<START> ' + test_sen + ' <END>'

# Right-pad with <PAD> tokens up to train_inp.max_len; when the sentence is already
# longer, diff is negative and nothing is appended (no truncation happens).
length = len(test_sen.split())
diff = train_inp.max_len - length
test_sen += ' <PAD>' * diff

In [0]:
# Map every token to its source-vocabulary id. word2id is indexed directly, so a token
# that is not in the vocabulary is not handled here.
test_vec = [train_inp.word2id[s] for s in test_sen.split()]

# Build the ids as an int64 tensor on the notebook-wide `device` (previously a float
# tensor cast afterwards and pinned to 'cuda', which fails on a CPU-only runtime),
# then add a leading batch dimension of size 1.
test_tensor = torch.tensor(test_vec, dtype=torch.long, device=device).unsqueeze(0)

In [54]:
# Greedy decoding of `test_tensor`: at each step the decoder is fed its own highest-scoring
# prediction, and the predicted target words are printed until <END> is produced or
# Ty - 1 steps have run.
with torch.no_grad():
    encoder.eval()
    decoder.eval()
    enc_out, enc_hidden_backward, enc_hidden_forward = encoder(test_tensor)
    # Only the backward hidden state seeds the decoder; the forward one is unused here.
    dec_hidden = enc_hidden_backward
    # Created on the notebook-wide `device` rather than a hard-coded 'cuda', so the cell
    # also runs on a CPU-only runtime.
    dec_input = torch.tensor([train_trg.word2id['<START>']], dtype=torch.long, device=device)
    for t in range(1, Ty):
        out, dec_hidden = decoder(dec_input, dec_hidden, enc_out)
        # Index of the maximum along the last dimension becomes the next decoder input.
        dec_input = torch.max(out, dim=-1)[1].squeeze(1)
        # .item() gives a plain Python int, which works whether id2word is a list or a
        # dict (a 0-d numpy array is not hashable and cannot be used as a dict key).
        next_id = dec_input.item()
        next_word = train_trg.id2word[next_id]
        if next_word == '<END>':
            break
        print(next_word)

xin
chào
tôi
là
một
sinh
viên


In [58]:
# Translate one English sentence with the project helper.
# NOTE(review): the positional 20 equals the max_len used for preprocessing; presumably
# a length limit -- confirm against translate.translate.
translate(
    'i am a student',
    train_inp.word2id,
    train_trg.word2id,
    train_trg.id2word,
    encoder,
    decoder,
    20,
    device,
)

'tôi là một sinh viên'

In [40]:
# Restore the decoder weights from 'decoder.pth'.
# map_location=device remaps the stored tensors onto the device selected by this
# notebook, so a checkpoint written on a GPU runtime also loads on a CPU-only one.
# NOTE(review): only the decoder is restored in this cell; 'encoder.pth' is not reloaded.
decoder.load_state_dict(torch.load('decoder.pth', map_location=device))

<All keys matched successfully>

In [48]:
# Spot check: show the source-vocabulary word stored under one arbitrary id.
word_id = 4112
train_inp.id2word[word_id]

'worker'

In [0]:
# Spot check: display the first target-side training sentence.
first_target_sentence = train_trg.sentences[0]
first_target_sentence

In [0]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

In [0]:
# Run the project validation helper and unpack its three results.
# NOTE(review): validate is called with no arguments here, while train() above is given
# the models, loader and device explicitly -- confirm eval.validate accepts that.
validation_result = validate()
ref, hyp, bleu = validation_result

In [0]:
# Display the first element of `hyp`.
first_hypothesis = hyp[0]
first_hypothesis

In [0]:
# Toy BLEU example: two whitespace-tokenised references and one hypothesis.
# Note that `hyp` is rebound here to a plain token list.
ref1, ref2, hyp = (
    text.split()
    for text in (
        'the cat is on the mat',
        'there is a cat on the mat',
        'the cat the cat on the mat',
    )
)

In [0]:
# Corpus-level BLEU for a single hypothesis that has two references.
single_references = [[ref1, ref2]]
single_hypotheses = [hyp]
corpus_bleu(single_references, single_hypotheses)

In [0]:
# Second toy BLEU example: two whitespace-tokenised references and one hypothesis.
ref3, ref4, hyp2 = (
    text.split()
    for text in (
        'i am student ngo anh tu',
        'my name is student ngo anh tu',
        'there is a student ngo anh tu',
    )
)

In [0]:
# Corpus-level BLEU over both toy examples; each hypothesis is paired by position with
# its own list of references.
paired_references = [[ref1, ref2], [ref3, ref4]]
paired_hypotheses = [hyp, hyp2]
corpus_bleu(paired_references, paired_hypotheses)

In [0]:
# sentence_bleu is not among the names imported from nltk elsewhere in this notebook, so
# it is imported here; without it the call raises NameError.
from nltk.translate.bleu_score import sentence_bleu

# Sentence-level BLEU of the first toy hypothesis against its two references.
sentence_bleu([ref1, ref2], hyp)

In [0]:
# sentence_bleu is not among the names imported from nltk elsewhere in this notebook, so
# it is imported here; without it the call raises NameError.
from nltk.translate.bleu_score import sentence_bleu

# Sentence-level BLEU of the second toy hypothesis against its two references.
sentence_bleu([ref3, ref4], hyp2)

In [0]:
# Re-run the validation helper and display whatever it returns.
# NOTE(review): called without arguments -- confirm eval.validate supports that.
validation_output = validate()
validation_output