In [1]:
!pip install datasets



In [2]:
!pip install vncorenlp
!pip install iteration_utilities



In [3]:
!pip install torchtext==0.6



In [4]:
!pip install pyvi



In [5]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as T
from torch.nn.utils.rnn import pad_sequence
from PIL import Image
from tqdm import tqdm
import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
import re
from collections import Counter
from vncorenlp import VnCoreNLP
import torchtext
from torchtext.data import Field, TabularDataset, BucketIterator, Iterator
from torchtext import datasets
from sklearn.model_selection import train_test_split
import time
import math

In [6]:
import random
SEED = 2222
random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7e89dc9e8ed0>

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device
spacy_en = spacy.load('en_core_web_sm')


In [8]:
from pyvi import ViTokenizer


In [9]:
from iteration_utilities import deepflatten
def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [10]:
text_en = 'Please put the dustpan in the broom closet'
print(tokenize_en(text_en))

['Please', 'put', 'the', 'dustpan', 'in', 'the', 'broom', 'closet']


In [11]:
def tokenize_vi(text):
  tokens = ViTokenizer.tokenize(text).split()
  return tokens

In [12]:
text_vi = 'cuốn sách này là của tôi. Của bạn đâu?'
print(tokenize_vi(text_vi))

['cuốn', 'sách', 'này', 'là', 'của', 'tôi', '.', 'Của', 'bạn', 'đâu', '?']


In [13]:
source = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True)
target = Field(tokenize=tokenize_vi, init_token='<sos>', eos_token='<eos>', lower=True)

In [14]:
fields = {"English": ("src", source), "Vietnamese": ("trg", target)}

## split

In [25]:
from datasets import load_from_disk
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
dataset = load_from_disk("/content/mt-en-vi")
train = pd.DataFrame(dataset['train'][:200000]).drop(columns=['source'])
val = pd.DataFrame(dataset['validation']).drop(columns=['source'])
test = pd.DataFrame(dataset['test']).drop(columns=['source'])

train = train.rename(columns={'en': 'English', 'vi': 'Vietnamese'})
val = val.rename(columns={'en': 'English', 'vi': 'Vietnamese'})
test = test.rename(columns={'en': 'English', 'vi': 'Vietnamese'})


In [26]:
train.head(5)

Unnamed: 0,English,Vietnamese
0,"- Sorry, that question's not on here.","- Xin lỗi, nhưng mà ở đây không có câu hỏi đấy."
1,He wants you to come with him immediately.,Ông ấy muốn bố đi với ông ấy ngay lập tức
2,I thought we could use some company.,Tôi nghĩ chúng ta có thể muốn vài người bạn đồ...
3,It was founded in 2008 by this anonymous progr...,Nó được sáng lập vào năm 2008 bởi một lập trìn...
4,"With both of these methods, no two prints are ...","Với cả hai phương pháp, không có hai bản in nà..."


In [28]:
train.to_json("train.json", orient="records", lines=True)
test.to_json("test.json", orient="records", lines=True)
val.to_json("val.json", orient="records", lines=True)

In [29]:
train_data, test_data, val_data = TabularDataset.splits(
    path="./", train="train.json", test="test.json", validation ="val.json", format="json", fields=fields
)

In [30]:
len(train_data)

200000

In [31]:
len(test_data)

11316

In [32]:
len(val_data)

11225

In [33]:
cnt = 0
for example in train_data.examples:  # chỉ lấy 5 mẫu đầu tiên
    #print(example.trg)
    cnt = cnt + 1

print(cnt)

200000


In [34]:
source.build_vocab(train_data, max_size=10000, min_freq=2)
target.build_vocab(train_data, max_size=10000, min_freq=2)

In [35]:
BATCH_SIZE = 32
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, val_data, test_data), batch_size=BATCH_SIZE, sort_key = lambda x: len(x.src),
    sort_within_batch=True, device=device)

test_batch = next(iter(test_iterator))
test_batch


[torchtext.data.batch.Batch of size 32]
	[.src]:[torch.cuda.LongTensor of size 6x32 (GPU 0)]
	[.trg]:[torch.cuda.LongTensor of size 10x32 (GPU 0)]

In [36]:
# adjustable parameters
INPUT_DIM = len(source.vocab)
OUTPUT_DIM = len(target.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

In [37]:
INPUT_DIM, OUTPUT_DIM

(10004, 10004)

In [38]:
class Encoder(nn.Module):
    def __init__(self, input_dim: int, emb_dim: int, hid_dim: int, n_layers: int, dropout: float):
        super().__init__()
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.input_dim = input_dim
        self.n_layers = n_layers
        self.dropout = dropout

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)

    def forward(self, src_batch: torch.LongTensor):
        embedded = self.embedding(src_batch) # [sent len, batch size, emb dim]
        outputs, (hidden, cell) = self.rnn(embedded)
        # outputs -> [sent len, batch size, hidden dim * n directions]
        return hidden, cell

In [39]:
encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT).to(device)
hidden, cell = encoder(test_batch.src)
hidden.shape, cell.shape

(torch.Size([2, 32, 512]), torch.Size([2, 32, 512]))

In [40]:
class Decoder(nn.Module):
    def __init__(self, output_dim: int, emb_dim: int, hid_dim: int, n_layers: int, dropout: float):
        super().__init__()
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.output_dim = output_dim
        self.n_layers = n_layers
        self.dropout = dropout

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        self.out = nn.Linear(hid_dim, output_dim)

    def forward(self, trg: torch.LongTensor, hidden: torch.FloatTensor, cell: torch.FloatTensor):
        # [1, batch size, emb dim], the 1 serves as sent len
        embedded = self.embedding(trg.unsqueeze(0))
        outputs, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.out(outputs.squeeze(0))
        return prediction, hidden, cell

In [41]:
decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT).to(device)

# notice that we are not passing the entire the .trg
prediction, hidden, cell = decoder(test_batch.trg[0], hidden, cell)
prediction.shape, hidden.shape, cell.shape

(torch.Size([32, 10004]), torch.Size([2, 32, 512]), torch.Size([2, 32, 512]))

In [42]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder: Encoder, decoder: Decoder, device: torch.device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        assert encoder.hid_dim == decoder.hid_dim, \
            'Hidden dimensions of encoder and decoder must be equal!'
        assert encoder.n_layers == decoder.n_layers, \
            'Encoder and decoder must have equal number of layers!'

    def forward(self, src_batch: torch.LongTensor, trg_batch: torch.LongTensor,
                teacher_forcing_ratio: float=0.5):

        max_len, batch_size = trg_batch.shape
        trg_vocab_size = self.decoder.output_dim

        # tensor to store decoder's output
        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)

        # last hidden & cell state of the encoder is used as the decoder's initial hidden state
        hidden, cell = self.encoder(src_batch)

        trg = trg_batch[0]
        for i in range(1, max_len):
            prediction, hidden, cell = self.decoder(trg, hidden, cell)
            outputs[i] = prediction

            if random.random() < teacher_forcing_ratio:
                trg = trg_batch[i]
            else:
                trg = prediction.argmax(1)

        return outputs

In [43]:
encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
seq2seq = Seq2Seq(encoder, decoder, device).to(device)
seq2seq

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(10004, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
  )
  (decoder): Decoder(
    (embedding): Embedding(10004, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (out): Linear(in_features=512, out_features=10004, bias=True)
  )
)

In [44]:
outputs = seq2seq(test_batch.src, test_batch.trg)
outputs.shape

torch.Size([10, 32, 10004])

In [45]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(seq2seq):,} trainable parameters')

The model has 17,610,516 trainable parameters


In [46]:
optimizer = optim.Adam(seq2seq.parameters())

# ignore the padding index when calculating the loss
PAD_IDX = target.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [47]:
def train(seq2seq, iterator, optimizer, criterion):
    seq2seq.train()

    epoch_loss = 0
    for batch in iterator:
        optimizer.zero_grad()
        outputs = seq2seq(batch.src, batch.trg)

        # 1. as mentioned in the seq2seq section, we will
        # cut off the first element when performing the evaluation
        # 2. the loss function only works on 2d inputs
        # with 1d targets we need to flatten each of them
        outputs_flatten = outputs[1:].view(-1, outputs.shape[-1])
        trg_flatten = batch.trg[1:].view(-1)
        loss = criterion(outputs_flatten, trg_flatten)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [48]:
def evaluate(seq2seq, iterator, criterion):
    seq2seq.eval()

    epoch_loss = 0
    with torch.no_grad():
        for batch in iterator:
            # turn off teacher forcing
            outputs = seq2seq(batch.src, batch.trg, teacher_forcing_ratio=0)

            # trg = [trg sent len, batch size]
            # output = [trg sent len, batch size, output dim]
            outputs_flatten = outputs[1:].view(-1, outputs.shape[-1])
            trg_flatten = batch.trg[1:].view(-1)
            loss = criterion(outputs_flatten, trg_flatten)
            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [49]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [50]:
N_EPOCHS = 5
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train(seq2seq, train_iterator, optimizer, criterion)
    valid_loss = evaluate(seq2seq, valid_iterator, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(seq2seq.state_dict(), 'tut1-model.pt')

    # it's easier to see a change in perplexity between epoch as it's an exponential
    # of the loss, hence the scale of the measure is much bigger
    print(f'Epoch: {epoch+1} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 1 | Time: 11m 9s
	Train Loss: 5.146 | Train PPL: 171.756
	 Val. Loss: 5.083 |  Val. PPL: 161.287
Epoch: 2 | Time: 10m 30s
	Train Loss: 4.521 | Train PPL:  91.904
	 Val. Loss: 4.806 |  Val. PPL: 122.191
Epoch: 3 | Time: 10m 44s
	Train Loss: 4.230 | Train PPL:  68.727
	 Val. Loss: 4.646 |  Val. PPL: 104.124
Epoch: 4 | Time: 10m 42s
	Train Loss: 4.035 | Train PPL:  56.566
	 Val. Loss: 4.582 |  Val. PPL:  97.714
Epoch: 5 | Time: 10m 30s
	Train Loss: 3.894 | Train PPL:  49.088
	 Val. Loss: 4.521 |  Val. PPL:  91.907


# Test

In [51]:
seq2seq.load_state_dict(torch.load('tut1-model.pt'))

test_loss = evaluate(seq2seq, test_iterator, criterion)
print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 4.523 | Test PPL:  92.085 |


In [52]:
example_idx = 100
example = test_data.examples[example_idx]
print('source sentence: ', ' '.join(example.src))
print('target sentence: ', ' '.join(example.trg))

source sentence:  men fall from the sky . and gods hurl thunderbolts .
target sentence:  người rơi từ trên trời xuống , các vị thần phóng ra sét .


In [53]:
src_tensor = source.process([example.src]).to(device)
trg_tensor = target.process([example.trg]).to(device)
print(trg_tensor.shape)

seq2seq.eval()
with torch.no_grad():
    outputs = seq2seq(src_tensor, trg_tensor, teacher_forcing_ratio=0.5)

outputs.shape

torch.Size([16, 1])


torch.Size([16, 1, 10004])

In [54]:
output_idx = outputs[1:].squeeze(1).argmax(1)
' '.join([target.vocab.itos[idx] for idx in output_idx])

'người của từ từ , , và và <unk> thần <unk> <unk> . . <eos>'

In [55]:
import torch
import torch.nn.functional as F
import spacy
from torchtext.data import Field
from torchtext.data.utils import get_tokenizer
from torchtext.data.metrics import bleu_score
import pandas as pd

# Assuming you've defined your Encoder, Decoder, and Seq2Seq models already

def predict_example(sentence, src_field, trg_field, model, device, max_len=50):
    model.eval()

    # Tokenize the input sentence
    if isinstance(sentence, str):
        tokens = src_field.tokenize(sentence)
    else:
        tokens = [token.lower() for token in sentence]

    # Numericalize the tokens
    numericalized_tokens = [src_field.vocab.stoi[token] for token in tokens]
    tensor = torch.LongTensor(numericalized_tokens).unsqueeze(1).to(device)  # shape: (src_len, 1)

    # Forward pass through encoder
    with torch.no_grad():
        hidden, cell = model.encoder(tensor)

    # Prepare the input to the decoder
    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]

    for _ in range(max_len):
        trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)  # shape: (1)

        # Forward pass through decoder
        with torch.no_grad():
            output, hidden, cell = model.decoder(trg_tensor, hidden, cell)

        # Get most likely word index from output
        pred_token = output.argmax(1).item()

        # Append prediction to current output prediction
        trg_indexes.append(pred_token)

        # Stop appending if we predicted the end of sentence token
        if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
            break

    # Convert numerical indices to tokens
    trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]

    # Return the tokens, excluding the start of sequence token
    return trg_tokens[1:]

# Example usage:

# Load your trained model
encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT).to(device)
decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT).to(device)
seq2seq = Seq2Seq(encoder, decoder, device).to(device)
seq2seq.load_state_dict(torch.load('tut1-model.pt'))

# Define the source and target fields
source = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True)
target = Field(tokenize=tokenize_vi, init_token='<sos>', eos_token='<eos>', lower=True)

# Load vocabularies
source.vocab = train_data.fields['src'].vocab
target.vocab = train_data.fields['trg'].vocab

In [56]:
# Example sentence to predict
example_sentence = "who am I?"

# Predict
predicted_sentence = predict_example(example_sentence, source, target, seq2seq, device)
print("Predicted:", ' '.join(predicted_sentence))

Predicted: ai là ai là ai ? <eos>


In [57]:
# Example sentence to predict
example_sentence = "what is your name?"

# Predict
predicted_sentence = predict_example(example_sentence, source, target, seq2seq, device)
print("Predicted:", ' '.join(predicted_sentence))

Predicted: tên của là gì là gì ? <eos>


# BLEU

In [58]:
!pip install nltk



In [59]:
import torch
import math
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

In [60]:
import torch
import math
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

In [61]:
# Load the model
seq2seq.load_state_dict(torch.load('tut1-model.pt'))

# Evaluate the model
test_loss = evaluate(seq2seq, test_iterator, criterion)
print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

references = []
candidates = []

seq2seq.eval()
with torch.no_grad():
    for example in test_data.examples:
        src_tensor = source.process([example.src]).to(device)
        trg_tensor = target.process([example.trg]).to(device)

        outputs = seq2seq(src_tensor, trg_tensor, teacher_forcing_ratio=0.5)
        output_idx = outputs[1:].squeeze(1).argmax(1)
        translated_sentence = [target.vocab.itos[idx] for idx in output_idx]

        # Append reference and candidate
        references.append([example.trg])  # Note that references are expected to be a list of lists
        candidates.append(translated_sentence)

# Calculate BLEU score for the entire test set
smooth_fn = SmoothingFunction().method4  # Use smoothing to handle short sentences
bleu_score = corpus_bleu(references, candidates, smoothing_function=smooth_fn)
print(f'Corpus BLEU score: {bleu_score:.3f}')


| Test Loss: 4.523 | Test PPL:  92.085 |
Corpus BLEU score: 0.047
