In [1]:
import re
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F 
import numpy as np
import pandas as pd 
import os 
import math
import torchtext.data
import torchtext.datasets
import torchtext.vocab
from config import opt
from collections import Counter
from torch.utils.data import DataLoader

device = torch.device('cuda')

In [63]:
def remove_tone_line(utf8_str):
    intab_l = "ạảãàáâậầấẩẫăắằặẳẵóòọõỏôộổỗồốơờớợởỡéèẻẹẽêếềệểễúùụủũưựữửừứíìịỉĩýỳỷỵỹđ"
    intab_u = "ẠẢÃÀÁÂẬẦẤẨẪĂẮẰẶẲẴÓÒỌÕỎÔỘỔỖỒỐƠỜỚỢỞỠÉÈẺẸẼÊẾỀỆỂỄÚÙỤỦŨƯỰỮỬỪỨÍÌỊỈĨÝỲỶỴỸĐ"
    intab = list(intab_l+intab_u)

    outtab_l = "a"*17 + "o"*17 + "e"*11 + "u"*11 + "i"*5 + "y"*5 + "d"
    outtab_u = "A"*17 + "O"*17 + "E"*11 + "U"*11 + "I"*5 + "Y"*5 + "D"
    outtab = outtab_l + outtab_u
    # Khởi tạo regex tìm kiếm các vị trí nguyên âm có dấu 'ạ|ả|ã|...'
    r = re.compile("|".join(intab))

    # Dictionary có key-value là từ có dấu-từ không dấu. VD: {'â' : 'a'}
    replaces_dict = dict(zip(intab, outtab))
    # Thay thế các từ có dấu xuất hiện trong tìm kiếm của regex bằng từ không dấu tương ứng
    non_dia_str = r.sub(lambda m: replaces_dict[m.group(0)], utf8_str)
    return non_dia_str

class tokenize(object):
    def __init__(self, param : str) -> None:
        self.param = param

    def tokenizer(self, sentence):
        if self.param == "with_accents":
            tokens = re.findall(r'\w+|[^\w\s]', sentence, re.UNICODE)
            return tokens
        
        if self.param ==  "without_accents":
            sentence_ipt = remove_tone_line(sentence)
            tokens = re.findall(r'\w+|[^\w\s]', sentence_ipt, re.UNICODE)
            return tokens

specials = ['<unk>', '<pad>', '<sos>', '<eos>']

def load_dataset(config):
    print('Loading dataset...')
    tokenize_opt = tokenize(config["opt"])
    tokenize_ipt = tokenize(config["ipt"])
    train_dataset_ipt = []
    train_dataset_opt = []
    val_dataset_ipt = []
    val_dataset_opt = []
    test_dataset_ipt = []
    test_dataset_opt = []
    counter_opt = Counter()
    counter_ipt = Counter()
    counter_opt.update(specials)
    counter_ipt.update(specials)
    with open(config['filename'], 'r', encoding='utf-8') as f: 
        for i in tqdm(range(config['max_len_load'])):
            line = f.readline()
            [_, origin_seq] = line.split('\t')
            line_opt = tokenize_opt.tokenizer(origin_seq)
            line_ipt = tokenize_ipt.tokenizer(origin_seq)
            line_opt = line_opt[:(config['seq_len']-2)]
            line_ipt = line_ipt[:(config['seq_len']-2)]
            counter_opt.update(line_opt)
            counter_ipt.update(line_ipt)
            if i < config['train_size']:           
                train_dataset_opt.append(line_opt)
                train_dataset_ipt.append(line_ipt)
            elif i < config['train_size'] + config['val_size']:
                val_dataset_opt.append(line_opt)
                val_dataset_ipt.append(line_ipt)
            else:
                test_dataset_opt.append(line_opt)
                test_dataset_ipt.append(line_ipt)

    

    vocab_opt = torchtext.vocab.Vocab(counter_opt, min_freq=1)
    vocab_ipt = torchtext.vocab.Vocab(counter_ipt, min_freq=1)
    return train_dataset_ipt, train_dataset_opt, val_dataset_ipt, val_dataset_opt, test_dataset_ipt, test_dataset_opt, tokenize_ipt, tokenize_opt, vocab_ipt, vocab_opt


train_dataset_ipt, train_dataset_opt, val_dataset_ipt, val_dataset_opt, test_dataset_ipt, test_dataset_opt, tokenize_ipt, tokenize_opt, vocab_ipt, vocab_opt = load_dataset(opt)




Loading dataset...


100%|██████████| 10000/10000 [00:01<00:00, 9235.88it/s]


In [14]:
print(vocab_ipt.stoi['<pad>'])
print(vocab_opt.stoi['<pad>'])

1
1


In [15]:
print(len(vocab_ipt.stoi))
print(len(vocab_opt.stoi))

13849
17872


In [16]:
print(train_dataset_ipt[2])
print(train_dataset_opt[2])

['Internet', 'Society', 'hay', 'ISOC', 'la', 'mot', 'to', 'chuc', 'quoc', 'te', 'hoat', 'dong', 'phi', 'loi', 'nhuan', ',', 'phi', 'chinh', 'phu', 'va', 'bao', 'gom', 'cac', 'thanh', 'vien', 'co', 'trinh', 'do', 'chuyen', 'nganh', '.', 'To', 'chuc', 'nay', 'chu', 'trong', 'den', ':', 'tieu', 'chuan', ',', 'giao', 'duc', 'va', 'cac', 'van', 'de', 've', 'chinh', 'sach', '.', 'Voi', 'tren', '145', 'to', 'chuc', 'thanh', 'vien', 'va', '65', '.', '000', 'thanh', 'vien', 'ca', 'nhan', ',', 'ISOC', 'bao', 'gom', 'nhung', 'con', 'nguoi', 'cu', 'the', 'trong', 'cong', 'dong', 'Internet', '.', 'Moi', 'chi', 'tiet', 'co', 'the', 'tim', 'thay', 'tai', 'website', 'cua', 'ISOC', '.']
['Internet', 'Society', 'hay', 'ISOC', 'là', 'một', 'tổ', 'chức', 'quốc', 'tế', 'hoạt', 'động', 'phi', 'lợi', 'nhuận', ',', 'phi', 'chính', 'phủ', 'và', 'bao', 'gồm', 'các', 'thành', 'viên', 'có', 'trình', 'độ', 'chuyên', 'ngành', '.', 'Tổ', 'chức', 'này', 'chú', 'trọng', 'đến', ':', 'tiêu', 'chuẩn', ',', 'giáo', 'dục',

In [17]:
def encode(x, vocab):
    return [vocab.stoi[s] for s in x]

def decode(x, vocab):
    return [vocab.itos[s] for s in x]


train

In [50]:
def causal_mask(size):
    '''
    mask được sử dụng cho quá trình dự đoán, mô hình không thấy được tương lai
    '''
    mask = torch.triu(torch.ones((1, size, size)), diagonal=1).type(torch.int64).to(device)
    return mask == 0

a = (torch.tensor([1, 2]) != 2).long().to(device) & causal_mask(2)

In [61]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, train_dataset_ipt, train_dataset_opt, vocab_ipt, vocab_opt, config):
        super().__init__()
        self.seq_len = config['seq_len']

        self.train_dataset_ipt = train_dataset_ipt
        self.train_dataset_opt = train_dataset_opt

        self.vocab_ipt = vocab_ipt
        self.vocab_opt = vocab_opt

        self.sos_token_ipt = torch.tensor([vocab_ipt.stoi['<sos>']], dtype=torch.int64, device=device)
        self.eos_token_ipt = torch.tensor([vocab_ipt.stoi['<eos>']], dtype=torch.int64, device=device)
        self.pad_token_ipt = torch.tensor([vocab_ipt.stoi['<pad>']], dtype=torch.int64, device=device)

        self.sos_token_opt = torch.tensor([vocab_opt.stoi['<sos>']], dtype=torch.int64, device=device)
        self.eos_token_opt = torch.tensor([vocab_opt.stoi['<eos>']], dtype=torch.int64, device=device)
        self.pad_token_opt = torch.tensor([vocab_opt.stoi['<pad>']], dtype=torch.int64, device=device)

    def __len__(self):
        return len(self.train_dataset_ipt)
    
    def __getitem__(self, index):
        ipt_tokenized = train_dataset_ipt[index]
        opt_tokenized = train_dataset_opt[index]

        enc_num_padding_tokens = self.seq_len - len(ipt_tokenized) - 2
        dec_num_padding_tokens = self.seq_len - len(opt_tokenized) - 1

        if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0:
            raise ValueError("Sentence is too long! Try to increase seq_len in config.py")
        
        encoder_input = torch.cat(
            [
                self.sos_token_ipt,
                torch.tensor(encode(ipt_tokenized, self.vocab_ipt), dtype=torch.int64, device=device),
                self.eos_token_ipt,
                torch.tensor([self.pad_token_ipt] * enc_num_padding_tokens, dtype=torch.int64, device=device)
            ],
            dim=0
        )

        decoder_input = torch.cat(
            [
                self.sos_token_opt,
                torch.tensor(encode(opt_tokenized, self.vocab_opt), dtype=torch.int64, device=device),
                torch.tensor([self.pad_token_opt] * dec_num_padding_tokens, dtype=torch.int64, device=device)
            ],
            dim=0
        )

        label = torch.cat(
            [
                torch.tensor(encode(opt_tokenized, self.vocab_opt), dtype=torch.int64, device=device),
                self.eos_token_opt,
                torch.tensor([self.pad_token_opt] * dec_num_padding_tokens, dtype=torch.int64, device=device)
            ],
            dim=0
        )
        assert encoder_input.size(0) == self.seq_len
        assert decoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len

        return {
            "encoder_input": encoder_input, # (seq_len)
            "decoder_input": decoder_input, # (seq_len)
            "encoder_mask": (encoder_input != self.pad_token_ipt).unsqueeze(0).unsqueeze(0).long().to(device), # (1, seq_len) do input (batch, seq_len, d_model)
            "decoder_mask": (decoder_input != self.pad_token_opt).unsqueeze(0).long().to(device) & causal_mask(decoder_input.size(0)), # (seq_len, seq_len)
            "label": label, # (seq_len)
            # "ipt_tokenized": ipt_tokenized, 
            # "opt_tokenized": opt_tokenized
        }

In [62]:
test1 = CustomDataset(train_dataset_ipt, train_dataset_opt, vocab_ipt, vocab_opt, opt)
print(test1.__getitem__(1)["encoder_mask"].shape)
print(test1.__getitem__(1)["decoder_mask"].shape)

torch.Size([1, 1, 200])
torch.Size([1, 200, 200])


In [53]:
from model import *

In [54]:
def get_model(config, ipt_vocab_len, opt_vocab_len):
    model = build_transformer(ipt_vocab_len, opt_vocab_len, config['seq_len'], config['seq_len'])
    return model

In [55]:
ipt_vocab_len = len(vocab_ipt.stoi)
opt_vocab_len = len(vocab_opt.stoi)

In [56]:
def get_ds(config):
    train_ds = CustomDataset(train_dataset_ipt=train_dataset_ipt,
                             train_dataset_opt=train_dataset_opt,
                             vocab_ipt=vocab_ipt,
                             vocab_opt=vocab_opt,
                             config=config)
    train_dataloader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)

    return train_dataloader

In [57]:
def train_model(config):
    train_dataloader= get_ds(config)
    model = get_model(config, ipt_vocab_len=ipt_vocab_len, opt_vocab_len=opt_vocab_len).to(device)
    print("Using device: ", device)
    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps=1e-9)
    loss_fn = nn.CrossEntropyLoss(ignore_index=vocab_opt.stoi['<pad>'], label_smoothing=0.1).to(device)

    for epoch in range(0, config['num_epochs']):
        torch.cuda.empty_cache()
        model.train()
        batch_iterator = tqdm(train_dataloader, desc=f'Processing Epoch {epoch:02d}')
        for batch in batch_iterator:

            encoder_input = batch['encoder_input'].to(device) # (b, seq_len)
            decoder_input = batch['decoder_input'].to(device) # (B, seq_len)
            encoder_mask = batch['encoder_mask'].to(device) # (B, 1, 1, seq_len)
            decoder_mask = batch['decoder_mask'].to(device) # (B, 1, seq_len, seq_len)

            # Run the tensors through the encoder, decoder and the projection layer
            encoder_output = model.encode(encoder_input, encoder_mask) # (B, seq_len, d_model)
            decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask) # (B, seq_len, d_model)
            proj_output = model.project(decoder_output) # (B, seq_len, vocab_size)

            # Compare the output with the label
            label = batch['label'].to(device) # (B, seq_len)

            # Compute the loss using a simple cross entropy
            loss = loss_fn(proj_output.view(-1, opt_vocab_len), label.view(-1))
            batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})
            # Backpropagate the loss
            loss.backward()

            # Update the weights
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)

    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, 'model.pt')

In [58]:
print(device)
if __name__ ==  '__main__':
    config = opt
    train_model(config)


cuda
Using device:  cuda


Processing Epoch 00:   0%|          | 0/2000 [00:00<?, ?it/s]


RuntimeError: The expanded size of the tensor (8) must match the existing size (4) at non-singleton dimension 1.  Target sizes: [4, 8, 200, 200].  Tensor sizes: [4, 1, 200]

In [64]:
print(train_dataset_ipt[5214])
print(train_dataset_opt[5214])

['Wikipedia', 'co', 'vai', 'du', 'an', 'lien', 'quan', ':']
['Wikipedia', 'có', 'vài', 'dự', 'án', 'liên', 'quan', ':']


In [65]:
word1 = remove_tone_line("Wikipedia có vài dự án liên quan:")
print(word1)

Wikipedia co vai du an lien quan:


In [71]:
# train_dataset_ipt, train_dataset_opt, val_dataset_ipt, val_dataset_opt, test_dataset_ipt, test_dataset_opt, tokenize_ipt, tokenize_opt, vocab_ipt, vocab_opt

from pathlib import Path
from config import opt
from model import build_transformer
import torch
import torch.nn as nn
import sys


def causal_mask(size):
    mask = torch.triu(torch.ones((1, size, size)), diagonal=1).type(torch.int)
    return mask == 0

def translate(config, sentence : str):
    seq_len = config['seq_len']

    sos_token_ipt = torch.tensor([vocab_ipt.stoi['<sos>']], dtype=torch.int64, device=device)
    eos_token_ipt = torch.tensor([vocab_ipt.stoi['<eos>']], dtype=torch.int64, device=device)
    pad_token_ipt = torch.tensor([vocab_ipt.stoi['<pad>']], dtype=torch.int64, device=device)

    sos_token_opt = torch.tensor([vocab_opt.stoi['<sos>']], dtype=torch.int64, device=device)
    eos_token_opt = torch.tensor([vocab_opt.stoi['<eos>']], dtype=torch.int64, device=device)
    pad_token_opt = torch.tensor([vocab_opt.stoi['<pad>']], dtype=torch.int64, device=device)

    model = get_model(config, ipt_vocab_len=ipt_vocab_len, opt_vocab_len=opt_vocab_len).to(device)
    state = torch.load('model.pt')
    model.load_state_dict(state['model_state_dict'])
    model.eval()
    with torch.no_grad():
        source = tokenize_ipt.tokenizer(sentence)
        enc_num_padding_tokens = seq_len - len(source) - 2
        encoder_input = torch.cat(
            [
                sos_token_ipt,
                torch.tensor(encode(source, vocab_ipt), dtype=torch.int64, device=device),
                eos_token_ipt,
                torch.tensor([pad_token_ipt] * enc_num_padding_tokens, dtype=torch.int64, device=device)
            ],
            dim=0
        ).unsqueeze(0).to(device) # --> (1, seq_len)
        encoder_mask = (encoder_input != sos_token_ipt).long().to(device)
        encoder_output = model.encode(encoder_input, encoder_mask)
        # # print(encoder_output)
        decoder_output = []
        decoder_input = torch.cat([
            sos_token_opt,
            torch.tensor(decoder_output, dtype=torch.int64, device=device),
            torch.tensor([pad_token_opt] * (seq_len - len(decoder_output) - 1), dtype=torch.int64, device=device),
        ], dim=0).unsqueeze(0).to(device)
        decoder_mask = (decoder_input != pad_token_opt).unsqueeze(0).unsqueeze(0).long().to(device)
        out = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask).to(device)
        prob = model.project(out[0][-1])
        _, next_word = torch.topk(prob, dim = 0, k=10)
        print(next_word)
        for x in next_word:
            print(f"{vocab_opt.itos[x.item()]}")
    #     while len(decoder_output) < seq_len:
    #         decoder_input = torch.cat([
    #             sos_token_opt,
    #             torch.tensor(decoder_output, dtype=torch.int64, device=device),
    #             torch.tensor([pad_token_opt] * (seq_len - len(decoder_output) - 1), dtype=torch.int64, device=device),
    #         ], dim=0).unsqueeze(0).to(device)
    #         decoder_mask = (decoder_input != pad_token_opt).long().to(device)
    #         out = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask).to(device)
    #         prob = model.project(out)
    #         prob = prob[0][-1]
    #         _, next_word = torch.topk(prob, dim=0)
    #         # print(next_word.item())
    #         # print(vocab_opt.stoi['<eos>'])
    #         # break
    #         if next_word.item() == vocab_opt.stoi['<eos>']:
    #             break

    #         decoder_output.append(next_word.item())
    #         #print(decoder_output)
    #         #print(decoder_output)
    #         # decoder_input = torch.cat([
    #         #     train_ds.tgt_sos_token,
    #         #     torch.tensor(decoder_output, dtype=torch.int64),
    #         #     torch.tensor([train_ds.tgt_pad_token] * (train_ds.seq_len - len(decoder_output) - 1), dtype=torch.int64),
    #         # ], dim=0).unsqueeze(0).to(device)
    #         # print(f"{train_ds.tokenizer.tokenizer_tgt_id_to_token(next_word.item())}", end=' ')
    #         # if next_word == train_ds.tokenizer.tokenizer_tgt_token_to_id('<eos>'):
    #         #     break

    # #print(decoder_output)
    # return " ".join(vocab_opt.itos[x] for x in decoder_output)

print(translate(opt, 'trien'))



    

tensor([10938, 10330, 12711, 13437, 14432, 13314, 17862, 13372, 11034,  7607],
       device='cuda:0')
Braj
90377
Joey
Minxin
Rèn
Mascot
Ắc
Megabus
Búp
Anouvong
None
