# Thực hành Transformers

Trong bài này, ta sẽ thực hành cài đặt Transformer

### 1. Cài đặt và import thư viện

In [1]:
!which python3

/usr/bin/python3


In [2]:
!pip uninstall -y torch
!pip install torch==2.3.0

Found existing installation: torch 2.8.0+cu126
Uninstalling torch-2.8.0+cu126:
  Successfully uninstalled torch-2.8.0+cu126
Collecting torch==2.3.0
  Downloading torch-2.3.0-cp312-cp312-manylinux1_x86_64.whl.metadata (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.3.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.3.0)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.3.0)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.3.0)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch==2.3.0)
  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.

In [3]:
!pip3 install spacy dill
!pip3 install torchtext==0.18.0
!pip3 install pandas

Collecting torchtext==0.18.0
  Downloading torchtext-0.18.0-cp312-cp312-manylinux1_x86_64.whl.metadata (7.9 kB)
Downloading torchtext-0.18.0-cp312-cp312-manylinux1_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchtext
Successfully installed torchtext-0.18.0


In [4]:
# !python3 -m spacy download en && python3 -m spacy download fr

In [5]:
!python3 -m spacy download en_core_web_sm
!python3 -m spacy download fr_core_news_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m41.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting fr-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m73.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages

In [6]:
import torch.nn as nn
import torch
import torchtext
import copy
import math
import torch.nn.functional as F

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### 2. Cài đặt từng module của Transformer

In [8]:
class Embedder(nn.Module):
    def __init__(self, vocab_size, dim):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, dim)

    def forward(self, x):
        return self.embed(x)

**Position Embedding Class**:

In [9]:
# Positional encoding
class PositionalEncoder(nn.Module):
    def __init__(self, dim, max_seq_len=300):
        super().__init__()
        self.dim = dim

        # create a constant 'pe' matrix with values dependant on
        # pos and i
        pe = torch.zeros(max_seq_len, dim)

        ########################
        position = torch.arange(0, max_seq_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, dim, 2).float() * -(math.log(10000.0) / dim))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        ########################

        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # make embeddings relatively larger
        x = x *math.sqrt(self.dim)
        # add constant to embedding
        seq_len = x.size(1)
        # x = x + Variable(self.pe[:, :seq_len], requires_grad=False).to(device)
        x = x + self.pe[:, :seq_len].to(device)
        return x

**Multi Head Attention**: We first start with implementing attention function

Attention of $q$

In [10]:
def attention(q, k, v, d_k, mask=None, dropout=None):
    scores = torch.matmul(q, k.transpose(-2, -1)) /  math.sqrt(d_k)
    if mask is not None:
        if mask.dim() == 3:
            mask = mask.unsqueeze(1)
        scores = scores.masked_fill(mask == 0, -1e9)

    scores = F.softmax(scores, dim=-1)

    if dropout is not None:
        scores = dropout(scores)

    output = torch.matmul(scores, v)
    return output

In [11]:
# Multi-headed attention
class MultiHeadAttention(nn.Module):
    def __init__(self, heads, dim, dropout=0.1):
        super().__init__()
        self.dim = dim
        self.dim_head = dim//heads
        self.h = heads
        self.q_linear = nn.Linear(dim, dim)
        self.k_linear = nn.Linear(dim, dim)
        self.v_linear = nn.Linear(dim, dim)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(dim, dim)

    def forward(self, q, k, v, mask=None):
        bs = q.size(0)
        # perform linear operation and split into h heads
        k = self.k_linear(k).view(bs, -1, self.h, self.dim_head)
        q = self.q_linear(q).view(bs, -1, self.h, self.dim_head)
        v = self.v_linear(v).view(bs, -1, self.h, self.dim_head)
        # transpose to get dimensions bs * h * sl * dim
        k = k.transpose(1, 2)
        q = q.transpose(1, 2)
        v = v.transpose(1, 2)
        # calculate attention using the function we will define next
        # scores = attention(q, k, v, self.dim, mask, self.dropout)
        scores = attention(q, k, v, self.dim_head, mask, self.dropout)
        # concatenate heads and put through final linear layer
        concat = scores.transpose(1,2).contiguous().view(bs, -1, self.dim)
        output = self.out(concat)
        return output

In [12]:
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=2048, dropout = 0.1):
        super().__init__()
        # We set d_ff as a default to 2048
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)
    def forward(self, x):
        x = self.dropout(F.relu(self.linear_1(x)))
        x = self.linear_2(x)
        return x

In [13]:
class Norm(nn.Module):
    def __init__(self, d_model, eps = 1e-6):
        super().__init__()

        self.size = d_model
        # create two learnable parameters to calibrate normalisation
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))
        self.eps = eps
    def forward(self, x):
        norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) \
        / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias
        return norm

In [14]:
# build an encoder layer with one multi-head attention layer and one
# feed-forward layer
class EncoderLayer(nn.Module):
    def __init__(self, d_model, heads, dropout = 0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.attn = MultiHeadAttention(heads, d_model)
        self.ff = FeedForward(d_model)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        ########################
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn(x2, x2, x2, mask))

        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.ff(x2))
        return x
        ########################


In [15]:
# build a decoder layer with two multi-head attention layers and
# one feed-forward layer
class DecoderLayer(nn.Module):
    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)

        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)

        self.attn_1 = MultiHeadAttention(heads, d_model)
        self.attn_2 = MultiHeadAttention(heads, d_model)
        self.ff = FeedForward(d_model).cuda()

    def forward(self, x, e_outputs, src_mask, trg_mask):
        ########################
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask))

        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, src_mask))

        x2 = self.norm_3(x)
        x = x + self.dropout_3(self.ff(x2))
        return x
        ########################

def get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])

In [16]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, d_model, N, heads):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model)
        self.layers = get_clones(EncoderLayer(d_model, heads), N)
        self.norm = Norm(d_model)

    def forward(self, src, mask):
        ########################
        x = self.embed(src)
        x = self.pe(x)
        for i in range(self.N):
            x = self.layers[i](x, mask)
        return self.norm(x)
        ########################

class Decoder(nn.Module):
    def __init__(self, vocab_size, d_model, N, heads):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model)
        self.layers = get_clones(DecoderLayer(d_model, heads), N)
        self.norm = Norm(d_model)
    def forward(self, trg, e_outputs, src_mask, trg_mask):
        ########################
        x = self.embed(trg)
        x = self.pe(x)
        for i in range(self.N):
            x = self.layers[i](x, e_outputs, src_mask, trg_mask)
        return self.norm(x)
        ########################

In [17]:
class Transformer(nn.Module):
    def __init__(self, src_vocab, trg_vocab, d_model, N, heads):
        super().__init__()
        self.encoder = Encoder(src_vocab, d_model, N, heads)
        self.decoder = Decoder(trg_vocab, d_model, N, heads)
        self.out = nn.Linear(d_model, trg_vocab)
    def forward(self, src, trg, src_mask, trg_mask):
        e_outputs = self.encoder(src, src_mask)
        d_output = self.decoder(trg, e_outputs, src_mask, trg_mask)
        output = self.out(d_output)
        return output# we don't perform softmax on the output as this will be handled
# automatically by our loss function

### 3. Chuẩn bị và tiền xử lý dữ liệu

In [18]:
import spacy
import re


# Tokenize

class tokenize(object):

    def __init__(self, lang):
        self.nlp = spacy.load(lang)

    def tokenizer(self, sentence):
        sentence = re.sub(
        r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " ", str(sentence))
        sentence = re.sub(r"[ ]+", " ", sentence)
        sentence = re.sub(r"\!+", "!", sentence)
        sentence = re.sub(r"\,+", ",", sentence)
        sentence = re.sub(r"\?+", "?", sentence)
        sentence = sentence.lower()
        return [tok.text for tok in self.nlp.tokenizer(sentence) if tok.text != " "]

In [19]:
# # Creating batch
# from torchtext.legacy import data
# import numpy as np
# from torch.autograd import Variable


# def nopeak_mask(size, opt):
#     np_mask = np.triu(np.ones((1, size, size)),
#     k=1).astype('uint8')
#     np_mask =  Variable(torch.from_numpy(np_mask) == 0)
#     np_mask = np_mask.to(device)
#     return np_mask

# def create_masks(src, trg, opt):

#     src_mask = (src != opt.src_pad).unsqueeze(-2)

#     if trg is not None:
#         trg.to(device)
#         trg_mask = (trg != opt.trg_pad).unsqueeze(-2).to(device)
#         size = trg.size(1) # get seq_len for matrix
#         np_mask = nopeak_mask(size, opt)
#         trg_mask = trg_mask & np_mask

#     else:
#         trg_mask = None
#     return src_mask, trg_mask

# # patch on Torchtext's batching process that makes it more efficient
# # from http://nlp.seas.harvard.edu/2018/04/03/attention.html#position-wise-feed-forward-networks

# class MyIterator(data.Iterator):
#     def create_batches(self):
#         if self.train:
#             def pool(d, random_shuffler):
#                 for p in data.batch(d, self.batch_size * 100):
#                     p_batch = data.batch(
#                         sorted(p, key=self.sort_key),
#                         self.batch_size, self.batch_size_fn)
#                     for b in random_shuffler(list(p_batch)):
#                         yield b
#             self.batches = pool(self.data(), self.random_shuffler)

#         else:
#             self.batches = []
#             for b in data.batch(self.data(), self.batch_size,
#                                           self.batch_size_fn):
#                 self.batches.append(sorted(b, key=self.sort_key))

# global max_src_in_batch, max_tgt_in_batch

# def batch_size_fn(new, count, sofar):
#     "Keep augmenting batch and calculate total number of tokens + padding."
#     global max_src_in_batch, max_tgt_in_batch
#     if count == 1:
#         max_src_in_batch = 0
#         max_tgt_in_batch = 0
#     max_src_in_batch = max(max_src_in_batch,  len(new.src))
#     max_tgt_in_batch = max(max_tgt_in_batch,  len(new.trg) + 2)
#     src_elements = count * max_src_in_batch
#     tgt_elements = count * max_tgt_in_batch
#     return max(src_elements, tgt_elements)

In [20]:
# === Modernized cell 1: masks + utilities ===
import torch
import numpy as np
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import build_vocab_from_iterator
import spacy

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def nopeak_mask(size, device=device):
    """
    Returns a mask for preventing attention to future tokens.
    Shape expected by downstream: (1, size, size) (broadcastable)
    """
    # upper triangular with 1s above diagonal
    np_mask = np.triu(np.ones((1, size, size)), k=1).astype('uint8')
    mask = torch.from_numpy(np_mask) == 0  # True where allowed
    return mask.to(device)

def create_masks(src, trg, src_pad, trg_pad, device=device):
    """
    src: LongTensor shape (batch, src_len)
    trg: LongTensor shape (batch, trg_len)  OR None for inference
    Returns: src_mask (batch,1,src_len), trg_mask (batch,1,trg_len, trg_len) or None
    """
    # src_mask: (batch, 1, src_len)
    src_mask = (src != src_pad).unsqueeze(1).to(device)

    if trg is not None:
        # trg_mask: (batch, 1, trg_len)
        trg_mask = (trg != trg_pad).unsqueeze(1).to(device)  # (batch,1,trg_len)
        seq_len = trg.size(1)
        np_mask = nopeak_mask(seq_len, device)  # (1, seq_len, seq_len)
        # combine padding mask and subsequent mask
        # Need to broadcast trg_mask to (batch, seq_len) -> (batch, 1, seq_len) & np_mask (1, seq_len, seq_len)
        # final shape: (batch, seq_len, seq_len) after broadcasting; some models expect (batch, 1, seq_len, seq_len) - adapt as needed
        trg_mask = trg_mask & np_mask  # broadcasting: (batch,1,seq_len) & (1,seq_len,seq_len) -> (batch, seq_len, seq_len) because of alignment
        # For compatibility with many implementations, return shape (batch, 1, seq_len, seq_len)
        trg_mask = trg_mask.unsqueeze(1)  # (batch,1,seq_len,seq_len)
    else:
        trg_mask = None

    return src_mask, trg_mask

# === Batch-sizing helper retained (optional) ===
# If you want dynamic batching by tokens (like original), you can keep this function and use it when creating batches.
global max_src_in_batch, max_tgt_in_batch
def batch_size_fn(new, count, sofar):
    "Keep augmenting batch and calculate total number of tokens + padding."
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        max_src_in_batch = 0
        max_tgt_in_batch = 0
    max_src_in_batch = max(max_src_in_batch,  len(new['src']))
    max_tgt_in_batch = max(max_tgt_in_batch,  len(new['trg']) + 2)
    src_elements = count * max_src_in_batch
    tgt_elements = count * max_tgt_in_batch
    return max(src_elements, tgt_elements)



In [21]:
# import pandas as pd
# import torchtext
# from torchtext.legacy import data
# import os
# import dill as pickle

# def read_data(opt):
#     if opt.src_data is not None:
#         try:
#             opt.src_data = open(opt.src_data).read().strip().split('\n')
#         except:
#             print("error: '" + opt.src_data + "' file not found")
#             quit()

#     if opt.trg_data is not None:
#         try:
#             opt.trg_data = open(opt.trg_data).read().strip().split('\n')
#         except:
#             print("error: '" + opt.trg_data + "' file not found")
#             quit()

# def create_fields(opt):
#     spacy_langs = ['en', 'fr', 'de', 'es', 'pt', 'it', 'nl']
#     src_lang = opt.src_lang[0:2]
#     trg_lang = opt.trg_lang[0:2]
#     if src_lang not in spacy_langs:
#         print('invalid src language: ' + opt.src_lang + 'supported languages : ' + spacy_langs)
#     if trg_lang not in spacy_langs:
#         print('invalid trg language: ' + opt.trg_lang + 'supported languages : ' + spacy_langs)

#     print("loading spacy tokenizers...")

#     t_src = tokenize(opt.src_lang)
#     t_trg = tokenize(opt.trg_lang)
#     TRG = data.Field(lower=True, tokenize=t_trg.tokenizer, init_token='<sos>', eos_token='<eos>')
#     SRC = data.Field(lower=True, tokenize=t_src.tokenizer)

#     return(SRC, TRG)

# def create_dataset(opt, SRC, TRG):

#     print("creating dataset and iterator... ")

#     raw_data = {'src' : [line for line in opt.src_data], 'trg': [line for line in opt.trg_data]}
#     df = pd.DataFrame(raw_data, columns=["src", "trg"])

#     mask = (df['src'].str.count(' ') < opt.max_strlen) & (df['trg'].str.count(' ') < opt.max_strlen)
#     df = df.loc[mask]

#     df.to_csv("translate_transformer_temp.csv", index=False)

#     data_fields = [('src', SRC), ('trg', TRG)]
#     train = data.TabularDataset('./translate_transformer_temp.csv', format='csv', fields=data_fields)

#     train_iter = MyIterator(train, batch_size=opt.batchsize, device=device,
#                         repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
#                         batch_size_fn=batch_size_fn, train=True, shuffle=True)

#     os.remove('translate_transformer_temp.csv')
#     SRC.build_vocab(train)
#     TRG.build_vocab(train)
#     opt.src_pad = SRC.vocab.stoi['<pad>']
#     opt.trg_pad = TRG.vocab.stoi['<pad>']

#     opt.train_len = get_len(train_iter)

#     return train_iter

# def get_len(train):

#     for i, b in enumerate(train):
#         pass

#     return i

In [22]:
# === Modernized cell 2: dataset, vocab, iterator (DataLoader + collate_fn) ===
import os
import pandas as pd
from functools import partial

# Simple wrapper to get spacy tokenizer
def get_spacy_tokenizer(lang_code):
    # lang_code e.g., 'en_core_web_sm' or 'en'
    # Accept either 'en' or 'en_core_web_sm' in opt.src_lang
    name = lang_code if '_' in lang_code else f"{lang_code}_core_web_sm"
    try:
        nlp = spacy.load(name)
    except Exception as e:
        # user may need to install the model
        raise RuntimeError(f"Spacy model '{name}' not found. Install with: python -m spacy download {name}") from e

    def tokenize_text(text):
        return [tok.text.lower() for tok in nlp(text)]
    return tokenize_text

class TranslationDataset(Dataset):
    def __init__(self, src_lines, trg_lines, src_tok_fn, trg_tok_fn, src_vocab, trg_vocab, add_sos_eos=True):
        assert len(src_lines) == len(trg_lines)
        self.src_lines = src_lines
        self.trg_lines = trg_lines
        self.src_tok_fn = src_tok_fn
        self.trg_tok_fn = trg_tok_fn
        self.src_vocab = src_vocab
        self.trg_vocab = trg_vocab
        self.add_sos_eos = add_sos_eos

    def __len__(self):
        return len(self.src_lines)

    def __getitem__(self, idx):
        src_text = self.src_lines[idx]
        trg_text = self.trg_lines[idx]
        src_tokens = self.src_tok_fn(src_text)
        trg_tokens = self.trg_tok_fn(trg_text)
        if self.add_sos_eos:
            trg_tokens = ['<sos>'] + trg_tokens + ['<eos>']
        # numericalize lazily in collate
        return {'src': src_tokens, 'trg': trg_tokens}

def yield_tokens(lines, tokenizer):
    for line in lines:
        yield tokenizer(line)

def build_vocabs(opt, src_lines, trg_lines, src_tok_fn, trg_tok_fn, min_freq=2):
    specials = ['<pad>', '<sos>', '<eos>', '<unk>']
    src_vocab = build_vocab_from_iterator(yield_tokens(src_lines, src_tok_fn),
                                         specials=specials,
                                         special_first=True)
    trg_vocab = build_vocab_from_iterator(yield_tokens(trg_lines, trg_tok_fn),
                                         specials=specials,
                                         special_first=True)

    # set default index for unknown tokens
    src_vocab.set_default_index(src_vocab['<unk>'])
    trg_vocab.set_default_index(trg_vocab['<unk>'])
    return src_vocab, trg_vocab

def numericalize(tokens_list, vocab):
    return [vocab[token] for token in tokens_list]

def collate_fn(batch, src_vocab, trg_vocab, max_strlen=None, device=device):
    # batch is a list of {'src': [...tokens...], 'trg': [...tokens...]}
    src_batch = [torch.tensor(numericalize(x['src'], src_vocab), dtype=torch.long) for x in batch]
    trg_batch = [torch.tensor(numericalize(x['trg'], trg_vocab), dtype=torch.long) for x in batch]

    # optionally filter by max length (similar to your mask earlier)
    if max_strlen is not None:
        keep_indices = [i for i, (s, t) in enumerate(zip(src_batch, trg_batch))
                        if s.size(0) <= max_strlen and t.size(0) <= max_strlen]
        if len(keep_indices) != len(batch):
            src_batch = [src_batch[i] for i in keep_indices]
            trg_batch = [trg_batch[i] for i in keep_indices]

    # pad sequences to longest in batch (pad value is index of '<pad>')
    pad_idx_src = src_vocab['<pad>']
    pad_idx_trg = trg_vocab['<pad>']
    src_padded = pad_sequence(src_batch, batch_first=True, padding_value=pad_idx_src).to(device)  # (batch, src_len)
    trg_padded = pad_sequence(trg_batch, batch_first=True, padding_value=pad_idx_trg).to(device)  # (batch, trg_len)

    return src_padded, trg_padded

def read_data(opt):
    """Read text files into lists of lines (keeps your original API)"""
    if opt.src_data is not None:
        try:
            opt.src_data = open(opt.src_data).read().strip().split('\n')
        except Exception as e:
            raise RuntimeError(f"error: '{opt.src_data}' file not found") from e

    if opt.trg_data is not None:
        try:
            opt.trg_data = open(opt.trg_data).read().strip().split('\n')
        except Exception as e:
            raise RuntimeError(f"error: '{opt.trg_data}' file not found") from e

def create_dataset_and_dataloader(opt, device=device):
    """
    Replaces your create_fields + create_dataset workflow.
    Returns: dataloader, src_vocab, trg_vocab, pad indices, dataset length
    """
    print("Creating tokenizers...")
    src_lang = opt.src_lang #[0:2]
    trg_lang = opt.trg_lang#[0:2]
    src_tok = get_spacy_tokenizer(src_lang)
    trg_tok = get_spacy_tokenizer(trg_lang)

    # We expect opt.src_data and opt.trg_data to be lists of lines (read_data should be called first)
    src_lines = opt.src_data
    trg_lines = opt.trg_data

    print("Building vocabs...")
    src_vocab, trg_vocab = build_vocabs(opt, src_lines, trg_lines, src_tok, trg_tok)

    print("Creating dataset...")
    dataset = TranslationDataset(src_lines, trg_lines, src_tok, trg_tok, src_vocab, trg_vocab, add_sos_eos=True)

    # create DataLoader with custom collate that closes over vocabs and max_strlen
    my_collate = partial(collate_fn, src_vocab=src_vocab, trg_vocab=trg_vocab, max_strlen=getattr(opt, 'max_strlen', None))
    dataloader = DataLoader(dataset, batch_size=opt.batchsize, shuffle=True, collate_fn=my_collate)

    # pad ids
    opt.src_pad = src_vocab['<pad>']
    opt.trg_pad = trg_vocab['<pad>']

    # compute train_len like original get_len (number of batches)
    train_len = len(dataloader)

    return dataloader, src_vocab, trg_vocab, opt.src_pad, opt.trg_pad, train_len

# Example helper to iterate get_len as before (if you want compatibility)
def get_len(dataloader):
    for i, b in enumerate(dataloader):
        pass
    return i


### 4. Cài đặt giải thuật tối ưu và huấn luyện mô hình

In [23]:
# Optimizer
class CosineWithRestarts(torch.optim.lr_scheduler._LRScheduler):
    """
    Cosine annealing with restarts.
    Parameters
    ----------
    optimizer : torch.optim.Optimizer
    T_max : int
        The maximum number of iterations within the first cycle.
    eta_min : float, optional (default: 0)
        The minimum learning rate.
    last_epoch : int, optional (default: -1)
        The index of the last epoch.
    """

    def __init__(self,
                 optimizer: torch.optim.Optimizer,
                 T_max: int,
                 eta_min: float = 0.,
                 last_epoch: int = -1,
                 factor: float = 1.) -> None:
        # pylint: disable=invalid-name
        self.T_max = T_max
        self.eta_min = eta_min
        self.factor = factor
        self._last_restart: int = 0
        self._cycle_counter: int = 0
        self._cycle_factor: float = 1.
        self._updated_cycle_len: int = T_max
        self._initialized: bool = False
        super(CosineWithRestarts, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        """Get updated learning rate."""
        # HACK: We need to check if this is the first time get_lr() was called, since
        # we want to start with step = 0, but _LRScheduler calls get_lr with
        # last_epoch + 1 when initialized.
        if not self._initialized:
            self._initialized = True
            return self.base_lrs

        step = self.last_epoch + 1
        self._cycle_counter = step - self._last_restart

        lrs = [
            (
                self.eta_min + ((lr - self.eta_min) / 2) *
                (
                    np.cos(
                        np.pi *
                        ((self._cycle_counter) % self._updated_cycle_len) /
                        self._updated_cycle_len
                    ) + 1
                )
            ) for lr in self.base_lrs
        ]

        if self._cycle_counter % self._updated_cycle_len == 0:
            # Adjust the cycle length.
            self._cycle_factor *= self.factor
            self._cycle_counter = 0
            self._updated_cycle_len = int(self._cycle_factor * self.T_max)
            self._last_restart = step

        return lrs


In [24]:
!mkdir data
!wget https://raw.githubusercontent.com/SamLynnEvans/Transformer/master/data/english.txt
!mv english.txt data
!wget https://raw.githubusercontent.com/SamLynnEvans/Transformer/master/data/french.txt data/french.txt
!mv french.txt data

--2025-10-24 14:14:47--  https://raw.githubusercontent.com/SamLynnEvans/Transformer/master/data/english.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4897403 (4.7M) [text/plain]
Saving to: ‘english.txt’


2025-10-24 14:14:47 (69.0 MB/s) - ‘english.txt’ saved [4897403/4897403]

--2025-10-24 14:14:47--  https://raw.githubusercontent.com/SamLynnEvans/Transformer/master/data/french.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5938378 (5.7M) [text/plain]
Saving to: ‘french.txt’


2025-10-24 14:14:48 (97.6 MB/s) - ‘french.t

In [25]:

def get_model(opt, src_vocab, trg_vocab):

    assert opt.d_model % opt.heads == 0
    assert opt.dropout < 1

    model = Transformer(src_vocab, trg_vocab, opt.d_model, opt.n_layers, opt.heads)

    if opt.load_weights is not None:
        print("loading pretrained weights...")
        model.load_state_dict(torch.load(f'{opt.load_weights}/model_weights'))
    else:
        for p in model.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    if opt.device == 0:
        model = model.cuda()

    return model

In [26]:
""" BAI TAP VE NHA """

import time
import os

class Opt:
    pass

def train_model(model, dataloader, opt):
    ########################

    model.train()
    start = time.time()

    # Định nghĩa criterion bên trong hàm
    criterion = nn.CrossEntropyLoss(ignore_index=opt.trg_pad)

    for epoch in range(opt.epochs):
        total_loss = 0

        # 1. Lặp qua dataloader
        for i, batch in enumerate(dataloader):

            # 2. Lấy src, trg (đã là batch_first=True và trên device từ collate_fn)
            src, trg = batch

            # trg_input là <sos>...word (ví dụ: [1, 5, 7, 9])
            trg_input = trg[:, :-1]
            # trg_output là word...<eos> (ví dụ: [5, 7, 9, 2])
            trg_output = trg[:, 1:].contiguous().view(-1)

            # 3. Tạo mặt nạ (masks) với đúng tham số
            src_mask, trg_mask = create_masks(src, trg_input, opt.src_pad, opt.trg_pad, opt.device)

            opt.optimizer.zero_grad()

            preds = model(src, trg_input, src_mask, trg_mask)

            preds_flat = preds.contiguous().view(-1, preds.size(-1))

            loss = criterion(preds_flat, trg_output)
            loss.backward()

            opt.optimizer.step()

            total_loss += loss.item()

            if (i + 1) % opt.printevery == 0:
                avg_loss = total_loss / opt.printevery
                print(f"Epoch [{epoch+1}/{opt.epochs}], Step [{i+1}/{opt.train_len}], Loss: {avg_loss:.4f}, Time: {time.time() - start:.2f}s")
                total_loss = 0
                start = time.time()

        #checkpoint
        if opt.checkpoint > 0:
            print(f"--- epoch {epoch+1} finished, saving weights ---")
            if not os.path.exists('weights'):
                os.makedirs('weights')
            torch.save(model.state_dict(), f'weights/model_epoch_{epoch+1}.weights')

    print("training complete.")
    ########################


def main():
    opt = Opt()
    opt.src_data = "data/english.txt"
    opt.trg_data = "data/french.txt"
    opt.src_lang = "en_core_web_sm"
    opt.trg_lang = 'fr_core_news_sm'
    opt.epochs = 2
    opt.d_model=512
    opt.n_layers=6
    opt.heads=8
    opt.dropout=0.1
    opt.batchsize=32
    opt.printevery=100
    opt.lr=0.0001
    opt.max_strlen=80
    opt.checkpoint = 0
    opt.no_cuda = False
    opt.load_weights = None

    # opt.device = 0
    # if opt.device == 0:
    #     assert torch.cuda.is_available()

    # read_data(opt)
    # SRC, TRG = create_fields(opt)
    # opt.train = create_dataset(opt, SRC, TRG)
    # model = get_model(opt, len(SRC.vocab), len(TRG.vocab)).to(device)

    # opt.optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr, betas=(0.9, 0.98), eps=1e-9)

    # if opt.checkpoint > 0:
    #     print("model weights will be saved every %d minutes and at end of epoch to directory weights/"%(opt.checkpoint))

    # train_model(model, opt)

    device = torch.device("cuda" if torch.cuda.is_available() and not opt.no_cuda else "cpu")
    opt.device = device

    read_data(opt)

    dataloader, src_vocab, trg_vocab, opt.src_pad, opt.trg_pad, opt.train_len = \
        create_dataset_and_dataloader(opt, device=device)

    print(f"Train steps per epoch: {opt.train_len}")

    model = get_model(opt, len(src_vocab), len(trg_vocab)).to(device)

    opt.optimizer = torch.optim.Adam(
        model.parameters(),
        lr=opt.lr,
        betas=(0.9, 0.98),
        eps=1e-9
    )

    train_model(model, dataloader, opt)


    # for asking about further training use while true loop, and return
if __name__ == "__main__":
    main()

Creating tokenizers...
Building vocabs...
Creating dataset...
Train steps per epoch: 4841
Epoch [1/2], Step [100/4841], Loss: 7.0322, Time: 44.35s
Epoch [1/2], Step [200/4841], Loss: 5.2132, Time: 42.56s
Epoch [1/2], Step [300/4841], Loss: 4.7141, Time: 43.99s
Epoch [1/2], Step [400/4841], Loss: 4.4322, Time: 43.76s
Epoch [1/2], Step [500/4841], Loss: 4.2238, Time: 42.98s
Epoch [1/2], Step [600/4841], Loss: 4.0578, Time: 44.35s
Epoch [1/2], Step [700/4841], Loss: 3.9467, Time: 43.34s
Epoch [1/2], Step [800/4841], Loss: 3.7916, Time: 43.95s
Epoch [1/2], Step [900/4841], Loss: 3.6988, Time: 43.90s
Epoch [1/2], Step [1000/4841], Loss: 3.5549, Time: 43.12s
Epoch [1/2], Step [1100/4841], Loss: 3.4962, Time: 43.78s
Epoch [1/2], Step [1200/4841], Loss: 3.3923, Time: 43.26s
Epoch [1/2], Step [1300/4841], Loss: 3.3500, Time: 44.35s
Epoch [1/2], Step [1400/4841], Loss: 3.2424, Time: 43.92s
Epoch [1/2], Step [1500/4841], Loss: 3.2019, Time: 43.54s
Epoch [1/2], Step [1600/4841], Loss: 3.0981, Time