In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import math
import random
import torch.nn as nn
import numpy as np
import pandas as pd
from torch import optim
from tqdm.notebook import tqdm
import torch.autograd as Variable
from sklearn.model_selection import train_test_split

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## **1. Data Processing**

In [3]:
# Sample data
src_sents = ["i love you", "how are you", "good night", "thank you", "i am fine"]
tgt_sents = ["tôi yêu bạn", "bạn khỏe không", "chúc ngủ ngon", "cảm ơn bạn", "tôi khỏe"]

**Create vocabulaies**

In [4]:
src_words = set()
trg_words = set()

for sent in src_sents:
    src_words.update(sent.split()) # update function allows us to add many items in the same thing

for sent in tgt_sents:
    trg_words.update(sent.split())

In [5]:
src_words

{'am', 'are', 'fine', 'good', 'how', 'i', 'love', 'night', 'thank', 'you'}

In [6]:
trg_words

{'bạn', 'chúc', 'cảm', 'không', 'khỏe', 'ngon', 'ngủ', 'tôi', 'yêu', 'ơn'}

Dictionary

In [7]:
src_vocab = {"<sos>":0, "<eos>": 1, "<pad>":2, "<unk>":3}

for i, word in enumerate(src_words, start= 4):
    src_vocab[word] = i
src_vocab

{'<sos>': 0,
 '<eos>': 1,
 '<pad>': 2,
 '<unk>': 3,
 'fine': 4,
 'are': 5,
 'night': 6,
 'good': 7,
 'am': 8,
 'you': 9,
 'i': 10,
 'how': 11,
 'love': 12,
 'thank': 13}

In [8]:
trg_vocab = {"<sos>":0, "<eos>": 1, "<pad>":2, "<unk>": 3}
for i, word in enumerate(trg_words, start = 4):
    trg_vocab[word] = i
trg_vocab

{'<sos>': 0,
 '<eos>': 1,
 '<pad>': 2,
 '<unk>': 3,
 'bạn': 4,
 'tôi': 5,
 'ơn': 6,
 'yêu': 7,
 'cảm': 8,
 'ngủ': 9,
 'chúc': 10,
 'không': 11,
 'ngon': 12,
 'khỏe': 13}

In [9]:
def create_dictionary(src_sents, trg_sents):
    src_words = set()
    trg_words = set()

    # create vocab
    for sent in src_sents:
        src_words.update(sent.split()) # update function allows us to add many items in the same thing

    for sent in trg_sents:
        trg_words.update(sent.split())

    #
    src_vocab = {"<sos>":0, "<eos>": 1, "<pad>":2, "<unk>":3}
    for i, word in enumerate(src_words, start= 4):
        src_vocab[word] = i


    trg_vocab = {"<sos>":0, "<eos>": 1, "<pad>":2, "<unk>": 3}
    for i, word in enumerate(trg_words, start = 4):
        trg_vocab[word] = i

    return  src_vocab, trg_vocab

In [10]:
class Read_Dataset(Dataset):
    def __init__(self, src_sents, tgt_sents, src_vocab, trg_vocab, max_leng = 50):
        super().__init__()

        self.src = src_sents
        self.tgt = tgt_sents
        self.src_vocab = src_vocab
        self.trg_vocab = trg_vocab
        self.max_leng = max_leng

    def __len__(self):
        return len(self.src)

    def __getitem__(self, index):

        src_sentence = self.src[index]
        trg_sentence = self.tgt[index]

        # tokenization
        src_indides = [self.src_vocab['<sos>']] + [self.src_vocab.get(word.lower(), self.src_vocab['<unk>']) for word in src_sentence.split()]  + [self. src_vocab['<eos>']]
        trg_indides = [self.trg_vocab['<sos>']] + [self.trg_vocab.get(word.lower(), self.trg_vocab['<unk>']) for word in trg_sentence.split()]  + [self.trg_vocab['<eos>']]

        # padding
        src_indides = src_indides[:self.max_leng] + [self.src_vocab['<pad>']] * (self.max_leng - len(src_indides))
        trg_indides = trg_indides[:self.max_leng] + [self.trg_vocab['<pad>']] * (self.max_leng - len(trg_indides))

        return torch.LongTensor(src_indides), torch.LongTensor(trg_indides)


In [11]:
data = Read_Dataset(src_sents, tgt_sents, src_vocab, trg_vocab, 5)

data.__getitem__(0)

(tensor([ 0, 10, 12,  9,  1]), tensor([0, 5, 7, 4, 1]))

In [12]:
traindata = DataLoader(data, batch_size = 2, shuffle = True)

## 2. **Build Transformers From Scratch**

**Embedding Layer**

In [13]:
class Embedding_layer(nn.Module):
    def __init__(self, vocab_len, d_model = 512):
        super().__init__()

        self.model = d_model
        self.emb = nn.Embedding(vocab_len, d_model)

    def forward(self, x):
        return self.emb(x) * math.sqrt(self.model)


In [14]:
em = Embedding_layer(len(src_vocab), d_model= 3)
s,t  = data.__getitem__(0)
em(s)

tensor([[-0.1440,  2.1720,  0.1891],
        [-0.4419, -2.4272,  0.6007],
        [ 0.1082, -0.5154, -0.8250],
        [ 1.9928,  0.1610, -0.3965],
        [-0.8749,  1.8202,  1.1426]], grad_fn=<MulBackward0>)

**Possition layer**

$a^x = e^{x*ln(a)}$

$ w_k = 10000^{\frac{2*k}{d}} =  e  $

$ w_k = 10000^{\frac{2*k}{d}} =  e ^ {\frac{2*k}{d} * ln(10000)}  $

In [15]:
class Possition_layer(nn.Module):
    def __init__(self, max_leng = 200, d_model = 512, dropout = 0.1):
        super().__init__()

        self.dropout = nn.Dropout(dropout)

        pe = torch.ones(max_leng, d_model)
        possition = torch.arange(0, max_leng, dtype=float).unsqueeze(1)

        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        pe[:,::2] = torch.sin(possition * div_term)
        pe[:,1::2] = torch.cos(possition * div_term)

        pe = pe.unsqueeze(0) # add bach_size dimention
        self.register_buffer('pe', pe)


    def forward(self, x): # x embedded
        x = x + self.pe[:,:x.size(1),:]  # chièu 1 là batchsize

        return self.dropout(x)

In [16]:
x = iter(traindata)
d,y = next(x)
em = Embedding_layer(len(src_vocab), d_model= 4)
e = em(d)
p = Possition_layer(40, 4, 0.1)
x = em(d)
p(x)

tensor([[[-2.9813,  0.7836,  2.0076, -1.9121],
         [ 2.8081,  0.0000,  2.7557, -0.0000],
         [ 0.5863, -1.9362,  0.4612, -1.3980],
         [ 0.3048,  0.5989,  0.6166,  0.0000],
         [ 0.0000,  0.1610, -1.4853,  3.6390]],

        [[-2.9813,  0.7836,  2.0076, -1.9121],
         [ 0.9150,  0.0000, -0.3539, -0.6157],
         [ 0.0000,  1.9236,  1.9641,  0.3816],
         [ 0.0000,  0.5989,  0.6166,  2.2479],
         [ 0.3136,  0.1610, -1.4853,  3.6390]]], grad_fn=<MulBackward0>)

**Self Attention**

In [17]:
def attention(q,k,v, mask = None, dropout = None):

    # q,k,v shape [batch_size, seq_leng, d_model]
    d_model = q.size(-1)
    score = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_model)

    if mask is not None:
        score = score.masked_fill(mask ==0, 1e-10)

    sort_max = F.softmax(score, -1)

    if dropout is not None:
        sort_max = dropout(sort_max)

    return torch.matmul(sort_max,v), sort_max

**Multihead Attention**

In [18]:
class Multihead_attention(nn.Module):
    def __init__(self, head = 8, d_model = 256, dropout = 0.1):
        super().__init__()
        assert d_model % head == 0
        self.head = head
        self.d_k = d_model // head # chia d_model thành nhiều đầu

        self.q_weight = nn.Linear(d_model, d_model)
        self.k_weight = nn.Linear(d_model, d_model)
        self.v_weight = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)

        self.out = nn.Linear(d_model, d_model)

    def forward(self, q,k,v,mask = None):
        batch_size = q.size(0)

        # -1 represents sequence length
        q = self.q_weight(q).view(batch_size, -1,self.head, self.d_k).transpose(1,2) # đổi thành ma trận các head ma trận
        k = self.k_weight(k).view(batch_size, -1,self.head, self.d_k).transpose(1,2) # đổi thành ma trận các head ma trận
        v = self.v_weight(v).view(batch_size, -1,self.head, self.d_k).transpose(1,2) # đổi thành ma trận các head ma trận

        if mask is not None:
            # mask: [batch_size, 1, 1, seq_len] -> [batch_size, h, seq_len, seq_len]
            mask = mask.unsqueeze(1)  # Add head dimension
            # This ensures it can be broadcasted over attention scores

        score, attn = attention(q,k,v,mask, self.dropout)

        concat = score.transpose(1,2).contiguous().view(batch_size, -1, self.head * self.d_k)

        return self.out(concat)

**Norm Layers**

In [19]:
class Norm(nn.Module):
    def __init__(self, d_model, eps=1e-6):
        super().__init__()
        self.a = nn.Parameter(torch.ones(d_model))
        self.b = nn.Parameter(torch.zeros(d_model))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a * (x - mean) / (std + self.eps) + self.b

**FeedWord Layer**

In [20]:
class FeedForward(nn.Module):
    """ Trong kiến trúc của chúng ta có tầng linear
    """
    def __init__(self, d_model, d_ff=2048, dropout = 0.1):
        super().__init__()

        # We set d_ff as a default to 2048
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        x = self.dropout(F.relu(self.linear_1(x)))
        x = self.linear_2(x)
        return x

**Encoder layer**

In [21]:
class Encoder_layer(nn.Module):
    def __init__(self, head = 8, d_model = 512, dropout = 0.1 ):
        super().__init__()
        self.attn = Multihead_attention(head , d_model,dropout)

        self.norm1 = Norm(d_model)
        self.norm2 = Norm(d_model)

        self.ffn = FeedForward(d_model,dropout=dropout)

        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self,x, mask):
        x2 = self.norm1(x)
        x = x + self.dropout1(self.attn(x2, x2, x2, mask))
        x2 = self.norm2(x)
        x = x + self.dropout2(self.ffn(x2))
        return x

**Decoder Layer**

In [22]:
class Decoder_Layer(nn.Module):
    def __init__(self,head , d_model, dropout=0.1):
        super().__init__()

        # 3 norm layers
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)

        # 3 dropout layers
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)

        # 2 multi-head attention blocks
        self.attn_1 = Multihead_attention(head, d_model, dropout=dropout)
        self.attn_2 = Multihead_attention(head, d_model, dropout=dropout)

        # Feedforward network
        self.ffn = FeedForward(d_model, dropout=dropout)

    def forward(self, x, encoder_output, src_mask, trg_mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.attn_2(x2, encoder_output, encoder_output, src_mask))
        x2 = self.norm_3(x)
        x = x + self.dropout_3(self.ffn(x2))
        return x

**Encoder**

In [23]:
import copy

def get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

class Encoder(nn.Module):
    def __init__(self, vocab_len = 100000, d_model = 521, N = 8,head = 8, dropout = 0.1 ):
        super().__init__()

        self.emb = Embedding_layer(vocab_len,d_model)
        self.ps = Possition_layer(200,  d_model,  dropout)
        self.encoder_layers = get_clones(Encoder_layer(head=head, d_model=d_model, dropout=dropout), N)

        self.n = N
        self.norm = Norm(d_model)

    def forward(self,x, mask):

        x = self.emb(x)
        x = self.ps(x)

        for i in range(self.n):
            x = self.encoder_layers[i](x, mask)

        return self.norm(x)


**Decoder**

In [24]:
class Decoder(nn.Module):
    def __init__(self, vocab_len = 10000, d_model = 256, N = 8,head = 8, dropout = 0.1 ):
        super().__init__()

        self.emb = Embedding_layer(vocab_len,d_model)
        self.ps = Possition_layer(200,  d_model,  dropout)
        self.decoder_layers = get_clones(Decoder_Layer(head, d_model, dropout), N)
        self.n = N
        self.norm = Norm(d_model)

    def forward(self,encoder_output,x, src_mask, trg_mask):

        x = self.emb(x)
        x = self.ps(x)

        for i in range(self.n):
            x = self.decoder_layers[i](x, encoder_output, src_mask, trg_mask)


        return self.norm(x)


**Tranformer Model**

In [25]:
# Transformer Model
class Transformer(nn.Module):
    def __init__(self, src_vocab, trg_vocab, d_model, N, heads, dropout):
        super().__init__()
        self.encoder = Encoder(src_vocab, d_model, N, heads, dropout)
        self.decoder = Decoder(trg_vocab, d_model, N, heads, dropout)
        self.out = nn.Linear(d_model, trg_vocab)

    def forward(self, src, trg, src_mask, trg_mask):
        e_outputs = self.encoder(src, src_mask)
        # Correct the order of arguments for the decoder
        d_output = self.decoder(e_outputs, trg, src_mask, trg_mask)
        output = self.out(d_output)
        return output

In [26]:
# Mask functions
def nopeak_mask(size, device):
    np_mask = np.triu(np.ones((1, size, size)), k=1).astype('uint8')
    # Remove the Variable() call
    np_mask = (torch.from_numpy(np_mask) == 0).to(device)
    return np_mask

def create_masks(src, trg, src_pad, trg_pad, device):
    src_mask = (src != src_pad).unsqueeze(-2)

    if trg is not None:
        trg_mask = (trg != trg_pad).unsqueeze(-2)
        size = trg.size(1)
        np_mask = nopeak_mask(size, device)
        trg_mask = trg_mask & np_mask
    else:
        trg_mask = None

    return src_mask, trg_mask


**Model Validation**

In [27]:
# path = "data/translation_dataset.csv"
path = "/content/drive/MyDrive/Dataset_of_Colab/translation_dataset.csv"


df = pd.read_csv(path)

src_vocab, trg_vocab = create_dictionary(df["English"].values, df["Vietnamese"].values)
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = Read_Dataset(train_df["English"].values, train_df["Vietnamese"].values, src_vocab, trg_vocab, max_leng=200)
val_dataset = Read_Dataset(val_df["English"].values, val_df["Vietnamese"].values, src_vocab, trg_vocab, max_leng=200)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=True)

In [28]:
len(trg_vocab)

37106

In [29]:
# Initialize model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Transformer(
    len(src_vocab),
    len(trg_vocab),
    256,
    4,
    8,
    0.1,
).to(device)

optimizer = optim.Adam(model.parameters(), lr = 0.001,  betas=(0.9, 0.98))


criterion = nn.CrossEntropyLoss(ignore_index = trg_vocab["<pad>"]) # trg_vocab["<pad>"] equals 2

In [30]:
def train_model(model, train_loader, val_loader,optimizer, criterion, device, num_epochs=10, batch_size=32):

    model = model.to(device)
    best_val_loss = float('inf')

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0

        for src_batch, tgt_batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
            src_batch, tgt_batch = src_batch.long().to(device), tgt_batch.long().to(device)

            print(src_batch.dtype, tgt_batch.dtype)


            trg_input = tgt_batch[:, :-1]
            trg_output = tgt_batch[:, 1:]

            src_mask, trg_mask = create_masks(src_batch, trg_input, 2, 2 , device)

            output = model(src_batch, trg_input, src_mask, trg_mask)

            output_dim = output.shape[-1]
            output = output.contiguous().view(-1, output_dim)
            trg_output = trg_output.contiguous().view(-1)

            loss = criterion(output, trg_output)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            avg_train_loss = train_loss / len(train_loader)

        model.eval()
        val_loss = 0

        with torch.no_grad():
            for src_batch, tgt_batch in tqdm(val_loader, desc=f"Validating Epoch {epoch+1}"):
                src_batch, tgt_batch = src_batch.long().to(device), tgt_batch.long().to(device)

                trg_input = tgt_batch[:, :-1]
                trg_output = tgt_batch[:, 1:]

                src_mask, trg_mask = create_masks(src_batch, trg_input, 2, 2 , device)

                output = model(src_batch, trg_input, src_mask, trg_mask)

                output_dim = output.shape[-1]
                output = output.contiguous().view(-1, output_dim)
                trg_output = trg_output.contiguous().view(-1)

                loss = criterion(output, trg_output)
                val_loss += loss.item()

        avg_val_loss = val_loss / len(val_loader)

        print(f"Epoch {epoch+1}: Train Loss = {avg_train_loss:.4f}, Val Loss = {avg_val_loss:.4f}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), "best_transformer_model.pt")
            print("Saved best model.")

In [None]:
train_model(model, train_loader, val_loader, optimizer, criterion, device)

Training Epoch 1:   0%|          | 0/3537 [00:00<?, ?it/s]

torch.int64 torch.int64
