## Đặng Nguyễn Quang Huy Intern AI Engineer

## 1.Import the necessary libraries

In [643]:
import requests
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer
import torch.nn as nn
import torch.optim as optim


In [644]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 2. Data Preprocessing

> Get data origin from url

In [645]:
url = f"https://datasets-server.huggingface.co/rows?dataset=Helsinki-NLP%2Fopus_books&config=en-hu&split=train&offset=0&length=100"

response = requests.get(url)
if response.status_code == 200:
    data = response.json()
    rows = data.get('rows', [])
    df = pd.DataFrame(rows)
    df.to_csv('/content/drive/MyDrive/Machine_Translation/data/data_origin.csv', index=False, encoding='utf-8')
else:
    print(f"Failed to retrieve data: {response.status_code}")

> Read data

In [646]:
# Đọc dữ liệu từ CSV
df = pd.read_csv('/content/drive/MyDrive/Machine_Translation/data/data_origin.csv')

> Extract pairs of translated sentences

In [647]:
pairs = []
for row in df.itertuples(index=False):
    translation = eval(row.row)['translation']
    pairs.append((translation['en'], translation['hu']))

> Create tokenizer for English and Hungarian

In [648]:
tokenizer_en = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer_hu = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

>  Create dataset and dataloader

In [649]:
class TranslationDataset(Dataset):
    def __init__(self, pairs, tokenizer_en, tokenizer_hu, max_length=128):
        self.pairs = pairs
        self.tokenizer_en = tokenizer_en
        self.tokenizer_hu = tokenizer_hu
        self.max_length = max_length

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        en_text, hu_text = self.pairs[idx]
        en_encoding = self.tokenizer_en.encode_plus(
            en_text, max_length=self.max_length, padding='max_length', truncation=True, return_tensors="pt"
        )
        hu_encoding = self.tokenizer_hu.encode_plus(
            hu_text, max_length=self.max_length, padding='max_length', truncation=True, return_tensors="pt"
        )

        en_input_ids = en_encoding['input_ids'].squeeze(0)
        hu_input_ids = hu_encoding['input_ids'].squeeze(0)

        min_len = min(en_input_ids.shape[0], hu_input_ids.shape[0])

        return en_input_ids[:min_len], hu_input_ids[:min_len]

> Create training set, test set, validation set

In [650]:
dataset = TranslationDataset(pairs, tokenizer_en, tokenizer_hu)
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, val_size, test_size])
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=8)

## 3.Build a Transformer model for machine translation

> Transformer model

In [651]:
class TransformerModel(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=512, nhead=8, num_encoder_layers=4, num_decoder_layers=4 ,dim_feedforward=2460, dropout=0.1):
        super(TransformerModel, self).__init__()
        self.transformer = nn.Transformer(
            d_model=d_model, nhead=nhead, num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers, dim_feedforward=dim_feedforward, dropout=dropout
        )
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)

    def forward(self, src, tgt):
        src = src.permute(1, 0)
        tgt = tgt.permute(1, 0)
        src_emb = self.src_embedding(src)
        tgt_emb = self.tgt_embedding(tgt)
        output = self.transformer(src_emb, tgt_emb)
        output = output.permute(1, 0, 2)
        return self.fc_out(output)

In [652]:
src_vocab_size = len(tokenizer_en)
tgt_vocab_size = len(tokenizer_hu)
model = TransformerModel(src_vocab_size, tgt_vocab_size)



> Model training

In [653]:
# Move model and data to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer_hu.pad_token_id)
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Training function for each epoch
def train_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    epoch_loss = 0
    for src, tgt in dataloader:
        src, tgt = src.to(device), tgt.to(device)

        # Ensure the batch sizes of src and tgt are equal
        if src.size(0) != tgt.size(0):
            raise RuntimeError("The batch sizes of src and tgt must be equal")

        optimizer.zero_grad()
        output = model(src, tgt[:, :-1])
        output = output.view(-1, output.shape[-1])
        tgt = tgt[:, 1:].contiguous().view(-1)
        loss = criterion(output, tgt)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(dataloader)

# Evaluation function for the model on the validation set
def evaluate(model, dataloader, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for src, tgt in dataloader:
            src, tgt = src.to(device), tgt.to(device)
            output = model(src, tgt[:, :-1])
            output = output.view(-1, output.shape[-1])
            tgt = tgt[:, 1:].contiguous().view(-1)
            loss = criterion(output, tgt)
            epoch_loss += loss.item()
    return epoch_loss / len(dataloader)

# Train the model
n_epochs = 100
best_val_loss = float('inf')
patience = 10
counter = 0
for epoch in range(n_epochs):
    train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
    val_loss = evaluate(model, val_loader, criterion, device)

    print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

    # Check if validation loss has improved
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        counter = 0
    else:
        counter += 1
    # If validation loss hasn't improved for 'patience' epochs, stop training
    if counter >= patience:
        print(f'Validation loss has not improved for {patience} epochs. Early stopping...')
        break


Epoch 1, Train Loss: 11.0233, Val Loss: 10.5503
Epoch 2, Train Loss: 10.1820, Val Loss: 10.0860
Epoch 3, Train Loss: 9.5491, Val Loss: 9.5903
Epoch 4, Train Loss: 8.9040, Val Loss: 9.1214
Epoch 5, Train Loss: 8.2605, Val Loss: 8.6662
Epoch 6, Train Loss: 7.6670, Val Loss: 8.2644
Epoch 7, Train Loss: 7.1191, Val Loss: 7.9902
Epoch 8, Train Loss: 6.6529, Val Loss: 7.7611
Epoch 9, Train Loss: 6.2399, Val Loss: 7.6087
Epoch 10, Train Loss: 5.8528, Val Loss: 7.4931
Epoch 11, Train Loss: 5.4618, Val Loss: 7.4475
Epoch 12, Train Loss: 5.1071, Val Loss: 7.3756
Epoch 13, Train Loss: 4.7454, Val Loss: 7.3089
Epoch 14, Train Loss: 4.3842, Val Loss: 7.2524
Epoch 15, Train Loss: 4.0703, Val Loss: 7.2116
Epoch 16, Train Loss: 3.7903, Val Loss: 7.2143
Epoch 17, Train Loss: 3.4860, Val Loss: 7.1792
Epoch 18, Train Loss: 3.2463, Val Loss: 7.1782
Epoch 19, Train Loss: 2.9803, Val Loss: 7.1619
Epoch 20, Train Loss: 2.7732, Val Loss: 7.1612
Epoch 21, Train Loss: 2.5353, Val Loss: 7.1719
Epoch 22, Train Lo

> Evaluate on test set

In [654]:
total_test_loss = 0
model.eval()
with torch.no_grad():
    for src, tgt in test_loader:
        src, tgt = src.to(device), tgt.to(device)
        output = model(src, tgt[:, :-1])
        output = output.view(-1, output.shape[-1])
        tgt = tgt[:, 1:].contiguous().view(-1)
        loss = criterion(output, tgt)
        total_test_loss += loss.item()

avg_test_loss = total_test_loss / len(test_loader)
print(f'Average Test Loss: {avg_test_loss:.4f}')

Average Test Loss: 6.6402
