In [10]:
import torch
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import torch.nn.utils.rnn as rnn_utils
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# Hyperparmeters

In [2]:
learning_rate = 5e-3
batch_size = 10
test_size = 0.01 #(1% of the data, around 700 samples)
seed = 42

# Dataset

In [3]:
class TransliterationDataset(Dataset):
    def __init__(self, DATA_HUB='atlasia/ATAM'):
        # Load the dataset
        self.data = load_dataset(DATA_HUB)['train'].to_pandas().values.tolist()
        # Create a set of all unique characters in the source and target languages
        self.arabizi_chars = set(''.join([d[0] for d in self.data]))
        self.arabic_chars = set(''.join([d[1] for d in self.data]))
        # Create a dictionary mapping each character to a unique index
        self.char2idx_ary = {char: idx for idx, char in enumerate(self.arabizi_chars)}
        self.char2idx_ar = {char: idx for idx, char in enumerate(self.arabic_chars)}
        # Calculate the size of the vocabulary
        self.vocab_size_src = len(self.char2idx_ary)
        self.vocab_size_tgt = len(self.char2idx_ar)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        darija, darija_ar = self.data[idx]
        input_seq = [self.char2idx_ary[char] for char in darija]
        target_seq = [self.char2idx_ar[char] for char in darija_ar]
        input = torch.LongTensor(input_seq)
        target = torch.LongTensor(target_seq)
        return input, target

def collate_function(batch):
    inputs, targets = zip(*batch)
    # Pad sequences to have the same length within each batch
    padded_inputs = rnn_utils.pad_sequence(inputs, batch_first=True, padding_value=0)
    padded_targets = rnn_utils.pad_sequence(targets, batch_first=True, padding_value=0)
    return padded_inputs, padded_targets


In [4]:
dataset = TransliterationDataset()
train_data, val_data = train_test_split(dataset, test_size=test_size, random_state=seed)

In [5]:
print(f'The training dataset has {len(train_data)} samples.')
print(f'The validation dataset has {len(val_data)} samples.')
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_function)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=True, collate_fn=collate_function)

The training dataset has 66514 samples.
The validation dataset has 672 samples.


# Model

In [6]:
class TransliterationModel(nn.Module):
    def __init__(self, vocab_size_src, vocab_size_tgt, d_model=128, nhead=2, num_encoder_layers=2, num_decoder_layers=2):
        super(TransliterationModel, self).__init__()
        self.embedding_src = nn.Embedding(vocab_size_src, d_model)
        self.embedding_tgt = nn.Embedding(vocab_size_tgt, d_model)
        self.transformer = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers)
        self.fc = nn.Linear(d_model, vocab_size_tgt)

    def forward(self, src, tgt):
        src = self.embedding_src(src)
        tgt = self.embedding_tgt(tgt)
        output = self.transformer(src, tgt)
        output = self.fc(output)
        return output

In [7]:
model = TransliterationModel(vocab_size_src=dataset.vocab_size_src, vocab_size_tgt=dataset.vocab_size_tgt)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)



In [11]:
def train(model, train_loader, criterion, optimizer, num_epochs=10):
    model.train()
    for epoch in tqdm(range(num_epochs)):
        running_loss = 0.0
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            print(f'inputs: {inputs.shape}')
            print(f'targets: {targets.shape}')
            print(f'targets[:, :-1]: {targets[:, :-1].shape}')
            outputs = model(inputs, targets[:, :-1])  # Exclude the last token from targets as input to the decoder
            # Reshape outputs and targets to (batch_size * seq_len, vocab_size_tgt) for loss calculation
            outputs = outputs.view(-1, dataset.vocab_size_tgt)
            targets = targets[:, 1:].contiguous().view(-1)  # Exclude the first token from targets for loss calculation
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * inputs.size(0)
        epoch_loss = running_loss / len(train_loader.dataset)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")
    torch.save(model.state_dict(), 'transliteration_transformer.pth')

In [12]:
# Start training
train(model, train_loader, criterion, optimizer)

  0%|          | 0/10 [00:00<?, ?it/s]

inputs: torch.Size([10, 9])
targets: torch.Size([10, 8])
targets[:, :-1]: torch.Size([10, 7])





RuntimeError: the batch number of src and tgt must be equal