# Hyperparameters

In [None]:
# Step size handed to the optimizer.
learning_rate = 0.005
# Number of (input, target) pairs per mini-batch.
batch_size = 10
# Fraction of the data held out for validation
# (1% of the data, around 700 samples).
test_size = 0.01
# Random state used for the train/validation split.
seed = 42

# Dataset

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import torch.nn.utils.rnn as rnn_utils

In [None]:
class TransliterationDataset(Dataset):
    """Character-level dataset of paired strings loaded from the Hugging Face hub.

    Each row of the ``train`` split is treated as a pair: the first column is
    the source string (called ``darija`` / "arabizi" in this code) and the
    second column is the target string (``darija_ar`` / "arabic").

    Two character vocabularies are built, one per column. Index ``PAD_IDX``
    (0) is reserved for padding and never assigned to a real character, so
    zero-padded batches and a loss using ``ignore_index=0`` cannot confuse
    padding with an actual character. Characters are numbered in sorted
    order so the mapping is identical from one run to the next.

    Attributes:
        data: list of rows, each row a ``[source, target]`` pair.
        arabizi_chars / arabic_chars: sets of characters seen in each column.
        char2idx_ary / char2idx_ar: character -> index (>= 1) mappings.
        vocab_size_input / vocab_size_output: number of distinct indices,
            including the padding slot (i.e. ``len(mapping) + 1``).
    """

    # Index reserved for padding; real characters start at PAD_IDX + 1.
    PAD_IDX = 0

    def __init__(self, DATA_HUB = 'atlasia/ATAM'):
        """Download the ``train`` split of ``DATA_HUB`` and build both vocabularies."""
        self.data = load_dataset(DATA_HUB)['train'].to_pandas().values.tolist()
        self.arabizi_chars = set(''.join([d[0] for d in self.data]))
        self.arabic_chars = set(''.join([d[1] for d in self.data]))
        self.char2idx_ary = self._build_vocab(self.arabizi_chars)
        self.char2idx_ar = self._build_vocab(self.arabic_chars)
        # +1 accounts for the reserved padding index, so that an embedding or
        # output layer sized with these values covers every index in use.
        self.vocab_size_input = len(self.char2idx_ary) + 1
        self.vocab_size_output = len(self.char2idx_ar) + 1

    @staticmethod
    def _build_vocab(chars):
        """Map each distinct character to a stable index starting at 1.

        Sorting removes the dependence on set iteration order, which varies
        between interpreter runs because of string hash randomisation.
        """
        start = TransliterationDataset.PAD_IDX + 1
        return {char: idx for idx, char in enumerate(sorted(set(chars)), start=start)}

    def __len__(self):
        """Return the number of string pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``idx``-th pair as two 1-D ``torch.long`` index tensors.

        Raises:
            KeyError: if a string contains a character missing from the
                corresponding vocabulary.
        """
        darija, darija_ar = self.data[idx]
        input_seq = [self.char2idx_ary[char] for char in darija]
        target_seq = [self.char2idx_ar[char] for char in darija_ar]
        source_tensor = torch.tensor(input_seq, dtype=torch.long)
        target_tensor = torch.tensor(target_seq, dtype=torch.long)
        return source_tensor, target_tensor
    
def collate_function_old(batch):
    """Collate (input, target) tensor pairs into two zero-padded 2-D batches.

    Args:
        batch: sequence of ``(input_tensor, target_tensor)`` pairs of 1-D tensors.

    Returns:
        ``(padded_inputs, padded_targets)``, each of shape
        ``(batch_size, longest_sequence_in_that_group)`` and padded with 0.
    """
    source_seqs = [pair[0] for pair in batch]
    target_seqs = [pair[1] for pair in batch]
    # Inputs and targets are padded independently, so their widths may differ.
    return (
        rnn_utils.pad_sequence(source_seqs, batch_first=True, padding_value=0),
        rnn_utils.pad_sequence(target_seqs, batch_first=True, padding_value=0),
    )

def collate_function(batch):
    """Collate (input, target) tensor pairs; targets are returned flattened.

    Args:
        batch: sequence of ``(input_tensor, target_tensor)`` pairs of 1-D tensors.

    Returns:
        ``(padded_inputs, flat_targets)`` where ``padded_inputs`` has shape
        ``(batch_size, longest_input)`` and ``flat_targets`` is the zero-padded
        target matrix reshaped row by row into a 1-D tensor of length
        ``batch_size * longest_target``.
    """
    source_seqs = [pair[0] for pair in batch]
    target_seqs = [pair[1] for pair in batch]
    padded_sources = rnn_utils.pad_sequence(source_seqs, batch_first=True, padding_value=0)
    target_matrix = rnn_utils.pad_sequence(target_seqs, batch_first=True, padding_value=0)
    # Row-major flattening: all positions of sample 0, then sample 1, ...
    return padded_sources, target_matrix.reshape(-1)



In [None]:
# Build the dataset with its default hub id; the constructor downloads the
# 'train' split and derives the character vocabularies.
dataset = TransliterationDataset()

In [None]:
# Hold out `test_size` of the samples for validation; `seed` makes the split
# reproducible.
train_data, val_data = train_test_split(
    dataset,
    test_size=test_size,
    random_state=seed,
)
print(f'The training dataset has {len(train_data)} samples.')
print(f'The validation dataset has {len(val_data)} samples.')

In [None]:
# Training batches are reshuffled every epoch. Validation batches keep a fixed
# order: the loss is averaged per batch, so a stable batch composition keeps
# the validation loss comparable from one epoch to the next.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_function)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False, collate_fn=collate_function)

In [None]:
# Sanity check: print the tensor shapes of the first training batch only.
# The loop variables deliberately avoid the name `input`, which would shadow
# the builtin for the rest of the session.
for sample_inputs, sample_targets in train_loader:
    print(f'input: {sample_inputs.shape}')
    print(f'target: {sample_targets.shape}')
    break

# Model

In [None]:
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [None]:
class TransliterationModel(nn.Module):
    """Embedding -> Conv1d -> global max pool -> linear classifier.

    The global max pooling collapses the sequence dimension, so the model
    emits ONE vector of ``vocab_size_output`` logits per input sequence
    (shape ``(batch_size, vocab_size_output)``), not one per character.

    Args:
        vocab_size_input: number of distinct input indices (embedding rows).
        vocab_size_output: number of output classes.
    """

    def __init__(self, vocab_size_input, vocab_size_output):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size_input, 128)
        self.conv1d = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3)
        self.fc = nn.Linear(256, vocab_size_output)
        self.vocab_size_input = vocab_size_input
        self.vocab_size_output = vocab_size_output

    def forward(self, x):
        """Map index tensor ``x`` of shape (batch, seq_len) to (batch, vocab_size_output) logits."""
        x = self.embedding(x)  # (batch, seq_len, 128)
        # Conv1d expects (batch, channels, seq_len), so move channels forward.
        x = x.permute(0, 2, 1)  # (batch, 128, seq_len)
        # The unpadded convolution needs at least `kernel_size` positions.
        # Right-pad shorter batches with zeros instead of raising; batches
        # that are already long enough are left untouched.
        min_len = self.conv1d.kernel_size[0]
        if x.size(2) < min_len:
            x = nn.functional.pad(x, (0, min_len - x.size(2)))
        x = torch.relu(self.conv1d(x))  # (batch, 256, conv_len)
        x = torch.max_pool1d(x, kernel_size=x.size(2))  # Global max pooling -> (batch, 256, 1)
        x = x.squeeze(2)  # Drop the pooled length dimension -> (batch, 256)
        return self.fc(x)

In [None]:
# Show the input and output vocabulary sizes, one per line.
print(dataset.vocab_size_input, dataset.vocab_size_output, sep='\n')

In [None]:
# Size the network from the dataset's vocabularies.
model = TransliterationModel(
    vocab_size_input=dataset.vocab_size_input,
    vocab_size_output=dataset.vocab_size_output,
)
# Targets equal to 0 do not contribute to the loss; 0 is the value the collate
# functions pad with.
# NOTE(review): verify that the vocabulary never assigns index 0 to a real
# character, otherwise that character is ignored as well.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [None]:
# def train(model, train_loader, criterion, optimizer, num_epochs=10):
#     model.train()
#     for epoch in range(num_epochs):
#         running_loss = 0.0
#         for inputs, targets in train_loader:
#             optimizer.zero_grad()
#             outputs = model(inputs)
#             print(f'inputs[0]: {inputs[0]}')
#             print(f'targets[0]: {targets[0]}')
#             print(f'outputs[0]: {outputs[0]}')
#             print('--------------------------------------')
#             print(f'outputs: {outputs.shape}')
#             print(f'targets: {targets.shape}')
#             print('--------------------------------------')
#             # Convert targets to one-hot encoding
#             targets_one_hot = F.one_hot(targets, num_classes=model.vocab_size).float()
#             print(f'outputs: {outputs.shape}')
#             print(f'targets: {targets_one_hot.shape}')
#             print('--------------------------------------')
#             print(f'outputs: {outputs}')
#             print(f'targets: {targets_one_hot}')
#             print('--------------------------------------')
#             loss = criterion(outputs, targets_one_hot)
#             loss.backward()
#             optimizer.step()
#             running_loss += loss.item() * inputs.size(0)
#         epoch_loss = running_loss / len(train_loader.dataset)
#         print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")

In [None]:
def _sequence_loss(model, criterion, inputs, targets):
    """Run one batch through ``model`` and return the criterion's loss tensor.

    Logits are flattened to ``(N, num_classes)`` and targets to ``(N,)`` so
    that both 2-D and already-flattened targets are accepted.

    Raises:
        ValueError: if the number of logit rows differs from the number of
            target labels, i.e. the model does not emit one prediction per
            target position.
    """
    outputs = model(inputs)
    # The last dimension holds the class scores; everything before it is
    # treated as independent prediction positions.
    logits = outputs.reshape(-1, outputs.size(-1))
    labels = targets.reshape(-1)
    if logits.size(0) != labels.size(0):
        raise ValueError(
            f"Model produced {logits.size(0)} predictions "
            f"(output shape {tuple(outputs.shape)}) but the batch has "
            f"{labels.size(0)} target labels; the model must emit one "
            f"prediction per target position."
        )
    return criterion(logits, labels)


def train(model, train_loader, val_loader, criterion, optimizer, num_epochs=10):
    """Train ``model`` and report training / validation loss after every epoch.

    Args:
        model: module mapping a batch of inputs to class logits whose last
            dimension is the number of classes.
        train_loader: iterable of ``(inputs, targets)`` batches exposing a
            ``dataset`` attribute with a length.
        val_loader: same contract as ``train_loader``, used for evaluation.
        criterion: loss taking ``(logits, labels)``.
        optimizer: optimizer over ``model``'s parameters.
        num_epochs: number of passes over ``train_loader``.

    Raises:
        ValueError: if the model's predictions cannot be paired one-to-one
            with the target labels of a batch.
    """
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            loss = _sequence_loss(model, criterion, inputs, targets)
            loss.backward()
            optimizer.step()
            # Weight each batch by its size so the epoch figure is a
            # per-sample average even when the last batch is smaller.
            running_loss += loss.item() * inputs.size(0)

        epoch_loss = running_loss / len(train_loader.dataset)
        print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {epoch_loss:.4f}")

        # Evaluation on the validation set
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, targets in val_loader:
                loss = _sequence_loss(model, criterion, inputs, targets)
                val_loss += loss.item() * inputs.size(0)

        val_loss /= len(val_loader.dataset)
        print(f"Epoch {epoch+1}/{num_epochs}, Validation Loss: {val_loss:.4f}")

In [None]:
# Kick off the training run: ten epochs, with a validation pass after each.
train(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=10,
)