In [24]:
import pandas as pd
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
import string
import nltk
from nltk.tokenize import word_tokenize
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset


Cleaning the text

In [25]:
# Ensure proper NLTK setup
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [26]:
# Load and preprocess data
df = pd.read_csv('sentences.csv').dropna().drop_duplicates()

Train-test split

In [27]:
# Remove punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

df['eng'] = df['eng'].apply(remove_punctuation)
df['darija'] = df['darija'].apply(remove_punctuation)
df = df[df['eng'].str.strip().astype(bool) & df['darija'].str.strip().astype(bool)]



Tokenizing sentences.

In [28]:
# Split data
SPLIT_SIZE = 0.2
train_data, test_data = train_test_split(df, test_size=SPLIT_SIZE, random_state=4)
train_data, val_data = train_test_split(train_data, test_size=SPLIT_SIZE, random_state=4)


In [29]:
def tokenize_and_lowercase(text):
    return word_tokenize(text.lower())

X = train_data['eng'].apply(tokenize_and_lowercase).tolist()
Y = train_data['darija'].apply(tokenize_and_lowercase).tolist()
X_val = val_data['eng'].apply(tokenize_and_lowercase).tolist()
Y_val = val_data['darija'].apply(tokenize_and_lowercase).tolist()
X_test = test_data['eng'].apply(tokenize_and_lowercase).tolist()
Y_test = test_data['darija'].apply(tokenize_and_lowercase).tolist()


In [30]:

# Build vocabularies
vocab_eng = set(word for sentence in X for word in sentence)
vocab_dari = set(word for sentence in Y for word in sentence)

word_to_ix_eng = {'<PAD>': 0, '<UNK>': 1, '<SOS>': 2, '<EOS>': 3}
word_to_ix_eng.update({word: i + len(word_to_ix_eng) for i, word in enumerate(vocab_eng)})

word_to_ix_dari = {'<PAD>': 0, '<UNK>': 1, '<SOS>': 2, '<EOS>': 3}
word_to_ix_dari.update({word: i + len(word_to_ix_dari) for i, word in enumerate(vocab_dari)})


Building vocabularies.

In [31]:
# Encode sequences
def prepare_sequence(seq, to_ix):
    return torch.tensor(
        [to_ix['<SOS>']] + [to_ix.get(word, to_ix['<UNK>']) for word in seq] + [to_ix['<EOS>']],
        dtype=torch.long
    )

X_train_encoded = [prepare_sequence(seq, word_to_ix_eng) for seq in X]
Y_train_encoded = [prepare_sequence(seq, word_to_ix_dari) for seq in Y]
X_val_encoded = [prepare_sequence(seq, word_to_ix_eng) for seq in X_val]
Y_val_encoded = [prepare_sequence(seq, word_to_ix_dari) for seq in Y_val]


In [32]:

# Pad sequences
X_train_padded = pad_sequence(X_train_encoded, batch_first=True, padding_value=word_to_ix_eng['<PAD>'])
Y_train_padded = pad_sequence(Y_train_encoded, batch_first=True, padding_value=word_to_ix_dari['<PAD>'])
X_val_padded = pad_sequence(X_val_encoded, batch_first=True, padding_value=word_to_ix_eng['<PAD>'])
Y_val_padded = pad_sequence(Y_val_encoded, batch_first=True, padding_value=word_to_ix_dari['<PAD>'])



In [33]:
# Dataset and DataLoader
train_dataset = TensorDataset(X_train_padded, Y_train_padded)
val_dataset = TensorDataset(X_val_padded, Y_val_padded)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)


In [34]:
# Define AdvancedLSTMCell
class AdvancedLSTMCell(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(AdvancedLSTMCell, self).__init__()
        self.hidden_size = hidden_size

        self.Wxi = nn.Linear(input_size, hidden_size, bias=False)
        self.Whi = nn.Linear(hidden_size, hidden_size, bias=False)
        self.Wci = nn.Parameter(torch.zeros(hidden_size))
        self.bi = nn.Parameter(torch.zeros(hidden_size))

        self.Wxf = nn.Linear(input_size, hidden_size, bias=False)
        self.Whf = nn.Linear(hidden_size, hidden_size, bias=False)
        self.Wcf = nn.Parameter(torch.zeros(hidden_size))
        self.bf = nn.Parameter(torch.zeros(hidden_size))

        self.Wxo = nn.Linear(input_size, hidden_size, bias=False)
        self.Who = nn.Linear(hidden_size, hidden_size, bias=False)
        self.Wco = nn.Parameter(torch.zeros(hidden_size))
        self.bo = nn.Parameter(torch.zeros(hidden_size))

        self.Wxg = nn.Linear(input_size, hidden_size, bias=False)
        self.Whg = nn.Linear(hidden_size, hidden_size, bias=False)
        self.bc = nn.Parameter(torch.zeros(hidden_size))

    def forward(self, x, hidden):
        h_prev, c_prev = hidden

        i_t = torch.sigmoid(self.Wxi(x) + self.Whi(h_prev) + self.Wci * c_prev + self.bi)
        f_t = torch.sigmoid(self.Wxf(x) + self.Whf(h_prev) + self.Wcf * c_prev + self.bf)
        o_t = torch.sigmoid(self.Wxo(x) + self.Who(h_prev) + self.Wco * c_prev + self.bo)
        g_t = torch.tanh(self.Wxg(x) + self.Whg(h_prev) + self.bc)

        c_t = f_t * c_prev + i_t * g_t
        h_t = o_t * torch.tanh(c_t)

        return h_t, c_t


In [35]:
# Define AdvancedLSTM
class AdvancedLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(AdvancedLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm_cell = AdvancedLSTMCell(hidden_size, hidden_size)
        self.output_layer = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden):
        embedded = self.embedding(x)
        outputs = []
        for t in range(embedded.size(1)):
            hidden = self.lstm_cell(embedded[:, t, :], hidden)
            outputs.append(hidden[0])
        outputs = torch.stack(outputs, dim=1)
        outputs = self.output_layer(outputs)
        return outputs, hidden

    def init_hidden(self, batch_size, device):
        return (torch.zeros(batch_size, self.hidden_size).to(device),
                torch.zeros(batch_size, self.hidden_size).to(device))


In [36]:
def train_epoch(model, data_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    for X_batch, Y_batch in data_loader:
        X_batch, Y_batch = X_batch.to(device), Y_batch.to(device)
        hidden = model.init_hidden(X_batch.size(0), device)

        optimizer.zero_grad()
        outputs, _ = model(X_batch, hidden)

        # Truncate sequence lengths
        min_seq_len = min(outputs.size(1), Y_batch.size(1))
        outputs = outputs[:, :min_seq_len, :]
        Y_batch = Y_batch[:, :min_seq_len]

        # Reshape for loss computation
        outputs = outputs.contiguous().view(-1, outputs.size(-1))  # Fixed here
        Y_batch = Y_batch.contiguous().view(-1)  # Fixed here

        loss = criterion(outputs, Y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(data_loader)


def evaluate(model, data_loader, criterion, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for X_batch, Y_batch in data_loader:
            X_batch, Y_batch = X_batch.to(device), Y_batch.to(device)
            hidden = model.init_hidden(X_batch.size(0), device)
            outputs, _ = model(X_batch, hidden)

            # Truncate sequence lengths
            min_seq_len = min(outputs.size(1), Y_batch.size(1))
            outputs = outputs[:, :min_seq_len, :]
            Y_batch = Y_batch[:, :min_seq_len]

            # Reshape for loss computation
            outputs = outputs.contiguous().view(-1, outputs.size(-1))  # Fixed here
            Y_batch = Y_batch.contiguous().view(-1)  # Fixed here

            loss = criterion(outputs, Y_batch)
            total_loss += loss.item()

    return total_loss / len(data_loader)



Prepare sequences

In [37]:
# Initialize model and parameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_size = len(word_to_ix_eng)
hidden_size = 256
output_size = len(word_to_ix_dari)

model = AdvancedLSTM(input_size, hidden_size, output_size).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=word_to_ix_dari['<PAD>'])
optimizer = optim.Adam(model.parameters(), lr=0.001)



In [38]:
# Training loop
NUM_EPOCHS = 20

for epoch in range(NUM_EPOCHS):
    train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
    val_loss = evaluate(model, val_loader, criterion, device)

    print(f"Epoch {epoch + 1}/{NUM_EPOCHS}")
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Validation Loss: {val_loss:.4f}")


Epoch 1/20
Train Loss: 6.0958
Validation Loss: 5.7569
Epoch 2/20
Train Loss: 5.1583
Validation Loss: 5.6772
Epoch 3/20
Train Loss: 4.5618
Validation Loss: 5.7046
Epoch 4/20
Train Loss: 3.9501
Validation Loss: 5.7485
Epoch 5/20
Train Loss: 3.3855
Validation Loss: 5.8525
Epoch 6/20
Train Loss: 2.8875
Validation Loss: 5.9320
Epoch 7/20
Train Loss: 2.4726
Validation Loss: 6.0326
Epoch 8/20
Train Loss: 2.1565
Validation Loss: 6.1812
Epoch 9/20
Train Loss: 1.9156
Validation Loss: 6.2726
Epoch 10/20
Train Loss: 1.7295
Validation Loss: 6.3668
Epoch 11/20
Train Loss: 1.5773
Validation Loss: 6.4389
Epoch 12/20
Train Loss: 1.4599
Validation Loss: 6.5376
Epoch 13/20
Train Loss: 1.3626
Validation Loss: 6.6071
Epoch 14/20
Train Loss: 1.2867
Validation Loss: 6.6889
Epoch 15/20
Train Loss: 1.2230
Validation Loss: 6.7718
Epoch 16/20
Train Loss: 1.1745
Validation Loss: 6.8536
Epoch 17/20
Train Loss: 1.1369
Validation Loss: 6.8577
Epoch 18/20
Train Loss: 1.1088
Validation Loss: 6.9712
Epoch 19/20
Train L

In [39]:
# Save the trained model
torch.save(model.state_dict(), "advanced_lstm_translation_model.pth")


In [40]:
# Testing
def test_model(model, data_loader, word_to_ix, ix_to_word, device):
    model.eval()
    predictions = []
    with torch.no_grad():
        for X_batch, _ in data_loader:
            X_batch = X_batch.to(device)
            hidden = model.init_hidden(X_batch.size(0), device)
            outputs, _ = model(X_batch, hidden)
            outputs = torch.argmax(outputs, dim=-1)

            for output_seq in outputs:
                translated_sentence = [
                    ix_to_word[ix.item()] for ix in output_seq if ix.item() not in {word_to_ix['<PAD>'], word_to_ix['<SOS>'], word_to_ix['<EOS>']}
                ]
                predictions.append(" ".join(translated_sentence))

    return predictions

In [41]:

# Create reverse mappings for decoding
ix_to_word_eng = {ix: word for word, ix in word_to_ix_eng.items()}
ix_to_word_dari = {ix: word for word, ix in word_to_ix_dari.items()}

# Prepare test DataLoader
X_test_encoded = [prepare_sequence(seq, word_to_ix_eng) for seq in X_test]
Y_test_encoded = [prepare_sequence(seq, word_to_ix_dari) for seq in Y_test]
X_test_padded = pad_sequence(X_test_encoded, batch_first=True, padding_value=word_to_ix_eng['<PAD>'])
Y_test_padded = pad_sequence(Y_test_encoded, batch_first=True, padding_value=word_to_ix_dari['<PAD>'])
test_dataset = TensorDataset(X_test_padded, Y_test_padded)
test_loader = DataLoader(test_dataset, batch_size=32)

In [42]:
# Test the model
test_predictions = test_model(model, test_loader, word_to_ix_dari, ix_to_word_dari, device)

# Print a few test results
for i in range(5):
    print(f"Original: {' '.join(X_test[i])}")
    print(f"Predicted: {test_predictions[i]}")
    print(f"Actual: {' '.join(Y_test[i])}")
    print()

Original: theres creepers everywhere
Predicted: kayn kayn lblays bzaf
Actual: kaynin chmakriya fin ma mchiti

Original: eight books
Predicted: tmnya tlktouba
Actual: tmnya t lktouba

Original: and working for you in particular
Predicted: o kant ou dakchi
Actual: olkhdma 3ndak 3la wjah lkhosos

Original: unless we ask them to bring some with them
Predicted: ma3ada imkan bzzaf o s3ib o o dyal dyal
Actual: ma3ada ila guelna lihom yjibo chwia m3ahom

Original: he has
Predicted: 3ndo ch3er
Actual: aando



In [43]:
# Test the model
test_predictions = test_model(model, test_loader, word_to_ix_dari, ix_to_word_dari, device)

# Print a few test results
for i in range(10):
    print(f"Original: {' '.join(X_test[i])}")
    print(f"Predicted: {test_predictions[i]}")
    print(f"Actual: {' '.join(Y_test[i])}")
    print()

Original: theres creepers everywhere
Predicted: kayn kayn lblays bzaf
Actual: kaynin chmakriya fin ma mchiti

Original: eight books
Predicted: tmnya tlktouba
Actual: tmnya t lktouba

Original: and working for you in particular
Predicted: o kant ou dakchi
Actual: olkhdma 3ndak 3la wjah lkhosos

Original: unless we ask them to bring some with them
Predicted: ma3ada imkan bzzaf o s3ib o o dyal dyal
Actual: ma3ada ila guelna lihom yjibo chwia m3ahom

Original: he has
Predicted: 3ndo ch3er
Actual: aando

Original: is there an emergency button
Predicted: wach kayn chi chi chi
Actual: wach kayn boton dyal tawari2

Original: sorry i havent asked how your day was
Predicted: sma7 liya hadi sawalt ach dyalk
Actual: sma7 liya masawltkch kidaz nhark

Original: i dont want
Predicted: ana
Actual: mabghitch

Original: im worried about i seem to worry about everything
Predicted: ana khayfa 3la ana
Actual: ana khayf mn ban liya ana khayf 3la kolchi

Original: its stupid to be lost when we have a good ma