In [162]:
import pandas as pd
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
import string
import nltk
from nltk.tokenize import word_tokenize
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [163]:

# Ensure proper NLTK setup
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [164]:

# Load and preprocess data
df = pd.read_csv('sentences.csv').dropna().drop_duplicates()

In [165]:
# Remove punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

df['eng'] = df['eng'].apply(remove_punctuation)
df['darija'] = df['darija'].apply(remove_punctuation)
df = df[df['eng'].str.strip().astype(bool) & df['darija'].str.strip().astype(bool)]


In [166]:
# Split data
SPLIT_SIZE = 0.2
train_data, test_data = train_test_split(df, test_size=SPLIT_SIZE, random_state=4)
train_data, val_data = train_test_split(train_data, test_size=SPLIT_SIZE, random_state=4)

In [167]:
def tokenize_and_lowercase(text):
    return word_tokenize(text.lower())

X = train_data['eng'].apply(tokenize_and_lowercase).tolist()
Y = train_data['darija'].apply(tokenize_and_lowercase).tolist()
X_val = val_data['eng'].apply(tokenize_and_lowercase).tolist()
Y_val = val_data['darija'].apply(tokenize_and_lowercase).tolist()
X_test = test_data['eng'].apply(tokenize_and_lowercase).tolist()
Y_test = test_data['darija'].apply(tokenize_and_lowercase).tolist()

In [168]:
# Build vocabularies
vocab_eng = set(word for sentence in X for word in sentence)
vocab_dari = set(word for sentence in Y for word in sentence)

word_to_ix_eng = {'<PAD>': 0, '<UNK>': 1, '<SOS>': 2, '<EOS>': 3}
word_to_ix_eng.update({word: i + len(word_to_ix_eng) for i, word in enumerate(vocab_eng)})

word_to_ix_dari = {'<PAD>': 0, '<UNK>': 1, '<SOS>': 2, '<EOS>': 3}
word_to_ix_dari.update({word: i + len(word_to_ix_dari) for i, word in enumerate(vocab_dari)})

In [169]:
# Encode sequences
def prepare_sequence(seq, to_ix):
    return torch.tensor(
        [to_ix['<SOS>']] + [to_ix.get(word, to_ix['<UNK>']) for word in seq] + [to_ix['<EOS>']],
        dtype=torch.long
    )

X_train_encoded = [prepare_sequence(seq, word_to_ix_eng) for seq in X]
Y_train_encoded = [prepare_sequence(seq, word_to_ix_dari) for seq in Y]
X_val_encoded = [prepare_sequence(seq, word_to_ix_eng) for seq in X_val]
Y_val_encoded = [prepare_sequence(seq, word_to_ix_dari) for seq in Y_val]

In [170]:
# Pad sequences
X_train_padded = pad_sequence(X_train_encoded, batch_first=True, padding_value=word_to_ix_eng['<PAD>'])
Y_train_padded = pad_sequence(Y_train_encoded, batch_first=True, padding_value=word_to_ix_dari['<PAD>'])
X_val_padded = pad_sequence(X_val_encoded, batch_first=True, padding_value=word_to_ix_eng['<PAD>'])
Y_val_padded = pad_sequence(Y_val_encoded, batch_first=True, padding_value=word_to_ix_dari['<PAD>'])


In [171]:

# Dataset and DataLoader
train_dataset = TensorDataset(X_train_padded, Y_train_padded)
val_dataset = TensorDataset(X_val_padded, Y_val_padded)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

In [172]:
# Load GloVe embeddings
def load_glove_embeddings(glove_file, vocab, embedding_dim):
    embeddings = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = torch.tensor([float(x) for x in values[1:]], dtype=torch.float)
            embeddings[word] = vector

    embedding_matrix = torch.zeros(len(vocab), embedding_dim)
    for word, idx in vocab.items():
        if word in embeddings:
            embedding_matrix[idx] = embeddings[word]
        else:
            embedding_matrix[idx] = torch.randn(embedding_dim)  # Random for unknown words

    return embedding_matrix

In [173]:

# File path to GloVe
GLOVE_PATH = "glove.6B.100d.txt"
EMBEDDING_DIM = 100  # GloVe embedding size

embedding_matrix = load_glove_embeddings(GLOVE_PATH, word_to_ix_eng, EMBEDDING_DIM)

In [174]:
class CustomLSTMCell(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(CustomLSTMCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        # Weight matrices for input, forget, and output gates
        self.Wxi = nn.Linear(input_size, hidden_size, bias=False)
        self.Whi = nn.Linear(hidden_size, hidden_size, bias=False)
        self.Wci = nn.Parameter(torch.zeros(hidden_size))  # Peephole connection

        self.Wxf = nn.Linear(input_size, hidden_size, bias=False)
        self.Whf = nn.Linear(hidden_size, hidden_size, bias=False)
        self.Wcf = nn.Parameter(torch.zeros(hidden_size))  # Peephole connection

        self.Wxo = nn.Linear(input_size, hidden_size, bias=False)
        self.Who = nn.Linear(hidden_size, hidden_size, bias=False)
        self.Wco = nn.Parameter(torch.zeros(hidden_size))  # Peephole connection

        # Weight matrices for candidate memory (g_t)
        self.Wxg = nn.Linear(input_size, hidden_size, bias=False)
        self.Whg = nn.Linear(hidden_size, hidden_size, bias=False)

        # Bias terms
        self.bi = nn.Parameter(torch.zeros(hidden_size))
        self.bf = nn.Parameter(torch.zeros(hidden_size))
        self.bo = nn.Parameter(torch.zeros(hidden_size))
        self.bc = nn.Parameter(torch.zeros(hidden_size))

    def forward(self, x, hidden):
        h_prev, c_prev = hidden

        # Compute candidate memory (g_t)
        g_t = torch.tanh(self.Whg(h_prev) + self.Wxg(x) + self.bc)

        # Compute input gate (i_t) with working memory connections
        i_t = torch.sigmoid(self.Wxi(x) + self.Whi(h_prev) + torch.tanh(self.Wci * c_prev) + self.bi)

        # Compute forget gate (f_t) with working memory connections
        f_t = torch.sigmoid(self.Wxf(x) + self.Whf(h_prev) + torch.tanh(self.Wcf * c_prev) + self.bf)

        # Compute new cell state (c_t)
        c_t = f_t * c_prev + i_t * g_t

        # Compute output gate (o_t) with working memory connections
        o_t = torch.sigmoid(self.Wxo(x) + self.Who(h_prev) + torch.tanh(self.Wco * c_t) + self.bo)

        # Compute new hidden state (h_t)
        h_t = o_t * torch.tanh(g_t)

        return h_t, c_t

In [175]:
class AdvancedLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, embedding_matrix, bidirectional=False, dropout=0.3):
        super(AdvancedLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.bidirectional = bidirectional
        self.num_directions = 2 if bidirectional else 1
        # Initialize embeddings with pretrained weights
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)

        # Custom LSTM cell for forward and backward passes
        self.lstm_cell = CustomLSTMCell(embedding_matrix.size(1), hidden_size)
        if bidirectional:
            self.lstm_cell_reverse = CustomLSTMCell(embedding_matrix.size(1), hidden_size)

        # Dropout layer
        self.dropout = nn.Dropout(dropout)

        # Output layer
        self.output_layer = nn.Linear(hidden_size * self.num_directions, output_size)

    def forward(self, x, hidden):
        batch_size, seq_len = x.size()

        # Embedding lookup
        embedded = self.embedding(x)

        # Initialize outputs
        outputs = []
        h_t, c_t = hidden

        # Forward pass through LSTM
        for t in range(seq_len):
            h_t, c_t = self.lstm_cell(embedded[:, t, :], (h_t, c_t))
            outputs.append(h_t.unsqueeze(1))  # Shape: [batch_size, 1, hidden_size]

        if self.bidirectional:
            h_t_rev, c_t_rev = hidden
            outputs_reverse = []
            for t in reversed(range(seq_len)):
                h_t_rev, c_t_rev = self.lstm_cell_reverse(embedded[:, t, :], (h_t_rev, c_t_rev))
                outputs_reverse.insert(0, h_t_rev.unsqueeze(1))  # Prepend for reverse order

            # Concatenate forward and backward outputs along the hidden dimension
            outputs = [torch.cat([fwd, bwd], dim=-1) for fwd, bwd in zip(outputs, outputs_reverse)]

        outputs = torch.cat(outputs, dim=1)  # Shape: [batch_size, seq_len, hidden_size * num_directions]
        outputs = self.dropout(outputs)
        outputs = self.output_layer(outputs)  # Shape: [batch_size, seq_len, output_size]

        return outputs, (h_t, c_t)

    def init_hidden(self, batch_size, device):
        h_0 = torch.zeros(batch_size, self.hidden_size).to(device)
        c_0 = torch.zeros(batch_size, self.hidden_size).to(device)
        return h_0, c_0



In [176]:
def train_epoch(model, data_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    for X_batch, Y_batch in data_loader:
        X_batch, Y_batch = X_batch.to(device), Y_batch.to(device)
        hidden = model.init_hidden(X_batch.size(0), device)

        optimizer.zero_grad()
        outputs, _ = model(X_batch, hidden)

        # Truncate outputs and Y_batch to the same sequence length
        seq_len = min(outputs.size(1), Y_batch.size(1))
        outputs = outputs[:, :seq_len, :]
        Y_batch = Y_batch[:, :seq_len]

        # Reshape for loss computation
        outputs = outputs.contiguous().view(-1, outputs.size(-1))
        Y_batch = Y_batch.contiguous().view(-1)

        loss = criterion(outputs, Y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(data_loader)

In [177]:
def evaluate(model, data_loader, criterion, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for X_batch, Y_batch in data_loader:
            X_batch, Y_batch = X_batch.to(device), Y_batch.to(device)
            hidden = model.init_hidden(X_batch.size(0), device)
            outputs, _ = model(X_batch, hidden)

            # Truncate outputs and Y_batch to the same sequence length
            seq_len = min(outputs.size(1), Y_batch.size(1))
            outputs = outputs[:, :seq_len, :]
            Y_batch = Y_batch[:, :seq_len]

            # Reshape for loss computation
            outputs = outputs.contiguous().view(-1, outputs.size(-1))
            Y_batch = Y_batch.contiguous().view(-1)

            loss = criterion(outputs, Y_batch)
            total_loss += loss.item()

    return total_loss / len(data_loader)

In [178]:
# Initialize model and parameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
HIDDEN_SIZE = 512  # Increased hidden size
input_size = len(word_to_ix_eng)
output_size = len(word_to_ix_dari)
DROPOUT = 0.3
BIDIRECTIONAL = True

# Pass the embedding matrix while initializing the model
model = AdvancedLSTM(
    input_size=input_size,
    hidden_size=HIDDEN_SIZE,
    output_size=output_size,
    embedding_matrix=embedding_matrix,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT
).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=word_to_ix_dari['<PAD>'])
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [179]:
# Training loop
NUM_EPOCHS = 10

for epoch in range(NUM_EPOCHS):
    train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
    val_loss = evaluate(model, val_loader, criterion, device)

    print(f"Epoch {epoch + 1}/{NUM_EPOCHS}",f"Train Loss: {train_loss:.4f}",f"Validation Loss: {val_loss:.4f}")

Epoch 1/10 Train Loss: 5.8306 Validation Loss: 5.5020
Epoch 2/10 Train Loss: 5.0382 Validation Loss: 5.3230
Epoch 3/10 Train Loss: 4.3026 Validation Loss: 5.2431
Epoch 4/10 Train Loss: 3.4075 Validation Loss: 5.3398
Epoch 5/10 Train Loss: 2.4755 Validation Loss: 5.5494
Epoch 6/10 Train Loss: 1.7538 Validation Loss: 5.7472
Epoch 7/10 Train Loss: 1.3132 Validation Loss: 5.8896
Epoch 8/10 Train Loss: 1.0300 Validation Loss: 6.1013
Epoch 9/10 Train Loss: 0.8237 Validation Loss: 6.1866
Epoch 10/10 Train Loss: 0.6650 Validation Loss: 6.2123


In [180]:
# Save the trained model
torch.save(model.state_dict(), "advanced_lstm_translation_model.pth")

In [181]:
# Testing
def test_model(model, data_loader, word_to_ix, ix_to_word, device):
    model.eval()
    predictions = []
    with torch.no_grad():
        for X_batch, _ in data_loader:
            X_batch = X_batch.to(device)
            hidden = model.init_hidden(X_batch.size(0), device)
            outputs, _ = model(X_batch, hidden)
            outputs = torch.argmax(outputs, dim=-1)

            for output_seq in outputs:
                translated_sentence = [
                    ix_to_word[ix.item()] for ix in output_seq if ix.item() not in {word_to_ix['<PAD>'], word_to_ix['<SOS>'], word_to_ix['<EOS>']}
                ]
                predictions.append(" ".join(translated_sentence))

    return predictions

In [182]:

# Create reverse mappings for decoding
ix_to_word_eng = {ix: word for word, ix in word_to_ix_eng.items()}
ix_to_word_dari = {ix: word for word, ix in word_to_ix_dari.items()}

# Prepare test DataLoader
X_test_encoded = [prepare_sequence(seq, word_to_ix_eng) for seq in X_test]
Y_test_encoded = [prepare_sequence(seq, word_to_ix_dari) for seq in Y_test]
X_test_padded = pad_sequence(X_test_encoded, batch_first=True, padding_value=word_to_ix_eng['<PAD>'])
Y_test_padded = pad_sequence(Y_test_encoded, batch_first=True, padding_value=word_to_ix_dari['<PAD>'])
test_dataset = TensorDataset(X_test_padded, Y_test_padded)
test_loader = DataLoader(test_dataset, batch_size=32)

In [184]:
# Test the model
test_predictions = test_model(model, test_loader, word_to_ix_dari, ix_to_word_dari, device)

# Print a few test results
for i in range(50):
    print(f"Original: {' '.join(X_test[i])}")
    print(f"Predicted: {test_predictions[i]}")
    print(f"Actual: {' '.join(Y_test[i])}")
    print()

Original: theres creepers everywhere
Predicted: kayn wa7d sket
Actual: kaynin chmakriya fin ma mchiti

Original: eight books
Predicted: tmnya tlktouba
Actual: tmnya t lktouba

Original: and working for you in particular
Predicted: o chi 3la
Actual: olkhdma 3ndak 3la wjah lkhosos

Original: unless we ask them to bring some with them
Predicted: mnghir nakhdo tlab chi chi
Actual: ma3ada ila guelna lihom yjibo chwia m3ahom

Original: he has
Predicted: 3endo
Actual: aando

Original: is there an emergency button
Predicted: wach kayn chi ra7a l2itisal
Actual: wach kayn boton dyal tawari2

Original: sorry i havent asked how your day was
Predicted: sma7 liya wach lik
Actual: sma7 liya masawltkch kidaz nhark

Original: i dont want
Predicted: mabghitch
Actual: mabghitch

Original: im worried about i seem to worry about everything
Predicted: ana 7it fhadchi kayban 3la hadchi kolchi
Actual: ana khayf mn ban liya ana khayf 3la kolchi

Original: its stupid to be lost when we have a good map
Predicted