In [116]:
import pandas as pd
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
import string


Cleaning the text

In [117]:
# Read and preprocess dataset
df = pd.read_csv('/kaggle/input/aaaaaa/sentences.csv')
df = df.dropna()
df = df.drop_duplicates()

In [118]:
# Remove punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

df['eng'] = df['eng'].apply(remove_punctuation)
df['darija'] = df['darija'].apply(remove_punctuation)

Train-test split

In [119]:

SPLIT_SIZE = 0.2
train_data, test_data = train_test_split(df, test_size=SPLIT_SIZE, random_state=4)
train_data, val_data = train_test_split(train_data, test_size=SPLIT_SIZE, random_state=4)


Tokenizing sentences.

In [120]:

X = train_data['eng'].str.lower().str.split().tolist()
Y = train_data['darija'].str.split().tolist()

In [121]:
# Afficher les 5 premiers cas
print("Premiers 5 éléments de X :")
for i, sentence in enumerate(X[:5]):
    print(f"{i+1}: {sentence}")

print("\nPremiers 5 éléments de Y :")
for i, sentence in enumerate(Y[:5]):
    print(f"{i+1}: {sentence}")

Premiers 5 éléments de X :
1: ['it', 'is', 'not', 'a', 'good', 'idea', 'for', 'me']
2: ['indeed']
3: ['but', 'you', 'also', 'like', 'a', 'few', 'glasses', 'of', 'good', 'french', 'wine', 'for', 'drinks']
4: ['the', 'matter', 'is', 'clear']
5: ['i', 'wish', 'i', 'could', 'transform', 'my', 'nightdreams', 'into', 'real', 'life', 'adventures']

Premiers 5 éléments de Y :
1: ['machi', 'blan', 'lia']
2: ['fi3lan']
3: ['walakin', '3ziz', '3lik', 'chi', 'kwiysat', 'dial', 'l', 'vin', 'dial', 'fransa', 'tchrbhom']
4: ['l9aDiyya', 'bayna']
5: ['makrehtch', 'koun', 'reje3t', '2a7lami', 'dial', 'lik', 'l79i9a']


In [122]:
X_val = val_data['eng'].str.lower().str.split().tolist()
Y_val = val_data['darija'].str.split().tolist()

X_test = test_data['eng'].str.lower().str.split().tolist()
Y_test = test_data['darija'].str.split().tolist()

Building vocabularies.

In [123]:

# Combine all datasets (train, val, test) to build a unified vocabulary
X_combined = train_data['eng'].str.lower().str.split().tolist() + val_data['eng'].str.lower().str.split().tolist()
Y_combined = train_data['darija'].str.split().tolist() + val_data['darija'].str.split().tolist()

# Build vocabularies for English and Darija
vocab_eng = set(word for sentence in X_combined for word in sentence)
vocab_dari = set(word for sentence in Y_combined for word in sentence)

# Create word-to-index mappings
word_to_ix_eng = {word: i for i, word in enumerate(vocab_eng, start=1)}
word_to_ix_eng['<PAD>'] = 0
word_to_ix_eng['<UNK>'] = len(word_to_ix_eng)

word_to_ix_dari = {word: i for i, word in enumerate(vocab_dari, start=1)}
word_to_ix_dari['<PAD>'] = 0
word_to_ix_dari['<UNK>'] = len(word_to_ix_dari)

# Verify sizes of vocabularies
print(f"English vocabulary size: {len(word_to_ix_eng)}")
print(f"Darija vocabulary size: {len(word_to_ix_dari)}")

English vocabulary size: 4977
Darija vocabulary size: 13362


In [124]:
# Afficher les 5 premiers éléments de word_to_ix_eng
print("5 premiers éléments de word_to_ix_eng :")
for i, (word, index) in enumerate(word_to_ix_eng.items()):
    print(f"{word}: {index}")
    if i == 4:  # Stop après 5 éléments
        break

# Afficher les 5 premiers éléments de word_to_ix_dari
print("\n5 premiers éléments de word_to_ix_dari :")
for i, (word, index) in enumerate(word_to_ix_dari.items()):
    print(f"{word}: {index}")
    if i == 4:  # Stop après 5 éléments
        break


5 premiers éléments de word_to_ix_eng :
duck: 1
nowhere: 2
opportunity: 3
era: 4
famous: 5

5 premiers éléments de word_to_ix_dari :
3lihoum: 1
sder: 2
kaykhrj: 3
kanakol: 4
kant3elem: 5


In [125]:
# Add SOS and EOS tokens
word_to_ix_eng['<SOS>'] = len(word_to_ix_eng)
word_to_ix_eng['<EOS>'] = len(word_to_ix_eng)

word_to_ix_dari['<SOS>'] = len(word_to_ix_dari)
word_to_ix_dari['<EOS>'] = len(word_to_ix_dari)

# Verify updated vocabulary sizes
print(f"Updated English vocabulary size: {len(word_to_ix_eng)}")
print(f"Updated Darija vocabulary size: {len(word_to_ix_dari)}")


Updated English vocabulary size: 4979
Updated Darija vocabulary size: 13364


In [126]:
# Function to add SOS and EOS tokens
def add_sos_eos(sentence, sos_token='<SOS>', eos_token='<EOS>'):
    return [sos_token] + sentence + [eos_token]

# Add SOS and EOS tokens to train, val, and test datasets
X = [add_sos_eos(sentence) for sentence in X]
Y = [add_sos_eos(sentence) for sentence in Y]

X_val = [add_sos_eos(sentence) for sentence in X_val]
Y_val = [add_sos_eos(sentence) for sentence in Y_val]

X_test = [add_sos_eos(sentence) for sentence in X_test]
Y_test = [add_sos_eos(sentence) for sentence in Y_test]


In [110]:
X_combined = [add_sos_eos(sentence) for sentence in X_combined]
Y_combined = [add_sos_eos(sentence) for sentence in Y_combined]


In [127]:
print(f"Example processed sentence (English): {X[0]}")
print(f"Example processed sentence (Darija): {Y[0]}")


Example processed sentence (English): ['<SOS>', 'it', 'is', 'not', 'a', 'good', 'idea', 'for', 'me', '<EOS>']
Example processed sentence (Darija): ['<SOS>', 'machi', 'blan', 'lia', '<EOS>']


Prepare sequences

In [128]:
# Prepare sequences with SOS and EOS
def prepare_sequence(seq, to_ix):
    return torch.tensor(
        [to_ix["<SOS>"]] + [to_ix.get(word, to_ix["<UNK>"]) for word in seq] + [to_ix["<EOS>"]],
        dtype=torch.long
    )

# Encode and pad sequences for training
X_train_encoded = [prepare_sequence(seq, word_to_ix_eng) for seq in X]
Y_train_encoded = [prepare_sequence(seq, word_to_ix_dari) for seq in Y]

X_train_padded = pad_sequence(X_train_encoded, batch_first=True, padding_value=word_to_ix_eng["<PAD>"])
Y_train_padded = pad_sequence(Y_train_encoded, batch_first=True, padding_value=word_to_ix_dari["<PAD>"])

# Encode and pad sequences for validation
X_val_encoded = [prepare_sequence(seq, word_to_ix_eng) for seq in X_val]
Y_val_encoded = [prepare_sequence(seq, word_to_ix_dari) for seq in Y_val]

X_val_padded = pad_sequence(X_val_encoded, batch_first=True, padding_value=word_to_ix_eng["<PAD>"])
Y_val_padded = pad_sequence(Y_val_encoded, batch_first=True, padding_value=word_to_ix_dari["<PAD>"])

# Encode and pad sequences for testing
X_test_encoded = [prepare_sequence(seq, word_to_ix_eng) for seq in X_test]
Y_test_encoded = [prepare_sequence(seq, word_to_ix_dari) for seq in Y_test]

X_test_padded = pad_sequence(X_test_encoded, batch_first=True, padding_value=word_to_ix_eng["<PAD>"])
Y_test_padded = pad_sequence(Y_test_encoded, batch_first=True, padding_value=word_to_ix_dari["<PAD>"])

# Print shapes for sanity check
print(f"X_train_padded: {X_train_padded.shape}, Y_train_padded: {Y_train_padded.shape}")
print(f"X_val_padded: {X_val_padded.shape}, Y_val_padded: {Y_val_padded.shape}")
print(f"X_test_padded: {X_test_padded.shape}, Y_test_padded: {Y_test_padded.shape}")


X_train_padded: torch.Size([8108, 42]), Y_train_padded: torch.Size([8108, 33])
X_val_padded: torch.Size([2027, 35]), Y_val_padded: torch.Size([2027, 27])
X_test_padded: torch.Size([2534, 35]), Y_test_padded: torch.Size([2534, 28])


In [129]:
# Afficher les 5 premières séquences encodées pour X
print("5 premières séquences encodées de X_train_encoded :")
for i in range(min(5, len(X_train_encoded))):
    print(f"Sequence {i + 1}: {X_train_encoded[i]}")

# Afficher les 5 premières séquences encodées pour Y
print("\n5 premières séquences encodées de Y_train_encoded :")
for i in range(min(5, len(Y_train_encoded))):
    print(f"Sequence {i + 1}: {Y_train_encoded[i]}")

# Afficher les 5 premières séquences padées pour X
print("\n5 premières séquences padées de X_train_padded :")
for i in range(min(5, len(X_train_padded))):
    print(f"Sequence {i + 1}: {X_train_padded[i].tolist()}")

# Afficher les 5 premières séquences padées pour Y
print("\n5 premières séquences padées de Y_train_padded :")
for i in range(min(5, len(Y_train_padded))):
    print(f"Sequence {i + 1}: {Y_train_padded[i].tolist()}")


5 premières séquences encodées de X_train_encoded :
Sequence 1: tensor([4977, 4977, 2700, 1372, 1064, 3949, 4082, 4881, 3607, 4535, 4978, 4978])
Sequence 2: tensor([4977, 4977, 2242, 4978, 4978])
Sequence 3: tensor([4977, 4977, 1184, 2257, 4808,  255, 3949, 3564, 2707,  629, 4082, 3410,
        3710, 3607,  243, 4978, 4978])
Sequence 4: tensor([4977, 4977,   82,  706, 1372, 1762, 4978, 4978])
Sequence 5: tensor([4977, 4977, 3506, 1667, 3506, 2993, 3898,   63, 3789, 4658, 4584, 1205,
        1183, 4978, 4978])

5 premières séquences encodées de Y_train_encoded :
Sequence 1: tensor([13362, 13362, 10264, 10505,  2364, 13363, 13363])
Sequence 2: tensor([13362, 13362, 12134, 13363, 13363])
Sequence 3: tensor([13362, 13362,  6500,  5125,  5732, 10329,  3253,  4108,  3030,  8601,
         4108,  6214,  3725, 13363, 13363])
Sequence 4: tensor([13362, 13362,  8157,  5158, 13363, 13363])
Sequence 5: tensor([13362, 13362,  2055,  5548,  7910,  2510,  4108, 11083,  6134, 13363,
        13363])

5 

LSTM Model Implementation

In [130]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Define VanillaLSTMCell as per the given equations
class VanillaLSTMCell(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(VanillaLSTMCell, self).__init__()
        self.hidden_size = hidden_size

        # Input gate parameters
        self.Wxi = nn.Linear(input_size, hidden_size)
        self.Whi = nn.Linear(hidden_size, hidden_size)
        self.bi = nn.Parameter(torch.zeros(hidden_size))

        # Forget gate parameters
        self.Wxf = nn.Linear(input_size, hidden_size)
        self.Whf = nn.Linear(hidden_size, hidden_size)
        self.bf = nn.Parameter(torch.zeros(hidden_size))

        # Output gate parameters
        self.Wxo = nn.Linear(input_size, hidden_size)
        self.Who = nn.Linear(hidden_size, hidden_size)
        self.bo = nn.Parameter(torch.zeros(hidden_size))

        # Cell gate (g_t) parameters
        self.Wxg = nn.Linear(input_size, hidden_size)
        self.Whg = nn.Linear(hidden_size, hidden_size)
        self.bc = nn.Parameter(torch.zeros(hidden_size))

    def forward(self, x_t, prev_state):
        h_prev, c_prev = prev_state

        # Compute gates
        i_t = torch.sigmoid(self.Whi(h_prev) + self.Wxi(x_t) + self.bi)
        f_t = torch.sigmoid(self.Whf(h_prev) + self.Wxf(x_t) + self.bf)
        o_t = torch.sigmoid(self.Who(h_prev) + self.Wxo(x_t) + self.bo)
        g_t = torch.tanh(self.Whg(h_prev) + self.Wxg(x_t) + self.bc)

        # Compute cell state and hidden state
        c_t = f_t * c_prev + i_t * g_t
        h_t = o_t * torch.tanh(g_t)

        return h_t, c_t

# Define the Seq2Seq model using VanillaLSTMCell
class LSTMSeq2Seq(nn.Module):
    def __init__(self, input_dim, output_dim, embed_dim, hidden_dim, n_layers):
        super(LSTMSeq2Seq, self).__init__()
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        self.encoder_embedding = nn.Embedding(input_dim, embed_dim)
        self.decoder_embedding = nn.Embedding(output_dim, embed_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

        # Encoder and Decoder use VanillaLSTMCell
        self.encoder_cells = nn.ModuleList([VanillaLSTMCell(embed_dim if i == 0 else hidden_dim, hidden_dim) for i in range(n_layers)])
        self.decoder_cells = nn.ModuleList([VanillaLSTMCell(embed_dim if i == 0 else hidden_dim, hidden_dim) for i in range(n_layers)])

    def forward(self, source, target):
        batch_size = source.size(0)
        seq_len = target.size(1)

        # Embedding for the encoder
        encoder_input = self.encoder_embedding(source)

        # Initialize states
        h, c = [torch.zeros(batch_size, self.hidden_dim).to(encoder_input.device) for _ in range(self.n_layers)], \
               [torch.zeros(batch_size, self.hidden_dim).to(encoder_input.device) for _ in range(self.n_layers)]

        # Encoder
        for t in range(source.size(1)):
            x_t = encoder_input[:, t, :]
            for i, cell in enumerate(self.encoder_cells):
                h[i], c[i] = cell(x_t, (h[i], c[i]))
                x_t = h[i]  # Pass the hidden state to the next layer

        # Initialize decoder states with encoder final states
        decoder_input = self.decoder_embedding(target[:, 0])  # Feed <SOS> token at the first timestep
        outputs = []

        for t in range(seq_len - 1):  # Exclude the last token in target
            for i, cell in enumerate(self.decoder_cells):
                h[i], c[i] = cell(decoder_input, (h[i], c[i]))
                decoder_input = h[i]  # Feed the current hidden state to the next layer

            # Get the output
            outputs.append(self.fc(decoder_input))  # Output for current timestep

            if t < seq_len - 1:  # Teacher forcing: use the target token for the next time step
                decoder_input = self.decoder_embedding(target[:, t + 1])

        # Stack outputs
        outputs = torch.stack(outputs, dim=1)
        return outputs


In [131]:
print(f"input_dim: {input_dim}, word_to_ix_eng size: {len(word_to_ix_eng)}")
print(f"output_dim: {output_dim}, word_to_ix_dari size: {len(word_to_ix_dari)}")


input_dim: 4979, word_to_ix_eng size: 4979
output_dim: 13364, word_to_ix_dari size: 13364


In [56]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim

# Create TensorDatasets for train, validation, and test
train_dataset = TensorDataset(X_train_padded, Y_train_padded)
val_dataset = TensorDataset(X_val_padded, Y_val_padded)
test_dataset = TensorDataset(X_test_padded, Y_test_padded)

# Create DataLoaders
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Define model parameters
input_dim = len(word_to_ix_eng)  # Vocabulary size of English
output_dim = len(word_to_ix_dari)  # Vocabulary size of Darija
embed_dim = 256  # Embedding size
hidden_dim = 512  # Hidden layer size
n_layers = 2  # Number of LSTM layers
lr=0.001

# Initialize model, loss, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMSeq2Seq(input_dim, output_dim, embed_dim, hidden_dim, n_layers).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=word_to_ix_dari["<PAD>"])  # Ignore PAD tokens in the loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    
    for batch in train_loader:
        source, target = batch
        source, target = source.to(device), target.to(device)

        optimizer.zero_grad()

        # Forward pass
        output = model(source, target)
        
        # Reshape output and target for loss computation
        output = output.view(-1, output_dim)  # Flatten predictions
        target = target[:, 1:].contiguous().view(-1)  # Flatten target, exclude <SOS> token

        # Compute loss and backpropagate
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()

    # Validation loss (optional)
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            source, target = batch
            source, target = source.to(device), target.to(device)

            output = model(source, target)
            output = output.view(-1, output_dim)
            target = target[:, 1:].contiguous().view(-1)

            val_loss += criterion(output, target).item()

    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {epoch_loss / len(train_loader):.4f}, Validation Loss: {val_loss / len(val_loader):.4f}")


Epoch 1/20, Train Loss: 5.4781, Validation Loss: 5.1966
Epoch 2/20, Train Loss: 4.8811, Validation Loss: 5.2758
Epoch 3/20, Train Loss: 4.7316, Validation Loss: 5.3133
Epoch 4/20, Train Loss: 4.5752, Validation Loss: 5.3121
Epoch 5/20, Train Loss: 4.3887, Validation Loss: 5.3861
Epoch 6/20, Train Loss: 4.1529, Validation Loss: 5.4817
Epoch 7/20, Train Loss: 3.8665, Validation Loss: 5.5950
Epoch 8/20, Train Loss: 3.5235, Validation Loss: 5.7382
Epoch 9/20, Train Loss: 3.1559, Validation Loss: 5.8747
Epoch 10/20, Train Loss: 2.7731, Validation Loss: 6.0137
Epoch 11/20, Train Loss: 2.4187, Validation Loss: 6.1691
Epoch 12/20, Train Loss: 2.1212, Validation Loss: 6.3103
Epoch 13/20, Train Loss: 1.8920, Validation Loss: 6.3907
Epoch 14/20, Train Loss: 1.7238, Validation Loss: 6.5345
Epoch 15/20, Train Loss: 1.6033, Validation Loss: 6.6026
Epoch 16/20, Train Loss: 1.5259, Validation Loss: 6.7041
Epoch 17/20, Train Loss: 1.4747, Validation Loss: 6.7427
Epoch 18/20, Train Loss: 1.4420, Validat

In [24]:
print(model)

LSTMSeq2Seq(
  (encoder_embedding): Embedding(4979, 256)
  (decoder_embedding): Embedding(13364, 256)
  (fc): Linear(in_features=512, out_features=13364, bias=True)
  (encoder_cells): ModuleList(
    (0): VanillaLSTMCell(
      (Wxi): Linear(in_features=256, out_features=512, bias=True)
      (Whi): Linear(in_features=512, out_features=512, bias=True)
      (Wxf): Linear(in_features=256, out_features=512, bias=True)
      (Whf): Linear(in_features=512, out_features=512, bias=True)
      (Wxo): Linear(in_features=256, out_features=512, bias=True)
      (Who): Linear(in_features=512, out_features=512, bias=True)
      (Wxg): Linear(in_features=256, out_features=512, bias=True)
      (Whg): Linear(in_features=512, out_features=512, bias=True)
    )
    (1): VanillaLSTMCell(
      (Wxi): Linear(in_features=512, out_features=512, bias=True)
      (Whi): Linear(in_features=512, out_features=512, bias=True)
      (Wxf): Linear(in_features=512, out_features=512, bias=True)
      (Whf): Linear(

In [25]:
# Chemin pour sauvegarder le modèle
model_path = "/kaggle/working/lstm_seq2seq_model.pth"

# Sauvegarder le modèle
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'input_dim': input_dim,
    'output_dim': output_dim,
    'embed_dim': embed_dim,
    'hidden_dim': hidden_dim,
    'n_layers': n_layers,
}, model_path)

print(f"Modèle sauvegardé dans {model_path}")


Modèle sauvegardé dans /kaggle/working/lstm_seq2seq_model.pth


test

In [83]:
def translate_sentence(sentence, model, word_to_ix_eng, word_to_ix_dari, ix_to_word_dari, max_length=50):
    # Preprocess and tokenize the sentence
    tokens = sentence.lower().split()  # Tokenize and lowercase
    tokens = [word_to_ix_eng.get(word, word_to_ix_eng['<UNK>']) for word in tokens]  # Convert to indices
    tokens = [word_to_ix_eng['<SOS>']] + tokens + [word_to_ix_eng['<EOS>']]  # Add <SOS> and <EOS>
    
    # Convert to tensor and move to device
    input_tensor = torch.tensor(tokens, dtype=torch.long).unsqueeze(0).to(device)  # Add batch dimension

    model.eval()
    with torch.no_grad():
        # Encoder forward pass
        batch_size = input_tensor.size(0)
        encoder_input = model.encoder_embedding(input_tensor)
        h, c = [torch.zeros(batch_size, model.hidden_dim).to(device) for _ in range(model.n_layers)], \
               [torch.zeros(batch_size, model.hidden_dim).to(device) for _ in range(model.n_layers)]
        
        for t in range(input_tensor.size(1)):
            x_t = encoder_input[:, t, :]
            for i, cell in enumerate(model.encoder_cells):
                h[i], c[i] = cell(x_t, (h[i], c[i]))
                x_t = h[i]  # Pass the hidden state to the next layer

        # Decoder initialization
        decoder_input = torch.tensor([word_to_ix_dari['<SOS>']], dtype=torch.long).to(device)  # Start with <SOS>
        decoder_input = model.decoder_embedding(decoder_input).unsqueeze(0)  # Add batch dimension
        translated_sentence = []

        for _ in range(max_length):
            for i, cell in enumerate(model.decoder_cells):
                h[i], c[i] = cell(decoder_input.squeeze(1), (h[i], c[i]))
                decoder_input = h[i]  # Feed the current hidden state to the next layer

            output_logits = model.fc(decoder_input)  # Compute output
            predicted_idx = torch.argmax(output_logits, dim=-1).item()  # Get index of the predicted word

            if predicted_idx == word_to_ix_dari['<EOS>']:
                break  # Stop if <EOS> is predicted

            translated_sentence.append(ix_to_word_dari[predicted_idx])  # Convert index to word
            decoder_input = model.decoder_embedding(torch.tensor([predicted_idx], dtype=torch.long).to(device)).unsqueeze(0)

    return ' '.join(translated_sentence)


In [80]:
ix_to_word_dari = {index: word for word, index in word_to_ix_dari.items()}


In [134]:
test_sentence = "are you new to this neighborhood"
translated_sentence = translate_sentence(test_sentence, model, word_to_ix_eng, word_to_ix_dari, ix_to_word_dari)
print(f"Translated: {translated_sentence}")


Translated: <SOS> wach nta jdid fhad l7ay


In [93]:
test_sentence = "hand me that bag"
translated_sentence = translate_sentence(test_sentence, model3, word_to_ix_eng, word_to_ix_dari, ix_to_word_dari)
print(f"Translated: {translated_sentence}")


Translated: <SOS> wach nta katakhod


In [136]:
test_sentence = "where are you"
translated_sentence = translate_sentence(test_sentence, model3, word_to_ix_eng, word_to_ix_dari, ix_to_word_dari)
print(f"Translated: {translated_sentence}")

Translated: <SOS> wach nta


In [101]:
test_sentence = "So the first thing you do is choose"
translated_sentence = translate_sentence(test_sentence, model2, word_to_ix_eng, word_to_ix_dari, ix_to_word_dari)
print(f"Translated: {translated_sentence}")

Translated: <SOS> wach 3ndk chi blassa katmchi liha fach kat7ss brassk machi houa hadak


In [80]:
test_sentences = ["how are you", "what is your name", "I am learning AI", "hello",""]

for sentence in test_sentences:
    translated_sentence = translate_sentence(sentence, model, word_to_ix_eng, word_to_ix_dari, ix_to_word_dari)
    print(f"Anglais : {sentence} => Darija : {translated_sentence}")


Anglais : how are you => Darija : <SOS> wach kayn nabid ma7ali bchkal khaS
Anglais : what is your name => Darija : <SOS> wach nta wajd
Anglais : I am learning AI => Darija : <SOS> wach kayn nabid ma7ali bchkal khaS
Anglais : hello => Darija : <SOS> wach kayn nabid ma7ali bchkal khaS
Anglais :  => Darija : <SOS> wach kayn nabid ma7ali bchkal khaS


model 2 changing hyperparams

In [133]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim

# Create TensorDatasets for train, validation, and test
train_dataset = TensorDataset(X_train_padded, Y_train_padded)
val_dataset = TensorDataset(X_val_padded, Y_val_padded)
test_dataset = TensorDataset(X_test_padded, Y_test_padded)

# Create DataLoaders
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


# Define model parameters
input_dim = len(word_to_ix_eng)  # Vocabulary size of English
output_dim = len(word_to_ix_dari)  # Vocabulary size of Darija
embed_dim = 64  # Embedding size
hidden_dim2 = 128  # Hidden layer size
n_layers = 1  # Number of LSTM layers
lr=0.0001

# Initialize model, loss, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model2 = LSTMSeq2Seq(input_dim, output_dim, embed_dim, hidden_dim, n_layers).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=word_to_ix_dari["<PAD>"])  # Ignore PAD tokens in the loss
optimizer = optim.Adam(model2.parameters(), lr=0.001)

# Training loop
num_epochs = 15
for epoch in range(num_epochs):
    model2.train()
    epoch_loss = 0
    
    for batch in train_loader:
        source, target = batch
        source, target = source.to(device), target.to(device)

        optimizer.zero_grad()

        # Forward pass
        output = model2(source, target)
        
        # Reshape output and target for loss computation
        output = output.view(-1, output_dim)  # Flatten predictions
        target = target[:, 1:].contiguous().view(-1)  # Flatten target, exclude <SOS> token

        # Compute loss and backpropagate
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()

    # Validation loss (optional)
    model2.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            source, target = batch
            source, target = source.to(device), target.to(device)

            output = model2(source, target)
            output = output.view(-1, output_dim)
            target = target[:, 1:].contiguous().view(-1)

            val_loss += criterion(output, target).item()

    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {epoch_loss / len(train_loader):.4f}, Validation Loss: {val_loss / len(val_loader):.4f}")


Epoch 1/15, Train Loss: 6.2453, Validation Loss: 5.2595
Epoch 2/15, Train Loss: 4.9672, Validation Loss: 5.2891
Epoch 3/15, Train Loss: 4.8273, Validation Loss: 5.3159
Epoch 4/15, Train Loss: 4.6952, Validation Loss: 5.3195
Epoch 5/15, Train Loss: 4.5541, Validation Loss: 5.3403
Epoch 6/15, Train Loss: 4.4071, Validation Loss: 5.3334
Epoch 7/15, Train Loss: 4.2531, Validation Loss: 5.3632
Epoch 8/15, Train Loss: 4.0984, Validation Loss: 5.3863
Epoch 9/15, Train Loss: 3.9473, Validation Loss: 5.4109
Epoch 10/15, Train Loss: 3.7981, Validation Loss: 5.4299
Epoch 11/15, Train Loss: 3.6548, Validation Loss: 5.4604
Epoch 12/15, Train Loss: 3.5152, Validation Loss: 5.4940
Epoch 13/15, Train Loss: 3.3826, Validation Loss: 5.5363
Epoch 14/15, Train Loss: 3.2551, Validation Loss: 5.5631
Epoch 15/15, Train Loss: 3.1316, Validation Loss: 5.5999


In [None]:
print(model2)

model 3 

In [55]:
# Training Setup
batch_size = 64
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)
train_loader = DataLoader(TensorDataset(X_train_padded, Y_train_padded), batch_size=batch_size, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val_padded, Y_val_padded), batch_size=batch_size, shuffle=False)

input_dim = len(word_to_ix_eng)
output_dim = len(word_to_ix_dari)
embed_dim = 64
hidden_dim = 128
n_layers = 1
lr=0.001

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model3 = LSTMSeq2Seq(input_dim, output_dim, embed_dim, hidden_dim, n_layers).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=word_to_ix_dari["<PAD>"])
optimizer = optim.Adam(model3.parameters(), lr=0.001)

num_epochs = 15
for epoch in range(num_epochs):
    model3.train()
    epoch_loss = 0

    for source, target in train_loader:
        source, target = source.to(device), target.to(device)

        optimizer.zero_grad()
        output = model3(source, target)
        output = output.view(-1, output_dim)
        target = target[:, 1:].contiguous().view(-1)

        loss = criterion(output, target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model3.parameters(), max_norm=1.0)
        optimizer.step()
        epoch_loss += loss.item()

    val_loss = 0
    model3.eval()
    with torch.no_grad():
        for source, target in val_loader:
            source, target = source.to(device), target.to(device)
            output = model3(source, target)
            output = output.view(-1, output_dim)
            target = target[:, 1:].contiguous().view(-1)
            val_loss += criterion(output, target).item()

    avg_val_loss = val_loss / len(val_loader)  # Compute avg_val_loss before using it
    scheduler.step(avg_val_loss)  # Pass avg_val_loss to the scheduler

    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {epoch_loss / len(train_loader):.4f}, Validation Loss: {avg_val_loss:.4f}")

Epoch 1/15, Train Loss: 6.3758, Validation Loss: 5.2577
Epoch 2/15, Train Loss: 4.9536, Validation Loss: 5.3045
Epoch 3/15, Train Loss: 4.8059, Validation Loss: 5.3305
Epoch 4/15, Train Loss: 4.6616, Validation Loss: 5.3361
Epoch 5/15, Train Loss: 4.5085, Validation Loss: 5.3608
Epoch 6/15, Train Loss: 4.3543, Validation Loss: 5.3721
Epoch 7/15, Train Loss: 4.1993, Validation Loss: 5.3927
Epoch 8/15, Train Loss: 4.0429, Validation Loss: 5.4303
Epoch 9/15, Train Loss: 3.8928, Validation Loss: 5.4450
Epoch 10/15, Train Loss: 3.7462, Validation Loss: 5.4685
Epoch 11/15, Train Loss: 3.6041, Validation Loss: 5.5011
Epoch 12/15, Train Loss: 3.4667, Validation Loss: 5.5429
Epoch 13/15, Train Loss: 3.3362, Validation Loss: 5.5791
Epoch 14/15, Train Loss: 3.2095, Validation Loss: 5.6105
Epoch 15/15, Train Loss: 3.0911, Validation Loss: 5.6492


In [137]:
# Training Setup
batch_size = 64
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)
train_loader = DataLoader(TensorDataset(X_train_padded, Y_train_padded), batch_size=batch_size, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val_padded, Y_val_padded), batch_size=batch_size, shuffle=False)

input_dim = len(word_to_ix_eng)
output_dim = len(word_to_ix_dari)
embed_dim = 512
hidden_dim = 512
n_layers = 5
lr=0.001

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model3 = LSTMSeq2Seq(input_dim, output_dim, embed_dim, hidden_dim, n_layers).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=word_to_ix_dari["<PAD>"])
optimizer = optim.Adam(model3.parameters(), lr=0.001)

num_epochs = 15
for epoch in range(num_epochs):
    model3.train()
    epoch_loss = 0

    for source, target in train_loader:
        source, target = source.to(device), target.to(device)

        optimizer.zero_grad()
        output = model3(source, target)
        output = output.view(-1, output_dim)
        target = target[:, 1:].contiguous().view(-1)

        loss = criterion(output, target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model3.parameters(), max_norm=1.0)
        optimizer.step()
        epoch_loss += loss.item()

    val_loss = 0
    model3.eval()
    with torch.no_grad():
        for source, target in val_loader:
            source, target = source.to(device), target.to(device)
            output = model3(source, target)
            output = output.view(-1, output_dim)
            target = target[:, 1:].contiguous().view(-1)
            val_loss += criterion(output, target).item()

    avg_val_loss = val_loss / len(val_loader)  # Compute avg_val_loss before using it
    scheduler.step(avg_val_loss)  # Pass avg_val_loss to the scheduler

    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {epoch_loss / len(train_loader):.4f}, Validation Loss: {avg_val_loss:.4f}")

Epoch 1/15, Train Loss: 5.5366, Validation Loss: 5.2606
Epoch 2/15, Train Loss: 4.9086, Validation Loss: 5.3500
Epoch 3/15, Train Loss: 4.7672, Validation Loss: 5.3798
Epoch 4/15, Train Loss: 4.6350, Validation Loss: 5.4555
Epoch 5/15, Train Loss: 4.5125, Validation Loss: 5.5186
Epoch 6/15, Train Loss: 4.3849, Validation Loss: 5.6219
Epoch 7/15, Train Loss: 4.2575, Validation Loss: 5.7511
Epoch 8/15, Train Loss: 4.1436, Validation Loss: 5.8736
Epoch 9/15, Train Loss: 4.0308, Validation Loss: 5.9787
Epoch 10/15, Train Loss: 3.9315, Validation Loss: 6.1090
Epoch 11/15, Train Loss: 3.8377, Validation Loss: 6.1317
Epoch 12/15, Train Loss: 3.7421, Validation Loss: 6.2169
Epoch 13/15, Train Loss: 3.6564, Validation Loss: 6.3159
Epoch 14/15, Train Loss: 3.5821, Validation Loss: 6.3698
Epoch 15/15, Train Loss: 3.5033, Validation Loss: 6.4415


In [92]:
print(model3)

LSTMSeq2Seq(
  (encoder_embedding): Embedding(4979, 64)
  (decoder_embedding): Embedding(13364, 64)
  (fc): Linear(in_features=128, out_features=13364, bias=True)
  (encoder_cells): ModuleList(
    (0): VanillaLSTMCell(
      (Wxi): Linear(in_features=64, out_features=128, bias=True)
      (Whi): Linear(in_features=128, out_features=128, bias=True)
      (Wxf): Linear(in_features=64, out_features=128, bias=True)
      (Whf): Linear(in_features=128, out_features=128, bias=True)
      (Wxo): Linear(in_features=64, out_features=128, bias=True)
      (Who): Linear(in_features=128, out_features=128, bias=True)
      (Wxg): Linear(in_features=64, out_features=128, bias=True)
      (Whg): Linear(in_features=128, out_features=128, bias=True)
    )
  )
  (decoder_cells): ModuleList(
    (0): VanillaLSTMCell(
      (Wxi): Linear(in_features=64, out_features=128, bias=True)
      (Whi): Linear(in_features=128, out_features=128, bias=True)
      (Wxf): Linear(in_features=64, out_features=128, bias

In [102]:
print(model2)

LSTMSeq2Seq(
  (encoder_embedding): Embedding(4979, 64)
  (decoder_embedding): Embedding(13364, 64)
  (fc): Linear(in_features=512, out_features=13364, bias=True)
  (encoder_cells): ModuleList(
    (0): VanillaLSTMCell(
      (Wxi): Linear(in_features=64, out_features=512, bias=True)
      (Whi): Linear(in_features=512, out_features=512, bias=True)
      (Wxf): Linear(in_features=64, out_features=512, bias=True)
      (Whf): Linear(in_features=512, out_features=512, bias=True)
      (Wxo): Linear(in_features=64, out_features=512, bias=True)
      (Who): Linear(in_features=512, out_features=512, bias=True)
      (Wxg): Linear(in_features=64, out_features=512, bias=True)
      (Whg): Linear(in_features=512, out_features=512, bias=True)
    )
  )
  (decoder_cells): ModuleList(
    (0): VanillaLSTMCell(
      (Wxi): Linear(in_features=64, out_features=512, bias=True)
      (Whi): Linear(in_features=512, out_features=512, bias=True)
      (Wxf): Linear(in_features=64, out_features=512, bias

In [91]:
print(model)

LSTMSeq2Seq(
  (encoder_embedding): Embedding(4979, 256)
  (decoder_embedding): Embedding(13364, 256)
  (fc): Linear(in_features=512, out_features=13364, bias=True)
  (encoder_cells): ModuleList(
    (0): VanillaLSTMCell(
      (Wxi): Linear(in_features=256, out_features=512, bias=True)
      (Whi): Linear(in_features=512, out_features=512, bias=True)
      (Wxf): Linear(in_features=256, out_features=512, bias=True)
      (Whf): Linear(in_features=512, out_features=512, bias=True)
      (Wxo): Linear(in_features=256, out_features=512, bias=True)
      (Who): Linear(in_features=512, out_features=512, bias=True)
      (Wxg): Linear(in_features=256, out_features=512, bias=True)
      (Whg): Linear(in_features=512, out_features=512, bias=True)
    )
    (1): VanillaLSTMCell(
      (Wxi): Linear(in_features=512, out_features=512, bias=True)
      (Whi): Linear(in_features=512, out_features=512, bias=True)
      (Wxf): Linear(in_features=512, out_features=512, bias=True)
      (Whf): Linear(

**Advanced LSTM Cells**

In [61]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class AdvancedLSTMCell(nn.Module):
    def _init_(self, input_dim, hidden_dim, peephole=True, working_memory=True):
        super(AdvancedLSTMCell, self)._init_()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.peephole = peephole
        self.working_memory = working_memory
        
        # Input gate parameters
        self.Wxi = nn.Linear(input_dim, hidden_dim, bias=True)
        self.Whi = nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.Wci = nn.Linear(hidden_dim, hidden_dim, bias=False) if peephole else None

        # Forget gate parameters
        self.Wxf = nn.Linear(input_dim, hidden_dim, bias=True)
        self.Whf = nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.Wcf = nn.Linear(hidden_dim, hidden_dim, bias=False) if peephole else None

        # Output gate parameters
        self.Wxo = nn.Linear(input_dim, hidden_dim, bias=True)
        self.Who = nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.Wco = nn.Linear(hidden_dim, hidden_dim, bias=False) if peephole else None

        # Cell state parameters
        self.Wxg = nn.Linear(input_dim, hidden_dim, bias=True)
        self.Whg = nn.Linear(hidden_dim, hidden_dim, bias=False)

    def forwardA(self, x, hidden):
        h_prev, c_prev = hidden

        # Working Memory Connections
        if self.working_memory:
            tanh_c_prev = torch.tanh(c_prev)
            i = torch.sigmoid(self.Wxi(x) + self.Whi(h_prev) + self.Wci(c_prev) if self.peephole else self.Wxi(x) + self.Whi(h_prev) + tanh_c_prev)
            f = torch.sigmoid(self.Wxf(x) + self.Whf(h_prev) + self.Wcf(c_prev) if self.peephole else self.Wxf(x) + self.Whf(h_prev) + tanh_c_prev)
            o = torch.sigmoid(self.Wxo(x) + self.Who(h_prev) + self.Wco(c_prev) if self.peephole else self.Wxo(x) + self.Who(h_prev) + tanh_c_prev)
        else:
            i = torch.sigmoid(self.Wxi(x) + self.Whi(h_prev) + (self.Wci(c_prev) if self.peephole else 0))
            f = torch.sigmoid(self.Wxf(x) + self.Whf(h_prev) + (self.Wcf(c_prev) if self.peephole else 0))
            o = torch.sigmoid(self.Wxo(x) + self.Who(h_prev) + (self.Wco(c_prev) if self.peephole else 0))

        # Compute the candidate cell state
        g = torch.tanh(self.Wxg(x) + self.Whg(h_prev))
        
        # Compute the new cell state and hidden state
        c_next = f * c_prev + i * g
        h_next = o * torch.tanh(g)

        return h_next, c_next

In [85]:
class AdvancedLSTMEncoder(nn.Module):
    def _init_(self, input_dim, embed_dim, hidden_dim, n_layers, peephole=False, working_memory=False):
        super(AdvancedLSTMEncoder, self)._init_()
        self.embedding = nn.Embedding(input_dim, embed_dim)
        self.lstm_cells = nn.ModuleList(
            [AdvancedLSTMCell(embed_dim if i == 0 else hidden_dim, hidden_dim, peephole, working_memory) for i in range(n_layers)]
        )

    def forward(self, x):
        embedded = self.embedding(x)
        h, c = [None] * len(self.lstm_cells), [None] * len(self.lstm_cells)
        outputs = []
        
        for t in range(embedded.size(1)):
            x_t = embedded[:, t, :]
            for i, lstm_cell in enumerate(self.lstm_cells):
                h_prev, c_prev = (h[i], c[i]) if h[i] is not None else (torch.zeros_like(x_t), torch.zeros_like(x_t))
                h[i], c[i] = lstm_cell(x_t, (h_prev, c_prev))
                x_t = h[i]
            outputs.append(h[-1])

        return torch.stack(outputs, dim=1), (h, c)

In [103]:
class LSTMSeq2Seq(nn.Module):
    def _init_(self, input_dim, output_dim, embed_dim, hidden_dim, n_layers,
                 lstm_cell_type, peephole=False, working_memory=False):
        super(LSTMSeq2Seq, self)._init_()
        
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        # Embedding layers
        self.encoder_embedding = nn.Embedding(input_dim, embed_dim)
        self.decoder_embedding = nn.Embedding(output_dim, embed_dim)

        # Encoder LSTM layers
        self.encoder_cells = nn.ModuleList([
            lstm_cell_type(embed_dim if i == 0 else hidden_dim, hidden_dim, 
                           peephole=peephole, working_memory=working_memory)
            for i in range(n_layers)
        ])
        
        # Decoder LSTM layers
        self.decoder_cells = nn.ModuleList([
            lstm_cell_type(embed_dim if i == 0 else hidden_dim, hidden_dim, 
                           peephole=peephole, working_memory=working_memory)
            for i in range(n_layers)
        ])
        
        # Fully connected output layer
        self.fc = nn.Linear(hidden_dim, output_dim)

In [105]:
# Define model with Advanced LSTM Cell
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_dim = len(word_to_ix_eng)
output_dim = len(word_to_ix_dari)
embed_dim = 64
hidden_dim = 128
n_layers = 2

model = LSTMSeq2Seq(
    input_dim=input_dim,
    output_dim=output_dim,
    embed_dim=embed_dim,
    hidden_dim=hidden_dim,
    n_layers=n_layers,
    lstm_cell_type=AdvancedLSTMCell,  # Use the advanced LSTM cell
    peephole=True,  # Enable peephole connections
    working_memory=True  # Enable working memory connections
).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=word_to_ix_dari['<PAD>'])
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)

# Training and Validation
num_epochs = 15
batch_size = 64
train_loader = DataLoader(TensorDataset(X_train_padded, Y_train_padded), batch_size=batch_size, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val_padded, Y_val_padded), batch_size=batch_size, shuffle=False)

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for source, target in train_loader:
        source, target = source.to(device), target.to(device)

        optimizer.zero_grad()
        output = model(source, target)
        output = output.view(-1, output_dim)
        target = target[:, 1:].contiguous().view(-1)

        loss = criterion(output, target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        train_loss += loss.item()

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for source, target in val_loader:
            source, target = source.to(device), target.to(device)
            output = model(source, target)
            output = output.view(-1, output_dim)
            target = target[:, 1:].contiguous().view(-1)
            val_loss += criterion(output, target).item()

    avg_train_loss = train_loss / len(train_loader)
    avg_val_loss = val_loss / len(val_loader)
    scheduler.step(avg_val_loss)

    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")

TypeError: LSTMSeq2Seq.__init__() got an unexpected keyword argument 'input_dim'