In [1]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tqdm import tqdm

In [2]:
# Device Configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Hyper-Parameters
MAX_LENGTH = 25
EMBEDDING_DIM = 128
HIDDEN_DIM = 256
NUM_LAYERS = 2
DROPOUT = 0.3
BATCH_SIZE = 64
NUM_EPOCHS = 50
LEARNING_RATE = 0.001
CLIP = 1

### Data Preprocessing

In [4]:
import pandas as pd

df = pd.read_csv("eng_spn.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,English words/sentences,French words/sentences
0,0,go,ve
1,1,go,vete
2,2,go,vaya
3,3,go,vayase
4,4,hi,hola


In [5]:
input_data = df["English words/sentences"]
target_data = df["French words/sentences"].apply(lambda x: "<sos> " + x + " <eos>")

In [6]:
# Tokenize dataset
input_tokenizer = Tokenizer()
input_tokenizer.fit_on_texts(input_data)
input_sequences = input_tokenizer.texts_to_sequences(input_data)

target_tokenizer = Tokenizer()
target_tokenizer.fit_on_texts(target_data)
target_sequences = target_tokenizer.texts_to_sequences(target_data)

In [7]:
# Pad sequences
padded_input_sequences = pad_sequences(
    input_sequences, maxlen=MAX_LENGTH, padding="post"
)

padded_target_sequences = pad_sequences(
    target_sequences, maxlen=MAX_LENGTH, padding="post"
)

In [8]:
# Vocab size
input_vocab_size = len(input_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1

In [9]:
# Convert to pytorch tensors
input_tensor = torch.tensor(padded_input_sequences, dtype=torch.long)
target_tensor = torch.tensor(padded_target_sequences, dtype=torch.long)

In [10]:
# Dataloader
dataloader = DataLoader(
    TensorDataset(input_tensor, target_tensor), batch_size=BATCH_SIZE, shuffle=True
)

### Encoder

In [11]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, dropout):
        super(Encoder, self).__init__()

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embed_dim)

        # LSTM layer
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            num_layers=num_layers,
            dropout=dropout,
            batch_first=True,
        )

        # Dropout
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Convert tokens to vectors
        embedded = self.embedding(x)

        # Pass the embedded vector into LSTM layer
        lstm_output, (hidden, cell) = self.lstm(embedded)

        return hidden, cell

### Decoder

In [12]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, dropout):
        super(Decoder, self).__init__()

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embed_dim)

        # LSTM layer
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            num_layers=num_layers,
            dropout=dropout,
            batch_first=True,
        )

        # Dropout
        self.dropout = nn.Dropout(dropout)

        # Linear layer
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden, cell):
        # Add batch dimension
        x = x.unsqueeze(1)

        # Convert tokens to vectors
        embedded = self.embedding(x)

        # Pass the embedded vector into LSTM layer
        lstm_output, (hidden, cell) = self.lstm(embedded, (hidden, cell))

        # Remove the batch dimension
        lstm_output = lstm_output.squeeze(1)

        # Generate predictions for the next token
        prediction = self.fc(lstm_output)

        return prediction, hidden, cell

### Seq2Seq

In [13]:
class Seq2Seq(nn.Module):
    def __init__(
        self,
        input_vocab_size,
        target_vocab_size,
        embed_dim,
        hidden_dim,
        num_layers,
        dropout,
    ):
        super(Seq2Seq, self).__init__()

        # Encoder
        self.encoder = Encoder(
            input_vocab_size, embed_dim, hidden_dim, num_layers, dropout
        )

        # Decoder
        self.decoder = Decoder(
            target_vocab_size, embed_dim, hidden_dim, num_layers, dropout
        )

    def forward(self, input, target):
        batch_size, max_length = target.size()
        target_vocab_size = self.decoder.fc.out_features

        # Tensor to store outputs for all time steps
        outputs = torch.zeros(batch_size, max_length, target_vocab_size)

        # Get hidden and cell states from the encoder
        hidden, cell = self.encoder(input)

        # Start decoding with the first target token
        target_input_token = target[:, 0]

        for t in range(1, max_length):
            decoder_output, hidden, cell = self.decoder(
                target_input_token, hidden, cell
            )
            outputs[:, t, :] = decoder_output
            target_input_token = target[:, t]

        return outputs

In [14]:
# Initialize model
model = Seq2Seq(
    input_vocab_size,
    target_vocab_size,
    EMBEDDING_DIM,
    HIDDEN_DIM,
    NUM_LAYERS,
    DROPOUT,
)

In [15]:
# Checkpoint


# Save model function
def save_checkpoint(epoch, model, filename="checkpoint.pth"):
    torch.save(
        {
            "epoch": epoch + 1,
            "model_state_dict": model.state_dict(),
        },
        filename,
    )


# Load model function
def load_checkpoint(model, filename):
    checkpoint = torch.load(filename)

    start_epoch = checkpoint["epoch"]

    model.load_state_dict(checkpoint["model_state_dict"])
    return start_epoch

In [16]:
# Load model
try:
    start_epoch = load_checkpoint(model, filename="checkpoint.pth")
    print(f"Resuming training from epoch: {start_epoch}")
except FileNotFoundError:
    start_epoch = 1
    print(f"No checkpoint found, starting training from scratch...")

No checkpoint found, starting training from scratch...


  checkpoint = torch.load(filename)


In [17]:
# Initialize Adam optimizer and Loss function
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=0)

In [18]:
# Train function
def train(model, optimizer, criterion, dataloader, epochs=NUM_EPOCHS):

    model.train()  # Set model to Training mode

    total_loss = 0

    for epoch in range(start_epoch, epochs + 1):
        epoch_loss = 0
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch}/{epochs}")

        for input, target in progress_bar:

            # Reset gradients
            optimizer.zero_grad()

            # Forward pass
            output = model(input, target)

            # Reshape input and target to calculate loss
            output = output[:, 1:].reshape(-1, output.shape[2])
            target = target[:, 1:].reshape(-1)

            # Compute loss and backpropagation
            loss = criterion(output, target)
            loss.backward()

            # Clip gradients to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)

            # Update model parameters
            optimizer.step()

            epoch_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

        total_loss += epoch_loss

        progress_bar.close()

        save_checkpoint(epoch, model)

    print(f"Total Loss: {total_loss/len(dataloader)}")

In [19]:
# Training
train(model, optimizer, criterion, dataloader)

Epoch 1/50: 100%|██████████| 157/157 [02:56<00:00,  1.13s/it, loss=4.76]
Epoch 2/50: 100%|██████████| 157/157 [02:53<00:00,  1.11s/it, loss=4.42]
Epoch 3/50: 100%|██████████| 157/157 [02:51<00:00,  1.09s/it, loss=4.08]
Epoch 4/50: 100%|██████████| 157/157 [03:05<00:00,  1.18s/it, loss=4.09]
Epoch 5/50: 100%|██████████| 157/157 [03:05<00:00,  1.18s/it, loss=3.94]
Epoch 6/50: 100%|██████████| 157/157 [03:18<00:00,  1.27s/it, loss=3.07]
Epoch 7/50: 100%|██████████| 157/157 [03:33<00:00,  1.36s/it, loss=2.96]
Epoch 8/50: 100%|██████████| 157/157 [03:15<00:00,  1.24s/it, loss=2.97]
Epoch 9/50: 100%|██████████| 157/157 [03:16<00:00,  1.25s/it, loss=2.66]
Epoch 10/50: 100%|██████████| 157/157 [02:55<00:00,  1.12s/it, loss=2.6] 
Epoch 11/50: 100%|██████████| 157/157 [02:51<00:00,  1.09s/it, loss=2.42]
Epoch 12/50: 100%|██████████| 157/157 [02:54<00:00,  1.11s/it, loss=2.47]
Epoch 13/50: 100%|██████████| 157/157 [02:36<00:00,  1.00it/s, loss=2.58]
Epoch 14/50: 100%|██████████| 157/157 [02:47<00

Total Loss: 74.06826939419577





### Prediction

In [20]:
def predict(
    model, input_text, input_tokenizer, target_tokenizer, max_length=MAX_LENGTH
):

    # Set model to evaluation mode
    model.eval()

    # Convert text to sequence
    input_sequence = input_tokenizer.texts_to_sequences([input_text])

    # Apply padding
    padded_input_sequence = pad_sequences(
        input_sequence, maxlen=max_length, padding="post"
    )

    # Convert to torch tensor
    input_tensor = torch.tensor(padded_input_sequence, dtype=torch.long)

    with torch.no_grad():
        hidden, cell = model.encoder(input_tensor)

    sos_token = target_tokenizer.word_index["sos"]

    # Start prediction with "sos" token
    x_input = torch.tensor([sos_token], dtype=torch.long)

    # List to store predicted tokens
    translated_sentence = []

    for _ in range(max_length):
        with torch.no_grad():
            prediction, hidden, cell = model.decoder(x_input, hidden, cell)

        predicted_token = prediction.argmax(1).item()

        # Stop prediction if "eos" is predicted
        if predicted_token == target_tokenizer.word_index["eos"]:
            break

        translated_sentence.append(predicted_token)

        # Update x_input for next time step
        x_input = torch.tensor([predicted_token], dtype=torch.long)

    # Convert tokens back to text
    translated_sentence = target_tokenizer.sequences_to_texts([translated_sentence])[0]

    return translated_sentence

In [22]:
# Predict
input_sentence = "ask tom instead"
translated_sentence = predict(model, input_sentence, input_tokenizer, target_tokenizer)
print(f"Input Sentence: {input_sentence}")
print(f"Translated Sentence: {translated_sentence}")

Input Sentence: ask tom instead
Translated Sentence: pregunta a tom en su lugar
