# RNN with attention for translation


### Loading the data


In [1]:
import pandas as pd

# Tab-separated sentence pairs: keep the first two columns and label them
# EN (English) and PR (Portuguese).
train_df = pd.read_csv("por.txt", sep="\t", usecols=[0, 1], names=["EN", "PR"])
train_df.head()

Unnamed: 0,EN,PR
0,Go.,Vai.
1,Go.,Vá.
2,Hi.,Oi.
3,Run!,Corre!
4,Run!,Corra!


### Preprocess


In [2]:
import unicodedata
import re


def unicode2ascii(s):
    """Remove combining accent marks from ``s``.

    The string is decomposed with NFD normalisation and every character whose
    Unicode category is ``Mn`` (nonspacing mark) is dropped. Characters that
    do not decompose are returned unchanged, even if they are not ASCII.
    """
    decomposed = unicodedata.normalize("NFD", s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
    return "".join(kept)


def normalize_string(s):
    """Clean a sentence so that it can be split into tokens on spaces.

    Steps, in order:
      1. lower-case, trim and strip accents (via ``unicode2ascii``);
      2. insert a space in front of every ``.``, ``!`` or ``?``;
      3. replace each run of characters other than ASCII letters, ``!`` and
         ``?`` with a single space;
      4. trim the result.

    Because ``.`` is not in the kept set of step 3, periods (like digits and
    apostrophes) do not survive; only letters, ``!`` and ``?`` remain.
    """
    cleaned = unicode2ascii(s.lower().strip())
    substitutions = (
        (r"([.!?])", r" \1"),
        (r"[^a-zA-Z!?]+", r" "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [3]:
from tqdm import tqdm


def preprocess(texts):
    """Lazily yield the token list of every entry in ``texts``.

    Each entry is converted with ``str``, cleaned by ``normalize_string`` and
    split on whitespace. Iteration is wrapped in a tqdm progress bar labelled
    "Building vocab".
    """
    progress = tqdm(texts, desc="Building vocab")
    yield from (normalize_string(str(raw)).split() for raw in progress)

### Create vocabs


In [4]:
from torchtext.vocab import build_vocab_from_iterator

# Reserved tokens handed to every vocabulary as `specials`.
SPECIAL_TOKENS = ["<SOS>", "<EOS>", "<UNK>", "<PAD>"]

# One vocabulary per language column, built from the normalised token stream
# (English first, then Portuguese).
english_vocab, port_vocab = (
    build_vocab_from_iterator(
        preprocess(train_df[column].values),
        specials=SPECIAL_TOKENS,
        special_first=True,
    )
    for column in ("EN", "PR")
)

Building vocab: 100%|██████████| 168903/168903 [00:04<00:00, 39251.91it/s]

Building vocab: 100%|██████████| 168903/168903 [00:04<00:00, 35398.26it/s]


In [5]:
# Report the size of each vocabulary.
print("Length of english vocab:", len(english_vocab))
print("Length of portugese vocab:", len(port_vocab))

Length of english vocab: 12221

Length of portugese vocab: 20582


In [6]:
import torch
import numpy as np
from torch.utils.data import TensorDataset, DataLoader

# Number of token positions every sequence is padded to / decoded for.
MAX_LENGTH = 100

# Prefer the GPU when PyTorch can see one, otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Start/end-of-sequence ids, looked up in the English vocabulary.
SOS_TOKEN_IDX, EOS_TOKEN_IDX = english_vocab["<SOS>"], english_vocab["<EOS>"]


def sentence2idxs(vocab, sentence):
    """Convert a raw sentence into a list of vocabulary indices.

    The sentence is passed through ``str`` and ``normalize_string`` and then
    split on whitespace, exactly as ``preprocess`` does when the vocabulary is
    built. Tokens that are not in ``vocab`` map to the index of ``"<UNK>"``.

    A sentence that normalises to the empty string yields an empty list
    (``split(" ")`` would have produced one empty token, i.e. a spurious
    ``<UNK>``).
    """
    tokens = normalize_string(str(sentence)).split()
    return [vocab[token] if token in vocab else vocab["<UNK>"] for token in tokens]


def sentence2tensor(vocab, sentence):
    """Encode one sentence as a ``(1, n_tokens + 1)`` long tensor on ``device``.

    The token indices from ``sentence2idxs`` are followed by the module-level
    ``EOS_TOKEN_IDX``.
    """
    token_ids = sentence2idxs(vocab, sentence) + [EOS_TOKEN_IDX]
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    return flat.view(1, -1)


def process_sentence(vocab, sentence):
    """Return the sentence's vocabulary indices followed by ``EOS_TOKEN_IDX``."""
    return [*sentence2idxs(vocab, sentence), EOS_TOKEN_IDX]


def get_dataloader(
    df,
    input_vocab,
    output_vocab,
    batch_size=64,
    in_column="EN",
    out_column="PR",
    shuffle=False,
):
    """Build a DataLoader of padded (input, target) index tensors.

    Args:
        df: DataFrame holding the sentence pairs.
        input_vocab: vocabulary used to index ``df[in_column]``.
        output_vocab: vocabulary used to index ``df[out_column]``.
        batch_size: number of sentence pairs per batch.
        in_column: name of the source-language column.
        out_column: name of the target-language column.
        shuffle: forwarded to ``DataLoader``. Defaults to ``False`` so that
            batches follow the row order of ``df``.

    Returns:
        A ``DataLoader`` over a ``TensorDataset`` of two int64 tensors of
        shape ``(len(df), MAX_LENGTH)``, both moved to ``device``. Positions
        past the end of a sentence hold the respective ``"<PAD>"`` index.

    Rows are addressed by position, so ``df`` may carry any index (slices,
    filtered or shuffled frames). Sequences longer than ``MAX_LENGTH`` are
    clipped instead of raising.
    """

    def _fit(idxs):
        # Clip over-long sequences but keep the last element, which is the EOS
        # marker appended by process_sentence.
        if len(idxs) <= MAX_LENGTH:
            return idxs
        return idxs[: MAX_LENGTH - 1] + idxs[-1:]

    n = len(df)
    input_idxs = np.full((n, MAX_LENGTH), input_vocab["<PAD>"], dtype=np.int64)
    target_idxs = np.full((n, MAX_LENGTH), output_vocab["<PAD>"], dtype=np.int64)

    # Enumerate positions ourselves: the DataFrame index labels are not
    # guaranteed to be 0..n-1.
    pairs = zip(df[in_column], df[out_column])
    for pos, (src_text, tgt_text) in enumerate(tqdm(pairs, total=n)):
        in_lang_idxs = _fit(process_sentence(input_vocab, src_text))
        out_lang_idxs = _fit(process_sentence(output_vocab, tgt_text))

        input_idxs[pos, : len(in_lang_idxs)] = in_lang_idxs
        target_idxs[pos, : len(out_lang_idxs)] = out_lang_idxs

    data = TensorDataset(
        torch.from_numpy(input_idxs).to(device),
        torch.from_numpy(target_idxs).to(device),
    )

    return DataLoader(data, batch_size=batch_size, shuffle=shuffle)

### Encoder Decoder RNN Model


In [7]:
import torch.nn as nn


class EncoderRNN(nn.Module):
    """Embedding -> dropout -> single GRU (batch-first) encoder.

    Args:
        input_size: number of rows in the embedding table.
        hidden_size: width of both the embeddings and the GRU state.
        dropout_p: dropout probability applied to the embeddings.
    """

    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        """Encode a batch of index sequences.

        Returns the GRU's ``(output, hidden)`` pair for the embedded,
        dropout-regularised input.
        """
        return self.gru(self.dropout(self.embedding(input)))

In [8]:
import torch.nn.functional as F


class DecoderRNN(nn.Module):
    """GRU decoder without attention.

    Args:
        hidden_size: width of the embeddings and of the GRU state.
        output_size: size of the target vocabulary (rows of the embedding
            table and width of the output projection).
    """

    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """Decode a whole batch step by step.

        Args:
            encoder_outputs: encoder output tensor; only its batch size and
                device are used here.
            encoder_hidden: used as the initial GRU hidden state.
            target_tensor: optional ``(batch, steps)`` tensor of target
                indices. When given, teacher forcing is used and one step is
                decoded per target column; otherwise the decoder feeds back
                its own greedy prediction for ``MAX_LENGTH`` steps.

        Returns:
            ``(log_probs, hidden, None)`` where ``log_probs`` has shape
            ``(batch, steps, output_size)``. The trailing ``None`` keeps the
            return signature aligned with decoders that also return attention
            weights.
        """
        batch_size = encoder_outputs.size(0)
        # Allocate the start tokens on the same device as the incoming
        # tensors rather than on a module-level global device.
        decoder_input = torch.full(
            (batch_size, 1),
            SOS_TOKEN_IDX,
            dtype=torch.long,
            device=encoder_outputs.device,
        )
        decoder_hidden = encoder_hidden
        decoder_outputs = []

        # With teacher forcing the target defines how many steps are needed.
        n_steps = MAX_LENGTH if target_tensor is None else target_tensor.size(1)
        for i in range(n_steps):
            decoder_output, decoder_hidden = self.forward_step(
                decoder_input, decoder_hidden
            )
            decoder_outputs.append(decoder_output)

            if target_tensor is not None:
                # Teacher forcing: feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1)
            else:
                # Without teacher forcing: use its own prediction as the next
                # input, detached from the autograd history
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        return (
            decoder_outputs,
            decoder_hidden,
            None,
        )  # return `None` for consistency in the training loop

    def forward_step(self, input, hidden):
        """Run one decoding step: embed, ReLU, GRU, project to vocabulary."""
        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        return output, hidden

### Train


In [9]:
def train_epoch(
    dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion
):
    """Run one optimisation pass over ``dataloader``.

    Args:
        dataloader: iterable of ``(input_tensor, target_tensor)`` batches.
        encoder: module called as ``encoder(input_tensor)`` and returning
            ``(outputs, hidden)``.
        decoder: module called as ``decoder(outputs, hidden, target_tensor)``
            and returning a 3-tuple whose first element holds per-step
            scores over the target vocabulary.
        encoder_optimizer: optimiser stepping the encoder parameters.
        decoder_optimizer: optimiser stepping the decoder parameters.
        criterion: loss called on the flattened scores and flattened targets.

    Returns:
        The mean batch loss as a float, or ``nan`` if the loader produced no
        batches.
    """
    # The models may have been left in eval mode (e.g. after running
    # inference in another cell); make sure dropout is active again.
    encoder.train()
    decoder.train()

    total_loss = 0.0
    n_batches = 0
    for input_tensor, target_tensor in tqdm(dataloader):
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        loss = criterion(
            decoder_outputs.reshape(-1, decoder_outputs.size(-1)),
            target_tensor.reshape(-1),
        )
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()
        n_batches += 1

    if n_batches == 0:
        # Nothing was trained on; avoid a ZeroDivisionError.
        return float("nan")
    return total_loss / n_batches

In [10]:
def train(
    train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001, pad_idx=None
):
    """Train ``encoder`` and ``decoder`` jointly with Adam and NLL loss.

    Args:
        train_dataloader: batches of ``(input_tensor, target_tensor)``.
        encoder: encoder module.
        decoder: decoder module.
        n_epochs: number of passes over ``train_dataloader``.
        learning_rate: learning rate for both Adam optimisers.
        pad_idx: optional target index to exclude from the loss (passed to
            ``nn.NLLLoss`` as ``ignore_index``). With the default ``None``
            every target position, padding included, contributes to the loss,
            so the printed value is averaged over padding as well.

    Prints the mean loss after every epoch; returns nothing.
    """
    encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)
    if pad_idx is None:
        criterion = nn.NLLLoss()
    else:
        criterion = nn.NLLLoss(ignore_index=pad_idx)

    for epoch in range(n_epochs):
        loss = train_epoch(
            train_dataloader,
            encoder,
            decoder,
            encoder_optimizer,
            decoder_optimizer,
            criterion,
        )
        print("Epoch:", epoch, "loss:", loss)

In [11]:
hidden_size, batch_size = 128, 64

# The baseline run uses only the first 10,000 sentence pairs.
train_dataloader = get_dataloader(
    train_df[:10000], english_vocab, port_vocab, batch_size=batch_size
)

encoder = EncoderRNN(input_size=len(english_vocab), hidden_size=hidden_size).to(
    device
)
decoder = DecoderRNN(hidden_size=hidden_size, output_size=len(port_vocab)).to(device)

100%|██████████| 10000/10000 [00:01<00:00, 8725.63it/s]


In [12]:
# Train the attention-free baseline (EncoderRNN + DecoderRNN) for five epochs.
train(train_dataloader, encoder, decoder, n_epochs=5)

100%|██████████| 157/157 [00:18<00:00,  8.66it/s]


Epoch: 0 loss: 0.9608244647265999


100%|██████████| 157/157 [00:16<00:00,  9.33it/s]


Epoch: 1 loss: 0.21218722299405723


100%|██████████| 157/157 [00:16<00:00,  9.32it/s]


Epoch: 2 loss: 0.19913442546774626


100%|██████████| 157/157 [00:16<00:00,  9.27it/s]


Epoch: 3 loss: 0.19054425522020668


100%|██████████| 157/157 [00:16<00:00,  9.28it/s]

Epoch: 4 loss: 0.1829584247557221





## Task

The task is to translate sentences from English to Dutch using an encoder–decoder RNN with an attention mechanism.


In [None]:
import pandas as pd

# NOTE: this rebinds `train_df`; from here on it holds the English-Dutch pairs
# from "train.csv" instead of the frame loaded from "por.txt".
train_df = pd.read_csv("train.csv")
train_df.head()

Unnamed: 0,id,EN,NL
0,0,I couldn't understand his joke.,Ik begreep zijn grap niet.
1,1,There was nothing Tom could do about it.,Er was niets dat Tom eraan kon doen.
2,2,He has a hat on.,Hij draagt een hoed.
3,3,Does that happen every time?,Gebeurt dat elke keer?
4,4,Please don't run in the classroom.,Alsjeblieft niet rennen in het klaslokaal.


In [14]:
from torchtext.vocab import build_vocab_from_iterator

# Reserved tokens handed to every vocabulary as `specials`.
SPECIAL_TOKENS = ["<SOS>", "<EOS>", "<UNK>", "<PAD>"]

# Rebuild the English vocabulary on the new data set and build the Dutch one
# (English first, then Dutch).
english_vocab, dutch_vocab = (
    build_vocab_from_iterator(
        preprocess(train_df[column].values),
        specials=SPECIAL_TOKENS,
        special_first=True,
    )
    for column in ("EN", "NL")
)

Building vocab: 100%|██████████| 68601/68601 [00:01<00:00, 41895.73it/s]

Building vocab: 100%|██████████| 68601/68601 [00:01<00:00, 40395.78it/s]


In [15]:
# Report the size of each vocabulary.
print("Length of english vocab:", len(english_vocab))
print("Length of dutch vocab:", len(dutch_vocab))

Length of english vocab: 9494

Length of dutch vocab: 13854


### Model


In [16]:
import torch.nn.functional as F


class BahdanauAttention(nn.Module):
    """Additive attention: ``score = Va(tanh(Wa(query) + Ua(keys)))``.

    Args:
        hidden_size: feature width of both the query and the keys.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        """Attend over ``keys`` with ``query``.

        The per-key scores are softmax-normalised over the key axis and used
        to form a weighted sum of ``keys`` via ``torch.bmm``.

        Returns:
            ``(context, weights)``: the weighted sum and the attention
            weights it was built from.
        """
        energy = torch.tanh(self.Wa(query) + self.Ua(keys))
        # Va yields one score per key in a trailing axis of size 1; move the
        # key axis last so that softmax and bmm operate over the keys.
        scores = self.Va(energy).squeeze(2).unsqueeze(1)
        weights = F.softmax(scores, dim=-1)
        return torch.bmm(weights, keys), weights


class AttnDecoderRNN(nn.Module):
    """GRU decoder that attends over the encoder outputs at every step.

    Args:
        hidden_size: width of the embeddings, the attention and the GRU state.
        output_size: size of the target vocabulary.
        dropout_p: dropout probability applied to the input embeddings.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = BahdanauAttention(hidden_size)
        # The GRU consumes the embedded token concatenated with the context.
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """Decode a whole batch step by step.

        Args:
            encoder_outputs: encoder outputs, used as attention keys.
            encoder_hidden: used as the initial GRU hidden state.
            target_tensor: optional ``(batch, steps)`` tensor of target
                indices. When given, teacher forcing is used and one step is
                decoded per target column; otherwise the decoder feeds back
                its own greedy prediction for ``MAX_LENGTH`` steps.

        Returns:
            ``(log_probs, hidden, attentions)``: per-step log-probabilities
            concatenated along dim 1, the final GRU hidden state, and the
            per-step attention weights concatenated along dim 1.
        """
        batch_size = encoder_outputs.size(0)
        # Allocate the start tokens on the same device as the incoming
        # tensors rather than on a module-level global device.
        decoder_input = torch.full(
            (batch_size, 1),
            SOS_TOKEN_IDX,
            dtype=torch.long,
            device=encoder_outputs.device,
        )
        decoder_hidden = encoder_hidden
        decoder_outputs = []
        attentions = []

        # With teacher forcing the target defines how many steps are needed.
        n_steps = MAX_LENGTH if target_tensor is None else target_tensor.size(1)
        for i in range(n_steps):
            decoder_output, decoder_hidden, attn_weights = self.forward_step(
                decoder_input, decoder_hidden, encoder_outputs
            )
            decoder_outputs.append(decoder_output)
            attentions.append(attn_weights)

            if target_tensor is not None:
                # Teacher forcing: feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1)
            else:
                # Without teacher forcing: use its own prediction as the next
                # input, detached from the autograd history
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        attentions = torch.cat(attentions, dim=1)

        return decoder_outputs, decoder_hidden, attentions

    def forward_step(self, input, hidden, encoder_outputs):
        """Run one decoding step.

        The current hidden state (with its first two axes swapped) queries the
        attention module; the resulting context is concatenated with the
        embedded input token and fed to the GRU, whose output is projected to
        vocabulary scores.
        """
        embedded = self.dropout(self.embedding(input))

        query = hidden.permute(1, 0, 2)
        context, attn_weights = self.attention(query, encoder_outputs)
        input_gru = torch.cat((embedded, context), dim=2)

        output, hidden = self.gru(input_gru, hidden)
        output = self.out(output)

        return output, hidden, attn_weights

In [17]:
hidden_size, batch_size = 128, 64

# Full English-Dutch training set this time.
train_dataloader = get_dataloader(
    train_df,
    english_vocab,
    dutch_vocab,
    batch_size=batch_size,
    in_column="EN",
    out_column="NL",
)

encoder = EncoderRNN(input_size=len(english_vocab), hidden_size=hidden_size).to(
    device
)
decoder = AttnDecoderRNN(
    hidden_size=hidden_size, output_size=len(dutch_vocab)
).to(device)

100%|██████████| 68601/68601 [00:09<00:00, 7005.69it/s]


In [18]:
# Train the attention model (EncoderRNN + AttnDecoderRNN) for ten epochs.
train(train_dataloader, encoder, decoder, n_epochs=10)

100%|██████████| 1072/1072 [03:12<00:00,  5.57it/s]


Epoch: 0 loss: 0.4511483043932648


100%|██████████| 1072/1072 [03:12<00:00,  5.58it/s]


Epoch: 1 loss: 0.27090134021283974


100%|██████████| 1072/1072 [03:11<00:00,  5.58it/s]


Epoch: 2 loss: 0.22069264835775343


100%|██████████| 1072/1072 [03:11<00:00,  5.59it/s]


Epoch: 3 loss: 0.1866087288554035


100%|██████████| 1072/1072 [03:11<00:00,  5.60it/s]


Epoch: 4 loss: 0.16060118323692413


100%|██████████| 1072/1072 [03:12<00:00,  5.56it/s]


Epoch: 5 loss: 0.1401216130592485


100%|██████████| 1072/1072 [03:10<00:00,  5.62it/s]


Epoch: 6 loss: 0.12376572213955779


100%|██████████| 1072/1072 [03:11<00:00,  5.61it/s]


Epoch: 7 loss: 0.11092277055383841


100%|██████████| 1072/1072 [03:10<00:00,  5.63it/s]


Epoch: 8 loss: 0.10053218684312124


100%|██████████| 1072/1072 [03:10<00:00,  5.62it/s]

Epoch: 9 loss: 0.09200224798839929





In [19]:
# Persist only the parameters (state_dict) of each model, not the module objects.
torch.save(encoder.state_dict(), "encoder.pt")
torch.save(decoder.state_dict(), "decoder.pt")

In [20]:
def predict(
    encoder, decoder, dataloader, input_vocab=english_vocab, output_vocab=dutch_vocab
):
    """Greedily decode every batch of ``dataloader`` into sentences.

    Args:
        encoder: encoder module returning ``(outputs, hidden)``.
        decoder: decoder module returning per-step scores as the first element
            of a 3-tuple when called without a target.
        dataloader: batches of ``(input_tensor, target)``; the target is
            ignored.
        input_vocab: unused; kept for interface compatibility.
        output_vocab: vocabulary used to map predicted indices to tokens and
            to look up the ``"<EOS>"`` index.

    Returns:
        A list with one space-joined string per input row, in loader order.
        Each sentence ends before the first predicted ``<EOS>``.

    The models are used in whatever mode they are in; callers that want
    dropout disabled should call ``.eval()`` on them first.
    """
    # Build the index -> token table once instead of once per decoded token.
    itos = output_vocab.get_itos()
    # Stop at the EOS id of the vocabulary that is actually being decoded.
    eos_idx = output_vocab["<EOS>"]

    predictions = []
    with torch.no_grad():
        for input_tensor, _ in tqdm(dataloader):
            input_tensor = input_tensor.to(device)

            encoder_outputs, encoder_hidden = encoder(input_tensor)
            decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden)

            _, topi = decoder_outputs.topk(1)
            # Drop only the top-k axis; a bare squeeze() would also remove
            # the batch axis when the final batch holds a single row.
            decoded_ids = topi.squeeze(-1).tolist()

            for sentence in decoded_ids:
                decoded_words = []
                for idx in sentence:
                    if idx == eos_idx:
                        break
                    decoded_words.append(itos[idx])
                predictions.append(" ".join(decoded_words))
    return predictions

In [21]:
# get_dataloader needs a target column, so give the test frame a constant
# placeholder value in "NL".
test_df = pd.read_csv("test.csv").assign(NL="a")

In [22]:
test_dataloader = get_dataloader(
    test_df,
    english_vocab,
    dutch_vocab,
    batch_size=batch_size,
    in_column="EN",
    out_column="NL",
)

100%|██████████| 7623/7623 [00:00<00:00, 8951.17it/s]


In [23]:
# nn.Module.eval() returns the module itself, so both models are switched to
# evaluation mode (dropout off) as they are handed to predict.
predictions = predict(encoder.eval(), decoder.eval(), test_dataloader)

100%|██████████| 120/120 [00:55<00:00,  2.15it/s]


In [24]:
# Attach the decoded sentences to the test frame (adds a column in place)
# and display them next to their ids.
test_df["prediction"] = predictions
test_df[["id", "prediction"]]