In [2]:
import pandas as pd

data_path = "data/"

train_df = pd.read_csv(f"{data_path}train.csv")
train_df.head()

Unnamed: 0,id,EN,NL
0,0,I couldn't understand his joke.,Ik begreep zijn grap niet.
1,1,There was nothing Tom could do about it.,Er was niets dat Tom eraan kon doen.
2,2,He has a hat on.,Hij draagt een hoed.
3,3,Does that happen every time?,Gebeurt dat elke keer?
4,4,Please don't run in the classroom.,Alsjeblieft niet rennen in het klaslokaal.


In [3]:
import unicodedata
import re


def unicode2ascii(s):
    return "".join(
        c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn"
    )


def normalize_string(s):
    s = unicode2ascii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z!?]+", r" ", s)
    return s.strip()

In [4]:
from tqdm import tqdm


def preprocess(texts):
    for text in tqdm(texts, desc="Building vocab"):
        tokens = normalize_string(str(text)).split()
        yield tokens

In [5]:
from torchtext.vocab import build_vocab_from_iterator

SPECIAL_TOKENS = ["<SOS>", "<EOS>", "<UNK>", "<PAD>"]
english_vocab = build_vocab_from_iterator(
    preprocess(train_df["EN"].values), special_first=True, specials=SPECIAL_TOKENS
)

dutch_vocab = build_vocab_from_iterator(
    preprocess(train_df["NL"].values), special_first=True, specials=SPECIAL_TOKENS
)

Building vocab: 100%|██████████| 68601/68601 [00:01<00:00, 52049.24it/s]
Building vocab: 100%|██████████| 68601/68601 [00:01<00:00, 49141.01it/s]


In [6]:
print("Length of english vocab:", len(english_vocab))
print("Length of dutch vocab:", len(dutch_vocab))

Length of english vocab: 9494
Length of dutch vocab: 13854


In [7]:
import torch
import numpy as np
from torch.utils.data import TensorDataset, DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
SOS_TOKEN_IDX = english_vocab["<SOS>"]
EOS_TOKEN_IDX = english_vocab["<EOS>"]
MAX_LENGTH = 100


def sentence2idxs(vocab, sentence):
    tokens = normalize_string(str(sentence)).split(" ")
    return [vocab[word] if word in vocab else vocab["<UNK>"] for word in tokens]


def sentence2tensor(vocab, sentence):
    indexes = sentence2idxs(vocab, sentence)
    indexes.append(EOS_TOKEN_IDX)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(1, -1)


def process_sentence(vocab, sentence):
    idxs = sentence2idxs(vocab, sentence)
    idxs.append(EOS_TOKEN_IDX)
    return idxs


def get_dataloader(df, input_vocab, output_vocab, batch_size=64, in_column="EN", out_column="NL"):
    n = len(df)
    input_idxs = np.ones((n, MAX_LENGTH), dtype=np.int32) * input_vocab["<PAD>"]
    target_idxs = np.ones((n, MAX_LENGTH), dtype=np.int32) * output_vocab["<PAD>"]

    for idx, row in tqdm(df.iterrows(), total=n):
        in_lang_idxs = process_sentence(input_vocab, row[in_column])
        out_lang_idxs = process_sentence(output_vocab, row[out_column])

        input_idxs[idx, : len(in_lang_idxs)] = in_lang_idxs
        target_idxs[idx, : len(out_lang_idxs)] = out_lang_idxs

    data = TensorDataset(
        torch.LongTensor(input_idxs).to(device),
        torch.LongTensor(target_idxs).to(device),
    )

    dataloader = DataLoader(data, batch_size=batch_size)
    return dataloader

In [8]:
import torch.nn as nn


class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.2):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.gru(embedded)
        return output, hidden

In [9]:
import torch.nn.functional as F


class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(
            batch_size, 1, dtype=torch.long, device=device
        ).fill_(SOS_TOKEN_IDX)
        decoder_hidden = encoder_hidden
        decoder_outputs = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden = self.forward_step(
                decoder_input, decoder_hidden
            )
            decoder_outputs.append(decoder_output)

            if target_tensor is not None:
                decoder_input = target_tensor[:, i].unsqueeze(1)
            else:
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(
                    -1
                ).detach()

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        return decoder_outputs, decoder_hidden, None

    def forward_step(self, input, hidden):
        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        return output, hidden

In [10]:
def train_epoch(
    dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion
):

    total_loss = 0
    for data in tqdm(dataloader):
        input_tensor, target_tensor = data

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)), target_tensor.view(-1)
        )
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [11]:
def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001):
    encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for epoch in range(n_epochs):
        loss = train_epoch(
            train_dataloader,
            encoder,
            decoder,
            encoder_optimizer,
            decoder_optimizer,
            criterion,
        )
        print("Epoch:", epoch, "loss:", loss)

In [12]:
hidden_size = 128
batch_size = 64

train_dataloader = get_dataloader(train_df, english_vocab, dutch_vocab, batch_size)

encoder = EncoderRNN(len(english_vocab), hidden_size).to(device)
decoder = DecoderRNN(hidden_size, len(dutch_vocab)).to(device)

100%|██████████| 68601/68601 [00:11<00:00, 5924.44it/s]


In [13]:
train(train_dataloader, encoder, decoder, n_epochs=20)

100%|██████████| 1072/1072 [02:20<00:00,  7.61it/s]


Epoch: 0 loss: 0.47423904305740966


100%|██████████| 1072/1072 [02:05<00:00,  8.54it/s]


Epoch: 1 loss: 0.3062274100731558


100%|██████████| 1072/1072 [01:45<00:00, 10.15it/s]


Epoch: 2 loss: 0.2542075140318319


100%|██████████| 1072/1072 [01:48<00:00,  9.88it/s]


Epoch: 3 loss: 0.22615738603661753


100%|██████████| 1072/1072 [01:56<00:00,  9.18it/s]


Epoch: 4 loss: 0.20622452106604824


100%|██████████| 1072/1072 [01:55<00:00,  9.27it/s]


Epoch: 5 loss: 0.19032614177732327


100%|██████████| 1072/1072 [01:50<00:00,  9.70it/s]


Epoch: 6 loss: 0.17713069580773363


100%|██████████| 1072/1072 [01:48<00:00,  9.89it/s]


Epoch: 7 loss: 0.1658668840137213


100%|██████████| 1072/1072 [01:49<00:00,  9.79it/s]


Epoch: 8 loss: 0.15610195814506778


100%|██████████| 1072/1072 [01:48<00:00,  9.90it/s]


Epoch: 9 loss: 0.1476164728389191


100%|██████████| 1072/1072 [01:46<00:00, 10.11it/s]


Epoch: 10 loss: 0.14015030847000542


100%|██████████| 1072/1072 [01:50<00:00,  9.68it/s]


Epoch: 11 loss: 0.13355577340238353


100%|██████████| 1072/1072 [01:48<00:00,  9.91it/s]


Epoch: 12 loss: 0.12755342138541945


100%|██████████| 1072/1072 [01:45<00:00, 10.13it/s]


Epoch: 13 loss: 0.12217226118516566


100%|██████████| 1072/1072 [01:45<00:00, 10.13it/s]


Epoch: 14 loss: 0.11723541822145457


100%|██████████| 1072/1072 [01:43<00:00, 10.33it/s]


Epoch: 15 loss: 0.1128037782288643


100%|██████████| 1072/1072 [01:47<00:00,  9.99it/s]


Epoch: 16 loss: 0.10872341763339381


100%|██████████| 1072/1072 [01:53<00:00,  9.48it/s]


Epoch: 17 loss: 0.10494116076560163


100%|██████████| 1072/1072 [01:56<00:00,  9.24it/s]


Epoch: 18 loss: 0.10142438000401677


100%|██████████| 1072/1072 [01:52<00:00,  9.49it/s]

Epoch: 19 loss: 0.09823629308019334





In [15]:
test_df = pd.read_csv(f"{data_path}test.csv")

In [52]:
def translate_sentence(sentence, encoder, decoder):
    encoder.eval()
    decoder.eval()
    with torch.no_grad():
        input_tensor = sentence2tensor(english_vocab, sentence)
        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden)

        decoded_words = [dutch_vocab.lookup_token(token) for token in decoder_outputs.topk(1)[1][0]]
        answer = " ".join(decoded_words[:decoded_words.index("<EOS>")]) if "<EOS>" in decoded_words else " ".join(decoded_words)
        if answer:
            return answer
        return "Ik heb het woordenboek."

In [53]:
translations = []
for idx, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Translating"):
    translation = translate_sentence(row["EN"], encoder, decoder)
    translations.append(translation)

submission_df = pd.DataFrame({"id": test_df["id"], "prediction": translations})
submission_df.to_csv("submission.csv", index=False)

Translating: 100%|██████████| 7623/7623 [05:05<00:00, 24.97it/s]
