# Sequence to Sequence Learning with Neural Networks

## 1. Libraries

In [17]:
import os
import re
from typing import List

import spacy
import torch
import torchtext
import pandas as pd
from torch import nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset, DataLoader

## 2. The Dataset

In [2]:
# Relative path to the English-French sentence-pair CSV.
DATASET_PATH = os.path.join(
    "..", "..", "nlp", "datasets", "en-fr-translation", "en-fr.csv"
)

# Load the pairs and shorten both verbose column headers in a single rename.
df = pd.read_csv(DATASET_PATH).rename(
    columns={
        "English words/sentences": "EN",
        "French words/sentences": "FR",
    }
)

df.head()

Unnamed: 0,EN,FR
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [8]:
# One spaCy-backed tokenizer per language, built through torchtext's
# get_tokenizer helper.
fr_tokenizer = get_tokenizer("spacy", language="fr_core_news_sm")
en_tokenizer = get_tokenizer("spacy", language="en_core_web_sm")

In [23]:
def prepare_sentence(sentence: str) -> str:
    """Normalise spacing in *sentence* before it is tokenized.

    Each run of the punctuation characters ``. , ! ? : ;`` is surrounded by
    spaces, every run of whitespace is collapsed to a single space, and
    leading/trailing whitespace is removed.

    Args:
        sentence: The raw sentence.

    Returns:
        The normalised sentence, e.g. ``"Hi."`` becomes ``"Hi ."``.
    """
    # Detach punctuation runs from the neighbouring words.
    sentence = re.sub(r"([.,!?:;]+)", r" \1 ", sentence)

    # Collapse the doubled spaces the previous step may have introduced
    # (and any other whitespace run) to a single space.
    sentence = re.sub(r"\s+", " ", sentence)

    # Padding punctuation at either end of the sentence leaves a stray space
    # there; drop it so it cannot reach the tokenizer as a whitespace token.
    return sentence.strip()


def iterate_corpus(corpus: List[str], tokenizer: spacy.tokenizer.Tokenizer, max_len: int):
    """Yield one token list of length ``max_len`` per sentence in *corpus*.

    Each sentence is passed through ``prepare_sentence`` and then through
    *tokenizer*. Results longer than ``max_len`` are cut to their first
    ``max_len`` tokens; shorter ones are extended on the right with the
    literal token ``"<pad>"``.
    """
    for raw_sentence in corpus:
        words = tokenizer(prepare_sentence(raw_sentence))

        # Positive when the sentence is too short and needs padding.
        missing = max_len - len(words)
        if missing > 0:
            words = words + ["<pad>"] * missing
        else:
            words = words[:max_len]

        yield words


# Length (in tokens) every sentence is truncated or padded to.
EN_MAX_LEN = 200
FR_MAX_LEN = 200

# Raw sentences of each language, in DataFrame row order.
en_corpus = list(df["EN"])
fr_corpus = list(df["FR"])

# English vocabulary; tokens missing from it resolve to the "<unk>" index.
en_vocab = build_vocab_from_iterator(
    iterate_corpus(en_corpus, en_tokenizer, EN_MAX_LEN),
    specials=["<unk>", "<start>", "<end>", "<pad>"],
)
en_vocab.set_default_index(en_vocab["<unk>"])

# French vocabulary, built the same way.
fr_vocab = build_vocab_from_iterator(
    iterate_corpus(fr_corpus, fr_tokenizer, FR_MAX_LEN),
    specials=["<unk>", "<start>", "<end>", "<pad>"],
)
fr_vocab.set_default_index(fr_vocab["<unk>"])

In [36]:
class TranslationDataset(Dataset):
    """Parallel corpus encoded as two fixed-width tensors of vocabulary indices.

    ``x`` holds the first-language sentences and ``y`` the second-language
    sentences; row ``i`` of each is built from sentence ``i`` of its corpus.
    Every row is truncated, or right-padded with the ``"<pad>"`` index, so
    that it is exactly ``lang1_max_len`` / ``lang2_max_len`` entries long.
    """

    def __init__(
        self, 
        lang1_corpus: List[str], lang2_corpus: List[str],
        lang1_tokenizer: spacy.tokenizer.Tokenizer, lang2_tokenizer: spacy.tokenizer.Tokenizer,
        lang1_vocab: torchtext.vocab.Vocab, lang2_vocab: torchtext.vocab.Vocab,
        lang1_max_len: int = 200, lang2_max_len: int = 200
    ):
        """Encode both corpora eagerly and store them as ``self.x`` / ``self.y``.

        Args:
            lang1_corpus: Source sentences.
            lang2_corpus: Target sentences, aligned index-by-index with
                ``lang1_corpus``.
            lang1_tokenizer: Called with one sentence string; its result is
                iterated to obtain the tokens.
            lang2_tokenizer: Same, for the second language.
            lang1_vocab: Maps a token to its index via ``vocab[token]``.
            lang2_vocab: Same, for the second language.
            lang1_max_len: Row width of ``x``.
            lang2_max_len: Row width of ``y``.

        Raises:
            ValueError: If the two corpora do not contain the same number of
                sentences.
        """
        # Items are served as (x[idx], y[idx]) pairs, so the corpora must line
        # up; fail here instead of with an IndexError in the middle of training.
        if len(lang1_corpus) != len(lang2_corpus):
            raise ValueError(
                "lang1_corpus and lang2_corpus must have the same length, got "
                f"{len(lang1_corpus)} and {len(lang2_corpus)}"
            )

        self.l1_corpus = lang1_corpus
        self.l2_corpus = lang2_corpus

        self.l1_max_len = lang1_max_len
        self.l2_max_len = lang2_max_len
        
        self.l1_tokenizer = lang1_tokenizer
        self.l2_tokenizer = lang2_tokenizer
        
        self.l1_vocab = lang1_vocab
        self.l2_vocab = lang2_vocab

        self.x, self.y = self._get_x_y()

    def __getitem__(self, idx: int):
        """Return the ``(x, y)`` index tensors of sentence pair ``idx``."""
        return self.x[idx], self.y[idx]

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.y)

    def _get_x_y(self):
        """Encode both corpora and return them as an ``(x, y)`` tuple."""
        x = TranslationDataset._parse_corpus(
            self.l1_corpus, self.l1_tokenizer, self.l1_vocab, self.l1_max_len
        )
        y = TranslationDataset._parse_corpus(
            self.l2_corpus, self.l2_tokenizer, self.l2_vocab, self.l2_max_len
        )

        return x, y

    @staticmethod
    def _parse_corpus(
        corpus: List[str], 
        tokenizer: spacy.tokenizer.Tokenizer, 
        vocab: torchtext.vocab.Vocab,
        max_len: int
    ):
        """Turn *corpus* into a ``LongTensor`` with one row per sentence.

        Each sentence is normalised with ``prepare_sentence``, tokenized,
        mapped to vocabulary indices and then truncated or right-padded with
        ``vocab["<pad>"]`` to exactly ``max_len`` entries.

        Returns:
            A tensor of shape ``(len(corpus), max_len)``.
        """
        rows = []

        for sent in corpus:
            # Apply the same normalisation that iterate_corpus applies when the
            # vocabulary is built; tokenizing the raw text instead can produce
            # tokens the vocabulary never saw, which would all become <unk>.
            tokens = tokenizer(prepare_sentence(sent))
            indices = [vocab[token] for token in tokens][:max_len]

            shortfall = max_len - len(indices)
            if shortfall > 0:
                indices.extend([vocab["<pad>"]] * shortfall)

            rows.append(indices)

        if not rows:
            # torch.LongTensor([]) is 1-D; keep the 2-D layout for an empty corpus.
            return torch.empty((0, max(max_len, 0)), dtype=torch.long)

        return torch.LongTensor(rows)

# Encode the whole English -> French corpus up front.
dataset = TranslationDataset(
    lang1_corpus=en_corpus,
    lang1_tokenizer=en_tokenizer,
    lang1_vocab=en_vocab,
    lang1_max_len=EN_MAX_LEN,
    lang2_corpus=fr_corpus,
    lang2_tokenizer=fr_tokenizer,
    lang2_vocab=fr_vocab,
    lang2_max_len=FR_MAX_LEN,
)

# Sanity check: both tensors should have one row per sentence pair.
print("x.shape:", dataset.x.shape)
print("y.shape:", dataset.y.shape)

x.shape: torch.Size([175621, 200])
y.shape: torch.Size([175621, 200])


## 3. Seq2Seq Model

In [None]:
class Encoder(nn.Module):
    """Stub for the encoder part of the model; no layers are defined yet."""

    def __init__(self):
        # nn.Module creates its parameter/submodule/hook registries in
        # __init__. Without this call the instance cannot be called, cannot
        # list its parameters and cannot have layers assigned to it.
        super().__init__()

    def forward(self):
        """Not implemented yet; takes no inputs and returns ``None``."""
        pass


class Decoder(nn.Module):
    """Stub for the decoder part of the model; no layers are defined yet."""

    def __init__(self):
        # nn.Module creates its parameter/submodule/hook registries in
        # __init__. Without this call the instance cannot be called, cannot
        # list its parameters and cannot have layers assigned to it.
        super().__init__()

    def forward(self):
        """Not implemented yet; takes no inputs and returns ``None``."""
        pass


class Seq2Seq(nn.Module):
    """Stub for the full sequence-to-sequence model; no layers are defined yet."""

    def __init__(self):
        # nn.Module creates its parameter/submodule/hook registries in
        # __init__. Without this call the instance cannot be called, cannot
        # list its parameters and cannot have layers assigned to it.
        super().__init__()

    def forward(self):
        """Not implemented yet; takes no inputs and returns ``None``."""
        pass