In [None]:
import re
import string
import os
import io
import tarfile
from urllib.request import urlopen

import pandas as pd
import tensorflow as tf
from tensorflow.data import Dataset
from tensorflow.keras.layers import Bidirectional, GRU, Dense, Dropout, Embedding, InputLayer
from tensorflow.keras import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy

# Allow up to 1000 characters per column when pandas displays a frame.
pd.set_option("display.max_colwidth", 1000)
# `tf.test.is_gpu_available()` is deprecated; TensorFlow recommends listing the
# physical GPU devices instead. `bool(...)` keeps the cell output a True/False.
bool(tf.config.list_physical_devices("GPU"))

In [None]:
# Corpus files are kept in a "data" folder inside the parent of the current
# working directory. The bare name on the last line displays the resolved path.
DATA_DIRECTORY = os.path.join(os.path.split(os.getcwd())[0], "data")
DATA_DIRECTORY

In [None]:
class EuroParlEnFr:
    """Download and load the French-English Europarl v7 parallel corpus.

    ``download`` fetches and unpacks the archive at ``URL``; ``load`` reads the
    two extracted text files into a DataFrame with one sentence pair per row.
    """

    URL = "http://statmt.org/europarl/v7/fr-en.tgz"
    FR = "europarl-v7.fr-en.fr"
    EN = "europarl-v7.fr-en.en"

    def load(self, data_directory=None):
        """Read the English and French files into a two-column DataFrame.

        Args:
            data_directory: Folder containing the extracted corpus files.
                Defaults to the module-level ``DATA_DIRECTORY``.

        Returns:
            A ``pandas.DataFrame`` with columns ``"en"`` and ``"fr"``. Row *i*
            holds line *i* of each file, trailing newline included. If the
            files differ in length, the surplus lines of the longer one are
            dropped (the rows are built with ``zip``).
        """
        directory = DATA_DIRECTORY if data_directory is None else data_directory
        # Explicit encoding: the platform default is not UTF-8 everywhere, which
        # would garble accented characters.
        # NOTE(review): assumes the corpus files are UTF-8 - confirm.
        with open(os.path.join(directory, self.EN), "r", encoding="utf-8") as f:
            en = f.readlines()
        with open(os.path.join(directory, self.FR), "r", encoding="utf-8") as f:
            fr = f.readlines()
        return pd.DataFrame(list(zip(en, fr)), columns=["en", "fr"])

    def download(self, data_directory=None):
        """Fetch and unpack the corpus unless both text files already exist.

        Args:
            data_directory: Folder to extract into (created if missing).
                Defaults to the module-level ``DATA_DIRECTORY``.

        Raises:
            ValueError: If an archive member's path would resolve outside the
                target folder.
        """
        directory = DATA_DIRECTORY if data_directory is None else data_directory
        if all(os.path.exists(os.path.join(directory, name)) for name in (self.FR, self.EN)):
            print("Data has already been downloaded.")
            return
        os.makedirs(directory, exist_ok=True)
        print(f"Downloading : {self.URL}")
        # The whole archive is buffered in memory before extraction.
        with urlopen(self.URL) as response:
            payload = io.BytesIO(response.read())
        # Context manager closes the archive; the local name no longer shadows
        # the `tf` TensorFlow alias used throughout the notebook.
        with tarfile.open(fileobj=payload) as archive:
            self._check_members(archive, directory)
            archive.extractall(path=directory)

    @staticmethod
    def _check_members(archive, directory):
        """Reject archive entries whose path would land outside ``directory``."""
        root = os.path.realpath(directory)
        for member in archive.getmembers():
            target = os.path.realpath(os.path.join(root, member.name))
            # The archive comes over plain HTTP, so member names are untrusted.
            if os.path.commonpath([root, target]) != root:
                raise ValueError(f"Refusing to extract {member.name!r} outside {root!r}")

In [None]:
def preprocess_sentence(s):
    """Lower-case a sentence and split it into word and punctuation tokens.

    Every character of ``string.punctuation`` becomes its own token, runs of
    whitespace are collapsed, and the result is wrapped in ``<sos>`` /
    ``<eos>`` markers.

    Args:
        s: The raw sentence as a string.

    Returns:
        A list of string tokens that starts with ``'<sos>'`` and ends with
        ``'<eos>'``.
    """
    s = s.lower()
    # re.escape is required: unescaped, the "\]" inside string.punctuation is
    # read as an escaped bracket, so a backslash was never matched as punctuation.
    s = re.sub(f'([{re.escape(string.punctuation)}])', r' \1 ', s)
    # Raw string: '\s' in a normal (or f-) literal is an invalid escape sequence.
    s = re.sub(r'\s+', ' ', s)
    return ['<sos>'] + s.strip().split() + ['<eos>']

def preprocess_sentence_pairs(df):
    """Tokenise the ``"en"`` and ``"fr"`` columns of ``df`` in place.

    Each cell is replaced by the token list produced by
    ``preprocess_sentence``. The same frame object is returned.
    """
    df["en"] = df["en"].apply(preprocess_sentence)
    df["fr"] = df["fr"].apply(preprocess_sentence)
    return df

In [None]:
def tokenize(sentences):
    """Fit a vocabulary on ``sentences`` and encode them as padded id sequences.

    Args:
        sentences: The texts to fit on and encode (the notebook passes a
            column of token lists).

    Returns:
        A ``(tokenizer, sequences)`` pair: the fitted ``Tokenizer`` and the
        result of ``pad_sequences``, padded at the end with 0.
    """
    # filters="" disables Keras' own character filtering; unknown words map to "<unk>".
    vocabulary = Tokenizer(oov_token="<unk>", filters="")
    vocabulary.fit_on_texts(sentences)
    encoded = vocabulary.texts_to_sequences(sentences)
    padded = pad_sequences(encoded, value=0.0, padding='post')
    return vocabulary, padded

In [None]:
def create_datasets(en_sequences, fr_sequences, batch_size):
    """Build batched training and validation datasets from aligned sequences.

    The pairs are shuffled once, then the first 95% become the training set
    and the rest the validation set.

    Args:
        en_sequences: Encoded English sentences; ``.shape[0]`` is read as the
            number of sentence pairs.
        fr_sequences: Encoded French sentences, sliced in step with
            ``en_sequences``.
        batch_size: Number of pairs per batch. Incomplete final batches are
            dropped in both datasets.

    Returns:
        A ``(dataset_train, dataset_val)`` tuple of batched datasets.
    """
    num_sequences = en_sequences.shape[0]
    dataset = tf.data.Dataset.from_tensor_slices((en_sequences, fr_sequences))
    # reshuffle_each_iteration=False is essential: by default the shuffle is
    # redone on every pass, so take()/skip() would select a different, and
    # overlapping, train/validation split each time the datasets are iterated.
    dataset = dataset.shuffle(buffer_size=num_sequences, reshuffle_each_iteration=False)
    split = (num_sequences * 95) // 100
    dataset_train = dataset.take(split).batch(batch_size, drop_remainder=True)
    dataset_val = dataset.skip(split).batch(batch_size, drop_remainder=True)
    return dataset_train, dataset_val

In [None]:
# Fetch the corpus if it is not on disk yet, read it, and tokenise both columns.
dataset = EuroParlEnFr()
dataset.download()
sentence_pairs = preprocess_sentence_pairs(dataset.load())

In [None]:
# Encode each language with its own tokenizer, then build the batched
# training / validation datasets from the two aligned id arrays.
en_tokenizer, en_sequences = tokenize(sentence_pairs["en"])
fr_tokenizer, fr_sequences = tokenize(sentence_pairs["fr"])
dataset_train, dataset_val = create_datasets(
    en_sequences=en_sequences,
    fr_sequences=fr_sequences,
    batch_size=32,
)

In [None]:
class Encoder(Model):
    """Bidirectional GRU encoder over embedded source tokens.

    Args:
        vocab_size: Number of rows in the embedding table (``input_dim``).
        embedding_dim: Size of each token embedding. Defaults to 300.
        units: Hidden size of each GRU direction. Defaults to 256.
    """

    def __init__(self, vocab_size, embedding_dim=300, units=256):
        super().__init__()
        # Kept so initial_hidden_state always matches the GRU's size.
        self.hidden_units = units
        self.embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)
        self.gru = Bidirectional(GRU(units=units))

    def call(self, X, hidden):
        """Embed the token ids ``X`` and run them through the bidirectional GRU.

        Args:
            X: Batch of token-id sequences.
            hidden: Initial states for the two directions, as produced by
                ``initial_hidden_state``.

        Returns:
            The output of the bidirectional GRU layer.
            NOTE(review): with the layer defaults this is presumably one
            ``(batch, 2 * units)`` tensor - confirm.
        """
        embedded = self.embedding(X)
        # Passed by keyword so it cannot be mistaken for another call argument.
        return self.gru(embedded, initial_state=hidden)

    def initial_hidden_state(self, batch_size):
        """Return two all-zero ``(batch_size, units)`` states, one per direction."""
        return [tf.zeros((batch_size, self.hidden_units)) for _ in range(2)]

In [None]:
class Decoder(Model):
    """GRU decoder that maps input tokens and a state to a vocabulary distribution.

    Layers: a 300-dimensional embedding, a 512-unit GRU that returns only its
    last output, and a softmax ``Dense`` layer with ``vocab_size`` outputs.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.embedding = Embedding(input_dim=vocab_size, output_dim=300)
        self.gru = GRU(units=512, return_sequences=False, return_state=False)
        self.dense = Dense(units=vocab_size, activation='softmax')

    def call(self, X, hidden):
        """Run the decoder on input tokens ``X`` starting from state ``hidden``.

        Returns:
            A ``(probabilities, gru_output)`` tuple: the softmax layer applied
            to the GRU output, and the GRU output itself (which the training
            loop feeds back in as the next state).
        """
        gru_output = self.gru(self.embedding(X), hidden)
        probabilities = self.dense(gru_output)
        return probabilities, gru_output

In [None]:
def calculate_loss(loss_fn, y_true, y_pred):
    """Mean loss for one decoding step, with padded targets contributing zero.

    Args:
        loss_fn: A Keras loss object; it is called with a ``sample_weight``
            keyword argument.
        y_true: Target token ids for this step; id 0 is treated as padding.
        y_pred: The decoder's predictions for this step.

    Returns:
        A scalar tensor: the per-example losses, zeroed where the target is
        padding, averaged over the batch.
    """
    # Weight 1.0 for real tokens, 0.0 where the target is the padding id.
    mask = tf.cast(tf.math.not_equal(y_true, 0), dtype=y_pred.dtype)
    # The mask must be applied per example *before* the loss object reduces.
    # Multiplying the returned loss by the mask only works with
    # reduction='none'; with the default reduction the loss is already a
    # scalar and the mask merely rescales it. sample_weight is correct for both.
    loss = loss_fn(y_true, y_pred, sample_weight=mask)
    return tf.reduce_mean(loss)

In [None]:
def train(encoder, decoder, en_tokenizer, fr_tokenizer, dataset_train, dataset_val, epochs=10):
    """Train the encoder/decoder pair with teacher forcing.

    For each batch the English sequence is encoded, then the decoder is stepped
    through the French sequence: at every step it receives the previous
    ground-truth token and is scored on predicting the next one.

    Args:
        encoder: Model exposing ``initial_hidden_state(batch_size)``, callable
            as ``encoder(en, hidden)``.
        decoder: Model callable as ``decoder(tokens, hidden)`` returning
            ``(predictions, new_hidden)``.
        en_tokenizer: English tokenizer (currently unused).
        fr_tokenizer: French tokenizer; supplies the id of ``'<sos>'``.
        dataset_train: Iterable of ``(en, fr)`` batches of token ids.
        dataset_val: Validation dataset (currently unused).
        epochs: Number of passes over ``dataset_train``. Defaults to 10.
    """
    optimizer = Adam()
    # reduction='none' keeps one loss value per example so that padded
    # positions can be masked out individually in calculate_loss.
    loss_fn = SparseCategoricalCrossentropy(reduction='none')
    sos_id = fr_tokenizer.word_index['<sos>']

    for epoch in range(epochs):
        for batch, (en, fr) in enumerate(dataset_train):
            # Read the batch size from the data instead of hard-coding it, so
            # the loop follows whatever batch size the dataset was built with.
            batch_size = en.shape[0]
            # Column 0 is the start token; every later column is a target.
            num_steps = fr.shape[1] - 1
            encoder_hidden_state = encoder.initial_hidden_state(batch_size)
            loss = 0
            with tf.GradientTape() as tape:
                context = encoder(en, encoder_hidden_state)
                decoder_hidden_state = context
                decoder_input = tf.fill([batch_size, 1], sos_id)
                for i in range(1, fr.shape[1]):
                    predictions, decoder_hidden_state = decoder(decoder_input, decoder_hidden_state)
                    loss += calculate_loss(loss_fn, fr[:, i], predictions)
                    # Teacher forcing: feed the true token, not the prediction.
                    decoder_input = tf.expand_dims(fr[:, i], 1)
            variables = encoder.trainable_variables + decoder.trainable_variables
            gradients = tape.gradient(loss, variables)
            optimizer.apply_gradients(zip(gradients, variables))
            # `loss` is a sum of per-step batch means, so the average is taken
            # over decoding steps (dividing by the batch size was wrong).
            print(f"Epoch: {epoch+1} | Batch: {batch+1} | Loss: {float(loss) / num_steps}")

In [None]:
# Tokenizer ids run from 1 to len(word_index) and 0 is the padding id, so the
# embedding tables and the output layer need len(word_index) + 1 entries;
# without the +1 the highest token id is out of range.
encoder = Encoder(len(en_tokenizer.word_index) + 1)
decoder = Decoder(len(fr_tokenizer.word_index) + 1)
train(encoder, decoder, en_tokenizer, fr_tokenizer, dataset_train, dataset_val)