In [None]:
import re
import os
import io
import string
import tarfile
import pickle as pkl
from urllib.request import urlopen

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.data import Dataset
from tensorflow.keras.layers import Bidirectional, GRU, Dense, Dropout, Embedding, InputLayer
from tensorflow.keras import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy

# Raise the column width limit so long sentences are not truncated when a DataFrame is displayed.
pd.set_option("display.max_colwidth", 1000)
# Displays whether TensorFlow can see a GPU.
# NOTE(review): tf.test.is_gpu_available is deprecated in recent TF 2.x releases in favour of
# tf.config.list_physical_devices("GPU") — confirm against the installed TensorFlow version.
tf.test.is_gpu_available()

In [None]:
# All downloaded and generated artefacts live in a "data" folder that sits next to
# (not inside) the current working directory.
DATA_DIRECTORY = os.path.join(os.path.split(os.getcwd())[0], "data")
DATA_DIRECTORY

In [None]:
class EuroParlEnFr:
    """French-English Europarl v7 parallel corpus.

    ``load`` downloads and extracts the archive into ``DATA_DIRECTORY`` when the
    two corpus files are not already there, then returns the line-aligned
    sentences as a DataFrame.
    """

    URL = "http://statmt.org/europarl/v7/fr-en.tgz"
    FR = "europarl-v7.fr-en.fr"
    EN = "europarl-v7.fr-en.en"

    def load(self):
        """Return a DataFrame with columns ``en`` and ``fr``, one row per line pair.

        Lines keep their trailing newline; rows are paired positionally, so the
        frame is as long as the shorter of the two files.
        """
        self._download()
        en = self._read_lines(self.EN)
        fr = self._read_lines(self.FR)
        return pd.DataFrame(zip(en, fr), columns=["en", "fr"])

    @staticmethod
    def _read_lines(filename):
        """Read every line of a corpus file stored in ``DATA_DIRECTORY``."""
        # The encoding is pinned so decoding does not depend on the platform's
        # locale default (which breaks accented characters on non-UTF-8 systems).
        with open(os.path.join(DATA_DIRECTORY, filename), "r", encoding="utf-8") as f:
            return f.readlines()

    def _download(self):
        """Fetch and unpack the archive unless both corpus files already exist."""
        if all(os.path.exists(os.path.join(DATA_DIRECTORY, f)) for f in (self.FR, self.EN)):
            print("Data has already been downloaded.")
            return
        os.makedirs(DATA_DIRECTORY, exist_ok=True)
        print(f"Downloading : {self.URL}")
        with urlopen(self.URL) as response:
            payload = io.BytesIO(response.read())
        # The context manager closes the archive handle; the local is deliberately
        # not called `tf`, which would shadow the tensorflow alias inside this method.
        with tarfile.open(fileobj=payload) as archive:
            # The archive arrives over plain HTTP, so refuse any member that would
            # be written outside DATA_DIRECTORY before extracting anything.
            root = os.path.realpath(DATA_DIRECTORY)
            for member in archive.getmembers():
                target = os.path.realpath(os.path.join(root, member.name))
                if os.path.commonpath([root, target]) != root:
                    raise ValueError(f"Unsafe path in archive: {member.name}")
            archive.extractall(path=DATA_DIRECTORY)

In [None]:
def preprocess_sentence(s):
    """Normalise a raw sentence and wrap it in ``<sos>`` / ``<eos>`` markers.

    The text is lower-cased, every ASCII punctuation character is isolated as
    its own space-separated token, and runs of whitespace collapse to a single
    space.

    Args:
        s: Raw sentence (may include a trailing newline).

    Returns:
        The normalised sentence as ``"<sos> ... <eos>"``.
    """
    s = s.lower()
    # re.escape is required here: unescaped, the `\]` pair inside
    # string.punctuation is parsed as an escaped `]`, which silently removes the
    # backslash from the character class.
    s = re.sub(f"([{re.escape(string.punctuation)}])", r" \1 ", s)
    # Raw string: `\s` is not a valid escape sequence in a normal (f-)string.
    s = re.sub(r"\s+", " ", s)
    return "<sos> " + s.strip() + " <eos>"

In [None]:
def create_tokenizer(sentences):
    """Fit and return a Keras ``Tokenizer`` on ``sentences``.

    The tokenizer is created with an empty filter string and ``<unk>`` as its
    out-of-vocabulary token.
    """
    fitted = Tokenizer(oov_token="<unk>", filters="")
    fitted.fit_on_texts(sentences)
    return fitted

In [None]:
def create_tokenizer_and_preprocessed_files(dataset):
    """Preprocess a corpus and persist both the sentences and fitted tokenizers.

    ``dataset.load()`` must return a DataFrame with ``en`` and ``fr`` columns.
    Inside ``DATA_DIRECTORY`` this writes ``preprocessed.csv`` (pipe-separated,
    one preprocessed sentence pair per row) and the pickled tokenizers
    ``en_tokenizer.pkl`` / ``fr_tokenizer.pkl``.
    """
    languages = ("en", "fr")
    df = dataset.load()

    for lang in languages:
        df[lang] = df[lang].apply(preprocess_sentence)
    csv_path = os.path.join(DATA_DIRECTORY, "preprocessed.csv")
    df.to_csv(csv_path, sep="|", index=False)

    # The tokenizers are fitted on token lists rather than on raw strings.
    for lang in languages:
        df[lang] = df[lang].apply(lambda sentence: sentence.split())

    en_path = os.path.join(DATA_DIRECTORY, "en_tokenizer.pkl")
    fr_path = os.path.join(DATA_DIRECTORY, "fr_tokenizer.pkl")
    with open(en_path, "wb") as en_t, open(fr_path, "wb") as fr_t:
        pkl.dump(create_tokenizer(df["en"]), en_t)
        pkl.dump(create_tokenizer(df["fr"]), fr_t)

In [None]:
def load_tokenizers():
    """Unpickle the tokenizers stored in ``DATA_DIRECTORY``.

    Returns:
        An ``(english, french)`` tuple read from ``en_tokenizer.pkl`` and
        ``fr_tokenizer.pkl``.

    Note: unpickling runs arbitrary code from the file, so only load pickles
    that were produced locally.
    """
    en_path = os.path.join(DATA_DIRECTORY, "en_tokenizer.pkl")
    fr_path = os.path.join(DATA_DIRECTORY, "fr_tokenizer.pkl")
    with open(en_path, "rb") as en_t, open(fr_path, "rb") as fr_t:
        english = pkl.load(en_t)
        french = pkl.load(fr_t)
    return english, french

In [None]:
def make_sequence_example(en, fr):
    """Pack one pair of token-id sequences into a ``tf.train.SequenceExample``.

    Every token id is stored as its own single-value int64 feature, appended in
    order to the ``"en"`` or ``"fr"`` feature list.
    """
    example = tf.train.SequenceExample()
    feature_lists = example.feature_lists.feature_list
    for key, token_ids in (("en", en), ("fr", fr)):
        steps = feature_lists[key]
        for token_id in token_ids:
            steps.feature.add().int64_list.value.append(token_id)
    return example

In [None]:
def create_tf_records():
    """Encode ``preprocessed.csv`` as token ids and write them to a TFRecord file.

    The CSV is streamed in chunks of 100000 rows. Each sentence is split on
    whitespace, mapped to ids by the tokenizer of its own language, and
    serialized as one ``SequenceExample`` per sentence pair into
    ``europarlenfr.tfrecord`` inside ``DATA_DIRECTORY``.
    """
    en_tokenizer, fr_tokenizer = load_tokenizers()
    csv_path = os.path.join(DATA_DIRECTORY, "preprocessed.csv")
    record_path = os.path.join(DATA_DIRECTORY, "europarlenfr.tfrecord")
    with tf.io.TFRecordWriter(record_path) as writer:
        for i, chunk in enumerate(pd.read_csv(csv_path, sep="|", chunksize=100000)):
            print(f"Processing chunk {i}")
            en_ids = en_tokenizer.texts_to_sequences(chunk["en"].apply(lambda s: s.split()))
            # French must go through the French tokenizer; encoding it with the
            # English vocabulary maps most French words to <unk>.
            fr_ids = fr_tokenizer.texts_to_sequences(chunk["fr"].apply(lambda s: s.split()))
            # Pairing the two id lists directly avoids the per-row overhead of iterrows().
            for en_seq, fr_seq in zip(en_ids, fr_ids):
                writer.write(make_sequence_example(en_seq, fr_seq).SerializeToString())

In [None]:
def parse_example_proto(ex):
    """Decode a serialized ``SequenceExample`` into its ``(en, fr)`` sequences.

    Both feature lists are parsed as int64 sequences with scalar steps; the
    context features returned by the parser are discarded.
    """
    token_spec = tf.io.FixedLenSequenceFeature([], dtype=tf.int64)
    _context, sequences = tf.io.parse_single_sequence_example(
        ex,
        sequence_features={"en": token_spec, "fr": token_spec},
    )
    return sequences["en"], sequences["fr"]

In [None]:
def create_datasets(batch_size, seed=None, train_shuffle_buffer=10000):
    """Build padded, batched train / validation datasets from the TFRecord file.

    The records are shuffled once, then the first 95% form the training set and
    the remainder the validation set.

    Args:
        batch_size: Number of sentence pairs per batch.
        seed: Optional seed for the split shuffle, for a reproducible split.
        train_shuffle_buffer: Buffer size used to re-shuffle the training
            examples on every epoch; a falsy value disables that re-shuffle.

    Returns:
        ``(dataset_train, dataset_val)``, each yielding ``(en, fr)`` batches
        padded to the longest sequence in the batch.

    Raises:
        ValueError: If the TFRecord file contains no examples.
    """
    record_path = os.path.join(DATA_DIRECTORY, "europarlenfr.tfrecord")
    dataset = tf.data.TFRecordDataset(filenames=[record_path])
    num_examples = sum(1 for _ in dataset)
    if num_examples == 0:
        raise ValueError(f"No examples found in {record_path}")
    dataset = dataset.map(parse_example_proto)
    # The order must stay fixed across iterations: with the default
    # reshuffle_each_iteration=True, take() and skip() would each cut a different
    # permutation on every pass, leaking validation examples into training.
    dataset = dataset.shuffle(
        buffer_size=num_examples, seed=seed, reshuffle_each_iteration=False
    )
    split = (num_examples * 95) // 100
    dataset_train = dataset.take(split)
    if train_shuffle_buffer:
        # Restores per-epoch variation in training order, within the fixed split.
        dataset_train = dataset_train.shuffle(train_shuffle_buffer)
    dataset_train = dataset_train.padded_batch(
        batch_size, padded_shapes=([None], [None]), drop_remainder=False
    )
    dataset_val = dataset.skip(split).padded_batch(
        batch_size, padded_shapes=([None], [None]), drop_remainder=False
    )
    return dataset_train, dataset_val

In [None]:
class Encoder(Model):
    """Embeds source token ids and encodes them with a bidirectional GRU.

    Args:
        vocab_size: Size of the embedding table (number of distinct token ids).
        embedding_dim: Width of the token embeddings.
        units: Hidden size of each GRU direction.
    """

    def __init__(self, vocab_size, embedding_dim=300, units=256):
        super().__init__()
        # Kept so initial_hidden_state always matches the GRU width.
        self.units = units
        self.embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)
        self.gru = Bidirectional(GRU(units=units))

    def call(self, X, hidden):
        """Encode a batch of token ids ``X`` starting from the states in ``hidden``.

        ``hidden`` holds one initial state per direction (see
        ``initial_hidden_state``). Returns the output of the bidirectional GRU.
        """
        embedded = self.embedding(X)
        # Passed by keyword so the states cannot be bound to another positional
        # argument of the wrapped layer.
        return self.gru(embedded, initial_state=hidden)

    def initial_hidden_state(self, batch_size):
        """Return two all-zero ``(batch_size, units)`` states, one per direction."""
        return [tf.zeros((batch_size, self.units)) for _ in range(2)]

In [None]:
class Decoder(Model):
    """Single-step GRU decoder that outputs a softmax over the target vocabulary.

    Args:
        vocab_size: Size of the embedding table and of the output distribution.
        embedding_dim: Width of the token embeddings.
        units: Hidden size of the GRU; it must match the size of the state
            passed to ``call``.
    """

    def __init__(self, vocab_size, embedding_dim=300, units=512):
        super().__init__()
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.gru = GRU(units, return_sequences=False, return_state=False)
        self.dense = Dense(vocab_size, activation='softmax')

    def call(self, X, hidden):
        """Run the decoder on token ids ``X`` starting from state ``hidden``.

        Returns:
            ``(probabilities, output)``: the softmax over the vocabulary and the
            GRU output it was computed from.
        """
        embedded = self.embedding(X)
        # Passed by keyword so the state cannot be bound to another positional
        # argument of the layer.
        output = self.gru(embedded, initial_state=hidden)
        return self.dense(output), output

In [None]:
def calculate_loss(loss_fn, y_true, y_pred):
    """Batch-mean loss that ignores padded target positions (token id 0).

    Args:
        loss_fn: A Keras loss object; it is called with ``sample_weight``.
        y_true: Target token ids for one decoding step, one per example.
        y_pred: Predicted distribution over the vocabulary for that step.

    Returns:
        A scalar: the sum of the non-padded per-example losses divided by the
        batch size.
    """
    mask = tf.cast(tf.math.not_equal(y_true, 0), dtype=y_pred.dtype)
    # The mask has to be applied per example *before* any reduction. A loss
    # object with the default reduction returns a scalar, so multiplying its
    # result by the mask afterwards only rescales it and masks nothing.
    # Passing the mask as sample_weight works for reduced and unreduced losses.
    loss = loss_fn(y_true, y_pred, sample_weight=mask)
    return tf.reduce_mean(loss)

In [None]:
def train(encoder, decoder, en_tokenizer, fr_tokenizer, dataset_train, dataset_val, epochs=10):
    """Train the encoder/decoder pair with teacher forcing.

    Args:
        encoder: Model called as ``encoder(en, hidden)``; must provide
            ``initial_hidden_state(batch_size)``.
        decoder: Model called as ``decoder(tokens, hidden)`` that returns
            ``(predictions, new_hidden)``.
        en_tokenizer: Source tokenizer (currently unused).
        fr_tokenizer: Target tokenizer; supplies the ``<sos>`` token id.
        dataset_train: Batches of padded ``(en, fr)`` token-id tensors.
        dataset_val: Validation batches (currently unused).
        epochs: Number of passes over ``dataset_train``.
    """
    optimizer = Adam()
    # Keep one loss value per example: with the default reduction the loss is
    # already a scalar when calculate_loss applies the padding mask, so padded
    # positions would never be excluded.
    loss_fn = SparseCategoricalCrossentropy(reduction="none")
    sos_id = fr_tokenizer.word_index['<sos>']
    for epoch in range(epochs):
        for (batch, (en, fr)) in enumerate(dataset_train):
            batch_size = en.shape[0]
            num_steps = fr.shape[1] - 1
            if num_steps < 1:
                # No target token to predict, so there is nothing to differentiate.
                continue
            encoder_hidden_state = encoder.initial_hidden_state(batch_size)
            loss = 0
            with tf.GradientTape() as tape:
                context = encoder(en, encoder_hidden_state)
                decoder_hidden_state = context
                decoder_input = tf.expand_dims([sos_id] * batch_size, 1)
                for i in range(1, fr.shape[1]):
                    predictions, decoder_hidden_state = decoder(decoder_input, decoder_hidden_state)
                    loss += calculate_loss(loss_fn, fr[:, i], predictions)
                    # Teacher forcing: the next input is the ground-truth token.
                    decoder_input = tf.expand_dims(fr[:, i], 1)
            variables = encoder.trainable_variables + decoder.trainable_variables
            gradients = tape.gradient(loss, variables)
            optimizer.apply_gradients(zip(gradients, variables))
            # `loss` is a sum of per-step batch means, so it is averaged over the
            # number of decoding steps (not the batch size) for reporting.
            print(f"Epoch: {epoch+1} | Batch: {batch+1} | Loss: {loss / num_steps}")

In [None]:
# Load the Europarl corpus (downloading it if needed), write preprocessed.csv and pickle both tokenizers.
create_tokenizer_and_preprocessed_files(EuroParlEnFr())

In [None]:
# Convert preprocessed.csv into a TFRecord file of token-id sequences.
create_tf_records()

In [None]:
# Build the padded train / validation pipelines with 32 sentence pairs per batch.
dataset_train, dataset_val = create_datasets(batch_size=32)

In [None]:
# Reload the fitted tokenizers from their pickle files.
en_tokenizer, fr_tokenizer = load_tokenizers()

In [None]:
# Keras Tokenizer ids start at 1 (0 is left for padding), so the largest id equals
# len(word_index). The embedding table and output layer need one slot more than
# that, otherwise the highest token id is out of range.
encoder = Encoder(len(en_tokenizer.word_index) + 1)
decoder = Decoder(len(fr_tokenizer.word_index) + 1)

In [None]:
# Run the training loop on the training batches.
train(encoder, decoder, en_tokenizer, fr_tokenizer, dataset_train, dataset_val)