In [14]:
import os
import shutil
import subprocess
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import tensorflow as tf
from tensorflow import keras
from keras import layers


In [15]:
dataset = pd.read_csv('/kaggle/input/eng-french-data/eng_-french.csv', encoding='utf-8')
dataset = dataset.sample(len(dataset), random_state=42)
dataset.head()


Unnamed: 0,English words/sentences,French words/sentences
2785,Take a seat.,Prends place !
29880,I wish Tom was here.,J'aimerais que Tom soit là.
53776,How did the audition go?,Comment s'est passée l'audition ?
154386,I've no friend to talk to about my problems.,Je n'ai pas d'ami avec lequel je puisse m'entr...
149823,I really like this skirt. Can I try it on?,"J'aime beaucoup cette jupe, puis-je l'essayer ?"


In [37]:
dataset['French words/sentences'][2785]

'Prends place !'

In [16]:
dataset['English words in sentence'] = (dataset['English words/sentences'].str.split().apply(len))
dataset['French words in sentence'] = (dataset['French words/sentences'].str.split().apply(len))

fig = px.histogram(dataset, x=['English Words in Sentence', 'French Words in Sentence'], color_discrete_sequence=["#3f384a", "#e04c5f"], labels={'variable': "Variable", 'value': "Words in sentence"}, marginal='box', barmode='group', height=540, width=840, title='Data set - Words in sentence')

fig.show()

In [17]:
sentences_english = dataset['English words/sentences'].to_numpy()
sentences_french = dataset['French words/sentences'].to_numpy()

valid_fraction = 0.1
valid_len = int(valid_fraction * len(dataset))

sentences_english_train = sentences_english[:-valid_len]
sentences_french_train = sentences_french[:-valid_len]

sentences_english_valid = sentences_english[-valid_len:]
sentences_french_valid = sentences_french[-valid_len:]

In [18]:
def prepare_input_and_target(sentences_english, sentences_french):
    return (sentences_english, b"startofseq " + sentences_french), sentences_french + b" endofseq"

def from_sentences_dataset(sentences_english, sentences_french, batch_size = 32, cache=True, shuffle=False, shuffle_buffer_size=10_000, seed=None):

    dataset = tf.data.Dataset.from_tensor_slices((sentences_english, sentences_french))
    dataset = dataset.map(prepare_input_and_target, num_parallel_calls=tf.data.AUTOTUNE)

    if cache:
        dataset = dataset.cache()
    if shuffle:
        dataset = dataset.shuffle(shuffle_buffer_size, seed=seed)

    return dataset.batch(batch_size)

In [19]:
example_ds = from_sentences_dataset(sentences_english_train, sentences_french_train, batch_size=4)
list(example_ds.take(1))[0]

((<tf.Tensor: shape=(4,), dtype=string, numpy=
  array([b'Take a seat.', b'I wish Tom was here.',
         b'How did the audition go?',
         b"I've no friend to talk to about my problems."], dtype=object)>,
  <tf.Tensor: shape=(4,), dtype=string, numpy=
  array([b'startofseq Prends place !',
         b"startofseq J'aimerais que Tom soit l\xc3\xa0.",
         b"startofseq Comment s'est pass\xc3\xa9e l'audition\xc2\xa0?",
         b"startofseq Je n'ai pas d'ami avec lequel je puisse m'entretenir de mes probl\xc3\xa8mes."],
        dtype=object)>),
 <tf.Tensor: shape=(4,), dtype=string, numpy=
 array([b'Prends place ! endofseq',
        b"J'aimerais que Tom soit l\xc3\xa0. endofseq",
        b"Comment s'est pass\xc3\xa9e l'audition\xc2\xa0? endofseq",
        b"Je n'ai pas d'ami avec lequel je puisse m'entretenir de mes probl\xc3\xa8mes. endofseq"],
       dtype=object)>)

In [20]:
class ColoramaVerbose(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        print(
            f'Epoch: {epoch+1:02d}',
            f"loss: {logs['loss']: .5f}",
            f"accuracy: {logs['accuracy']:.5f}",
            f"val_loss: {logs['val_loss']:.5f}",
            f"val_accuracy: {logs['val_accuracy']:.5f}"
        )

In [21]:
def adapt_compile_and_fit(model, train_dataset, valid_dataset, n_epochs=25, n_patience=5, init_lr=0.001, lr_decay_rate=0.1, colorama_verbose=False):
    model.vectorization_en.adapt(
        train_dataset.map(
            lambda sentences, target: sentences[0],
            num_parallel_calls = tf.data.AUTOTUNE,
        )
    )

    model.vectorization_fr.adapt(
        train_dataset.map(
            lambda sentences, target: sentences[1] + b"endofseq",
            num_parallel_calls = tf.data.AUTOTUNE,
        )
    )

    train_dataset_prepared = train_dataset.map(
        lambda sentences, target: (sentences, model.vectorization_fr(target)),
        num_parallel_calls = tf.data.AUTOTUNE,
    ).prefetch(tf.data.AUTOTUNE)

    valid_dataset_prepared = valid_dataset.map(
        lambda sentences, target: (sentences, model.vectorization_fr(target)),
        num_parallel_calls = tf.data.AUTOTUNE,
    ).prefetch(tf.data.AUTOTUNE)

    early_stopping_cb = keras.callbacks.EarlyStopping(
        monitor='val_accuracy', patience=n_patience, restore_best_weights=True
    )

    n_decay_steps = n_epochs * len(list(train_dataset_prepared))
    scheduled_lr = keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=init_lr,
        decay_steps=n_decay_steps,
        decay_rate=lr_decay_rate,
    )

    model_callbacks = [early_stopping_cb]
    verbose_level = 1
    if colorama_verbose:
        model_callbacks.append(ColoramaVerbose())
        verbose_level = 0

    model.compile(loss='sparse_categorical_crossentropy', optimizer=keras.optimizers.RMSprop(learning_rate=scheduled_lr), metrics=['accuracy'])

    return model.fit(train_dataset_prepared, epochs=n_epochs, validation_data=valid_dataset_prepared, callbacks=model_callbacks, verbose=verbose_level)


In [22]:
def translate(model, sentence):
    translation = ''
    for word_index in range(model.max_sentence_len):
        x_encoder = np.array([sentence])
        x_decoder = np.array(['startofseq ' + translation])
        y_prob = model.predict((x_encoder, x_decoder), verbose=0)[0, word_index]
        predicted_word_id = np.argmax(y_prob)
        predicted_word = model.vectorization_fr.get_vocabulary()[predicted_word_id]

        if predicted_word == 'endofseq':
            break

        translation += ' ' + predicted_word

    return translation.strip()


In [23]:
class BERTA(keras.Model):
    def __init__(self, vocabulary_size=5000, max_sentence_len=50, embedding_size=256, n_units_lstm=512, **kwargs):
        super().__init__(**kwargs)
        self.max_sentence_len = max_sentence_len

        self.vectorization_en = layers.TextVectorization(vocabulary_size, output_sequence_length=max_sentence_len)

        self.vectorization_fr = layers.TextVectorization(vocabulary_size, output_sequence_length=max_sentence_len)

        self.encoder_embedding = layers.Embedding(vocabulary_size, embedding_size, mask_zero=True)

        self.decoder_embedding = layers.Embedding(vocabulary_size, embedding_size, mask_zero=True)

        self.encoder = layers.Bidirectional(layers.LSTM(n_units_lstm // 2, return_sequences=True, return_state=True))

        self.decoder = layers.LSTM(n_units_lstm, return_sequences=True)

        self.attention = layers.Attention()

        self.output_layer = layers.Dense(vocabulary_size, activation='softmax')

    def call(self, inputs):
        encoder_inputs, decoder_inputs = inputs

        encoder_input_ids = self.vectorization_en(encoder_inputs)
        decoder_input_ids = self.vectorization_fr(decoder_inputs)

        encoder_embeddings = self.encoder_embedding(encoder_input_ids)
        decoder_embeddings = self.decoder_embedding(decoder_input_ids)

        encoder_output, *encoder_state = self.encoder(encoder_embeddings)

        encoder_state = [
            tf.concat(encoder_state[0::2], axis=-1),  # short term state
            tf.concat(encoder_state[1::2], axis=-1)  # long term
        ]

        decoder_output = self.decoder(decoder_embeddings, initial_state=encoder_state)
        attention_output = self.attention([decoder_output, encoder_output])

        return self.output_layer(attention_output)

In [24]:
tf.random.set_seed(42)

train_ds = from_sentences_dataset(sentences_english_train, sentences_french_train, shuffle=True, seed=42)

valid_ds = from_sentences_dataset(sentences_english_valid, sentences_french_valid)

encoder_decoder = BERTA(max_sentence_len=15)

history = adapt_compile_and_fit(encoder_decoder, train_ds, valid_ds, init_lr=0.02, lr_decay_rate=0.01, colorama_verbose=True)

Epoch: 01 loss:  2.72697 accuracy: 0.47249 val_loss: 2.09899 val_accuracy: 0.55072
Epoch: 02 loss:  1.90011 accuracy: 0.58345 val_loss: 1.85590 val_accuracy: 0.59407
Epoch: 03 loss:  1.66508 accuracy: 0.62081 val_loss: 1.71082 val_accuracy: 0.61841
Epoch: 04 loss:  1.49266 accuracy: 0.64973 val_loss: 1.61662 val_accuracy: 0.63941
Epoch: 05 loss:  1.35004 accuracy: 0.67537 val_loss: 1.53634 val_accuracy: 0.65492
Epoch: 06 loss:  1.22300 accuracy: 0.69841 val_loss: 1.48496 val_accuracy: 0.66466
Epoch: 07 loss:  1.10634 accuracy: 0.72088 val_loss: 1.45406 val_accuracy: 0.67393
Epoch: 08 loss:  0.99922 accuracy: 0.74229 val_loss: 1.42068 val_accuracy: 0.68202
Epoch: 09 loss:  0.90487 accuracy: 0.76281 val_loss: 1.40723 val_accuracy: 0.68743
Epoch: 10 loss:  0.82097 accuracy: 0.78062 val_loss: 1.40045 val_accuracy: 0.69200
Epoch: 11 loss:  0.74918 accuracy: 0.79780 val_loss: 1.40187 val_accuracy: 0.69488
Epoch: 12 loss:  0.68895 accuracy: 0.81239 val_loss: 1.40295 val_accuracy: 0.69800
Epoc

In [25]:
fig = px.line(
    history.history,
    markers=True,
    height=540,
    width=840,
    symbol="variable",
    labels={"variable": "Variable", "value": "Value", "index": "Epoch"},
    title="Dataset - Encoder-Decoder RNN Training Process",
    color_discrete_sequence=px.colors.diverging.balance_r,
)

fig.show()

In [38]:
translation1 = translate(encoder_decoder, 'Take a seat.')
print(translation1)
print(dataset['French words/sentences'][2785])

prends place
Prends place !


In [43]:
class PositionalEncoding(layers.Layer):
    def __init__(self, max_sentence_len=50, embedding_size=256, dtype=tf.float32, **kwargs):
        super().__init__(dtype=dtype, **kwargs)

        if not embedding_size % 2 == 0:
            raise ValueError('The embedding size must be even')

        p, i = np.meshgrid(np.arange(max_sentence_len), np.arange(embedding_size // 2))

        pos_emb = np.empty((1, max_sentence_len, embedding_size))
        pos_emb[:, :, 0::2] = np.sin(p / 10_000 ** (2 * i / embedding_size)).T
        pos_emb[:, :, 1::2] = np.cos(p / 10_000 ** (2 * i / embedding_size)).T

        self.positional_embedding = tf.constant(pos_emb.astype(self.dtype))
        self.supports_masking = True

    def call(self, inputs):
        batch_max_len = tf.shape(inputs)[1]
        return inputs + self.positional_embedding[:, :batch_max_len]


In [45]:
class Encoder(layers.Layer):
    def __init__(self, embedding_size=256, n_attention_heads=8, n_units_dense=256, dropout_rate=0.2, **kwargs):
        super().__init__(**kwargs)
        self.multi_heat_attention = layers.MultiHeadAttention(n_attention_heads, embedding_size, dropout=dropout_rate)

        self.feed_forward = keras.Sequential([
            layers.Dense(n_units_dense, activation='relu', kernel_initializer='he_normal'),
            layers.Dense(embedding_size, kernel_initializer='he_normal'),
            layers.Dropout(dropout_rate)
        ])

        self.add = layers.Add()
        self.normalization = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        y = inputs
        skip_y = y   # residual connection
        y = self.multi_heat_attention(y, value=y, attention_mask=mask)
        y = self.normalization(self.add([y, skip_y]))
        skip_y = y
        y = self.feed_forward(y)
        return self.normalization(self.add([y, skip_y]))


class Decoder(layers.Layer):
    def __init__(self, embedding_size=256, n_attention_heads=8, n_units_dense=256, dropout_rate=0.2, **kwargs):
        super().__init__(**kwargs)
        self.masked_multi_head_attention = layers.MultiHeadAttention(n_attention_heads, embedding_size, dropout=dropout_rate)

        self.multi_head_attention = layers.MultiHeadAttention(n_attention_heads, embedding_size, dropout=dropout_rate)

        self.feed_forward = keras.Sequential([
                layers.Dense(
                    n_units_dense, activation="relu", kernel_initializer="he_normal"
                ),
                layers.Dense(embedding_size, kernel_initializer="he_normal"),
                layers.Dropout(dropout_rate),
        ])
        self.add = layers.Add()
        self.normalization = layers.LayerNormalization()


    def call(self, inputs, mask=None):
        decoder_mask, encoder_mask = mask
        Z, encoder_output = inputs
        Z_skip = Z
        Z = self.masked_multi_head_attention(Z, value=Z, attention_mask=decoder_mask)
        Z = self.normalization(self.add([Z, Z_skip]))
        Z_skip = Z
        Z = self.multi_head_attention(
            Z, value=encoder_output, attention_mask=encoder_mask
        )
        Z = self.normalization(self.add([Z, Z_skip]))
        Z_skip = Z
        Z = self.feed_forward(Z)
        return self.normalization(self.add([Z, Z_skip]))

In [46]:
class Transformer(keras.Model):
    def __init__(self, vocabulary_size=5000, max_sentence_len=50, embedding_size=256, n_encoder_decoder_blocks=1,n_attention_heads=8, n_units_dense=256,dropout_rate=0.2, **kwargs,
    ):
        super().__init__(**kwargs)
        self.max_sentence_len = max_sentence_len

        self.vectorization_en = layers.TextVectorization(
            vocabulary_size, output_sequence_length=max_sentence_len
        )
        self.vectorization_fr = layers.TextVectorization(
            vocabulary_size, output_sequence_length=max_sentence_len
        )
        self.encoder_embedding = layers.Embedding(
            vocabulary_size, embedding_size, mask_zero=True
        )
        self.decoder_embedding = layers.Embedding(
            vocabulary_size, embedding_size, mask_zero=True
        )

        self.positional_encoding = PositionalEncoding(max_sentence_len, embedding_size)

        self.encoder_blocks = [
            Encoder(embedding_size, n_attention_heads, n_units_dense, dropout_rate)
            for _ in range(n_encoder_decoder_blocks)
        ]

        self.decoder_blocks = [
            Decoder(embedding_size, n_attention_heads, n_units_dense, dropout_rate)
            for _ in range(n_encoder_decoder_blocks)
        ]

        self.output_layer = layers.Dense(vocabulary_size, activation='softmax')

    def call(self, inputs):
        encoder_inputs, decoder_inputs = inputs

        encoder_input_ids = self.vectorization_en(encoder_inputs)
        decoder_input_ids = self.vectorization_fr(decoder_inputs)

        encoder_embeddings = self.encoder_embedding(encoder_input_ids)
        decoder_embeddings = self.decoder_embedding(decoder_input_ids)

        encoder_pos_embeddings = self.positional_encoding(encoder_embeddings)
        decoder_pos_embeddings = self.positional_encoding(decoder_embeddings)

        encoder_pad_mask = tf.math.not_equal(encoder_input_ids, 0)[:, tf.newaxis]
        decoder_pad_mask = tf.math.not_equal(decoder_input_ids, 0)[:, tf.newaxis]

        batch_max_len_decoder = tf.shape(decoder_embeddings)[1]
        decoder_causal_mask = tf.linalg.band_part(  # Lower triangular matrix.
            tf.ones((batch_max_len_decoder, batch_max_len_decoder), tf.bool), -1, 0
        )
        decoder_mask = decoder_causal_mask & decoder_pad_mask

        Z = encoder_pos_embeddings
        for encoder_block in self.encoder_blocks:
            Z = encoder_block(Z, mask=encoder_pad_mask)

        encoder_output = Z
        Z = decoder_pos_embeddings
        for decoder_block in self.decoder_blocks:
            Z = decoder_block(
                [Z, encoder_output], mask=[decoder_mask, encoder_pad_mask]
            )

        return self.output_layer(Z)

In [47]:
transformer = Transformer(max_sentence_len=15)
transformer_history = adapt_compile_and_fit(transformer, train_ds, valid_ds, colorama_verbose=True)

Epoch: 01 loss:  4.34815 accuracy: 0.27355 val_loss: 3.51217 val_accuracy: 0.36341
Epoch: 02 loss:  3.10860 accuracy: 0.42720 val_loss: 2.63405 val_accuracy: 0.49768
Epoch: 03 loss:  2.54649 accuracy: 0.51284 val_loss: 2.23466 val_accuracy: 0.56101
Epoch: 04 loss:  2.27073 accuracy: 0.55731 val_loss: 2.06548 val_accuracy: 0.59028
Epoch: 05 loss:  2.11859 accuracy: 0.58271 val_loss: 1.91548 val_accuracy: 0.61487
Epoch: 06 loss:  2.01460 accuracy: 0.60053 val_loss: 1.82954 val_accuracy: 0.63126
Epoch: 07 loss:  1.94002 accuracy: 0.61312 val_loss: 1.77424 val_accuracy: 0.64486
Epoch: 08 loss:  1.87953 accuracy: 0.62358 val_loss: 1.72180 val_accuracy: 0.65131
Epoch: 09 loss:  1.83087 accuracy: 0.63149 val_loss: 1.68647 val_accuracy: 0.65654
Epoch: 10 loss:  1.78869 accuracy: 0.63905 val_loss: 1.65709 val_accuracy: 0.66203
Epoch: 11 loss:  1.75435 accuracy: 0.64513 val_loss: 1.61815 val_accuracy: 0.66851
Epoch: 12 loss:  1.72698 accuracy: 0.64974 val_loss: 1.59883 val_accuracy: 0.67327
Epoc

In [48]:
fig = px.line(
    transformer_history.history,
    markers=True,
    height=540,
    width=840,
    symbol="variable",
    labels={"variable": "Variable", "value": "Value", "index": "Epoch"},
    title="Easy Dataset - Transformer Training Process",
    color_discrete_sequence=px.colors.diverging.balance_r,
)

fig.show()

In [49]:
translation1 = translate(transformer, 'Take a seat.')
print(translation1)
print(dataset['French words/sentences'][2785])

prends un place
Prends place !


# 