In [None]:
import datetime
import math
import os
import random
import re
import string
import warnings

import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as platform

from tqdm import tqdm
from tensorflow.data import Dataset
from tensorflow.keras.utils import get_file
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import (
    Layer, Dense, Embedding, Dropout, Input,
    MultiHeadAttention, LayerNormalization,
)

## To replicate the results
from tensorflow.random import set_seed
from numpy.random import seed

## Single seed shared by the TensorFlow and NumPy random number generators
RANDOM_SEED = 42
set_seed(RANDOM_SEED)
seed(RANDOM_SEED)


## Hide every GPU from TensorFlow so that the notebook runs on the CPU.
## Comment this call out to train on a GPU instead.
tf.config.set_visible_devices([], 'GPU')


## Suppress all Python warnings for the rest of the session
warnings.filterwarnings('ignore')


## Defining Utility Functions

In [None]:
def populate_df(df, data):
    """Fill ``df`` with the sentence pairs parsed from ``data``.

    Each element of ``data`` is one tab-separated line whose first field is
    the English sentence and whose second field is the Spanish sentence.
    Fields after the second one (for example an attribution column) are
    ignored, and blank lines are skipped.

    Args:
        df: DataFrame that receives the ``english`` and ``spanish`` columns.
            It is modified in place.
        data: Iterable of tab-separated text lines.

    Returns:
        The same ``df`` object, with both columns populated.

    Raises:
        ValueError: If a non-blank line has fewer than two fields.
    """
    english_lines = []
    spanish_lines = []
    for data_line in data:
        # A trailing newline in the source file shows up as an empty line.
        if not data_line.strip():
            continue

        fields = data_line.split("\t")
        if len(fields) < 2:
            raise ValueError(
                f"Expected 'english<TAB>spanish', got: {data_line!r}"
            )

        english_lines.append(fields[0])
        spanish_lines.append(fields[1])

    df['english'] = english_lines
    df['spanish'] = spanish_lines

    return df


def generate_self_attention_mask(inputs):
    """Build a lower-triangular attention mask for every item of a batch.

    Args:
        inputs: Tensor whose first two dimensions are read as the batch size
            and the sequence length.

    Returns:
        An int32 tensor of shape ``(batch, length, length)`` in which entry
        ``[b, i, j]`` is 1 when ``i >= j`` and 0 otherwise.
    """
    dims = tf.shape(inputs)
    n_batch = dims[0]
    n_steps = dims[1]

    # Row index i against column index j: 1 wherever i >= j.
    rows = tf.range(n_steps)[:, tf.newaxis]
    cols = tf.range(n_steps)
    lower_triangle = tf.cast(rows >= cols, dtype="int32")

    # Add a leading axis of size one, then repeat it once per batch item.
    single_mask = tf.reshape(lower_triangle, (1, dims[1], dims[1]))
    repeats = tf.concat(
        [tf.expand_dims(n_batch, -1), tf.constant([1, 1], dtype=tf.int32)],
        axis=0,
    )
    return tf.tile(single_mask, repeats)


def pad_and_tokenize(line, tokenizer, max_len):
    """Convert one sentence into a fixed-length list of token ids.

    Args:
        line: Sentence (a single string) to encode.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        max_len: Length of the returned list; shorter sentences are padded
            at the end (``padding='post'``).

    Returns:
        A list of ``max_len`` integer token ids.
    """
    padded = pad_sequences(
        tokenizer.texts_to_sequences([line]),
        maxlen=max_len,
        padding='post',
    )

    # Take the single row instead of squeezing: squeeze would also collapse
    # the sequence axis when max_len == 1 and return a bare int, not a list.
    return padded[0].tolist()


def format_dataset(english_line, spanish_line):
    """Arrange a batch of sentence pairs as ``(inputs, targets)``.

    The decoder input is the Spanish batch without its last column and the
    target is the Spanish batch without its first column, so the target at
    each position is the token that follows the decoder input there.

    Args:
        english_line: 2-D batch of English token ids.
        spanish_line: 2-D batch of Spanish token ids.

    Returns:
        A tuple ``(inputs, targets)`` where ``inputs`` is a dict with the
        keys ``"encoder_inputs"`` and ``"decoder_inputs"``.
    """
    decoder_inputs = spanish_line[:, :-1]
    decoder_targets = spanish_line[:, 1:]

    model_inputs = {
        "encoder_inputs": english_line,
        "decoder_inputs": decoder_inputs,
    }
    return model_inputs, decoder_targets


def get_sentence(tokens, index_lookup):
    """Decode a sequence of token ids into a space-separated string.

    Args:
        tokens: Iterable of token ids.
        index_lookup: Mapping from token id to word.

    Returns:
        The looked-up words joined by single spaces.

    Raises:
        KeyError: If a token id is missing from ``index_lookup``.
    """
    return ' '.join([index_lookup[token_id] for token_id in tokens])


## Downloading Dataset

In [None]:
## Download the Spanish-English corpus and unpack the archive.
## The archive is fetched over HTTPS rather than plaintext HTTP.
dataset_path = get_file(
    fname="dataset.zip",
    origin="https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
    extract=True,
)


In [None]:
## The corpus is expected at spa-eng/spa.txt inside the directory that holds
## the downloaded archive. os.path keeps this independent of the path separator.
dataset_file_path = os.path.join(os.path.dirname(dataset_path), "spa-eng", "spa.txt")

## Cleaning Data

In [None]:
## Decode explicitly instead of relying on the platform's default encoding
## (assumes the corpus is UTF-8 -- TODO confirm).
with open(dataset_file_path, encoding="utf-8") as f:
    ## Keep every non-empty line, so the last sentence is not lost when the
    ## file does not end with a newline.
    data = [line for line in f.read().split("\n") if line]

In [None]:
## Empty frame with one column per language
df = pd.DataFrame(columns=['english', 'spanish'])

In [None]:
## Split every raw line into its English and Spanish parts
df = populate_df(df, data)

In [None]:
## Preview the first parsed sentence pairs
df.head()

In [None]:
## Lower-casing every sentence in both languages

df['english'] = df['english'].map(lambda sentence: sentence.lower())
df['spanish'] = df['spanish'].map(lambda sentence: sentence.lower())

In [None]:
## Removing punctuation
##
## `[^\w\s]` matches any character that is neither a word character nor
## whitespace. That already includes the Spanish marks "¿" and "¡", so both
## languages use the same pattern. (Appending "¿" to the pattern would make
## it match only punctuation that is *followed by* "¿".)

df['english'] = df['english'].apply(lambda line: re.sub(r'[^\w\s]', '', line))
df['spanish'] = df['spanish'].apply(lambda line: re.sub(r'[^\w\s]', '', line))

In [None]:
## Wrapping every Spanish sentence in <START> ... <END> markers

df['spanish'] = '<START> ' + df['spanish'] + ' <END>'

In [None]:
## Preview the sentence pairs after cleaning
df.head()

## Creating Train, Validation and Test sets

In [None]:
## Total number of sentence pairs, before splitting
num_examples = len(df)
print(f'There are {num_examples} training examples in the data')

In [None]:
## 94% of the pairs are used for training and 3% for validation
train_size, val_size = int(num_examples * 0.94), int(num_examples * 0.03)

In [None]:
## Shuffle with an explicit seed so the split does not depend on how many
## random numbers were drawn earlier in the session
df = df.sample(frac=1, random_state=RANDOM_SEED)

## .copy() gives each split its own data: later cells assign new columns to
## these frames, which should not operate on slices of df.
## Rows left over after the training and validation splits form the test set.
train_df = df.iloc[:train_size].copy()
val_df = df.iloc[train_size:train_size+val_size].copy()
test_df = df.iloc[train_size+val_size:].copy()

In [None]:
## Report how many examples ended up in each split
print(
    'There are:'
    f'\n\t{len(train_df)} training examples'
    f'\n\t{len(val_df)} validation examples'
    f'\n\t{len(test_df)} test examples'
)

## Pre-Processing Data

In [None]:
## One tokenizer per language; '<OOV>' is the token used for words that are
## not part of the fitted vocabulary
english_tokenizer = Tokenizer(oov_token='<OOV>')
spanish_tokenizer = Tokenizer(oov_token='<OOV>')

In [None]:
## Build each vocabulary from the training split only
english_tokenizer.fit_on_texts(train_df['english'])
spanish_tokenizer.fit_on_texts(train_df['spanish'])

In [None]:
## Vocabulary size = number of entries in word_index, plus one for the
## padding token (id 0, which has no word_index entry)

english_vocab_size = len(english_tokenizer.word_index) + 1
spanish_vocab_size = len(spanish_tokenizer.word_index) + 1

In [None]:
## word -> token id
english_word_lookup = english_tokenizer.word_index
spanish_word_lookup = spanish_tokenizer.word_index

## token id -> word; id 0 (padding) decodes to the empty string
english_token_lookup = dict((index, word) for word, index in english_word_lookup.items())
english_token_lookup[0] = ''

spanish_token_lookup = dict((index, word) for word, index in spanish_word_lookup.items())
spanish_token_lookup[0] = ''

In [None]:
## Number of whitespace-separated words in every training sentence
english_sentence_lengths = train_df['english'].map(str.split).map(len)
spanish_sentence_lengths = train_df['spanish'].map(str.split).map(len)

print(f'''
    English Sentence Length Stats:
    \tMaximum Sentence length: {english_sentence_lengths.max()}
    \tMinimum Sentence length: {english_sentence_lengths.min()}
    \tAverage Sentence length: {english_sentence_lengths.mean():.2f}
    \tSTD of Sentence length: {english_sentence_lengths.std():.2f}

    Spanish Sentence Length Stats:
    \tMaximum Sentence length: {spanish_sentence_lengths.max()}
    \tMinimum Sentence length: {spanish_sentence_lengths.min()}
    \tAverage Sentence length: {spanish_sentence_lengths.mean():.2f}
    \tSTD of Sentence length: {spanish_sentence_lengths.std():.2f}
''')

In [None]:
def _mean_plus_three_std(lengths):
    """Return ``mean + 3 * std`` of ``lengths``, rounded up to an integer."""
    return math.ceil(lengths.mean() + (3 * lengths.std()))


## Fixed sequence length per language, derived from the training sentences
english_sequence_len = _mean_plus_three_std(english_sentence_lengths)
spanish_sequence_len = _mean_plus_three_std(spanish_sentence_lengths)

(english_sequence_len, spanish_sequence_len)

In [None]:
## Replace every training sentence by its padded list of token ids.
## Spanish gets one extra position because format_dataset later drops one
## column to form the decoder inputs and targets.
train_df['english'] = train_df['english'].apply(
    pad_and_tokenize,
    args=(english_tokenizer, english_sequence_len),
)

train_df['spanish'] = train_df['spanish'].apply(
    pad_and_tokenize,
    args=(spanish_tokenizer, spanish_sequence_len+1),
)

In [None]:
## Replace every validation sentence by its padded list of token ids.
## Spanish gets one extra position because format_dataset later drops one
## column to form the decoder inputs and targets.
val_df['english'] = val_df['english'].apply(
    pad_and_tokenize,
    args=(english_tokenizer, english_sequence_len),
)

val_df['spanish'] = val_df['spanish'].apply(
    pad_and_tokenize,
    args=(spanish_tokenizer, spanish_sequence_len+1),
)

In [None]:
## Replace every test sentence by its padded list of token ids.
## Spanish gets one extra position because format_dataset later drops one
## column to form the decoder inputs and targets.
test_df['english'] = test_df['english'].apply(
    pad_and_tokenize,
    args=(english_tokenizer, english_sequence_len),
)

test_df['spanish'] = test_df['spanish'].apply(
    pad_and_tokenize,
    args=(spanish_tokenizer, spanish_sequence_len+1),
)

## Creating Datasets from Dataframes

In [None]:
## Number of sentence pairs per batch in the training and validation datasets
batch_size = 32


In [None]:
def make_dataset(frame, batch_size, shuffle):
    """Build a batched ``tf.data`` pipeline from a tokenised DataFrame.

    Args:
        frame: DataFrame whose ``english`` and ``spanish`` columns hold
            equal-length lists of token ids.
        batch_size: Number of sentence pairs per batch.
        shuffle: Whether to reshuffle the examples on every iteration.

    Returns:
        A dataset yielding ``(inputs, targets)`` batches as produced by
        ``format_dataset``.
    """
    dataset = Dataset.from_tensor_slices(
        (frame['english'].tolist(), frame['spanish'].tolist())
    )

    # Cache before shuffling: caching a shuffled dataset would store the
    # first epoch's order and replay it on every later epoch.
    dataset = dataset.cache()
    if shuffle:
        # Shuffle single examples (not whole batches), with a buffer that
        # covers the entire split.
        dataset = dataset.shuffle(len(frame))

    # format_dataset slices along the second axis, so it runs on batches.
    dataset = dataset.batch(batch_size).map(format_dataset)

    # Prefetch last so that input preparation overlaps with model execution.
    return dataset.prefetch(tf.data.AUTOTUNE)


train_ds = make_dataset(train_df, batch_size, shuffle=True)

## Validation metrics do not depend on the order of the examples
val_ds = make_dataset(val_df, batch_size, shuffle=False)

## One sentence per batch; shuffled so that take(n) yields varying examples
test_ds = make_dataset(test_df, 1, shuffle=True)


In [None]:
## Peek at one training batch: X is the dict of encoder/decoder inputs and
## y holds the decoder targets, as produced by format_dataset
for X, y in train_ds.take(1):
    print(X)
    print(y)

## Defining Architecture of Blocks of Transformer 

In [None]:
class EmbeddingBlock(Layer):
    """Sum of a token embedding and a learned position embedding.

    Args:
        vocab_size: Number of distinct token ids (rows of the token table).
        embedding_dim: Size of every embedding vector.
        sequence_length: Number of positions in the position table; inputs
            must not be longer than this.
        **kwargs: Forwarded to ``Layer``.
    """

    def __init__(self, vocab_size, embedding_dim, sequence_length, **kwargs):

        super().__init__(**kwargs)

        # Kept so that get_config() can describe how to rebuild the layer.
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.sequence_length = sequence_length

        self.token_embeddings = Embedding(input_dim=vocab_size, output_dim=embedding_dim)
        self.position_embeddings = Embedding(input_dim=sequence_length, output_dim=embedding_dim)

    def call(self, inputs):
        """Embed ``inputs`` (token ids) and add the position embeddings."""

        # Positions 0 .. length-1, taken from the last axis of the input.
        positions = tf.range(start=0, limit=tf.shape(inputs)[-1], delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)

        return embedded_tokens + embedded_positions

    def get_config(self):
        """Return the constructor arguments so the layer can be saved and reloaded."""
        config = super().get_config()
        config.update({
            "vocab_size": self.vocab_size,
            "embedding_dim": self.embedding_dim,
            "sequence_length": self.sequence_length,
        })
        return config

In [None]:
class MultiLayerPerceptron(Layer):
    """Two dense layers, each followed by dropout.

    Args:
        dense_dim: Width of the hidden dense layer.
        out_dim: Width of the output dense layer.
        dropout_p: Dropout rate applied after each dense layer.
        **kwargs: Forwarded to ``Layer``.
    """

    def __init__(self, dense_dim, out_dim, dropout_p, **kwargs):

        super().__init__(**kwargs)

        # Kept so that get_config() can describe how to rebuild the layer.
        self.dense_dim = dense_dim
        self.out_dim = out_dim
        self.dropout_p = dropout_p

        # NOTE(review): the output layer also uses ReLU, so the block can
        # only emit non-negative values -- confirm this is intended.
        self.dense = Sequential([
            Dense(dense_dim, activation='relu'),
            Dropout(dropout_p),
            Dense(out_dim, activation='relu'),
            Dropout(dropout_p),
        ])

    def call(self, inputs):
        """Apply the dense stack to ``inputs``."""
        return self.dense(inputs)

    def get_config(self):
        """Return the constructor arguments so the layer can be saved and reloaded."""
        config = super().get_config()
        config.update({
            "dense_dim": self.dense_dim,
            "out_dim": self.out_dim,
            "dropout_p": self.dropout_p,
        })
        return config
    

In [None]:
class EncoderBlock(Layer):
    """Self-attention followed by a feed-forward block.

    Both sub-layers are wrapped in a residual connection and followed by
    layer normalisation.

    Args:
        embedding_dim: Size of the input vectors; also used as ``key_dim``
            of the attention layer and as output width of the feed-forward
            block.
        dense_dim: Hidden width of the feed-forward block.
        num_heads: Number of attention heads.
        dropout_p: Dropout rate inside the feed-forward block.
        **kwargs: Forwarded to ``Layer``.
    """

    def __init__(self, embedding_dim, dense_dim, num_heads, dropout_p, **kwargs):

        super().__init__(**kwargs)

        # Kept so that get_config() can describe how to rebuild the layer.
        self.embedding_dim = embedding_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.dropout_p = dropout_p

        self.multi_headed_self_attention = MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)
        self.layernorm_1 = LayerNormalization()
        self.feed_forward = MultiLayerPerceptron(dense_dim, embedding_dim, dropout_p)
        self.layernorm_2 = LayerNormalization()

    def call(self, inputs):
        """Encode ``inputs`` (embedded source tokens)."""

        attention_output = self.multi_headed_self_attention(
            query=inputs,
            value=inputs,
            key=inputs,
        )
        # Residual connection + normalisation around the attention.
        feed_forward_input = self.layernorm_1(inputs + attention_output)

        feed_forward_output = self.feed_forward(feed_forward_input)
        # Residual connection + normalisation around the feed-forward block.
        return self.layernorm_2(feed_forward_input + feed_forward_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be saved and reloaded."""
        config = super().get_config()
        config.update({
            "embedding_dim": self.embedding_dim,
            "dense_dim": self.dense_dim,
            "num_heads": self.num_heads,
            "dropout_p": self.dropout_p,
        })
        return config

In [None]:
class DecoderBlock(Layer):
    """Masked self-attention, cross-attention and a feed-forward block.

    Every sub-layer is wrapped in a residual connection and followed by
    layer normalisation.

    Args:
        embedding_dim: Size of the input vectors; also used as ``key_dim``
            of both attention layers and as output width of the
            feed-forward block.
        dense_dim: Hidden width of the feed-forward block.
        num_heads: Number of attention heads.
        dropout_p: Dropout rate (after the first normalisation and inside
            the feed-forward block).
        **kwargs: Forwarded to ``Layer``.
    """

    def __init__(self, embedding_dim, dense_dim, num_heads, dropout_p, **kwargs):

        super().__init__(**kwargs)

        # Kept so that get_config() can describe how to rebuild the layer.
        self.embedding_dim = embedding_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.dropout_p = dropout_p

        self.masked_multi_headed_self_attention = MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)
        self.layernorm_1 = LayerNormalization()
        self.multi_headed_cross_attention = MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)
        self.layernorm_2 = LayerNormalization()
        self.feed_forward = MultiLayerPerceptron(dense_dim, embedding_dim, dropout_p)
        self.layernorm_3 = LayerNormalization()
        self.dropout = Dropout(dropout_p)

        self.supports_masking = True

    def call(self, inputs, encoder_outputs):
        """Decode ``inputs`` (embedded target tokens) given ``encoder_outputs``."""

        # Each position may only attend to itself and to earlier positions.
        causal_mask = generate_self_attention_mask(inputs)
        self_attention_output = self.masked_multi_headed_self_attention(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=causal_mask
        )
        self_attention_normalized = self.layernorm_1(inputs + self_attention_output)
        self_attention_normalized = self.dropout(self_attention_normalized)

        cross_attention_output = self.multi_headed_cross_attention(
            query=self_attention_normalized,
            value=encoder_outputs,
            key=encoder_outputs,
        )

        # The residual sum must include the cross-attention output; without
        # it the encoder output never reaches the prediction.
        feed_forward_input = self.layernorm_2(self_attention_normalized + cross_attention_output)
        feed_forward_output = self.feed_forward(feed_forward_input)

        return self.layernorm_3(feed_forward_input + feed_forward_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be saved and reloaded."""
        config = super().get_config()
        config.update({
            "embedding_dim": self.embedding_dim,
            "dense_dim": self.dense_dim,
            "num_heads": self.num_heads,
            "dropout_p": self.dropout_p,
        })
        return config

## Defining Hyper Parameters

In [None]:
## Size of the token/position embedding vectors
embedding_dim = 256
## Hidden width of the feed-forward blocks
dense_dim = 2048
## Number of attention heads per attention layer
num_heads = 8
## Dropout rate used throughout the model
dropout_p = 0.5

## Building the Transformer

### Encoder Part


In [None]:
## English token ids; the sequence length is left unspecified
encoder_inputs = Input(shape=(None,), dtype="int64", name="encoder_inputs")

## Embed tokens and positions, then apply a single encoder block
x = EmbeddingBlock(english_vocab_size, embedding_dim, english_sequence_len)(encoder_inputs)
encoder_block_outputs = EncoderBlock(embedding_dim, dense_dim, num_heads, dropout_p)(x)

encoder = Model(encoder_inputs, encoder_block_outputs)

### Decoder Part

In [None]:
## Spanish token ids and the encoder output the decoder conditions on
decoder_inputs = Input(shape=(None,), dtype="int64", name="decoder_inputs")
encoded_seq_inputs = Input(shape=(None, embedding_dim), name="encoded_seq_inputs")

## Embed, decode, then map every position to a distribution over the Spanish vocabulary
x = EmbeddingBlock(spanish_vocab_size, embedding_dim, spanish_sequence_len)(decoder_inputs)
x = DecoderBlock(embedding_dim, dense_dim, num_heads, dropout_p)(x, encoded_seq_inputs)
x = Dropout(dropout_p)(x)
decoder_block_outputs = Dense(spanish_vocab_size, activation="softmax")(x)


decoder = Model([decoder_inputs, encoded_seq_inputs], decoder_block_outputs)

In [None]:
## End-to-end model: the decoder is fed the encoder's output directly
transformer_inputs = [encoder_inputs, decoder_inputs]
transformer_outputs = decoder([decoder_inputs, encoder_block_outputs])

transformer = Model(
    transformer_inputs, transformer_outputs, name="transformer"
)

In [None]:
## Print an overview of the assembled model
transformer.summary()

## Compiling the Model

In [None]:
def decay(epoch):
    """Step-decay learning-rate schedule.

    The rate starts at 1e-4 and is halved every 10 epochs; because the step
    count is computed from ``epoch + 1``, the first halving applies at
    ``epoch == 9``.

    Args:
        epoch: Zero-based epoch index.

    Returns:
        The learning rate to use for ``epoch``.
    """
    base_rate = 1e-4
    halving_factor = 0.5
    epochs_per_step = 10.0

    completed_steps = math.floor((1 + epoch) / epochs_per_step)
    return base_rate * math.pow(halving_factor, completed_steps)

In [None]:
## Callback that takes the learning rate of each epoch from decay()
lr_scheduler = LearningRateScheduler(decay)

In [None]:
## Targets are integer token ids, hence the sparse cross-entropy loss.
## The optimizer starts from the same rate as decay()'s initial value (1e-4).
transformer.compile(
    optimizer=Adam(learning_rate=(1e-4)),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

## Training the Model

In [None]:
## Number of passes over the training dataset
epochs = 30

In [None]:
## Write a checkpoint only when the validation loss reaches a new minimum.
## The file name embeds the epoch number and the validation loss.
checkpoint = ModelCheckpoint(
    filepath='model-epoch10+{epoch:02d}-loss{val_loss:.2f}.h5',
    monitor='val_loss',
    verbose=1, 
    save_best_only=True,
    mode='min')

In [None]:
## Train with checkpointing and the step-decay learning-rate schedule
transformer.fit(train_ds, epochs=epochs, validation_data=val_ds, callbacks=[checkpoint, lr_scheduler])

## Loading Best Model

In [None]:
## Checkpoint file to restore. The name follows the pattern used by the
## checkpoint callback, so it has to be updated to a file from your own run.
model_path = 'model-epoch10+01-loss1.78.h5'

## The custom layer classes must be supplied so the saved model can be rebuilt
transformer = load_model(model_path, custom_objects={
    'EmbeddingBlock': EmbeddingBlock,
    'MultiLayerPerceptron': MultiLayerPerceptron,
    'EncoderBlock': EncoderBlock,
    'DecoderBlock': DecoderBlock
})

## Evaluating the Model

In [None]:
def translate(input_sentence, model,  max_output_len, spanish_token_lookup, word_sample_size):
    """Translate one English sentence by sampling one token at a time.

    At every step the model is run on the tokens generated so far and the
    next token is drawn uniformly from the ``word_sample_size`` most
    probable candidates. Generation stops at the end marker, after
    ``max_output_len`` steps, or when the decoder input is full.

    Uses the module-level ``spanish_tokenizer`` and ``spanish_sequence_len``.

    Args:
        input_sentence: Dict with an ``'encoder_inputs'`` entry holding a
            batch of one tokenised English sentence.
        model: Model called as ``model([encoder_inputs, decoder_inputs])``.
        max_output_len: Maximum number of sampling steps.
        spanish_token_lookup: Mapping from Spanish token id to word.
        word_sample_size: Number of top candidates to sample from.

    Returns:
        The translation as a string that starts with ``'<START>'`` and ends
        with ``'<END>'``.
    """
    encoder_inputs = input_sentence['encoder_inputs']

    # Resolve the marker ids through the tokenizer so they match whatever
    # normalisation it applies to '<START>' and '<END>'. Comparing a
    # predicted id with the string '<END>' can never be true.
    generated_ids = list(spanish_tokenizer.texts_to_sequences(['<START>'])[0])
    end_ids = spanish_tokenizer.texts_to_sequences(['<END>'])[0]
    end_token = end_ids[0] if end_ids else None

    translated_words = ['<START>']

    # The decoder is trained on inputs of length spanish_sequence_len (the
    # Spanish sequences are padded to spanish_sequence_len + 1 and
    # format_dataset drops the last column), so use the same length here and
    # never generate more tokens than fit into it.
    max_steps = min(max_output_len, spanish_sequence_len - len(generated_ids) + 1)

    for _ in range(max_steps):

        decoder_inputs = np.zeros((1, spanish_sequence_len), dtype="int64")
        decoder_inputs[0, :len(generated_ids)] = generated_ids

        predictions = model([encoder_inputs, decoder_inputs])[0]

        # The prediction for the next token sits at the position of the last
        # token generated so far.
        next_token_scores = np.asarray(predictions[len(generated_ids) - 1])

        top_n_pred_tokens = np.argpartition(next_token_scores, -word_sample_size)[-word_sample_size:]
        pred_token = int(np.random.choice(top_n_pred_tokens, size=1)[0])

        if pred_token == 0:
            # Padding was sampled: emit nothing and sample again.
            continue

        if pred_token == end_token:
            translated_words.append('<END>')
            break

        translated_words.append(spanish_token_lookup[pred_token])
        generated_ids.append(pred_token)
    else:
        # The step budget ran out before the end marker was produced.
        translated_words.append('<END>')

    return ' '.join(translated_words)


In [None]:
# for X, y in test_ds.take(3):

#     actual_eng = get_sentence(X['encoder_inputs'][0].numpy(), english_token_lookup)
#     translated = translate(X, transformer, spanish_sequence_len-1, spanish_token_lookup, 2)

#     print('ACTUAL:', actual_eng)
#     print('TRANSLATED:', translated)