In [1]:
import re
import random
import string
import math

import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as platform

from tqdm import tqdm
from tensorflow.data import Dataset
from tensorflow.keras.utils import get_file
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (
    Layer, Dense, Embedding, Dropout,
    TextVectorization, LayerNormalization,
    MultiHeadAttention, Input
)


## To replicate the results
from tensorflow.random import set_seed
from numpy.random import seed

RANDOM_SEED = 42
set_seed(RANDOM_SEED)
seed(RANDOM_SEED)


## If want to train on CPU instead of GPU
tf.config.set_visible_devices([], 'GPU')


## Defining Utility Functions

In [2]:
def populate_df(df, data):

    english_lines = list()
    spanish_lines = list()
    for data_line in data:
        english_line, spanish_line = data_line.split("\t")

        english_lines.append(english_line)
        spanish_lines.append(spanish_line)
    
    df['english'] = english_lines
    df['spanish'] = spanish_lines

    return df


def get_vocab_size(data):

    vocab = set()
    for data_line in tqdm(data):
        vocab = vocab.union(data_line.split())
    
    return len(vocab)


def generate_self_attention_mask(inputs):

        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]

        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)

        self_attention_mask = tf.cast(i >= j, dtype="int32")
        self_attention_mask = tf.reshape(self_attention_mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        
        return tf.tile(self_attention_mask, mult)

## Downloading Dataset

In [3]:
dataset_path = get_file(
    fname="dataset.zip",
    origin="http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
    extract=True,
)


In [4]:
dataset_file_path = f"{'/'.join(dataset_path.split('/')[:-1])}/spa-eng/spa.txt"

## Cleaning Data

In [5]:
with open(dataset_file_path) as f:
    data = f.read().split("\n")[:-1]

In [6]:
df = pd.DataFrame(columns=['english', 'spanish'])

In [7]:
df = populate_df(df, data)

In [8]:
df.head()

Unnamed: 0,english,spanish
0,Go.,Ve.
1,Go.,Vete.
2,Go.,Vaya.
3,Go.,Váyase.
4,Hi.,Hola.


In [9]:
## Converting to lowercase

df['english'] = df['english'].apply(str.lower)
df['spanish'] = df['spanish'].apply(str.lower)

In [10]:
## Replacing puncutation

df['english'] = df['english'].apply(lambda line: re.sub(r'[^\w\s]', '', line))
df['spanish'] = df['spanish'].apply(lambda line: re.sub(r'[^\w\s]' + "¿", '', line))

In [11]:
## Adding <START> and <END> Token

df['spanish'] = df['spanish'].apply(
    lambda data_line: f'<START> {data_line} <END>')

In [12]:
df.head()

Unnamed: 0,english,spanish
0,go,<START> ve. <END>
1,go,<START> vete. <END>
2,go,<START> vaya. <END>
3,go,<START> váyase. <END>
4,hi,<START> hola. <END>


## Creating Train, Validation and Test sets

In [13]:
num_examples = df.shape[0]

print(f'There are {num_examples} training examples in the data')

There are 118964 training examples in the data


In [14]:
train_size = int(num_examples * 0.8)
val_size = int(num_examples * 0.1)

print(
    f'There are:\n\t{train_size} training examples\n\t{val_size} validation examples\n\t{num_examples-(train_size+val_size)} test examples'
)

There are:
	95171 training examples
	11896 validation examples
	11897 test examples


In [15]:
df = df.sample(frac=1)

train_df = df.iloc[:train_size]
val_df = df.iloc[train_size:train_size+val_size]
test_df = df.iloc[train_size+val_size:]

In [16]:
train_df.shape, val_df.shape, test_df.shape

((95171, 2), (11896, 2), (11897, 2))

## Pre-Processing Data

In [17]:
english_vocab_size = get_vocab_size(train_df['english'])
spanish_vocab_size = get_vocab_size(train_df['spanish'])

print(
    f'English Vocabulary Size: {english_vocab_size} \nSpanish Vocabulary Size: {spanish_vocab_size}'
)

100%|██████████| 95171/95171 [00:11<00:00, 8527.20it/s] 
100%|██████████| 95171/95171 [00:35<00:00, 2667.89it/s] 

English Vocabulary Size: 12576 
Spanish Vocabulary Size: 37345





In [18]:
english_sentence_lengths = train_df['english'].map(str.split).map(len)
spanish_sentence_lengths = train_df['spanish'].map(str.split).map(len)

In [19]:
print(f'''
    English Sentence Length Stats:
    \tMaximum Headline length: {english_sentence_lengths.max()}
    \tMinimum Headline length: {english_sentence_lengths.min()}
    \tAverage Headline length: {english_sentence_lengths.mean():.2f}
    \tSTD of Headline length: {english_sentence_lengths.std():.2f}

    Spanish Sentence Length Stats:
    \tMaximum Headline length: {spanish_sentence_lengths.max()}
    \tMinimum Headline length: {spanish_sentence_lengths.min()}
    \tAverage Headline length: {spanish_sentence_lengths.mean():.2f}
    \tSTD of Headline length: {spanish_sentence_lengths.std():.2f}
''')


    English Sentence Length Stats:
    	Maximum Headline length: 47
    	Minimum Headline length: 1
    	Average Headline length: 6.31
    	STD of Headline length: 2.61

    Spanish Sentence Length Stats:
    	Maximum Headline length: 51
    	Minimum Headline length: 3
    	Average Headline length: 8.09
    	STD of Headline length: 2.76



In [20]:
english_sequence_len = math.ceil(english_sentence_lengths.mean() + (3 * english_sentence_lengths.std()))
spanish_sequence_len = math.ceil(spanish_sentence_lengths.mean() + (3 * spanish_sentence_lengths.std()))


english_sequence_len, spanish_sequence_len

(15, 17)

In [21]:
english_vectorization = TextVectorization(
    output_mode="int",
    max_tokens=english_vocab_size,
    output_sequence_length=english_sequence_len,
)

spanish_vectorization = TextVectorization(
    output_mode="int",
    max_tokens=spanish_vocab_size,
    output_sequence_length=spanish_sequence_len + 1,
)

In [22]:
english_vectorization.adapt(train_df['english'])
spanish_vectorization.adapt(train_df['spanish'])

2023-02-10 15:34:36.056318: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


## Creating Datasets from Dataframes

In [39]:
batch_size = 32


In [40]:
def format_dataset(english_line, spanish_line):
    english_line = english_vectorization(english_line)
    spanish_line = spanish_vectorization(spanish_line)
    return (
        {
            "encoder_inputs": english_line,
            "decoder_inputs": spanish_line[:, :-1],
        },
        spanish_line[:, 1:]
    )

In [41]:
train_ds = dataset = Dataset.from_tensor_slices(
    ( train_df['english'], train_df['spanish'])).batch(
        batch_size).map(
            format_dataset).shuffle(
                train_df['english'].shape[0]).prefetch(
                    batch_size).cache()

val_ds = dataset = Dataset.from_tensor_slices(
    ( val_df['english'], val_df['spanish'])).batch(
        batch_size).map(
            format_dataset).shuffle(
                val_df['english'].shape[0]).prefetch(
                    batch_size).cache()

test_ds = dataset = Dataset.from_tensor_slices(
    ( test_df['english'], test_df['spanish'])).batch(
        batch_size).map(
            format_dataset).shuffle(
                test_df['english'].shape[0]).prefetch(
                    batch_size).cache()
            

In [42]:
for X, y in train_ds.take(1):
    print(X)
    print(y)

{'encoder_inputs': <tf.Tensor: shape=(32, 15), dtype=int64, numpy=
array([[ 108,    5,   75,    4,   19, 6973,    0,    0,    0,    0,    0,
           0,    0,    0,    0],
       [  78, 5531,   13,  230,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0],
       [   3,   36, 1195,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0],
       [ 522,   29,    2, 1783,   35,  135,  358, 1343,    0,    0,    0,
           0,    0,    0,    0],
       [   3,   32,    5,    4,  219,   17, 3281,    0,    0,    0,    0,
           0,    0,    0,    0],
       [  97,  919,  232,    4,    6,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0],
       [  13,   14,   84, 1327,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0],
       [  53, 1065,    2, 2590,  258,   14,  226,   20,  510,    0,    0,
           0,    0,    0,    0],
       [  34,  285,  102,   95,    4,   41,   90,    0,    0,    0,    0,
   

2023-02-10 15:35:07.337678: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


## Defining Architecture of Blocks of Transformer 

In [43]:
class EmbeddingBlock(Layer):

    def __init__(self, vocab_size, embedding_dim,  sequence_length, **kwargs):

        super().__init__(**kwargs)

        self.token_embeddings = Embedding(input_dim=vocab_size, output_dim=embedding_dim)
        self.position_embeddings = Embedding(input_dim=sequence_length, output_dim=embedding_dim)

    def call(self, inputs):
        
        positions = tf.range(start=0, limit=tf.shape(inputs)[-1], delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        out = embedded_tokens + embedded_positions

        return out

In [44]:
class MultiLayerPerceptron(Layer):

    def __init__(self, dense_dim, out_dim, dropout_p, **kwargs):

        super().__init__(**kwargs)
        
        self.dense = Sequential([
            Dense(dense_dim, activation='relu'), 
            Dropout(dropout_p),
            Dense(out_dim, activation='relu')
        ])

    def call(self, inputs):
        return self.dense(inputs)
    

In [45]:
class EncoderBlock(Layer):

    def __init__(self, embedding_dim, dense_dim, num_heads, dropout_p, **kwargs):

        super().__init__(**kwargs)
        self.multi_headed_self_attention = MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)
        self.layernorm_1 = LayerNormalization()
        self.feed_forward = MultiLayerPerceptron(dense_dim, embedding_dim, dropout_p)
        self.layernorm_2 = LayerNormalization()

    def call(self, inputs, mask=None):

        multi_headed_self_attention_output = self.multi_headed_self_attention(
            query=inputs,
            value=inputs,
            key=inputs,
        )
        feed_forward_input = self.layernorm_1(inputs + multi_headed_self_attention_output)
        feed_forward_output = self.feed_forward(feed_forward_input)
        out = self.layernorm_2(feed_forward_input + feed_forward_output)

        return out

In [46]:
class DecoderBlock(Layer):

    def __init__(self, embedding_dim, dense_dim, num_heads, dropout_p, **kwargs):
        
        super().__init__(**kwargs)

        self.masked_multi_headed_self_attention = attention = MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)
        self.layernorm_1 = LayerNormalization()
        self.multi_headed_cross_attention = MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)
        self.layernorm_2 = LayerNormalization()
        self.feed_forward = MultiLayerPerceptron(dense_dim, embedding_dim, dropout_p)
        self.layernorm_3 = LayerNormalization()

        self.supports_masking = True

    def call(self, inputs, encoder_outputs, mask=None):

        causal_mask = generate_self_attention_mask(inputs)
        masked_multi_headed_self_attention_output = self.masked_multi_headed_self_attention(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=causal_mask    
        )
        masked_multi_headed_self_attention_output_normalized = self.layernorm_1(inputs + masked_multi_headed_self_attention_output)

        multi_headed_cross_attention_output = self.multi_headed_cross_attention(
            query=masked_multi_headed_self_attention_output_normalized,
            value=encoder_outputs,
            key=encoder_outputs,
        )

        feed_forward_input = self.layernorm_2(masked_multi_headed_self_attention_output + masked_multi_headed_self_attention_output_normalized)
        feed_forward_output = self.feed_forward(feed_forward_input)
        out = self.layernorm_3(feed_forward_input + feed_forward_output)

        return out

## Defining Hyper Parameters

In [47]:
embedding_dim = 256
dense_dim = 2048
num_heads = 8
dropout_p = 0.2

## Building the Transformer

### Encoder Part


In [48]:
encoder_inputs = Input(shape=(None,), dtype="int64", name="encoder_inputs")

x = EmbeddingBlock(english_vocab_size, embedding_dim, english_sequence_len)(encoder_inputs)
encoder_block_outputs = EncoderBlock(embedding_dim, dense_dim, num_heads, dropout_p)(x)

encoder = Model(encoder_inputs, encoder_block_outputs)

### Decoder Part

In [49]:
decoder_inputs = Input(shape=(None,), dtype="int64", name="decoder_inputs")
encoded_seq_inputs = Input(shape=(None, embedding_dim), name="encoded_seq_inputs")

x = EmbeddingBlock(spanish_vocab_size, embedding_dim, spanish_sequence_len)(decoder_inputs)
x = DecoderBlock(embedding_dim, dense_dim, num_heads, dropout_p)(x, encoded_seq_inputs)
x = Dropout(dropout_p)(x)
decoder_block_outputs = Dense(spanish_vocab_size, activation="softmax")(x)


decoder = Model([decoder_inputs, encoded_seq_inputs], decoder_block_outputs)

In [50]:
transformer_inputs = [encoder_inputs, decoder_inputs]
transformer_outputs = decoder([decoder_inputs, encoder_block_outputs])

transformer = Model(
    transformer_inputs, transformer_outputs, name="transformer"
)

In [51]:
transformer.summary()

Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 embedding_block_2 (EmbeddingBl  (None, None, 256)   3223296     ['encoder_inputs[0][0]']         
 ock)                                                                                             
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 encoder_block_1 (EncoderBlock)  (None, None, 256)   3155456     ['embedding_block_2[0][

## Compling the Model

In [52]:
transformer.compile(
    # "rmsprop",
    'adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

## Traning the Model

In [53]:
epochs = 1

In [54]:
transformer.fit(train_ds, epochs=epochs, validation_data=val_ds)



<keras.callbacks.History at 0x2e57b3f70>