In [1]:
import numpy as np 
import pandas as pd 
import einops
import matplotlib.pyplot as plt 
import matplotlib.ticker as ticker
import unicodedata
import tensorflow as tf 
import tensorflow.compat.v1 as tf_v1


In [2]:
dataset = pd.read_csv("D:\Datasets\eng_-french.csv")

In [3]:
input_text = []
target_text = []

for sentences_eng in dataset['English words/sentences'][:100000]:
    input_text.append(sentences_eng)

for sentences_fre in dataset['French words/sentences'][:100000]:
    sentences_fre = sentences_fre
    target_text.append(sentences_fre)

Converting the data into a numpy array

In [4]:
english = np.array(input_text)
french = np.array(target_text)

Creating a tensorflow dataset

In [5]:
buffer_size = len(input_text)
batch_size = 64

is_train = np.random.uniform(size=(len(target_text),)) < 0.8

# train_raw = (tf.data.Dataset.from_tensor_slices((english[is_train],french[is_train])).shuffle(buffer_size).batch(batch_size))
train_raw = (tf.data.Dataset.from_tensor_slices((english[is_train],french[is_train])).shuffle(buffer_size).batch(batch_size))

# val_raw = (tf.data.Dataset.from_tensor_slices((english[~is_train],french[~is_train])).shuffle(buffer_size).batch(batch_size))
val_raw = (tf.data.Dataset.from_tensor_slices((english[~is_train],french[~is_train])).shuffle(buffer_size).batch(batch_size))

Normalization of the text is not done

unicode normalization

In [6]:
def tf_lower_and_split_punct(txt):
    text = tf.strings.lower(txt)
    text = tf.strings.regex_replace(text,'[^ a-z.?!,¿]','')
    # add spaces around punctuations
    text = tf.strings.regex_replace(text,'[.?!,¿]', r'\0')
    # Strip white space
    text = tf.strings.strip(text)
    
    text = tf.strings.join(['[START]', text, '[END]'], separator=' ')
    return text

Text vectorisation 

In [7]:
vocublary_size = 5000

context_text_processor = tf.keras.layers.TextVectorization(standardize=tf_lower_and_split_punct, max_tokens=vocublary_size, ragged=True)

In [8]:
context_text_processor.adapt(train_raw.map(lambda context, target: context))
# Here are the first 10 words from the vocabulary:
a = context_text_processor.get_vocabulary()[:10]

In [9]:
print(type(a[0]))

<class 'str'>


In [10]:
target_text_processor = tf.keras.layers.TextVectorization(standardize=tf_lower_and_split_punct, max_tokens=vocublary_size, ragged=True)

In [11]:
target_text_processor.adapt(train_raw.map(lambda context, target: target))

# first 10 words from vocublary:
target_text_processor.get_vocabulary()[:10]

['', '[UNK]', '[START]', '[END]', 'je', 'de', 'pas', '?', 'ne', 'que']

In [12]:
context_vocab = np.array(context_text_processor.get_vocabulary())

Data Preprocessing

In [13]:
def process_text(context, target):
    # context = tf.convert_to_tensor(context_text_processor(context))
    context = context_text_processor(context).to_tensor()
    target = target_text_processor(target)
    targ_in = target[:,:-1].to_tensor()
    targ_out = target[:,1:].to_tensor()

    return (context, targ_in), targ_out

train_ds = train_raw.map(process_text, tf.data.AUTOTUNE)
val_ds = val_raw.map(process_text, tf.data.AUTOTUNE)

Example

In [14]:
for (ex_context_tok, ex_tar_in), ex_tar_out in train_ds.take(1):
    print(ex_context_tok[0,:10].numpy())
    print()
    print(ex_tar_in[0, :10].numpy()) 
    print(ex_tar_out[0, :10].numpy())

[   2   12    5 2531    1    3    0    0    0]

[   2 2189  298 1839    7    0    0    0    0    0]
[2189  298 1839    7    3    0    0    0    0    0]


Developing the encoder-decoder architecture

In [15]:
UNITS = 256

Encoder

In [16]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, text_processor, units):
        super(Encoder, self).__init__()
        self.text_processor = text_processor
        self.units = units
        self.vocab_size = text_processor.vocabulary_size()

        self.embedding = tf.keras.layers.Embedding(self.vocab_size, units, mask_zero = True)
        self.rnn = tf.keras.layers.Bidirectional(
            merge_mode='sum',
            layer=tf.keras.layers.GRU(units,
                                # Return the sequence and state
                                return_sequences=True,
                                recurrent_initializer='glorot_uniform'))


def call(self, x):
    shape_checker = ShapeChecker()
    shape_checker(x,'batch s')

    x = self.embedding(x)
    shape_checker(x, 'batch s units')

    x = self.rnn(x)
    shape_checker(x, 'batch s units')
    return x

def convert_input(self, texts):
    texts = tf.convert_to_tensor(texts)
    if len(texts.shape) == 0:
        texts = tf.convert_to_tensor(texts)[tf.newaxis]
    context = text_preprocessor(texts).to_tensor()
    context = self(context)
    return context

Encoding the sequence

In [26]:
encoder = Encoder(context_text_processor,UNITS)
ex_cont = encoder(ex_context_tok)
print(f'Context tokens, shape (batch, s): {ex_context_tok.shape}')
print(f'Encoder output, shape (batch, s, units): {ex_context.shape}')

(64, 9)
Context tokens, shape (batch, s): (64, 9)
Encoder output, shape (batch, s, units): (64, 9)


Coding the Attention Layer

In [21]:
class CrossAttention(tf.keras.layers.Layer):
    def __init__(self, units, **kwargs):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(key_dim=units, num_heads=1, **kwargs)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()
        

    def call(self, x, context):
        # shape_checker = ShapeChecker()
        # shape_checker(x, 'batch t units')
        # shape_checker(context, 'batch s units')

        attn_output, attn_scores = self.mha(
            query = x,
            value = context,
            return_attention_scores = True
        )

        # shape_checker(x, 'batch t units')
        # shape_checker(attn_scores, 'batch heads t s')

        # Cache the attention scores for plotting later.
        attn_scores = tf.reduce_mean(attn_scores, axis=1)
        shape_checker(attn_scores, 'batch t s')
        self.last_attention_weights = attn_scores

        x = self.add([x, attn_output])
        x = self.layernorm(x)

        return x

In [22]:
attention_layer = CrossAttention(UNITS)

embd = tf.keras.layers.Embedding(target_text_processor.vocabulary_size(), output_dim = UNITS, mask_zero = True)

ex_tar_embed = embd(ex_tar_in)
result = attention_layer(ex_tar_embed, ex_context)

print(f'Context sequence, shape (batch, s, units): {ex_context.shape}')
print(f'Target sequence, shape (batch, t, units): {ex_tar_embed.shape}')
print(f'Attention result, shape (batch, t, units): {result.shape}')
print(f'Attention weights, shape (batch, t, s):    {attention_layer.last_attention_weights.shape}')

InvalidArgumentError: Exception encountered when calling layer 'key' (type EinsumDense).

cannot compute Einsum as input #1(zero-based) was expected to be a int64 tensor but is a float tensor [Op:Einsum]

Call arguments received by layer 'key' (type EinsumDense):
  • inputs=tf.Tensor(shape=(64, 9), dtype=int64)