<a href="https://colab.research.google.com/github/akankshakusf/Project-DeepLearning-English-to-French-Translation/blob/master/Neural_Machine_Translation_with_RNNs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#import ML packages
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from sklearn.metrics import confusion_matrix,roc_curve
import pathlib
import io
import re
import string
import time

#import DL package
import cv2
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_probability as tfp
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer,Dense, Flatten, InputLayer, BatchNormalization, Bidirectional, Dropout, Input, Embedding, TextVectorization
from tensorflow.keras.layers import SimpleRNN, Conv1D, LSTM, GRU
from tensorflow.keras.losses import BinaryCrossentropy,CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.metrics import Accuracy, TopKCategoricalAccuracy, TopKCategoricalAccuracy, SparseCategoricalAccuracy
from tensorflow.keras.optimizers import Adam
from google.colab import drive
from google.colab import files
from tensorboard.plugins import projector

# Data Preparation

## Data Download

In [2]:
!wget https://www.manythings.org/anki/fra-eng.zip

--2025-04-22 10:29:32--  https://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7943074 (7.6M) [application/zip]
Saving to: ‘fra-eng.zip’


2025-04-22 10:29:35 (5.13 MB/s) - ‘fra-eng.zip’ saved [7943074/7943074]



In [3]:
!unzip "/content/fra-eng.zip" -d "/content/dataset/"

Archive:  /content/fra-eng.zip
  inflating: /content/dataset/_about.txt  
  inflating: /content/dataset/fra.txt  


## Kaggle Dataset

In [4]:
!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d dhruvildave/en-fr-translation-dataset

Dataset URL: https://www.kaggle.com/datasets/dhruvildave/en-fr-translation-dataset
License(s): ODbL-1.0


In [5]:
!unzip "/content/en-fr-translation-dataset.zip" -d "/content/dataset/"

Archive:  /content/en-fr-translation-dataset.zip
  inflating: /content/dataset/en-fr.csv  


In [6]:
dataset = tf.data.experimental.CsvDataset(
  "/content/dataset/en-fr.csv",
  [
    tf.string,
    tf.string
  ],
)

## Data Processing

In [7]:
text_dataset=tf.data.TextLineDataset("/content/dataset/fra.txt")

In [8]:
#review dataset
for i in text_dataset.take(3):
  print(i)

tf.Tensor(b'Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)', shape=(), dtype=string)
tf.Tensor(b'Go.\tMarche.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8090732 (Micsmithel)', shape=(), dtype=string)
tf.Tensor(b'Go.\tEn route !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8267435 (felix63)', shape=(), dtype=string)


In [None]:
# #lets skip a max number of records and check what max length we find
# for i in text_dataset.skip(190000):
#   print(len(tf.strings.split(i," ")))

- Since i saw that the max len of the sentence is 107. I am going to go ahead with a sequence length of 64 as we also have french letters

In [9]:
VOCAB_SIZE= 20000
ENGLISH_SEQUENCE_LENGTH=64
FRENCH_SEQUENCE_LENGTH=64
EMBEDDING_DIM = 300
BATCH_SIZE=64

- Create vectorizer layer to create vectors
- reference :https://www.tensorflow.org/api_docs/python/tf/keras/layers/TextVectorization

In [10]:
# Turn text to lowercase and remove punctuation
# Keep only top VOCAB_SIZE words
# Convert words to numbers
# Make all sentences the same length

In [11]:
english_vectorize_layer = TextVectorization (
    standardize='lower_and_strip_punctuation',
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length = ENGLISH_SEQUENCE_LENGTH
)

In [12]:
french_vectorize_layer = TextVectorization (
    standardize='lower_and_strip_punctuation',
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length = FRENCH_SEQUENCE_LENGTH
)

- Look at this sample data from dataset and get rid of tabs--->  \t
'Go.\tVa !\tCC-BY 2.0 (France)

In [13]:
def selector (input_text):
  split_text=tf.strings.split(input_text, '\t') ##after splitting collect english and french separately
  return {'input_1':split_text[0:1],'input_2':'starttoken '+split_text[1:2]},split_text[1:2]+' endtoken'

In [14]:
#map text_dataset to selector above function
split_dataset = text_dataset.map(selector)  ## this is for final dataset

In [15]:
def separator(input_text):
  split_text=tf.strings.split(input_text,'\t')
  return split_text[0:1],'starttoken '+split_text[1:2]+' endtoken' #final output

In [16]:
#map text_dataset to selector above function
init_dataset = text_dataset.map(separator)  ## this is for just intermediated start-end token consideration

In [None]:
# review the data
for i in split_dataset.take(2):
  print(i)

({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Va !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Va ! endtoken'], dtype=object)>)
({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Marche.'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Marche. endtoken'], dtype=object)>)


In [17]:
# review the data
for i in init_dataset.take(2):
  print(i)

(<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Va ! endtoken'], dtype=object)>)
(<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Marche. endtoken'], dtype=object)>)


- Notice how nicely english and french text have been separated now

In [18]:
# now lets attach this Vectorizer to init_dataset to get the vocabulary list
english_training_dataset = init_dataset.map(lambda x,y:x) ##input is x, y and output is x
english_vectorize_layer.adapt(english_training_dataset) ##adapt the vectorizer layer to training data

In [19]:
# now lets attach this Vectorizer to init_dataset to get the vocabulary list
french_training_data=init_dataset.map(lambda x,y:y) ##input x,y,z and output y
french_vectorize_layer.adapt(french_training_data) ##adapt the vectorize_layer to the training data

In [20]:
print(len(english_vectorize_layer.get_vocabulary()))
print(len(french_vectorize_layer.get_vocabulary()))

16952
20000


- Now data is adapted do vectorization (convert to numbers)

In [21]:
def vectorize(inputs,output):
  return {'input_1':english_vectorize_layer(inputs['input_1']),
          'input_2':french_vectorize_layer(inputs['input_2'])},french_vectorize_layer(output)

In [22]:
split_dataset

<_MapDataset element_spec=({'input_1': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'input_2': TensorSpec(shape=(None,), dtype=tf.string, name=None)}, TensorSpec(shape=(None,), dtype=tf.string, name=None))>

- get the final vectorized dataset

In [23]:
dataset = split_dataset.map(vectorize)

In [24]:
#check the data "Go means Va in french"
english_vectorize_layer.get_vocabulary()[45]

np.str_('go')

In [25]:
#check the data
french_vectorize_layer.get_vocabulary()[104]

np.str_('va')

In [26]:
for i in dataset.take(2):
  print(i)

({'input_1': <tf.Tensor: shape=(1, 64), dtype=int64, numpy=
array([[45,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])>, 'input_2': <tf.Tensor: shape=(1, 64), dtype=int64, numpy=
array([[  2, 104,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0]])>}, <tf.Tensor: shape=(1, 64), dtype=int64, numpy=
array([[104,   3,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,

-

In [27]:
dataset

<_MapDataset element_spec=({'input_1': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>

- Create pipeline : by shuffling data and batch the data now

In [28]:
BATCH_SIZE=64

In [29]:
dataset=dataset.shuffle(2048).unbatch().batch(BATCH_SIZE).prefetch(buffer_size=tf.data.AUTOTUNE)

In [30]:
print(dataset)

<_PrefetchDataset element_spec=({'input_1': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>


In [31]:
#confirm batching shape : batch
for x, y in dataset.take(1):
    print(x['input_1'].shape)  # Shape: (batch_size, seq_len)
    print(x['input_2'].shape)
    print(y.shape)

(64, 64)
(64, 64)
(64, 64)


In [32]:
#check for number of batches
NUM_BATCHES= int(200000/BATCH_SIZE) ## since i have 200,000 data point in dataset and batch =64
print(NUM_BATCHES)

3125


In [33]:
train_dataset = dataset.take(int(0.9*NUM_BATCHES))  ## i will use 90% of data for training
val_dataset = dataset.skip(int(0.9*NUM_BATCHES))   ## rest 10% i will just push in validation

In [34]:
train_dataset

<_TakeDataset element_spec=({'input_1': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>

In [35]:
val_dataset

<_SkipDataset element_spec=({'input_1': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>

In [36]:
dataset

<_PrefetchDataset element_spec=({'input_1': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>

# Modeling

## Embedding

In [50]:
#build postional encoding layer
def positional_encoding(model_size,SEQUENCE_LENGTH):
  output=[]
  for pos in range(SEQUENCE_LENGTH):
    PE=np.zeros((model_size))
    for i in range(model_size):
      if i%2==0:
        PE[i]=np.sin(pos/(10000**(i/model_size)))
      else:
        PE[i]=np.cos(pos/(10000**((i-1)/model_size)))
    output.append(tf.expand_dims(PE,axis=0))
  out=tf.concat(output,axis=0)
  out=tf.expand_dims(out,axis=0)
  return tf.cast(out,dtype=tf.float32)

In [53]:
print(positional_encoding(256,64).shape)

(1, 64, 256)


In [141]:
class Embeddings(Layer):
  def __init__(self, sequence_length, vocab_size, embed_dim,):
    super(Embeddings, self).__init__()
    self.token_embeddings=Embedding(
        input_dim=vocab_size, output_dim = embed_dim)
    self.sequence_length = sequence_length
    self.vocab_size = vocab_size
    self.embed_dim = embed_dim
    self.supports_masking = True

  def call(self,inputs):
    embedded_tokens = self.token_embeddings(inputs)
    embedded_positions = positional_encoding(
        self.embed_dim, self.sequence_length)
    return embedded_tokens +embedded_positions

  def compute_mask(self, inputs, mask=None):
    return tf.math.not_equal(inputs,0)

In [250]:
class Embeddings(Layer):
  def __init__(self, sequence_length, vocab_size, embed_dim,):
    super(Embeddings, self).__init__()
    self.token_embeddings=Embedding(
        input_dim=vocab_size, output_dim = embed_dim)
    self.sequence_length = sequence_length
    self.vocab_size = vocab_size
    self.embed_dim = embed_dim
    self.supports_masking = True

  def call(self,inputs):
    embedded_tokens = self.token_embeddings(inputs)
    embedded_positions = positional_encoding(
        self.embed_dim, self.sequence_length)
    return embedded_tokens +embedded_positions

  def compute_mask(self, inputs, mask=None):
    # Wrap tf.math.not_equal in a Lambda layer to handle KerasTensors
    return tf.keras.layers.Lambda(lambda x: tf.math.not_equal(x, 0))(inputs)

In [251]:
test_input=tf.constant([[2,4,7,21,3,5,0,0]])
emb=Embeddings(8,20000,512)
emb_out=emb(test_input)
print(emb_out.shape)

(1, 8, 512)


In [219]:
#obtain mask
mask = emb.compute_mask(test_input)
print(mask)  # check for 2 false at the end they are paddings

#obtain padding mask
padding_mask = tf.cast(
    tf.repeat(mask,repeats=tf.shape(mask)[1],axis=0),
    dtype=tf.int32)
print(padding_mask)

tf.Tensor([[ True  True  True  True  True  True False False]], shape=(1, 8), dtype=bool)
tf.Tensor(
[[1 1 1 1 1 1 0 0]
 [1 1 1 1 1 1 0 0]
 [1 1 1 1 1 1 0 0]
 [1 1 1 1 1 1 0 0]
 [1 1 1 1 1 1 0 0]
 [1 1 1 1 1 1 0 0]
 [1 1 1 1 1 1 0 0]
 [1 1 1 1 1 1 0 0]], shape=(8, 8), dtype=int32)


In [220]:
print(tf.linalg.band_part(
    tf.ones([1,8,8], dtype=tf.int32),-1,0
))

tf.Tensor(
[[[1 0 0 0 0 0 0 0]
  [1 1 0 0 0 0 0 0]
  [1 1 1 0 0 0 0 0]
  [1 1 1 1 0 0 0 0]
  [1 1 1 1 1 0 0 0]
  [1 1 1 1 1 1 0 0]
  [1 1 1 1 1 1 1 0]
  [1 1 1 1 1 1 1 1]]], shape=(1, 8, 8), dtype=int32)


In [221]:
# #obtain mask
# mask = emb.compute_mask(test_input)
# print(mask)  # check for 2 false at the end they are paddings
# mask = tf.cast(mask, dtype=tf.int32)
# print(mask)
# mask = mask[:,tf.newaxis,:]
# print(tf.repeat(mask,8,axis=1))


In [253]:
mask = emb.compute_mask(test_input)
mask1 = mask[:,:,tf.newaxis]
mask2 = mask[:,tf.newaxis,:]
padding_mask = tf.cast(mask1&mask2,dtype="int32")
print(padding_mask)

tf.Tensor(
[[[1 1 1 1 1 1 0 0]
  [1 1 1 1 1 1 0 0]
  [1 1 1 1 1 1 0 0]
  [1 1 1 1 1 1 0 0]
  [1 1 1 1 1 1 0 0]
  [1 1 1 1 1 1 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]]], shape=(1, 8, 8), dtype=int32)


## Encoder : LSTM

In [222]:
class Encoder(tf.keras.Model):  # Inherits from the tf.keras.Model base class
    def __init__(self, vocab_size, embedding_dims, units):
        super(Encoder, self).__init__()  #Call the base class constructor
        self.vocab_size = vocab_size
        self.embedding_dims = embedding_dims
        self.units = units

    #Define the layers in the constructor
    def build(self,input_shape):
        self.embedding = Embedding(self.vocab_size, self.embedding_dims)
        self.lstm = LSTM(self.units, return_sequences=True)

    def call(self, x):  # This is the forward pass used during model execution
        x = self.embedding(x)  # Convert input tokens to dense vector embeddings
        output = self.lstm(x)  # Pass embeddings through LSTM
        return output          # Return the output of the LSTM (sequence of hidden states)



In [None]:
HIDDEN_UNITS = 256 # this are hidden state units for encoder
EMBEDDING_DIM = 256
encoder= Encoder(VOCAB_SIZE,EMBEDDING_DIM,HIDDEN_UNITS)
#perform a dry run to check
encoder_output=encoder(tf.zeros([128,8]))
print(encoder_output.shape)

(128, 8, 256)


## Bahdanau Attention : attention layer

In [None]:
class BahdanauAttention(tf.keras.layers.Layer):
  def __init__(self,units):
    super(BahdanauAttention,self).__init__()  #Call the base class constructor
    self.units = units

  #define layers in model
  def build(self,input_shape):
    self.w_1=tf.keras.layers.Dense(self.units)
    self.w_2=tf.keras.layers.Dense(self.units)
    self.w  =tf.keras.layers.Dense(1)  #tanh activation

  #This is the forward pass used during model execution
  def call(self,prev_dec_state, enc_states):
    scores=self.w(
        tf.nn.tanh(
            self.w_1(tf.expand_dims(prev_dec_state,-2)) +  ## we are passing these from self.w cause it should be reduced to 1 dimension
            self.w_2(enc_states)))


    attention_weights=tf.nn.softmax(scores,axis=1)
    context_vector=attention_weights*enc_states
    context_vector=tf.reduce_sum(context_vector, axis=1)
    return context_vector, attention_weights

In [None]:
bahdanau_attention=BahdanauAttention(256)
context_vector,attention_weights=bahdanau_attention(tf.zeros([128,32]),tf.zeros([128,8,32]))
print(context_vector.shape)
print(attention_weights.shape)

(128, 32)
(128, 8, 1)


## Decoder

In [None]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, sequence_length):
    super(Decoder, self).__init__()
    self.embedding_dim = embedding_dim
    self.vocab_size = vocab_size
    self.dec_units = dec_units
    self.sequence_length = sequence_length

  def build(self, input_shape):
    self.dense = Dense(self.vocab_size, activation="softmax")
    self.gru = GRU(self.dec_units, return_sequences=True, return_state=True)
    self.attention = BahdanauAttention(self.dec_units)
    self.embedding = Embedding(self.vocab_size, self.embedding_dim)

  def call(self, x, hidden, shifted_target):
    outputs = []
    context_vectors = []
    attention_weightss = []

    shifted_target = self.embedding(shifted_target)  # shape: (128, 64, embedding_dim)

    for t in range(0, self.sequence_length):
      context_vector, attention_weights = self.attention(hidden, x)  # (128, dec_units), (128, 8, 1)
      dec_input = context_vector + shifted_target[:, t, :]  # shape: (128, embedding_dim)
      output, hidden = self.gru(tf.expand_dims(dec_input, 1))  # output: (128, 1, dec_units)
      outputs.append(output[:, 0])  # (128, dec_units)
      attention_weightss.append(attention_weights)  # each is (128, 8, 1)

    outputs = tf.stack(outputs, axis=1)  # (128, 64, dec_units)
    outputs = self.dense(outputs)        # (128, 64, vocab_size)

    attention_weightss = tf.stack(attention_weightss, axis=1)  # (128, 64, 8, 1)
    attention_weights = attention_weightss[:, -1, :, :]        # just final timestep: (128, 8, 1)

    return outputs, attention_weights


In [64]:
decoder=Decoder(VOCAB_SIZE,EMBEDDING_DIM,HIDDEN_UNITS,FRENCH_SEQUENCE_LENGTH)
outputs,attention_weights=decoder(encoder_output,tf.zeros([128,HIDDEN_UNITS]),tf.zeros([128,64]))
print(outputs.shape)
print(attention_weights.shape)

In [None]:
### ENCODER
input = Input(shape=(ENGLISH_SEQUENCE_LENGTH,), dtype="int64", name="input_1")
encoder=Encoder(VOCAB_SIZE,EMBEDDING_DIM,HIDDEN_UNITS)
encoder_output=encoder(input)

### DECODER
shifted_target=Input(shape=(FRENCH_SEQUENCE_LENGTH,), dtype="int64", name="input_2")
decoder=Decoder(VOCAB_SIZE,EMBEDDING_DIM,HIDDEN_UNITS,FRENCH_SEQUENCE_LENGTH)
decoder_output,attention_weightss=decoder(encoder_output,tf.zeros([1,HIDDEN_UNITS]),shifted_target)

### OUTPUT
bahdanau=Model([input,shifted_target],decoder_output)
bahdanau.summary()

## Custom MultiHeadAttention

In [294]:
class CustomSelfAttention(Layer):
  def __init__(self,model_size):
    super(CustomSelfAttention,self).__init__()
    self.model_size=model_size
  def call(self,query,key,value,masking):
    ######## compute scores
    score=tf.matmul(query,key,transpose_b=True)
    ######## scaling
    score/=tf.math.sqrt(tf.cast(self.model_size,tf.float32))
    print('score--->', score)
    ######## masking
    masking=tf.cast(masking,dtype=tf.float32)
    print('mask--->', masking)
    score+=(1.-masking)*-1e10
    print('scoringaftermasking--->', score)
    ######## attention_weights
    attention=tf.nn.softmax(score,axis=-1)*masking
    print(attention)
    ######## output
    head=tf.matmul(attention,value)
    return head

In [295]:
attention=CustomSelfAttention(256)
attention(tf.ones([1,8,256]),tf.ones([1,8,256]),tf.ones([1,8,256]),padding_mask)

score---> tf.Tensor(
[[[16. 16. 16. 16. 16. 16. 16. 16.]
  [16. 16. 16. 16. 16. 16. 16. 16.]
  [16. 16. 16. 16. 16. 16. 16. 16.]
  [16. 16. 16. 16. 16. 16. 16. 16.]
  [16. 16. 16. 16. 16. 16. 16. 16.]
  [16. 16. 16. 16. 16. 16. 16. 16.]
  [16. 16. 16. 16. 16. 16. 16. 16.]
  [16. 16. 16. 16. 16. 16. 16. 16.]]], shape=(1, 8, 8), dtype=float32)
mask---> tf.Tensor(
[[[1. 1. 1. 1. 1. 1. 0. 0.]
  [1. 1. 1. 1. 1. 1. 0. 0.]
  [1. 1. 1. 1. 1. 1. 0. 0.]
  [1. 1. 1. 1. 1. 1. 0. 0.]
  [1. 1. 1. 1. 1. 1. 0. 0.]
  [1. 1. 1. 1. 1. 1. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0.]]], shape=(1, 8, 8), dtype=float32)
scoringaftermasking---> tf.Tensor(
[[[ 1.6e+01  1.6e+01  1.6e+01  1.6e+01  1.6e+01  1.6e+01 -1.0e+10
   -1.0e+10]
  [ 1.6e+01  1.6e+01  1.6e+01  1.6e+01  1.6e+01  1.6e+01 -1.0e+10
   -1.0e+10]
  [ 1.6e+01  1.6e+01  1.6e+01  1.6e+01  1.6e+01  1.6e+01 -1.0e+10
   -1.0e+10]
  [ 1.6e+01  1.6e+01  1.6e+01  1.6e+01  1.6e+01  1.6e+01 -1.0e+10
   -1.0e+10]
  [ 1.6e+01  1.6e+01  1.6e

<tf.Tensor: shape=(1, 8, 256), dtype=float32, numpy=
array([[[1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        ...,
        [1., 1., 1., ..., 1., 1., 1.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]], dtype=float32)>

In [296]:
class CustomMultiHeadAttention(Layer):
  def __init__(self,num_heads,key_dim):
    super(CustomMultiHeadAttention,self).__init__()

    self.num_heads=num_heads
    self.dense_q=[Dense(key_dim//num_heads) for _ in range(num_heads)]
    self.dense_k=[Dense(key_dim//num_heads) for _ in range(num_heads)]
    self.dense_v=[Dense(key_dim//num_heads) for _ in range(num_heads)]
    self.dense_o=Dense(key_dim) #output
    self.self_attention=CustomSelfAttention(key_dim)

  def call(self,query,key,value,attention_mask):
    heads=[]

    for i in range(self.num_heads):
      print("hello", self.dense_q[i](query).shape)
      head=self.self_attention(self.dense_q[i](query),
                               self.dense_k[i](key),
                               self.dense_v[i](value),
                               attention_mask)
      heads.append(head)
    print("head", tf.convert_to_tensor(heads).shape)
    heads=tf.concat(heads,axis=2)
    heads=self.dense_o(heads)
    return heads

## Encoder Transformer Model

In [297]:
class TransformerEncoder(Layer):
    def __init__(self, embed_dim, dense_dim, num_heads,):
        super(TransformerEncoder, self).__init__()
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = CustomMultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim,
        )
        self.dense_proj=tf.keras.Sequential(
            [Dense(dense_dim, activation="relu"),
             Dense(embed_dim),]
        )
        self.layernorm_1 = tf.keras.layers.LayerNormalization()
        self.layernorm_2 = tf.keras.layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):

      if mask is not None:
        mask = tf.cast(
            mask[:,tf.newaxis, :], dtype="int32")
        T = tf.shape(mask)[2]
        padding_mask = tf.repeat(mask,T,axis=1)
      attention_output = self.attention(
          query=inputs, key=inputs,value=inputs,
          attention_mask=padding_mask
      )

      proj_input = self.layernorm_1(inputs + attention_output)
      proj_output = self.dense_proj(proj_input)
      return self.layernorm_2(proj_input + proj_output)

In [298]:
encoder_outputs = TransformerEncoder(512, 2048, 8)(emb_out, mask=mask)
print(encoder_outputs.shape)

hello (1, 8, 64)
score---> Tensor("custom_self_attention_30_1/truediv:0", shape=(1, 8, 8), dtype=float32)
mask---> Tensor("custom_self_attention_30_1/Cast_1:0", shape=(1, 8, 8), dtype=float32)
scoringaftermasking---> Tensor("custom_self_attention_30_1/add:0", shape=(1, 8, 8), dtype=float32)
Tensor("custom_self_attention_30_1/mul_1:0", shape=(1, 8, 8), dtype=float32)
hello (1, 8, 64)
score---> Tensor("custom_self_attention_30_3/truediv:0", shape=(1, 8, 8), dtype=float32)
mask---> Tensor("custom_self_attention_30_3/Cast_1:0", shape=(1, 8, 8), dtype=float32)
scoringaftermasking---> Tensor("custom_self_attention_30_3/add:0", shape=(1, 8, 8), dtype=float32)
Tensor("custom_self_attention_30_3/mul_1:0", shape=(1, 8, 8), dtype=float32)
hello (1, 8, 64)
score---> Tensor("custom_self_attention_30_5/truediv:0", shape=(1, 8, 8), dtype=float32)
mask---> Tensor("custom_self_attention_30_5/Cast_1:0", shape=(1, 8, 8), dtype=float32)
scoringaftermasking---> Tensor("custom_self_attention_30_5/add:0", sh



score---> Tensor("custom_self_attention_30_11/truediv:0", shape=(1, 8, 8), dtype=float32)
mask---> Tensor("custom_self_attention_30_11/Cast_1:0", shape=(1, 8, 8), dtype=float32)
scoringaftermasking---> Tensor("custom_self_attention_30_11/add:0", shape=(1, 8, 8), dtype=float32)
Tensor("custom_self_attention_30_11/mul_1:0", shape=(1, 8, 8), dtype=float32)
hello (1, 8, 64)
score---> Tensor("custom_self_attention_30_13/truediv:0", shape=(1, 8, 8), dtype=float32)
mask---> Tensor("custom_self_attention_30_13/Cast_1:0", shape=(1, 8, 8), dtype=float32)
scoringaftermasking---> Tensor("custom_self_attention_30_13/add:0", shape=(1, 8, 8), dtype=float32)
Tensor("custom_self_attention_30_13/mul_1:0", shape=(1, 8, 8), dtype=float32)
hello (1, 8, 64)
score---> Tensor("custom_self_attention_30_15/truediv:0", shape=(1, 8, 8), dtype=float32)
mask---> Tensor("custom_self_attention_30_15/Cast_1:0", shape=(1, 8, 8), dtype=float32)
scoringaftermasking---> Tensor("custom_self_attention_30_15/add:0", shape=(1



## Decoder Transformer Model

In [299]:
print(tf.linalg.band_part(
        tf.ones([1,8, 8],dtype=tf.int32),-1,0))

tf.Tensor(
[[[1 0 0 0 0 0 0 0]
  [1 1 0 0 0 0 0 0]
  [1 1 1 0 0 0 0 0]
  [1 1 1 1 0 0 0 0]
  [1 1 1 1 1 0 0 0]
  [1 1 1 1 1 1 0 0]
  [1 1 1 1 1 1 1 0]
  [1 1 1 1 1 1 1 1]]], shape=(1, 8, 8), dtype=int32)


In [304]:
class TransformerDecoder(Layer):
  def __init__(self, embed_dim, latent_dim, num_heads,):
    super(TransformerDecoder, self).__init__()
    self.embed_dim = embed_dim
    self.latent_dim = latent_dim
    self.num_heads = num_heads
    self.attention_1=CustomMultiHeadAttention(
        num_heads=num_heads, key_dim=embed_dim
    )
    self.attention_2=CustomMultiHeadAttention(
        num_heads=num_heads, key_dim=embed_dim
    )
    self.dense_proj = tf.keras.Sequential(
        [Dense(latent_dim, activation="relu"),Dense(embed_dim),]
    )
    self.layernorm_1=tf.keras.layers.LayerNormalization()
    self.layernorm_2=tf.keras.layers.LayerNormalization()
    self.layernorm_3=tf.keras.layers.LayerNormalization()
    self.supports_masking = True
  def call(self, inputs, encoder_outputs, enc_mask, mask=None):


    if mask is not None:
      causal_mask=tf.linalg.band_part(
        tf.ones([tf.shape(inputs)[0],
                 tf.shape(inputs)[1],
                 tf.shape(inputs)[1]],dtype=tf.int32),-1,0)
      mask = tf.cast(
          mask[:,tf.newaxis, :], dtype="int32")
      enc_mask = tf.cast(
          enc_mask[:,tf.newaxis, :], dtype="int32")
      T = tf.shape(mask)[2]
      padding_mask = tf.repeat(mask,T,axis=1)
      cross_attn_mask = tf.repeat(enc_mask,T,axis=1)
      combined_mask=tf.minimum(padding_mask,causal_mask)

    attention_output_1 = self.attention_1(
        query=inputs,key=inputs,value=inputs,
        attention_mask=combined_mask,

    )

    out_1 = self.layernorm_1(inputs + attention_output_1)

    attention_output_2= self.attention_2(
        query=out_1,key=encoder_outputs,value=encoder_outputs,
        attention_mask=cross_attn_mask,

    )
    out_2 = self.layernorm_2(out_1 + attention_output_2)

    proj_output = self.dense_proj(out_2)
    return self.layernorm_3(out_2 + proj_output)

In [305]:
enc_mask=mask
decoder_outputs = TransformerDecoder(512,2048,4)(
    emb_out,encoder_outputs,enc_mask)
print(decoder_outputs.shape)


## Transformer Model

In [287]:
EMBEDDING_DIM=128
D_FF=1024
NUM_HEADS=8
NUM_LAYERS = 1
NUM_EPOCHS =10

In [288]:
encoder_inputs=Input(shape=(None,), dtype="int64", name="input_1")
emb = Embeddings(ENGLISH_SEQUENCE_LENGTH,VOCAB_SIZE,EMBEDDING_DIM)
x = emb(encoder_inputs)
enc_mask = emb.compute_mask(encoder_inputs)

for _ in range(NUM_LAYERS):
  x=TransformerEncoder(EMBEDDING_DIM,D_FF,NUM_HEADS)(x)
encoder_outputs=x

decoder_inputs=Input(shape=(None,), dtype="int64", name="input_2")

x = Embeddings(FRENCH_SEQUENCE_LENGTH,VOCAB_SIZE,EMBEDDING_DIM)(decoder_inputs)
for i in range(NUM_LAYERS):
  x=TransformerDecoder(EMBEDDING_DIM,D_FF,NUM_HEADS)(x, encoder_outputs,enc_mask)
x=tf.keras.layers.Dropout(0.5)(x)
decoder_outputs=Dense(VOCAB_SIZE, activation="softmax")(x)

transformer = tf.keras.Model(
    [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer"
)
transformer.summary()

1. The `call()` method of your layer may be crashing. Try to `__call__()` the layer eagerly on some test input first to see if it works. E.g. `x = np.random.random((3, 4)); y = layer(x)`
2. If the `call()` method is correct, then you may need to implement the `def build(self, input_shape)` method on your layer. It should create all variables used by the layer (e.g. by calling `layer.build()` on all its children layers).
Exception encountered: ''cannot access local variable 'padding_mask' where it is not associated with a value''


hello (None, 64, 16)
score---> Tensor("custom_self_attention_22_1/truediv:0", shape=(None, 64, 64), dtype=float32)
mask---> Tensor("custom_self_attention_22_1/Cast_1:0", shape=(None, None, None), dtype=float32)
scoringaftermasking---> Tensor("custom_self_attention_22_1/add:0", shape=(None, 64, 64), dtype=float32)
Tensor("custom_self_attention_22_1/mul_1:0", shape=(None, 64, 64), dtype=float32)
hello (None, 64, 16)
score---> Tensor("custom_self_attention_22_3/truediv:0", shape=(None, 64, 64), dtype=float32)
mask---> Tensor("custom_self_attention_22_3/Cast_1:0", shape=(None, None, None), dtype=float32)
scoringaftermasking---> Tensor("custom_self_attention_22_3/add:0", shape=(None, 64, 64), dtype=float32)
Tensor("custom_self_attention_22_3/mul_1:0", shape=(None, 64, 64), dtype=float32)
hello (None, 64, 16)
score---> Tensor("custom_self_attention_22_5/truediv:0", shape=(None, 64, 64), dtype=float32)
mask---> Tensor("custom_self_attention_22_5/Cast_1:0", shape=(None, None, None), dtype=floa

ValueError: Exception encountered when calling TransformerDecoder.call().

[1mCould not automatically infer the output shape / dtype of 'transformer_decoder_24' (of type TransformerDecoder). Either the `TransformerDecoder.call()` method is incorrect, or you need to implement the `TransformerDecoder.compute_output_spec() / compute_output_shape()` method. Error encountered:

Exception encountered when calling CustomSelfAttention.call().

[1mNone values not supported.[0m

Arguments received by CustomSelfAttention.call():
  • query=tf.Tensor(shape=(None, 64, 16), dtype=float32)
  • key=tf.Tensor(shape=(None, 64, 16), dtype=float32)
  • value=tf.Tensor(shape=(None, 64, 16), dtype=float32)
  • masking=None[0m

Arguments received by TransformerDecoder.call():
  • args=('<KerasTensor shape=(None, 64, 128), dtype=float32, sparse=False, name=keras_tensor_288>', '<KerasTensor shape=(None, 64, 128), dtype=float32, sparse=False, name=keras_tensor_286>', '<KerasTensor shape=(None, None), dtype=float32, sparse=False, name=keras_tensor_281>')
  • kwargs=<class 'inspect._empty'>

## Simple GRU

In [None]:
NUM_UNITS =256

In [None]:
### ENCODER (english input)
input = Input(shape=(ENGLISH_SEQUENCE_LENGTH,), dtype="int64", name="input_1")  # English sentence as input
x = Embedding(VOCAB_SIZE, EMBEDDING_DIM)(input)                                 # Convert words to dense vector form
encoded_input = Bidirectional(GRU(NUM_UNITS))(x)                                # Understand context from both directions

### DECODER (french output)
shifted_target = Input(shape=(FRENCH_SEQUENCE_LENGTH,), dtype="int64", name="input_2")  # French sentence with 'start' token
x = Embedding(VOCAB_SIZE, EMBEDDING_DIM)(shifted_target)                                # Convert French words to dense vectors
x = GRU(NUM_UNITS * 2, return_sequences=True)(x, initial_state=encoded_input)           # Generate output using English context

### OUTPUT
x = Dropout(0.5)(x)                                              # Prevent overfitting
target = Dense(VOCAB_SIZE, activation="softmax")(x)              # Predict the next French word
seq2seq_gru = Model([input, shifted_target], target)             # Build the full model
seq2seq_gru.summary()                                            # Show model architecture


- Reference : https://www.tensorflow.org/api_docs/python/tf/keras/Metric

In [None]:
class BLEU(tf.keras.metrics.Metric): #custom class inherits from Metrics class

    def __init__(self, name='bleu_score'):
        super(BLEU,self).__init__(name=name)
        self.bleu_score =0

    @tf.function #decorator added to update_state function
    def update_state(self, y_true, y_pred, sample_weight=None):
      y_pred=tf.argmax(y_pred, axis=-1)
      self.bleu_score=0

      #zip up the pair of y_true and y_pred
      for i, j in zip(y_true,y_pred):
        tf.autograph.experimental.set_loop_options()

        total_words=tf.math.count_nonzero(i)
        total_matches = 0

        for word in i:
          if word==0:
            break
          for q in range(len(j)):
            if j[q]==0:
              break
            if j[q]==word:
              total_matches+=1
              j=tf.boolean_mask(j,[False if y==q else True for y in range(len(j))])
              break
        self.bleu_score+= total_matches/total_words

    def result(self):
      return self.bleu_score/BATCH_SIZE


In [None]:
# ### example illustration to show how boolean works
# j = tf.constant([2,3,4,5,0,0])
# print([False if y==2 else True for y in range(len(j))])
# j = tf.boolean_mask(j,[False if y==2 else True for y in range(len(j))])
# print(j)

## Scheduler

In [232]:
class Scheduler(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps):
    super(Scheduler, self).__init__()
    self.d_model = tf.cast(d_model, tf.float64)
    self.warmup_steps = tf.cast(warmup_steps, dtype=tf.float64)

  def __call__(self, step):
    step = tf.cast(step, dtype=tf.float64)
    #copied from paper
    return (self.d_model**(-0.5))*tf.math.minimum(step**(-0.5), step * (self.warmup_steps ** -1.5))

In [233]:
WARM_UP_STEPS = 4000
lr_scheduled = Scheduler(EMBEDDING_DIM, WARM_UP_STEPS)

# TRAINING

In [244]:
#compile the model
transformer.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer = Adam(lr_scheduled, beta_1=0.9, beta_2=0.98, epsilon=1e-9),)
    #metrics=[BLEU()],
    #run_eagerly=True)

In [None]:
checkpoint_filepath = '/content/drive/MyDrive/nlp/translation/gru1.keras'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,)

In [245]:
#fit the model
history=transformer.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10)

Epoch 1/10
[1m2812/2812[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1296s[0m 456ms/step - loss: 0.5153 - val_loss: 0.7469
Epoch 2/10
[1m2812/2812[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 411ms/step - loss: 0.5670

KeyboardInterrupt: 

In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model_loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])

plt.title('model_accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

# Evaluation

In [None]:
seq2seq_gru.evaluate(val_dataset)

     70/Unknown [1m116s[0m 1s/step - bleu_score: 0.0000e+00 - loss: 1.1114

KeyboardInterrupt: 

# Testing

In [None]:
index_to_word ={x:y for x,y in zip(range(len(french_vectorize_layer.get_vocabulary())),
                                   french_vectorize_layer.get_vocabulary())}

NameError: name 'french_vectorize_layer' is not defined

In [None]:
index_to_word

NameError: name 'index_to_word' is not defined

In [None]:
def  translator(english_sentence):
  tokenized_english_sentence = english_vectorize_layer([english_sentence])
  shifted_target = 'starttoken'

  for i in range(FRENCH_SEQUENCE_LENGTH):
     tokenized_shifted_target = french_vectorize_layer([shifted_target])
     output = seq2seq_gru.predict([tokenized_english_sentence, tokenized_shifted_target])
     french_word_index = tf.argmax(output,axis=-1)[0][i].numpy()
     current_word=index_to_word[french_word_index]
     if current_word=="endtoken":
      break
      shifted_target += ' '+current_word
  return shifted_target[11:]

In [None]:
translator("what makes you think that is not true?").shape

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40

AttributeError: 'str' object has no attribute 'shape'

In [None]:
#pull top 5 volcabulary from english
vocab = english_vectorize_layer.get_vocabulary()
print([str(word) for word in vocab[:5]])

['', '[UNK]', 'i', 'you', 'to']


In [None]:
#pull top 5 volcabulary from english
vocab = french_vectorize_layer.get_vocabulary()
print([str(word) for word in vocab[:5]])

['', '[UNK]', 'starttoken', 'endtoken', 'je']


In [None]:
word_to_index={y:x for x, y in zip(range(len(french_vectorize_layer.get_vocabulary())),
                                   french_vectorize_layer.get_vocabulary())}

In [None]:
word_to_index['football']