<a href="https://colab.research.google.com/github/akankshakusf/Project-DeepLearning-English-to-French-Translation/blob/master/Neural_Machine_Translation_with_RNNs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#import ML packages
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from sklearn.metrics import confusion_matrix,roc_curve
import pathlib
import io
import re
import string
import time

#import DL package
import cv2
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_probability as tfp
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer,Dense, Flatten, InputLayer, BatchNormalization, Bidirectional, Dropout, Input, Embedding, TextVectorization
from tensorflow.keras.layers import SimpleRNN, Conv1D, LSTM, GRU
from tensorflow.keras.losses import BinaryCrossentropy,CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.metrics import Accuracy, TopKCategoricalAccuracy, TopKCategoricalAccuracy, SparseCategoricalAccuracy
from tensorflow.keras.optimizers import Adam
from google.colab import drive
from google.colab import files
from tensorboard.plugins import projector

# Data Preparation

## Data Download

In [2]:
!wget https://www.manythings.org/anki/fra-eng.zip

--2025-04-15 16:30:34--  https://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7943074 (7.6M) [application/zip]
Saving to: ‘fra-eng.zip’


2025-04-15 16:30:36 (6.18 MB/s) - ‘fra-eng.zip’ saved [7943074/7943074]



In [3]:
!unzip "/content/fra-eng.zip" -d "/content/dataset/"

Archive:  /content/fra-eng.zip
  inflating: /content/dataset/_about.txt  
  inflating: /content/dataset/fra.txt  


## Kaggle Dataset

In [4]:
!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d dhruvildave/en-fr-translation-dataset

Dataset URL: https://www.kaggle.com/datasets/dhruvildave/en-fr-translation-dataset
License(s): ODbL-1.0


In [5]:
!unzip "/content/en-fr-translation-dataset.zip" -d "/content/dataset/"

Archive:  /content/en-fr-translation-dataset.zip
  inflating: /content/dataset/en-fr.csv  


In [6]:
dataset = tf.data.experimental.CsvDataset(
  "/content/dataset/en-fr.csv",
  [
    tf.string,
    tf.string
  ],
)

## Data Processing

In [7]:
text_dataset=tf.data.TextLineDataset("/content/dataset/fra.txt")

In [8]:
#review dataset
for i in text_dataset.take(3):
  print(i)

tf.Tensor(b'Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)', shape=(), dtype=string)
tf.Tensor(b'Go.\tMarche.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8090732 (Micsmithel)', shape=(), dtype=string)
tf.Tensor(b'Go.\tEn route !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8267435 (felix63)', shape=(), dtype=string)


In [None]:
# #lets skip a max number of records and check what max length we find
# for i in text_dataset.skip(190000):
#   print(len(tf.strings.split(i," ")))

- Since i saw that the max len of the sentence is 107. I am going to go ahead with a sequence length of 64 as we also have french letters

In [9]:
VOCAB_SIZE= 20000
ENGLISH_SEQUENCE_LENGTH=64
FRENCH_SEQUENCE_LENGTH=64
EMBEDDING_DIM = 300
BATCH_SIZE=64

- Create vectorizer layer to create vectors
- reference :https://www.tensorflow.org/api_docs/python/tf/keras/layers/TextVectorization

In [10]:
# Turn text to lowercase and remove punctuation
# Keep only top VOCAB_SIZE words
# Convert words to numbers
# Make all sentences the same length

In [11]:
english_vectorize_layer = TextVectorization (
    standardize='lower_and_strip_punctuation',
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length = ENGLISH_SEQUENCE_LENGTH
)

In [12]:
french_vectorize_layer = TextVectorization (
    standardize='lower_and_strip_punctuation',
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length = FRENCH_SEQUENCE_LENGTH
)

- Look at this sample data from dataset and get rid of tabs--->  \t
'Go.\tVa !\tCC-BY 2.0 (France)

In [13]:
def selector (input_text):
  split_text=tf.strings.split(input_text, '\t') ##after splitting collect english and french separately
  return {'input_1':split_text[0:1],'input_2':'starttoken '+split_text[1:2]},split_text[1:2]+' endtoken'

In [14]:
#map text_dataset to selector above function
split_dataset = text_dataset.map(selector)  ## this is for final dataset

In [15]:
def separator(input_text):
  split_text=tf.strings.split(input_text,'\t')
  return split_text[0:1],'starttoken '+split_text[1:2]+' endtoken' #final output

In [16]:
#map text_dataset to selector above function
init_dataset = text_dataset.map(separator)  ## this is for just intermediated start-end token consideration

In [17]:
# review the data
for i in split_dataset.take(2):
  print(i)

({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Va !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Va ! endtoken'], dtype=object)>)
({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Marche.'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Marche. endtoken'], dtype=object)>)


In [18]:
# review the data
for i in init_dataset.take(2):
  print(i)

(<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Va ! endtoken'], dtype=object)>)
(<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Marche. endtoken'], dtype=object)>)


- Notice how nicely english and french text have been separated now

In [19]:
# now lets attach this Vectorizer to init_dataset to get the vocabulary list
english_training_dataset = init_dataset.map(lambda x,y:x) ##input is x, y and output is x
english_vectorize_layer.adapt(english_training_dataset) ##adapth the vectorizer layer to training data

In [20]:
# now lets attach this Vectorizer to init_dataset to get the vocabulary list
french_training_data=init_dataset.map(lambda x,y:y) ##input x,y,z and output y
french_vectorize_layer.adapt(french_training_data) ##adapt the vectorize_layer to the training data

In [21]:
print(len(english_vectorize_layer.get_vocabulary()))
print(len(french_vectorize_layer.get_vocabulary()))

16952
20000


- Now data is adapted do vectorization (convert to numbers)

In [22]:
def vectorize(inputs,output):
  return {'input_1':english_vectorize_layer(inputs['input_1']),
          'input_2':french_vectorize_layer(inputs['input_2'])},french_vectorize_layer(output)

In [23]:
split_dataset

<_MapDataset element_spec=({'input_1': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'input_2': TensorSpec(shape=(None,), dtype=tf.string, name=None)}, TensorSpec(shape=(None,), dtype=tf.string, name=None))>

- get the final vectorized dataset

In [24]:
dataset = split_dataset.map(vectorize)

In [25]:
#check the data "Go means Va in french"
english_vectorize_layer.get_vocabulary()[45]

np.str_('go')

In [26]:
#check the data
french_vectorize_layer.get_vocabulary()[104]

np.str_('va')

In [27]:
for i in dataset.take(2):
  print(i)

({'input_1': <tf.Tensor: shape=(1, 64), dtype=int64, numpy=
array([[45,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])>, 'input_2': <tf.Tensor: shape=(1, 64), dtype=int64, numpy=
array([[  2, 104,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0]])>}, <tf.Tensor: shape=(1, 64), dtype=int64, numpy=
array([[104,   3,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,

-

In [28]:
dataset

<_MapDataset element_spec=({'input_1': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>

- Create pipeline : by shuffling data and batch the data now

In [74]:
BATCH_SIZE=64

In [75]:
dataset=dataset.shuffle(2048).unbatch().batch(BATCH_SIZE).prefetch(buffer_size=tf.data.AUTOTUNE)



In [76]:
print(dataset)

<_PrefetchDataset element_spec=({'input_1': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>


In [77]:
#confirm batching shape : batch
for x, y in dataset.take(1):
    print(x['input_1'].shape)  # Shape: (batch_size, seq_len)
    print(x['input_2'].shape)
    print(y.shape)

(64, 64)
(64, 64)
(64, 64)


In [78]:
#check for number of batches
NUM_BATCHES= int(200000/BATCH_SIZE) ## since i have 200,000 data point in dataset and batch =64
print(NUM_BATCHES)

3125


In [79]:
train_dataset = dataset.take(int(0.9*NUM_BATCHES))  ## i will use 90% of data for training
val_dataset = dataset.skip(int(0.9*NUM_BATCHES))   ## rest 10% i will just push in validation

In [80]:
train_dataset

<_TakeDataset element_spec=({'input_1': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>

In [81]:
val_dataset

<_SkipDataset element_spec=({'input_1': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>

In [82]:
dataset

<_PrefetchDataset element_spec=({'input_1': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>

# Modeling

## Encoder : LSTM

In [83]:
class Encoder(tf.keras.Model):  # Inherits from the tf.keras.Model base class
    def __init__(self, vocab_size, embedding_dims, units):
        super(Encoder, self).__init__()  #Call the base class constructor
        self.vocab_size = vocab_size
        self.embedding_dims = embedding_dims
        self.units = units

    #Define the layers in the constructor
    def build(self,input_shape):
        self.embedding = Embedding(self.vocab_size, self.embedding_dims)
        self.lstm = LSTM(self.units, return_sequences=True)

    def call(self, x):  # This is the forward pass used during model execution
        x = self.embedding(x)  # Convert input tokens to dense vector embeddings
        output = self.lstm(x)  # Pass embeddings through LSTM
        return output          # Return the output of the LSTM (sequence of hidden states)



In [84]:
HIDDEN_UNITS = 256 # this are hidden state units for encoder
EMBEDDING_DIM = 256
encoder= Encoder(VOCAB_SIZE,EMBEDDING_DIM,HIDDEN_UNITS)
#perform a dry run to check
encoder_output=encoder(tf.zeros([128,8]))
print(encoder_output.shape)

(128, 8, 256)


## Bahdanau Attention : attention layer

In [85]:
class BahdanauAttention(tf.keras.layers.Layer):
  def __init__(self,units):
    super(BahdanauAttention,self).__init__()  #Call the base class constructor
    self.units = units

  #define layers in model
  def build(self,input_shape):
    self.w_1=tf.keras.layers.Dense(self.units)
    self.w_2=tf.keras.layers.Dense(self.units)
    self.w  =tf.keras.layers.Dense(1)  #tanh activation

  #This is the forward pass used during model execution
  def call(self,prev_dec_state, enc_states):
    scores=self.w(
        tf.nn.tanh(
            self.w_1(tf.expand_dims(prev_dec_state,-2)) +  ## we are passing these from self.w cause it should be reduced to 1 dimension
            self.w_2(enc_states)))


    attention_weights=tf.nn.softmax(scores,axis=1)
    context_vector=attention_weights*enc_states
    context_vector=tf.reduce_sum(context_vector, axis=1)
    return context_vector, attention_weights

In [86]:
bahdanau_attention=BahdanauAttention(256)
context_vector,attention_weights=bahdanau_attention(tf.zeros([128,32]),tf.zeros([128,8,32]))
print(context_vector.shape)
print(attention_weights.shape)

(128, 32)
(128, 8, 1)


## Decoder

In [89]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, sequence_length):
    super(Decoder, self).__init__()
    self.embedding_dim = embedding_dim
    self.vocab_size = vocab_size
    self.dec_units = dec_units
    self.sequence_length = sequence_length

  def build(self, input_shape):
    self.dense = Dense(self.vocab_size, activation="softmax")
    self.gru = GRU(self.dec_units, return_sequences=True, return_state=True)
    self.attention = BahdanauAttention(self.dec_units)
    self.embedding = Embedding(self.vocab_size, self.embedding_dim)

  def call(self, x, hidden, shifted_target):
    outputs = []
    context_vectors = []
    attention_weightss = []

    shifted_target = self.embedding(shifted_target)  # shape: (128, 64, embedding_dim)

    for t in range(0, self.sequence_length):
      context_vector, attention_weights = self.attention(hidden, x)  # (128, dec_units), (128, 8, 1)
      dec_input = context_vector + shifted_target[:, t, :]  # shape: (128, embedding_dim)
      output, hidden = self.gru(tf.expand_dims(dec_input, 1))  # output: (128, 1, dec_units)
      outputs.append(output[:, 0])  # (128, dec_units)
      attention_weightss.append(attention_weights)  # each is (128, 8, 1)

    outputs = tf.stack(outputs, axis=1)  # (128, 64, dec_units)
    outputs = self.dense(outputs)        # (128, 64, vocab_size)

    attention_weightss = tf.stack(attention_weightss, axis=1)  # (128, 64, 8, 1)
    attention_weights = attention_weightss[:, -1, :, :]        # just final timestep: (128, 8, 1)

    return outputs, attention_weights


In [93]:
class Decoder(tf.keras.Model):
  def __init__(self,vocab_size,embedding_dim,dec_units,sequence_length):
    super(Decoder,self).__init__()
    self.embedding_dim=embedding_dim
    self.vocab_size=vocab_size
    self.dec_units=dec_units
    self.sequence_length=sequence_length

  def build(self,input_shape):
    self.dense=Dense(self.vocab_size,activation="softmax")
    self.gru=GRU(
        self.dec_units,return_sequences=True,return_state=True)
    self.attention=BahdanauAttention(self.dec_units)
    self.embedding=Embedding(self.vocab_size,self.embedding_dim)

  def call(self,x,hidden,shifted_target):
    outputs=[]
    context_vectors=[]
    attention_weights=[]
    shifted_target=self.embedding(shifted_target)

    for t in range(0,self.sequence_length):
      context_vector,attention_weights=self.attention(hidden,x)
      dec_input=context_vector+shifted_target[:,t]
      output,hidden=self.gru(tf.expand_dims(dec_input,1))
      outputs.append(output[:,0])

    outputs=tf.convert_to_tensor(outputs)
    outputs=tf.transpose(outputs, perm=[1,0,2])

    outputs=self.dense(outputs)
    return outputs,attention_weights

In [94]:
decoder=Decoder(VOCAB_SIZE,EMBEDDING_DIM,HIDDEN_UNITS,FRENCH_SEQUENCE_LENGTH)
outputs,attention_weights=decoder(encoder_output,tf.zeros([128,HIDDEN_UNITS]),tf.zeros([128,64]))
print(outputs.shape)
print(attention_weights.shape)

ValueError: Exception encountered when calling Decoder.call().

[1mtoo many values to unpack (expected 2)[0m

Arguments received by Decoder.call():
  • x=tf.Tensor(shape=(128, 8, 256), dtype=float32)
  • hidden=tf.Tensor(shape=(128, 256), dtype=float32)
  • shifted_target=tf.Tensor(shape=(128, 64), dtype=float32)

In [60]:
### ENCODER
input = Input(shape=(ENGLISH_SEQUENCE_LENGTH,), dtype="int64", name="input_1")
encoder=Encoder(VOCAB_SIZE,EMBEDDING_DIM,HIDDEN_UNITS)
encoder_output=encoder(input)

### DECODER
shifted_target=Input(shape=(FRENCH_SEQUENCE_LENGTH,), dtype="int64", name="input_2")
decoder=Decoder(VOCAB_SIZE,EMBEDDING_DIM,HIDDEN_UNITS,FRENCH_SEQUENCE_LENGTH)
decoder_output,attention_weightss=decoder(encoder_output,tf.zeros([1,HIDDEN_UNITS]),shifted_target)

### OUTPUT
bahdanau=Model([input,shifted_target],decoder_output)
bahdanau.summary()

In [95]:
# Your provided BahdanauAttention (unchanged)
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.units = units

    def build(self, input_shape):
        self.w_1 = tf.keras.layers.Dense(self.units)
        self.w_2 = tf.keras.layers.Dense(self.units)
        self.w = tf.keras.layers.Dense(1)

    def call(self, prev_dec_state, enc_states):
        scores = self.w(
            tf.nn.tanh(
                self.w_1(tf.expand_dims(prev_dec_state, -2)) +
                self.w_2(enc_states)
            )
        )
        attention_weights = tf.nn.softmax(scores, axis=1)
        context_vector = attention_weights * enc_states
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights

In [96]:
# Decoder that works with your BahdanauAttention
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, sequence_length):
        super(Decoder, self).__init__()
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size
        self.dec_units = dec_units
        self.sequence_length = sequence_length

    def build(self, input_shape):
        # Dense layer projects the GRU outputs to the vocabulary space.
        self.dense = tf.keras.layers.Dense(self.vocab_size, activation="softmax")
        self.gru = tf.keras.layers.GRU(
            self.dec_units, return_sequences=True, return_state=True
        )
        self.attention = BahdanauAttention(self.dec_units)
        self.embedding = tf.keras.layers.Embedding(self.vocab_size, self.embedding_dim)
        super(Decoder, self).build(input_shape)

    def call(self, x, hidden, shifted_target):
        """
        Arguments:
          x            : Encoder outputs, shape (batch, encoder_time_steps, encoder_feature_dim)
          hidden       : Initial decoder hidden state, shape (batch, dec_units)
          shifted_target: Target tokens (shifted by one), shape (batch, sequence_length)
        Returns:
          outputs      : Decoder outputs, shape (batch, sequence_length, vocab_size)
          attention_weights: Attention weights from the last time step, shape (batch, encoder_time_steps, 1)
        """
        outputs = []
        # Embed the shifted target tokens.
        # New shape: (batch, sequence_length, embedding_dim)
        shifted_target = self.embedding(shifted_target)

        # Decode one time step at a time.
        for t in range(self.sequence_length):
            # Use current decoder hidden state and encoder outputs to compute attention.
            context_vector, attention_weights = self.attention(hidden, x)
            # Select token embedding at current time step.
            # shifted_target[:, t, :] has shape (batch, embedding_dim)
            token_embedding = shifted_target[:, t, :]
            # Add context vector (shape: (batch, dec_units)) and token embedding.
            dec_input = context_vector + token_embedding
            # GRU expects input with time dimension; reshape to (batch, 1, dec_units)
            dec_input = tf.expand_dims(dec_input, 1)
            # Pass through GRU; output shape will be (batch, 1, dec_units)
            output, hidden = self.gru(dec_input, initial_state=hidden)
            # Remove time dimension and save output.
            outputs.append(output[:, 0, :])

        # Stack outputs to get shape (batch, sequence_length, dec_units).
        outputs = tf.stack(outputs, axis=1)
        # Map GRU outputs to the vocabulary space.
        outputs = self.dense(outputs)
        # Return decoder outputs and the attention weights from the last time step.
        return outputs, attention_weights


In [97]:
# Testing the integrated model
if __name__ == "__main__":
    # Hyperparameters and input dimensions.
    batch_size = 128
    sequence_length = 64
    vocab_size = 20000
    embedding_dim = 32   # Must equal dec_units so addition works properly.
    dec_units = 32
    encoder_time_steps = 8
    encoder_feature_dim = 256

    # Dummy encoder output: shape (128, 8, 256)
    encoder_output = tf.random.normal((batch_size, encoder_time_steps, encoder_feature_dim))
    # Initial hidden state for decoder: shape (128, 32)
    initial_hidden = tf.random.normal((batch_size, dec_units))
    # Dummy shifted target tokens: shape (128, 64)
    shifted_target = tf.random.uniform(
        (batch_size, sequence_length), minval=0, maxval=vocab_size, dtype=tf.int32
    )

    # Instantiate and call the Decoder.
    decoder = Decoder(vocab_size, embedding_dim, dec_units, sequence_length)
    outputs, attn_weights = decoder(encoder_output, initial_hidden, shifted_target)

    # Expected shapes:
    # outputs: (128, 64, 20000)
    # attn_weights: (128, 8, 1)
    print("Decoder outputs shape:", outputs.shape)
    print("Attention weights shape:", attn_weights.shape)

InvalidArgumentError: Exception encountered when calling Decoder.call().

[1m{{function_node __wrapped__AddV2_device_/job:localhost/replica:0/task:0/device:GPU:0}} required broadcastable shapes [Op:AddV2] name: [0m

Arguments received by Decoder.call():
  • x=tf.Tensor(shape=(128, 8, 256), dtype=float32)
  • hidden=tf.Tensor(shape=(128, 32), dtype=float32)
  • shifted_target=tf.Tensor(shape=(128, 64), dtype=int32)

In [98]:
import tensorflow as tf

# Your BahdanauAttention code with a minimal change:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.units = units

    def build(self, input_shape):
        self.w_1 = tf.keras.layers.Dense(self.units)
        self.w_2 = tf.keras.layers.Dense(self.units)
        self.w = tf.keras.layers.Dense(1)
        # Added projection layer to transform encoder states to the same dimension as decoder units.
        self.value_dense = tf.keras.layers.Dense(self.units)

    def call(self, prev_dec_state, enc_states):
        scores = self.w(
            tf.nn.tanh(
                self.w_1(tf.expand_dims(prev_dec_state, -2)) +
                self.w_2(enc_states)
            )
        )
        attention_weights = tf.nn.softmax(scores, axis=1)
        # Project the encoder states so that after the weighted sum,
        # the context vector has shape (batch, self.units) (e.g. (128, 32))
        projected_enc_states = self.value_dense(enc_states)
        context_vector = attention_weights * projected_enc_states
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights

# Decoder using your original structure
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, sequence_length):
        super(Decoder, self).__init__()
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size
        self.dec_units = dec_units
        self.sequence_length = sequence_length

    def build(self, input_shape):
        # Dense layer that maps GRU outputs to vocab space.
        self.dense = tf.keras.layers.Dense(self.vocab_size, activation="softmax")
        self.gru = tf.keras.layers.GRU(
            self.dec_units, return_sequences=True, return_state=True
        )
        self.attention = BahdanauAttention(self.dec_units)
        self.embedding = tf.keras.layers.Embedding(self.vocab_size, self.embedding_dim)
        super(Decoder, self).build(input_shape)

    def call(self, x, hidden, shifted_target):
        """
        Arguments:
          x             : Encoder outputs, shape (batch, encoder_time_steps, encoder_feature_dim)
          hidden        : Initial decoder hidden state, shape (batch, dec_units)
          shifted_target: Target tokens (shifted by one), shape (batch, sequence_length)
        Returns:
          outputs       : Decoder outputs, shape (batch, sequence_length, vocab_size)
          attention_weights : Attention weights from the last time step, shape (batch, encoder_time_steps, 1)
        """
        outputs = []
        # Embed target tokens; resulting shape: (batch, sequence_length, embedding_dim)
        shifted_target = self.embedding(shifted_target)

        for t in range(self.sequence_length):
            # Compute attention; context_vector now has shape (batch, dec_units) i.e. (128, 32)
            context_vector, attention_weights = self.attention(hidden, x)
            # Get token embedding at time step t; shape: (batch, embedding_dim)
            token_embedding = shifted_target[:, t, :]
            # Add context vector and token embedding
            # (both now have matching shape (batch, 32)).
            dec_input = context_vector + token_embedding
            # Expand dims to create time axis for GRU: (batch, 1, dec_units)
            dec_input = tf.expand_dims(dec_input, 1)
            # Pass through GRU; output shape: (batch, 1, dec_units)
            output, hidden = self.gru(dec_input, initial_state=hidden)
            outputs.append(output[:, 0, :])

        # Stack outputs over time to get shape (batch, sequence_length, dec_units)
        outputs = tf.stack(outputs, axis=1)
        # Map GRU outputs to vocabulary probabilities; final shape: (batch, sequence_length, vocab_size)
        outputs = self.dense(outputs)
        return outputs, attention_weights

# Testing the integrated model:
if __name__ == "__main__":
    # Hyperparameters and input dimensions
    batch_size = 128
    sequence_length = 64
    vocab_size = 20000
    embedding_dim = 32   # Must equal dec_units to support the addition.
    dec_units = 32
    encoder_time_steps = 8
    encoder_feature_dim = 256

    # Dummy encoder output: shape (128, 8, 256)
    encoder_output = tf.random.normal((batch_size, encoder_time_steps, encoder_feature_dim))
    # Initial hidden state for decoder: shape (128, 32)
    initial_hidden = tf.random.normal((batch_size, dec_units))
    # Dummy shifted target tokens: shape (128, 64)
    shifted_target = tf.random.uniform(
        (batch_size, sequence_length), minval=0, maxval=vocab_size, dtype=tf.int32
    )

    # Instantiate the Decoder.
    decoder = Decoder(vocab_size, embedding_dim, dec_units, sequence_length)
    outputs, attn_weights = decoder(encoder_output, initial_hidden, shifted_target)

    # Expected shapes:
    # outputs: (128, 64, 20000)
    # attn_weights: (128, 8, 1)
    print("Decoder outputs shape:", outputs.shape)
    print("Attention weights shape:", attn_weights.shape)


ValueError: Exception encountered when calling Decoder.call().

[1mtoo many values to unpack (expected 2)[0m

Arguments received by Decoder.call():
  • x=tf.Tensor(shape=(128, 8, 256), dtype=float32)
  • hidden=tf.Tensor(shape=(128, 32), dtype=float32)
  • shifted_target=tf.Tensor(shape=(128, 64), dtype=int32)

## Simple GRU

In [None]:
NUM_UNITS =256

In [None]:
### ENCODER (english input)
input = Input(shape=(ENGLISH_SEQUENCE_LENGTH,), dtype="int64", name="input_1")  # English sentence as input
x = Embedding(VOCAB_SIZE, EMBEDDING_DIM)(input)                                 # Convert words to dense vector form
encoded_input = Bidirectional(GRU(NUM_UNITS))(x)                                # Understand context from both directions

### DECODER (french output)
shifted_target = Input(shape=(FRENCH_SEQUENCE_LENGTH,), dtype="int64", name="input_2")  # French sentence with 'start' token
x = Embedding(VOCAB_SIZE, EMBEDDING_DIM)(shifted_target)                                # Convert French words to dense vectors
x = GRU(NUM_UNITS * 2, return_sequences=True)(x, initial_state=encoded_input)           # Generate output using English context

### OUTPUT
x = Dropout(0.5)(x)                                              # Prevent overfitting
target = Dense(VOCAB_SIZE, activation="softmax")(x)              # Predict the next French word
seq2seq_gru = Model([input, shifted_target], target)             # Build the full model
seq2seq_gru.summary()                                            # Show model architecture


- Reference : https://www.tensorflow.org/api_docs/python/tf/keras/Metric

In [None]:
class BLEU(tf.keras.metrics.Metric): #custom class inherits from Metrics class

    def __init__(self, name='bleu_score'):
        super(BLEU,self).__init__(name=name)
        self.bleu_score =0

    @tf.function #decorator added to update_state function
    def update_state(self, y_true, y_pred, sample_weight=None):
      y_pred=tf.argmax(y_pred, axis=-1)
      self.bleu_score=0

      #zip up the pair of y_true and y_pred
      for i, j in zip(y_true,y_pred):
        tf.autograph.experimental.set_loop_options()

        total_words=tf.math.count_nonzero(i)
        total_matches = 0

        for word in i:
          if word==0:
            break
          for q in range(len(j)):
            if j[q]==0:
              break
            if j[q]==word:
              total_matches+=1
              j=tf.boolean_mask(j,[False if y==q else True for y in range(len(j))])
              break
        self.bleu_score+= total_matches/total_words

    def result(self):
      return self.bleu_score/BATCH_SIZE


In [None]:
# ### example illustration to show how boolean works
# j = tf.constant([2,3,4,5,0,0])
# print([False if y==2 else True for y in range(len(j))])
# j = tf.boolean_mask(j,[False if y==2 else True for y in range(len(j))])
# print(j)

# TRAINING

In [None]:
#compile the model
bahdanau.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(5e-4),
    metrics=[BLEU()],
    run_eagerly=True)

In [None]:
checkpoint_filepath = '/content/drive/MyDrive/nlp/translation/gru1.keras'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,)

In [None]:
#fit the model
history=bahdanau.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=30,)
    #callbacks=[model_checkpoint_callback])

Epoch 1/5


OperatorNotAllowedInGraphError: Iterating over a symbolic `tf.Tensor` is not allowed. You can attempt the following resolutions to the problem: If you are running in Graph mode, use Eager execution mode or decorate this function with @tf.function. If you are using AutoGraph, you can try decorating this function with @tf.function. If that does not work, then you may be using an unsupported feature or your source code may not be visible to AutoGraph. See https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/autograph/g3doc/reference/limitations.md#access-to-source-code for more information.

In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model_loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])

plt.title('model_accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

# Evaluation

In [None]:
seq2seq_gru.evaluate(val_dataset)

     70/Unknown [1m116s[0m 1s/step - bleu_score: 0.0000e+00 - loss: 1.1114

KeyboardInterrupt: 

# Testing

In [None]:
index_to_word ={x:y for x,y in zip(range(len(french_vectorize_layer.get_vocabulary())),
                                   french_vectorize_layer.get_vocabulary())}

NameError: name 'french_vectorize_layer' is not defined

In [None]:
index_to_word

NameError: name 'index_to_word' is not defined

In [None]:
def  translator(english_sentence):
  tokenized_english_sentence = english_vectorize_layer([english_sentence])
  shifted_target = 'starttoken'

  for i in range(FRENCH_SEQUENCE_LENGTH):
     tokenized_shifted_target = french_vectorize_layer([shifted_target])
     output = seq2seq_gru.predict([tokenized_english_sentence, tokenized_shifted_target])
     french_word_index = tf.argmax(output,axis=-1)[0][i].numpy()
     current_word=index_to_word[french_word_index]
     if current_word=="endtoken":
      break
      shifted_target += ' '+current_word
  return shifted_target[11:]

In [None]:
translator("what makes you think that is not true?").shape

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40

AttributeError: 'str' object has no attribute 'shape'

In [None]:
#pull top 5 volcabulary from english
vocab = english_vectorize_layer.get_vocabulary()
print([str(word) for word in vocab[:5]])

['', '[UNK]', 'i', 'you', 'to']


In [None]:
#pull top 5 volcabulary from english
vocab = french_vectorize_layer.get_vocabulary()
print([str(word) for word in vocab[:5]])

['', '[UNK]', 'starttoken', 'endtoken', 'je']


In [None]:
word_to_index={y:x for x, y in zip(range(len(french_vectorize_layer.get_vocabulary())),
                                   french_vectorize_layer.get_vocabulary())}

In [None]:
word_to_index['football']