In [43]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Input, Embedding, Dense, Attention
from tensorflow.keras.models import Model
from tensorflow.keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Concatenate


In [2]:
# Example data
english_sentences = ['hello', 'how are you', 'good morning']
french_sentences = ['bonjour', 'comment ça va', 'bonjour']



In [13]:
def tokenize(sentences):
    tokenizer  = tf.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(sentences)
    return tokenizer, tokenizer.texts_to_sequences(sentences)

In [70]:
eng_tokenizer, eng_sequences = tokenize(english_sentences)

In [71]:
eng_tokenizer

<keras.src.legacy.preprocessing.text.Tokenizer at 0x20c52d41360>

In [72]:
eng_sequences

[[1], [2, 3, 4], [5, 6]]

In [73]:
fr_tokenizer, fr_sequences = tokenize(french_sentences)

In [74]:
fr_sequences

[[1], [2, 3, 4], [1]]

In [75]:
max_eng_len = max(len(seq) for seq in eng_sequences)
max_eng_len

3

In [76]:
max_fr_len = max(len(seq) for seq in fr_sequences)
max_fr_len

3

In [77]:
# Paddding
eng_sequences = pad_sequences(eng_sequences, maxlen=max_eng_len, padding='post')
fr_sequences = pad_sequences(fr_sequences, maxlen=max_fr_len, padding='post')

In [78]:
eng_sequences

array([[1, 0, 0],
       [2, 3, 4],
       [5, 6, 0]])

In [79]:
fr_sequences

array([[1, 0, 0],
       [2, 3, 4],
       [1, 0, 0]])

In [80]:
# Vocabulary sizes
eng_vocab_size = len(eng_tokenizer.word_index)+1
fr_vocab_size = len(fr_tokenizer.word_index)+1

In [81]:
eng_tokenizer.word_index

{'hello': 1, 'how': 2, 'are': 3, 'you': 4, 'good': 5, 'morning': 6}

### Build Encoder-Decoder Model with Attention

In [82]:
# Hyperparameters
embedding_dim = 64
hidden_dim = 128

# Encoder
encoder_inputs = Input(shape=(max_eng_len,))
encoder_embedding = Embedding(input_dim=eng_vocab_size, output_dim=embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(hidden_dim, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]


# Decoder
decoder_inputs = Input(shape=(max_fr_len,))
decoder_embedding = Embedding(input_dim=fr_vocab_size, output_dim=embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(units=hidden_dim, return_sequences=True, return_state=True )
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

# Attention layer
attention=Attention()
context_vector = attention([decoder_outputs, encoder_outputs])

# Concatenate attention output with decoder outputs
concat_outputs = Concatenate(axis=-1)([decoder_outputs, context_vector])

# Dense layer
decoder_dense = Dense(units=fr_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(concat_outputs)

# Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

### Prepare Data for Training

In [83]:
fr_sequences

array([[1, 0, 0],
       [2, 3, 4],
       [1, 0, 0]])

In [84]:
eng_sequences

array([[1, 0, 0],
       [2, 3, 4],
       [5, 6, 0]])

In [85]:
# Target data for decoder (shifts outputs for teacher forcing)
decoder_input_data = fr_sequences[:,:-1]  # Remove last word
decoder_input_data = pad_sequences(decoder_input_data, maxlen=max_fr_len, padding='post')
decoder_input_data

array([[1, 0, 0],
       [2, 3, 0],
       [1, 0, 0]])

In [86]:
decoder_target_data = fr_sequences[:,1:] #Remove first word
decoder_target_data = pad_sequences(decoder_target_data, maxlen=max_fr_len, padding='post')
decoder_input_data

array([[1, 0, 0],
       [2, 3, 0],
       [1, 0, 0]])

In [87]:
# # Reshape target data to match sparse categorical cross-entropy
# decoder_target_data = np.expand_dims(decoder_target_data, -1)
# decoder_target_data

In [None]:
print("Eng sequences shape:", eng_sequences.shape)
print("Decoder input shape:", decoder_input_data.shape)
print("Decoder target shape:", decoder_target_data.shape)

In [89]:
X_train, X_test = train_test_split(eng_sequences, test_size=0.2)
decoder_input_train, decoder_input_test = train_test_split(decoder_input_data, test_size=0.2)
decoder_target_train, decoder_target_test = train_test_split(decoder_target_data, test_size=0.2)


In [90]:
print(X_train.shape, decoder_input_train.shape, decoder_target_train.shape)


(2, 3) (2, 3) (2, 3)


In [91]:
model.fit(
    [X_train, decoder_input_train],
    decoder_target_train,
    batch_size=32,
    epochs=50,
    validation_split=0.2
)

Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 0.0000e+00 - loss: 1.6099 - val_accuracy: 1.0000 - val_loss: 1.5937
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step - accuracy: 1.0000 - loss: 1.5861 - val_accuracy: 1.0000 - val_loss: 1.5820
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step - accuracy: 1.0000 - loss: 1.5619 - val_accuracy: 1.0000 - val_loss: 1.5698
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step - accuracy: 1.0000 - loss: 1.5367 - val_accuracy: 1.0000 - val_loss: 1.5567
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step - accuracy: 1.0000 - loss: 1.5099 - val_accuracy: 1.0000 - val_loss: 1.5425
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step - accuracy: 1.0000 - loss: 1.4807 - val_accuracy: 1.0000 - val_loss: 1.5269
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x20c52e3ea70>

In [127]:
model.layers

[<InputLayer name=input_layer_8, built=True>,
 <InputLayer name=input_layer_9, built=True>,
 <Embedding name=embedding_8, built=True>,
 <Embedding name=embedding_9, built=True>,
 <LSTM name=lstm_7, built=True>,
 <LSTM name=lstm_8, built=True>,
 <Attention name=attention_3, built=True>,
 <Concatenate name=concatenate_1, built=True>,
 <Dense name=dense_1, built=True>]

In [128]:
# Assuming your original model is named `model`
encoder_inputs = model.input[0]  # First input (English sentence)
encoder_lstm = model.get_layer('lstm_7')  # LSTM layer of the encoder
encoder_states = encoder_lstm.output[1:]  # Hidden states (h, c) from the encoder LSTM
encoder_output = encoder_lstm.output[1]
# Create the encoder model
encoder_model = Model(encoder_inputs, encoder_states)


In [129]:
encoder_output

<KerasTensor shape=(None, 128), dtype=float32, sparse=False, name=keras_tensor_42>

In [130]:
# Decoder input for inference (timestep)
decoder_inputs = model.input[1]

decoder_lstm = model.layers[5]  # Decoder LSTM layer
decoder_output = decoder_lstm.output[1]
attention = model.layers[6]  # Attention layer


In [131]:
decoder_output

<KerasTensor shape=(None, 128), dtype=float32, sparse=False, name=keras_tensor_47>

In [125]:
decoder_inputs

<KerasTensor shape=(None, 3), dtype=float32, sparse=None, name=keras_tensor_44>

In [132]:
context_vector = attention([decoder_output, encoder_output])  # or use encoder_states_value


In [133]:
concat_outputs = Concatenate(axis=-1)([decoder_output, context_vector])


In [134]:
decoder_dense = model.layers[8]  # Dense output layer
decoder_output = decoder_dense(concat_outputs)

In [136]:
# Define the decoder model
decoder_model = tf.keras.Model([decoder_inputs] + encoder_states, [concat_outputs, state_h, state_c])

In [147]:
# Example sentence to translate
input_sentence = ['Hello, how are you?']
# Tokenize and pad the input sentence
input_sentence_seq = eng_tokenizer.texts_to_sequences(input_sentence)
input_sentence_padded = pad_sequences(input_sentence_seq, maxlen=max_eng_len, padding='post')
input_sentence_padded

array([[2, 3, 4]])