In [1]:
import tensorflow as tf
print(tf.__version__)


2.17.0


In [2]:
# Simple English to French dataset
data = [("hello", "bonjour"),
        ("how are you", "comment ça va"),
        ("I am fine", "je vais bien"),
        ("what is your name", "comment tu t'appelles"),
        ("my name is", "je m'appelle"),
        ("thank you", "merci"),
        ("goodbye", "au revoir")]


In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np


english_sentences = [pair[0] for pair in data]
french_sentences = [pair[1] for pair in data]


eng_tokenizer = Tokenizer()
fre_tokenizer = Tokenizer()

# Fit tokenizers on the respective sentences
eng_tokenizer.fit_on_texts(english_sentences)
fre_tokenizer.fit_on_texts(french_sentences)

# Convert the texts to sequences of integers
eng_sequences = eng_tokenizer.texts_to_sequences(english_sentences)
fre_sequences = fre_tokenizer.texts_to_sequences(french_sentences)

# Pad sequences to ensure uniform length
max_eng_len = max([len(seq) for seq in eng_sequences])
max_fre_len = max([len(seq) for seq in fre_sequences])

eng_padded = pad_sequences(eng_sequences, maxlen=max_eng_len, padding='post')
fre_padded = pad_sequences(fre_sequences, maxlen=max_fre_len, padding='post')

# Create the input and target data for training
input_data = np.array(eng_padded)
output_data = np.array(fre_padded)


In [4]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

# Define model parameters
embedding_dim = 64
latent_dim = 128

# Encoder
encoder_inputs = Input(shape=(max_eng_len,))
encoder_embedding = Embedding(input_dim=len(eng_tokenizer.word_index) + 1, output_dim=embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

# Decoder
decoder_inputs = Input(shape=(max_fre_len,))
decoder_embedding = Embedding(input_dim=len(fre_tokenizer.word_index) + 1, output_dim=embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])
decoder_dense = Dense(len(fre_tokenizer.word_index) + 1, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Seq2Seq Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [5]:
# Prepare target data for training by shifting the French sentence by one position
fre_target_data = np.zeros_like(output_data)
fre_target_data[:, :-1] = output_data[:, 1:]
fre_target_data[:, -1] = 0

# Train the Seq2Seq model
model.fit([input_data, output_data], fre_target_data, batch_size=16, epochs=100, validation_split=0.2)


Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 0.0667 - loss: 2.6377 - val_accuracy: 0.8333 - val_loss: 2.6156
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.6000 - loss: 2.6225 - val_accuracy: 0.8333 - val_loss: 2.5946
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.5333 - loss: 2.6069 - val_accuracy: 0.8333 - val_loss: 2.5719
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step - accuracy: 0.5333 - loss: 2.5902 - val_accuracy: 0.8333 - val_loss: 2.5467
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step - accuracy: 0.5333 - loss: 2.5718 - val_accuracy: 0.8333 - val_loss: 2.5179
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.5333 - loss: 2.5511 - val_accuracy: 0.8333 - val_loss: 2.4845
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x79cf490c3520>

In [7]:
# Encoder model for inference
encoder_model = Model(encoder_inputs, [state_h, state_c])

# Decoder model for inference
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_inputs_single = Input(shape=(1,))

# Create a new embedding layer for the decoder during inference
decoder_embedding_inference = Embedding(input_dim=len(fre_tokenizer.word_index) + 1, output_dim=embedding_dim)
decoder_inputs_single_x = decoder_embedding_inference(decoder_inputs_single)

# LSTM and Dense layers can be reused from the original model
decoder_outputs_single, state_h_dec, state_c_dec = decoder_lstm(decoder_inputs_single_x, initial_state=[decoder_state_input_h, decoder_state_input_c])
decoder_outputs_single = decoder_dense(decoder_outputs_single)

# Create the inference decoder model
decoder_model = Model([decoder_inputs_single, decoder_state_input_h, decoder_state_input_c],
                      [decoder_outputs_single, state_h_dec, state_c_dec])


In [8]:
def translate_sentence(input_seq):
    # Encode the input as state vectors
    states_value = encoder_model.predict(input_seq)

    # Generate an empty target sequence of length 1
    target_seq = np.zeros((1, 1))

    # Start the target sequence with the start word (usually index 1 or a special <START> token)
    target_seq[0, 0] = fre_tokenizer.word_index['bonjour']

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample the next word
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = None
        for word, index in fre_tokenizer.word_index.items():
            if index == sampled_token_index:
                sampled_word = word
                break

        if sampled_word is None or sampled_word == '<end>':
            stop_condition = True
        else:
            decoded_sentence += ' ' + sampled_word

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence.strip()

# Test the model with a new sentence
test_sentence = "how are you"
test_seq = eng_tokenizer.texts_to_sequences([test_sentence])
test_seq_padded = pad_sequences(test_seq, maxlen=max_eng_len, padding='post')
translation = translate_sentence(test_seq_padded)
print("Translation:", translation)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 159ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 165ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
Translation: ça
