In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Toy English -> French parallel corpus used for demonstration.
# Each entry is a single string: the English phrase, a tab character (written
# as the "\t" escape with a space on either side), then the French phrase.
# The preparation code below splits every entry on that tab.

data = [
    "hello \t bonjour",
    "how are you \t comment ça va",
    "good morning \t bonjour",
    "thank you \t merci",
    "good night \t bonne nuit",
    "goodbye \t au revoir",
    "please \t s'il vous plaît",
    "yes \t oui",
    "no \t non",
    "good evening \t bonsoir",
    "what is your name \t comment vous appelez-vous",
    "nice to meet you \t enchanté",
    "excuse me \t excusez-moi",
    "I'm sorry \t je suis désolé",
    "I love you \t je t'aime",
    "see you soon \t à bientôt",
    "take care \t prends soin de toi",
    "have a good day \t bonne journée",
    "where are you from \t d'où viens-tu",
    "I am fine \t je vais bien",
    "do you speak English \t parlez-vous anglais",
    "I don't understand \t je ne comprends pas",
    "help \t aide",
    "I'm hungry \t j'ai faim",
    "I'm thirsty \t j'ai soif",
    "what time is it \t quelle heure est-il",
]
# Prepare the data: split each "source \t target" entry into its two halves.
# Targets are wrapped in "start"/"end" marker words so the decoder has an
# explicit first input token and an explicit stop token.
pairs = [line.split('\t') for line in data]
input_texts = [source.strip() for source, _ in pairs]
target_texts = [f"start {target.strip()} end" for _, target in pairs]


def _fit_and_pad(texts):
    """Fit a Tokenizer on *texts* and return (tokenizer, padded ids, max length).

    The tokenizer is created with ``filters=''`` and the id sequences are
    right-padded ('post') to the length of the longest sequence.
    """
    tokenizer = Tokenizer(filters='')
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    longest = max(map(len, sequences))
    padded = pad_sequences(sequences, maxlen=longest, padding='post')
    return tokenizer, padded, longest


# Tokenize input and target texts.
# Vocabulary sizes are one larger than the word index so that id 0 (the value
# used for padding) is a valid embedding row as well.
input_tokenizer, input_sequences, max_length_input = _fit_and_pad(input_texts)
input_vocab_size = 1 + len(input_tokenizer.word_index)

target_tokenizer, target_sequences, max_length_target = _fit_and_pad(target_texts)
target_vocab_size = 1 + len(target_tokenizer.word_index)

# Prepare data for decoder (teacher forcing): the decoder reads every token
# except the last one and must predict the same sequence shifted left by one.
decoder_input_data = target_sequences[:, :-1]
# A trailing axis of size 1 is appended to the shifted targets.
decoder_target_data = target_sequences[:, 1:][..., np.newaxis]

# Model parameters
embedding_dim = 64   # size of each token embedding vector
latent_dim = 128     # number of LSTM units (size of the h and c state vectors)

# Encoder model: embeds the padded source ids and runs them through an LSTM.
# Only the final hidden/cell states are kept; they seed the decoder.
encoder_inputs = Input(shape=(max_length_input,))
encoder_embedding = Embedding(input_vocab_size, embedding_dim)(encoder_inputs)
# NOTE: despite its name, `encoder_lstm` holds the LSTM *output tensor*, not
# the layer; it is not used further in this file.
encoder_lstm, state_h, state_c = LSTM(latent_dim, return_state=True)(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder model.
# The time dimension is declared as None (variable length) rather than
# max_length_target - 1: the same decoder_inputs / decoder_embedding tensors
# are reused further down to build the inference decoder, which is fed one
# token at a time (shape (1, 1)). A fixed length here would make those
# inference calls mismatch the declared input shape (a warning or an error,
# depending on the Keras version). Training is unaffected because the
# training batches still have a consistent length.
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(target_vocab_size, embedding_dim)(decoder_inputs)
# The layer objects are kept in variables so the inference decoder can share
# the trained weights.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(target_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Seq2Seq model: (source ids, shifted target ids) -> per-step distribution
# over the target vocabulary.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
# Sparse loss: targets are integer ids (with a trailing axis), not one-hot.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.fit([input_sequences, decoder_input_data], decoder_target_data, batch_size=2, epochs=30, validation_split=0.2)

# Inference models for testing.
# Encoder: source ids -> [h, c] states that initialise the decoder.
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)

# Decoder: takes the previous token(s) plus the current [h, c] states and
# returns the vocabulary distribution together with the updated states, so it
# can be stepped manually. It reuses decoder_lstm / decoder_dense and the
# decoder_embedding tensor from the training graph, hence the trained weights.
decoder_state_input_h, decoder_state_input_c = (
    Input(shape=(latent_dim,)) for _ in range(2)
)
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, *decoder_states = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs
)
state_h, state_c = decoder_states
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(
    inputs=[decoder_inputs, *decoder_states_inputs],
    outputs=[decoder_outputs, *decoder_states],
)

# Translation function
def translate_sequence(input_seq):
    """Greedily decode a translation for one encoded source sentence.

    The source is encoded once with ``encoder_model``; ``decoder_model`` is
    then stepped one token at a time, always feeding back the most probable
    token, until the ``end`` marker or an unknown id (e.g. the padding id 0)
    is produced, or the length cap is reached.

    Args:
        input_seq: A batch containing a single padded source id sequence, as
            accepted by ``encoder_model``.

    Returns:
        The decoded target words joined by single spaces, without the
        ``start``/``end`` markers (an empty string if nothing was decoded).
    """
    # verbose=0 keeps predict() from printing progress output on every single
    # decoding step. Unpacking the two states (instead of list concatenation)
    # works whether predict() returns a list or a tuple.
    state_h, state_c = encoder_model.predict(input_seq, verbose=0)

    # The decoder is primed with the "start" marker.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_tokenizer.word_index['start']

    decoded_words = []
    # Hard cap on the number of emitted words so a model that never predicts
    # "end" cannot loop forever (at most max_length_target + 1 words).
    for _ in range(max_length_target + 1):
        output_tokens, state_h, state_c = decoder_model.predict(
            [target_seq, state_h, state_c], verbose=0
        )
        # Greedy choice: most probable token at the last (only) time step.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = target_tokenizer.index_word.get(sampled_token_index)
        if sampled_word is None or sampled_word == 'end':
            break
        decoded_words.append(sampled_word)
        # Feed the chosen token back in as the next decoder input.
        target_seq[0, 0] = sampled_token_index

    return ' '.join(decoded_words)

# Test translation
test_sentence = "nice"
# Pad on the right ('post'), exactly like the training inputs. The
# pad_sequences default is 'pre', which would place the real tokens at the end
# of the sequence - a layout the encoder never saw during training.
test_sequence = pad_sequences(
    input_tokenizer.texts_to_sequences([test_sentence]),
    maxlen=max_length_input,
    padding='post',
)
translated_sentence = translate_sequence(test_sequence)
print(f"Original sentence: {test_sentence}")
print(f"Translated sentence: {translated_sentence}")


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Original sentence: nice
Translated sentence: bonjour
