In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Step 2: Load and preprocess the data

In [None]:
# Load dataset (assuming a CSV file with 'english' and 'french' columns).
# Rows missing either side are dropped: pandas reads empty cells as float NaN,
# and the Tokenizer can only process strings.
data = pd.read_csv('english-french.csv').dropna(subset=['english', 'french'])
english_sentences = data['english'].astype(str).values
french_sentences = data['french'].astype(str).values

# Wrap every target sentence in explicit start/end markers. After tokenizing,
# each French sequence starts with '<sos>' and has '<eos>' right after its last
# word, so a decoder trained on these sequences learns where a sentence ends.
french_sentences_marked = ['<sos> ' + sentence + ' <eos>' for sentence in french_sentences]

# Tokenize the sentences
eng_tokenizer = Tokenizer()
eng_tokenizer.fit_on_texts(english_sentences)
eng_vocab_size = len(eng_tokenizer.word_index) + 1  # +1: index 0 is reserved for padding

# The default Keras filter list contains '<' and '>', which would reduce the
# markers to the plain words 'sos' / 'eos'. Use the default list minus those two
# characters so that '<sos>' and '<eos>' survive as vocabulary entries.
french_tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
french_tokenizer.fit_on_texts(french_sentences_marked)
french_vocab_size = len(french_tokenizer.word_index) + 1  # +1: index 0 is reserved for padding

# Convert sentences to sequences of integers
eng_sequences = eng_tokenizer.texts_to_sequences(english_sentences)
french_sequences = french_tokenizer.texts_to_sequences(french_sentences_marked)

# Padding sequences to ensure uniform length (zeros appended after the last token)
max_eng_len = max(len(seq) for seq in eng_sequences)
max_french_len = max(len(seq) for seq in french_sequences)

eng_sequences = pad_sequences(eng_sequences, maxlen=max_eng_len, padding='post')
french_sequences = pad_sequences(french_sequences, maxlen=max_french_len, padding='post')


# Define the Seq2Seq model

In [None]:
# Encoder: embeds the padded English token ids and keeps only the final LSTM
# states (hidden and cell), which summarise the source sentence.
latent_dim = 256
encoder_inputs = Input(shape=(max_eng_len,))
encoder_embedding = Embedding(eng_vocab_size, latent_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
_, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder: starts from the encoder states and emits one vocabulary distribution
# per time step.
# The time dimension is left unspecified (None) on purpose: the same input
# tensor and layers are fed sequences of length max_french_len - 1 during
# training and of length 1 during step-by-step decoding. A fixed length of
# max_french_len matches neither and is rejected as an incompatible input shape.
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(french_vocab_size, latent_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(french_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model: (English ids, French decoder-input ids) -> per-step softmax
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model; the sparse loss takes integer token ids as targets
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

# Print the summary
model.summary()


# Train the model

In [None]:
# Teacher forcing: the decoder reads each French sequence without its final
# position and is asked to predict the same sequence advanced by one step.
french_sequences_input = french_sequences[:, :-1]

# Targets get a trailing axis of size 1 for sparse categorical cross-entropy
french_sequences_output = french_sequences[:, 1:, np.newaxis]

# Fit on (English, shifted French) pairs; the last 20% of rows are held out
model.fit(
    x=[eng_sequences, french_sequences_input],
    y=french_sequences_output,
    batch_size=64,
    epochs=50,
    validation_split=0.2,
)

# Define the inference models for translation

In [None]:
# Encoder for inference: English token ids -> [hidden state, cell state]
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)

# Decoder for inference: the LSTM states become explicit inputs so they can be
# carried from one decoding step to the next by the caller.
decoder_states_inputs = [Input(shape=(latent_dim,)) for _ in range(2)]
decoder_state_input_h, decoder_state_input_c = decoder_states_inputs

# Reuse the trained embedding output, LSTM and Dense layers, but start from the
# externally supplied states instead of the encoder states.
decoder_outputs, *decoder_states = decoder_lstm(
    decoder_embedding,
    initial_state=decoder_states_inputs,
)
state_h, state_c = decoder_states
decoder_outputs = decoder_dense(decoder_outputs)

# (token ids, h, c) -> (per-step softmax, new h, new c)
decoder_model = Model(
    inputs=[decoder_inputs, *decoder_states_inputs],
    outputs=[decoder_outputs, *decoder_states],
)

# Translate function

In [None]:
def translate_sentence(input_sentence):
    """Greedily decode a French translation for one English sentence.

    The sentence is tokenized with ``eng_tokenizer`` and padded to
    ``max_eng_len``, encoded once with ``encoder_model``, and then decoded one
    token at a time with ``decoder_model``, always taking the most probable
    next token.

    Decoding stops when the predicted token has no entry in
    ``french_tokenizer.index_word`` (e.g. the padding index 0), when it is
    ``'<eos>'``, or after ``max_french_len`` steps, whichever comes first.

    Args:
        input_sentence: The English sentence to translate, as a string.

    Returns:
        The predicted French words joined by single spaces, with no leading or
        trailing whitespace. Empty string if decoding stops on the first step.
    """
    # Tokenize and pad the input sentence
    input_seq = eng_tokenizer.texts_to_sequences([input_sentence])
    input_seq = pad_sequences(input_seq, maxlen=max_eng_len, padding='post')

    # Get the encoder states; list() so they can be concatenated with
    # [target_seq] below whether predict returns a list or a tuple.
    # verbose=0 keeps the per-call progress bar out of the notebook output.
    states_value = list(encoder_model.predict(input_seq, verbose=0))

    # Seed the decoder with the '<sos>' id when the French vocabulary has one;
    # otherwise fall back to index 0.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = french_tokenizer.word_index.get('<sos>', 0)

    translated_words = []

    # Bounded loop: without a cap, a model that never predicts a stop token
    # would keep decoding forever.
    for _ in range(max_french_len):
        # Predict next token
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value, verbose=0)

        # Get the token index with highest probability
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = french_tokenizer.index_word.get(sampled_token_index, '')

        if sampled_word == '' or sampled_word == '<eos>':
            break
        translated_words.append(sampled_word)

        # Feed the chosen token and the updated states into the next step
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(translated_words)


In [None]:
# Interactive prompt: translate each entered sentence until the user types
# 'exit' (compared case-insensitively).
while (input_sentence := input("Enter an English sentence (or type 'exit' to quit): ")).lower() != 'exit':
    translation = translate_sentence(input_sentence)
    print(f"French translation: {translation}")