In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences 
import numpy as np


# Toy parallel corpus. Each tuple pairs an English sentence with its Tamil
# translation, so the two lists derived below stay aligned index-for-index.
_sentence_pairs = [
    ("Hello, how are you?", "வணக்கம், நீங்கள் எப்படி இருக்கிறீர்கள்?"),
    ("I love deep learning.", "எனக்கு ஆழ்ந்த கற்றல் பிடிக்கும்."),
    ("What is your name?", "உங்கள் பெயர் என்ன?"),
    ("Where do you live?", "நீங்கள் எங்கு வாழ்கிறீர்கள்?"),
]

# Source-side (English) and target-side (Tamil) sentences, in matching order.
english_sentences = [english for english, _ in _sentence_pairs]
tamil_sentences = [tamil for _, tamil in _sentence_pairs]

# Marker words wrapped around every target sentence. Without them the decoder
# has no token to start generating from and no explicit signal for when the
# sentence is finished. They are plain lowercase words because the Tokenizer's
# default filters strip punctuation such as "<" and ">" and lowercase the text.
START_TOKEN = "startseq"
END_TOKEN = "endseq"

# Tokenize English sentences
eng_tokenizer = Tokenizer()
eng_tokenizer.fit_on_texts(english_sentences)
# +1 because word ids start at 1; id 0 is left free for padding.
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_sequences = eng_tokenizer.texts_to_sequences(english_sentences)

# Tokenize Tamil sentences, each wrapped in the start/end markers
tam_texts = [f"{START_TOKEN} {sentence} {END_TOKEN}" for sentence in tamil_sentences]
tam_tokenizer = Tokenizer()
tam_tokenizer.fit_on_texts(tam_texts)
# +1 because word ids start at 1; id 0 is left free for padding.
tam_vocab_size = len(tam_tokenizer.word_index) + 1
tam_sequences = tam_tokenizer.texts_to_sequences(tam_texts)

# Pad sequences to ensure uniform length. One shared length is used for both
# languages; it is the longest sequence on either side (markers included).
max_length = max(
    max(len(seq) for seq in eng_sequences),
    max(len(seq) for seq in tam_sequences),
)
eng_sequences = pad_sequences(eng_sequences, maxlen=max_length, padding='post')
tam_sequences = pad_sequences(tam_sequences, maxlen=max_length, padding='post')

# Split input-output pairs: English ids are the model input, Tamil ids
# (start marker, words, end marker, zero padding) are the target.
X_train = eng_sequences
y_train = tam_sequences

# Define Seq2Seq Model (Encoder-Decoder)

# Seed the Python, NumPy and TensorFlow random generators before any layer is
# created so that weight initialisation is repeatable from run to run.
SEED = 42
keras.utils.set_random_seed(SEED)

embedding_dim = 128  # length of each word-embedding vector
units = 256  # LSTM state size; encoder and decoder must match because the
             # encoder's final states initialise the decoder

# Encoder: embeds the source ids and keeps only the final hidden/cell states.
encoder_inputs = keras.layers.Input(shape=(max_length,))
# mask_zero=True marks id 0 (padding) as masked for the layers downstream.
enc_emb = keras.layers.Embedding(eng_vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = keras.layers.LSTM(units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

# Decoder: starts from the encoder states and emits one vocabulary
# distribution per time step (return_sequences=True).
decoder_inputs = keras.layers.Input(shape=(max_length,))
dec_emb = keras.layers.Embedding(tam_vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = keras.layers.LSTM(units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = keras.layers.Dense(tam_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Compile Model. The sparse loss takes integer word ids as targets, so the
# targets do not need to be one-hot encoded.
model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Teacher forcing: the decoder reads the target sequence without its last
# column and is trained to predict the same sequence without its first column,
# i.e. at every step it predicts the token that follows the one it was given.
decoder_input_data = y_train[:, :-1]
decoder_target_data = y_train[:, 1:]

# Report the array shapes: the encoder input is max_length wide, while the two
# shifted decoder arrays are one column narrower at this point.
for label, array in (
    ("Encoder Input Shape:", X_train),
    ("Decoder Input Shape:", decoder_input_data),
    ("Decoder Target Shape:", decoder_target_data),
):
    print(label, array.shape)

# Re-pad both shifted arrays with trailing zeros so they are max_length wide
# again, which is the width the model's Input layers were declared with.
decoder_input_data, decoder_target_data = (
    pad_sequences(shifted, maxlen=max_length, padding='post')
    for shifted in (decoder_input_data, decoder_target_data)
)

# Train Model
model.fit(
    x=[X_train, decoder_input_data],
    y=decoder_target_data,
    batch_size=64,
    epochs=100,
)




In [None]:
# Function to translate English → Tamil
def translate(sentence, start_token="startseq", end_token="endseq"):
    """Translate an English sentence to Tamil with greedy step-by-step decoding.

    The decoder input is built up one token at a time: the word predicted at
    step ``t`` is written into position ``t + 1`` of the decoder input before
    the next prediction, mirroring the shifted input/target pairs the model
    is trained on. The translated words are returned in the order they were
    predicted.

    Args:
        sentence: English text. Words unknown to ``eng_tokenizer`` are dropped
            by ``texts_to_sequences``.
        start_token: Word placed in the first decoder position. If it is not
            in the Tamil vocabulary, decoding starts from the padding id 0.
        end_token: Word that terminates decoding when predicted. Ignored if it
            is not in the Tamil vocabulary.

    Returns:
        The predicted Tamil words joined by single spaces; an empty string if
        the very first prediction is padding or the end token.
    """
    encoder_input = pad_sequences(
        eng_tokenizer.texts_to_sequences([sentence]),
        maxlen=max_length,
        padding='post',
    )

    target_vocab = tam_tokenizer.word_index
    start_id = target_vocab.get(start_token, 0)
    end_id = target_vocab.get(end_token)  # None when no end marker exists

    # The model expects a decoder input of width max_length; positions that
    # have not been generated yet stay 0 (padding).
    decoder_input = np.zeros((1, max_length), dtype="int32")
    decoder_input[0, 0] = start_id

    translated_words = []
    for step in range(max_length):
        probabilities = model.predict([encoder_input, decoder_input], verbose=0)
        token_id = int(np.argmax(probabilities[0, step]))

        # Id 0 is padding and the end marker closes the sentence: stop at either.
        if token_id == 0 or token_id == end_id:
            break

        # Look the id up directly so words keep their predicted order and
        # repeated words are preserved.
        translated_words.append(tam_tokenizer.index_word[token_id])

        # Feed the prediction back as the next decoder input token.
        if step + 1 < max_length:
            decoder_input[0, step + 1] = token_id

    return ' '.join(translated_words)

In [None]:
# Smoke-test the translator on a short English query and show the result.
sample_query = "what is  name"
print(translate(sample_query))