In [34]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [50]:
english_sentences = [
    "hello",
    "how are you"
]

hindi_sentences = [
    "नमस्ते",
    "आप कैसे हैं"
]

# Add start and end tokens for decoder
hindi_sentences = ["<start> " + s + " <end>" for s in hindi_sentences]



In [51]:
## Preprocess Text (Tokenization and Padding)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Tokenize English
eng_tokenizer = Tokenizer()
eng_tokenizer.fit_on_texts(english_sentences)
eng_sequences = eng_tokenizer.texts_to_sequences(english_sentences)
max_eng_len = max(len(seq) for seq in eng_sequences)
eng_padded = pad_sequences(eng_sequences, maxlen=max_eng_len, padding='post')

# Tokenize Hindi
hin_tokenizer = Tokenizer()
hin_tokenizer.fit_on_texts(hindi_sentences)
hin_sequences = hin_tokenizer.texts_to_sequences(hindi_sentences)
max_hin_len = max(len(seq) for seq in hin_sequences)
hin_padded = pad_sequences(hin_sequences, maxlen=max_hin_len, padding='post')
print('hin_padded: ',hin_padded)

# Prepare decoder input and output
decoder_input_data = hin_padded[:, :-1]
decoder_target_data = hin_padded[:, 1:]
print('decoder_input_data: ',decoder_input_data)
print('decoder_target_data: ',decoder_target_data)

# Reshape target to 3D
decoder_target_data = np.expand_dims(decoder_target_data, -1)
print('decoder_target_data2: ',decoder_target_data)

# Vocabulary sizes
eng_vocab_size = len(eng_tokenizer.word_index) + 1
hin_vocab_size = len(hin_tokenizer.word_index) + 1


hin_padded:  [[1 3 2 0 0]
 [1 4 5 6 2]]
decoder_input_data:  [[1 3 2 0]
 [1 4 5 6]]
decoder_target_data:  [[3 2 0 0]
 [4 5 6 2]]
decoder_target_data2:  [[[3]
  [2]
  [0]
  [0]]

 [[4]
  [5]
  [6]
  [2]]]


In [37]:
print(hin_tokenizer.word_index.keys())


dict_keys(['start', 'end', 'हैं', 'आप', 'है', 'क्या', 'रहे', 'मैं', 'हूँ', 'आपका', 'जा', 'वह', 'एक', 'नमस्ते', 'कैसे', 'ठीक', 'नाम', 'धन्यवाद', 'सुप्रभात', 'शुभ\u202fरात्रि', 'जल्दी', 'मिलते', 'कहाँ', 'यहाँ', 'आओ', 'तुमसे', 'प्यार', 'करता', 'दिन', 'शुभ', 'हो', 'करते', 'मुझे', 'मदद', 'चाहिए', 'अंग्रेजी', 'बोल', 'सकते', 'यह', 'मेरा', 'मित्र', 'हम', 'घर', 'डॉक्टर', 'छात्र', 'वे', 'फुटबॉल', 'खेल'])


In [38]:
## Build Encoder-Decoder Model

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

latent_dim = 256

# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(eng_vocab_size, latent_dim)(encoder_inputs)
_, state_h, state_c = LSTM(latent_dim, return_state=True)(enc_emb)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(hin_vocab_size, latent_dim)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(hin_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()


In [39]:
## Train the Model

model.fit([eng_padded, decoder_input_data], decoder_target_data, batch_size=2, epochs=500)


Epoch 1/500
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - loss: 3.7160
Epoch 2/500
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 2.5814
Epoch 3/500
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 2.2926
Epoch 4/500
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 2.1016
Epoch 5/500
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 2.0160
Epoch 6/500
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 1.9395
Epoch 7/500
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 1.8555
Epoch 8/500
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 1.7928
Epoch 9/500
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 1.7337
Epoch 10/500
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 

<keras.src.callbacks.history.History at 0x165eb068450>

In [41]:
## Inference Models

# Encoder model for inference
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder model for inference
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb_inf = dec_emb_layer(decoder_inputs)
decoder_outputs_inf, state_h_inf, state_c_inf = decoder_lstm(dec_emb_inf, initial_state=decoder_states_inputs)
decoder_states_inf = [state_h_inf, state_c_inf]
decoder_outputs_inf = decoder_dense(decoder_outputs_inf)

decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs_inf] + decoder_states_inf)


In [42]:
## Translate Function

reverse_hin_index = {i: w for w, i in hin_tokenizer.word_index.items()}

def translate(sentence):
    seq = eng_tokenizer.texts_to_sequences([sentence])
    seq = pad_sequences(seq, maxlen=max_eng_len, padding='post')
    enc_h, enc_c = encoder_model.predict(seq)
    states = [enc_h, enc_c]

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = hin_tokenizer.word_index["start"]

    stop = False
    translated = []

    while not stop:
        output_tokens, h, c = decoder_model.predict([target_seq] + states)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_hin_index.get(sampled_token_index, '')

        if sampled_word == "end" or len(translated) > max_hin_len:
            stop = True
        elif sampled_word != "start":
            translated.append(sampled_word)

        target_seq[0, 0] = sampled_token_index
        states = [h, c]

    return ' '.join(translated)


In [46]:
print("English:", "how are you")
print("Hindi:", translate("how are you"))


English: how are you
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
Hindi: आप कैसे हैं


In [44]:
import pickle

# Save tokenizers
with open('eng_tokenizer.pkl', 'wb') as f:
    pickle.dump(eng_tokenizer, f)
with open('hin_tokenizer.pkl', 'wb') as f:
    pickle.dump(hin_tokenizer, f)

# Save models
encoder_model.save('encoder_model.h5')
decoder_model.save('decoder_model.h5')



In [45]:
with open("seq_lengths.pkl", "wb") as f:
    pickle.dump((max_eng_len, max_hin_len), f)