In [2]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pickle

# Load dataset
data = pd.read_csv("english_hindi.csv")

# Tokenize English and Hindi words
english_tokenizer = Tokenizer(char_level=False)  # Word-level tokenizer
hindi_tokenizer = Tokenizer(char_level=False)

english_tokenizer.fit_on_texts(data['English'])
hindi_tokenizer.fit_on_texts(data['Hindi'])

# Convert text to sequences
english_sequences = english_tokenizer.texts_to_sequences(data['English'])
hindi_sequences = hindi_tokenizer.texts_to_sequences(data['Hindi'])

# Pad sequences
english_padded = pad_sequences(english_sequences, padding='post')
hindi_padded = pad_sequences(hindi_sequences, padding='post')

# Convert Hindi sequences to class indices
y_data = hindi_padded.argmax(axis=-1)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(english_padded, y_data, test_size=0.2, random_state=42)

# Build the model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(english_tokenizer.word_index) + 1, output_dim=16, input_length=english_padded.shape[1]),
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(len(hindi_tokenizer.word_index) + 1, activation='softmax')  # Output matches Hindi vocabulary size
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=50, validation_data=(X_test, y_test))

# Save the trained model
model.save("english_to_hindi_model.h5")

# Save the tokenizers
with open('english_tokenizer.pkl', 'wb') as f:
    pickle.dump(english_tokenizer, f)

with open('hindi_tokenizer.pkl', 'wb') as f:
    pickle.dump(hindi_tokenizer, f)


Epoch 1/50




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9s/step - accuracy: 0.5000 - loss: 1.9458 - val_accuracy: 1.0000 - val_loss: 1.9370
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 133ms/step - accuracy: 1.0000 - loss: 1.9369 - val_accuracy: 1.0000 - val_loss: 1.9274
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 130ms/step - accuracy: 1.0000 - loss: 1.9272 - val_accuracy: 1.0000 - val_loss: 1.9171
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 139ms/step - accuracy: 1.0000 - loss: 1.9167 - val_accuracy: 1.0000 - val_loss: 1.9061
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step - accuracy: 1.0000 - loss: 1.9055 - val_accuracy: 1.0000 - val_loss: 1.8946
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step - accuracy: 1.0000 - loss: 1.8938 - val_accuracy: 1.0000 - val_loss: 1.8826
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

