In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Bidirectional


In [2]:
# Load and preprocess dataset
with open("data/geeta.txt", "r", encoding="utf-8") as file:
    lines = [line.strip() for line in file if line.strip()]


In [3]:
sanskrit_lines = lines[0::2]  # Sanskrit in even indexes
english_lines = lines[1::2]   # English in odd indexes

In [4]:
# Ensure both lists have the same length
min_length = min(len(sanskrit_lines), len(english_lines))
sanskrit_lines = sanskrit_lines[:min_length]
english_lines = english_lines[:min_length]

In [5]:

# Add start and end tokens
def add_tokens(text):
    return 'start_ ' + text.lower() + ' _end'

english_lines = [add_tokens(sent) for sent in english_lines]


In [6]:
# Tokenization
tokenizer_sanskrit = Tokenizer()
tokenizer_english = Tokenizer()

In [7]:
tokenizer_sanskrit.fit_on_texts(sanskrit_lines)
tokenizer_english.fit_on_texts(english_lines)


In [8]:
sanskrit_sequences = tokenizer_sanskrit.texts_to_sequences(sanskrit_lines)
english_sequences = tokenizer_english.texts_to_sequences(english_lines)


In [9]:
# Padding
max_length_sanskrit = max(len(seq) for seq in sanskrit_sequences)
max_length_english = max(len(seq) for seq in english_sequences)

sanskrit_padded = pad_sequences(sanskrit_sequences, maxlen=max_length_sanskrit, padding='post')
english_padded = pad_sequences(english_sequences, maxlen=max_length_english, padding='post')


In [10]:

# Model Architecture
embedding_dim = 256
lstm_units = 512

In [12]:
encoder_inputs = Input(shape=(max_length_sanskrit,))
enc_embedding = Embedding(input_dim=len(tokenizer_sanskrit.word_index)+1, output_dim=embedding_dim)(encoder_inputs)
encoder_lstm = Bidirectional(LSTM(lstm_units, return_state=True))
encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_lstm(enc_embedding)


In [13]:
state_h = tf.keras.layers.Concatenate()([forward_h, backward_h])
state_c = tf.keras.layers.Concatenate()([forward_c, backward_c])


In [14]:
encoder_states = [state_h, state_c]

In [15]:
# Decoder
decoder_inputs = Input(shape=(max_length_english,))
dec_embedding = Embedding(input_dim=len(tokenizer_english.word_index)+1, output_dim=embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(lstm_units * 2, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_embedding, initial_state=encoder_states)
decoder_dense = Dense(len(tokenizer_english.word_index)+1, activation='softmax')
output = decoder_dense(decoder_outputs)

In [None]:
# Define Model
model = Model([encoder_inputs, decoder_inputs], output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])



In [20]:
print("Encoder input shape:", sanskrit_padded.shape)  # Should be (num_samples, max_length_sanskrit)
print("Decoder input shape:", decoder_input_data.shape)  # Should be (num_samples, max_length_english - 1)
print("Decoder target shape:", decoder_target_data.shape)  # Should be (num_samples, max_length_english - 1)

Encoder input shape: (960, 46)
Decoder input shape: (960, 154)
Decoder target shape: (960, 154)


In [21]:
model.summary()

In [25]:
expected_decoder_length = max_length_english - 1  # Should be 154
decoder_input_data = pad_sequences(english_padded[:, :-1], maxlen=expected_decoder_length, padding='post')
decoder_target_data = pad_sequences(english_padded[:, 1:], maxlen=expected_decoder_length, padding='post')

In [26]:
print("Decoder Input Shape:", decoder_input_data.shape)  # Should be (960, 154)
print("Decoder Target Shape:", decoder_target_data.shape)  # Should be (960, 154)

Decoder Input Shape: (960, 154)
Decoder Target Shape: (960, 154)


In [27]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [28]:
decoder_input_data = english_padded[:, :-1]  # Remove last token
decoder_target_data = english_padded[:, 1:]  # Shift left for training labels

# Ensure both have the same shape as max_length_english
expected_decoder_length = max_length_english  

decoder_input_data = pad_sequences(decoder_input_data, maxlen=expected_decoder_length, padding='post')
decoder_target_data = pad_sequences(decoder_target_data, maxlen=expected_decoder_length, padding='post')

print("Encoder Input Shape:", sanskrit_padded.shape)  # Should be (960, 46)
print("Decoder Input Shape:", decoder_input_data.shape)  # Should be (960, 155)
print("Decoder Target Shape:", decoder_target_data.shape)  # Should be (960, 155)

model.fit([sanskrit_padded, decoder_input_data], decoder_target_data, batch_size=64, epochs=50, validation_split=0.2)

Encoder Input Shape: (960, 46)
Decoder Input Shape: (960, 155)
Decoder Target Shape: (960, 155)
Epoch 1/50


2025-02-09 22:35:25.985308: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 530ms/step - accuracy: 0.5980 - loss: 4.6653 - val_accuracy: 0.8156 - val_loss: 1.5294
Epoch 2/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 505ms/step - accuracy: 0.7980 - loss: 1.6150 - val_accuracy: 0.8256 - val_loss: 1.2672
Epoch 3/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 510ms/step - accuracy: 0.8111 - loss: 1.3048 - val_accuracy: 0.8280 - val_loss: 1.1735
Epoch 4/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 511ms/step - accuracy: 0.8168 - loss: 1.2035 - val_accuracy: 0.8336 - val_loss: 1.1588
Epoch 5/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 516ms/step - accuracy: 0.8111 - loss: 1.2261 - val_accuracy: 0.8324 - val_loss: 1.1527
Epoch 6/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 509ms/step - accuracy: 0.8163 - loss: 1.1939 - val_accuracy: 0.8344 - val_loss: 1.1520
Epoch 7/50
[1m12/12[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x321783f20>

In [29]:
# Translation Function
def translate_sentence(input_text):
    input_seq = tokenizer_sanskrit.texts_to_sequences([input_text])
    input_padded = pad_sequences(input_seq, maxlen=max_length_sanskrit, padding='post')
    
    prediction = model.predict([input_padded, english_padded[:1]])
    predicted_indices = tf.argmax(prediction[0], axis=-1).numpy()
    translated_words = [tokenizer_english.index_word.get(idx, '') for idx in predicted_indices]
    return ' '.join(translated_words)

In [30]:
# Example Translation
example_sentence = "धर्मक्षेत्रे कुरुक्षेत्रे"
print("Translated:", translate_sentence(example_sentence))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Translated: the said o king of the the the son of the kurus the and the the the the the and in great in great end of gavalgana came end to the sun of the end the the the end each presence of the celestial of  end were like the side with was great of the kurus with was the by                                                                                              


In [31]:
# Example Translation
example_sentence = "धर्मक्षेत्रे कुरुक्षेत्रे"
print("Translated:", translate_sentence(example_sentence))
print("Expected:", "start_ in the land of righteousness, in the land of the kurus _end")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
Translated: the said o king of the the the son of the kurus the and the the the the the and in great in great end of gavalgana came end to the sun of the end the the the end each presence of the celestial of  end were like the side with was great of the kurus with was the by                                                                                              
Expected: start_ in the land of righteousness, in the land of the kurus _end


In [32]:
example_sentence = "यदा यदा हि धर्मस्य ग्लानिर्भवति भारत अभ्युत्थानमधर्मस्य तदात्मानं सृजाम्यहम् "
print("Translated:", translate_sentence(example_sentence))
print("Expected:", "start_ whenever righteousness declines and unrighteousness increases, at that time I manifest myself _end")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 127ms/step
Translated: the said o king of the the the son of the kurus the and the the the the the and in great in great end of gavalgana came end to the enemy of the end the the the end each celestial of the celestial of the end was like the side with was great of the kurus with was the by                                                                                              
Expected: start_ whenever righteousness declines and unrighteousness increases, at that time I manifest myself _end


In [None]:
#new model to check if it can be trained
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

# Load and preprocess dataset
with open("data/geeta.txt", "r", encoding="utf-8") as file:
    lines = [line.strip() for line in file if line.strip()]

sanskrit_lines = lines[0::2]  # Sanskrit in even indexes
english_lines = lines[1::2]   # English in odd indexes

# Ensure both lists have the same length
min_length = min(len(sanskrit_lines), len(english_lines))
sanskrit_lines = sanskrit_lines[:min_length]
english_lines = english_lines[:min_length]

# Add start and end tokens
def add_tokens(text):
    return 'start_ ' + text.lower() + ' _end'

english_lines = [add_tokens(sent) for sent in english_lines]

# Tokenization
tokenizer_sanskrit = Tokenizer()
tokenizer_english = Tokenizer()

tokenizer_sanskrit.fit_on_texts(sanskrit_lines)
tokenizer_english.fit_on_texts(english_lines)

sanskrit_sequences = tokenizer_sanskrit.texts_to_sequences(sanskrit_lines)
english_sequences = tokenizer_english.texts_to_sequences(english_lines)

# Padding
max_length_sanskrit = max(len(seq) for seq in sanskrit_sequences)
max_length_english = max(len(seq) for seq in english_sequences)

sanskrit_padded = pad_sequences(sanskrit_sequences, maxlen=max_length_sanskrit, padding='post')
english_padded = pad_sequences(english_sequences, maxlen=max_length_english, padding='post')

# Prepare decoder input and target data
decoder_input_data = english_padded[:, :-1]  # Remove last token
decoder_target_data = english_padded[:, 1:]  # Remove first token

# Model Parameters
embedding_dim = 256
lstm_units = 512

# Encoder
encoder_inputs = Input(shape=(max_length_sanskrit,))
enc_embedding = Embedding(input_dim=len(tokenizer_sanskrit.word_index)+1, output_dim=embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(lstm_units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
dec_embedding = Embedding(input_dim=len(tokenizer_english.word_index)+1, output_dim=embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_embedding, initial_state=encoder_states)
decoder_dense = Dense(len(tokenizer_english.word_index)+1, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the Model
model.fit(
    [sanskrit_padded, decoder_input_data],
    decoder_target_data,
    batch_size=64,
    epochs=50,
    validation_split=0.2
)


Epoch 1/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 615ms/step - accuracy: 0.6026 - loss: 5.4808 - val_accuracy: 0.8082 - val_loss: 1.6695
Epoch 2/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 594ms/step - accuracy: 0.7944 - loss: 1.8054 - val_accuracy: 0.8202 - val_loss: 1.5288
Epoch 3/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 543ms/step - accuracy: 0.7978 - loss: 1.6999 - val_accuracy: 0.8152 - val_loss: 1.3815
Epoch 4/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 512ms/step - accuracy: 0.8006 - loss: 1.4757 - val_accuracy: 0.8191 - val_loss: 1.2346
Epoch 5/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 564ms/step - accuracy: 0.8012 - loss: 1.3297 - val_accuracy: 0.8232 - val_loss: 1.1556
Epoch 6/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 573ms/step - accuracy: 0.8069 - loss: 1.2541 - val_accuracy: 0.8247 - val_loss: 1.1492
Epoch 7/50
[1m12/12[0m [

KeyError: 'start_'

In [62]:
def translate_sentence(input_text):
    # Convert the input text to sequence and pad it
    input_seq = tokenizer_sanskrit.texts_to_sequences([input_text])
    input_padded = pad_sequences(input_seq, maxlen=max_length_sanskrit, padding='post')

    # Initialize the decoder input with the correct start token ('start' not 'start_')
    decoder_input = np.array([[tokenizer_english.word_index['start']]])
    translated_sentence = []

    # Generate tokens one by one up to the maximum length
    for _ in range(max_length_english):
        predictions = model.predict([input_padded, decoder_input], verbose=0)
        predicted_id = np.argmax(predictions[0, -1, :])

        # Break if the predicted token is the end token ('end' not '_end')
        if predicted_id == tokenizer_english.word_index['end']:
            break

        # Retrieve the predicted word
        predicted_word = tokenizer_english.index_word.get(predicted_id, '')
        translated_sentence.append(predicted_word)

        # Append the predicted token to the decoder input
        decoder_input = np.concatenate([decoder_input, np.array([[predicted_id]])], axis=1)

    return ' '.join(translated_sentence)



In [63]:

# Example Translation
example_sentence = "धर्मक्षेत्रे कुरुक्षेत्रे समवेता युयुत्सवः"
print("Translated:", translate_sentence(example_sentence))

Translated: the great one said o partha o partha o partha o son of kunti o son of kunti bhishma of the field of the kurus and the celestial


In [64]:

# Example Translation
example_sentence = "यस्मिन् सत्यं च मेधा च नीतिश्च भरतर्षभे। अप्रमेयाणि दुर्धर्षे कथं स निहतो युधि"
print("Translated:", translate_sentence(example_sentence))

Translated: the great one said o partha o partha o partha o son of kunti o son of kunti bhishma of the field of the kurus and the celestial
