In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Bidirectional, AdditiveAttention

# Load and preprocess dataset
with open("data/geeta.txt", "r", encoding="utf-8") as file:
    lines = [line.strip() for line in file if line.strip()]

sanskrit_lines = lines[0::2]  # Sanskrit in even indexes
english_lines = lines[1::2]   # English in odd indexes

# Ensure both lists have the same length
min_length = min(len(sanskrit_lines), len(english_lines))
sanskrit_lines = sanskrit_lines[:min_length]
english_lines = english_lines[:min_length]

# Add start and end tokens
def add_tokens(text):
    return 'start_ ' + text.lower() + ' _end'

english_lines = [add_tokens(sent) for sent in english_lines]

# Tokenization
tokenizer_sanskrit = Tokenizer()
tokenizer_english = Tokenizer()

tokenizer_sanskrit.fit_on_texts(sanskrit_lines)
tokenizer_english.fit_on_texts(english_lines)

sanskrit_sequences = tokenizer_sanskrit.texts_to_sequences(sanskrit_lines)
english_sequences = tokenizer_english.texts_to_sequences(english_lines)

# Padding
max_length_sanskrit = max(len(seq) for seq in sanskrit_sequences)
max_length_english = max(len(seq) for seq in english_sequences)

sanskrit_padded = pad_sequences(sanskrit_sequences, maxlen=max_length_sanskrit, padding='post')
english_padded = pad_sequences(english_sequences, maxlen=max_length_english, padding='post')

# Prepare decoder input and target data
decoder_input_data = english_padded[:, :-1]  # Remove last token
decoder_target_data = english_padded[:, 1:]  # Remove first token

# Model Parameters
embedding_dim = 256
lstm_units = 512

# Encoder
encoder_inputs = Input(shape=(max_length_sanskrit,))
enc_embedding = Embedding(input_dim=len(tokenizer_sanskrit.word_index)+1, output_dim=embedding_dim)(encoder_inputs)
encoder_lstm = Bidirectional(LSTM(lstm_units, return_sequences=True, return_state=True))
encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_lstm(enc_embedding)

state_h = tf.keras.layers.Concatenate()([forward_h, backward_h])
state_c = tf.keras.layers.Concatenate()([forward_c, backward_c])

encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
dec_embedding = Embedding(input_dim=len(tokenizer_english.word_index)+1, output_dim=embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(lstm_units * 2, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_embedding, initial_state=encoder_states)

# Attention Layer
attention = AdditiveAttention()
attention_result = attention([decoder_outputs, encoder_outputs])
decoder_combined_context = tf.keras.layers.Concatenate(axis=-1)([decoder_outputs, attention_result])

decoder_dense = Dense(len(tokenizer_english.word_index)+1, activation='softmax')
decoder_outputs = decoder_dense(decoder_combined_context)

# Define the Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the Model
model.fit(
    [sanskrit_padded, decoder_input_data],
    decoder_target_data,
    batch_size=64,
    epochs=50,
    validation_split=0.2
)

Epoch 1/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 2s/step - accuracy: 0.6036 - loss: 4.6900 - val_accuracy: 0.8236 - val_loss: 1.6107
Epoch 2/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 1s/step - accuracy: 0.8059 - loss: 1.5804 - val_accuracy: 0.8278 - val_loss: 1.3524
Epoch 3/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 1s/step - accuracy: 0.8080 - loss: 1.3227 - val_accuracy: 0.8250 - val_loss: 1.2373
Epoch 4/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 1s/step - accuracy: 0.8123 - loss: 1.2142 - val_accuracy: 0.8307 - val_loss: 1.2111
Epoch 5/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 1s/step - accuracy: 0.8147 - loss: 1.2191 - val_accuracy: 0.8302 - val_loss: 1.1917
Epoch 6/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 1s/step - accuracy: 0.8171 - loss: 1.1985 - val_accuracy: 0.8331 - val_loss: 1.1889
Epoch 7/50
[1m12/12[0m [32m━━━━━━━━━━

KeyError: 'start_'

In [None]:
def translate_sentence(input_text):
    # Convert the input text to sequence and pad it
    input_seq = tokenizer_sanskrit.texts_to_sequences([input_text])
    input_padded = pad_sequences(input_seq, maxlen=max_length_sanskrit, padding='post')

    # Initialize the decoder input with the correct start token ('start' not 'start_')
    decoder_input = np.array([[tokenizer_english.word_index['start']]])
    translated_sentence = []

    # Generate tokens one by one up to the maximum length
    for _ in range(max_length_english):
        predictions = model.predict([input_padded, decoder_input], verbose=0)
        predicted_id = np.argmax(predictions[0, -1, :])

        # Break if the predicted token is the end token ('end' not '_end')
        if predicted_id == tokenizer_english.word_index['end']:
            break

        # Retrieve the predicted word
        predicted_word = tokenizer_english.index_word.get(predicted_id, '')
        translated_sentence.append(predicted_word)

        # Append the predicted token to the decoder input
        decoder_input = np.concatenate([decoder_input, np.array([[predicted_id]])], axis=1)

    return ' '.join(translated_sentence)



Translated: others offer few age age achieve achieve difficulties restraint destroyed degrade honour and pain and humility and humility and egoism achieved achieved achieved achieved proper expedients expedients in the correct conclusion of the effulgence of the rik saman and yajus yajus self


In [66]:

# Example Translation
example_sentence = "धर्मक्षेत्रे कुरुक्षेत्रे समवेता युयुत्सवः"
print("Translated:", translate_sentence(example_sentence))

Translated: others offer few age age achieve achieve difficulties restraint destroyed degrade honour and pain and humility and humility and egoism achieved achieved achieved achieved proper expedients expedients in the correct conclusion of the effulgence of the rik saman and yajus yajus self
