In [2]:
import pandas as pd
from google.colab import drive
import os
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
import numpy as np

In [3]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
folder_path = "/content/drive/MyDrive/Translation"
os.chdir(folder_path)

In [None]:
df = pd.read_excel('data.xlsx')
df.head()

Unnamed: 0,Sanskrit Word,English Meanings
0,Abhanavarana:,"['Screening the outshining Bragman', ' one of ..."
1,Abhasa:,"['Reflection', ' appearance', ' semblance', ' ..."
2,Abhasamatra:,['In name only.']
3,Abhasavada:,['Doctrine holding that all creation is reflec...
4,Abhati:,"['Shines', ' illumines.']"


In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
df['Sanskrit Tokenized'] = df['Sanskrit Word'].apply(nltk.word_tokenize)

In [None]:
import spacy

In [None]:
nlp_english = spacy.load("en_core_web_sm")

In [None]:
def tokenize_english_sentence(english_sentence):
    english_doc = nlp_english(english_sentence)
    return [token.text for token in english_doc]

df['English Tokenized'] = df['English Meanings'].apply(tokenize_english_sentence)

In [None]:
from gensim.models import Word2Vec

In [None]:
# Train or load Word2Vec model on the tokenized words for Sanskrit
word2vec_model_sanskrit = Word2Vec(sentences=df['Sanskrit Tokenized'], vector_size=100, window=5, min_count=1, workers=4)

# Train or load Word2Vec model on the tokenized words for English
word2vec_model_english = Word2Vec(sentences=df['English Tokenized'], vector_size=100, window=5, min_count=1, workers=4)

# Convert tokenized words to word embeddings for Sanskrit
df['Sanskrit Embeddings'] = df['Sanskrit Tokenized'].apply(lambda tokens: [word2vec_model_sanskrit.wv[word] for word in tokens])

# Convert tokenized words to word embeddings for English
df['English Embeddings'] = df['English Tokenized'].apply(lambda tokens: [word2vec_model_english.wv[word] for word in tokens])

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Create a tokenizer for Sanskrit words
sanskrit_tokenizer = Tokenizer(oov_token='<OOV>')
sanskrit_tokenizer.fit_on_texts(df['Sanskrit Tokenized'])

# Create a tokenizer for English words
english_tokenizer = Tokenizer(oov_token='<OOV>')
english_tokenizer.fit_on_texts(df['English Tokenized'])

In [None]:
# Convert tokenized Sanskrit sequences to index sequences
sanskrit_sequences = sanskrit_tokenizer.texts_to_sequences(df['Sanskrit Tokenized'])

# Convert tokenized English sequences to index sequences
english_sequences = english_tokenizer.texts_to_sequences(df['English Tokenized'])

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define the maximum sequence lengths (input and output)
max_sanskrit_length = 20
max_english_length = 25

# Pad or truncate the sequences to the maximum lengths
padded_sanskrit_sequences = pad_sequences(sanskrit_sequences, maxlen=max_sanskrit_length, padding='post', truncating='post')
padded_english_sequences = pad_sequences(english_sequences, maxlen=max_english_length, padding='post', truncating='post')

In [None]:
# Input will be the padded Sanskrit sequences
encoder_input_data = padded_sanskrit_sequences

# Output will be the padded English sequences shifted by one time step (teacher forcing)
decoder_input_data = padded_english_sequences[:, :-1]
decoder_output_data = padded_english_sequences[:, 1:]

# The first time step of the output is not used (remove the first token from each sequence)
# The last token in each output sequence is <PAD> due to padding, and it won't have a corresponding input token
# To maintain consistency, the last token in each output sequence is also removed

In [None]:
# Define the input shape for the model
input_shape = (max_sanskrit_length,)

# Define the size of the vocabulary for both Sanskrit and English
sanskrit_vocab_size = len(sanskrit_tokenizer.word_index) + 1
english_vocab_size = len(english_tokenizer.word_index) + 1

# Define the embedding dimension
embedding_dim = 100

# Encoder
encoder_input = Input(shape=input_shape)
encoder_embedding = Embedding(input_dim=sanskrit_vocab_size, output_dim=embedding_dim)(encoder_input)
encoder_lstm = LSTM(300)(encoder_embedding)  # Adjust the number of LSTM units as needed

# Decoder
decoder_input = Input(shape=(max_english_length - 1,))
decoder_embedding = Embedding(input_dim=english_vocab_size, output_dim=embedding_dim)(decoder_input)
decoder_lstm = LSTM(300, return_sequences=True)(decoder_embedding, initial_state=[encoder_lstm, encoder_lstm])
decoder_output = Dense(english_vocab_size, activation='softmax')(decoder_lstm)

# Model
model = Model(inputs=[encoder_input, decoder_input], outputs=decoder_output)

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print the summary of the model
model.summary()


Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 input_6 (InputLayer)           [(None, 24)]         0           []                               
                                                                                                  
 embedding_4 (Embedding)        (None, 20, 100)      248600      ['input_5[0][0]']                
                                                                                                  
 embedding_5 (Embedding)        (None, 24, 100)      384900      ['input_6[0][0]']                
                                                                                            

In [None]:
# Train the model using provided data
model.fit([encoder_input_data, decoder_input_data], decoder_output_data, batch_size=32, epochs=80, validation_split=0.1)

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


<keras.callbacks.History at 0x7a3128175a80>

In [None]:
# New Sanskrit word for translation
new_sanskrit_word = "Abhimani:"

# Tokenize and pad the new Sanskrit word
new_sanskrit_sequences = sanskrit_tokenizer.texts_to_sequences([new_sanskrit_word])
padded_new_sanskrit_sequences = pad_sequences(new_sanskrit_sequences, maxlen=max_sanskrit_length, padding='post', truncating='post')

# Perform inference for translation
encoder_input_for_translation = padded_new_sanskrit_sequences
decoder_input_for_translation = np.zeros((1, max_english_length - 1))  # Initialize decoder input with zeros
start_token_index = 1  # Default to index 1 as the start token index
if '<start>' in english_tokenizer.word_index:
    start_token_index = english_tokenizer.word_index['<start>']
decoder_input_for_translation[:, 0] = start_token_index  # Set the start token index
translated_word_sequence = []  # To store the translated word sequence

max_translation_length = 20  # Set a maximum translation length (adjust as needed)

for i in range(max_translation_length - 1):  # Loop until the maximum translation length
    predictions = model.predict([encoder_input_for_translation, decoder_input_for_translation])
    next_word_index = np.argmax(predictions[:, i, :], axis=-1)  # Get the index of the next word in the sequence
    translated_word_sequence.append(int(next_word_index))  # Convert numpy int to Python int
    decoder_input_for_translation[:, i+1] = next_word_index  # Update decoder input for the next time step

    if next_word_index == 0:  # Check if the predicted word is the padding token (index 0)
        break

# Convert the translated_word_sequence back to English words
translated_english_words = english_tokenizer.sequences_to_texts([translated_word_sequence])[0]

print("Sanskrit Word:", new_sanskrit_word)
print("Translated English Meaning:", translated_english_words)


Sanskrit Word: Abhimani:
Translated English Meaning: ' the state of being an experiencer or enjoyer . ' ] <OOV>
