In [114]:
import pandas as pd

#Filtering dataset and arranging into columns of Sanskrit and English

with open("data/geeta.txt", "r", encoding="utf-8") as file:
    lines = [line.strip() for line in file if line.strip()]  

# Separate Sanskrit and English lines
sanskrit_lines = lines[0::2]  # Sanskrit in even indexes
english_lines = lines[1::2]   # English in odd indexes

# Ensure both lists have the same length

min_length = min(len(sanskrit_lines), len(english_lines))
sanskrit_lines = sanskrit_lines[:min_length]
english_lines = english_lines[:min_length]

# Create DataFrame
df = pd.DataFrame({"Sanskrit": sanskrit_lines, "English": english_lines})

df.head(5)

Unnamed: 0,Sanskrit,English
0,वैशम्पायन उवाच अथ गावल्गणिविद्वान् संयुगादेत्य...,"Vaishampayana said O descendant of Bharata, po..."
1,संजय उवाच संजयोऽहं महाराज नमस्ते भरतर्षभ। हतो ...,"Sanjaya said O great king, I am Sanjaya , O fo..."
2,ककुदं सर्वयोधानां धाम सर्वधनुष्मताम्। शरतल्पगत...,"The foremost of all warriors, that prowess per..."
3,यस्य वीर्यं समाश्रित्य द्यूतं पुत्रस्तवाकरोत्।...,"O king, relaying on whose energy, your son pla..."
4,यः सर्वान् पृथिवीपालान् समवेतान् महामृधे। जिगा...,That mighty car-warrior who on a single car ha...


In [115]:
df.tail(5)

Unnamed: 0,Sanskrit,English
955,संजय उवाच इत्यहं वासुदेवस्य पार्थस्य च महात्मन...,"Sanjay said O king, I heard this wonderful and..."
956,व्यासप्रसादाच्छ्रुतवानेतद् गुह्यमहं परम्। योगय...,"Through the favour of Vyasa, I myself heard th..."
957,राजन् संस्मृत्य संस्मृत्य संवादमिममद्भुतम्। के...,"O king, I am feeling more and more pleasure as..."
958,तच्च संस्मृत्य संस्मृत्य रूपमत्यद्भुतं हरेः। व...,"O king, I am feeling more and more pleasure as..."
959,यत्र योगेश्वरः कृष्णो यत्र पार्थो धनुर्धरः। तत...,Wherever exist the Lord of Yoga Srikrishna and...


In [116]:
#Lowecasing sentences of english so that it can be used for training
df['English'] = df['English'].str.lower()

In [117]:
df.head(4)

Unnamed: 0,Sanskrit,English
0,वैशम्पायन उवाच अथ गावल्गणिविद्वान् संयुगादेत्य...,"vaishampayana said o descendant of bharata, po..."
1,संजय उवाच संजयोऽहं महाराज नमस्ते भरतर्षभ। हतो ...,"sanjaya said o great king, i am sanjaya , o fo..."
2,ककुदं सर्वयोधानां धाम सर्वधनुष्मताम्। शरतल्पगत...,"the foremost of all warriors, that prowess per..."
3,यस्य वीर्यं समाश्रित्य द्यूतं पुत्रस्तवाकरोत्।...,"o king, relaying on whose energy, your son pla..."


In [118]:
#For encoder and decoder model we need to add start and end tokens to the target sentences (sanskrit)
df['English'] = df['English'].apply(lambda x: 'start_ ' + x + ' _end')


In [119]:
df.head(4)

Unnamed: 0,Sanskrit,English
0,वैशम्पायन उवाच अथ गावल्गणिविद्वान् संयुगादेत्य...,start_ vaishampayana said o descendant of bhar...
1,संजय उवाच संजयोऽहं महाराज नमस्ते भरतर्षभ। हतो ...,"start_ sanjaya said o great king, i am sanjaya..."
2,ककुदं सर्वयोधानां धाम सर्वधनुष्मताम्। शरतल्पगत...,"start_ the foremost of all warriors, that prow..."
3,यस्य वीर्यं समाश्रित्य द्यूतं पुत्रस्तवाकरोत्।...,"start_ o king, relaying on whose energy, your ..."


In [120]:
#Get the english and sanskrit vocabulary
all_eng_words=set()
for eng in df['English']:
    for word in eng.split():
        if word not in all_eng_words:
            all_eng_words.add(word)

all_san_words=set()
for sanskrit in df['Sanskrit']:
    for word in sanskrit.split():
        if word not in all_san_words:
            all_san_words.add(word)
print(len(all_eng_words))
print(len(all_san_words))

4926
6305


In [121]:
df.head(1000)

Unnamed: 0,Sanskrit,English
0,वैशम्पायन उवाच अथ गावल्गणिविद्वान् संयुगादेत्य...,start_ vaishampayana said o descendant of bhar...
1,संजय उवाच संजयोऽहं महाराज नमस्ते भरतर्षभ। हतो ...,"start_ sanjaya said o great king, i am sanjaya..."
2,ककुदं सर्वयोधानां धाम सर्वधनुष्मताम्। शरतल्पगत...,"start_ the foremost of all warriors, that prow..."
3,यस्य वीर्यं समाश्रित्य द्यूतं पुत्रस्तवाकरोत्।...,"start_ o king, relaying on whose energy, your ..."
4,यः सर्वान् पृथिवीपालान् समवेतान् महामृधे। जिगा...,start_ that mighty car-warrior who on a single...
...,...,...
955,संजय उवाच इत्यहं वासुदेवस्य पार्थस्य च महात्मन...,"start_ sanjay said o king, i heard this wonder..."
956,व्यासप्रसादाच्छ्रुतवानेतद् गुह्यमहं परम्। योगय...,"start_ through the favour of vyasa, i myself h..."
957,राजन् संस्मृत्य संस्मृत्य संवादमिममद्भुतम्। के...,"start_ o king, i am feeling more and more plea..."
958,तच्च संस्मृत्य संस्मृत्य रूपमत्यद्भुतं हरेः। व...,"start_ o king, i am feeling more and more plea..."


In [122]:
df['length_san_sentence']=df['Sanskrit'].apply(lambda x:len(x.split(" ")))
df['length_eng_sentence']=df['English'].apply(lambda x:len(x.split(" ")))

In [123]:
df.head(4)

Unnamed: 0,Sanskrit,English,length_san_sentence,length_eng_sentence
0,वैशम्पायन उवाच अथ गावल्गणिविद्वान् संयुगादेत्य...,start_ vaishampayana said o descendant of bhar...,18,62
1,संजय उवाच संजयोऽहं महाराज नमस्ते भरतर्षभ। हतो ...,"start_ sanjaya said o great king, i am sanjaya...",11,33
2,ककुदं सर्वयोधानां धाम सर्वधनुष्मताम्। शरतल्पगत...,"start_ the foremost of all warriors, that prow...",8,25
3,यस्य वीर्यं समाश्रित्य द्यूतं पुत्रस्तवाकरोत्।...,"start_ o king, relaying on whose energy, your ...",11,29


In [124]:
df[df['length_san_sentence']>30].shape

(5, 4)

In [125]:
df=df[df['length_san_sentence']<=20]
df=df[df['length_eng_sentence']<=20]

In [126]:
df.shape

(71, 4)

In [127]:
print("maximum length of English Sentence ",max(df['length_eng_sentence']))
print("maximum length of Sanskrit Sentence ",max(df['length_san_sentence']))

maximum length of English Sentence  20
maximum length of Sanskrit Sentence  12


In [128]:
max_length_src=max(df['length_eng_sentence'])
max_length_tar=max(df['length_san_sentence'])

In [130]:
# Fix for DataFrame error - directly use the sorted sets
input_words = sorted(list(all_san_words))
target_words = sorted(list(all_eng_words))
num_encoder_tokens = len(all_san_words)
num_decoder_tokens = len(all_eng_words)
print(num_encoder_tokens, num_decoder_tokens)

6305 4926


In [131]:
num_decoder_tokens += 1 #for zero padding

In [132]:
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])
reverse_input_char_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())

In [136]:
from sklearn.utils import shuffle
df = shuffle(df)
df.head(10)

Unnamed: 0,Sanskrit,English,length_san_sentence,length_eng_sentence
67,य आत्मनो दुष्चरितादशुभं प्राप्नुयानरः। एनसा ते...,start_ the man who suffers evil for his own mi...,9,17
835,दम्भो दर्पोऽभिमानश्च क्रोधः पारुष्यमेव च। अज्ञ...,"start_ hypocrisy, pride, conceit, wrath, ruden...",9,17
903,पृथक्त्वेन तु यज्ज्ञानं नानाभावान् पृथग्विधान्...,start_ raja knowledge is that which sees vario...,11,18
143,तस्य पर्वतसंकाशा व्यरोचन्त महागजाः यन्त्रतोमरत...,start_ his clephants cach loo king like a hill...,7,19
184,अक्षौहिण्याथ पाञ्चाल्यो यज्ञसेनो महामनाः। विरा...,start_ the illustration yajnasena the greatly ...,8,20
293,अनन्तविजयं राजा कुन्तीपुत्रो युधिष्ठिरः। नकुलः...,"start_ the son of kunti, king yudhishthira, an...",7,19
303,श्वशुरान् सुहृदश्चैव सेनयोरुभयोरपि। तान् समीक्...,start_ seeing in the two armies all friends an...,12,19
802,सत्त्वात् संजायते ज्ञानं रजसो लोभ एव च। प्रमाद...,start_ from sattva is produced knowledge from ...,11,17
737,ये त्वक्षरमनिर्देश्यमव्यक्तं पर्युपासते। सर्वत...,"start_ those, however, who worship the imperis...",7,18
880,सद्भावे साधुभावे च सदित्येतत् प्रयुज्यते। प्रश...,"start_ sat denotes existence and goodness, o p...",11,16


In [138]:
from sklearn.model_selection import train_test_split
X, y = df['Sanskrit'], df['English']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,random_state=42)
X_train.shape, X_test.shape

((56,), (15,))

In [139]:
X_train.to_pickle('X_train.pkl')
X_test.to_pickle('X_test.pkl')

In [140]:
import numpy as np
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    ''' Generate a batch of data '''
    while True:
        for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_length_src),dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_length_tar),dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens),dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index[word] # encoder input seq
                for t, word in enumerate(target_text.split()):
                    if t<len(target_text.split())-1:
                        decoder_input_data[i, t] = target_token_index[word] # decoder input seq
                    if t>0:
                        # decoder target sequence (one hot encoded)
                        # does not include the START_ token
                        # Offset by one timestep
                        decoder_target_data[i, t - 1, target_token_index[word]] = 1.
            yield([encoder_input_data, decoder_input_data], decoder_target_data)


In [141]:
latent_dim=300

In [143]:

from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model

# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb =  Embedding(num_encoder_tokens, latent_dim, mask_zero = True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

In [144]:
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [145]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [146]:
model.summary()

In [147]:
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 128
epochs = 100

In [150]:

model.fit(
    generate_batch(X_train, y_train, batch_size=batch_size),
    steps_per_epoch=train_samples//batch_size,
    epochs=epochs,
    validation_data=generate_batch(X_test, y_test, batch_size=batch_size),
    validation_steps=val_samples//batch_size
)

IndexError: index 12 is out of bounds for axis 1 with size 12