In [1]:
#%pip install numpy keras tensorflow pyttsx3 gtts pygame

  # Language Translation using Neural Networks 



In [2]:
#Reading and preprocessing
import numpy as np

In [3]:
f = open('Sanskrit.txt', 'r+', encoding="utf8")
x = f.readlines()

In [4]:
f = open('English.txt', 'r+', encoding="utf8")
y = f.readlines()

In [5]:
x = x[:]

In [6]:
y = y[:]

In [7]:
len(x)

700

### Text Pre-processing

In [8]:
x[0]= x[0].strip('\ufeffMMA')
y[0]= y[0].strip('\ufeffMMA')

In [9]:
import string
exclude = set(string.punctuation)
for i in range(0,len(x)):
    x[i] = x[i].strip('\n')
    x[i] = ''.join(ch for ch in x[i] if ch not in exclude)

In [10]:
for i in range(0,len(y)):
    y[i] = y[i].lower()
    y[i] = y[i].strip('\n')
    y[i] = ''.join(ch for ch in y[i] if ch not in exclude)

### Cleaned text

In [11]:
print("Sanskrit Text:",x[1],"\n")
print("English Text:",y[1])

Sanskrit Text: सञ्जय उवाच ।दृष्ट्वा तु पाण्डवानीकं व्यूढं दुर्योधनस्तदा ।आचार्यमुपसङ्गम्य राजा वचनमब्रवीत् ।। ।। 

English Text: sanjay said on observing the pandava army standing in military formation king duryodhan approached his teacher dronacharya and said the following words


In [12]:
len(x)

700

In [13]:
len(y)

700

In [14]:
english_words = []
for i in range(0,len(y)):
    english_words.append(y[i].split())   

In [15]:
english_words = [j for sub in english_words for j in sub]

In [16]:
print("Number of Unique English words:",len(set(english_words)))

Number of Unique English words: 2861


In [17]:
sans_words = []
for i in range(0,len(x)):
    sans_words.append(x[i].split())  
sans_words = [j for sub in sans_words for j in sub]

In [18]:
print("Number of Unique sans words:",len(set(sans_words)))

Number of Unique sans words: 4047


In [19]:
sansvocab = len(set(sans_words))
engvocab = len(set(english_words))

### Looks like there are more number of unique words in sans.....as expected

In [20]:
length_sans=[]
for i in range(0,len(x)):
    length_sans.append(len(x[i].split()))

In [21]:
length_english=[]
for i in range(0,len(y)):
    length_english.append(len(y[i].split()))

### Average number of words in each sentence

In [22]:
sum(length_english)/len(length_english)

35.20142857142857

In [23]:
sum(length_sans)/len(length_sans)

10.938571428571429

In [24]:
print(max(length_english))
print(max(length_sans))

139
40


In [25]:
import collections
english_words_counter = collections.Counter([word for sentence in y for word in sentence.split()])
sans_words_counter = collections.Counter([word for sentence in x for word in sentence.split()])

### Most common words in both languages

In [26]:
english_words_counter.most_common(10)

[('the', 1879),
 ('of', 1141),
 ('and', 978),
 ('to', 555),
 ('in', 528),
 ('is', 412),
 ('are', 348),
 ('i', 335),
 ('who', 290),
 ('me', 249)]

In [27]:
sans_words_counter.most_common(10)

[('च', 229),
 ('न', 173),
 ('स', 73),
 ('मे', 61),
 ('मां', 48),
 ('ते', 41),
 ('हि', 39),
 ('उवाच', 36),
 ('पार्थ', 36),
 ('कर्म', 35)]

In [28]:
#%pip install --upgrade tensorflow


In [29]:
from keras.preprocessing.text import Tokenizer
def tokenize(x):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x) 
    return tokenizer.texts_to_sequences(x), tokenizer

# Tokenizer:

Now that our corpus is ready we have to represent it in a way that the neural network can understand, So we convert the text representation to number representation. In words based representation each word his assigned a number abd in character based representation each character is assigned a number. I am using a word level model for its simpler complexity

Keras Tokenizer simplifies the representation process for us (This class allows to vectorize a text corpus, by turning each text into either a sequence of integers)


In [30]:
x[0]

'धृतराष्ट्र उवाच धर्मक्षेत्रे कुरुक्षेत्रे समवेता युयुत्सवः मामकाः पाण्डवाश्चैव किमकुर्वत सञ्जय '

In [31]:
z=(tokenize(x))
z[0][1]

[65, 8, 1165, 14, 1166, 1167, 1168, 1169, 200, 1170, 110, 110]

In [32]:
text_sentences = [
    'The quick brown fox jumps over the lazy dog .',
    'By Jove , my quick study of lexicography won a prize .',]
text_tokenized, text_tokenizer = tokenize(text_sentences)
print(text_tokenizer.word_index)
print()
for sample_i, (sent, token_sent) in enumerate(zip(text_sentences, text_tokenized)):
    print('Sequence {} in x'.format(sample_i + 1))
    print('  Input:  {}'.format(sent))
    print('  Output: {}'.format(token_sent))

{'the': 1, 'quick': 2, 'brown': 3, 'fox': 4, 'jumps': 5, 'over': 6, 'lazy': 7, 'dog': 8, 'by': 9, 'jove': 10, 'my': 11, 'study': 12, 'of': 13, 'lexicography': 14, 'won': 15, 'a': 16, 'prize': 17}

Sequence 1 in x
  Input:  The quick brown fox jumps over the lazy dog .
  Output: [1, 2, 3, 4, 5, 6, 1, 7, 8]
Sequence 2 in x
  Input:  By Jove , my quick study of lexicography won a prize .
  Output: [9, 10, 11, 2, 12, 13, 14, 15, 16, 17]


# Padding:

When batching the sequence of word ids together, each sequence needs to be the same length. Since sentences are dynamic in length, we can add padding to the end of the sequences to make them the same length.




In [33]:
from keras.preprocessing.sequence import pad_sequences
def pad(x, length=None):
    return pad_sequences(x, maxlen=length, padding='post')

In [34]:
test_pad = pad(text_tokenized)
print("OUTPUT IS ALWAYS A LENGTH 10 ARRAY....FILLED BY 0s IN THE END")
for sample_i, (token_sent, pad_sent) in enumerate(zip(text_tokenized, test_pad)):
    print('Sequence {} in x'.format(sample_i + 1))
    print('  Input:  {}'.format(np.array(token_sent)))
    print('  Output: {}'.format(pad_sent))

OUTPUT IS ALWAYS A LENGTH 10 ARRAY....FILLED BY 0s IN THE END
Sequence 1 in x
  Input:  [1 2 3 4 5 6 1 7 8]
  Output: [1 2 3 4 5 6 1 7 8 0]
Sequence 2 in x
  Input:  [ 9 10 11  2 12 13 14 15 16 17]
  Output: [ 9 10 11  2 12 13 14 15 16 17]


### Apply all the tested preprocessing functions to our corpus

In [63]:
def preprocess(x, y):
    preprocess_x, x_tk = tokenize(x)
    preprocess_y, y_tk = tokenize(y)

    preprocess_x = pad(preprocess_x)
    preprocess_y = pad(preprocess_y)

    print('shape before: ', preprocess_y.shape)
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)
    print('shape after: ', preprocess_y.shape)
        
    return preprocess_x, preprocess_y, x_tk, y_tk

In [64]:
preprocess_x, x_tk = tokenize(x)
preprocess_x = pad(preprocess_x)
preprocess_x.shape

(700, 40)

In [65]:
preproc_sans_sentences, preproc_english_sentences, sans_tokenizer, english_tokenizer =\
    preprocess(x, y)

shape before:  (700, 139)
shape after:  (700, 139, 1)


## Assigning a number to each word

In [66]:
list(sans_tokenizer.word_index.items())[:5]

[('च', 1), ('न', 2), ('स', 3), ('मे', 4), ('मां', 5)]

In [67]:
list(english_tokenizer.word_index.items())[:5]

[('the', 1), ('of', 2), ('and', 3), ('to', 4), ('in', 5)]

# Logits to text

The neural network will be translating the input to words ids, which isn't the final form we want. We want the sans translation. The function logits_to_text will bridge the gab between the logits from the neural network to the sans translation.



In [73]:
def logits_to_text(logits, tok enizer):

    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'

    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

In [74]:
import os
from keras.models import load_model
import numpy as np

In [75]:
print("sans_sentences shape: ", preproc_english_sentences.shape)
print("english_sentences  shape: ", preproc_sans_sentences.shape)
print('output sequence length: ', preproc_english_sentences.shape[1])

sans_sentences shape:  (700, 139, 1)
english_sentences  shape:  (700, 40)
output sequence length:  139


In [76]:
tmp_x = pad(preproc_sans_sentences, preproc_english_sentences.shape[1])
tmp_x = tmp_x.reshape((-1, preproc_english_sentences.shape[-2], 1))

In [77]:
tmp_x.shape

(700, 139, 1)

# LSTM Embedding Model



In [45]:
from keras.layers import LSTM
from keras.layers import Dense
from keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from keras.layers.embeddings import Embedding
from keras.losses import sparse_categorical_crossentropy

def embed_model(input_shape, output_sequence_length, sans_vocab_size, english_vocab_size, learning_rate=0.1):
    model = Sequential()
    model.add(Embedding(max(sans_vocab_size, english_vocab_size) ,128 , input_length=output_sequence_length))
    model.add(LSTM(128, dropout=0.1, return_sequences=True))
    model.add(Dense(english_vocab_size, activation='softmax'))
    model.summary()
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    
    return model

In [46]:
#reshaping the input for processing embeddings
tmp_x = pad(preproc_sans_sentences, preproc_english_sentences.shape[1])

In [47]:
tmp_x.shape

(700, 139)

In [48]:
embed_rnn_model = embed_model(
    tmp_x.shape,
    preproc_english_sentences.shape[1],
    len(sans_tokenizer.word_index)+1,
    len(english_tokenizer.word_index)+1)
if os.path.exists(os.path.join("model", "Final_LSTM.h5"))== False:
    embedrnn = embed_rnn_model.fit(tmp_x, preproc_english_sentences, batch_size=100, epochs=20, validation_split=0.2)
else:
    embed_rnn_model = load_model(os.path.join("model", "Final_LSTM.h5"))

# embedrnn = embed_rnn_model.fit(tmp_x, preproc_english_sentences, batch_size=50, epochs=50, validation_split=0.3)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 139, 128)          518144    
_________________________________________________________________
lstm (LSTM)                  (None, 139, 128)          131584    
_________________________________________________________________
dense (Dense)                (None, 139, 2862)         369198    
Total params: 1,018,926
Trainable params: 1,018,926
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [50]:
embed_rnn_model.save(os.path.join("model", "Final_LSTM.h5"))

### Validation loss seems to converge much faster

In [51]:
score = embed_rnn_model.evaluate(tmp_x, preproc_english_sentences, verbose=0)
print("Train accurancy: ", score[1])

Train accurancy:  0.7694553136825562


In [52]:
x[167][:]

'यदा यदा हि धर्मस्य ग्लानिर्भवति भारत अभ्युत्थानमधर्मस्य तदात्मानं सृजाम्यहम्  '

In [53]:
print(logits_to_text(embed_rnn_model.predict(tmp_x[:])[167][:], english_tokenizer))

those those is the the the the the the all <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


# Limitting the translated text until a word repeats itself!

In [54]:
def limiten(n):
    inp = logits_to_text(embed_rnn_model.predict(tmp_x[:])[n][:], english_tokenizer)
    inp = inp.split(" ")

    nl = []

    try:
        for i in range(len(inp)+1):
            if inp[i] != inp[i+1] and inp[i] != inp[i+2]:
                # z = inp[i].join(inp[i])
                nl.append(inp[i])
                # print(str(nl[-1]))
            a = ' '.join(nl)
    except:
        IndexError

    return a

In [55]:
p = limiten(167)
print(p)

those is the all


# English Text-to-Speech

In [56]:
import pyttsx3

def enTTS(n):
    engine = pyttsx3.init()
    voices = engine.getProperty('voices')
    engine.setProperty('voice', voices[1].id)

    rate = engine.getProperty('rate')
    engine.setProperty('rate', 150)

    engine.say("English\n")
    engine.say(limiten(n))

    engine.save_to_file(limiten(n), "en_x[{}].mp3".format(n))
    engine.runAndWait()

    return print("File saved as en_x[{}].mp3".format(n))

In [57]:
enTTS(167)

File saved as en_x[167].mp3


# Sanskrit Text-to-Speech

In [58]:
from gtts import gTTS
from pygame import mixer
import time

def saTTS(n):
    txt = x[n]

    obj = gTTS(text=txt, lang='hi', slow=False, )

    obj.save("sa_x[{}].mp3".format(n))    

    mixer.init()
    mixer.music.load("sa_x[{}].mp3".format(n))
    mixer.music.play()
    
    while mixer.music.get_busy():
        time.sleep(1)

    return print("File saved as sa_x[{}].mp3".format(n))

pygame 2.5.2 (SDL 2.28.3, Python 3.6.10)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [59]:
saTTS(167)

File saved as sa_x[167].mp3
