In [1]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import re
import numpy as np
import pandas as pd
import pickle
import numpy.linalg as lin
import unicodedata
import warnings 
from sklearn.cluster import KMeans
stop_words = set(stopwords.words('english'))
warnings.filterwarnings('ignore')

In [2]:
file=open('sentence.pkl','rb')
verse_embeddings_sentence=pickle.load(file)
file.close()
file=open('max.pkl','rb')
verse_embeddings_max=pickle.load(file)
file.close()
file=open('mean.pkl','rb')
verse_embeddings_mean=pickle.load(file)
file.close()

In [3]:
file=open('whole.pkl','rb')
verse_embeddings_whole=pickle.load(file)
file.close()

In [4]:
file=open('fasttext.pkl','rb')
fasttext=pickle.load(file)
file.close()

In [5]:
def remove_special_characters(text):
    regex = re.compile('[^a-zA-Z\s]')
    text_returned = re.sub(regex,' ',text)
    return text_returned

In [6]:
def strip_accents(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

In [7]:
def word_embedding(word):
    word=word.lower()
    if word in fasttext.keys():
        return np.array(fasttext[word],dtype=np.float64)
    return np.array([0.0]*300)

In [8]:
def cosine_similarity(a,b):
    try:
        return a.dot(b)/(lin.norm(a)*lin.norm(b))
    except:
        return -1.0

In [9]:
def ed(a,b):
    return lin.norm(a-b)

In [10]:
def sentence_embedding(sentence):
    sentence=strip_accents(sentence)
    sentence=remove_special_characters(sentence)
    words=word_tokenize(sentence)
    if len(words)>0:
        words=[word  for word in words if word not in stop_words]
        sentence_embedding=[word_embedding(word) for word in words]
        return np.array(list(map(lambda x: sum(x)/len(x), zip(*sentence_embedding))))
    return np.array([0]*300)

In [11]:
data=pd.read_csv('../English.csv')

In [12]:
query='Importance of bhagavad gita'
query_embedding=sentence_embedding(query)
query_embedding_list=query_embedding.reshape(1,-1).astype(np.float64)

# Max Strategy

## Cosine Similarity

In [13]:
def max_cosine(query_embedding,counter=10):
    scoring=[cosine_similarity(query_embedding,i) for i in verse_embeddings_max]
    verse_index=np.argsort(scoring)[-1::-1][:counter]
    return verse_index

In [14]:
verse_index=max_cosine(query_embedding)

In [15]:
data.loc[verse_index]

Unnamed: 0,Chapter,Verse,Sloke,Commentary
583,18,64,सर्वगुह्यतमं भूय: श‍ृणु मे परमं वच: ।इष्टोऽसि ...,The Lord has given Arjuna knowledge that is co...
542,18,13,पञ्चैतानि महाबाहो कारणानि निबोध मे ।सांख्ये कृ...,A question may be raised that since any activi...
370,10,37,वृष्णीनां वासुदेवोऽस्मि पाण्डवानां धनञ्जय: ।मु...,Kṛṣṇa is the original Supreme Personality of G...
529,17,28,अश्रद्धया हुतं दत्तं तपस्तप्‍तं कृतं च यत् ।अस...,Anything done without the transcendental objec...
526,17,23,ॐ तत्सदिति निर्देशो ब्रह्मणस्त्रिविध: स्मृत: ।...,"It has been explained that penance, sacrifice,..."
366,10,33,अक्षराणामकारोऽस्मि द्वन्द्व: सामासिकस्य च ।अहम...,"A-kāra, the first letter of the Sanskrit alpha..."
491,15,19,यो मामेवमसम्मूढो जानाति पुरुषोत्तमम् ।स सर्ववि...,There are many philosophical speculations abou...
73,2,46,यावानर्थ उदपाने सर्वतः सम्प्लुतोदके ।तावान्सर्...,The rituals and sacrifices mentioned in the ka...
99,3,1,अर्जुन उवाचज्यायसी चेत्कर्मणस्ते मता बुद्धिर्ज...,The Supreme Personality of Godhead Śrī Kṛṣṇa h...
432,13,16,बहिरन्तश्च भूतानामचरं चरमेव च ।सूक्ष्मत्वात्तद...,In Vedic literature we understand that Nārāyaṇ...


## Euclidean Distance

In [16]:
def max_ed(query_embedding,counter=10):
    scoring=[ed(query_embedding,i) for i in verse_embeddings_max]
    verse_index=np.argsort(scoring)[:counter]
    return verse_index

In [17]:
verse_index=max_ed(query_embedding)

In [18]:
data.loc[verse_index]

Unnamed: 0,Chapter,Verse,Sloke,Commentary
99,3,1,अर्जुन उवाचज्यायसी चेत्कर्मणस्ते मता बुद्धिर्ज...,The Supreme Personality of Godhead Śrī Kṛṣṇa h...
370,10,37,वृष्णीनां वासुदेवोऽस्मि पाण्डवानां धनञ्जय: ।मु...,Kṛṣṇa is the original Supreme Personality of G...
586,18,67,इदं ते नातपस्काय नाभक्ताय कदाचन ।न चाश‍ुश्रूषव...,Persons who have not undergone the austerities...
3,1,4,अत्र श‍ूरा महेष्वासा भीमार्जुनसमा युधि ।युयुधा...,Even though Dhṛṣṭadyumna was not a very import...
395,11,35,सञ्जय उवाचएतच्छ्रुत्वा वचनं केशवस्यकृताञ्जलिर्...,"As we have already explained, because of the s..."
323,9,20,त्रैविद्या मां सोमपा: पूतपापायज्ञैरिष्ट्वा स्व...,The word trai-vidyāḥ refers to the three Vedas...
319,9,16,अहं क्रतुरहं यज्ञ: स्वधाहमहमौषधम् ।मन्‍त्रोऽहम...,The Vedic sacrifice known as Jyotiṣṭoma is als...
542,18,13,पञ्चैतानि महाबाहो कारणानि निबोध मे ।सांख्ये कृ...,A question may be raised that since any activi...
552,18,23,नियतं सङ्गरहितमरागद्वेषत: कृतम् ।अफलप्रेप्सुना...,"Regulated occupational duties, as prescribed i..."
557,18,29,बुद्धेर्भेदं धृतेश्चैव गुणतस्त्रिविधं श‍ृणु ।प...,"Now after explaining knowledge, the object of ..."


# Mean Pooling

## Cosine Similarity

In [19]:
def mean_cosine(query_embedding,counter=10):
    scoring=[cosine_similarity(query_embedding,i) for i in verse_embeddings_mean]
    verse_index=np.argsort(scoring)[-1::-1][:counter]
    return verse_index

In [20]:
verse_index=mean_cosine(query_embedding)

In [21]:
data.loc[verse_index]

Unnamed: 0,Chapter,Verse,Sloke,Commentary
527,17,24,तस्माद् ॐ इत्युदाहृत्य यज्ञदानतप:क्रिया: ।प्रव...,Oṁ tad viṣṇoḥ paramaṁ padam (Ṛg Veda 1.22.20)....
428,13,5,ऋषिभिर्बहुधा गीतं छन्दोभिर्विविधै: पृथक् ।ब्रह...,"The Supreme Personality of Godhead, Kṛṣṇa, is ..."
142,4,1,श्रीभगवानुवाचइमं विवस्वते योगं प्रोक्तवानहमव्य...,Herein we find the history of the Bhagavad-gīt...
198,5,17,तद्बुद्धयस्तदात्मानस्तन्निष्ठास्तत्परायणा: ।गच...,The Supreme Transcendental Truth is Lord Kṛṣṇa...
530,18,1,सन्न्यासस्य महाबाहो तत्त्वमिच्छामि वेदितुम् ।त...,Actually the Bhagavad-gītā is finished in seve...
0,1,1,धृतराष्ट्र उवाचधर्मक्षेत्रे कुरुक्षेत्रे समवेत...,Bhagavad-gītā is the widely read theistic scie...
368,10,35,बृहत्साम तथा साम्न‍ां गायत्री छन्दसामहम् ।मासा...,It has already been explained by the Lord that...
487,15,15,सर्वस्य चाहं हृदि सन्निविष्टोमत्त: स्मृतिर्ज्ञ...,The Supreme Lord is situated as Paramātmā in e...
73,2,46,यावानर्थ उदपाने सर्वतः सम्प्लुतोदके ।तावान्सर्...,The rituals and sacrifices mentioned in the ka...
347,10,14,सर्वमेतदृतं मन्ये यन्मां वदसि केशव ।न हि ते भग...,Arjuna herein confirms that persons of faithle...


## Euclidean Distance

In [22]:
def mean_ed(query_embedding,counter=10):
    scoring=[ed(query_embedding,i) for i in verse_embeddings_mean]
    verse_index=np.argsort(scoring)[:counter]
    return verse_index

In [23]:
verse_index=mean_ed(query_embedding)

In [24]:
data.loc[verse_index]

Unnamed: 0,Chapter,Verse,Sloke,Commentary
226,6,27,प्रशान्तमनसं ह्येनं योगिनं सुखमुत्तमम् ।उपैति ...,Brahma-bhūta is the state of being free from m...
347,10,14,सर्वमेतदृतं मन्ये यन्मां वदसि केशव ।न हि ते भग...,Arjuna herein confirms that persons of faithle...
0,1,1,धृतराष्ट्र उवाचधर्मक्षेत्रे कुरुक्षेत्रे समवेत...,Bhagavad-gītā is the widely read theistic scie...
198,5,17,तद्बुद्धयस्तदात्मानस्तन्निष्ठास्तत्परायणा: ।गच...,The Supreme Transcendental Truth is Lord Kṛṣṇa...
142,4,1,श्रीभगवानुवाचइमं विवस्वते योगं प्रोक्तवानहमव्य...,Herein we find the history of the Bhagavad-gīt...
581,18,62,तमेव शरणं गच्छ सर्वभावेन भारत ।तत्प्रसादात्परा...,A living entity should therefore surrender unt...
527,17,24,तस्माद् ॐ इत्युदाहृत्य यज्ञदानतप:क्रिया: ।प्रव...,Oṁ tad viṣṇoḥ paramaṁ padam (Ṛg Veda 1.22.20)....
108,3,10,सहयज्ञाः प्रजाः सृष्ट्वा पुरोवाच प्रजापतिः ।अन...,The material creation by the Lord of creatures...
323,9,20,त्रैविद्या मां सोमपा: पूतपापायज्ञैरिष्ट्वा स्व...,The word trai-vidyāḥ refers to the three Vedas...
530,18,1,सन्न्यासस्य महाबाहो तत्त्वमिच्छामि वेदितुम् ।त...,Actually the Bhagavad-gītā is finished in seve...


# Line Matching

## Cosine Similarity

In [25]:
def line_cosine(query_embeddings,counter=10):
    scoring=[]
    for embeddings in verse_embeddings_sentence:
        score=[cosine_similarity(query_embedding,embed) for embed in embeddings]
        scoring.append(max(score))
    verse_index=np.argsort(scoring)[-1::-1][:counter]
    return verse_index

In [26]:
verse_index=line_cosine(query_embedding)
data.loc[verse_index]

Unnamed: 0,Chapter,Verse,Sloke,Commentary
0,1,1,धृतराष्ट्र उवाचधर्मक्षेत्रे कुरुक्षेत्रे समवेत...,Bhagavad-gītā is the widely read theistic scie...
36,2,7,कार्पण्यदोषोपहतस्वभावःपृच्छामि त्वां धर्मसम्मू...,By nature’s own way the complete system of mat...
487,15,15,सर्वस्य चाहं हृदि सन्निविष्टोमत्त: स्मृतिर्ज्ञ...,The Supreme Lord is situated as Paramātmā in e...
346,10,11,तेषामेवानुकम्पार्थमहमज्ञानजं तम: ।नाशयाम्यात्म...,When Lord Caitanya was in Benares promulgating...
595,18,78,यत्र योगेश्वर: कृष्णो यत्र पार्थो धनुर्धर: ।तत...,The Bhagavad-gītā began with an inquiry of Dhṛ...
41,2,12,न त्वेवाहं जातु नासं न त्वं नेमे जनाधिपाः ।न च...,In the Vedas – in the Kaṭha Upaniṣad as well a...
270,7,24,अव्यक्तं व्यक्तिमापन्नं मन्यन्ते मामबुद्धय: ।प...,Those who are worshipers of demigods have been...
68,2,39,एषा तेऽभिहिता सांख्ये बुद्धिर्योगे त्विमां श‍ृ...,"According to the Nirukti, or the Vedic diction..."
490,15,18,यस्मात्क्षरमतीतोऽहमक्षरादपि चोत्तम: ।अतोऽस्मि ...,No one can surpass the Supreme Personality of ...
142,4,1,श्रीभगवानुवाचइमं विवस्वते योगं प्रोक्तवानहमव्य...,Herein we find the history of the Bhagavad-gīt...


## Euclidean Distance

In [27]:
def line_ed(query_embeddings,counter=10):
    scoring=[]
    for embeddings in verse_embeddings_sentence:
        score=[ed(query_embedding,embed) for embed in embeddings]
        scoring.append(max(score))
    verse_index=np.argsort(scoring)[:counter]
    return verse_index

In [28]:
verse_index=line_ed(query_embedding)
data.loc[verse_index]

Unnamed: 0,Chapter,Verse,Sloke,Commentary
395,11,35,सञ्जय उवाचएतच्छ्रुत्वा वचनं केशवस्यकृताञ्जलिर्...,"As we have already explained, because of the s..."
3,1,4,अत्र श‍ूरा महेष्वासा भीमार्जुनसमा युधि ।युयुधा...,Even though Dhṛṣṭadyumna was not a very import...
552,18,23,नियतं सङ्गरहितमरागद्वेषत: कृतम् ।अफलप्रेप्सुना...,"Regulated occupational duties, as prescribed i..."
557,18,29,बुद्धेर्भेदं धृतेश्चैव गुणतस्त्रिविधं श‍ृणु ।प...,"Now after explaining knowledge, the object of ..."
388,11,16,अनेकबाहूदरवक्‍त्रनेत्रंपश्यामि त्वां सर्वतोऽनन...,Kṛṣṇa is the Supreme Personality of Godhead an...
65,2,36,अवाच्यवादांश्च बहून्वदिष्यन्ति तवाहिताः ।निन्द...,Lord Kṛṣṇa was astonished in the beginning at ...
583,18,64,सर्वगुह्यतमं भूय: श‍ृणु मे परमं वच: ।इष्टोऽसि ...,The Lord has given Arjuna knowledge that is co...
222,6,19,यथा दीपो निवातस्थो नेङ्गते सोपमा स्मृता ।योगिन...,"A truly Kṛṣṇa conscious person, always absorbe..."
320,9,17,पिताहमस्य जगतो माता धाता पितामह: ।वेद्यं पवित्...,"The entire cosmic manifestations, moving and n..."
189,5,6,सन्न्यासस्तु महाबाहो दु:खमाप्‍तुमयोगत: ।योगयुक...,"There are two classes of sannyāsīs, or persons..."


# Whole Part

## Cosine Similarity

In [29]:
def whole_cosine(query_embedding,counter=10):
    scoring=[cosine_similarity(query_embedding,i) for i in verse_embeddings_whole]
    verse_index=np.argsort(scoring)[-1::-1][:counter]
    return verse_index

In [30]:
verse_index=whole_cosine(query_embedding)
data.loc[verse_index]

Unnamed: 0,Chapter,Verse,Sloke,Commentary
343,10,8,अहं सर्वस्य प्रभवो मत्त: सर्वं प्रवर्तते ।इति ...,A learned scholar who has studied the Vedas pe...
487,15,15,सर्वस्य चाहं हृदि सन्निविष्टोमत्त: स्मृतिर्ज्ञ...,The Supreme Lord is situated as Paramātmā in e...
530,18,1,सन्न्यासस्य महाबाहो तत्त्वमिच्छामि वेदितुम् ।त...,Actually the Bhagavad-gītā is finished in seve...
428,13,5,ऋषिभिर्बहुधा गीतं छन्दोभिर्विविधै: पृथक् ।ब्रह...,"The Supreme Personality of Godhead, Kṛṣṇa, is ..."
73,2,46,यावानर्थ उदपाने सर्वतः सम्प्लुतोदके ।तावान्सर्...,The rituals and sacrifices mentioned in the ka...
527,17,24,तस्माद् ॐ इत्युदाहृत्य यज्ञदानतप:क्रिया: ।प्रव...,Oṁ tad viṣṇoḥ paramaṁ padam (Ṛg Veda 1.22.20)....
347,10,14,सर्वमेतदृतं मन्ये यन्मां वदसि केशव ।न हि ते भग...,Arjuna herein confirms that persons of faithle...
142,4,1,श्रीभगवानुवाचइमं विवस्वते योगं प्रोक्तवानहमव्य...,Herein we find the history of the Bhagavad-gīt...
253,7,7,मत्त: परतरं नान्यत्किञ्चिदस्ति धनञ्जय ।मयि सर्...,There is a common controversy over whether the...
108,3,10,सहयज्ञाः प्रजाः सृष्ट्वा पुरोवाच प्रजापतिः ।अन...,The material creation by the Lord of creatures...


## Euclidean Distance

In [31]:
def whole_ed(query_embedding,counter=10):
    scoring=[cosine_similarity(query_embedding,i) for i in verse_embeddings_whole]
    verse_index=np.argsort(scoring)[:counter]
    return verse_index

In [32]:
verse_index=whole_ed(query_embedding)
data.loc[verse_index]

Unnamed: 0,Chapter,Verse,Sloke,Commentary
364,10,31,पवन: पवतामस्मि राम: शस्त्रभृतामहम् ।झषाणां मकर...,Of all the aquatics the shark is one of the bi...
538,18,9,कार्यमित्येव यत्कर्म नियतं क्रियतेऽर्जुन ।सङ्ग...,Prescribed duties must be performed with this ...
5,1,9,अन्ये च बहवः श‍ूरा मदर्थे त्यक्तजीविताः ।नानाश...,As far as the others are concerned – like Jaya...
402,11,44,तस्मात्प्रणम्य प्रणिधाय कायंप्रसादये त्वामहमीश...,Kṛṣṇa’s devotees relate to Kṛṣṇa in various re...
222,6,19,यथा दीपो निवातस्थो नेङ्गते सोपमा स्मृता ।योगिन...,"A truly Kṛṣṇa conscious person, always absorbe..."
369,10,36,द्यूतं छलयतामस्मि तेजस्तेजस्विनामहम् ।जयोऽस्मि...,There are many kinds of cheaters all over the ...
16,1,26,तत्रापश्यत्स्थितान्पार्थः पितॄनथ पितामहान्।आचा...,On the battlefield Arjuna could see all kinds ...
541,18,12,अनिष्टमिष्टं मिश्रं च त्रिविधं कर्मण: फलम् ।भव...,A person in Kṛṣṇa consciousness acting in know...
555,18,27,रागी कर्मफलप्रेप्सुर्लुब्धो हिंसात्मकोऽश‍ुचि: ...,A person is too much attached to a certain kin...
204,5,23,शक्न‍ोतीहैव य: सोढुं प्राक्शरीरविमोक्षणात् ।का...,If one wants to make steady progress on the pa...


# Clustering

## Max Pooling

In [33]:
kmeans_max = KMeans(n_clusters=60, random_state=0).fit(verse_embeddings_max)

In [34]:
def kmeans_max_cluster(query_embedding_list):
    label=kmeans_max.predict(query_embedding_list)[0]
    verse_index=[i for i in range(len(data)) if kmeans_max.labels_[i]==label]
    return verse_index

In [35]:
verse_index=kmeans_max_cluster(query_embedding_list)
data.loc[verse_index]

Unnamed: 0,Chapter,Verse,Sloke,Commentary
20,1,31,न च श्रेयोऽनुपश्यामि हत्वा स्वजनमाहवे ।न काङ्क...,Without knowing that one’s self-interest is in...
46,2,17,अविनाशि तु तद्विद्धि येन सर्वमिदं ततम् ।विनाशम...,This verse more clearly explains the real natu...
57,2,28,अव्यक्तादीनि भूतानि व्यक्तमध्यानि भारत ।अव्यक्...,Accepting that there are two classes of philos...
99,3,1,अर्जुन उवाचज्यायसी चेत्कर्मणस्ते मता बुद्धिर्ज...,The Supreme Personality of Godhead Śrī Kṛṣṇa h...
118,3,20,कर्मणैव हि संसिद्धिमास्थिता जनकादयः ।लोकसङ्ग्र...,Kings like Janaka were all self-realized souls...
149,4,8,परित्राणाय साधुनां विनाशाय च दुष्कृताम् ।धर्मस...,"According to Bhagavad-gītā, a sādhu (holy man)..."
153,4,12,काङ्क्षन्तः कर्मणां सिद्धिं यजन्त इह देवता ।क्...,There is a great misconception about the gods ...
171,4,30,सर्वेऽप्येते यज्ञविदो यज्ञक्षपितकल्मषाः ।यज्ञश...,From the foregoing explanation of different ty...
189,5,6,सन्न्यासस्तु महाबाहो दु:खमाप्‍तुमयोगत: ।योगयुक...,"There are two classes of sannyāsīs, or persons..."
218,6,15,युञ्जन्नेवं सदात्मानं योगी नियतमानस: ।शान्तिं ...,The ultimate goal in practicing yoga is now cl...


## Mean Pooling

In [36]:
kmeans_mean = KMeans(n_clusters=60, random_state=0).fit(verse_embeddings_mean)

In [37]:
def kmeans_mean_cluster(query_embedding_list):
    label=kmeans_mean.predict(query_embedding_list)[0]
    verse_index=[i for i in range(len(data)) if kmeans_mean.labels_[i]==label]
    return verse_index

In [38]:
verse_index=kmeans_mean_cluster(query_embedding_list)
data.loc[verse_index]

Unnamed: 0,Chapter,Verse,Sloke,Commentary
226,6,27,प्रशान्तमनसं ह्येनं योगिनं सुखमुत्तमम् ।उपैति ...,Brahma-bhūta is the state of being free from m...


## Whole

In [39]:
kmeans_whole = KMeans(n_clusters=60, random_state=0).fit(verse_embeddings_whole)

In [40]:
def kmeans_whole_cluster(query_embedding_list):
    label=kmeans_whole.predict(query_embedding_list)[0]
    verse_index=[i for i in range(len(data)) if kmeans_whole.labels_[i]==label]
    return verse_index

In [41]:
verse_index=kmeans_whole_cluster(query_embedding_list)
data.loc[verse_index]

Unnamed: 0,Chapter,Verse,Sloke,Commentary
31,2,2,श्री भगवानुवाचकुतस्त्वा कश्मलमिदं विषमे समुपस्...,Kṛṣṇa and the Supreme Personality of Godhead a...
71,2,44,भोगैश्वर्यप्रसक्तानां तयापहृतचेतसाम् ।व्यवसाया...,Samādhi means “fixed mind.” The Vedic dictiona...
108,3,10,सहयज्ञाः प्रजाः सृष्ट्वा पुरोवाच प्रजापतिः ।अन...,The material creation by the Lord of creatures...
198,5,17,तद्बुद्धयस्तदात्मानस्तन्निष्ठास्तत्परायणा: ।गच...,The Supreme Transcendental Truth is Lord Kṛṣṇa...
203,5,22,ये हि संस्पर्शजा भोगा दु:खयोनय एव ते ।आद्यन्तव...,Material sense pleasures are due to the contac...
218,6,15,युञ्जन्नेवं सदात्मानं योगी नियतमानस: ।शान्तिं ...,The ultimate goal in practicing yoga is now cl...
226,6,27,प्रशान्तमनसं ह्येनं योगिनं सुखमुत्तमम् ।उपैति ...,Brahma-bhūta is the state of being free from m...
246,6,47,योगिनामपि सर्वेषां मद्ग‍तेनान्तरात्मना ।श्रद्ध...,The word bhajate is significant here. Bhajate ...
249,7,3,मनुष्याणां सहस्रेषु कश्चिद्यतति सिद्धये ।यतताम...,"There are various grades of men, and out of ma..."
253,7,7,मत्त: परतरं नान्यत्किञ्चिदस्ति धनञ्जय ।मयि सर्...,There is a common controversy over whether the...
