In [1]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import re
import numpy as np
import pandas as pd
import pickle
import numpy.linalg as lin
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors
import unicodedata
import warnings 
from sklearn.cluster import KMeans
stop_words = set(stopwords.words('english'))
warnings.filterwarnings('ignore')

In [2]:
file=open('sentence.pkl','rb')
verse_embeddings_sentence=pickle.load(file)
file.close()
file=open('max.pkl','rb')
verse_embeddings_max=pickle.load(file)
file.close()
file=open('mean.pkl','rb')
verse_embeddings_mean=pickle.load(file)
file.close()

In [3]:
file=open('whole.pkl','rb')
verse_embeddings_whole=pickle.load(file)
file.close()

In [4]:
w2v = KeyedVectors.load_word2vec_format('w2v.bin', binary=True)

In [5]:
def remove_special_characters(text):
    regex = re.compile('[^a-zA-Z\s]')
    text_returned = re.sub(regex,' ',text)
    return text_returned

In [6]:
def strip_accents(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

In [7]:
def word_embedding(word):
    word=word.lower()
    try:
        return w2v.get_vector(word)
    except:
        return np.array([0.0]*300)

In [8]:
def cosine_similarity(a,b):
    try:
        return a.dot(b)/(lin.norm(a)*lin.norm(b))
    except:
        return -1.0

In [9]:
def ed(a,b):
    return lin.norm(a-b)

In [10]:
def sentence_embedding(sentence):
    sentence=strip_accents(sentence)
    sentence=remove_special_characters(sentence)
    words=word_tokenize(sentence)
    if len(words)>0:
        words=[word  for word in words if word not in stop_words]
        sentence_embedding=[word_embedding(word) for word in words]
        return np.array(list(map(lambda x: sum(x)/len(x), zip(*sentence_embedding))))
    return np.array([0]*300)

In [11]:
data=pd.read_csv('../English.csv')

In [12]:
query='Importance of bhagavad gita'
query_embedding=sentence_embedding(query)
query_embedding_list=query_embedding.reshape(1,-1).astype(np.float64)

# Max Strategy

## Cosine Similarity

In [13]:
def max_cosine(query_embedding,counter=10):
    scoring=[cosine_similarity(query_embedding,i) for i in verse_embeddings_max]
    verse_index=np.argsort(scoring)[-1::-1][:counter]
    return verse_index

In [14]:
verse_index=max_cosine(query_embedding)

In [15]:
data.loc[verse_index]

Unnamed: 0,Chapter,Verse,Sloke,Commentary
526,17,23,ॐ तत्सदिति निर्देशो ब्रह्मणस्त्रिविध: स्मृत: ।...,"It has been explained that penance, sacrifice,..."
583,18,64,सर्वगुह्यतमं भूय: श‍ृणु मे परमं वच: ।इष्टोऽसि ...,The Lord has given Arjuna knowledge that is co...
349,10,16,वक्तुमर्हस्यशेषेण दिव्या ह्यात्मविभूतय: ।याभिर...,In this verse it appears that Arjuna is alread...
576,18,57,चेतसा सर्वकर्माणि मयि सन्न्यस्य मत्पर: ।बुद्धि...,"When one acts in Kṛṣṇa consciousness, he does ..."
552,18,23,नियतं सङ्गरहितमरागद्वेषत: कृतम् ।अफलप्रेप्सुना...,"Regulated occupational duties, as prescribed i..."
101,3,3,श्रीभगवानुवाचलोकेऽस्मिन्द्विविधा निष्ठा पुरा प...,"In the Second Chapter, verse 39, the Lord expl..."
510,17,2,श्रीभगवानुवाचत्रिविधा भवति श्रद्धा देहिनां सा ...,Those who know the rules and regulations of th...
549,18,20,सर्वभूतेषु येनैकं भावमव्ययमीक्षते ।अविभक्तं वि...,A person who sees one spirit soul in every liv...
490,15,18,यस्मात्क्षरमतीतोऽहमक्षरादपि चोत्तम: ।अतोऽस्मि ...,No one can surpass the Supreme Personality of ...
97,2,71,विहाय कामान्यः सर्वान्पुमांश्चरति निःस्पृहः ।न...,To become desireless means not to desire anyth...


## Euclidean Distance

In [16]:
def max_ed(query_embedding,counter=10):
    scoring=[ed(query_embedding,i) for i in verse_embeddings_max]
    verse_index=np.argsort(scoring)[:counter]
    return verse_index

In [17]:
verse_index=max_ed(query_embedding)

In [18]:
data.loc[verse_index]

Unnamed: 0,Chapter,Verse,Sloke,Commentary
3,1,4,अत्र श‍ूरा महेष्वासा भीमार्जुनसमा युधि ।युयुधा...,Even though Dhṛṣṭadyumna was not a very import...
395,11,35,सञ्जय उवाचएतच्छ्रुत्वा वचनं केशवस्यकृताञ्जलिर्...,"As we have already explained, because of the s..."
349,10,16,वक्तुमर्हस्यशेषेण दिव्या ह्यात्मविभूतय: ।याभिर...,In this verse it appears that Arjuna is alread...
552,18,23,नियतं सङ्गरहितमरागद्वेषत: कृतम् ।अफलप्रेप्सुना...,"Regulated occupational duties, as prescribed i..."
557,18,29,बुद्धेर्भेदं धृतेश्चैव गुणतस्त्रिविधं श‍ृणु ।प...,"Now after explaining knowledge, the object of ..."
561,18,34,यया तु धर्मकामार्थान्धृत्या धारयतेऽर्जुन ।प्रस...,Any person who is always desirous of fruitive ...
320,9,17,पिताहमस्य जगतो माता धाता पितामह: ।वेद्यं पवित्...,"The entire cosmic manifestations, moving and n..."
5,1,9,अन्ये च बहवः श‍ूरा मदर्थे त्यक्तजीविताः ।नानाश...,As far as the others are concerned – like Jaya...
452,14,1,श्रीभगवानुवाचपरं भूय: प्रवक्ष्यामि ज्ञानानां ज...,From the Seventh Chapter to the end of the Twe...
489,15,17,उत्तम: पुरुषस्त्वन्य: परमात्मेत्युदाहृत: ।यो ल...,The idea of this verse is very nicely expresse...


# Mean Pooling

## Cosine Similarity

In [19]:
def mean_cosine(query_embedding,counter=10):
    scoring=[cosine_similarity(query_embedding,i) for i in verse_embeddings_mean]
    verse_index=np.argsort(scoring)[-1::-1][:counter]
    return verse_index

In [20]:
verse_index=mean_cosine(query_embedding)

In [21]:
data.loc[verse_index]

Unnamed: 0,Chapter,Verse,Sloke,Commentary
583,18,64,सर्वगुह्यतमं भूय: श‍ृणु मे परमं वच: ।इष्टोऽसि ...,The Lord has given Arjuna knowledge that is co...
101,3,3,श्रीभगवानुवाचलोकेऽस्मिन्द्विविधा निष्ठा पुरा प...,"In the Second Chapter, verse 39, the Lord expl..."
186,5,3,ज्ञेय: स नित्यसन्न्यासी यो न द्वेष्टि न काङ्‍क...,One who is fully in Kṛṣṇa consciousness is alw...
174,4,33,श्रेयान्द्रव्यमयाद्यज्ञाज्ज्ञानयज्ञः परन्तप ।स...,The purpose of all sacrifices is to arrive at ...
554,18,26,मुक्तसङ्गोऽनहंवादी धृत्युत्साहसमन्वित: ।सिद्ध्...,A person in Kṛṣṇa consciousness is always tran...
281,8,5,अन्तकाले च मामेव स्मरन्मुक्त्वा कलेवरम् ।य: प्...,In this verse the importance of Kṛṣṇa consciou...
529,17,28,अश्रद्धया हुतं दत्तं तपस्तप्‍तं कृतं च यत् ।अस...,Anything done without the transcendental objec...
234,6,35,श्रीभगवानुवाचअसंशयं महाबाहो मनो दुर्निग्रहं चल...,The difficulty of controlling the obstinate mi...
472,14,21,अर्जुन उवाचकैर्लिङ्गैस्त्रीन्गुणानेतानतीतो भवत...,"In this verse, Arjuna’s questions are very app..."
487,15,15,सर्वस्य चाहं हृदि सन्निविष्टोमत्त: स्मृतिर्ज्ञ...,The Supreme Lord is situated as Paramātmā in e...


## Euclidean Distance

In [22]:
def mean_ed(query_embedding,counter=10):
    scoring=[ed(query_embedding,i) for i in verse_embeddings_mean]
    verse_index=np.argsort(scoring)[:counter]
    return verse_index

In [23]:
verse_index=mean_ed(query_embedding)

In [24]:
data.loc[verse_index]

Unnamed: 0,Chapter,Verse,Sloke,Commentary
583,18,64,सर्वगुह्यतमं भूय: श‍ृणु मे परमं वच: ।इष्टोऽसि ...,The Lord has given Arjuna knowledge that is co...
320,9,17,पिताहमस्य जगतो माता धाता पितामह: ।वेद्यं पवित्...,"The entire cosmic manifestations, moving and n..."
70,2,41,व्यवसायात्मिका बुद्धिरेकेह कुरूनन्दन ।बहुशाखा ...,A strong faith that by Kṛṣṇa consciousness one...
370,10,37,वृष्णीनां वासुदेवोऽस्मि पाण्डवानां धनञ्जय: ।मु...,Kṛṣṇa is the original Supreme Personality of G...
413,11,55,मत्कर्मकृन्मत्परमो मद्भ‍क्त: सङ्गवर्जित: ।निर्...,Anyone who wants to approach the supreme of al...
230,6,31,सर्वभूतस्थितं यो मां भजत्येकत्वमास्थित: ।सर्वथ...,A yogī who is practicing meditation on the Sup...
487,15,15,सर्वस्य चाहं हृदि सन्निविष्टोमत्त: स्मृतिर्ज्ञ...,The Supreme Lord is situated as Paramātmā in e...
37,2,8,न हि प्रपश्यामि ममापनुद्याद् -यच्छोकमुच्छोषणमि...,Although Arjuna was putting forward so many ar...
0,1,1,धृतराष्ट्र उवाचधर्मक्षेत्रे कुरुक्षेत्रे समवेत...,Bhagavad-gītā is the widely read theistic scie...
542,18,13,पञ्चैतानि महाबाहो कारणानि निबोध मे ।सांख्ये कृ...,A question may be raised that since any activi...


# Line Matching

## Cosine Similarity

In [25]:
def line_cosine(query_embeddings,counter=10):
    scoring=[]
    for embeddings in verse_embeddings_sentence:
        score=[cosine_similarity(query_embedding,embed) for embed in embeddings]
        scoring.append(max(score))
    verse_index=np.argsort(scoring)[-1::-1][:counter]
    return verse_index

In [26]:
verse_index=line_cosine(query_embedding)
data.loc[verse_index]

Unnamed: 0,Chapter,Verse,Sloke,Commentary
281,8,5,अन्तकाले च मामेव स्मरन्मुक्त्वा कलेवरम् ।य: प्...,In this verse the importance of Kṛṣṇa consciou...
0,1,1,धृतराष्ट्र उवाचधर्मक्षेत्रे कुरुक्षेत्रे समवेत...,Bhagavad-gītā is the widely read theistic scie...
284,8,8,अभ्यासयोगयुक्तेन चेतसा नान्यगामिना ।परमं पुरुष...,In this verse Lord Kṛṣṇa stresses the importan...
184,5,1,अर्जुन उवाचसन्न्यासं कर्मणां कृष्ण पुनर्योगं च...,"In this Fifth Chapter of the Bhagavad-gītā, th..."
306,9,3,अश्रद्दधाना: पुरुषा धर्मस्यास्य परन्तप ।अप्राप...,The faithless cannot accomplish this process o...
305,9,2,राजविद्या राजगुह्यं पवित्रमिदमुत्तमम् ।प्रत्यक...,This chapter of Bhagavad-gītā is called the ki...
526,17,23,ॐ तत्सदिति निर्देशो ब्रह्मणस्त्रिविध: स्मृत: ।...,"It has been explained that penance, sacrifice,..."
40,2,11,श्री भगवानुवाचअशोच्यनन्वशोचस्त्वं प्रज्ञावादां...,The Lord at once took the position of the teac...
472,14,21,अर्जुन उवाचकैर्लिङ्गैस्त्रीन्गुणानेतानतीतो भवत...,"In this verse, Arjuna’s questions are very app..."
475,15,1,श्रीभगवानुवाचऊर्ध्वमूलमध:शाखमश्वत्थं प्राहुरव्...,After the discussion of the importance of bhak...


## Euclidean Distance

In [27]:
def line_ed(query_embeddings,counter=10):
    scoring=[]
    for embeddings in verse_embeddings_sentence:
        score=[ed(query_embedding,embed) for embed in embeddings]
        scoring.append(max(score))
    verse_index=np.argsort(scoring)[:counter]
    return verse_index

In [28]:
verse_index=line_ed(query_embedding)
data.loc[verse_index]

Unnamed: 0,Chapter,Verse,Sloke,Commentary
395,11,35,सञ्जय उवाचएतच्छ्रुत्वा वचनं केशवस्यकृताञ्जलिर्...,"As we have already explained, because of the s..."
552,18,23,नियतं सङ्गरहितमरागद्वेषत: कृतम् ।अफलप्रेप्सुना...,"Regulated occupational duties, as prescribed i..."
557,18,29,बुद्धेर्भेदं धृतेश्चैव गुणतस्त्रिविधं श‍ृणु ।प...,"Now after explaining knowledge, the object of ..."
3,1,4,अत्र श‍ूरा महेष्वासा भीमार्जुनसमा युधि ।युयुधा...,Even though Dhṛṣṭadyumna was not a very import...
561,18,34,यया तु धर्मकामार्थान्धृत्या धारयतेऽर्जुन ।प्रस...,Any person who is always desirous of fruitive ...
5,1,9,अन्ये च बहवः श‍ूरा मदर्थे त्यक्तजीविताः ।नानाश...,As far as the others are concerned – like Jaya...
99,3,1,अर्जुन उवाचज्यायसी चेत्कर्मणस्ते मता बुद्धिर्ज...,The Supreme Personality of Godhead Śrī Kṛṣṇa h...
222,6,19,यथा दीपो निवातस्थो नेङ्गते सोपमा स्मृता ।योगिन...,"A truly Kṛṣṇa conscious person, always absorbe..."
399,11,39,वायुर्यमोऽग्न‍िर्वरुण: शशाङ्क:प्रजापतिस्त्वं प...,The Lord is addressed here as air because the ...
349,10,16,वक्तुमर्हस्यशेषेण दिव्या ह्यात्मविभूतय: ।याभिर...,In this verse it appears that Arjuna is alread...


# Whole Part

## Cosine Similarity

In [29]:
def whole_cosine(query_embedding,counter=10):
    scoring=[cosine_similarity(query_embedding,i) for i in verse_embeddings_whole]
    verse_index=np.argsort(scoring)[-1::-1][:counter]
    return verse_index

In [30]:
verse_index=whole_cosine(query_embedding)
data.loc[verse_index]

Unnamed: 0,Chapter,Verse,Sloke,Commentary
583,18,64,सर्वगुह्यतमं भूय: श‍ृणु मे परमं वच: ।इष्टोऽसि ...,The Lord has given Arjuna knowledge that is co...
174,4,33,श्रेयान्द्रव्यमयाद्यज्ञाज्ज्ञानयज्ञः परन्तप ।स...,The purpose of all sacrifices is to arrive at ...
186,5,3,ज्ञेय: स नित्यसन्न्यासी यो न द्वेष्टि न काङ्‍क...,One who is fully in Kṛṣṇa consciousness is alw...
554,18,26,मुक्तसङ्गोऽनहंवादी धृत्युत्साहसमन्वित: ।सिद्ध्...,A person in Kṛṣṇa consciousness is always tran...
101,3,3,श्रीभगवानुवाचलोकेऽस्मिन्द्विविधा निष्ठा पुरा प...,"In the Second Chapter, verse 39, the Lord expl..."
529,17,28,अश्रद्धया हुतं दत्तं तपस्तप्‍तं कृतं च यत् ।अस...,Anything done without the transcendental objec...
179,4,38,न हि ज्ञानेन सदृशं पवित्रमिह विद्यते ।तत्स्वयं...,"When we speak of transcendental knowledge, we ..."
114,3,16,एवं प्रवर्तितं चक्रं नानुवर्तयतीह यः ।अघायुरिन...,The mammonist philosophy of “work very hard an...
234,6,35,श्रीभगवानुवाचअसंशयं महाबाहो मनो दुर्निग्रहं चल...,The difficulty of controlling the obstinate mi...
198,5,17,तद्बुद्धयस्तदात्मानस्तन्निष्ठास्तत्परायणा: ।गच...,The Supreme Transcendental Truth is Lord Kṛṣṇa...


## Euclidean Distance

In [31]:
def whole_ed(query_embedding,counter=10):
    scoring=[cosine_similarity(query_embedding,i) for i in verse_embeddings_whole]
    verse_index=np.argsort(scoring)[:counter]
    return verse_index

In [32]:
verse_index=whole_ed(query_embedding)
data.loc[verse_index]

Unnamed: 0,Chapter,Verse,Sloke,Commentary
4,1,8,भवान्भीष्मश्च कर्णश्च कृपश्च समितिंजयः ।अश्वत्...,Duryodhana mentions the exceptional heroes in ...
364,10,31,पवन: पवतामस्मि राम: शस्त्रभृतामहम् ।झषाणां मकर...,Of all the aquatics the shark is one of the bi...
66,2,37,हतो वा प्राप्स्यसि स्वर्ग जित्वा वा भोक्ष्यसे ...,Even though there was no certainty of victory ...
33,2,4,अर्जुन उवाचकथं भीष्ममहं संख्ये द्रोणं च मधुसूद...,Respectable superiors like Bhīṣma the grandfat...
402,11,44,तस्मात्प्रणम्य प्रणिधाय कायंप्रसादये त्वामहमीश...,Kṛṣṇa’s devotees relate to Kṛṣṇa in various re...
16,1,26,तत्रापश्यत्स्थितान्पार्थः पितॄनथ पितामहान्।आचा...,On the battlefield Arjuna could see all kinds ...
388,11,16,अनेकबाहूदरवक्‍त्रनेत्रंपश्यामि त्वां सर्वतोऽनन...,Kṛṣṇa is the Supreme Personality of Godhead an...
357,10,24,पुरोधसां च मुख्यं मां विद्धि पार्थ बृहस्पतिम् ...,Indra is the chief demigod of the heavenly pla...
222,6,19,यथा दीपो निवातस्थो नेङ्गते सोपमा स्मृता ।योगिन...,"A truly Kṛṣṇa conscious person, always absorbe..."
449,13,33,यथा सर्वगतं सौक्ष्म्यादाकाशं नोपलिप्यते ।सर्वत...,"The air enters into water, mud, stool and what..."


# Clustering

## Max Pooling

In [33]:
kmeans_max = KMeans(n_clusters=60, random_state=0).fit(verse_embeddings_max)

In [34]:
def kmeans_max_cluster(query_embedding_list):
    label=kmeans_max.predict(query_embedding_list)[0]
    verse_index=[i for i in range(len(data)) if kmeans_max.labels_[i]==label]
    return verse_index

In [35]:
verse_index=kmeans_max_cluster(query_embedding_list)
data.loc[verse_index]

Unnamed: 0,Chapter,Verse,Sloke,Commentary
6,1,10,अपर्याप्त‍ं तदस्माकं बलं भीष्माभिरक्षितम् ।पर्...,Herein an estimation of comparative strength i...
37,2,8,न हि प्रपश्यामि ममापनुद्याद् -यच्छोकमुच्छोषणमि...,Although Arjuna was putting forward so many ar...
72,2,45,त्रैगुण्यविषया वेदा निस्त्रैगुण्यो भवार्जुन ।न...,All material activities involve actions and re...
109,3,11,देवान्भावयतातेन ते देवा भावयन्तु वः ।परस्परं भ...,The demigods are empowered administrators of m...
134,3,36,अर्जुन उवाचअथ केन प्रयुक्तोऽयं पापं चरति पूरुष...,"A living entity, as part and parcel of the Sup..."
153,4,12,काङ्क्षन्तः कर्मणां सिद्धिं यजन्त इह देवता ।क्...,There is a great misconception about the gods ...
168,4,27,सर्वाणीन्द्रियकर्माणि प्राणकर्माणि चापरे ।आत्म...,The yoga system conceived by Patañjali is refe...
186,5,3,ज्ञेय: स नित्यसन्न्यासी यो न द्वेष्टि न काङ्‍क...,One who is fully in Kṛṣṇa consciousness is alw...
189,5,6,सन्न्यासस्तु महाबाहो दु:खमाप्‍तुमयोगत: ।योगयुक...,"There are two classes of sannyāsīs, or persons..."
195,5,14,न कर्तृत्वं न कर्माणि लोकस्य सृजति प्रभु: ।न क...,"The living entity, as will be explained in the..."


## Mean Pooling

In [36]:
kmeans_mean = KMeans(n_clusters=60, random_state=0).fit(verse_embeddings_mean)

In [37]:
def kmeans_mean_cluster(query_embedding_list):
    label=kmeans_mean.predict(query_embedding_list)[0]
    verse_index=[i for i in range(len(data)) if kmeans_mean.labels_[i]==label]
    return verse_index

In [38]:
verse_index=kmeans_mean_cluster(query_embedding_list)
data.loc[verse_index]

Unnamed: 0,Chapter,Verse,Sloke,Commentary
370,10,37,वृष्णीनां वासुदेवोऽस्मि पाण्डवानां धनञ्जय: ।मु...,Kṛṣṇa is the original Supreme Personality of G...


## Whole

In [39]:
kmeans_whole = KMeans(n_clusters=60, random_state=0).fit(verse_embeddings_whole)

In [40]:
def kmeans_whole_cluster(query_embedding_list):
    label=kmeans_whole.predict(query_embedding_list)[0]
    verse_index=[i for i in range(len(data)) if kmeans_whole.labels_[i]==label]
    return verse_index

In [41]:
verse_index=kmeans_whole_cluster(query_embedding_list)
data.loc[verse_index]

Unnamed: 0,Chapter,Verse,Sloke,Commentary
365,10,32,सर्गाणामादिरन्तश्च मध्यं चैवाहमर्जुन ।अध्यात्म...,"Among the created manifestations, the first is..."
370,10,37,वृष्णीनां वासुदेवोऽस्मि पाण्डवानां धनञ्जय: ।मु...,Kṛṣṇa is the original Supreme Personality of G...
373,10,40,नान्तोऽस्ति मम दिव्यानां विभूतीनां परन्तप ।एष ...,"As stated in the Vedic literature, although th..."
