In [91]:
#import important libraries
from nltk import sent_tokenize, word_tokenize
from gensim.models import Word2Vec 
from numpy.linalg import norm
from numpy import dot
import string
import pandas as pd
import numpy as np
import gensim

In [92]:
# Load the pre-processed training corpus. The context manager closes the file
# handle as soon as the text has been read (it was previously left open).
with open('data_preprocessed.txt','r') as sample:
    s = sample.read()

# Collapse line breaks so the sentence tokenizer sees one continuous text.
f = s.replace('\n', ' ')

In [93]:
# Prepare the training data for the Word2Vec model: split the corpus into
# sentences, then split every sentence into its word tokens, giving one list
# of tokens per sentence.
data = [
    [token for token in word_tokenize(sentence)]
    for sentence in sent_tokenize(f)
]

In [94]:
# Train the Word2Vec model on the tokenized sentences: 100-dimensional
# vectors, a context window of 3, and min_count=1 so that no token is dropped
# for being rare.
# NOTE(review): `size` is the gensim 3.x keyword for the vector dimensionality;
# it is named `vector_size` in gensim 4 - adjust if the environment is upgraded.
model = Word2Vec(sentences=data, size=100, window=3, min_count=1)

In [95]:
# Sanity check: nearest neighbours of a sample word. The lookup goes through
# the model's KeyedVectors (`model.wv`); calling `similar_by_word` directly on
# the model is deprecated and emits a warning.
model.wv.similar_by_word('consequence')

  """Entry point for launching an IPython kernel.


[('fragment', 0.7956534624099731),
 ('motherthe', 0.7932881712913513),
 ('wasp', 0.7788851857185364),
 ('gust', 0.7763956785202026),
 ('sample', 0.7721961736679077),
 ('box', 0.7672709226608276),
 ('semblance', 0.7669326066970825),
 ('incompatibility', 0.765509843826294),
 ('glimpse', 0.764214038848877),
 ('reverend', 0.7615311741828918)]

In [96]:
def word_count(list_of_words):
    """Count how often each word occurs in a verse/sentence.

    Args:
        list_of_words: a pre-processed list of words in a verse/sentence.

    Returns:
        dict mapping every distinct word to its number of occurrences, with
        keys in order of first appearance.
    """
    frequencies = {}

    for token in list_of_words:
        # start from 0 for unseen words, then add this occurrence
        frequencies[token] = frequencies.get(token, 0) + 1

    return frequencies

In [97]:
def senemb(alpha, verse_tok, model, dim):
    """Compute a composite (frequency-weighted average) sentence/verse embedding.

    Every token contributes its word vector scaled by
    ``alpha / (alpha + p)``, where ``p`` is the token's relative frequency
    inside ``verse_tok``. The weighted vectors are summed and divided by the
    number of tokens.

    Args:
        alpha: smoothing constant of the weighting formula.
        verse_tok: the verse/sentence as a tokenized, pre-processed list of words.
        model: trained model whose ``model.wv[word]`` yields the word vector.
        dim: dimensionality of the word vectors (length of the result).

    Returns:
        numpy array of length ``dim``. An empty ``verse_tok`` yields the zero
        vector.

    Raises:
        KeyError: propagated from ``model.wv[word]`` when a token has no vector.
    """
    # create an empty vector
    sentEmb = np.zeros(dim)

    # Normalise by the length of the token list that was passed in, not by an
    # unrelated module-level variable.
    n_tokens = len(verse_tok)
    if n_tokens == 0:
        # nothing to average; avoid dividing by zero
        return sentEmb

    v_count = word_count(verse_tok)

    for word in verse_tok:
        prob = alpha / (alpha + v_count[word] / n_tokens)
        sentEmb = sentEmb + model.wv[word] * prob

    return sentEmb / n_tokens

In [107]:
# Sentence embeddings for the verses.

verses = pd.read_csv('verses.csv', usecols=['#verse', 'Translation'])
data = verses['Translation'].to_list()

# Translation table that deletes every punctuation character; it is the same
# for all verses, so build it once.
punctuation_table = str.maketrans('','',string.punctuation)

# one embedding vector per verse, in the same order as `verses`
verse_vectors = []

for verse in data:
    # lower-case, strip punctuation, then tokenize the verse
    verse_clean = verse.lower().translate(punctuation_table)
    verse_tok = word_tokenize(verse_clean)
    verse_vectors.append(senemb(1, verse_tok, model, 100))

27
29
23
22
15
25
25
23
34
24
27
32
19
22
28
57
26
49
37
18
28
25
29
23
31
24
31
30
109
48
39
23
29
41
29
23
30
21
26
21
39
30
29
47
37
47
41
19
24
33
27
35
47
25
40
21
24
26
45
24
25
24
25
20
28
32
25
32
23
31
22
29
17
30
20
33
28
44
26
30
49
36
41
36
39
18
31
29
49
29
32
35
47
34
28
26
30
28
24
24
31
28
26
37
28
16
28
42
36
37
27
20
41
20
32
22
30
22
42
49
23
42
34
27
30
41
32
37
26
28
21
40
21
35
28
52
29
34
37
28
26
31
27
37
39
21
37
34
27
28
30
31
39
37
41
37
26
29
31
26
24
32
38
20
27
44
34
25
32
25
28
43
25
39
35
26
41
23
34
35
34
58
31
25
30
33
34
41
34
24
44
31
33
41
28
35
32
35
42
39
33
36
61
27
20
39
33
39
25
27
32
27
31
33
35
43
36
33
39
31
55
44
46
38
38
26
28
30
34
51
32
40
77
52
32
28
23
30
27
108
44
35
27
42
30
26
22
27
28
25
24
29
30
39
36
28
40
32
32
32
35
33
26
46
35
25
24
19
27
35
22
35
31
26
29
45
23
29
31
35
34
43
34
26
32
26
34
43
30
43
24
35
31
46
34
31
42
63
25
22
45
30
64
43
43
40
38
25
29
39
24
28
24
38
32
31
29
51
38
38
19
51
39
43
31
22
47
19
31
27
22
36
25

In [99]:
# Sentence embedding vectors for the queries.

# load the raw query strings
df = pd.read_csv('queries_sem.csv')
queries_ = df['Verse'].to_list()

# pre-process: lower-case, delete punctuation, split on whitespace
query_punct_table = str.maketrans('','',string.punctuation)
queries = [raw.lower().translate(query_punct_table).split() for raw in queries_]

# one embedding vector per query, in the same order as `queries_`
query_vectors = []

for query in queries:
    query_vectors.append(senemb(1, query, model, 100))

In [100]:
# Display the first raw (un-preprocessed) query as a quick sanity check.
queries_[0]

' can I get rid of my sins?'

In [101]:
# queries_scores: cosine similarity of each query with all the verses,
# stored as a 2-d list indexed [query][verse].
queries_scores = []

# A verse's norm does not depend on the query, so compute each one only once.
verse_norms = [norm(v_vec) for v_vec in verse_vectors]

# looping over all the queries
for q_vec in query_vectors:
    q_norm = norm(q_vec)
    # scores of this particular query against every verse
    query_scores = []
    for v_vec, v_norm in zip(verse_vectors, verse_norms):
        denominator = q_norm * v_norm
        # A zero vector (e.g. an empty query or verse) has no direction; score
        # it 0.0 instead of producing NaN from 0/0, which would corrupt the
        # ranking done on these scores.
        score = dot(q_vec, v_vec) / denominator if denominator else 0.0
        query_scores.append(score)

    queries_scores.append(query_scores)

In [102]:
# Now let's move towards getting the answers for the queries: for every query,
# write its three best-matching verses to a report file and remember their
# verse numbers.

# top3_verseNo: to store verse nos. of top three verses for each query
# to be put as results in mAP fun.
top3_verseNo = []

# The context manager guarantees the report is flushed and closed.
with open('best3_word2vec.txt','w+') as f:
    for query_idx, scores in enumerate(queries_scores):
        # Rank verse positions by score, highest first. Ranking positions
        # (rather than looking each score up again with list.index) keeps tied
        # scores from reporting the same verse several times; the sort is
        # stable, so ties keep their original verse order.
        ranked = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
        indices = ranked[:3]  # positions of the top 3 verses

        # verse numbers of the top 3 verses, without the trailing colon
        temp = [verses.iloc[verse_idx]['#verse'].replace(":","") for verse_idx in indices]

        # printing the results in a file
        f.write('For query:\n' + '""' + queries_[query_idx] + '""' + '\n\n')
        f.write('The Best 3 matching verses are:\n')
        for rank, verse_idx in enumerate(indices, start=1):
            row = verses.iloc[verse_idx]
            f.write(str(rank) + ') ' + row['#verse'] + row['Translation'] + '\n\n')

        top3_verseNo.append(temp)

In [103]:
# Show the second and third verse rows (positions 1 and 2) as plain text.
sample_rows = verses.iloc[[1, 2]]
print(sample_rows)

  #verse                                        Translation
1   1.2:  Sanjaya said: O King, after looking over the a...
2   1.3:  O my teacher, behold the great army of the son...
