In [108]:
#import important libraries
from nltk import sent_tokenize, word_tokenize
from gensim.models import Word2Vec 
from numpy.linalg import norm
from numpy import dot
import string
import pandas as pd
import numpy as np
import gensim

In [109]:
# Read the whole pre-processed corpus into memory. The context manager
# closes the file handle as soon as the read finishes (the bare open()
# used before left it open for the lifetime of the kernel).
with open('data_preprocessed.txt', 'r') as sample:
    s = sample.read()

# Replace every newline with a space so the corpus becomes one continuous string.
f = s.replace('\n', ' ')

In [110]:
# Prepare the training data for the Word2Vec model:
# split the corpus into sentences, then each sentence into its word tokens,
# giving one list of tokens per sentence.
data = [list(word_tokenize(sentence)) for sentence in sent_tokenize(f)]

In [111]:
# Train the Word2Vec model on the tokenized sentences:
# 100-dimensional vectors, a context window of 3, and every word kept (min_count=1).
# NOTE(review): `size` is the keyword used by gensim releases before 4.0;
# later releases call it `vector_size` -- confirm against the installed version.
model = Word2Vec(sentences=data, size=100, window=3, min_count=1)

In [112]:
# Sanity check: nearest neighbours of a sample word.
# Query through the KeyedVectors object (`model.wv`): calling similar_by_word
# on the model itself is the deprecated path that emits a DeprecationWarning.
model.wv.similar_by_word('consequence')

  """Entry point for launching an IPython kernel.


[('bellows', 0.7524147629737854),
 ('saltines', 0.749169647693634),
 ('salary', 0.7470896244049072),
 ('pall', 0.7453371286392212),
 ('recipient', 0.7426235675811768),
 ('occurrence', 0.73431396484375),
 ('insanity', 0.7312554121017456),
 ('womanhood', 0.7241902351379395),
 ('sample', 0.7230222225189209),
 ('heatstroke', 0.7195021510124207)]

In [113]:
def word_count(list_of_words):
    """Tally how often each word occurs in a verse/sentence.

    list_of_words: a pre-processed list of words in a verse/sentence.

    Returns a dict mapping every unique word to its number of occurrences,
    with keys in order of first appearance.
    """
    frequencies = {}
    for token in list_of_words:
        # unseen words start from 0, so the first sighting stores 1
        frequencies[token] = frequencies.get(token, 0) + 1
    return frequencies

In [114]:
def senemb(alpha, verse_tok, model, dim):
    """Compute a frequency-weighted composite embedding for one verse/sentence.

    Every token's word vector is scaled by ``alpha / (alpha + p(word))``,
    where ``p(word)`` is the word's relative frequency inside ``verse_tok``.
    The scaled vectors are summed and the sum is divided by the number of
    tokens.

    Parameters
    ----------
    alpha : number
        Smoothing constant of the weighting term.
    verse_tok : list of str
        The verse/sentence, already tokenized and pre-processed.
    model : object
        Trained embedding model; vectors are looked up as ``model.wv[word]``.
    dim : int
        Length of the returned vector. It must match the length of the
        vectors returned by ``model.wv`` for the addition to work.

    Returns
    -------
    numpy.ndarray
        Vector of shape ``(dim,)``. An all-zero vector is returned when
        ``verse_tok`` is empty (nothing to average).

    Raises
    ------
    Whatever ``model.wv[word]`` raises for a token it does not know
    propagates to the caller unchanged.
    """
    # create an empty vector
    sentEmb = np.zeros(dim)

    # Normalise by the number of tokens of *this* verse. The previous code
    # used the global `verse` here, i.e. the character count of whichever raw
    # verse string the notebook had looped over last.
    n_tokens = len(verse_tok)
    if n_tokens == 0:
        # avoid a division by zero for empty input
        return sentEmb

    v_count = word_count(verse_tok)

    for word in verse_tok:
        prob = alpha / (alpha + v_count[word] / n_tokens)
        sentEmb = sentEmb + model.wv[word] * prob

    return sentEmb / n_tokens

In [115]:
# Sentence embeddings for the verses.

# Only the verse number and its translation are read from the CSV.
verses = pd.read_csv('verses.csv', usecols=['#verse', 'Translation'])
# NOTE: this rebinds `data`, which previously held the Word2Vec training sentences.
data = verses['Translation'].to_list()

# translation table that deletes every ASCII punctuation character (built once)
strip_punct = str.maketrans('', '', string.punctuation)

# one composite vector per verse, in the same order as the rows of `verses`
verse_vectors = []

for verse in data:
    # pre-process (lower-case, drop punctuation), then tokenize the verse
    lowered = verse.lower()
    verse_tok = word_tokenize(lowered.translate(strip_punct))
    verse_vectors.append(senemb(1, verse_tok, model, 100))

In [116]:
# Here we calculate the sentence embedding vectors for the queries.

# getting the queries (raw strings from the 'Verse' column)
df = pd.read_csv('queries_sem.csv')
queries_ = df['Verse'].to_list()

# Pre-processing the queries: lower-case, drop ASCII punctuation, tokenize.
# word_tokenize is used (instead of str.split) so that queries are tokenized
# exactly like the verses and like the corpus the Word2Vec vocabulary was
# built from.
punct_table = str.maketrans('', '', string.punctuation)
queries = [word_tokenize(query.lower().translate(punct_table)) for query in queries_]

# one composite vector per query, in the same order as `queries_`
query_vectors = [senemb(1, query, model, 100) for query in queries]

In [117]:
# Display the first raw query string (as loaded, before lower-casing and punctuation removal)
queries_[0]

' can I get rid of my sins?'

In [118]:
# queries_scores: cosine similarity of each query with all the verses,
# stored as a 2-D list indexed as queries_scores[query][verse].
queries_scores = []

# The verse norms do not depend on the query, so compute them only once.
verse_norms = [norm(v_vec) for v_vec in verse_vectors]

# looping over all the queries
for q_vec in query_vectors:
    q_norm = norm(q_vec)
    # scores of this particular query against every verse
    query_scores = []
    for v_vec, v_norm in zip(verse_vectors, verse_norms):
        denom = q_norm * v_norm
        # A zero-length vector has no direction: score it 0.0 instead of
        # letting 0/0 produce NaN, which would corrupt any later sorting.
        score = dot(q_vec, v_vec) / denom if denom > 0 else 0.0
        query_scores.append(score)

    queries_scores.append(query_scores)

In [119]:
# Now let's move towards getting the answers for the queries:
# write the three best-matching verses of every query to a text file.

# top3_verseNo: verse nos. of the top three verses for each query,
# to be put as results in the mAP function.
top3_verseNo = []

# The context manager flushes and closes the file even if a write fails
# (the handle was previously never closed).
with open('best3_word2vec.txt', 'w+') as f:
    for q_idx, scores in enumerate(queries_scores):
        # Rank verse positions by score, best first, and keep the top three.
        # Ranking positions (instead of looking sorted scores up again with
        # list.index) keeps tied scores from yielding the same verse twice.
        best = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)[:3]
        rows = [verses.iloc[v_idx] for v_idx in best]

        # writing the results to the file
        f.write('For query:\n' + '""' + queries_[q_idx] + '""' + '\n\n')
        f.write('The Best 3 matching verses are:\n')
        for rank, row in enumerate(rows, start=1):
            f.write(str(rank) + ') ' + row['#verse'] + row['Translation'] + '\n\n')

        # verse numbers with the colons removed, e.g. '1.2:' -> '1.2'
        top3_verseNo.append([row['#verse'].replace(":", "") for row in rows])

In [120]:
# Print the rows at positions 1 and 2 of the verses table as plain text.
preview = verses.iloc[[1, 2]]
print(preview)

  #verse                                        Translation
1   1.2:  Sanjaya said: O King, after looking over the a...
2   1.3:  O my teacher, behold the great army of the son...
