In [9]:
import fasttext
import string
import pandas as pd
import numpy as np
from numpy.linalg import norm
from numpy import dot

In [2]:
# Train unsupervised fastText word vectors on the pre-processed corpus.
# The settings below are passed straight to fastText; `dim = 50` is the same
# vector size that the embedding cells later pass to `senemb`.
model = fasttext.train_unsupervised(
    'data_preprocessed.txt',
    "cbow",
    minn = 2,
    maxn = 13,
    epoch = 1,
    dim = 50,
    thread = 2,
)

In [3]:
# a function to get the word frequency of each word in a verse/sentence
def word_count(list_of_words):
    """Count how many times each word occurs in a tokenized verse/sentence.

    Parameters
    ----------
    list_of_words : iterable of hashable
        The pre-processed words of one verse/sentence.

    Returns
    -------
    dict
        Maps every distinct word to its number of occurrences, with keys in
        order of first appearance. Empty input gives an empty dict.
    """
    frequencies = {}

    for token in list_of_words:
        # a word seen for the first time starts from 0
        frequencies[token] = frequencies.get(token, 0) + 1

    return frequencies

In [4]:
# a function to calculate composite sentence/verse embeddings
# verse has to be given in a tokenized format to this.
def senemb(alpha, verse_tok, model, dim):
    """Return a frequency-weighted average of the word vectors of one verse.

    Every token's vector is scaled by ``alpha / (alpha + p)``, where ``p`` is
    the token's relative frequency inside ``verse_tok``. The scaled vectors
    are summed over all tokens (repeats included) and the sum is divided by
    the number of tokens.

    Parameters
    ----------
    alpha : float
        Smoothing constant of the weight ``alpha / (alpha + p)``.
    verse_tok : list of str
        The tokenized, pre-processed verse/sentence.
    model : object
        Anything exposing ``get_word_vector(word)`` that returns a vector of
        length ``dim`` (e.g. the trained fastText model).
    dim : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        Vector of length ``dim``; all zeros when ``verse_tok`` is empty.
    """
    from collections import Counter

    # creating an empty vector
    sentEmb = np.zeros(dim)

    # Lengths must come from the token list that was passed in, never from a
    # notebook-level variable: the result has to depend only on the arguments.
    n_tokens = len(verse_tok)
    if n_tokens == 0:
        # nothing to average; also avoids dividing by zero below
        return sentEmb

    v_count = Counter(verse_tok)

    for word in verse_tok:
        prob = alpha / (alpha + v_count[word] / n_tokens)
        sentEmb = sentEmb + model.get_word_vector(word) * prob

    return sentEmb / n_tokens

In [5]:
# Build one sentence-embedding vector per verse.

# load the verse identifiers and translations, keep the translations as a list
verses = pd.read_csv('verses.csv', usecols=['#verse', 'Translation'])
data = verses['Translation'].to_list()

# translation table that deletes every character in string.punctuation
# (built once instead of once per verse)
punct_table = str.maketrans('', '', string.punctuation)

# verse_vectors[k] is the embedding of data[k]
verse_vectors = []

# A plain for-loop is kept on purpose so that `verse` and `verse_tok` remain
# notebook-level names, exactly as in the original cell.
for verse in data:
    # lower-case, drop punctuation, split on whitespace
    lowered = verse.lower()
    verse_tok = lowered.translate(punct_table).split()
    verse_vectors.append(senemb(1, verse_tok, model, 50))

In [12]:
# Build one sentence-embedding vector per query.

# load the queries
df = pd.read_csv('queries_sem.csv')

# translation table that deletes every character in string.punctuation
strip_punct = str.maketrans('', '', string.punctuation)

# queries[k]       : token list of the k-th query (lower-cased, punctuation removed)
# query_vectors[k] : embedding of queries[k]
queries = []
query_vectors = []

for query_text in df['Verse'].to_list():
    query = query_text.lower().translate(strip_punct).split()
    queries.append(query)
    query_vectors.append(senemb(1, query, model, 50))

In [13]:
# number of best-matching verses kept for each query
TOP_K = 3

# the norm of a verse vector does not depend on the query, so compute it once
verse_norms = [norm(vec) for vec in verse_vectors]

#list1: to store cos. sim. of each query with the verses
#       list1[q][v] is the cosine similarity of query q and verse v
list1 = []

for query_vec in query_vectors:
    query_norm = norm(query_vec)
    list2 = []
    for verse_vec, verse_norm in zip(verse_vectors, verse_norms):
        denom = query_norm * verse_norm
        # cosine similarity is undefined for a zero-length vector; score it
        # as 0.0 instead of producing NaN, which would corrupt the ranking
        d = dot(query_vec, verse_vec) / denom if denom > 0 else 0.0
        list2.append(d)
    list1.append(list2)

#list3: to store verse nos. of top three verses for each query
#to be put as results in mAP fun.
list3 = []

for scores in list1:
    # Rank verse positions, not score values: looking a score up again with
    # list.index() returns the first match only, so tied scores would put the
    # same verse into the result several times. Negating the scores and using
    # a stable sort gives highest-first order, ties broken by verse position.
    top_positions = np.argsort(-np.asarray(scores, dtype=float), kind='stable')[:TOP_K]

    # first column of `verses` holds the verse identifier; drop the ":" in it
    list4 = [verses.iloc[pos, 0].replace(":", "") for pos in top_positions]
    list3.append(list4)