In [1]:
import pandas as pd
import string

# Load the verse table, keeping only the verse identifier and its translation.
VERSE_COLUMNS = ['#verse', 'Translation']
verses = pd.read_csv('verses.csv', usecols=VERSE_COLUMNS)

# Corpus as a plain Python list of translation strings, in file order.
data = verses.loc[:, 'Translation'].tolist()

# Display the loaded table as the cell output.
verses

Unnamed: 0,#verse,Translation
0,1.1:,"Dhrtarastra said: O Sanjaya, after my sons and..."
1,1.2:,"Sanjaya said: O King, after looking over the a..."
2,1.3:,"O my teacher, behold the great army of the son..."
3,1.4:,Here in this army are many heroic bowmen equal...
4,1.5:,"There are also great heroic, powerful fighters..."
...,...,...
652,18.74:,Sanjaya said: Thus have I heard the conversati...
653,18.75:,"By the mercy of Vyasa, I have heard these most..."
654,18.76:,"O King, as I repeatedly recall this wondrous a..."
655,18.77:,"O King, as I remember the wonderful form of Lo..."


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiate the TF-IDF model (IDF re-weighting explicitly enabled).
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
# Learn the vocabulary and IDF weights from the verses and vectorize them in
# one step; the result is a sparse matrix with one row per entry of `data`.
tfidf_vectors = tfidf_vectorizer.fit_transform(data)

# Vocabulary terms, in the column order of `tfidf_vectors`.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and only fall back on older installs.
# The result is converted to a plain list so later cells can keep treating
# `feature_names` as a list of strings.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

# Term-by-document table: one row per vocabulary term (row label = the term),
# one column per verse (column label = the verse's position in `data`).
# `.toarray()` yields a regular ndarray instead of the legacy `np.matrix`
# that `.todense()` returns.
document_term_matrix = pd.DataFrame(tfidf_vectors.T.toarray(), index=feature_names)

In [3]:
# Read the query sentences from the 'Verse' column of the query file.
df = pd.read_csv('queries_sem.csv')

# Translation table that deletes every ASCII punctuation character.
punctuation_remover = str.maketrans('', '', string.punctuation)

# Normalize each query: drop trailing whitespace, lowercase, strip punctuation.
queries = []
for raw_query in df['Verse'].to_list():
    queries.append(raw_query.rstrip().lower().translate(punctuation_remover))

In [4]:
# Now let's move towards getting the answers for the queries.
# For every query, verses are scored by the sum of the TF-IDF weights of the
# query words they contain; the three highest-scoring verses are recorded in
# `results` and written to a plain-text report.
results = []

# Build the vocabulary set once instead of on every loop iteration.
vocabulary = set(feature_names)

# `with` guarantees the report is flushed and closed, even if a query fails.
with open('best3_tfidf.txt', 'w+') as f:
    for query in queries:
        # Query words that are also vocabulary terms. A sorted list is used
        # because recent pandas versions reject a set as a `.loc` indexer,
        # and sorting keeps the row order deterministic.
        common = sorted(set(query.split()).intersection(vocabulary))

        # Per-verse score: column-wise sum over the rows of the matching terms.
        scores = document_term_matrix.loc[common].sum()

        # Column labels (verse positions) of the 3 best-scoring verses.
        top_indices = scores.sort_values(ascending=False).head(3).index.to_list()

        # Collect the verse numbers (without the trailing ':') as a 2d list.
        results.append([verse_num.rstrip(':') for verse_num in verses.iloc[top_indices]['#verse'].to_list()])

        # Write the query and its three best matches to the report file.
        f.write('For query:\n' + '""' + query + '""' + '\n\n')
        f.write('The Best 3 matching verses are:\n')
        # Distinct loop names: the original reused `ind` for both the list
        # being iterated and the rank counter.
        for rank, verse_index in enumerate(top_indices, start=1):
            f.write(str(rank) + ') ' + data[verse_index] + '\n\n')

In [5]:
# Display the collected matches: one inner list of up to three verse numbers per query.
results

[['11.54', '1.29', '11.5'],
 ['8.9', '5.20', '15.19'],
 ['14.27', '2.15', '6.32'],
 ['1.36', '14.4', '17.14'],
 ['12.10', '1.36', '2.54'],
 ['15.16', '18.21', '10.22'],
 ['15.5', '12.18-19', '8.9'],
 ['4.4', '13.3', '15.10'],
 ['2.54', '8.2', '10.11'],
 ['8.2', '5.14', '2.54'],
 ['4.4', '10.10', '2.2'],
 ['4.4', '9.25', '14.18'],
 ['9.30', '11.54', '17.20'],
 ['18.45', '8.2', '15.10']]