In [1]:
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gensim

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from gensim.utils import simple_preprocess
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

import gensim
import gensim.corpora as corpora
from gensim import models
from pprint import pprint
import gensim.corpora as corpora

In [3]:
df = pd.read_csv('./sutta_csv/cleaned/df_all_prep.csv')

# Modeling Part 1 (M1): Topic Analysis

Here I develop two topic analysis models one using TFIDF and another using latent semantic analysis. Given the large computation requirements this was originally developed in google colab. 

This was unfortunately a bit of a bust. The topics that the model was identifying were not particularly interpretable and given the aim of the project the results were not usable (despite a decent coherence score on the LDA model with tfidf). 

### (M1.1) Topic Analysis using Tfidf and TruncatedSVD

In [None]:
#Instantiate and fit tfidfvectorizer

vectorizer = TfidfVectorizer(stop_words='english', max_features= 3000)

X = vectorizer.fit_transform(df['text_full'])

(103, 3000)

#### Using TruncastedSVD 

In [None]:
# Base code developed from the following blog, modified to fit requirements here. - https://www.analyticsvidhya.com/blog/2018/10/stepwise-guide-topic-modeling-latent-semantic-analysis/

#Instantiating and fitting model
svd_model = TruncatedSVD(n_components=20, algorithm='randomized', 
                         n_iter=100, random_state=42)

svd_model.fit(X)

#Topic retrieval 
len(svd_model.components_)

20

In [None]:
## Print out the relevant topics

terms = vectorizer.get_feature_names()

for i, comp in enumerate(svd_model.components_):
    terms_comp = zip(terms, comp)
    sorted_terms = sorted(terms_comp, key= lambda x:x[1], reverse=True)[:7]
    print("Topic "+str(i)+": ")
    for t in sorted_terms:
        print(t[0])
        print(" ")

Topic 0: 
pleasure
 
self
 
feeling
 
view
 
consciousness
 
gotama
 
qualities
 
Topic 1: 
inconstant
 
consciousness
 
self
 
contact
 
feeling
 
property
 
clinging
 
Topic 2: 
introduction
 
discourse
 
papañca
 
simile
 
clinging
 
leading
 
similes
 
Topic 3: 
inconstant
 
gotama
 
master
 
view
 
self
 
stressful
 
constant
 
Topic 4: 
heartwood
 
property
 
introduction
 
inconstant
 
consummation
 
gotama
 
breathe
 
Topic 5: 
heartwood
 
consummation
 
fame
 
offerings
 
gain
 
knowledge
 
vision
 
Topic 6: 
heartwood
 
consummation
 
view
 
breathe
 
property
 
resolve
 
knowledge
 
Topic 7: 
integrity
 
person
 
view
 
dimension
 
perception
 
quality
 
wrong
 
Topic 8: 
sensuality
 
drawback
 
inconstant
 
reason
 
source
 
stress
 
action
 
Topic 9: 
inconstant
 
stressful
 
subject
 
change
 
constant
 
ven
 
venerable
 
Topic 10: 
integrity
 
inconstant
 
person
 
quality
 
sensuality
 
stressful
 
breathe
 
Topic 11: 
pleasure
 
torturing
 
torments
 
devoted
 
conduct

## (M1.2) Topic Analysis with LDA

This blog was extremely helpful for this section. Code and troubleshooting from here: https://highdemandskills.com/topic-modeling-lda/.

### (M1.2.1) With Bag of Words

In [None]:
#Creating dictionary and bag of words manipulated corpus. Help and base code from: https://github.com/RaRe-Technologies/gensim/issues/2644
ID2word = corpora.Dictionary(df['text_no_stop'])
texts = df['text_no_stop']
corpus = [ID2word.doc2bow(sutta) for sutta in texts]

In [None]:
# Train LDA model on the corpus generated above
lda_model = gensim.models.LdaMulticore(corpus=corpus, num_topics=5, id2word=ID2word, passes=100)

# View topics from the lda model
pprint(lda_model.print_topics(num_words=5))

In [None]:
#Use the coherence score to evaluate the effectiveness of the model.
coherence_model_lda = gensim.models.CoherenceModel(model=lda_model, texts=df['text_no_stop'], dictionary=ID2word, coherence='c_v')

#Print Coherence
coherence_lda = coherence_model_lda.get_coherence()
print('-'*50)
print('\nCoherence Score:', coherence_lda)
print('-'*50)

[(0, '0.023*"one" + 0.012*"blessed" + 0.011*"monks" + 0.008*"person"'),
 (1, '0.019*"right" + 0.013*"one" + 0.011*"stress" + 0.010*"pleasure"'),
 (2, '0.014*"one" + 0.011*"mind" + 0.010*"body" + 0.007*"monk"'),
 (3, '0.017*"one" + 0.009*"come" + 0.008*"dhamma" + 0.008*"clinging"'),
 (4, '0.021*"one" + 0.013*"self" + 0.013*"feeling" + 0.012*"blessed"')]
--------------------------------------------------

Coherence Score: 0.30941882568080414
--------------------------------------------------


### (M1.2.2) Applying TFIDF

In [None]:
## Use TFIDF in conjunction with the text 
corpus = [ID2word.doc2bow(sutta) for sutta in texts]
TFIDF = models.TfidfModel(corpus)
text_tfidf = TFIDF[corpus]

## Train LDA model on new TFIDF corpus
lda_modeltf = gensim.models.LdaMulticore(corpus=text_tfidf, num_topics=5, id2word=ID2word, passes=100)

## Print topics from new model
pprint(lda_modeltf.print_topics(num_words=5))

In [None]:
# Set up coherence model
coherence_model_lda = gensim.models.CoherenceModel(model=lda_modeltf, texts=df_mn2['text_no_stop'], dictionary=ID2word, coherence='c_v')

# Print coherence
coherence_lda = coherence_model_lda.get_coherence()
print('-'*50)
print('\nCoherence Score:', coherence_lda)
print('-'*50)

# Modeling Step 2 (M2): Clustering

## K-Means and TFIDF

In [None]:
## https://github.com/harrywang/document_clustering/blob/master/doc_clustering.ipynb

In [None]:
from nltk.stem.snowball import SnowballStemmer 
# load nltk's English stopwords as variable called 'stopwords'
# use nltk.download() to install the corpus first
# Stop Words are words which do not contain important significance to be used in Search Queries
stopwords = nltk.corpus.stopwords.words('english')

# load nltk's SnowballStemmer as variabled 'stemmer'
stemmer = SnowballStemmer("english")

In [None]:
sents = [sent for sent in nltk.sent_tokenize("Today (May 19, 2016) is his only daughter's wedding. Vito Corleone is the Godfather. Vito's youngest son, Michael, in a Marine Corps uniform, introduces his girlfriend, Kay Adams, to his family at the sprawling reception.")]

In [None]:
words = [word for word in nltk.word_tokenize(sents[0])]
words

In [None]:
filtered_words = []
for word in words:
        if re.search('[a-zA-Z]', word):
            filtered_words.append(word)
filtered_words

In [None]:
# see how "only" is stemmed to "onli" and "wedding" is stemmed to "wed"
stems = [stemmer.stem(t) for t in filtered_words]
stems

In [None]:
# here I define a tokenizer and stemmer which returns the set of stems in the text that it is passed
# Punkt Sentence Tokenizer, sent means sentence 
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

In [None]:
def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [None]:
words_stemmed = tokenize_and_stem("Wilderness")
print(words_stemmed)

In [None]:

words_only = tokenize_only("Today (May 19, 2016) is his only daughter's wedding.")
words_only

In [None]:
text_full = []
for i in df_mn2['text_full']:
  text_full.append(i)

In [None]:
totalvocab_stemmed = []
totalvocab_tokenized = []
for i in text_full:
    allwords_stemmed = tokenize_and_stem(i) # for each item in 'synopses', tokenize/stem
    totalvocab_stemmed.extend(allwords_stemmed) # extend the 'totalvocab_stemmed' list
    
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)

In [None]:

print(len(totalvocab_stemmed))
print(len(totalvocab_tokenized))

In [None]:

vocab_frame = pd.DataFrame({'words': totalvocab_tokenized})
print('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')
print(vocab_frame.head())

In [None]:
words_frame = pd.DataFrame({'WORD': words_only}, index = words_stemmed)
print('there are ' + str(words_frame.shape[0]) + ' items in words_frame')
print(words_frame)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=20000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(df_mn2['text_full']) #fit the vectorizer to synopses

# (100, 563) means the matrix has 100 rows and 563 columns
print(tfidf_matrix.shape)
terms = tfidf_vectorizer.get_feature_names()
len(terms)

In [None]:
terms

In [None]:
from sklearn.cluster import KMeans

num_clusters = 5

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

In [None]:
!pip install joblib

In [None]:
import joblib

joblib.dump(km,  'doc_cluster.pkl')

km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()
# clusters show which cluster (0-4) each of the 100 synoposes belongs to
print(len(clusters))
print(clusters)

In [None]:
# list_int = clusters
    
# # mapping 
# list_string = map(str, list_int) 
# clusters_s = list(list_string)
# # Printing sorted list of integers 
# print(clusters_s)

In [None]:
df_mn2['cluster'] = clusters

In [None]:
df_small = df_mn2[['title', 'cluster']].copy()

In [None]:
df_small.head()

In [None]:
terms

In [None]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

In [None]:
order_centroids

In [None]:
terms[143]

In [None]:
##https://github.com/harrywang/document_clustering/blob/master/doc_clustering.ipynb
from __future__ import print_function

print("Top terms per cluster:")

#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        print(' %s' % terms[ind].encode('utf-8', 'ignore'), end=',')
    print() #add whitespace
    print() #add whitespace

    
    print("Cluster %d titles:" % i, end='')
    for title in df_small.loc[df_small['clusters'] == i, 'title']:
        print(' %s,' % title, end='')
    print() #add whitespace
    print() #add whitespace

In [None]:
frames.head(1)

In [None]:
df_mn2['clusters'] = clusters

In [None]:
df_small = df_mn2[['title', 'clusters']].copy()

# frame['cluster'].value_counts() #number of films per cluster (clusters from 0 to 4)

In [None]:
vocab_frame.head()

In [None]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

In [None]:
order_centroids

In [None]:
terms[60]

In [None]:
similarity_distance = 1 - cosine_similarity(tfidf_matrix)
print(type(similarity_distance))
print(similarity_distance.shape)

# New Modeling Approach - Similarities

In [None]:
# https://towardsdatascience.com/calculating-document-similarities-using-bert-and-other-models-b2c1a29c9630

In [None]:
df_mn3 = df_mn2[['title', 'ref', 'text_full']].copy()

In [None]:
df_mn3 = pd.DataFrame(df_mn3)

In [None]:
df_mn3.head(1)

Unnamed: 0,title,ref,text_full
0,MN 1 Mūlapariyāya Sutta | The Root Sequence,MN 1,"\n I have heard that on one occasion the Blessed One was staying near Ukkaṭṭhā, in the shade of a royal Sal tree in the Very Blessed Forest. There he addressed the monks, “Monks!”\n “Yes, lord,” the monks responded to him.\n The Blessed One said, “Monks, I will teach you the sequence of the root of all phenomena [or: the root sequence of all phenomena]. Listen & pay close attention. I will speak.”\n “As you say, lord,” they responded to him.\n The Blessed One said: “There is the case, monks, where an uninstructed run-of-the-mill person—who has no regard for noble ones, is not well-versed or disciplined in their Dhamma; who has no regard for people of integrity, is not well-versed or disciplined in their Dhamma—perceives earth as earth. Perceiving earth as earth, he supposes (things) about earth, he supposes (things) in earth, he supposes (things) coming out of earth, he supposes earth as ‘mine,’ he delights in earth. Why is that? Because he has not comprehended it, I tell you.\n “He perceives water as water… fire as fire… wind as wind1… beings as beings… devas as devas… Pajāpati as Pajāpati… Brahmā as Brahmā… the Radiant devas as Radiant devas… the Beautiful Black devas as Beautiful Black devas… the Sky-fruit devas as Sky-fruit devas… the Conqueror as the Conqueror2… the dimension of the infinitude of space as the dimension of the infinitude of space… the dimension of the infinitude of consciousness as the dimension of the infinitude of consciousness… the dimension of nothingness as the dimension of nothingness… the dimension of neither perception nor non-perception as the dimension of neither perception nor non-perception3… the seen as the seen… the heard as the heard… the sensed as the sensed… the cognized as the cognized4… singleness as singleness… multiplicity as multiplicity5… the All as the All6…\n “He perceives unbinding as unbinding.7 Perceiving unbinding as unbinding, he supposes things about unbinding, he supposes things in unbinding, he supposes things coming out of unbinding, he supposes unbinding as ‘mine,’ he delights in unbinding. Why is that? Because he has not comprehended it, I tell you.\n The Trainee\n “A monk who is a trainee—yearning for the unexcelled relief from bondage, his aspirations as yet unfulfilled—directly knows earth as earth. Directly knowing earth as earth, let him not suppose things about earth, let him not suppose things in earth, let him not suppose things coming out of earth, let him not suppose earth as ‘mine,’ let him not delight in earth. Why is that? So that he may comprehend it, I tell you.\n “He directly knows water as water… fire as fire… wind as wind… beings as beings… devas as devas… Pajāpati as Pajāpati… Brahmā as Brahmā… the Radiant devas as Radiant devas… the Beautiful Black devas as Beautiful Black devas… the Sky-fruit devas as Sky-fruit devas… the Conqueror as the Conqueror… the dimension of the infinitude of space as the dimension of the infinitude of space… the dimension of the infinitude of consciousness as the dimension of the infinitude of consciousness… the dimension of nothingness as the dimension of nothingness… the dimension of neither perception nor non-perception as the dimension of neither perception nor non-perception… the seen as the seen… the heard as the heard… the sensed as the sensed… the cognized as the cognized… singleness as singleness… multiplicity as multiplicity… the All as the All…\n “He directly knows unbinding as unbinding. Directly knowing unbinding as unbinding, let him not suppose things about unbinding, let him not suppose things in unbinding, let him not suppose things coming out of unbinding, let him not suppose unbinding as ‘mine,’ let him not delight in unbinding. Why is that? So that he may comprehend it, I tell you.\n The Arahant\n “A monk who is a Worthy One, devoid of effluents—who has attained completion, finished the task, laid down the burden, attained the true goal, destroyed the fetters of becoming, and is released through right knowledge—directly knows earth as earth. Directly knowing earth as earth, he doesn’t suppose things about earth, doesn’t suppose things in earth, doesn’t suppose things coming out of earth, doesn’t suppose earth as ‘mine,’ doesn’t delight in earth. Why is that? Because he has comprehended it, I tell you.\n “He directly knows water as water… fire as fire… wind as wind… beings as beings… devas as devas… Pajāpati as Pajāpati… Brahmā as Brahmā… the Radiant devas as Radiant devas… the Beautiful Black devas as Beautiful Black devas… the Sky-fruit devas as Sky-fruit devas… the Conqueror as the Conqueror… the dimension of the infinitude of space as the dimension of the infinitude of space… the dimension of the infinitude of consciousness as the dimension of the infinitude of consciousness… the dimension of nothingness as the dimension of nothingness… the dimension of neither perception nor non-perception as the dimension of neither perception nor non-perception… the seen as the seen… the heard as the heard… the sensed as the sensed… the cognized as the cognized… singleness as singleness… multiplicity as multiplicity… the All as the All…\n “He directly knows unbinding as unbinding. Directly knowing unbinding as unbinding, he doesn’t suppose things about unbinding, doesn’t suppose things in unbinding, doesn’t suppose things coming out of unbinding, doesn’t suppose unbinding as ‘mine,’ doesn’t delight in unbinding. Why is that? Because he has comprehended it, I tell you.\n “A monk who is a Worthy One, devoid of effluents… directly knows earth as earth. Directly knowing earth as earth, he doesn’t suppose things about earth, doesn’t suppose things in earth, doesn’t suppose things coming out of earth, doesn’t suppose earth as ‘mine,’ doesn’t delight in earth. Why is that? Because, with the ending of passion, he is devoid of passion, I tell you.\n “He directly knows water as water… the All as the All…\n “He directly knows unbinding as unbinding. Directly knowing unbinding as unbinding, he doesn’t suppose things about unbinding, doesn’t suppose things in unbinding, doesn’t suppose things coming out of unbinding, doesn’t suppose unbinding as ‘mine,’ doesn’t delight in unbinding. Why is that? Because, with the ending of passion, he is devoid of passion, I tell you.\n “A monk who is a Worthy One, devoid of effluents… directly knows earth as earth. Directly knowing earth as earth, he doesn’t suppose things about earth, doesn’t suppose things in earth, doesn’t suppose things coming out of earth, doesn’t suppose earth as ‘mine,’ doesn’t delight in earth. Why is that? Because, with the ending of aversion, he is devoid of aversion, I tell you.\n “He directly knows water as water… the All as the All…\n “He directly knows unbinding as unbinding. Directly knowing unbinding as unbinding, he doesn’t suppose things about unbinding, doesn’t suppose things in unbinding, doesn’t suppose things coming out of unbinding, doesn’t suppose unbinding as ‘mine,’ doesn’t delight in unbinding. Why is that? Because, with the ending of aversion, he is devoid of aversion, I tell you.\n “A monk who is a Worthy One, devoid of effluents… directly knows earth as earth. Directly knowing earth as earth, he doesn’t suppose things about earth, doesn’t suppose things in earth, doesn’t suppose things coming out of earth, doesn’t suppose earth as ‘mine,’ doesn’t delight in earth. Why is that? Because, with the ending of delusion, he is devoid of delusion, I tell you.\n “He directly knows water as water… the All as the All…\n “He directly knows unbinding as unbinding. Directly knowing unbinding as unbinding, he doesn’t suppose things about unbinding, doesn’t suppose things in unbinding, doesn’t suppose things coming out of unbinding, doesn’t suppose unbinding as ‘mine,’ doesn’t delight in unbinding. Why is that? Because, with the ending of delusion, he is devoid of delusion, I tell you.\n The Tathāgata\n “The Tathāgata—a worthy one, rightly self-awakened—directly knows earth as earth. Directly knowing earth as earth, he doesn’t suppose things about earth, doesn’t suppose things in earth, doesn’t suppose things coming out of earth, doesn’t suppose earth as ‘mine,’ doesn’t delight in earth. Why is that? Because the Tathāgata has comprehended it to the end, I tell you.\n “He directly knows water as water… fire as fire… wind as wind… beings as beings… devas as devas… Pajāpati as Pajāpati… Brahmā as Brahmā… the Radiant devas as Radiant devas… the Beautiful Black devas as Beautiful Black devas… the Sky-fruit devas as Sky-fruit devas… the Conqueror as the Conqueror… the dimension of the infinitude of space as the dimension of the infinitude of space… the dimension of the infinitude of consciousness as the dimension of the infinitude of consciousness… the dimension of nothingness as the dimension of nothingness… the dimension of neither perception nor non-perception as the dimension of neither perception nor non-perception… the seen as the seen… the heard as the heard… the sensed as the sensed… the cognized as the cognized… singleness as singleness… multiplicity as multiplicity… the All as the All…\n “He directly knows unbinding as unbinding. Directly knowing unbinding as unbinding, he doesn’t suppose things about unbinding, doesn’t suppose things in unbinding, doesn’t suppose things coming out of unbinding, doesn’t suppose unbinding as ‘mine,’ doesn’t delight in unbinding. Why is that? Because the Tathāgata has comprehended it to the end, I tell you.\n “The Tathāgata—a worthy one, rightly self-awakened—directly knows earth as earth. Directly knowing earth as earth, he doesn’t suppose things about earth, doesn’t suppose things in earth, doesn’t suppose things coming out of earth, doesn’t suppose earth as ‘mine,’ doesn’t delight in earth. Why is that? Because he has known that delight is the root of suffering & stress, that from coming-into-being there is birth, and that for what has come into being there is aging & death. Therefore, with the total ending, fading away, cessation, letting go, relinquishment of craving, the Tathāgata has totally awakened to the unexcelled right self-awakening, I tell you.\n “He directly knows water as water… the All as the All…\n “He directly knows unbinding as unbinding. Directly knowing unbinding as unbinding, he doesn’t suppose things about unbinding, doesn’t suppose things in unbinding, doesn’t suppose things coming out of unbinding, doesn’t suppose unbinding as ‘mine,’ doesn’t delight in unbinding. Why is that? Because he has known that delight is the root of suffering & stress, that from coming-into-being there is birth, and that for what has come into being there is aging & death. Therefore, with the total ending, fading away, cessation, letting go, relinquishment of craving, the Tathāgata has totally awakened to the unexcelled right self-awakening, I tell you.”\n That is what the Blessed One said. Displeased, the monks did not delight in the Blessed One’s words."


### With TFIDF

In [None]:
# https://www.analyticsvidhya.com/blog/2018/10/stepwise-guide-topic-modeling-latent-semantic-analysis/
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', 
max_features= 20000, # keep top 1000 terms 
max_df = 0.5, 
smooth_idf=True)

X = vectorizer.fit_transform(df_mn3['text_full'])


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

In [None]:
pairwise_similarities=np.dot(X,X.T).toarray()
pairwise_differences=euclidean_distances(X)

In [None]:
def most_similar(doc_id,similarity_matrix,matrix,number):
    print (f'Document: {df_mn3.iloc[doc_id]["title"]}')
    print ('\n')
    print ('Similar Documents:')
    if matrix=='Cosine Similarity':
        similar_ix=np.argsort(similarity_matrix[doc_id])[::-1][:int(number) -1]
    elif matrix=='Euclidean Distance':
        similar_ix=np.argsort(similarity_matrix[doc_id])[:int(number) -1]
    for ix in similar_ix:
        if ix==doc_id:
            continue
        print('\n')
        print (f'Title: {df_mn3.iloc[ix]["title"]}')
        print (f'Ref: {df_mn3.iloc[ix]["ref"]}')
        print (f'{matrix} : {similarity_matrix[doc_id][ix]}')

most_similar(6,pairwise_similarities,'Cosine Similarity', 5)
most_similar(6,pairwise_differences,'Euclidean Distance', 5)          

Document: MN 9  Sammādiṭṭhi Sutta | Right View


Similar Documents:


Title: MN 78  Samaṇa-Muṇḍika Sutta | Muṇḍika the Contemplative
Ref: MN 78
Cosine Similarity : 0.15916825584708316


Title: MN 61  Ambalaṭṭhikā Rāhulovāda Sutta | The Exhortation to Rāhula at Mango Stone
Ref: MN 61
Cosine Similarity : 0.14168510166959863


Title: MN 60  Apaṇṇaka Sutta | A Safe Bet
Ref: MN 60
Cosine Similarity : 0.11604174206793118
Document: MN 9  Sammādiṭṭhi Sutta | Right View


Similar Documents:


Title: MN 78  Samaṇa-Muṇḍika Sutta | Muṇḍika the Contemplative
Ref: MN 78
Euclidean Distance : 1.296789685456294


Title: MN 61  Ambalaṭṭhikā Rāhulovāda Sutta | The Exhortation to Rāhula at Mango Stone
Ref: MN 61
Euclidean Distance : 1.3102021968615392


Title: MN 60  Apaṇṇaka Sutta | A Safe Bet
Ref: MN 60
Euclidean Distance : 1.329630217716241


In [None]:
print (X[0].toarray())
print (pairwise_similarities.shape)
print (pairwise_similarities[0][:])

[[0. 0. 0. ... 0. 0. 0.]]
(103, 103)
[1.         0.38311196 0.30988342 0.2891808  0.39565505 0.41485866
 0.13622545 0.25407555 0.55424224 0.38745008 0.45618135 0.36108787
 0.34660793 0.38575052 0.3069664  0.39546249 0.41043222 0.37209638
 0.35358632 0.34392074 0.33755922 0.34439733 0.37873648 0.3736021
 0.25119948 0.48890284 0.41503736 0.31277483 0.28591621 0.3849433
 0.38790708 0.42301527 0.33389194 0.43054676 0.37618434 0.36642302
 0.42601748 0.43888858 0.24885097 0.37991875 0.43572802 0.3616452
 0.42027166 0.38463207 0.37700027 0.41336038 0.31067392 0.40901885
 0.389215   0.36831077 0.28754378 0.39746183 0.25883696 0.3001851
 0.36089611 0.18301161 0.41889157 0.39371869 0.42950978 0.30174464
 0.35400893 0.32584472 0.38980423 0.23894279 0.41778435 0.29055799
 0.36391645 0.25261796 0.38372024 0.36821325 0.3909372  0.39934236
 0.18925951 0.37509328 0.38799224 0.43230489 0.47227923 0.23863807
 0.33875223 0.38962773 0.37676387 0.48619618 0.42362059 0.22599288
 0.32955409 0.42929866 0.4141

In [None]:
most_similar(2,pairwise_similarities,'Cosine Similarity')

TypeError: ignored

### With Doc2Vec

In [None]:
#https://towardsdatascience.com/calculating-document-similarities-using-bert-and-other-models-b2c1a29c9630

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

tagged_data = [TaggedDocument(words=word_tokenize(doc), tags=[i]) for i, doc in enumerate(df_mn3['text_full'])]
model_d2v = Doc2Vec(vector_size=100,alpha=0.025, min_count=1)
  
model_d2v.build_vocab(tagged_data)

for epoch in range(100):
    model_d2v.train(tagged_data,
                total_examples=model_d2v.corpus_count,
                epochs=model_d2v.epochs)
    
document_embeddings=np.zeros((df_mn3.shape[0],100))

for i in range(len(document_embeddings)):
    document_embeddings[i]=model_d2v.docvecs[i]
    
    
pairwise_similarities=cosine_similarity(document_embeddings)
pairwise_differences=euclidean_distances(document_embeddings)

most_similar(1,pairwise_similarities,'Cosine Similarity')
most_similar(1,pairwise_differences,'Euclidean Distance')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


TypeError: ignored

### BERT Model

In [None]:
!pip install sentence_transformers



In [None]:
df_mn3.head(1)

Unnamed: 0,title,ref,text_full
0,MN 1 Mūlapariyāya Sutta | The Root Sequence,MN 1,i have heard that on one occasion the blessed one was staying near ukkaṭṭhā in the shade of a royal sal tree in the very blessed forest there he addressed the monks monks yes lord the monks responded to him the blessed one said monks i will teach you the sequence of the root of all phenomena or the root sequence of all phenomena listen pay close attention i will speak as you say lord they responded to him the blessed one said there is the case monks where an uninstructed run of the mill personwho has no regard for noble ones is not well versed or disciplined in their dhamma who has no regard for people of integrity is not well versed or disciplined in their dhammaperceives earth as earth perceiving earth as earth he supposes things about earth he supposes things in earth he supposes things coming out of earth he supposes earth as mine he delights in earth why is that because he has not comprehended it i tell you he perceives water as water fire as fire wind as wind1 beings as beings devas as devas pajāpati as pajāpati brahmā as brahmā the radiant devas as radiant devas the beautiful black devas as beautiful black devas the sky fruit devas as sky fruit devas the conqueror as the conqueror2 the dimension of the infinitude of space as the dimension of the infinitude of space the dimension of the infinitude of consciousness as the dimension of the infinitude of consciousness the dimension of nothingness as the dimension of nothingness the dimension of neither perception nor non perception as the dimension of neither perception nor non perception3 the seen as the seen the heard as the heard the sensed as the sensed the cognized as the cognized4 singleness as singleness multiplicity as multiplicity5 the all as the all6 he perceives unbinding as unbinding 7 perceiving unbinding as unbinding he supposes things about unbinding he supposes things in unbinding he supposes things coming out of unbinding he supposes unbinding as mine he delights in unbinding why is that because he has not comprehended it i tell you the trainee a monk who is a traineeyearning for the unexcelled relief from bondage his aspirations as yet unfulfilleddirectly knows earth as earth directly knowing earth as earth let him not suppose things about earth let him not suppose things in earth let him not suppose things coming out of earth let him not suppose earth as mine let him not delight in earth why is that so that he may comprehend it i tell you he directly knows water as water fire as fire wind as wind beings as beings devas as devas pajāpati as pajāpati brahmā as brahmā the radiant devas as radiant devas the beautiful black devas as beautiful black devas the sky fruit devas as sky fruit devas the conqueror as the conqueror the dimension of the infinitude of space as the dimension of the infinitude of space the dimension of the infinitude of consciousness as the dimension of the infinitude of consciousness the dimension of nothingness as the dimension of nothingness the dimension of neither perception nor non perception as the dimension of neither perception nor non perception the seen as the seen the heard as the heard the sensed as the sensed the cognized as the cognized singleness as singleness multiplicity as multiplicity the all as the all he directly knows unbinding as unbinding directly knowing unbinding as unbinding let him not suppose things about unbinding let him not suppose things in unbinding let him not suppose things coming out of unbinding let him not suppose unbinding as mine let him not delight in unbinding why is that so that he may comprehend it i tell you the arahant a monk who is a worthy one devoid of effluentswho has attained completion finished the task laid down the burden attained the true goal destroyed the fetters of becoming and is released through right knowledgedirectly knows earth as earth directly knowing earth as earth he doesnt suppose things about earth doesnt suppose things in earth doesnt suppose things coming out of earth doesnt suppose earth as mine doesnt delight in earth why is that because he has comprehended it i tell you he directly knows water as water fire as fire wind as wind beings as beings devas as devas pajāpati as pajāpati brahmā as brahmā the radiant devas as radiant devas the beautiful black devas as beautiful black devas the sky fruit devas as sky fruit devas the conqueror as the conqueror the dimension of the infinitude of space as the dimension of the infinitude of space the dimension of the infinitude of consciousness as the dimension of the infinitude of consciousness the dimension of nothingness as the dimension of nothingness the dimension of neither perception nor non perception as the dimension of neither perception nor non perception the seen as the seen the heard as the heard the sensed as the sensed the cognized as the cognized singleness as singleness multiplicity as multiplicity the all as the all he directly knows unbinding as unbinding directly knowing unbinding as unbinding he doesnt suppose things about unbinding doesnt suppose things in unbinding doesnt suppose things coming out of unbinding doesnt suppose unbinding as mine doesnt delight in unbinding why is that because he has comprehended it i tell you a monk who is a worthy one devoid of effluents directly knows earth as earth directly knowing earth as earth he doesnt suppose things about earth doesnt suppose things in earth doesnt suppose things coming out of earth doesnt suppose earth as mine doesnt delight in earth why is that because with the ending of passion he is devoid of passion i tell you he directly knows water as water the all as the all he directly knows unbinding as unbinding directly knowing unbinding as unbinding he doesnt suppose things about unbinding doesnt suppose things in unbinding doesnt suppose things coming out of unbinding doesnt suppose unbinding as mine doesnt delight in unbinding why is that because with the ending of passion he is devoid of passion i tell you a monk who is a worthy one devoid of effluents directly knows earth as earth directly knowing earth as earth he doesnt suppose things about earth doesnt suppose things in earth doesnt suppose things coming out of earth doesnt suppose earth as mine doesnt delight in earth why is that because with the ending of aversion he is devoid of aversion i tell you he directly knows water as water the all as the all he directly knows unbinding as unbinding directly knowing unbinding as unbinding he doesnt suppose things about unbinding doesnt suppose things in unbinding doesnt suppose things coming out of unbinding doesnt suppose unbinding as mine doesnt delight in unbinding why is that because with the ending of aversion he is devoid of aversion i tell you a monk who is a worthy one devoid of effluents directly knows earth as earth directly knowing earth as earth he doesnt suppose things about earth doesnt suppose things in earth doesnt suppose things coming out of earth doesnt suppose earth as mine doesnt delight in earth why is that because with the ending of delusion he is devoid of delusion i tell you he directly knows water as water the all as the all he directly knows unbinding as unbinding directly knowing unbinding as unbinding he doesnt suppose things about unbinding doesnt suppose things in unbinding doesnt suppose things coming out of unbinding doesnt suppose unbinding as mine doesnt delight in unbinding why is that because with the ending of delusion he is devoid of delusion i tell you the tathāgata the tathāgataa worthy one rightly self awakeneddirectly knows earth as earth directly knowing earth as earth he doesnt suppose things about earth doesnt suppose things in earth doesnt suppose things coming out of earth doesnt suppose earth as mine doesnt delight in earth why is that because the tathāgata has comprehended it to the end i tell you he directly knows water as water fire as fire wind as wind beings as beings devas as devas pajāpati as pajāpati brahmā as brahmā the radiant devas as radiant devas the beautiful black devas as beautiful black devas the sky fruit devas as sky fruit devas the conqueror as the conqueror the dimension of the infinitude of space as the dimension of the infinitude of space the dimension of the infinitude of consciousness as the dimension of the infinitude of consciousness the dimension of nothingness as the dimension of nothingness the dimension of neither perception nor non perception as the dimension of neither perception nor non perception the seen as the seen the heard as the heard the sensed as the sensed the cognized as the cognized singleness as singleness multiplicity as multiplicity the all as the all he directly knows unbinding as unbinding directly knowing unbinding as unbinding he doesnt suppose things about unbinding doesnt suppose things in unbinding doesnt suppose things coming out of unbinding doesnt suppose unbinding as mine doesnt delight in unbinding why is that because the tathāgata has comprehended it to the end i tell you the tathāgataa worthy one rightly self awakeneddirectly knows earth as earth directly knowing earth as earth he doesnt suppose things about earth doesnt suppose things in earth doesnt suppose things coming out of earth doesnt suppose earth as mine doesnt delight in earth why is that because he has known that delight is the root of suffering stress that from coming into being there is birth and that for what has come into being there is aging death therefore with the total ending fading away cessation letting go relinquishment of craving the tathāgata has totally awakened to the unexcelled right self awakening i tell you he directly knows water as water the all as the all he directly knows unbinding as unbinding directly knowing unbinding as unbinding he doesnt suppose things about unbinding doesnt suppose things in unbinding doesnt suppose things coming out of unbinding doesnt suppose unbinding as mine doesnt delight in unbinding why is that because he has known that delight is the root of suffering stress that from coming into being there is birth and that for what has come into being there is aging death therefore with the total ending fading away cessation letting go relinquishment of craving the tathāgata has totally awakened to the unexcelled right self awakening i tell you that is what the blessed one said displeased the monks did not delight in the blessed ones words


In [None]:
df_mn3['ref'][12]

'MN\u200918'

In [None]:
sutta_ref = 'MN 18'
ref = sutta_ref.replace(' ', '\u2009')
sutta_index = df_mn3['ref'].index[(df_mn3['ref'] == ref)].tolist()
sutta_index[0]

12

In [None]:
from sentence_transformers import SentenceTransformer
def bert_mod(sutta_ref, number):
  sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

  #adding \u2009 in between the sutta abbreviation and number

  ref = sutta_ref.replace(' ', '\u2009')
  sutta_index = df_mn3['ref'].index[(df_mn3['ref'] == ref)].tolist() ##retrieving index of reference
  document_embeddings = sbert_model.encode(df_mn3['text_full'])

  pairwise_similarities=cosine_similarity(document_embeddings)
  pairwise_differences=euclidean_distances(document_embeddings)

  most_similar(sutta_index[0], pairwise_similarities,'Cosine Similarity', number = number)
  most_similar(sutta_index[0], pairwise_differences,'Euclidean Distance', number = number)

In [None]:
bert_mod('MN 1', 5)

Document: MN 1  Mūlapariyāya Sutta | The Root Sequence


Similar Documents:


Title: MN 49  Brahma-nimantanika Sutta | The Brahmā Invitation
Ref: MN 49
Cosine Similarity : 0.8729467391967773


Title: MN 149  Mahā Saḷāyatanika Sutta | The Great Six Sense-Media Discourse
Ref: MN 149
Cosine Similarity : 0.8703799247741699


Title: MN 138  Uddesa-vibhaṅga Sutta | An Analysis of the Statement
Ref: MN 138
Cosine Similarity : 0.8618467450141907


Title: MN 113  Sappurisa Sutta | A Person of Integrity
Ref: MN 113
Cosine Similarity : 0.8530935645103455
Document: MN 1  Mūlapariyāya Sutta | The Root Sequence


Similar Documents:


Title: MN 49  Brahma-nimantanika Sutta | The Brahmā Invitation
Ref: MN 49
Euclidean Distance : 7.23231315612793


Title: MN 149  Mahā Saḷāyatanika Sutta | The Great Six Sense-Media Discourse
Ref: MN 149
Euclidean Distance : 7.375919818878174


Title: MN 138  Uddesa-vibhaṅga Sutta | An Analysis of the Statement
Ref: MN 138
Euclidean Distance : 7.6350507736206055


Title:

In [None]:
document_embeddings

# Recommendation Work

In [None]:
## tfidf on the text
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_suttas = tfidf_vectorizer.fit_transform((df_mn2['text_full'])) #fitting and transforming the vector
tfidf_suttas

In [None]:
# https://medium.com/@armandj.olivares/building-nlp-content-based-recommender-systems-b104a709c042

In [None]:
# Computing cosine similarity using tfidf¶
from sklearn.metrics.pairwise import cosine_similarity
sutta_tfidf2 = tfidf_vectorizer.transform(df_mn2['text_full'])
cos_similarity_tfidf = map(lambda x: cosine_similarity(sutta_tfidf2, x), tfidf_suttas)
output2 = list(cos_similarity_tfidf)

#### Function for top-N recs

In [None]:
index = np.where(df_mn2['ref'] == 'MN 2')
user_q = df_mn2['ref'].loc[df_mn2['ref'] == 'MN 2']
user_q

In [None]:
df_all = df_mn2[['title', 'ref']]

In [None]:
def get_recommendation(top, df_all, scores):
  recommendation = pd.DataFrame(columns = ['ApplicantID', 'title', 'ref', 'score'])
  count = 0
  for i in top:
      recommendation.at[count, 'ApplicantID'] = user_q
      recommendation.at[count, 'ref'] = df_all['ref'][i]
      recommendation.at[count, 'title'] = df_all['title'][i]
      recommendation.at[count, 'score'] =  scores[count]
      count += 1
  return recommendation

In [None]:
##The top recommendations using TF-IDF
top = sorted(range(len(output2)), key=lambda i: output2[i], reverse=True)[:10]
list_scores = [output2[i][0][0] for i in top]
get_recommendation(top, df_all, list_scores)