In [None]:
import pandas as pd
import numpy as np

import gensim
import pickle
import os

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

from google.colab import drive

# Folder that holds the input CSV and receives the saved .npy matrices.
# Defaults to the original Google Drive location; set the CAPSTONE_DATA_DIR
# environment variable to run the notebook against a different folder.
# Later cells build paths as `filepath + name`, so joining with '' guarantees
# the value always ends with a path separator.
filepath = os.path.join(
    os.environ.get(
        'CAPSTONE_DATA_DIR',
        '/content/drive/MyDrive/Data Science/GA Course/Capstone/',
    ),
    '',
)

In [None]:
!pip install -U sentence-transformers

Requirement already up-to-date: sentence-transformers in /usr/local/lib/python3.7/dist-packages (1.0.4)


In [None]:
# Mount Google Drive at /content/drive; `filepath` points inside this mount,
# so this has to run before any cell that reads or writes data files.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the cleaned corpus and normalise the document identifiers in 'ref':
# U+2009 (THIN SPACE) is replaced with an ordinary space so that identifiers
# typed with a normal space compare equal to the stored ones.
# regex=False makes the replacement a literal one on every pandas version
# (the default of `regex` differs between pandas releases).
df = (
    pd.read_csv(filepath + 'df_all_clean.csv')
    .assign(ref=lambda frame: frame['ref'].str.replace('\u2009', ' ', regex=False))
)

In [None]:
# Imported after the pip-install cell (presumably so the package is installed
# before this import runs — keep this cell below the install cell).
from sentence_transformers import SentenceTransformer
# Load the sentence-embedding model identified by 'bert-base-nli-mean-tokens'.
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

In [None]:
def get_ix(ref):
  '''
  Returns the row number in df of the first row whose 'ref' equals ref.

  ref: (str) document identifier, compared for exact equality with df['ref']

  The result is a positional row number (0-based), which is what callers need
  when they index a NumPy matrix row or use df.iloc. With the default integer
  index produced by pd.read_csv this is the same value as the index label.

  Raises IndexError if no row of df has that ref.
  '''
  match_positions = np.flatnonzero((df['ref'] == ref).to_numpy())
  if match_positions.size == 0:
    # Fail with the offending identifier instead of an opaque
    # "index 0 is out of bounds" message.
    raise IndexError(f'No row in df has ref == {ref!r}')
  return match_positions[0]

In [None]:
# Encode the full text of every row of df into one embedding vector per row.
# encode() is documented to take a str or a list of str, so hand it a plain
# list instead of a pandas Series (whose [] lookup is label-based, not positional).
document_embeddings = sbert_model.encode(df['text_full'].tolist())

# The same vectors as a DataFrame labelled by document ref, for inspection.
embeddings_df = pd.DataFrame(document_embeddings, index=df['ref'].values)

embeddings_df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,728,729,730,731,732,733,734,735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,764,765,766,767
MN 1,-0.114729,1.042907,0.857408,-0.085847,0.277071,-0.662585,1.44162,0.079466,-0.182428,-0.143656,-0.220849,0.712187,0.097093,0.046914,-0.709942,0.009015,0.370007,0.051946,0.407468,-0.504343,-0.477806,-0.306521,0.155221,0.231906,0.944239,0.692981,0.138007,0.454035,-0.515853,0.073684,0.304935,0.323918,0.036331,-0.643783,-0.580048,0.666422,0.784505,0.128521,-0.101357,-0.264002,...,-0.974057,-0.414906,-0.285009,0.366977,0.603069,0.252468,0.282127,0.546585,-1.132445,-0.29306,-0.389522,-1.251504,-0.005539,0.455459,-0.035164,-0.297324,0.019969,0.629699,-0.4657,-0.478378,-0.139754,0.018482,0.126853,-0.451839,0.242925,0.458839,-0.274801,-0.411734,-0.124682,-0.376968,-0.372177,-1.145525,-0.643652,-1.397123,0.186586,-0.137286,-0.080271,1.342329,-0.083997,-0.203061
MN 2,0.13368,0.982654,1.230356,-0.521233,0.310146,-0.669474,1.61452,-0.246975,-0.28177,-0.062034,-0.576925,0.698612,0.324024,0.293063,-0.495574,0.084072,0.674348,-0.274633,0.350879,-0.643345,-0.279122,0.18526,0.151121,-0.050149,0.229678,0.37486,-0.085525,-0.304801,-0.949858,-0.328682,0.339682,0.042061,0.055007,-0.540388,-0.25792,0.623957,0.78007,-0.23661,-0.030747,0.095679,...,-1.033723,-0.46783,-0.138258,0.275284,0.651161,0.246001,0.866489,0.240574,-0.506018,-0.302702,-0.253426,-1.266995,0.394874,0.686166,-0.205058,-0.532591,-0.204359,0.27209,-0.296163,-0.213768,0.542671,0.200347,0.064152,-0.066079,0.236946,0.307851,-0.18643,-0.691142,-0.178745,-0.227154,-0.028874,-0.890543,-0.647262,-1.356551,0.161522,0.047339,-0.155839,1.206385,-0.080179,-0.11935
MN 4,-0.265532,1.118359,0.942307,-0.357291,0.272429,-0.617968,0.582814,-0.591037,-0.268944,-0.234338,-0.263535,0.416193,0.67539,0.482913,-1.169223,-0.328796,0.545885,-0.200964,0.428041,-0.723204,-0.5322,0.053546,-0.549895,0.317295,0.218868,0.915141,-0.146566,-0.337891,-0.804982,-0.049771,0.06803,0.696924,0.216318,-0.883675,-0.138867,0.144566,0.938102,-0.131801,0.001726,0.405264,...,-0.949594,-0.151794,0.829908,-0.173631,0.931392,0.128928,0.59305,0.236782,-0.657086,-0.321281,-0.316753,-1.500322,0.258156,0.323646,-0.182559,-0.516344,0.163168,0.068013,-0.044919,0.056253,0.132714,-0.236723,0.124685,-0.875176,0.12353,0.682279,-0.741622,-0.434348,-0.120524,-0.436988,-0.370613,-1.080281,-0.677167,-1.608784,0.46272,-0.176951,-0.259186,1.374535,0.052056,-0.253926


In [None]:
# All-pairs cosine similarity between the document embeddings
# (entry [i, j] compares row i with row j; larger means more alike).
pairwise_similarities = cosine_similarity(document_embeddings)
# All-pairs Euclidean distance between the same embeddings
# (smaller means more alike); called "differences" in this notebook.
pairwise_differences = euclidean_distances(document_embeddings)

In [None]:
# Persist both pairwise matrices as .npy files inside the data folder,
# similarities first, then differences.
for out_name, matrix in (
    ('pairwise_similarities.npy', pairwise_similarities),
    ('pairwise_differences.npy', pairwise_differences),
):
  with open(filepath + out_name, 'wb') as f:
    np.save(f, matrix)

In [None]:
def most_similar(doc_id, similarity_matrix, matrix_type, number=5):
  '''
  Prints the titles of the documents closest to the given document.

  doc_id: (str) unique document identifier (looked up with get_ix)
  similarity_matrix: (ndarray) cosine similarity or euclidean difference matrix;
    its row/column order is assumed to match the row order of df
  matrix_type: (str) 'cosine' (larger value = closer) or
    'euclidean' (smaller value = closer)
  number: (int) n of similar suttas to display

  Returns None; one 'Title: ...' line is printed per neighbour, closest first.
  The document itself is never listed.

  Raises ValueError if matrix_type is neither 'cosine' nor 'euclidean'.
  '''
  if matrix_type not in ('cosine', 'euclidean'):
    raise ValueError(
        f"matrix_type must be 'cosine' or 'euclidean', got {matrix_type!r}"
    )

  doc_ix = get_ix(doc_id)

  # argsort is ascending: already closest-first for distances,
  # so it only needs reversing for similarities.
  ranked_ix = np.argsort(similarity_matrix[doc_ix])
  if matrix_type == 'cosine':
    ranked_ix = ranked_ix[::-1]

  # Drop the query document wherever it ranks (ties or rounding can move it off
  # the first place), and only then cut to `number`, so both matrix types
  # display exactly `number` neighbours when that many exist.
  neighbour_ix = ranked_ix[ranked_ix != doc_ix][:number]

  for ix in neighbour_ix:
    print(f'Title: {df.iloc[ix]["title"]}')