In [None]:
!pip install pandas sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Import Dependencies

In [None]:
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import pandas as pd
import pickle
import bz2

Load Sentence Transformers Encoders

In [None]:
# Number of candidates requested from the bi-encoder retrieval stage; the
# search function passes this to util.semantic_search.
top_k = 32

# Bi-encoder: produces the embeddings used for the retrieval stage.
bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
bi_encoder.max_seq_length = 256

# Cross-encoder: scores (query, sentence) pairs to re-rank the retrieved hits.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

Load Dataframe

In [16]:
# Read the sentence corpus from CSV and preview its first rows.
df = pd.read_csv(filepath_or_buffer='/philosophy_data.csv')
df.head(n=5)

Unnamed: 0,title,author,school,sentence_spacy,sentence_str,original_publication_date,corpus_edition_date,sentence_length,sentence_lowered,tokenized_txt,lemmatized_str
0,Plato - Complete Works,Plato,plato,"What's new, Socrates, to make you leave your ...","What's new, Socrates, to make you leave your ...",-350,1997,125,"what's new, socrates, to make you leave your ...","['what', 'new', 'socrates', 'to', 'make', 'you...","what be new , Socrates , to make -PRON- lea..."
1,Plato - Complete Works,Plato,plato,Surely you are not prosecuting anyone before t...,Surely you are not prosecuting anyone before t...,-350,1997,69,surely you are not prosecuting anyone before t...,"['surely', 'you', 'are', 'not', 'prosecuting',...",surely -PRON- be not prosecute anyone before ...
2,Plato - Complete Works,Plato,plato,The Athenians do not call this a prosecution b...,The Athenians do not call this a prosecution b...,-350,1997,74,the athenians do not call this a prosecution b...,"['the', 'athenians', 'do', 'not', 'call', 'thi...",the Athenians do not call this a prosecution ...
3,Plato - Complete Works,Plato,plato,What is this you say?,What is this you say?,-350,1997,21,what is this you say?,"['what', 'is', 'this', 'you', 'say']",what be this -PRON- say ?
4,Plato - Complete Works,Plato,plato,"Someone must have indicted you, for you are no...","Someone must have indicted you, for you are no...",-350,1997,101,"someone must have indicted you, for you are no...","['someone', 'must', 'have', 'indicted', 'you',...","someone must have indict -PRON- , for -PRON- ..."


Creating Corpus Embeddings with Sentence Transformers Encoder

In [None]:
# Embed every corpus sentence once. encode() is given a plain list of strings
# rather than the pandas Series itself, so the call does not depend on the
# frame's index labels happening to coincide with row positions.
corpus_embeddings = bi_encoder.encode(
    df['sentence_str'].tolist(),
    convert_to_tensor=True,
    show_progress_bar=True,
)

Batches:   0%|          | 0/11276 [00:00<?, ?it/s]

Storing Sentence Embeddings

In [None]:
import gzip

# Drop the columns this notebook no longer uses before persisting the frame.
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop() would raise KeyError.
df = df.drop(
    columns=[
        'sentence_spacy',
        'original_publication_date',
        'corpus_edition_date',
        'sentence_length',
        'sentence_lowered',
        'tokenized_txt',
        'lemmatized_str',
    ],
    errors='ignore',
)

# Write through a context manager so the gzip stream is flushed and closed as
# soon as the dump finishes, instead of whenever the handle is garbage-collected.
with gzip.open('embeddings.gz', 'wb') as archive:
    pickle.dump({'df': df, 'embeddings': corpus_embeddings}, archive)

Search Function

In [None]:
def search(query, num_results=5):
    """Print the best-matching corpus sentences for ``query``.

    The lookup runs in two stages: the bi-encoder embedding of the query is
    compared against ``corpus_embeddings`` to fetch ``top_k`` candidates, then
    the cross-encoder scores every (query, sentence) pair and the candidates
    are re-ordered by that score, highest first.

    Args:
        query: Free-text question to search for.
        num_results: How many re-ranked sentences to print. Defaults to 5.

    Returns:
        None. Each result is printed as a tab-indented line holding the
        cross-encoder score (three decimals), a tab, and the sentence.
    """
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    # Follow the corpus tensor's device instead of forcing CUDA, so the search
    # also runs on machines without a GPU.
    question_embedding = question_embedding.to(corpus_embeddings.device)

    # semantic_search returns one hit list per query; there is a single query.
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)[0]
    if not hits:
        # Nothing retrieved: skip the cross-encoder rather than score an empty batch.
        return

    # corpus_id is a row position in corpus_embeddings, so resolve sentences by
    # position (.iloc) rather than by index label.
    sentences = df['sentence_str']
    candidates = [sentences.iloc[hit['corpus_id']] for hit in hits]
    cross_scores = cross_encoder.predict([[query, text] for text in candidates])

    # sorted() is stable, so candidates with equal scores keep retrieval order.
    ranked = sorted(zip(cross_scores, candidates), key=lambda pair: pair[0], reverse=True)

    for score, sentence in ranked[:num_results]:
        print("\t{:.3f}\t{}".format(score, sentence))

Demo

In [None]:
# Example query against the indexed corpus; prints the top re-ranked sentences.
search(query="what is the meaning of life")

	9.017	The very meaning of life is now construed as the effort to live in such a way that life no longer has any point.
	8.502	Life means here: beings in their being: Nature.
	7.984	Life means here the will to will. '
	7.010	The concept of life or universal life is the immediate idea, the concept that has an objectivity corresponding to it; but the objectivity corresponds to it only to the extent that the concept is the negative unity of this externality, that is to say, posits it as corresponding to it.
	6.406	But a life is a relation with the world; the individual defines himself by choosing himself through the world; we must turn to the world to answer the questions that preoccupy us.
