In [25]:
import json
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import gzip
import os
from os import linesep
from PyPDF2 import PdfReader

In [2]:
query = "What is Atma?"

In [3]:
#  To encode all passages
bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
top_k=32 # no of passages to be retrieved from bi-encoder

In [4]:
# To re-rank the results list to improve the quality
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')


In [5]:
FILE_PATH = 'C:/Users/v-ankbhagat/Learnings/SemanticRanker/DOCS/Gita.pdf'

In [6]:
def get_pdf_data(file_path):
    reader = PdfReader(file_path)
    full_doc_text = ""
    pages = reader.pages
    num_pages = len(pages)

    try:
        for page in range(num_pages):
            current_page = reader.pages[page]
            text = current_page.extract_text()
            full_doc_text += text
    except:
        print("Error reading the file")
    finally:
        return full_doc_text

In [7]:
def get_chunks(fulltext:str, chunk_length=500) -> list:
    text = fulltext

    chunks = []

    while len(text) > chunk_length:
        last_period_index = text[:chunk_length].rfind('.')
        if last_period_index == -1:
            last_period_index = chunk_length
        chunks.append(text[:last_period_index])
        text = text[last_period_index+1:]
    chunks.append(text)

    return chunks

In [8]:
full_doc_text = get_pdf_data(FILE_PATH)
print(f'Full doc text length: {len(full_doc_text)}')

chunks = get_chunks(full_doc_text)
print(f"# of chunks: {chunks}")

Full doc text length: 104274
# of chunks: ['ॐ\n\ue397ी\ue009 गु॒\ue450॒\ue1f4ो\ue009 न॒म॒ः हिरः ओम ्\n\ue397ीमद्-भगव\ue138ीता-ता\ue114य \ue303म्\nŚrīmad-Bhagavadgītā-Tātparyam\nA Chapterwise Summary of the Divine\nSong\n\ue280ामी परमाथा \ue303न\ue185 सर\ue280ती\nSvāmī Paramārthānanda SarasvatīKey\nto Transliteration\nअ आ इ ई उ ऊ ऋ\na ā i ī u ū ṛ\nॠ ऌ ए ऐ ओ औ\nṝ ḷ e ai o au\nक ख ग घ ङ\nka kha ga gha ṅa\nच छ ज झ ञ\nca cha ja jha ña\nट ठ ड ढ ण\nṭa ṭha ḍa ḍha ṇa\nत थ द ध न\nta tha da dha na\nप फ ब भ म\npa pha ba bha ma\nय र ल व अ ं\nya ra la va aṃ\nश ष स ह अः\nśa ṣa sa ha aḥContents\nListof Tables . . . . . . . . . . . . . . . . . ', ' . . . . . . . . . i\nChapter 1 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 1\nChapter 2 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 3\nChapter 3 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 6\nChapter 4 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 9\nChapter 5 . . . . . . . . . . . . . . . . . . . . . . .

In [9]:
# generate embeddings for chunks and question

chunk_embeddings = bi_encoder.encode(chunks, show_progress_bar=True)
question_embedding = bi_encoder.encode(query, convert_to_tensor=True)

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

# Semantic Search

In [10]:
search_results = util.semantic_search(question_embedding, chunk_embeddings, top_k=top_k)
search_results = search_results[0] # get the search score for the first query
search_results

[{'corpus_id': 78, 'score': 0.43814271688461304},
 {'corpus_id': 80, 'score': 0.4013247787952423},
 {'corpus_id': 17, 'score': 0.33926135301589966},
 {'corpus_id': 26, 'score': 0.3355519771575928},
 {'corpus_id': 44, 'score': 0.3328952193260193},
 {'corpus_id': 77, 'score': 0.32449427247047424},
 {'corpus_id': 79, 'score': 0.31207209825515747},
 {'corpus_id': 20, 'score': 0.3084617257118225},
 {'corpus_id': 41, 'score': 0.30409711599349976},
 {'corpus_id': 81, 'score': 0.3027154803276062},
 {'corpus_id': 209, 'score': 0.2986343502998352},
 {'corpus_id': 24, 'score': 0.2921697497367859},
 {'corpus_id': 76, 'score': 0.2882997691631317},
 {'corpus_id': 167, 'score': 0.2743922472000122},
 {'corpus_id': 103, 'score': 0.2579214572906494},
 {'corpus_id': 176, 'score': 0.2569130063056946},
 {'corpus_id': 66, 'score': 0.25000083446502686},
 {'corpus_id': 180, 'score': 0.2442205250263214},
 {'corpus_id': 137, 'score': 0.24386337399482727},
 {'corpus_id': 181, 'score': 0.236028790473938},
 {'corp

In [12]:
len(search_results)

32

# Semantic Reranking

In [13]:
cross_input = [[query, chunks[search_result['corpus_id']]] for search_result in search_results]
cross_scores = cross_encoder.predict(cross_input)

In [18]:
cross_input

[['What is Atma?',
  ' Every-\nthing, including thoughts, is an object of the Ātmā, the Awareness. The\nĀtmā is not an object for me to be thought of, or to be experienced. It\nis the very “I”. Thus, to knock off all expectations and struggles and\nto abide as the Ātmā with the knowledge that “I am the Ātmā” is true\nmeditation.)\nThus, constantly abiding in the Ātmā , the meditator comes to en-\njoy permanent peace which culminates in the attainment of liberation\n(videhamukti) (15)'],
 ['What is Atma?',
  ' Once this is removed, the inner ānanda becomes evident.\nThus, this is more a dissociation from sorrow rather than association\n(23). (This alone is known as jīvanmukti.) Seeing the Ātmā in all be-\nings, and all beings in the Ātmā, he gets estabilshed in the vision of\nequality (29). This vision of the Ātmā alone is the vision of the Lord,\nbecause the Ātmā is not different from the Lord. Thus the meditator\nis ever established in the Lord inspite of his activities (30, 31)'],
 [

In [14]:
cross_scores

array([  5.6001816 ,   1.7976379 ,   2.6420355 ,   0.29547724,
        -9.827906  ,   0.09490482,  -0.41204083,  -2.3754735 ,
         0.5349555 ,  -0.72674286, -10.571349  ,  -0.68445265,
        -1.7891197 ,  -9.955763  ,  -9.999147  , -10.877742  ,
       -10.745488  , -10.515318  , -10.831608  , -10.729138  ,
       -10.622442  , -10.730914  , -10.1301    , -10.791334  ,
       -10.705481  , -10.609955  , -10.993363  , -10.775614  ,
        -9.522174  , -10.695619  , -10.939821  , -11.169359  ],
      dtype=float32)

In [19]:
for idx in range(len(cross_scores)):
    search_results[idx]['cross-score'] = cross_scores[idx]

In [20]:
search_results

[{'corpus_id': 78, 'score': 0.43814271688461304, 'cross-score': 5.6001816},
 {'corpus_id': 80, 'score': 0.4013247787952423, 'cross-score': 1.7976379},
 {'corpus_id': 17, 'score': 0.33926135301589966, 'cross-score': 2.6420355},
 {'corpus_id': 26, 'score': 0.3355519771575928, 'cross-score': 0.29547724},
 {'corpus_id': 44, 'score': 0.3328952193260193, 'cross-score': -9.827906},
 {'corpus_id': 77, 'score': 0.32449427247047424, 'cross-score': 0.09490482},
 {'corpus_id': 79, 'score': 0.31207209825515747, 'cross-score': -0.41204083},
 {'corpus_id': 20, 'score': 0.3084617257118225, 'cross-score': -2.3754735},
 {'corpus_id': 41, 'score': 0.30409711599349976, 'cross-score': 0.5349555},
 {'corpus_id': 81, 'score': 0.3027154803276062, 'cross-score': -0.72674286},
 {'corpus_id': 209, 'score': 0.2986343502998352, 'cross-score': -10.571349},
 {'corpus_id': 24, 'score': 0.2921697497367859, 'cross-score': -0.68445265},
 {'corpus_id': 76, 'score': 0.2882997691631317, 'cross-score': -1.7891197},
 {'corpu

In [27]:
print("Top-3 Bi-Encoder Retrieval hits")
search_results = sorted(search_results, key=lambda x: x['score'], reverse=True)
newline = '\n'
for search_result in search_results[:3]:
    print(f"{search_result['score']} - {chunks[search_result['corpus_id']].replace( newline,' ')}") #.replace(linesep, ' ')}")

Top-3 Bi-Encoder Retrieval hits
0.43814271688461304 -  Every- thing, including thoughts, is an object of the Ātmā, the Awareness. The Ātmā is not an object for me to be thought of, or to be experienced. It is the very “I”. Thus, to knock off all expectations and struggles and to abide as the Ātmā with the knowledge that “I am the Ātmā” is true meditation.) Thus, constantly abiding in the Ātmā , the meditator comes to en- joy permanent peace which culminates in the attainment of liberation (videhamukti) (15)
0.4013247787952423 -  Once this is removed, the inner ānanda becomes evident. Thus, this is more a dissociation from sorrow rather than association (23). (This alone is known as jīvanmukti.) Seeing the Ātmā in all be- ings, and all beings in the Ātmā, he gets estabilshed in the vision of equality (29). This vision of the Ātmā alone is the vision of the Lord, because the Ātmā is not different from the Lord. Thus the meditator is ever established in the Lord inspite of his activities 

In [28]:
print("Top-3 Cross-Encoder Re-ranking hits")
search_results = sorted(search_results, key=lambda x: x['cross-score'], reverse=True)
newline = '\n'
for search_result in search_results[:3]:
    print(f"{search_result['cross-score']} - {chunks[search_result['corpus_id']].replace( newline,' ')}")

Top-3 Cross-Encoder Re-ranking hits
5.600181579589844 -  Every- thing, including thoughts, is an object of the Ātmā, the Awareness. The Ātmā is not an object for me to be thought of, or to be experienced. It is the very “I”. Thus, to knock off all expectations and struggles and to abide as the Ātmā with the knowledge that “I am the Ātmā” is true meditation.) Thus, constantly abiding in the Ātmā , the meditator comes to en- joy permanent peace which culminates in the attainment of liberation (videhamukti) (15)
2.642035484313965 -  From the stand point of true nature of Ātmā (ādhyātmika-dṛśṭi), Bhīṣma and others are immortal. Ātmā is never subject to changes 3in spite of the changes of the body. It is neither a doer nor an en- joyer. Hence, neither is Arjuna a slayer nor is Bhīṣma slain. So, why should he resist to fight? (12 to 25). Even if the Ātmā is impermanent, Arjuna should not lament. Whatever appears will have to disappear and whatever disappears will appear. Hence, one should le