In [None]:
import json
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import gzip
import os
from PyPDF2 import PdfReader

In [None]:
# Natural-language question to answer from the PDF; it is embedded for
# retrieval and paired with each candidate passage for re-ranking.
query = "What are Annelids?"

In [None]:
# Bi-encoder: embeds the query and every passage independently so that
# candidate passages can be retrieved with semantic search.
bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
top_k = 32  # Number of passages to retrieve with the bi-encoder before re-ranking

In [None]:
# Cross-encoder: scores each [query, passage] pair; it is used to re-rank
# the bi-encoder's result list to improve the quality.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [None]:
# Path of the PDF to index. Set the NCERT_PDF_PATH environment variable to
# point the notebook at another file without editing this cell; the original
# location is kept as the default so existing runs behave the same.
FILE_PATH = os.environ.get("NCERT_PDF_PATH", "C:/Ambarish/NCERT/CHAP04-BIOLOGY-CLASS11.pdf")

In [None]:
def get_pdf_data(file_path: str) -> str:
    """
    Extract the text of every page of a PDF file.

    Pages are read in order and their text is concatenated without any
    separator. Page extraction is best-effort: if extracting a page fails, a
    message is printed and the text collected from the preceding pages is
    returned.

    Parameters:
    - file_path (str): The path to the PDF file.

    Returns:
    - str: The text extracted from the PDF (possibly partial, see above).

    Raises:
    - Whatever ``PdfReader`` raises when the file cannot be opened. The reader
      is created outside the best-effort section on purpose, so a wrong path
      is not silently turned into an empty document.

    Example Usage:
    >>> get_pdf_data('path/to/file.pdf')
    "This is the extracted text from all pages of the PDF file."
    """
    reader = PdfReader(file_path)
    pages = reader.pages
    page_texts = []

    try:
        for page in pages:
            # A page with no extractable text must not abort the whole run.
            page_texts.append(page.extract_text() or "")
    except Exception as exc:
        # Catch `Exception` rather than using a bare except / return-in-finally,
        # so KeyboardInterrupt and SystemExit still propagate to the caller.
        print(f"Error reading file: {exc}")

    # Join once instead of growing a string page by page.
    return "".join(page_texts)

In [None]:
# Divide the text into chunks of at most chunk_length
# [ default is 500] characters

def get_chunks(fulltext: str, chunk_length=500) -> list:
    """
    Split ``fulltext`` into chunks of at most ``chunk_length`` characters.

    Each chunk ends at the last '.' found within the next ``chunk_length``
    characters; that period is used as the separator and is not included in
    either neighbouring chunk. When the window contains no period, the text is
    cut at exactly ``chunk_length`` characters and no character is dropped.

    Parameters:
    - fulltext (str): The text to split.
    - chunk_length (int): Maximum length of a chunk; must be at least 1.

    Returns:
    - list: The chunks in document order. The list always contains at least
      one element (the remainder, which may be an empty string).

    Raises:
    - ValueError: If ``chunk_length`` is smaller than 1 (a negative value
      would otherwise never terminate, and zero would discard the text).
    """
    if chunk_length < 1:
        raise ValueError("chunk_length must be a positive integer")

    remaining = fulltext
    chunks = []
    while len(remaining) > chunk_length:
        split_at = remaining.rfind('.', 0, chunk_length)
        if split_at == -1:
            # No sentence boundary in this window: hard split, and resume at
            # the very next character so nothing is lost.
            chunks.append(remaining[:chunk_length])
            remaining = remaining[chunk_length:]
        else:
            # Split on the period and skip over it.
            chunks.append(remaining[:split_at])
            remaining = remaining[split_at + 1:]
    chunks.append(remaining)

    return chunks

In [8]:
# Read the PDF at FILE_PATH into a single string
full_doc_text = get_pdf_data(FILE_PATH)

In [9]:
# Sanity check: report how many characters were extracted
print(f'Full doc text length: {len(full_doc_text)}')

Full doc text length: 31802


In [10]:
# Split the document into chunks of at most 500 characters; these chunks
# are the passages that get embedded and searched.
Lines =get_chunks(full_doc_text,500)

In [11]:
# Number of chunks produced from the document
len(Lines)

73

In [12]:
# Embed every chunk with the bi-encoder (a progress bar is shown while encoding)
embeddings_all = bi_encoder.encode(Lines,show_progress_bar=True)

Batches: 100%|██████████| 3/3 [00:05<00:00,  1.73s/it]


In [13]:
# Embed the query with the same bi-encoder, requesting a tensor
question_embedding = bi_encoder.encode(query, convert_to_tensor=True)

Semantic Search

In [14]:
# Retrieve up to top_k chunks for the query embedding from the chunk embeddings
hits = util.semantic_search(question_embedding, embeddings_all, top_k=top_k)
hits = hits[0]  # Results come back per query; keep the hits for the first (only) query

In [15]:
# Inspect the retrieved hits: one dict per chunk with its corpus_id
# (index into Lines) and bi-encoder score
hits

[{'corpus_id': 25, 'score': 0.6500279307365417},
 {'corpus_id': 64, 'score': 0.5820745825767517},
 {'corpus_id': 27, 'score': 0.5067367553710938},
 {'corpus_id': 32, 'score': 0.43581706285476685},
 {'corpus_id': 26, 'score': 0.4296877384185791},
 {'corpus_id': 42, 'score': 0.40576884150505066},
 {'corpus_id': 20, 'score': 0.39878952503204346},
 {'corpus_id': 18, 'score': 0.386780709028244},
 {'corpus_id': 41, 'score': 0.37630388140678406},
 {'corpus_id': 17, 'score': 0.3688402473926544},
 {'corpus_id': 31, 'score': 0.36617720127105713},
 {'corpus_id': 71, 'score': 0.3617857098579407},
 {'corpus_id': 61, 'score': 0.3545941114425659},
 {'corpus_id': 44, 'score': 0.3508317768573761},
 {'corpus_id': 23, 'score': 0.3482189476490021},
 {'corpus_id': 52, 'score': 0.33426812291145325},
 {'corpus_id': 19, 'score': 0.32377803325653076},
 {'corpus_id': 66, 'score': 0.3220410645008087},
 {'corpus_id': 7, 'score': 0.3106873035430908},
 {'corpus_id': 65, 'score': 0.3043312728404999},
 {'corpus_id': 

In [16]:
# Number of hits returned by the semantic search
len(hits)

32

Semantic Reranking

In [17]:
##### Re-Ranking #####
# Now, score all retrieved passages with the cross_encoder.
# Each input is a [query, passage] pair built in the same order as `hits`,
# so cross_scores lines up index-for-index with hits.
cross_inp = [[query, Lines[hit['corpus_id']]] for hit in hits]
cross_scores = cross_encoder.predict(cross_inp)

In [18]:
# Attach each cross-encoder score to its hit under the 'cross-score' key
# (no sorting happens here; ordering is done when the results are printed)
for idx, cross_score in enumerate(cross_scores):
    hits[idx]['cross-score'] = cross_score

In [19]:
# Inspect the hits again: each dict now also carries its 'cross-score'
hits

[{'corpus_id': 25, 'score': 0.6500279307365417, 'cross-score': 7.34903},
 {'corpus_id': 64, 'score': 0.5820745825767517, 'cross-score': 6.0799055},
 {'corpus_id': 27, 'score': 0.5067367553710938, 'cross-score': 0.6422741},
 {'corpus_id': 32, 'score': 0.43581706285476685, 'cross-score': -8.577603},
 {'corpus_id': 26, 'score': 0.4296877384185791, 'cross-score': 1.1963358},
 {'corpus_id': 42, 'score': 0.40576884150505066, 'cross-score': -8.462291},
 {'corpus_id': 20, 'score': 0.39878952503204346, 'cross-score': -7.3358374},
 {'corpus_id': 18, 'score': 0.386780709028244, 'cross-score': -9.917613},
 {'corpus_id': 41, 'score': 0.37630388140678406, 'cross-score': -9.6821375},
 {'corpus_id': 17, 'score': 0.3688402473926544, 'cross-score': -8.8455305},
 {'corpus_id': 31, 'score': 0.36617720127105713, 'cross-score': -9.399993},
 {'corpus_id': 71, 'score': 0.3617857098579407, 'cross-score': -4.7287617},
 {'corpus_id': 61, 'score': 0.3545941114425659, 'cross-score': -3.5549932},
 {'corpus_id': 44,

In [20]:
# Output of the top-3 hits as ranked by the bi-encoder score
print("\n-------------------------\n")
print("Top-3 Bi-Encoder Retrieval hits")
hits = sorted(hits, key=lambda h: h['score'], reverse=True)
for hit in hits[:3]:
    # Flatten newlines so each passage prints on a single line
    passage = Lines[hit['corpus_id']].replace("\n", " ")
    print("\t{:.3f}\t{}".format(hit['score'], passage))



-------------------------

Top-3 Bi-Encoder Retrieval hits
	0.650	6 Phylum – Annelida They may be aquatic (marine and fresh water) or terrestrial; free-living, and sometimes parasitic. They exhibit organ-system level of body organisation and bilateral symmetry. They are triploblastic, metamerically segmented and coelomate animals. Their body surface is distinctly marked out into segments  or metameres  and, hence, the phylum name Annelida (Latin, annulus  : little ring) (Figure 4.11). They possess longitudinal and circular muscles which help in locomotion
	0.582	 Aschelminthes are pseudocoelomates and  include parasitic as well as non-parasitic roundworms. Annelids are metamerically segmented animals with a true coelom. The arthropods are the most abundant group of animals characterised by the presence of jointed appendages. The molluscs have a soft body surrounded by an external calcareous shell. The body is covered with external skeleton made of chitin. The echinoderms possess a spi

In [21]:
# Output of the top-3 hits as ranked by the cross-encoder re-ranker.
# `hits` is re-bound to the re-ranked order here.
print("\n-------------------------\n")
print("Top-3 Cross-Encoder Re-ranker hits")
hits = sorted(hits, key=lambda h: h['cross-score'], reverse=True)
for hit in hits[:3]:
    # Flatten newlines so each passage prints on a single line
    passage = Lines[hit['corpus_id']].replace("\n", " ")
    print("\t{:.3f}\t{}".format(hit['cross-score'], passage))


-------------------------

Top-3 Cross-Encoder Re-ranker hits
	7.349	6 Phylum – Annelida They may be aquatic (marine and fresh water) or terrestrial; free-living, and sometimes parasitic. They exhibit organ-system level of body organisation and bilateral symmetry. They are triploblastic, metamerically segmented and coelomate animals. Their body surface is distinctly marked out into segments  or metameres  and, hence, the phylum name Annelida (Latin, annulus  : little ring) (Figure 4.11). They possess longitudinal and circular muscles which help in locomotion
	6.080	 Aschelminthes are pseudocoelomates and  include parasitic as well as non-parasitic roundworms. Annelids are metamerically segmented animals with a true coelom. The arthropods are the most abundant group of animals characterised by the presence of jointed appendages. The molluscs have a soft body surrounded by an external calcareous shell. The body is covered with external skeleton made of chitin. The echinoderms possess a 

In [22]:
# `hits` was re-bound to the cross-score ordering in the previous cell,
# so this displays the final re-ranked list
hits

[{'corpus_id': 25, 'score': 0.6500279307365417, 'cross-score': 7.34903},
 {'corpus_id': 64, 'score': 0.5820745825767517, 'cross-score': 6.0799055},
 {'corpus_id': 26, 'score': 0.4296877384185791, 'cross-score': 1.1963358},
 {'corpus_id': 27, 'score': 0.5067367553710938, 'cross-score': 0.6422741},
 {'corpus_id': 7, 'score': 0.3106873035430908, 'cross-score': -2.2745957},
 {'corpus_id': 3, 'score': 0.28081774711608887, 'cross-score': -3.2081892},
 {'corpus_id': 61, 'score': 0.3545941114425659, 'cross-score': -3.5549932},
 {'corpus_id': 71, 'score': 0.3617857098579407, 'cross-score': -4.7287617},
 {'corpus_id': 20, 'score': 0.39878952503204346, 'cross-score': -7.3358374},
 {'corpus_id': 65, 'score': 0.3043312728404999, 'cross-score': -7.4494085},
 {'corpus_id': 35, 'score': 0.28132861852645874, 'cross-score': -7.703805},
 {'corpus_id': 28, 'score': 0.275229275226593, 'cross-score': -8.09626},
 {'corpus_id': 21, 'score': 0.29166650772094727, 'cross-score': -8.242816},
 {'corpus_id': 42, 's