In [1]:
import random
from pathlib import Path
from pprint import pprint

import fitz  # pymupdf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util
from spacy.lang.en import English

  from tqdm.autonotebook import tqdm, trange


In [2]:
def preprocess(file_path: Path) -> pd.DataFrame:
    """Extract the text of every page of a PDF into a DataFrame.

    Args:
        file_path: Location of the PDF; handed unchanged to ``fitz.open``.

    Returns:
        A DataFrame with one row per page and two columns:
        ``page_number`` (1-based) and ``text`` (the page text with every
        newline replaced by a single space).
    """
    with fitz.open(file_path) as doc:
        # Build plain records first and create the frame once, instead of
        # filling an empty DataFrame column by column.
        records = [
            {
                'page_number': page_number,
                'text': page.get_text().replace('\n', ' '),
            }
            for page_number, page in enumerate(doc, start=1)
        ]

    # `columns=` keeps the schema stable even for a document with no pages.
    return pd.DataFrame(records, columns=['page_number', 'text'])

In [3]:
class Preprocessing:
    """Turn a PDF into a table of fixed-size sentence chunks.

    Constructing an instance runs the whole pipeline:

    1. ``prepare_data``   - read the text of every page,
    2. ``split_to_pages`` - split each page's text into sentences,
    3. ``chunk_data``     - group the sentences of a page into chunks of at
       most ``chunk_size`` sentences,
    4. ``split_chunks``   - explode to one row per chunk and join each chunk
       into a single string.

    Afterwards ``self.data`` (also returned by calling the instance) has the
    columns ``page_number`` and ``chunked``.
    """

    def __init__(self, file_path: Path, chunk_size: int) -> None:
        """Run the full pipeline on ``file_path``.

        Args:
            file_path: Location of the PDF to read.
            chunk_size: Maximum number of sentences per chunk; must be >= 1.

        Raises:
            ValueError: If ``chunk_size`` is smaller than 1.
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be a positive integer")
        self.chunk_size = chunk_size
        self.data = self.prepare_data(file_path)
        self.split_to_pages()
        self.chunk_data()
        self.split_chunks()

    def __call__(self):
        """Return the processed DataFrame (one row per chunk)."""
        return self.data

    def prepare_data(self, file_path):  # TODO this procedure can be optimized later
        """Read the PDF and return a frame with ``page_number`` and ``text``.

        Page numbers are 1-based; newlines in the page text are replaced by
        single spaces.
        """
        with fitz.open(file_path) as doc:
            records = [
                {
                    'page_number': page_number,
                    'text': page.get_text().replace('\n', ' '),
                }
                for page_number, page in enumerate(doc, start=1)
            ]

        # `columns=` keeps the schema stable even for a document with no pages.
        return pd.DataFrame(records, columns=['page_number', 'text'])

    def split_to_pages(self):
        """Add a ``sentences`` column: the list of sentences of each page.

        Despite its name, this method splits page text into sentences, using
        spaCy's rule-based ``sentencizer`` pipe.
        """
        nlp = English()
        nlp.add_pipe("sentencizer")

        # Keep only the plain text of each sentence so that downstream steps
        # work with ordinary ``str`` values rather than spaCy objects.
        self.data['sentences'] = self.data['text'].map(
            lambda text: [sentence.text for sentence in nlp(text).sents]
        )

    def chunk_data(self):
        """Add a ``chunked`` column: each page's sentences in groups.

        Every group holds at most ``self.chunk_size`` sentences. Slicing is
        used so that a page whose sentence count is an exact multiple of
        ``chunk_size`` does not get a trailing empty chunk, and a page without
        sentences gets no chunk at all.
        """
        size = self.chunk_size
        self.data['chunked'] = self.data['sentences'].map(
            lambda sentences: [
                sentences[start:start + size]
                for start in range(0, len(sentences), size)
            ]
        )

    def split_chunks(self):
        """Replace ``self.data`` with one row per chunk.

        The result has the columns ``page_number`` and ``chunked``, where
        ``chunked`` is the chunk's sentences joined by single spaces.
        """
        chunked_data = (
            self.data[['page_number', 'chunked']]
            .explode('chunked')
            # explode() turns a page with no chunks into a NaN row; drop it
            # so no empty text reaches the embedding step.
            .dropna(subset=['chunked'])
            .reset_index(drop=True)
        )

        # converting the chunked list into one string
        chunked_data['chunked'] = chunked_data['chunked'].map(' '.join)

        self.data = chunked_data


In [4]:
# Source PDF, given relative to the notebook's working directory.
pdf_path = Path('./Early Iran History.pdf')
# Constructing Preprocessing runs the whole pipeline (see its __init__):
# read pages, split into sentences, then chunk 13 sentences at a time.
preprocessor = Preprocessing(pdf_path, chunk_size=13)

In [5]:
# Calling the instance returns its processed DataFrame (one row per chunk).
data = preprocessor()

In [6]:
# Preview the first rows of the chunk table.
data.head()

Unnamed: 0,page_number,chunked
0,1,H I S T O R Y O F E A R L Y I R A N oi.uc...
1,2,T H E U N I V E R S I T Y O F C H I C A G O...
2,3,HISTORY OF EARLY IRAN BY GEORGE G. CAMERON I...
3,4,C O P Y R I G H T I 9 3 6 B Y T H E U N I ...
4,5,TO MY WIFE oi.uchicago.edu


### Embedding

In [7]:
# Pick the best available backend: a CUDA GPU first, then Apple's MPS,
# and the CPU as the fallback.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'
device

'mps'

In [8]:
# Load the "all-mpnet-base-v2" sentence-embedding model on the selected device.
embedding_model = SentenceTransformer(
    model_name_or_path="all-mpnet-base-v2", 
    device=device
)



In [9]:
class Embedder:
    """Embed text chunks and retrieve the ones most similar to a query.

    Attributes are set in ``__init__``:
        data: DataFrame with at least the columns ``page_number`` and
            ``chunked`` (the chunk text).
        embedding_model: Model whose ``encode`` method produces embeddings.
        embeddings: 2-D tensor with one embedding row per row of ``data``.
    """

    def __init__(self, data: pd.DataFrame, embedding_model: "SentenceTransformer") -> None:
        self.data = data
        self.embedding_model = embedding_model
        self.embeddings = self.embed()

    def embed(self, batch_size: int = 64):
        """Encode every chunk in ``self.data`` and return the embedding matrix.

        All chunks are passed to ``encode`` in a single call so that
        ``batch_size`` actually takes effect; encoding row by row would run
        the model on one text at a time. The tensor is returned as produced
        by the model, so it does not depend on any notebook-global device
        variable.

        Args:
            batch_size: Number of chunks encoded per forward pass.

        Returns:
            A tensor with one row per chunk, in the row order of ``self.data``.
        """
        chunks = self.data['chunked'].tolist()
        return self.embedding_model.encode(
            chunks,
            batch_size=batch_size,
            convert_to_tensor=True,
        )

    def get_score(self, query, k):
        """Return the top-``k`` dot-product scores of ``query`` against all chunks.

        ``k`` is capped at the number of embedded chunks, because
        ``torch.topk`` raises when asked for more entries than exist.

        Returns:
            The ``torch.topk`` result: ``(values, indices)``, each with one
            row for the query.
        """
        query_embedding = self.embedding_model.encode(query, convert_to_tensor=True)
        dot_score = util.dot_score(a=query_embedding, b=self.embeddings)
        k = min(k, self.embeddings.shape[0])
        return torch.topk(input=dot_score, k=k, dim=1)

    def get_related_content(self, query, k=5):
        """Print score, row index, page number and text of the best ``k`` chunks."""
        scores, indices = self.get_score(query, k)
        # tolist() yields plain Python floats / ints, usable for printing and iloc.
        for value, index in zip(scores.ravel().tolist(), indices.ravel().tolist()):
            row = self.data.iloc[index]
            print(f"Score: {value}")
            print(f"Index: {index}")
            print(f"Page: {row['page_number']}")
            pprint(row['chunked'])
            print()

In [10]:
emb = Embedder(data, embedding_model)
# The constructor already computed and stored the embeddings; show the stored
# matrix instead of calling emb.embed() again, which would re-encode every chunk.
emb.embeddings

tensor([[-0.0319,  0.0426,  0.0149,  ..., -0.0099,  0.0187, -0.0342],
        [ 0.0485, -0.0495, -0.0167,  ..., -0.0002, -0.0482, -0.0372],
        [ 0.0207,  0.0594, -0.0192,  ...,  0.0692, -0.0103, -0.0485],
        ...,
        [-0.0011,  0.0078, -0.0217,  ...,  0.0783,  0.0023, -0.0388],
        [ 0.0347, -0.0003, -0.0243,  ...,  0.0615,  0.0028, -0.0384],
        [ 0.0066,  0.0154, -0.0130,  ...,  0.0797, -0.0005, -0.0443]],
       device='mps:0')

In [11]:
# Print the best-matching chunks for a sample question (k defaults to 5).
emb.get_related_content(
    'What is the name of the first Iranian king?'
)

Score: 0.6583991050720215
Index: 64
Page: 42
('26  HISTORY OF EARLY IRAN  ready had a local history;12 but its political '
 'fate was  inextricably bound up with the city Awan, where  there now (ca. '
 '2670 B.C.) began to rule a dynasty of  kings, twelve in number.13  Peli '
 'founded the dynasty; and, if names are to be  trusted, his immediate '
 'successors were all pure Elam- ites. To us these rulers—Tata,14 Ukku-tahesh, '
 'Hi- shur, Shushun-tarana, Napi-ilhush, and Kikku-sime- temti—are no more '
 'than names, though we might,  with some degree of probability, ascribe to '
 'one of  them an inscription since found on Liyan, modern  Bushire, an island '
 'in the Persian Gulf. Fragmentary  though it is, this text with its archaic '
 'signs is yet  proof that by the time of Sargon of Agade the Elam- ites had '
 'adopted the Sumerian script to write their  own language.15 With the eighth '
 'member of the  dynasty, Luhhi-ishshan, and his successor, Hishep- 13 Scheil, '
 'Mim.y VI, 59