In [58]:
##Required packages
#!pip install pymupdf
#!pip install tqdm
#!pip install sentence-transformers tensorflow
#!pip install tf-keras
#!pip install langchain-text-splitters

# Embedding model https://huggingface.co/sentence-transformers/all-mpnet-base-v2

## optional, info only
#!pip install accelerate
#!pip install bitsandbytes
#!pip install flash-attn --no-build-isolation # failed because no GPU

# Semantic chunk: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
# nltk
#!pip install --upgrade protobuf


In [59]:
from tqdm import tqdm
import pymupdf
import re


def text_formatter(text: str) -> str:
    """Normalise raw page text into single-spaced, newline-separated paragraphs.

    A fixed sequence of literal substitutions is applied in order:

    * an "fi"/"fl" ligature followed by a space becomes the plain letters,
    * paragraph-like breaks (".\\n\\n\\n", ".\\n\\n", "\\n\\n", ":\\n", ".\\n")
      are first swapped for a temporary marker,
    * a hyphen at a line end ("-\\n") is swapped for a second marker,
    * every remaining newline becomes a space,
    * finally the paragraph marker becomes ".\\n" and the hyphen marker is
      removed, which joins the two halves of the hyphenated word.

    Runs of spaces are then collapsed to one space and the result is stripped.

    Args:
        text: Raw text of one page.

    Returns:
        The cleaned text.
    """
    paragraph_marker = "<paraend>"
    hyphen_marker = "<wordcut>"

    # Order matters: longer break patterns must be consumed before the shorter
    # ones they contain, and the markers are only resolved at the very end.
    substitutions = (
        ("ﬁ ", "fi"),
        ("ﬂ ", "fl"),
        (".\n\n\n", paragraph_marker),
        (".\n\n", paragraph_marker),
        ("\n\n", paragraph_marker),
        (":\n", paragraph_marker),
        (".\n", paragraph_marker),
        ("-\n", hyphen_marker),
        ("\n", " "),
        (paragraph_marker, ".\n"),
        (hyphen_marker, ""),
    )

    result = text
    for target, replacement in substitutions:
        result = result.replace(target, replacement)

    # Squash any run of spaces down to a single space.
    single_spaced = re.sub(' +', ' ', result)
    return single_spaced.strip()

def open_and_read_pdf(
    pdf_path: str,
    header_height: float = 50,
    footer_height: float = 50,
    front_matter_pages: int = 14,
) -> list[dict]:
    """Extract cleaned body text and simple size statistics for each PDF page.

    Each page is clipped to drop a band at the top and bottom (running
    header/footer) before its text is extracted and passed through
    ``text_formatter``. Pages whose zero-based index is not greater than
    ``front_matter_pages`` are skipped, and so are pages with no text left
    after cleaning.

    Args:
        pdf_path: Path of the PDF file to read.
        header_height: Height trimmed from the top of every page, in the
            units of ``page.rect``.
        footer_height: Height trimmed from the bottom of every page.
        front_matter_pages: Zero-based index of the last page to skip; it is
            also subtracted from the index to produce ``page_number``.

    Returns:
        One dict per kept page with the keys ``page_number``,
        ``page_char_count``, ``page_word_count``, ``page_sentence_count_raw``,
        ``page_token_count`` (characters / 4, a rough estimate) and ``text``.
        An empty list is returned when the file cannot be read; the error is
        printed rather than raised.
    """
    pages_and_texts = []
    try:
        # The context manager closes the document even if extraction fails
        # part-way through.
        with pymupdf.open(pdf_path) as doc:
            print(f"Successfully opened '{pdf_path}'.")
            print(f"Number of pages: {doc.page_count}")

            # Wrap the document itself (not the enumerate object) and pass the
            # total so tqdm can render an actual progress bar.
            for page_number, page in enumerate(tqdm(doc, total=doc.page_count)):
                # Skip front matter before doing any extraction work.
                if page_number <= 0 or page_number <= front_matter_pages:
                    continue

                page_rect = page.rect
                clip = pymupdf.Rect(
                    page_rect.x0,
                    page_rect.y0 + header_height,
                    page_rect.x1,
                    page_rect.y1 - footer_height,
                )
                text = text_formatter(page.get_text(clip=clip))
                if not text:
                    continue

                pages_and_texts.append({
                    "page_number": page_number - front_matter_pages,
                    "page_char_count": len(text),
                    "page_word_count": len(text.split(" ")),
                    "page_sentence_count_raw": len(text.split(". ")),
                    "page_token_count": len(text) / 4,  # 1 token =~ 4 chars
                    "text": text,
                })
    except FileNotFoundError:
        print(f"Error: The file '{pdf_path}' was not found.")
        return []
    except Exception as e:
        # Best-effort: report the problem and hand back an empty result so the
        # return value always matches the annotated list type.
        print(f"An error occurred: {e}")
        return []
    return pages_and_texts

pdf_path = "HumanNutrition.pdf"
pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)

# open_and_read_pdf only prints when it fails, so stop here with a clear
# message instead of letting later cells fail on a missing/empty page list.
if not pages_and_texts:
    raise RuntimeError(f"No page text was extracted from '{pdf_path}'.")

Successfully opened 'HumanNutrition.pdf'.
Number of pages: 386


386it [00:05, 70.67it/s]


In [60]:
import pandas as pd
# Per-page size statistics, used to judge how large the chunks need to be.
df = pd.DataFrame(pages_and_texts)

# Only the last expression of a cell is rendered automatically, so the preview
# needs an explicit display() call or it is silently discarded.
display(df.head())
df.describe().round()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,370.0,370.0,370.0,370.0,370.0
mean,186.0,3776.0,569.0,17.0,944.0
std,107.0,1039.0,162.0,9.0,260.0
min,1.0,233.0,37.0,1.0,58.0
25%,93.0,3159.0,469.0,11.0,790.0
50%,186.0,4160.0,618.0,17.0,1040.0
75%,278.0,4554.0,688.0,22.0,1138.0
max,371.0,5180.0,835.0,65.0,1295.0


In [61]:
import string
import re
from spellchecker import SpellChecker
import os

# Collect the cleaned text of every page (raw_text feeds the chunking step)
# and record words the spell checker does not recognise, as a rough check on
# extraction quality.
misspelled_words = []
raw_text = []

misspell_file = "misspelled_words_test.txt"

# The checker does not depend on the page, so build it once outside the loop.
spell = SpellChecker()

# Mode 'w' truncates an existing file, so no explicit delete is needed first.
with open(misspell_file, 'w', encoding='utf-8') as file:
    for item in pages_and_texts:
        raw_text.append(item["text"])
        # Strip punctuation so that "word," and "word" are checked alike.
        cleaned_text = re.sub(r'[^\w\s]', '', item["text"])
        misspelled = spell.unknown(cleaned_text.split())
        # Sort so the file contents are reproducible from run to run.
        for word in sorted(misspelled):
            misspelled_words.append(word)
            file.write(word + "\n")

# Report the file that was actually written (the original referenced an
# undefined name here, which raises NameError on a fresh kernel).
print(f"Text appended to '{misspell_file}'.")


Text appended to 'misspelled_words.txt'.


In [64]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Recursive character splitter settings:
#   separators    - tried in order of preference: paragraph break, newline,
#                   space, then individual characters
#   chunk_size    - maximum size of a chunk (characters by default)
#   chunk_overlap - number of characters shared by consecutive chunks
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=2000,
    chunk_overlap=40,
)

# Split every page's cleaned text into documents.
docs = text_splitter.create_documents(raw_text)
print("No of recursive chunks", len(docs))

# Keep the plain chunk strings for embedding and dump them to a text file,
# separated by blank lines, for manual inspection.
chunks_file = "chunks_test.txt"
recursive_chunks = [doc.page_content for doc in docs]

if os.path.exists(chunks_file):
    os.remove(chunks_file)

with open(chunks_file, 'w', encoding='utf-8') as file:
    file.writelines(chunk + "\n\n" for chunk in recursive_chunks)
    

No of recursive chunks 989


In [8]:
# SELECT id,SUBSTRING(ni.chunk, 1, 200) AS short_chunk, embedding as dimension FROM nutritionitems ni LIMIT 5;
# SELECT id,SUBSTRING(ni.chunk, 1, 200) AS short_chunk, embedding as dimension FROM nutritionitems ni
# ORDER BY embedding <=> '[5.99734336e-02,-1.30569497e-02]'
# LIMIT 5;

In [65]:
from sentence_transformers import SentenceTransformer
# Load the sentence-embedding model and encode every chunk in one call.
# NOTE(review): the all-mpnet-base-v2 model card says longer inputs are
# truncated (384 word pieces by default); the chunks come from a splitter with
# chunk_size=2000 characters, so confirm long chunks are not being cut off.
embedding_model = SentenceTransformer(model_name_or_path='all-mpnet-base-v2')
sentences = recursive_chunks
embeddings = embedding_model.encode(sentences)

In [66]:
# Sanity check: there should be one embedding per chunk.
chunk_total = len(recursive_chunks)
print("total chunks", chunk_total)

embedding_total = len(embeddings)
print("total embeddings", embedding_total)

# Dimensionality of one vector; it has to match the vector(...) column size.
first_vector = embeddings[0]
print("length of each embedding", len(first_vector))

print("embeddings type", type(embeddings))

# Convert to plain Python lists for the database insert step.
embeddings_list = embeddings.tolist()
print(type(embeddings_list))

# drop table nutritionitems;
# CREATE TABLE nutritionitems (id bigserial PRIMARY KEY,chunk VARCHAR(2500) NOT NULL,embedding vector(768));
# CREATE INDEX ON nutritionitems USING hnsw (embedding vector_cosine_ops);

total chunks 989
total embeddings 989
length of each embedding 768
embeddings type <class 'numpy.ndarray'>
<class 'list'>


In [69]:
import psycopg2
import numpy as np
import os
from dotenv import load_dotenv

def connect_to_db():
    """Open a PostgreSQL connection using settings from the environment.

    Variables are loaded from a .env file via ``load_dotenv`` and then read
    with ``os.getenv``: DB_NAME, DB_USER, DB_PASS, DB_HOST and DB_PORT.

    Returns:
        The psycopg2 connection on success, or ``None`` when psycopg2 reports
        a connection error (the error is printed, not raised).
    """
    load_dotenv()

    # Map psycopg2.connect keyword -> value read from the environment.
    connection_settings = {
        "dbname": os.getenv("DB_NAME"),
        "user": os.getenv("DB_USER"),
        "password": os.getenv("DB_PASS"),
        "host": os.getenv("DB_HOST"),
        "port": os.getenv("DB_PORT"),
    }

    try:
        conn = psycopg2.connect(**connection_settings)
    except psycopg2.Error as e:
        print(f"Error connecting to PostgreSQL database: {e}")
        return None

    print("Connected to PostgreSQL database successfully!")
    return conn


# Store every chunk together with its embedding in PostgreSQL, then report
# the resulting row count.
# NOTE: re-running this cell inserts the same rows again.
table_name = "nutritionitems"

# Chunks and embeddings are paired by position, so a length mismatch would
# silently store the wrong vector next to a chunk.
if len(embeddings_list) != len(recursive_chunks):
    raise ValueError(
        f"Got {len(embeddings_list)} embeddings for {len(recursive_chunks)} chunks."
    )

conn = connect_to_db()
if conn is None:
    # connect_to_db prints the reason and returns None on failure.
    raise RuntimeError("Could not connect to PostgreSQL; no rows were inserted.")

try:
    # `with conn` commits when the block succeeds and rolls back when it
    # raises; it does not close the connection, hence the finally below.
    with conn:
        with conn.cursor() as cur:
            # table_name is a constant defined above, not user input; the row
            # values themselves are passed as bound parameters.
            cur.executemany(
                f"INSERT INTO {table_name} (chunk, embedding) VALUES (%s, %s)",
                list(zip(recursive_chunks, embeddings_list)),
            )
            cur.execute(f"SELECT COUNT(*) FROM {table_name};")
            count_after = cur.fetchone()[0]
finally:
    conn.close()

print(f"Rows after insert into {table_name} table: {count_after}")


Connected to PostgreSQL database successfully!
Rows after insert into nutritionitems table: 989


In [None]:
# Recursive chunking 
# chunk by double new lines \n\n
# chunk by single new line
# chunk by sentence
# https://github.com/docling-project/docling    -> handle tables
# https://github.com/google/langextract
# https://spacy.io/api/sentencizer
# used this https://huggingface.co/sentence-transformers/all-mpnet-base-v2