In [1]:
##Required packages
#!pip install pymupdf
#!pip install tqdm
#!pip install sentence-transformers tensorflow
#!pip install tf-keras
#!pip install langchain-text-splitters

# Embedding model https://huggingface.co/sentence-transformers/all-mpnet-base-v2

## optional, info only
#!pip install accelerate
#!pip install bitsandbytes
#!pip install flash-attn --no-build-isolation # failed because no GPU

# Semantic chunk: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
# nltk
#!pip install --upgrade protobuf

# Notebook-wide debug switch read by later cells: when truthy, the chunker
# writes each chunk to its dump file and the driver cells print the sizes of
# the intermediate results.
local_debug = True


In [2]:
from tqdm import tqdm
import pymupdf
import re


def text_formatter(text: str) -> str:
    """Flatten raw PDF page text into one line per paragraph.

    The substitutions run strictly in the order listed below:
      * a ligature glyph followed by a space becomes its plain ASCII letters;
      * paragraph breaks (blank lines, or a line ending in '.' or ':') are
        swapped for a temporary marker;
      * a hyphen at a line end is swapped for a second temporary marker;
      * every remaining newline becomes a space;
      * the paragraph marker is expanded to ".\\n" and the hyphen marker is
        dropped, which re-joins words that were split across lines.

    Runs of spaces are then collapsed to one and the result is stripped.

    Args:
        text: Text of a single page as extracted from the PDF.

    Returns:
        The cleaned text.
    """
    paragraph_marker = "<paraend>"
    hyphen_marker = "<wordcut>"

    # Sequence matters: longer newline patterns must be consumed before the
    # shorter ones, and both markers are only expanded at the very end.
    substitutions = [
        ("ﬁ ", "fi"),
        ("ﬂ ", "fl"),
        (".\n\n\n", paragraph_marker),
        (".\n\n", paragraph_marker),
        ("\n\n", paragraph_marker),
        (":\n", paragraph_marker),
        (".\n", paragraph_marker),
        ("-\n", hyphen_marker),
        ("\n", " "),
        (paragraph_marker, ".\n"),
        (hyphen_marker, ""),
    ]

    result = text
    for needle, replacement in substitutions:
        result = result.replace(needle, replacement)

    # Squeeze runs of spaces down to a single space, then trim both ends.
    return re.sub(' +', ' ', result).strip()

def open_and_read_pdf(pdf_path: str,
                      header_height: float = 50,
                      footer_height: float = 50,
                      skip_pages: int = 14) -> list[dict] :
    """Read a PDF and return cleaned text plus simple statistics per page.

    Each page is clipped to drop a band at the top (header) and bottom
    (footer) before its text is extracted and passed through
    ``text_formatter``. The first page (index 0) and every page whose index is
    not greater than ``skip_pages`` are ignored, as are pages whose cleaned
    text is empty.

    Args:
        pdf_path: Path of the PDF file to read.
        header_height: Height removed from the top of every page, in page
            coordinate units.
        footer_height: Height removed from the bottom of every page, in page
            coordinate units.
        skip_pages: Number of leading pages to skip; it is also subtracted
            from the zero-based page index to produce ``page_number``.

    Returns:
        A list with one dict per kept page, holding ``page_number``,
        ``page_char_count``, ``page_word_count``, ``page_sentence_count_raw``,
        ``page_token_count`` (character count / 4) and ``text``. An empty list
        is returned when the file cannot be opened or read; the error is
        printed rather than raised.
    """
    doc = None
    try:
        doc = pymupdf.open(pdf_path)
        print(f"Successfully opened '{pdf_path}'.")
        print(f"Number of pages: {doc.page_count}")

        # Index 0 is always skipped, so a negative skip_pages behaves like 0.
        last_skipped_index = max(skip_pages, 0)
        pages_and_texts = []
        # total= lets tqdm draw a real progress bar; enumerate() has no len().
        for page_number, page in tqdm(enumerate(doc), total=doc.page_count):
            if page_number <= last_skipped_index:
                # Skipped pages are never part of the result, so do not pay
                # for extracting their text.
                continue

            page_rect = page.rect
            clip = pymupdf.Rect(
                page_rect.x0,
                page_rect.y0 + header_height,
                page_rect.x1,
                page_rect.y1 - footer_height
            )
            text = text_formatter(page.get_text(clip=clip))
            if len(text) == 0:
                continue

            pages_and_texts.append({"page_number": page_number - skip_pages,
                                    "page_char_count": len(text),
                                    "page_word_count": len(text.split(" ")),
                                    "page_sentence_count_raw": len(text.split(". ")),
                                    "page_token_count": len(text)/4, # 1 token =~4 char
                                    "text": text })
        return pages_and_texts
    except FileNotFoundError:
        print(f"Error: The file '{pdf_path}' was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # Release the document on every path, including failures mid-read.
        if doc is not None:
            doc.close()
    # Error paths land here: honour the declared return type instead of
    # handing callers an implicit None.
    return []



In [3]:
import string
import re
from spellchecker import SpellChecker
import os

def extract_text_and_spell_check(pages_and_texts: list) -> list :
    """Collect the text of every page and log words the spell checker rejects.

    Punctuation is stripped from each page's text before it is split on
    whitespace and checked. Every word reported as unknown is written, one per
    line, to ``misspelled_words_test.txt`` in the current working directory;
    the file is recreated on every call.

    Args:
        pages_and_texts: Iterable of dicts that each carry a ``"text"`` entry.
            ``None`` is treated as an empty list.

    Returns:
        The ``"text"`` values in input order.
    """
    pages = pages_and_texts or []
    raw_text = []
    misspell_file = "misspelled_words_test.txt"

    # One checker serves all pages; rebuilding it for every page repeats the
    # same construction work without changing the result.
    spell = SpellChecker()

    # Mode 'w' truncates an existing file, so no separate delete is needed.
    with open(misspell_file, 'w', encoding='utf-8') as file:
        for item in pages:
            page_text = item["text"]
            raw_text.append(page_text)
            # Drop punctuation so "word," and "word" are checked identically.
            cleaned_text = re.sub(r'[^\w\s]', '', page_text)
            for word in spell.unknown(cleaned_text.split()):
                file.write(word + "\n")
    return raw_text

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def get_recursive_chunks(raw_text: list) -> list:
    """Split the given texts into chunks with a recursive character splitter.

    The splitter is configured with a chunk size of 2000 and an overlap of 40.
    ``chunks_test.txt`` is removed if present and then recreated; the chunks
    are written into it, each followed by a blank line, only while the
    notebook-level ``local_debug`` flag is truthy.

    Args:
        raw_text: List of strings to split.

    Returns:
        The text content of every produced chunk, in order.
    """
    # Separator priority: paragraphs, then newlines, then spaces, then
    # individual characters.
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", " ", ""],
        chunk_overlap=40,
        chunk_size=2000,
    )
    chunk_texts = [document.page_content
                   for document in splitter.create_documents(raw_text)]

    dump_path = "chunks_test.txt"
    if os.path.exists(dump_path):
        os.remove(dump_path)

    # The file is always (re)created; it stays empty unless debugging is on.
    with open(dump_path, 'w', encoding='utf-8') as dump:
        for chunk_text in chunk_texts:
            if local_debug :
                dump.write(chunk_text + "\n\n")
    return chunk_texts
    

  from .autonotebook import tqdm as notebook_tqdm





In [8]:

# SELECT id,SUBSTRING(ni.chunk, 1, 200) AS short_chunk, embedding as dimension FROM nutritionitems ni
# ORDER BY embedding <=> '[5.99734336e-02,-1.30569497e-02]'
# LIMIT 5;
# drop table nutritionitems;
# CREATE TABLE nutritionitems (id bigserial PRIMARY KEY,chunk VARCHAR(2500) NOT NULL,embedding vector(768));
# CREATE INDEX ON nutritionitems USING hnsw (embedding vector_cosine_ops);

In [5]:
from sentence_transformers import SentenceTransformer
def get_embeddings(recursive_chunks : list) -> list:
    """Embed every chunk with the 'all-mpnet-base-v2' sentence-transformer.

    A new model instance is created on each call.

    Args:
        recursive_chunks: The chunk texts to encode.

    Returns:
        The encoder output converted with ``.tolist()``.
    """
    encoder = SentenceTransformer(model_name_or_path='all-mpnet-base-v2')
    return encoder.encode(recursive_chunks).tolist()

In [12]:
import psycopg2
import numpy as np
import os
from dotenv import load_dotenv
from psycopg2 import Error

class PostgreSQLManager:
    """Small wrapper around a single psycopg2 connection.

    Autocommit is switched off on connect, so write helpers commit explicitly
    and roll back when psycopg2 reports an error. Failures are printed and
    signalled through the return value instead of being raised.
    """

    def __init__(self, db_params):
        """
        Initializes the database connection.
        db_params should be a dictionary with keys like 'host', 'database', 'user', 'password', 'port'.
        """
        self.db_params = db_params
        # Stays None until connect() succeeds, and again after disconnect().
        self.connection = None

    def connect(self):
        """Establishes a connection to the PostgreSQL database."""
        try:
            self.connection = psycopg2.connect(**self.db_params)
            self.connection.autocommit = False  # Disable autocommit for explicit transactions
            print("Database connection established successfully.")
        except Error as e:
            print(f"Error connecting to database: {e}")
            self.connection = None

    def disconnect(self):
        """Closes the database connection and forgets it."""
        if self.connection:
            self.connection.close()
            # Drop the closed handle so later calls hit the "no connection"
            # guard instead of operating on a closed connection.
            self.connection = None
            print("Database connection closed.")

    def _has_connection(self) -> bool:
        """Return True when a connection exists; otherwise print a hint and return False."""
        if not self.connection:
            print("No database connection. Please connect first.")
            return False
        return True

    def execute_batch(self, insert_query, data_to_insert) -> bool:
        """Run ``insert_query`` once per parameter tuple and commit.

        Returns:
            True when the batch was committed; False when there is no
            connection or the database reported an error (the transaction is
            rolled back in that case).
        """
        if not self._has_connection():
            return False

        # `import psycopg2` alone does not load the extras submodule, so import
        # the helper explicitly rather than relying on another cell doing it.
        from psycopg2.extras import execute_batch as run_batch

        try:
            with self.connection.cursor() as cursor:
                run_batch(cursor, insert_query, data_to_insert)
                self.connection.commit()  # Commit changes for CUD operations
                return True
        except Error as e:
            self.connection.rollback()  # Rollback on error
            print(f"Database operation failed: {e}")
            return False

    def execute_select_count(self, select_query) -> int :
        """Run a query and return the first column of its first row.

        Returns:
            The value, or None when there is no connection, the query produced
            no row, or the database reported an error.
        """
        if not self._has_connection():
            return None

        try:
            with self.connection.cursor() as cursor:
                cursor.execute(select_query)
                first_row = cursor.fetchone()
                # fetchone() gives None when the result set is empty.
                return first_row[0] if first_row else None
        except Error as e:
            self.connection.rollback()  # Rollback on error
            print(f"Database operation failed: {e}")
            return None

    def execute_query(self, query, params=None, fetch_result=False):
        """Helper method to execute a query and handle transactions.

        Args:
            query: SQL text to execute.
            params: Optional parameters passed to the cursor with the query.
            fetch_result: When True, return all rows and do not commit;
                otherwise commit and return True.

        Returns:
            The fetched rows, True after a commit, or None when there is no
            connection or the database reported an error.
        """
        if not self._has_connection():
            return None

        try:
            with self.connection.cursor() as cursor:
                if params:
                    cursor.execute(query, params)
                else:
                    cursor.execute(query)

                if fetch_result:
                    return cursor.fetchall()
                else:
                    self.connection.commit()  # Commit changes for CUD operations
                    return True
        except Error as e:
            self.connection.rollback()  # Rollback on error
            print(f"Database operation failed: {e}")
            return None

In [10]:
import psycopg2
import numpy as np
import os
from dotenv import load_dotenv
from psycopg2.extras import execute_batch

def save_relevant_chunks(recursive_chunks:list, embeddings_list:list) -> int:
    """Insert every chunk with its embedding into the ``nutritionitems`` table.

    Connection settings are read from the environment (after ``load_dotenv``)
    through DB_NAME, DB_USER, DB_PASS, DB_HOST and DB_PORT.

    Args:
        recursive_chunks: Chunk texts.
        embeddings_list: One embedding per chunk, in the same order.

    Returns:
        The number of rows written by this call. 0 is returned when the
        connection could not be opened or the batch insert failed and was
        rolled back.

    Raises:
        ValueError: If the two lists differ in length, since pairing them
            would silently drop or misalign data.
    """
    if len(recursive_chunks) != len(embeddings_list):
        raise ValueError(
            f"Got {len(recursive_chunks)} chunks but {len(embeddings_list)} embeddings; "
            "they must pair up one-to-one."
        )

    load_dotenv()
    db_params = {
            "host": os.getenv("DB_HOST"),
            "database": os.getenv("DB_NAME"),
            "user": os.getenv("DB_USER"),
            "password": os.getenv("DB_PASS"),
            "port": os.getenv("DB_PORT")
        }

    crud_manager = PostgreSQLManager(db_params)
    crud_manager.connect()
    if not crud_manager.connection:
        return 0

    # One (chunk, embedding) tuple per row, matching the two %s placeholders.
    data_to_insert = list(zip(recursive_chunks, embeddings_list))
    table_name = "nutritionitems"
    insert_sql = f"INSERT INTO {table_name} (chunk, embedding) VALUES (%s, %s)"
    try:
        # Report what this call wrote rather than the table's total row count,
        # which also includes rows left over from earlier runs.
        if crud_manager.execute_batch(insert_sql, data_to_insert):
            return len(data_to_insert)
        return 0
    finally:
        # Always release the connection, even if the insert raised.
        crud_manager.disconnect()

In [15]:
import pandas as pd

# Run the ingestion pipeline: PDF pages -> raw text -> chunks -> embeddings.
pdf_path = "HumanNutrition.pdf"
pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)

# Per-page statistics, used only for inspection.
df = pd.DataFrame(pages_and_texts)
#df.head()
if local_debug :
    # A bare expression in the middle of a cell is never rendered, so the
    # summary has to be printed explicitly to be seen.
    print(df.describe().round())

raw_text = extract_text_and_spell_check(pages_and_texts)
if local_debug :
    print("raw_text len", len(raw_text))
recursive_chunks = get_recursive_chunks(raw_text)
if local_debug :
    print("recursive_chunks len", len(recursive_chunks))
embeddings_list = get_embeddings(recursive_chunks)
if local_debug :
    print("embeddings_list len", len(embeddings_list))


In [14]:
# Store every chunk together with its embedding in PostgreSQL and keep the
# integer the helper returns; it is only reported when debugging is on.
inserted_rows = save_relevant_chunks(recursive_chunks, embeddings_list)
if local_debug :
    print("inserted_rows: ", inserted_rows)

Database connection established successfully.
Database connection closed.
inserted_rows:  989


In [None]:
# Recursive chunking 
# chunk by double new lines \n\n
# chunk by single new line
# chunk by sentence
# https://github.com/docling-project/docling    -> handle tables
# https://github.com/google/langextract
# https://spacy.io/api/sentencizer
# used this https://huggingface.co/sentence-transformers/all-mpnet-base-v2