In [1]:
##Required packages
#!pip install pymupdf
#!pip install tqdm
#!pip install sentence-transformers tensorflow
#!pip install tf-keras
#!pip install langchain-text-splitters

# Embedding model https://huggingface.co/sentence-transformers/all-mpnet-base-v2

## optional, info only
#!pip install accelerate
#!pip install bitsandbytes
#!pip install flash-attn --no-build-isolation # failed because no GPU

# Semantic chunk: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
# nltk
#!pip install --upgrade protobuf


In [2]:
from tqdm import tqdm
import pymupdf
#import pdb

def text_formatter(text: str) -> str:
    """Flatten a block of extracted text onto a single line.

    Each newline character is turned into one space, after which leading
    and trailing whitespace is trimmed. Runs of spaces inside the text are
    left exactly as they are.
    """
    # Additional clean-up steps can be chained onto this expression later.
    return " ".join(text.split("\n")).strip()

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """Extract per-page text and simple size statistics from a PDF.

    Args:
        pdf_path: Path of the PDF file to open with ``pymupdf``.

    Returns:
        One dict per page whose cleaned text is non-empty, with the keys
        ``page_number`` (zero-based position of the page in the document),
        ``page_char_count``, ``page_word_count``, ``page_sentence_count_raw``,
        ``page_token_count`` and ``text``. If the file cannot be opened or
        read, the error is printed (not raised) and an empty list is
        returned, so callers can always slice or iterate the result.
    """
    try:
        # The context manager closes the document even if extraction fails.
        with pymupdf.open(pdf_path) as doc:
            print(f"Successfully opened '{pdf_path}'.")
            print(f"Number of pages: {doc.page_count}")

            pages_and_texts = []
            # tqdm wraps the document itself (not the enumerate object) and is
            # given the total, so it can render a real progress bar.
            for page_number, page in enumerate(tqdm(doc, total=doc.page_count)):
                text = text_formatter(page.get_text())
                if not text:
                    # Pages with no extractable text are skipped.
                    continue
                pages_and_texts.append({"page_number": page_number,
                                        "page_char_count": len(text),
                                        "page_word_count": len(text.split(" ")),
                                        "page_sentence_count_raw": len(text.split(". ")),
                                        "page_token_count": len(text)/4, # 1 token =~4 char
                                        "text": text})
            return pages_and_texts
    except FileNotFoundError:
        print(f"Error: The file '{pdf_path}' was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")
    # Reached only after a handled error: honour the annotated return type.
    return []

# Source document for the pipeline, resolved relative to the working directory.
pdf_path = "HumanNutrition.pdf"

# Extract one record per non-empty page, then preview the first two records.
pages_and_texts = open_and_read_pdf(pdf_path)
pages_and_texts[:2]

Successfully opened 'HumanNutrition.pdf'.
Number of pages: 386


386it [00:02, 144.39it/s]


[{'page_number': 1,
  'page_char_count': 202,
  'page_word_count': 37,
  'page_sentence_count_raw': 1,
  'page_token_count': 50.5,
  'text': 'Introduction to  Human Nutrition Second Edition Edited on behalf of The Nutrition Society by Michael J Gibney  Susan A Lanham-New  Aedin Cassidy  Hester H Vorster  A John Wiley & Sons, Ltd., Publication'},
 {'page_number': 3,
  'page_char_count': 32,
  'page_word_count': 5,
  'page_sentence_count_raw': 1,
  'page_token_count': 8.0,
  'text': 'Introduction to  Human Nutrition'}]

In [3]:
# import random
# random.sample(pages_and_texts, k=1)

In [4]:
import pandas as pd
# Tabulate the per-page records (one row per page, one column per dict key)
# and show the first five rows.
df = pd.DataFrame(data=pages_and_texts)
df.head(5)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,1,202,37,1,50.5,Introduction to Human Nutrition Second Editio...
1,3,32,5,1,8.0,Introduction to Human Nutrition
2,4,2251,301,1,562.75,The Nutrition Society Textbook Series Introduc...
3,5,202,37,1,50.5,Introduction to Human Nutrition Second Editio...
4,6,2775,458,37,693.75,This edition ﬁ rst published 2009 First editio...


In [5]:
# Summary statistics of the numeric per-page columns, rounded to whole numbers.
df.describe().round(0)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,381.0,381.0,381.0,381.0,381.0
mean,194.0,3851.0,653.0,23.0,963.0
std,110.0,1090.0,186.0,12.0,272.0
min,1.0,32.0,5.0,1.0,8.0
25%,99.0,3210.0,551.0,15.0,802.0
50%,194.0,4254.0,704.0,23.0,1064.0
75%,289.0,4673.0,791.0,28.0,1168.0
max,385.0,5313.0,948.0,91.0,1328.0


In [6]:
# Keep only the cleaned text of each page record, in page order.
raw_text = [page_record["text"] for page_record in pages_and_texts]
print(f"Extracted text: {raw_text[:2]}")

Extracted text: ['Introduction to  Human Nutrition Second Edition Edited on behalf of The Nutrition Society by Michael J Gibney  Susan A Lanham-New  Aedin Cassidy  Hester H Vorster  A John Wiley & Sons, Ltd., Publication', 'Introduction to  Human Nutrition']


In [7]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Configure the recursive character splitter.
#   separators    - candidate split points, tried in order of preference
#                   (paragraphs, newlines, spaces, then single characters)
#   chunk_overlap - number of characters shared by consecutive chunks
#   chunk_size    - maximum size of each chunk (in characters by default)
# Note: raw_text comes from text_formatter, which replaces every newline with
# a space, so the "\n\n" and "\n" separators have nothing to match here.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_overlap=20,
    chunk_size=383,
)

# Split every page's text; each resulting Document holds one chunk.
docs = text_splitter.create_documents(raw_text)
print("No of recursive chunks", len(docs))
# Peek at the first two Document objects.
print(docs[:2])

# Keep only the chunk text of each Document, in order.
recursive_chunks = [chunk_doc.page_content for chunk_doc in docs]

print("Recursive chunks: ------ ", recursive_chunks[:2])

  from .autonotebook import tqdm as notebook_tqdm



No of recursive chunks 4207
[Document(metadata={}, page_content='Introduction to  Human Nutrition Second Edition Edited on behalf of The Nutrition Society by Michael J Gibney  Susan A Lanham-New  Aedin Cassidy  Hester H Vorster  A John Wiley & Sons, Ltd., Publication'), Document(metadata={}, page_content='Introduction to  Human Nutrition')]
Recursive chunks: ------  ['Introduction to  Human Nutrition Second Edition Edited on behalf of The Nutrition Society by Michael J Gibney  Susan A Lanham-New  Aedin Cassidy  Hester H Vorster  A John Wiley & Sons, Ltd., Publication', 'Introduction to  Human Nutrition']


In [8]:
# SELECT id,SUBSTRING(ni.chunk, 1, 200) AS short_chunk, embedding as dimension FROM nutritionitems ni LIMIT 5;
# SELECT id,SUBSTRING(ni.chunk, 1, 200) AS short_chunk, embedding as dimension FROM nutritionitems ni
# ORDER BY embedding <=> '[5.99734336e-02,-1.30569497e-02]'
# LIMIT 5;

In [12]:
from sentence_transformers import SentenceTransformer
# The chunk strings are the inputs to embed.
sentences = recursive_chunks
# Load the all-mpnet-base-v2 sentence-transformers model by name.
embedding_model = SentenceTransformer('all-mpnet-base-v2')
# Encode every chunk into one embedding.
embeddings = embedding_model.encode(sentences)

In [14]:
# Convert the embeddings to nested Python lists up front; the checks below
# only read the result.
embeddings_list = embeddings.tolist()

# Sanity checks: there should be one embedding per chunk.
print("total chunks", len(recursive_chunks))
print("total embeddings", len(embeddings))
print("length of each embedding", len(embeddings[0]))
# Container types before and after the conversion.
print("embeddings type", type(embeddings))
print(type(embeddings_list))

# drop table nutritionitems;
# CREATE TABLE nutritionitems (id bigserial PRIMARY KEY,chunk VARCHAR(400) NOT NULL,embedding vector(768));
# CREATE INDEX ON nutritionitems USING hnsw (embedding vector_cosine_ops);

total chunks 4207
total embeddings 4207
length of each embedding 768
embeddings type <class 'numpy.ndarray'>
<class 'list'>


In [15]:
import psycopg2
import numpy as np
import os
from dotenv import load_dotenv

def connect_to_db():
    """Open a PostgreSQL connection configured from environment variables.

    Calls ``load_dotenv()`` first, then reads DB_NAME, DB_USER, DB_PASS,
    DB_HOST and DB_PORT with ``os.getenv`` and passes them to
    ``psycopg2.connect``.

    Returns:
        The psycopg2 connection on success, or ``None`` when psycopg2
        raises an error while connecting (the error is printed).
    """
    load_dotenv()

    # psycopg2.connect keyword -> environment variable supplying its value.
    env_for_param = {
        "dbname": "DB_NAME",
        "user": "DB_USER",
        "password": "DB_PASS",
        "host": "DB_HOST",
        "port": "DB_PORT",
    }
    connect_kwargs = {
        param: os.getenv(env_name) for param, env_name in env_for_param.items()
    }

    try:
        connection = psycopg2.connect(**connect_kwargs)
    except psycopg2.Error as e:
        print(f"Error connecting to PostgreSQL database: {e}")
        return None

    print("Connected to PostgreSQL database successfully!")
    return connection


In [18]:
# Load every (chunk, embedding) pair into the nutritionitems table.
conn = connect_to_db()
if conn is None:
    # connect_to_db() prints the error and returns None on failure; stop with
    # a clear message instead of an AttributeError on conn.cursor().
    raise RuntimeError("No database connection; embeddings were not loaded.")

# Chunks and embeddings are paired by position, so they must line up exactly.
if len(embeddings_list) != len(recursive_chunks):
    conn.close()
    raise ValueError(
        f"Got {len(recursive_chunks)} chunks but {len(embeddings_list)} embeddings."
    )

try:
    # The cursor context manager closes the cursor even if an insert fails.
    with conn.cursor() as cur:
        for content, embedding in zip(recursive_chunks, embeddings_list):
            cur.execute("INSERT INTO nutritionitems (chunk, embedding) VALUES (%s, %s)",(content, embedding))
    # Commit once, after all rows have been inserted.
    conn.commit()
except Exception:
    # Leave the table untouched if anything went wrong part-way through.
    conn.rollback()
    raise
finally:
    conn.close()
print("loaded embeddings to nutritionitems table")

Connected to PostgreSQL database successfully!
Database connected successfully!


In [None]:
# Recursive chunking 
# chunk by double new lines \n\n
# chunk by single new line
# chunk by sentence
# https://github.com/docling-project/docling    -> handle tables
# https://github.com/google/langextract
# https://spacy.io/api/sentencizer
# used this https://huggingface.co/sentence-transformers/all-mpnet-base-v2