In [1]:
#!pip install pymupdf
#!pip install tqdm

#!pip install accelerate
#!pip install bitsandbytes
#!pip install flash-attn --no-build-isolation # failed because no GPU

#!pip install langchain-text-splitters

# Semantic chunk: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
# nltk
# Embedding model https://huggingface.co/sentence-transformers/all-mpnet-base-v2
#!pip install kubernetes
#!pip install sentence-transformers tensorflow
#!pip install tf-keras
#!pip install --upgrade protobuf


In [2]:
from tqdm import tqdm
import pymupdf
#import pdb

def text_formatter(text: str) -> str:
    """Flatten ``text`` onto one line.

    Every newline character becomes a single space, after which leading and
    trailing whitespace is trimmed. Further clean-up steps can be added here.
    """
    single_line = " ".join(text.split("\n"))
    return single_line.strip()

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """Extract the text of every non-empty page of a PDF with simple size stats.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A list with one dict per page whose cleaned text is non-empty. Keys:
        ``page_number`` (zero-based index from ``enumerate``),
        ``page_char_count``, ``page_word_count``, ``page_sentence_count_raw``,
        ``page_token_count`` (characters / 4, a rough estimate) and ``text``.
        If the file cannot be opened or read, the error is printed and an
        empty list is returned, so callers can always slice/iterate the result.
    """
    try:
        # The context manager closes the document even if extraction fails
        # part-way through, so the file handle is never leaked.
        with pymupdf.open(pdf_path) as doc:
            print(f"Successfully opened '{pdf_path}'.")
            print(f"Number of pages: {doc.page_count}")

            pages_and_texts = []
            # enumerate() has no length, so give tqdm the total explicitly to
            # get a real progress bar instead of a bare iteration counter.
            for page_number, page in tqdm(enumerate(doc), total=doc.page_count):
                text = text_formatter(page.get_text())
                if not text:
                    # Pages without extractable text are skipped.
                    continue
                pages_and_texts.append({"page_number": page_number,
                                        "page_char_count": len(text),
                                        # Splits on single spaces, so runs of
                                        # spaces contribute empty "words".
                                        "page_word_count": len(text.split(" ")),
                                        "page_sentence_count_raw": len(text.split(". ")),
                                        "page_token_count": len(text)/4, # 1 token =~4 char
                                        "text": text})
            return pages_and_texts
    except FileNotFoundError:
        print(f"Error: The file '{pdf_path}' was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")
    # Best-effort contract: report the problem above and hand back an empty
    # result rather than None, which would break `result[:2]` in the caller.
    return []

# Source PDF; a relative path, so it is resolved against the kernel's working directory.
pdf_path = "HumanNutrition.pdf"
pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
# Preview the first two page records.
pages_and_texts[:2]

Successfully opened 'HumanNutrition.pdf'.
Number of pages: 386


386it [00:04, 88.16it/s] 


[{'page_number': 1,
  'page_char_count': 202,
  'page_word_count': 37,
  'page_sentence_count_raw': 1,
  'page_token_count': 50.5,
  'text': 'Introduction to  Human Nutrition Second Edition Edited on behalf of The Nutrition Society by Michael J Gibney  Susan A Lanham-New  Aedin Cassidy  Hester H Vorster  A John Wiley & Sons, Ltd., Publication'},
 {'page_number': 3,
  'page_char_count': 32,
  'page_word_count': 5,
  'page_sentence_count_raw': 1,
  'page_token_count': 8.0,
  'text': 'Introduction to  Human Nutrition'}]

In [3]:
# import random
# random.sample(pages_and_texts, k=1)

In [4]:
import pandas as pd
# One row per extracted page record, for a quick look at the per-page statistics.
df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,1,202,37,1,50.5,Introduction to Human Nutrition Second Editio...
1,3,32,5,1,8.0,Introduction to Human Nutrition
2,4,2251,301,1,562.75,The Nutrition Society Textbook Series Introduc...
3,5,202,37,1,50.5,Introduction to Human Nutrition Second Editio...
4,6,2775,458,37,693.75,This edition ﬁ rst published 2009 First editio...


In [5]:
# Summary statistics of the numeric per-page columns, rounded to whole numbers.
df.describe().round()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,381.0,381.0,381.0,381.0,381.0
mean,194.0,3851.0,653.0,23.0,963.0
std,110.0,1090.0,186.0,12.0,272.0
min,1.0,32.0,5.0,1.0,8.0
25%,99.0,3210.0,551.0,15.0,802.0
50%,194.0,4254.0,704.0,23.0,1064.0
75%,289.0,4673.0,791.0,28.0,1168.0
max,385.0,5313.0,948.0,91.0,1328.0


In [6]:
# Keep only the cleaned text of each page record, in page order.
raw_text = [page_record["text"] for page_record in pages_and_texts]
print(f"Extracted text: {raw_text[:2]}")

Extracted text: ['Introduction to  Human Nutrition Second Edition Edited on behalf of The Nutrition Society by Michael J Gibney  Susan A Lanham-New  Aedin Cassidy  Hester H Vorster  A John Wiley & Sons, Ltd., Publication', 'Introduction to  Human Nutrition']


In [7]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Configure the recursive splitter.
#   chunk_size    - upper bound on the size of a chunk (characters by default)
#   chunk_overlap - number of characters shared by consecutive chunks
#   separators    - split points to try, in order of preference
# NOTE: text_formatter() has already replaced every "\n" in the page text with
# a space, so for this input the two newline separators cannot match and the
# split effectively happens on spaces.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_overlap=20,
    chunk_size=200,
)

# Each page string is split independently into Document objects.
docs = text_splitter.create_documents(raw_text)
print("No of recursive chunks", len(docs))
print(docs[:2])

# Plain chunk strings, in the same order as `docs`.
recursive_chunks = [document.page_content for document in docs]

print("Recursive chunks: ------ ", recursive_chunks[:2])

  from .autonotebook import tqdm as notebook_tqdm



No of recursive chunks 8301
[Document(metadata={}, page_content='Introduction to  Human Nutrition Second Edition Edited on behalf of The Nutrition Society by Michael J Gibney  Susan A Lanham-New  Aedin Cassidy  Hester H Vorster  A John Wiley & Sons, Ltd.,'), Document(metadata={}, page_content='Wiley & Sons, Ltd., Publication')]
Recursive chunks: ------  ['Introduction to  Human Nutrition Second Edition Edited on behalf of The Nutrition Society by Michael J Gibney  Susan A Lanham-New  Aedin Cassidy  Hester H Vorster  A John Wiley & Sons, Ltd.,', 'Wiley & Sons, Ltd., Publication']


In [None]:
# SELECT id,SUBSTRING(ni.chunk, 1, 200) AS short_chunk, embedding as dimension FROM nutritionitems ni LIMIT 5;
# SELECT id,SUBSTRING(ni.chunk, 1, 200) AS short_chunk, embedding as dimension FROM nutritionitems ni
# ORDER BY embedding <=> '[5.99734336e-02,-1.30569497e-02]'
# LIMIT 5;

In [35]:


from sentence_transformers import SentenceTransformer

# The recursive chunks are the texts that get embedded.
sentences = recursive_chunks

embedding_model = SentenceTransformer(model_name_or_path='all-mpnet-base-v2')

# Later cells read `embeddings`, so it must be defined here for the notebook to
# survive Restart & Run All. Encoding every chunk is slow, so the work is only
# done when no embeddings matching the current chunks exist in the kernel yet.
if "embeddings" not in globals() or len(embeddings) != len(sentences):
    embeddings = embedding_model.encode(sentences, show_progress_bar=True)

# Embed a sample query with the same model so it is comparable to the chunks.
input_query = "What is Metabolism of Proteins"
query_embedding = embedding_model.encode(input_query)
print("query_embedding", query_embedding)


query_embedding [ 5.99734336e-02 -1.30569497e-02  2.30207779e-02 -2.00188030e-02
 -7.81783741e-03  2.83929184e-02 -7.55176246e-02  3.40906158e-02
 -6.34887815e-02 -3.30348052e-02 -3.83958500e-03  2.17949245e-02
  1.22626675e-02  1.32024419e-02 -2.19708215e-02  6.36042878e-02
 -3.19794118e-02 -2.56833211e-02 -2.51763295e-02  2.15658639e-02
 -2.04073582e-02  5.76458639e-04  4.17370498e-02 -4.13271561e-02
  4.25131954e-02  1.95993949e-02 -1.69518702e-02  3.48948268e-03
  2.45916005e-02  2.21633967e-02 -7.56690130e-02 -5.84247382e-03
  1.93108283e-02  2.45451499e-02  1.22844278e-06 -6.91458955e-02
 -5.60407303e-02 -1.92490276e-02 -1.01138782e-02  5.83418235e-02
 -2.64907796e-02  4.10293200e-04 -1.58048812e-02  3.67564894e-02
  6.66305125e-02  1.08983833e-02  2.55106930e-02 -3.24699730e-02
 -1.87634956e-02 -2.25600284e-02  1.23422742e-02 -3.22799496e-02
 -2.45459974e-02 -4.52162605e-03  2.34220345e-02  6.59096334e-03
  4.03133184e-02  4.11379784e-02 -2.35513486e-02  8.81510787e-03
  7.70121

In [29]:
# Sanity checks: embedding width, one embedding per chunk, and container type.
for label, value in (
    ("length of each embedding", len(embeddings[0])),
    ("total embeddings", len(embeddings)),
    ("total chunks", len(recursive_chunks)),
    ("embeddings type", type(embeddings)),
):
    print(label, value)

# drop table nutritionitems;
# CREATE TABLE nutritionitems (id bigserial PRIMARY KEY,chunk VARCHAR(400) NOT NULL,embedding vector(768));
# CREATE INDEX ON nutritionitems USING hnsw (embedding vector_cosine_ops);

length of each embedding 768
total embeddings 8301
total chunks 8301
embeddings type <class 'numpy.ndarray'>


In [34]:
import os

import psycopg2
import numpy as np

# Plain nested Python lists (one list per chunk) instead of a NumPy array.
embeddings_list = embeddings.tolist()

# Database connection parameters.
# Values come from the standard libpq environment variables; the non-secret
# ones fall back to this notebook's previous settings. The password has no
# fallback on purpose: credentials must not be stored in the notebook.
DB_NAME = os.environ.get("PGDATABASE", "sales")
DB_USER = os.environ.get("PGUSER", "postgres")
# None when PGPASSWORD is unset. NOTE(review): psycopg2 is expected to drop
# None-valued connect() arguments so libpq can fall back to ~/.pgpass - confirm.
DB_PASS = os.environ.get("PGPASSWORD")
DB_HOST = os.environ.get("PGHOST", "localhost")  # Or your PostgreSQL host
DB_PORT = os.environ.get("PGPORT", "5433")       # This instance's port (PostgreSQL's default is 5432)
#conn = psycopg2.connect(database=DB_NAME, user=DB_USER, password=DB_PASS, host=DB_HOST, port=DB_PORT)
# The connection itself is established in the ingest cell.


Database connected successfully!


In [36]:
# Ingest 
# conn = psycopg2.connect(
#     database=DB_NAME,
#     user=DB_USER,
#     password=DB_PASS,
#     host=DB_HOST,
#     port=DB_PORT
# )
# print("Database connected successfully!")
# cur = conn.cursor()
# for i in range(len(embeddings_list)):
#     embedding = embeddings_list[i]
#     content = recursive_chunks[i]    
#     cur.execute("INSERT INTO nutritionitems (chunk, embedding) VALUES (%s, %s)",(content, embedding))
# conn.commit()
# cur.close()
# conn.close()

In [9]:
# Recursive chunking 
# chunk by double new lines \n\n
# chunk by single new line
# chunk by sentence
# https://github.com/docling-project/docling    -> handle tables
# https://github.com/google/langextract
# https://spacy.io/api/sentencizer
# https://huggingface.co/sentence-transformers/all-mpnet-base-v2