In [5]:
# execute as needed
# !pip install -r requirements.txt

## Process Books

In [6]:
import sys
import warnings

sys.path.append("../")

RAW_BOOKS_FOLDER_PATH = "../raw_data/books/"
PROCESSED_BOOKS_FOLDER_PATH = "../processed_data/books/"

warnings.filterwarnings('ignore')

In [7]:
import os
import fitz
from modules.embedding_tracking import add_new_file, file_exists

available_files = os.listdir(RAW_BOOKS_FOLDER_PATH)

for i in available_files:
    print(f"Processing {i}")
    
    processed_fname = ".".join(i.split(".")[:-1])+".txt"
    print(processed_fname)
    
    if file_exists(processed_fname):
        print(f"Skipping {i} as it has already been processed.")
        continue

    text = ""
    if i.endswith(".pdf"):
        fname = os.path.join(RAW_BOOKS_FOLDER_PATH, i)
        pdf_document = fitz.open(fname)
        for page_num in range(len(pdf_document)):
            page = pdf_document[page_num]
            page_text = page.get_text()
            # Fix spaces between words
            page_text = " ".join(page_text.replace("\n", " ").split())
            text += page_text + "\n"
        pdf_document.close()
            
    else:
        # invalid file types
        print(f"Skipping {i} as it is not a valid file.")
        continue
    
    with open(os.path.join(PROCESSED_BOOKS_FOLDER_PATH, processed_fname), "w", encoding="utf-8") as f:
        f.write(text)
        add_new_file(processed_fname)
        
    print(f"Finished processing {i}.")

Processing .DS_Store
.txt
Tracker file is empty or corrupted.
Skipping .DS_Store as it is not a valid file.
Processing Sunrise on the Reaping.pdf
Sunrise on the Reaping.txt
Tracker file is empty or corrupted.
Tracker file is empty or corrupted.
Finished processing Sunrise on the Reaping.pdf.
Processing All Fours (Miranda July).pdf
All Fours (Miranda July).txt
Finished processing All Fours (Miranda July).pdf.


### Store to storage as vectors

In [None]:
# https://medium.com/the-ai-forum/semantic-chunking-for-rag-f4733025d5f5

from modules import accessor, embedding_tracking
from langchain.text_splitter import RecursiveCharacterTextSplitter
from nltk.tokenize import sent_tokenize
import nltk
from sklearn.metrics.pairwise import cosine_similarity
import json
import numpy as np

nltk.download("punkt")

filenames = os.listdir(PROCESSED_BOOKS_FOLDER_PATH)
embedding_functions = list(accessor.EMBEDDING_FUNCTIONS.keys())
stitch_title = "TITLE: {}\n"

for filename in filenames:
    with open(os.path.join(PROCESSED_BOOKS_FOLDER_PATH, filename), "r", encoding="utf-8") as file:
        text = file.read()
        print(f"Tokenizing {filename}")
        
        for embedding_function in embedding_functions:
            # Fixed size chunk text embedding
            suffix = "512_chunks"
            if not embedding_tracking.is_file_processed(filename, "{}_{}".format(embedding_function, suffix)):
                chunked_texts = [stitch_title.format(filename) + text[i:i+512] for i in range(0, len(text), 512)]
                try:
                    ids = accessor.insert(
                        data=chunked_texts,
                        embedding_func=embedding_function,
                        custom_suffix=suffix,
                    )
                    embedding_tracking.mark_file_processed(filename, "{}_{}".format(embedding_function, suffix))
                except Exception as e:
                    print(f"Error storing {suffix} for {filename}: {e}")
            else:
                print(f"Skipping {filename} for {embedding_function} as it has already been processed with suffix {suffix}.")
                
            # Recursive character text splitting
            suffix = "recursive_chunks"
            if not embedding_tracking.is_file_processed(filename, "{}_{}".format(embedding_function, suffix)):
                text_splitter = RecursiveCharacterTextSplitter(
                    chunk_size=512,
                    chunk_overlap=100,
                    length_function=len,
                )
                chunks = text_splitter.split_text(text)
                chunks = [stitch_title.format(filename) + chunk for chunk in chunks]
                try:
                    ids = accessor.insert(
                        data=chunks,
                        embedding_func=embedding_function,
                        custom_suffix=suffix,
                    )
                    embedding_tracking.mark_file_processed(filename, "{}_{}".format(embedding_function, suffix))
                except Exception as e:
                    print(f"Error storing {suffix} for {filename}: {e}")
                    
            # Separator text splitting
            suffix = "separator_chunks"
            if not embedding_tracking.is_file_processed(filename, "{}_{}".format(embedding_function, suffix)):
                text_splitter = RecursiveCharacterTextSplitter(
                    chunk_size=512,
                    chunk_overlap=100,
                    length_function=len,
                    separators=["CHAPTER", "\n\n", "\n", " ", ""],
                )
                chunks = text_splitter.split_text(text)
                chunks = [stitch_title.format(filename) + chunk for chunk in chunks]
                try:
                    ids = accessor.insert(
                        data=chunks,
                        embedding_func=embedding_function,
                        custom_suffix=suffix,
                    )
                    embedding_tracking.mark_file_processed(filename, "{}_{}".format(embedding_function, suffix))
                except Exception as e:
                    print(f"Error storing {suffix} for {filename}: {e}")
            
            # Semantic chunking
            suffix = "semantic_chunks"
            if not embedding_tracking.is_file_processed(filename, "{}_{}".format(embedding_function, suffix)):
                sentences = sent_tokenize(text)
                threshold = 0.7 # can be adjusted
                chunks = []
                curr_chunk = [stitch_title.format(filename), sentences[0]]
                for i in range(0, len(sentences)-1):
                    
                    embeddings_1 = accessor.EMBEDDING_FUNCTIONS.get(embedding_function).embed_query(sentences[i])
                    embeddings_2 = accessor.EMBEDDING_FUNCTIONS.get(embedding_function).embed_query(sentences[i+1])
                    
                    emb_1 = np.array(embeddings_1)
                    emb_2 = np.array(embeddings_2)

                    if np.linalg.norm(emb_1) == 0 or np.linalg.norm(emb_2) == 0:
                        continue

                    try:
                        similarity = cosine_similarity([emb_1], [emb_2])[0][0]
                    except Exception as e:
                        print(f"Error calculating similarity for {sentences[i]} and {sentences[i+1]}: {e}")
                        continue

                    if similarity > threshold:
                        chunks.append(curr_chunk)
                        curr_chunk = [stitch_title.format(filename), sentences[i+1]]
                        continue
                    
                    curr_chunk.append(sentences[i+1])
                
                try:
                    ids = accessor.insert(
                        data=[" ".join(chunk) for chunk in chunks],
                        embedding_func=embedding_function,
                        custom_suffix=suffix,
                    )
                    embedding_tracking.mark_file_processed(filename, "{}_{}".format(embedding_function, suffix))
                except Exception as e:
                    print(f"Error storing {suffix} for {filename}: {e}")
            
    print(f"Finished processing {filename} for embeddings.")
    
# LLM based chunking
suffix = "llm_chunked"
llm_chunked_fnames = ["all_fours_txt_chunks.json", "sunrise_on_the_reaping_chunks.json"]
for fname in llm_chunked_fnames:
    embedding_tracking.add_new_file(fname)
    
for embedding_function in embedding_functions:
    for fname in llm_chunked_fnames:
        if not embedding_tracking.is_file_processed(fname, "{}_{}".format(embedding_function, suffix)):
            try:
                with open(os.path.join("../llm_chunked_data", fname), "r", encoding="utf-8") as f:
                    chunks = json.load(f)
                
                chunks = [stitch_title.format(fname) + chunk for chunk in chunks]
                ids = accessor.insert(
                    data=chunks,
                    embedding_func=embedding_function,
                    custom_suffix=fname,
                )
                embedding_tracking.mark_file_processed(fname, "{}_{}".format(embedding_function, suffix))
            except Exception as e:
                print(f"Error storing {suffix} for {fname}: {e}")
        else:
            print(f"Skipping {fname} for {embedding_function} as it has already been processed with suffix {suffix}.")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/yudhistiraonggowarsito/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Tokenizing All Fours (Miranda July).txt
Skipping All Fours (Miranda July).txt for nomic-embed-text as it has already been processed with suffix 512_chunks.
Inserted data with IDs: ['a598e6f9-1fa3-4ea1-8589-706717a69794', 'dcd9c87c-3328-4da4-9393-401196f1a558', 'b5cf1127-735c-4a90-a9d9-7b96f58fe011', '561b4c40-7205-421f-8396-93e9c68d7345', '62a8cc50-616b-4eab-9511-7136062e3b17', '09d8bcaf-41a5-40dc-ae5e-5d856b7b8b41', 'f3f9bff0-f40b-472b-a0b4-7ec18f3ce09c', 'ec52d4bc-e418-4605-a985-c31e0a5113f2', '07d3ba6f-06aa-49d5-9f57-444525065d0a', '8dde5347-704d-4691-9542-988a346274a1', '4972a756-9015-4ab1-a416-0888c9ac6e90', '7eb5b6d0-a542-4145-b22e-9ea26edbf866', '14aef551-53c2-485a-98a1-355e52c55f00', 'dfc4b68e-4418-4aef-8e50-0ef3c367b900', '52a79304-f742-4ee9-9796-dfbc6c661496', 'ba0dcb74-7001-4860-a803-9bcd0ba20e99', '17fa66d3-530a-4a06-b321-3b5d2e9c6338', '65bdf384-f2c9-4df9-9bb6-4f4eaad4cc93', '90c78e25-7006-4fb3-b36e-9083e730a75f', '85e387f3-4a6d-4f7e-8793-7fede3b35f62', '151fb7d9-f617-403a

In [9]:
available_collections = accessor.list_collections()
print(f"Available collections: {available_collections}")

documents = accessor.get(
    query="What book has the word Sunrise in it?",
    embedding_func="nomic-embed-text",
    custom_suffix="recursive_chunks",
    n_results=10
)
for i in documents:
    print(i.page_content)
    print("-------------------------")

Available collections: ['nomic-embed-text_collection_separator_chunks', 'bge-m3_collection_512_chunks', 'bge-m3_collection_recursive_chunks', 'bge-m3_collection_sunrise_on_the_reaping_chunks.json', 'mxbai-embed-large_collection_all_fours_txt_chunks.json', 'mxbai-embed-large_collection_recursive_chunks', 'mxbai-embed-large_collection_512_chunks', 'bge-m3_collection_all_fours_txt_chunks.json', 'nomic-embed-text_collection_sunrise_on_the_reaping_chunks.json', 'nomic-embed-text_collection_all_fours_txt_chunks.json', 'bge-m3_collection_separator_chunks', 'mxbai-embed-large_collection_sunrise_on_the_reaping_chunks.json', 'nomic-embed-text_collection_recursive_chunks', 'mxbai-embed-large_collection_separator_chunks', 'nomic-embed-text_collection_512_chunks', 'langchain']
TITLE: Sunrise on the Reaping.txt
or locales is entirely coincidental. Library of Congress Cataloging-in-Publication Data available e-ISBN 978-1-5461-7147-8 First edition, March 2025 Jacket and book design by Elizabeth B. Par