In [11]:
# execute as needed
# !pip install -r requirements.txt

## Process Books

In [15]:
import sys

# Put the parent directory on the import path; later cells import from a
# `modules` package (presumably located there — TODO confirm).
sys.path.append("../")

# Folder scanned for source books (the extraction cell reads PDFs from here).
RAW_BOOKS_FOLDER_PATH = "../raw_data/books/"
# Folder where the extracted plain-text version of each book is written.
PROCESSED_BOOKS_FOLDER_PATH = "../processed_data/books/"

In [16]:
import os
import fitz
from modules.embedding_tracking import add_new_file, file_exists

# Convert every PDF in the raw-books folder into a whitespace-normalised .txt
# file, skipping books for which `file_exists` already reports an entry.

# Create the output folder up front so the first write cannot fail on a fresh checkout.
os.makedirs(PROCESSED_BOOKS_FOLDER_PATH, exist_ok=True)

available_files = os.listdir(RAW_BOOKS_FOLDER_PATH)

for raw_fname in available_files:
    print(f"Processing {raw_fname}")

    # Reject unsupported entries (e.g. .DS_Store) before deriving an output
    # name, so the tracker is never queried with a bogus name such as ".txt".
    if not raw_fname.endswith(".pdf"):
        print(f"Skipping {raw_fname} as it is not a valid file.")
        continue

    processed_fname = os.path.splitext(raw_fname)[0] + ".txt"
    print(processed_fname)

    if file_exists(processed_fname):
        print(f"Skipping {raw_fname} as it has already been processed.")
        continue

    # The context manager closes the document even if text extraction raises.
    page_texts = []
    with fitz.open(os.path.join(RAW_BOOKS_FOLDER_PATH, raw_fname)) as pdf_document:
        for page in pdf_document:
            # Collapse every run of whitespace (including newlines) into a single space.
            page_texts.append(" ".join(page.get_text().split()))

    # One line per page, each terminated by a newline.
    text = "".join(page_text + "\n" for page_text in page_texts)

    with open(os.path.join(PROCESSED_BOOKS_FOLDER_PATH, processed_fname), "w", encoding="utf-8") as f:
        f.write(text)

    # Register the book only after the output file has been written and closed.
    add_new_file(processed_fname)

    print(f"Finished processing {raw_fname}.")

Processing .DS_Store
.txt
Skipping .DS_Store as it is not a valid file.
Processing Sunrise on the Reaping.pdf
Sunrise on the Reaping.txt
Skipping Sunrise on the Reaping.pdf as it has already been processed.
Processing All Fours (Miranda July).pdf
All Fours (Miranda July).txt
Skipping All Fours (Miranda July).pdf as it has already been processed.


### Store book chunks in the vector store

In [17]:
# https://medium.com/the-ai-forum/semantic-chunking-for-rag-f4733025d5f5

from modules import accessor, embedding_tracking
from langchain.text_splitter import RecursiveCharacterTextSplitter
from nltk.tokenize import sent_tokenize
import nltk
from sklearn.metrics.pairwise import cosine_similarity
import re
from IPython import get_ipython
import json

nltk.download("punkt")

# Tunable chunking parameters shared by every strategy below.
CHUNK_SIZE = 512
CHUNK_OVERLAP = 100
SEMANTIC_SIMILARITY_THRESHOLD = 0.7  # can be adjusted


def _fixed_size_chunks(text, size=CHUNK_SIZE):
    """Cut `text` into consecutive, non-overlapping slices of at most `size` characters."""
    return [text[start:start + size] for start in range(0, len(text), size)]


def _recursive_chunks(text, separators=None):
    """Split `text` with RecursiveCharacterTextSplitter.

    `separators=None` keeps the splitter's default separator list; otherwise the
    given list is passed through unchanged.
    """
    splitter_kwargs = {
        "chunk_size": CHUNK_SIZE,
        "chunk_overlap": CHUNK_OVERLAP,
        "length_function": len,
    }
    if separators is not None:
        splitter_kwargs["separators"] = separators
    return RecursiveCharacterTextSplitter(**splitter_kwargs).split_text(text)


def _separator_chunks(text):
    """Recursive split that tries the literal "CHAPTER" marker before the generic separators."""
    return _recursive_chunks(text, separators=["CHAPTER", "\n\n", "\n", " ", ""])


def _semantic_chunks(sentences, embed, threshold=SEMANTIC_SIMILARITY_THRESHOLD):
    """Group consecutive sentences into chunks of semantically similar text.

    A new chunk starts whenever the cosine similarity between a sentence and
    the one before it drops below `threshold`.

    Args:
        sentences: list of sentence strings, in document order.
        embed: callable mapping one sentence to its embedding vector.
        threshold: minimum similarity for two neighbours to share a chunk.

    Returns:
        A list of chunk strings (sentences joined by a single space). Empty
        input yields an empty list.
    """
    if not sentences:
        return []
    if len(sentences) == 1:
        # Nothing to compare against, so no embedding call is needed.
        return [sentences[0]]

    chunks = []
    curr_chunk = [sentences[0]]
    # Carry the previous embedding forward so each sentence is embedded once.
    prev_embedding = embed(sentences[0])
    for sentence in sentences[1:]:
        embedding = embed(sentence)
        similarity = cosine_similarity([prev_embedding], [embedding])[0][0]
        if similarity < threshold:
            # Topic shift: close the running chunk and start a new one.
            chunks.append(" ".join(curr_chunk))
            curr_chunk = [sentence]
        else:
            curr_chunk.append(sentence)
        prev_embedding = embedding

    # Flush the trailing chunk, which would otherwise be lost.
    chunks.append(" ".join(curr_chunk))
    return chunks


def _needs_processing(filename, embedding_function, suffix):
    """Return True when the tracker has no record of this file/model/strategy combination."""
    if embedding_tracking.is_file_processed(filename, "{}_{}".format(embedding_function, suffix)):
        print(f"Skipping {filename} for {embedding_function} as it has already been processed with suffix {suffix}.")
        return False
    return True


def _insert_and_mark(filename, embedding_function, suffix, chunks):
    """Insert `chunks` through the accessor and record success in the tracker.

    Failures are reported and swallowed so one bad combination does not stop
    the rest of the run; the combination is then left unmarked for a later retry.
    """
    try:
        accessor.insert(
            data=chunks,
            embedding_func=embedding_function,
            custom_suffix=suffix,
        )
        embedding_tracking.mark_file_processed(filename, "{}_{}".format(embedding_function, suffix))
    except Exception as e:
        print(f"Error storing {suffix} for {filename}: {e}")


# Strategies that only need the raw text, keyed by their collection suffix.
TEXT_CHUNKERS = {
    "512_chunks": _fixed_size_chunks,
    "recursive_chunks": _recursive_chunks,
    "separator_chunks": _separator_chunks,
}

# Only the .txt files written by the extraction step are book texts; this
# skips stray entries such as .DS_Store that are not valid UTF-8 text.
filenames = sorted(
    name for name in os.listdir(PROCESSED_BOOKS_FOLDER_PATH) if name.endswith(".txt")
)
embedding_functions = list(accessor.EMBEDDING_FUNCTIONS.keys())

for filename in filenames:
    # Read the whole book, then release the file handle before the slow embedding work.
    with open(os.path.join(PROCESSED_BOOKS_FOLDER_PATH, filename), "r", encoding="utf-8") as file:
        text = file.read()
    print(f"Tokenizing {filename}")

    # Sentence tokenisation does not depend on the embedding model, so it is
    # computed lazily and at most once per file.
    sentences = None

    for embedding_function in embedding_functions:
        # Full-text embedding (suffix "full_text") is currently disabled. To re-enable:
        # if _needs_processing(filename, embedding_function, "full_text"):
        #     _insert_and_mark(filename, embedding_function, "full_text", [text])

        # Fixed-size, recursive and separator-based chunking.
        for suffix, chunker in TEXT_CHUNKERS.items():
            if _needs_processing(filename, embedding_function, suffix):
                _insert_and_mark(filename, embedding_function, suffix, chunker(text))

        # Semantic chunking
        suffix = "semantic_chunks"
        if _needs_processing(filename, embedding_function, suffix):
            if sentences is None:
                sentences = sent_tokenize(text)
            embed = accessor.EMBEDDING_FUNCTIONS.get(embedding_function).embed_query
            _insert_and_mark(filename, embedding_function, suffix, _semantic_chunks(sentences, embed))

    print(f"Finished processing {filename} for embeddings.")
    
# LLM based chunking
# The chunk lists for these books are read from JSON files under
# ../llm_chunked_data rather than being computed in this notebook.
suffix = "llm_chunked"
llm_chunked_fnames = ["all_fours_txt_chunks.json", "sunrise_on_the_reaping_chunks.json"]

# Register every chunk file with the tracker before any processing check is made.
for fname in llm_chunked_fnames:
    embedding_tracking.add_new_file(fname)

for embedding_function in embedding_functions:
    # The tracking key depends only on the embedding model and the strategy suffix.
    tracking_key = "{}_{}".format(embedding_function, suffix)

    for fname in llm_chunked_fnames:
        if embedding_tracking.is_file_processed(fname, tracking_key):
            print(f"Skipping {fname} for {embedding_function} as it has already been processed with suffix {suffix}.")
            continue

        chunk_path = os.path.join("../llm_chunked_data", fname)
        try:
            with open(chunk_path, "r", encoding="utf-8") as chunk_file:
                chunks = json.load(chunk_file)

            # NOTE(review): the collection suffix is the file name here, while
            # the tracking key uses `suffix` — confirm this is intended.
            ids = accessor.insert(
                data=chunks,
                embedding_func=embedding_function,
                custom_suffix=fname,
            )
            embedding_tracking.mark_file_processed(fname, tracking_key)
        except Exception as e:
            # Read, insert and tracking failures are all reported the same way.
            print(f"Error storing {suffix} for {fname}: {e}")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/yudhistiraonggowarsito/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Tokenizing All Fours (Miranda July).txt
Error storing 512_chunks for All Fours (Miranda July).txt: Query error: Database error: error returned from database: (code: 1032) attempt to write a readonly database


KeyboardInterrupt: 

In [None]:
# Show which collections the accessor currently exposes.
available_collections = accessor.list_collections()
print(f"Available collections: {available_collections}")

# Sample retrieval against the recursive-chunk collection of one embedding model.
query_params = {
    "query": "What book has the word Sunrise in it?",
    "embedding_func": "all-minilm",
    "custom_suffix": "recursive_chunks",
    "n_results": 10,
}
documents = accessor.get(**query_params)

# Print the first 100 characters of each hit, followed by a separator line.
for document in documents:
    preview = document.page_content[:100]
    print(preview)
    print("-------------------------")
    print("-------------------------")

Available collections: ['nomic-embed-text_collection_full_text', 'mxbai-embed-large_collection_512_chunks', 'all-minilm_collection_512_chunks', 'bge-m3_collection_512_chunks', 'snowflake-arctic-embed_collection_512_chunks', 'nomic-embed-text_collection_512_chunks', 'bge-m3_collection_full_text', 'snowflake-arctic-embed2_collection_full_text', 'snowflake-arctic-embed_collection_full_text', 'mxbai-embed-large_collection_full_text', 'snowflake-arctic-embed2_collection_512_chunks', 'langchain', 'all-minilm_collection_full_text']
eps the District 12 of fice in shape — thanks for the super job you do
making sure everything runs s
-------------------------
or design adapted for ebook by Cora W igen
This is a work of fiction. Names, characters, places, and
-------------------------
 is something at my window lattice;
Let me see, then, what ther eat is, and this mystery explor e —

-------------------------
into
focus. They’re not lined with instruments of pain but towering shelves of
books. Th