In [None]:
# Uncomment and run once if the dependencies are not yet installed in this kernel's environment.
# %pip install -r requirements.txt

## Process Books

In [1]:
import sys

# Add the parent directory to the import path; presumably this is where the
# project's `modules` package (imported by later cells) lives.
sys.path.append("../")

# Folder that is scanned for source books to convert.
RAW_BOOKS_FOLDER_PATH = "../raw_data/books/"
# Folder that receives one extracted .txt file per converted book.
PROCESSED_BOOKS_FOLDER_PATH = "../processed_data/books/"

In [2]:
import os
import PyPDF2
from modules.embedding_tracking import add_new_file, file_exists

PDF_EXTENSION = ".pdf"


def extract_pdf_text(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``, in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A single string with the extracted text of all pages concatenated.
    """
    reader = PyPDF2.PdfReader(pdf_path)
    # join() avoids re-copying an ever-growing string for books with many pages;
    # `or ""` is a defensive guard so a page that yields nothing cannot break
    # the concatenation.
    return "".join(page.extract_text() or "" for page in reader.pages)


# open(..., "w") below does not create missing folders, so make sure the
# output folder exists (e.g. on a fresh checkout).
os.makedirs(PROCESSED_BOOKS_FOLDER_PATH, exist_ok=True)

available_files = os.listdir(RAW_BOOKS_FOLDER_PATH)

for raw_name in available_files:
    print(f"Processing {raw_name}")

    # Reject unsupported files first, so the tracker is never queried with a
    # name derived from stray entries such as ".DS_Store".
    if not raw_name.endswith(PDF_EXTENSION):
        print(f"Skipping {raw_name} as it is not a valid file.")
        continue

    # Same name as before for every *.pdf input (extension swapped for .txt),
    # so existing tracking records keep matching.
    processed_fname = raw_name[:-len(PDF_EXTENSION)] + ".txt"

    if file_exists(processed_fname):
        print(f"Skipping {raw_name} as it has already been processed.")
        continue

    text = extract_pdf_text(os.path.join(RAW_BOOKS_FOLDER_PATH, raw_name))

    with open(os.path.join(PROCESSED_BOOKS_FOLDER_PATH, processed_fname), "w", encoding="utf-8") as f:
        f.write(text)
    # Register the file only after the `with` block has closed (and flushed) it,
    # so a tracked entry always corresponds to a completely written file.
    add_new_file(processed_fname)

    print(f"Finished processing {raw_name}.")

Processing .DS_Store
Skipping .DS_Store as it is not a valid file.
Processing Sunrise on the Reaping.pdf
Skipping Sunrise on the Reaping.pdf as it has already been processed.
Processing All Fours (Miranda July).pdf
Skipping All Fours (Miranda July).pdf as it has already been processed.


### Store processed text as vector embeddings

In [4]:
from modules import accessor
from modules import embedding_tracking

# Number of characters per chunk; the "512_chunks" collection suffix refers to this size.
CHUNK_SIZE = 512


def chunk_text(text, size=CHUNK_SIZE):
    """Split ``text`` into consecutive, non-overlapping slices of at most ``size`` characters.

    Args:
        text: The string to split.
        size: Maximum length of each slice.

    Returns:
        A list of slices in original order; empty when ``text`` is empty.
    """
    return [text[start:start + size] for start in range(0, len(text), size)]


def store_embeddings(filename, data, embedding_function, suffix):
    """Insert ``data`` for one (embedding function, suffix) pair unless it is already tracked.

    Args:
        filename: Name of the processed text file the data came from.
        data: List of strings handed to ``accessor.insert``.
        embedding_function: Key from ``accessor.EMBEDDING_FUNCTIONS``.
        suffix: Collection suffix, e.g. "full_text" or "512_chunks".

    Errors raised while inserting or recording are reported and swallowed, so
    one failing combination does not stop the remaining ones.
    """
    tracking_key = "{}_{}".format(embedding_function, suffix)

    if embedding_tracking.is_file_processed(filename, tracking_key):
        print(f"Skipping {filename} for {embedding_function} as it has already been processed with suffix {suffix}.")
        return

    try:
        accessor.insert(
            data=data,
            embedding_func=embedding_function,
            custom_suffix=suffix,
        )
        # Record exactly the key that is_file_processed() is asked about above.
        # Previously only `suffix` was recorded, so the check never matched and
        # every run inserted the same data again.
        embedding_tracking.mark_file_processed(filename, tracking_key)
    except Exception as e:
        print(f"Error storing {suffix} for {filename}: {e}")


# The conversion step writes only .txt files into this folder; ignore anything
# else (e.g. ".DS_Store") rather than trying to decode it as UTF-8 text.
filenames = [name for name in os.listdir(PROCESSED_BOOKS_FOLDER_PATH) if name.endswith(".txt")]
embedding_functions = list(accessor.EMBEDDING_FUNCTIONS.keys())

for filename in filenames:
    # Read the whole file, then release the handle before the insert calls run.
    with open(os.path.join(PROCESSED_BOOKS_FOLDER_PATH, filename), "r", encoding="utf-8") as file:
        text = file.read()
    print(f"Tokenizing {filename}")

    # The payloads are the same for every embedding function, so build them once per file.
    variants = [
        ("full_text", [text]),
        ("512_chunks", chunk_text(text)),
    ]

    for embedding_function in embedding_functions:
        for suffix, data in variants:
            store_embeddings(filename, data, embedding_function, suffix)

    print(f"Finished processing {filename} for embeddings.")

Tokenizing All Fours (Miranda July).txt
Inserted data with IDs: ['f433acca-e72e-42b2-b700-b8d23a4848aa'] into collection 'mxbai-embed-large_collection_full_text'.
Inserted data with IDs: ['38fb6012-e535-4e73-a528-360a24af079c', '7543e5e1-6103-4c79-837e-0be0d13aaf04', 'da85d567-b20d-4b11-900a-b86d223d63ff', '3e563dac-cef8-4bfd-8fcb-a90cc77c20fd', '7c1e4363-f7e9-4643-ba3c-d29031c0f457', '61bd673a-f0c2-44af-a86b-5c36e1258f6b', 'c727cdbc-f1cb-4181-9f15-af67c76ec41f', '258310c9-5c4e-4c4d-818e-dad4d218d22a', '5fb8b668-e964-4b6b-87f1-1d58773063e1', 'f21d3146-6d41-414a-88e1-f3ff87d08c28', 'c99fa8a6-823c-4136-b228-17f33d5f5a1d', 'd2034b6a-b7d5-4680-a066-eaca0a0fcbdf', '5b3f4f77-d0b9-4298-a731-16db2eebaf6f', '62ae255c-d993-4beb-a95a-ebdc258e1d30', 'd788bf22-cd78-4828-b5ac-d30ca92f549b', 'b559e509-d9a0-44db-b052-792befac4f2e', '1c8c8db7-deaf-47c1-a809-06177af6b7af', '9d59491d-33f3-457f-b929-92ca92988bd7', '0de83793-d691-404d-871f-d42f01a8bf4e', '2f0fb56c-faa9-4b87-b5e0-6dba2d7e44fa', '1a78c446-14

In [16]:
# List every collection known to the accessor.
available_collections = accessor.list_collections()
print(f"Available collections: {available_collections}")

# Sample retrieval against the 512-character chunk collection of one embedding model.
query_kwargs = dict(
    query="What book has the word Sunrise in it?",
    embedding_func="all-minilm",
    custom_suffix="512_chunks",
    n_results=10,
)
documents = accessor.get(**query_kwargs)

# Preview the first 100 characters of each hit, followed by a separator line.
for doc in documents:
    print(doc.page_content[:100], "-------------------------", sep="\n")

Available collections: ['mxbai-embed-large_collection_full_text', 'snowflake-arctic-embed_collection_full_text', 'nomic-embed-text_collection_512_chunks', 'snowflake-arctic-embed_collection_512_chunks', 'all-minilm_collection_full_text', 'langchain', 'snowflake-arctic-embed2_collection_512_chunks', 'all-minilm_collection_512_chunks', 'mxbai-embed-large_collection_512_chunks', 'bge-m3_collection_512_chunks', 'nomic-embed-text_collection_full_text', 'bge-m3_collection_full_text', 'snowflake-arctic-embed2_collection_full_text']
eps the District 12 of fice in shape — thanks for the super job you do
making sure everything runs s
-------------------------
or design adapted for ebook by Cora W igen
This is a work of fiction. Names, characters, places, and
-------------------------
 is something at my window lattice;
Let me see, then, what ther eat is, and this mystery explor e —

-------------------------
into
focus. They’re not lined with instruments of pain but towering shelves of
books. Th

## Process Movie Transcripts