In [None]:
# Run once if the dependencies are not installed yet (uncomment the line below)
# !pip install -r requirements.txt

Collecting transformers (from -r requirements.txt (line 4))
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting regex!=2019.12.17 (from transformers->-r requirements.txt (line 4))
  Downloading regex-2024.11.6-cp312-cp312-macosx_11_0_arm64.whl.metadata (40 kB)
Collecting safetensors>=0.4.3 (from transformers->-r requirements.txt (line 4))
  Using cached safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl.metadata (3.8 kB)
Downloading transformers-4.52.4-py3-none-any.whl (10.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0mm
[?25hDownloading regex-2024.11.6-cp312-cp312-macosx_11_0_arm64.whl (284 kB)
Using cached safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl (418 kB)
Installing collected packages: safetensors, regex, transformers
Successfully installed regex-2024.11.6 safetensors-0.5.3 transformers-4.52.4

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new releas

## Process Books

In [1]:
import os
import PyPDF2

# Folder that is scanned for source books; only entries ending in ".pdf" are converted.
RAW_BOOKS_FOLDER_PATH = "raw_data/books/"
# Folder that receives the extracted plain text of each book as a ".txt" file.
PROCESSED_BOOKS_FOLDER_PATH = "processed_data/books/"

In [None]:
# Convert every PDF found in RAW_BOOKS_FOLDER_PATH into a UTF-8 text file with
# the same base name inside PROCESSED_BOOKS_FOLDER_PATH.
available_files = os.listdir(RAW_BOOKS_FOLDER_PATH)

# open(..., "w") does not create missing parent directories, so make sure the
# destination exists before the first book is written.
os.makedirs(PROCESSED_BOOKS_FOLDER_PATH, exist_ok=True)

for entry in available_files:
    print(f"Processing {entry}")

    # Only PDFs are supported. The extension is compared case-insensitively so
    # a file named "Book.PDF" is converted instead of being skipped.
    if not entry.lower().endswith(".pdf"):
        # invalid file types
        print(f"Skipping {entry} as it is not a valid file.")
        continue

    reader = PyPDF2.PdfReader(os.path.join(RAW_BOOKS_FOLDER_PATH, entry))
    # Defensive `or ""`: a page that yields no extractable text must not break
    # the concatenation. join() also avoids repeated string re-allocation.
    text = "".join((page.extract_text() or "") for page in reader.pages)

    # splitext removes only the final extension, so "a.b.pdf" becomes "a.b.txt".
    output_name = os.path.splitext(entry)[0] + ".txt"
    with open(os.path.join(PROCESSED_BOOKS_FOLDER_PATH, output_name), "w", encoding="utf-8") as f:
        f.write(text)

    print(f"Finished processing {entry}.")

Processing .DS_Store
Skipping .DS_Store as it is not a valid file.
Processing Sunrise on the Reaping.pdf
Finished processing Sunrise on the Reaping.pdf.
Processing All Fours (Miranda July).pdf
Finished processing All Fours (Miranda July).pdf.


### Store book text as vector embeddings

#### BERT

In [17]:
from transformers import AutoTokenizer, AutoModel
import transformers

# Checkpoint name shared by the tokenizer and the model loaded below.
model_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The model is loaded through the concrete BertModel class rather than the
# AutoModel class imported above.
bert_model = transformers.BertModel.from_pretrained(model_name)
# Handles to the model's embedding sub-modules. They are not referenced by the
# other cells visible in this section of the notebook.
bert_embedding_layer = bert_model.embeddings
bert_word_embeddings = bert_embedding_layer.word_embeddings

def generate_bert_embeddings(text, truncate=False):
    """Return BERT's final-layer hidden states for ``text`` as nested Python lists.

    Uses the module-level ``tokenizer`` and ``bert_model``.

    Args:
        text: Input handed to the tokenizer.
        truncate: Forwarded as the tokenizer's ``truncation`` flag. When true,
            the input is cut to ``max_length=512`` tokens (special tokens
            included).

    Returns:
        ``last_hidden_state`` converted to nested lists, with the leading
        batch axis dropped when it has size 1. For a single string this is one
        vector per token.

    Raises:
        ValueError: If the tokenized input is longer than the model's position
            embeddings allow. This only happens with ``truncate=False``, where
            the model would otherwise fail with an opaque indexing error.
    """
    # Local import keeps the function usable without touching the import cell;
    # torch is already required by return_tensors="pt".
    import torch

    inputs = tokenizer(text, return_tensors="pt", truncation=truncate, padding=True, add_special_tokens=True, max_length=512)

    # Fail early with an actionable message instead of an index error raised
    # from inside the embedding lookup.
    max_positions = bert_model.config.max_position_embeddings
    sequence_length = inputs["input_ids"].shape[-1]
    if sequence_length > max_positions:
        raise ValueError(
            f"Input is {sequence_length} tokens long but the model accepts at most "
            f"{max_positions}; pass truncate=True or split the text into smaller chunks."
        )

    # Inference only: skip autograd bookkeeping, which otherwise keeps every
    # intermediate activation alive for a backward pass that never happens.
    with torch.no_grad():
        outputs = bert_model(**inputs)

    # squeeze(0) drops only the batch axis; a bare squeeze() would collapse
    # any other axis that happened to have size 1 as well.
    return outputs.last_hidden_state.squeeze(0).detach().numpy().tolist()

# Smoke test: embed one short string. Because this is the cell's last bare
# expression, the notebook renders the complete nested list of floats as output.
generate_bert_embeddings("test")



[[0.18179400265216827,
  0.11388609558343887,
  0.1243995651602745,
  -0.4696972966194153,
  -0.16297216713428497,
  -0.007843047380447388,
  0.19663424789905548,
  -0.3071346580982208,
  0.23120948672294617,
  -0.9543548822402954,
  -0.6488723754882812,
  0.020990587770938873,
  -0.202029287815094,
  0.26552772521972656,
  -0.620647668838501,
  0.127091184258461,
  0.4827876091003418,
  0.23583216965198517,
  0.12683910131454468,
  -0.23082110285758972,
  0.04090723767876625,
  0.16145868599414825,
  0.5038300156593323,
  -0.15852510929107666,
  -0.10843393206596375,
  -0.08590751886367798,
  0.5335981249809265,
  0.02906597964465618,
  0.149443119764328,
  0.4602819085121155,
  0.056582972407341,
  0.4857437014579773,
  -0.14637485146522522,
  -0.17265111207962036,
  -0.03973954916000366,
  -0.0871751457452774,
  0.13444671034812927,
  -0.2703970968723297,
  -0.11369200050830841,
  -0.10413695871829987,
  -0.6094565987586975,
  -0.036320775747299194,
  0.43351906538009644,
  0.167825

In [21]:
from modules.accessor import add_embedding

# Embed every processed book twice and store the results through add_embedding:
#   1. the whole text, truncated to what BERT accepts in one pass;
#   2. fixed-size character chunks, one embedding call per chunk.
#
# Only ".txt" files are picked up. Other directory entries (for example a
# macOS ".DS_Store") are not UTF-8 text and would make file.read() fail.
filenames = [name for name in os.listdir(PROCESSED_BOOKS_FOLDER_PATH) if name.endswith(".txt")]

for filename in filenames:
    # Read the whole book and close the file straight away; the handle is not
    # needed during the slow embedding and database work below.
    with open(os.path.join(PROCESSED_BOOKS_FOLDER_PATH, filename), "r", encoding="utf-8") as file:
        text = file.read()
    print(f"Tokenizing {filename}")

    # Truncated BERT embeddings
    token_embeddings = generate_bert_embeddings(text, truncate=True)

    try:
        add_embedding(
            embeddings=token_embeddings,
            document=filename,
            custom_suffix="full_text_bert_truncated",
            metadata=None
        )
    except Exception as e:
        # Best effort: report the failure and still attempt the chunked pass.
        print(f"Error adding embedding for {filename}: {e}")
        print("Embedding Spec:", len(token_embeddings))

    # Chunked BERT embeddings
    # NOTE: chunk_size counts CHARACTERS, not tokens. 512 characters normally
    # tokenize to far fewer than BERT's 512-token limit, but a chunk made of
    # mostly one-character tokens (e.g. dot leaders in a table of contents)
    # can exceed it once the special tokens are added.
    chunk_size = 512
    chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
    for chunk_number, chunk in enumerate(chunks, start=1):
        print(f"Tokenizing chunk {chunk_number} of {filename}")
        # truncate=True is a safety net for the rare over-long chunk described
        # above; chunks within the limit are embedded exactly as before.
        token_embeddings = generate_bert_embeddings(chunk, truncate=True)

        try:
            add_embedding(
                embeddings=token_embeddings,
                document=filename,
                custom_suffix="chunked_text_bert",
                metadata={"chunk_index": chunk_number}
            )
        except Exception as e:
            print(f"Error adding embedding for chunk {chunk_number} of {filename}: {e}")
            print("Embedding Spec:", len(token_embeddings))
            # Stop at the first failing chunk of this book and move on to the
            # next file rather than repeating the same error for every chunk.
            break

Tokenizing All Fours (Miranda July).txt
dim_768_collection_full_text_bert_truncated
Tokenizing chunk 1 of All Fours (Miranda July).txt




dim_768_collection_chunked_text_bert
Tokenizing chunk 2 of All Fours (Miranda July).txt
dim_768_collection_chunked_text_bert
Tokenizing chunk 3 of All Fours (Miranda July).txt
dim_768_collection_chunked_text_bert
Tokenizing chunk 4 of All Fours (Miranda July).txt
dim_768_collection_chunked_text_bert
Tokenizing chunk 5 of All Fours (Miranda July).txt
dim_768_collection_chunked_text_bert
Tokenizing chunk 6 of All Fours (Miranda July).txt
dim_768_collection_chunked_text_bert
Tokenizing chunk 7 of All Fours (Miranda July).txt
dim_768_collection_chunked_text_bert
Tokenizing chunk 8 of All Fours (Miranda July).txt
dim_768_collection_chunked_text_bert
Tokenizing chunk 9 of All Fours (Miranda July).txt
dim_768_collection_chunked_text_bert
Tokenizing chunk 10 of All Fours (Miranda July).txt
dim_768_collection_chunked_text_bert
Tokenizing chunk 11 of All Fours (Miranda July).txt
dim_768_collection_chunked_text_bert
Tokenizing chunk 12 of All Fours (Miranda July).txt
dim_768_collection_chunked_te

In [None]:
# Uncomment the lines below to delete every collection from the database (destructive)
# from modules.accessor import list_collections, delete_collection

# for i in list_collections():
#     delete_collection(i)

Collection 'dim_768_collection_chunked_text_bert' deleted.
Collection 'dim_768_collection_full_text_bert_truncated' deleted.


In [31]:
from modules.accessor import list_collections, query_embedding

# Show which collections currently exist in the store.
available_collections = list_collections()
print(f"Available collections: {available_collections}")

# Embed the question with the same BERT pipeline that produced the stored
# vectors, then look up the five closest entries in the truncated full-text
# collection.
question_embeddings = generate_bert_embeddings("What book has the word Sunrise in it?")
query_result = query_embedding(
    query=question_embeddings,
    custom_suffix="full_text_bert_truncated",
    n_results=5,
)

# Only the matched document names are of interest here.
print(query_result['documents'])

Available collections: ['dim_768_collection_chunked_text_bert', 'dim_768_collection_full_text_bert_truncated']
[['Sunrise on the Reaping.txt', 'Sunrise on the Reaping.txt', 'Sunrise on the Reaping.txt', 'Sunrise on the Reaping.txt', 'Sunrise on the Reaping.txt'], ['All Fours (Miranda July).txt', 'Sunrise on the Reaping.txt', 'Sunrise on the Reaping.txt', 'Sunrise on the Reaping.txt', 'Sunrise on the Reaping.txt'], ['All Fours (Miranda July).txt', 'All Fours (Miranda July).txt', 'All Fours (Miranda July).txt', 'All Fours (Miranda July).txt', 'Sunrise on the Reaping.txt'], ['All Fours (Miranda July).txt', 'Sunrise on the Reaping.txt', 'All Fours (Miranda July).txt', 'Sunrise on the Reaping.txt', 'All Fours (Miranda July).txt'], ['All Fours (Miranda July).txt', 'Sunrise on the Reaping.txt', 'Sunrise on the Reaping.txt', 'Sunrise on the Reaping.txt', 'Sunrise on the Reaping.txt'], ['All Fours (Miranda July).txt', 'All Fours (Miranda July).txt', 'All Fours (Miranda July).txt', 'Sunrise on t

## Process Movie Transcripts