In [1]:
# Run once if the dependencies are not installed yet
# (%pip installs into the environment of the running kernel).
# %pip install -r requirements.txt

## Process Books

In [2]:
# Folder that is scanned for source books; only ".pdf" entries are converted.
# The path is relative, so it is resolved against the current working directory.
RAW_BOOKS_FOLDER_PATH = "raw_data/books/"
# Folder that receives the extracted plain-text (".txt") version of each book
# and is later read back to build the embeddings.
PROCESSED_BOOKS_FOLDER_PATH = "processed_data/books/"

In [9]:
import os
import PyPDF2
from modules.embedding_tracking import add_new_file, file_exists

# Convert every PDF in RAW_BOOKS_FOLDER_PATH into a UTF-8 text file in
# PROCESSED_BOOKS_FOLDER_PATH, skipping books the tracker already knows about.

# open(..., "w") does not create missing directories, so make sure the output
# folder exists before the first write.
os.makedirs(PROCESSED_BOOKS_FOLDER_PATH, exist_ok=True)

available_files = os.listdir(RAW_BOOKS_FOLDER_PATH)

for raw_name in available_files:
    print(f"Processing {raw_name}")

    # Validate the file type first so stray entries such as ".DS_Store" are
    # never looked up in the tracker. The comparison is case-insensitive so
    # that "Book.PDF" is accepted as well.
    stem, extension = os.path.splitext(raw_name)
    if extension.lower() != ".pdf":
        # invalid file types
        print(f"Skipping {raw_name} as it is not a valid file.")
        continue

    processed_fname = stem + ".txt"

    if file_exists(processed_fname):
        print(f"Skipping {raw_name} as it has already been processed.")
        continue

    reader = PyPDF2.PdfReader(os.path.join(RAW_BOOKS_FOLDER_PATH, raw_name))
    # Join once instead of growing a string page by page; `or ""` guards
    # against pages for which extract_text() yields nothing.
    text = "".join((page.extract_text() or "") for page in reader.pages)

    with open(os.path.join(PROCESSED_BOOKS_FOLDER_PATH, processed_fname), "w", encoding="utf-8") as f:
        f.write(text)

    # Register the book only after the text file has been written and closed,
    # so the tracker never points at a half-written file.
    add_new_file(processed_fname)

    print(f"Finished processing {raw_name}.")

Processing .DS_Store
Skipping .DS_Store as it is not a valid file.
Processing Sunrise on the Reaping.pdf
Skipping Sunrise on the Reaping.pdf as it has already been processed.
Processing All Fours (Miranda July).pdf
Skipping All Fours (Miranda July).pdf as it has already been processed.


### Store the text as vectors

#### BERT

In [4]:
from transformers import AutoTokenizer, AutoModel
import transformers

# Checkpoint name used to load both the tokenizer and the model.
model_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = transformers.BertModel.from_pretrained(model_name)
# Handles to the model's embedding sub-module and to its word-embedding table.
# generate_bert_embeddings does not use them; it runs the full model instead.
bert_embedding_layer = bert_model.embeddings
bert_word_embeddings = bert_embedding_layer.word_embeddings

def generate_bert_embeddings(text, truncate=False):
    """Run `text` through `bert_model` and return its last hidden state as lists.

    Args:
        text: The string to embed; it is passed straight to `tokenizer`.
        truncate: Forwarded as the tokenizer's `truncation` flag. When True the
            input is cut to `max_length=512` tokens. When False nothing limits
            the sequence length.
            NOTE(review): inputs longer than the model's position limit are
            expected to fail inside the model — confirm before relying on
            `truncate=False` for long text.

    Returns:
        The model's `last_hidden_state` with the batch axis removed, converted
        to nested Python lists. For a single string this is presumably one
        vector per token (special tokens included), not one pooled vector for
        the whole text — TODO confirm against the shape expected by the store.
    """
    # Function-scope import keeps this cell self-contained; torch is already
    # loaded by transformers, so this is only a name lookup after the first call.
    import torch

    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=truncate,
        padding=True,
        add_special_tokens=True,
        max_length=512,
    )

    # Inference only: disabling autograd avoids building a graph that would be
    # discarded immediately.
    with torch.no_grad():
        outputs = bert_model(**inputs)

    # squeeze(0) removes only the batch axis (size 1 for a single string).
    return outputs.last_hidden_state.squeeze(0).detach().numpy().tolist()

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
from modules.accessor import add_embedding
from modules.embedding_tracking import mark_file_processed, is_file_processed

# Embed every processed book twice — once as a single truncated input and once
# chunk by chunk — and record each completed variant in the tracker so that
# re-running this cell skips work that already finished.

# Only ".txt" entries are books; anything else in the folder (for example a
# ".DS_Store" file) would otherwise be opened and decoded as UTF-8 text.
filenames = [
    entry for entry in os.listdir(PROCESSED_BOOKS_FOLDER_PATH) if entry.endswith(".txt")
]

for filename in filenames:
    # Read the whole book up front so the file handle is not held open while
    # the embeddings are being computed and stored.
    with open(os.path.join(PROCESSED_BOOKS_FOLDER_PATH, filename), "r", encoding="utf-8") as file:
        text = file.read()
    print(f"Tokenizing {filename}")

    if not is_file_processed(filename, "full_text_bert_truncated"):
        # Truncated BERT embeddings
        token_embeddings = generate_bert_embeddings(text, truncate=True)

        try:
            add_embedding(
                embeddings=token_embeddings,
                document=filename,
                custom_suffix="full_text_bert_truncated",
                metadata=None
            )
            mark_file_processed(filename, "full_text_bert_truncated")
        except Exception as e:
            print(f"Error adding embedding for {filename}: {e}")
            print("Embedding Spec:", len(token_embeddings))

    if not is_file_processed(filename, "chunked_text_bert"):
        # Chunked BERT embeddings
        chunk_size = 512  # characters per chunk (BERT's max input size is 512 tokens)
        chunks = [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]
        for idx, chunk in enumerate(chunks, start=1):
            print(f"Tokenizing chunk {idx} of {filename}")
            # chunk_size counts characters, not tokens: a 512-character chunk
            # can still exceed 512 tokens once the special tokens are added.
            # truncate=True caps that worst case and leaves ordinary chunks
            # unchanged.
            token_embeddings = generate_bert_embeddings(chunk, truncate=True)

            try:
                add_embedding(
                    embeddings=token_embeddings,
                    document=filename,
                    custom_suffix="chunked_text_bert",
                    metadata={"chunk_index": idx}
                )
            except Exception as e:
                print(f"Error adding embedding for chunk {idx} of {filename}: {e}")
                print("Embedding Spec:", len(token_embeddings))
                break
        else:
            # Runs only when the loop finished without `break`: a book whose
            # chunks failed part-way must not be recorded as complete.
            # NOTE(review): chunks stored before a failure stay in the
            # collection; check how add_embedding handles re-adding them.
            mark_file_processed(filename, "chunked_text_bert")

    print(f"Finished processing {filename} for embeddings.")

Tokenizing All Fours (Miranda July).txt
Tokenizing chunk 1 of All Fours (Miranda July).txt
Tokenizing chunk 2 of All Fours (Miranda July).txt
Tokenizing chunk 3 of All Fours (Miranda July).txt
Tokenizing chunk 4 of All Fours (Miranda July).txt
Tokenizing chunk 5 of All Fours (Miranda July).txt
Tokenizing chunk 6 of All Fours (Miranda July).txt
Tokenizing chunk 7 of All Fours (Miranda July).txt
Tokenizing chunk 8 of All Fours (Miranda July).txt
Tokenizing chunk 9 of All Fours (Miranda July).txt
Tokenizing chunk 10 of All Fours (Miranda July).txt
Tokenizing chunk 11 of All Fours (Miranda July).txt
Tokenizing chunk 12 of All Fours (Miranda July).txt
Tokenizing chunk 13 of All Fours (Miranda July).txt
Tokenizing chunk 14 of All Fours (Miranda July).txt
Tokenizing chunk 15 of All Fours (Miranda July).txt
Tokenizing chunk 16 of All Fours (Miranda July).txt
Tokenizing chunk 17 of All Fours (Miranda July).txt
Tokenizing chunk 18 of All Fours (Miranda July).txt
Tokenizing chunk 19 of All Fours 

In [None]:
# Uncomment the lines below if you need to remove all collections from the database.
# from modules.accessor import list_collections, delete_collection

# for i in list_collections():
#     delete_collection(i)

Collection 'dim_768_collection_full_text_bert_truncated' deleted.
Collection 'dim_768_collection_chunked_text_bert' deleted.


In [11]:
from modules.accessor import list_collections, query_embedding

# List the collections that currently exist before querying one of them.
available_collections = list_collections()
print(f"Available collections: {available_collections}")

# Embed the question with the same BERT helper used for the stored books, then
# ask the truncated full-text collection for up to five results.
question = "What book has the word Sunrise in it?"
question_embeddings = generate_bert_embeddings(question)
query_result = query_embedding(
    query=question_embeddings,
    custom_suffix="full_text_bert_truncated",
    n_results=5,
)
print(query_result["documents"])

Available collections: ['dim_768_collection_chunked_text_bert', 'dim_768_collection_full_text_bert_truncated']
[['All Fours (Miranda July).txt', 'Sunrise on the Reaping.txt', 'Sunrise on the Reaping.txt', 'All Fours (Miranda July).txt', 'Sunrise on the Reaping.txt'], ['All Fours (Miranda July).txt', 'Sunrise on the Reaping.txt', 'All Fours (Miranda July).txt', 'Sunrise on the Reaping.txt', 'All Fours (Miranda July).txt'], ['All Fours (Miranda July).txt', 'All Fours (Miranda July).txt', 'All Fours (Miranda July).txt', 'All Fours (Miranda July).txt', 'Sunrise on the Reaping.txt'], ['All Fours (Miranda July).txt', 'Sunrise on the Reaping.txt', 'All Fours (Miranda July).txt', 'Sunrise on the Reaping.txt', 'All Fours (Miranda July).txt'], ['All Fours (Miranda July).txt', 'Sunrise on the Reaping.txt', 'Sunrise on the Reaping.txt', 'Sunrise on the Reaping.txt', 'Sunrise on the Reaping.txt'], ['All Fours (Miranda July).txt', 'All Fours (Miranda July).txt', 'All Fours (Miranda July).txt', 'Sunr

## Process Movie Transcripts