In [1]:
from data_prep_library import *

In [None]:
# Directory layout used by the rest of the notebook: .txt extracts are read
# from INPUT_DIR and generated artifacts are written beneath OUTPUT_DIR.
INPUT_DIR = "../data/extracts"
OUTPUT_DIR = "../data"

# Prepare the output root first, then its "vectors" sub-directory.
# NOTE(review): make_path comes from data_prep_library; presumably it creates
# the directory when it is missing — confirm.
make_path(OUTPUT_DIR)
make_path(f"{OUTPUT_DIR}/vectors")

In [3]:
# Collect the names of the .txt extracts to process from INPUT_DIR.
# The flags request files only (no directories), filtered on the ".txt"
# extension, without recursion and without full paths / path objects.
txt_filenames = list_files(
    INPUT_DIR,
    include_dirs=False,
    include_files=True,
    include_extension=".txt",
    recursive=False,
    full_path=False,
    as_paths=False,
)


In [4]:
# Chunk text: split every extract into groups of CHUNK_SIZE non-empty lines.
# `chunks` and `vectors` are parallel lists: chunks[i] holds the chunk text and
# vectors[i] holds the {'show', 'episode'} metadata describing that chunk.
chunks = []
vectors = []

CHUNK_SIZE = 5  # number of non-empty lines joined into one chunk
for filename in txt_filenames:
    # The name is split on its LAST underscore into show / episode.
    # NOTE(review): filename is used as-is, so episode_name appears to keep the
    # ".txt" extension — confirm whether downstream consumers expect that.
    if '_' in filename:
        show_name, episode_name = filename.rsplit('_', 1)
    else:
        # Previously this raised ValueError on unpacking and aborted the whole
        # run; keep the file, but make the missing show name visible.
        print(f"WARNING: no '_' in file name {filename!r}; show name left empty")
        show_name, episode_name = '', filename

    with open(os.path.join(INPUT_DIR, filename), 'r', encoding="utf-8") as f:
        text = f.read()

    # One metadata dict is shared by every chunk of this file; consumers must
    # copy it before mutating.
    data = {'show': show_name, 'episode': episode_name}

    # Plain str.split is enough for a literal newline separator; drop lines
    # that are exactly empty (whitespace-only lines are kept, as before).
    sentences = [line for line in text.split('\n') if line != '']
    for i in range(0, len(sentences), CHUNK_SIZE):
        chunks.append('\n'.join(sentences[i:i + CHUNK_SIZE]))
        vectors.append(data)


In [5]:
# Fetch the vectorizer variables once, up front; this same object is later
# passed unchanged to every vectorizing call made through make_vector_wrapper.
# NOTE(review): its contents are defined by get_vectorizer_variables in
# data_prep_library and are not visible from this notebook.
vectorizer_vars = get_vectorizer_variables()

In [7]:
def make_vector_wrapper(vectors, chunks, make_function, vectorizer_vars, max_workers=50):
    """Vectorize chunks concurrently and attach each result to its metadata.

    Args:
        vectors: Iterable of metadata dicts, one per chunk. Each dict is
            copied before use, so the caller's dicts are never mutated.
        chunks: Iterable of inputs paired positionally with ``vectors`` via
            ``zip`` (surplus items in the longer iterable are ignored).
        make_function: Callable invoked as ``make_function(chunk,
            vectorizer_vars)``. It is called concurrently from worker threads.
        vectorizer_vars: Passed through unchanged to every ``make_function``
            call.
        max_workers: Size of the thread pool (defaults to the previous
            hard-coded value of 50).

    Returns:
        A tuple ``(output_vectors, failures)``:
        ``output_vectors`` is a list of metadata copies with the result stored
        under the ``'vector'`` key; ``failures`` is a list of metadata copies
        (without ``'vector'``) for chunks whose call raised. Both lists follow
        input order, not completion order, so results are reproducible.
    """
    # Snapshot the metadata at submission time, one private copy per job.
    jobs = [(vector_data.copy(), chunk) for vector_data, chunk in zip(vectors, chunks)]

    succeeded = {}  # input position -> metadata with 'vector'
    failed = {}     # input position -> metadata of a job that raised
    with tqdm(total=len(jobs)) as pbar:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {
                executor.submit(make_function, chunk, vectorizer_vars): index
                for index, (_meta, chunk) in enumerate(jobs)
            }
            for future in as_completed(futures):
                # Resolve the metadata BEFORE asking for the result so the
                # error message below always names the job that failed.
                index = futures[future]
                vector_data = jobs[index][0]
                try:
                    vector_data['vector'] = future.result()
                except Exception as e:
                    # Record the failure and keep going; terminating the
                    # session here would discard every completed result.
                    print(f"Error [{vector_data}]: {e}")
                    failed[index] = vector_data
                else:
                    succeeded[index] = vector_data
                pbar.update(1)

    # as_completed yields in completion order; restore input order.
    output_vectors = [succeeded[index] for index in sorted(succeeded)]
    failures = [failed[index] for index in sorted(failed)]
    return output_vectors, failures

In [10]:
# Vectorize every chunk and persist the results under OUTPUT_DIR/vectors.
# NOTE(review): the result is named "semantic" but is produced with
# make_lexical_vector — confirm which vectorizer is intended here.
semantic_vectors, failures = make_vector_wrapper(vectors, chunks, make_lexical_vector, vectorizer_vars)
if failures:
    # Make partial output visible instead of silently saving an incomplete set.
    print(f"WARNING: {len(failures)} chunk(s) could not be vectorized and are missing from the output")

# A context manager guarantees the file is flushed and closed after the dump.
with open(OUTPUT_DIR + "/vectors/semantic_vectors.bin", "wb") as vector_file:
    pickle.dump(semantic_vectors, vector_file)

100%|██████████| 68316/68316 [00:22<00:00, 3069.20it/s]
