In [2]:
%load_ext autoreload 
%autoreload 2

from preprocessing import Utilities, Vectorizor
utils = Utilities()
from concurrent.futures import ProcessPoolExecutor, as_completed
from typing import List
from math import ceil
from tqdm import tqdm
import pandas as pd
import openai
from tiktoken_functions import Tokenizer
import json
import os

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Input file for this run. The two SiliconValleyInsider chunk files below are
# alternative inputs that are currently disabled.
# path1 = '/home/elastic/notebooks/vector_search_applications/data/podcast_transcripts/SiliconValleyInsider/chunk-docs-1_79.json'
# path2 = '/home/elastic/notebooks/vector_search_applications/data/podcast_transcripts/SiliconValleyInsider/chunk-docs-80_158.json'
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
path = '/home/elastic/notebooks/vector_search_applications/data/impact_theory_metadata.json'

In [4]:
# Load the records stored at `path` with the project's JSON loader.
# Presumably a list of dicts (it is later passed to mp_sentence_grouper as List[dict]) -- TODO confirm.
combined = utils.json_data_loader(path)
# Disabled: loading the two SiliconValleyInsider files and concatenating them.
# data1 = json_data_loader(file_path=path1)
# data2 = json_data_loader(file_path=path2)
# combined = data1 + data2
# Display how many records were loaded.
len(combined)

387

In [5]:
def sentence_grouper(sentences: List[str], group_length: int, add_group_id: bool=True) -> List[dict]:
    """
    Join consecutive sentences into space-separated chunks of `group_length` sentences.

    Args:
        sentences: Ordered sentences to bundle.
        group_length: Number of sentences per chunk; the final chunk holds whatever is left over.
        add_group_id: When True, each chunk also carries a 1-based 'group_id'.

    Returns:
        One dict per chunk with key 'content' (and 'group_id' if requested), in input order.
    """
    total_groups = ceil(len(sentences) / group_length)

    def build_chunk(position: int) -> dict:
        # Slice out this chunk's sentences; slicing past the end simply yields a shorter chunk.
        offset = position * group_length
        chunk = {'content': ' '.join(sentences[offset:offset + group_length])}
        if add_group_id:
            chunk['group_id'] = position + 1
        return chunk

    return [build_chunk(position) for position in range(total_groups)]

In [10]:
def extract_group_load(data: dict, 
                       group_length: int, 
                       add_group_id: bool=True,
                       capture_fields: List[str]=None
                      ) -> List[dict]:
    """
    Split one document's 'content' into sentence groups and copy document fields onto each group.

    Args:
        data: Source document; its 'content' value (default '') is split into sentences
            with `utils.sentence_splitter`.
        group_length: Number of sentences per group, forwarded to `sentence_grouper`.
        add_group_id: Forwarded to `sentence_grouper`.
        capture_fields: Keys of `data` to copy onto every group. When empty or None,
            every key except 'content' is copied.

    Returns:
        The groups produced by `sentence_grouper`, each extended with the selected fields.

    Raises:
        KeyError: if a name in `capture_fields` is missing from `data` (only reached
            when at least one group is produced).
    """
    text = data.get('content', '')
    groups = sentence_grouper(utils.sentence_splitter(text),
                              group_length=group_length,
                              add_group_id=add_group_id)

    # Decide once which keys travel with each group; values are still read per group.
    if capture_fields:
        carried_keys = capture_fields
    else:
        carried_keys = [key for key in data if key != 'content']

    for group in groups:
        for key in carried_keys:
            group[key] = data[key]
    return groups

In [11]:
def mp_sentence_grouper(data: List[dict],
                        group_length: int,
                        add_group_id: bool=True,
                        capture_fields: List[str]=None
                        ) -> List[dict]:
    """
    Split every document in `data` into sentence groups using a pool of worker processes.

    Each document is submitted to `extract_group_load` as its own task, and a tqdm
    bar advances as tasks finish.

    Args:
        data: Documents to split.
        group_length: Number of sentences per group.
        add_group_id: Forwarded to `extract_group_load`.
        capture_fields: Optional keys forwarded to `extract_group_load`; the default
            (None) keeps the previous behaviour of copying every non-'content' field.

    Returns:
        A flat list of group dicts in the same order as `data`; the groups of one
        document stay contiguous and in the order `extract_group_load` returned them.

    Raises:
        Any exception raised inside a worker is re-raised here.
    """
    # Context managers guarantee the bar is closed and the pool shut down, even on error.
    with tqdm(unit=": Text Chunks", total=len(data)) as progress:
        with ProcessPoolExecutor(max_workers=os.cpu_count()) as executor:
            futures = [
                executor.submit(extract_group_load, d, group_length, add_group_id, capture_fields)
                for d in data
            ]
            # Completion order only drives the progress bar; result() surfaces worker errors early.
            for future in as_completed(futures):
                future.result()
                progress.update(1)
    # Collect in submission order so the output is deterministic from run to run.
    return [group for future in futures for group in future.result()]

In [12]:
%%time
# Chunk every record in `combined` into groups of 10 sentences across worker processes.
results = mp_sentence_grouper(combined, group_length=10)


  0%|                                                                                                     | 0/387 [00:00<?, ?: Text Chunks/s][A
  1%|▋                                                                                            | 3/387 [00:00<00:14, 25.95: Text Chunks/s][A
  6%|█████▍                                                                                     | 23/387 [00:00<00:02, 121.83: Text Chunks/s][A
 12%|███████████                                                                                | 47/387 [00:00<00:01, 170.45: Text Chunks/s][A
 19%|█████████████████▍                                                                         | 74/387 [00:00<00:01, 207.67: Text Chunks/s][A
 27%|████████████████████████▏                                                                 | 104/387 [00:00<00:01, 234.12: Text Chunks/s][A
 37%|█████████████████████████████████▎                                                        | 143/387 [00:00<00:00, 281.86: Te

CPU times: user 218 ms, sys: 130 ms, total: 348 ms
Wall time: 1.05 s





In [15]:
# Tag every chunk in place with an id of the form "<video_id>-<group_id>".
for d in results:
    group_id, video_id = str(d['group_id']), str(d['video_id'])
    unique_id = f'{video_id}-{group_id}'
    d['unique_id'] = unique_id

In [17]:
# Tabular view of the chunk records: one row per chunk, one column per dict key.
df = pd.DataFrame.from_records(results)

In [31]:
def get_context(df: pd.DataFrame, 
                video_id: int, 
                group_id: int, 
                window_increment: int=1
               ) -> str:
    """
    Return the text of a matched chunk together with its neighbouring chunks.

    The first row whose `video_id` and `group_id` match is located, the rows whose
    index labels lie within `window_increment` of it are selected, and their
    'content' values are joined with spaces. Rows in that window that belong to a
    different video are dropped, so the context never spills across a video boundary.

    Args:
        df: Frame with 'video_id', 'group_id' and 'content' columns. Assumes an integer,
            sorted index (e.g. the default RangeIndex) with each video's chunks stored
            on adjacent rows.
        video_id: Video identifier of the hit.
        group_id: Group identifier of the hit within that video.
        window_increment: How many index positions to include on each side of the hit.

    Returns:
        The space-joined 'content' of the hit and its same-video neighbours.

    Raises:
        IndexError: if no row matches `video_id` and `group_id`.
    """
    matches = df[(df.video_id == video_id) & (df.group_id == group_id)].index
    if len(matches) == 0:
        raise IndexError(f'no row with video_id={video_id!r} and group_id={group_id!r}')
    hit_index = matches[0]
    # Label-based slice: inclusive on both ends; bounds outside a sorted index are tolerated.
    window = df.loc[hit_index - window_increment: hit_index + window_increment]
    # Keep only chunks of the requested video; rows at the window edge may belong to another one.
    same_video = window[window.video_id == video_id]
    return ' '.join(same_video['content'].tolist())

### Vectorizing

In [19]:
# Instantiate the project's Vectorizor (imported from `preprocessing`) with no arguments.
vectorizer = Vectorizor()

In [20]:
# Vectorize the chunk records in batches.
# Presumably returns the records with a 'vector' entry added (later cells read d['vector']) -- TODO confirm.
docs = vectorizer.add_vector_batch(results)


Batches:   0%|                                                                                                      | 0/1037 [00:00<?, ?it/s][A
Batches:   0%|                                                                                            | 1/1037 [00:07<2:11:02,  7.59s/it][A
Batches:   0%|▎                                                                                             | 3/1037 [00:07<34:33,  2.01s/it][A
Batches:   1%|▌                                                                                             | 6/1037 [00:07<13:49,  1.24it/s][A
Batches:   1%|▊                                                                                             | 9/1037 [00:07<07:42,  2.22it/s][A
Batches:   1%|█                                                                                            | 12/1037 [00:08<04:54,  3.48it/s][A
Batches:   1%|█▎                                                                                           | 15/1037 [00:08<03:22

In [21]:
# Destination file for the vectorized documents.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
outpath = '/home/elastic/notebooks/vector_search_applications/data/impact_theory_with_vectors.json'

In [23]:
# Sanity check before saving: every document must carry a vector of exactly 384 elements.
# NOTE(review): 384 is presumably the embedding model's output dimension -- confirm against Vectorizor.
for d in docs:
    assert len(d['vector']) == 384

In [27]:
# Persist the vectorized documents as indented JSON at `outpath`.
with open(outpath, mode='w') as json_file:
    json.dump(docs, fp=json_file, indent=4)

In [174]:
# Collect just the 'content' text of every vectorized document, in document order.
texts = [d['content'] for d in docs]