In [4]:
%load_ext autoreload 
%autoreload 2

from preprocessing import Utilities, Vectorizor
utils = Utilities()
from concurrent.futures import ProcessPoolExecutor, as_completed
from typing import List
from math import ceil
from tqdm import tqdm
import pandas as pd
import openai
from tiktoken_functions import Tokenizer
import json
import os

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Input file for this run: the combined fivethirtyeight transcript JSON.
# NOTE(review): this is a hard-coded absolute path on one machine; adjust it before running elsewhere.
# Earlier inputs (SiliconValleyInsider chunk files), kept for reference:
# path1 = '/home/elastic/notebooks/vector_search_applications/data/podcast_transcripts/SiliconValleyInsider/chunk-docs-1_79.json'
# path2 = '/home/elastic/notebooks/vector_search_applications/data/podcast_transcripts/SiliconValleyInsider/chunk-docs-80_158.json'
path = '/home/elastic/notebooks/vector_search_applications/data/podcast_transcripts/fivethirtyeight/combined.json'

In [5]:
# Load the records stored at `path` using the project's Utilities helper.
# Presumably a list with one dict per transcript (the recorded len() is 5) — confirm against Utilities.json_data_loader.
combined = utils.json_data_loader(path)
# Previous two-file variant, kept for reference:
# data1 = json_data_loader(file_path=path1)
# data2 = json_data_loader(file_path=path2)
# combined = data1 + data2
len(combined)

5

In [10]:
def sentence_grouper(sentences: List[str], group_length: int, add_group_id: bool=True) -> List[dict]:
    """Join consecutive sentences into fixed-size, space-separated groups.

    Args:
        sentences: Sentences in their original order.
        group_length: Number of sentences per group; the final group may be shorter.
        add_group_id: When true, each group dict also gets a 1-based 'group_id'.

    Returns:
        One dict per group, in order, each with a 'content' key holding the
        joined sentences and (optionally) a 'group_id' key.
    """
    def build_group(position: int) -> dict:
        # Slice out the sentences that belong to the group at this position.
        offset = position * group_length
        record = {'content': ' '.join(sentences[offset:offset + group_length])}
        if add_group_id:
            record['group_id'] = position + 1
        return record

    # Round up so a trailing partial group is still emitted.
    total_groups = ceil(len(sentences) / group_length)
    return [build_group(position) for position in range(total_groups)]

In [18]:
def extract_group_load(data: dict,
                       group_length: int,
                       add_group_id: bool=True,
                       capture_fields: List[str]=None
                      ) -> List[dict]:
    """Split one record's 'content' into sentence groups and copy its other fields onto each group.

    Args:
        data: Source record; must contain a 'content' key with the text to split.
        group_length: Number of sentences per group (passed to sentence_grouper).
        add_group_id: Passed to sentence_grouper; adds a 1-based 'group_id' to each group.
        capture_fields: Keys of `data` to copy onto every group. When empty or
            None, every key of `data` except 'content' is copied.

    Returns:
        The group dicts produced by sentence_grouper, each extended with the
        selected fields from `data`.

    Raises:
        KeyError: If 'content' or a requested capture field is missing from `data`.
    """
    text = data['content']
    groups = sentence_grouper(
        utils.sentence_splitter(text),
        group_length=group_length,
        add_group_id=add_group_id,
    )

    # Decide once which keys to carry over; the choice is the same for every group.
    if capture_fields:
        carried_keys = capture_fields
    else:
        carried_keys = [key for key in data if key != 'content']

    for group in groups:
        for key in carried_keys:
            group[key] = data[key]
    return groups

In [19]:
def mp_sentence_grouper(data: List[dict],
                        group_length: int,
                        add_group_id: bool=True
                        ) -> List[dict]:
    """Run extract_group_load over every record in a process pool and flatten the results.

    Groups are returned in the same order as `data` (and in group order within
    each record), independent of which worker finishes first, so repeated runs
    produce identical output.

    Args:
        data: Records to process; each is passed to extract_group_load.
        group_length: Number of sentences per group.
        add_group_id: Whether each group gets a 1-based 'group_id'.

    Returns:
        A flat list of group dicts for all records. capture_fields is not
        forwarded, so extract_group_load copies every non-'content' key.

    Raises:
        Any exception raised by a worker is re-raised here.
    """
    # The context managers guarantee the progress bar is closed and the pool
    # is shut down even when a worker fails.
    with tqdm(unit=": Text Chunks", total=len(data)) as progress:
        with ProcessPoolExecutor(max_workers=os.cpu_count()) as executor:
            futures = [executor.submit(extract_group_load, d, group_length, add_group_id) for d in data]
            # as_completed only drives the progress bar and surfaces worker
            # errors early; it is not used to order the results.
            for future in as_completed(futures):
                future.result()
                progress.update(1)
    # `futures` is in submission order, so flattening it preserves input order.
    return [group for future in futures for group in future.result()]

In [20]:
%%time
# Chunk every record in `combined` into groups of 10 sentences using the process pool.
results = mp_sentence_grouper(combined, group_length=10)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 51.49: Text Chunks/s][A

CPU times: user 13.1 ms, sys: 52.7 ms, total: 65.8 ms
Wall time: 102 ms





In [37]:
# Preview the first five grouped records.
results[:5]

[{'content': "Hello, and welcome to the FiveThirtyEight Politics Podcast. I'm Galen Druk. Game time for the Republican presidential primary begins in earnest this week. The first debate is being held in Milwaukee on Wednesday, and it marks the beginning of a five-month countdown to the Iowa caucuses, during which there will be monthly debates, nonstop campaigning, and a likely winnowing of the field. As summer winds down and voters train more of their attention on the primary, we'll also see whether any of the candidates are able to take former President Trump down a peg. He currently leads our national polling average with 54% support, and at the time of this recording, reporting has just come out suggesting that Trump is going to skip the debate in favor of an interview with Tucker Carlson. As an aside, we're recording this on Friday because I'm going to be running around Milwaukee on Monday. But in any case, while this may be when the nation begins to tune in, folks in the early sta

In [26]:
# Tabulate the grouped records: one row per dict in `results`, one column per key.
df = pd.DataFrame.from_records(results)

In [31]:
def get_context(df: pd.DataFrame, 
                video_id: int, 
                group_id: int, 
                window_increment: int=1
               ) -> str:
    """Return the text of a sentence group together with its neighbouring groups.

    The window is selected by `group_id` within the same `video_id`, so it
    never includes text from a different video and does not depend on the
    DataFrame's row order or index.

    Args:
        df: Frame with 'video_id', 'group_id' and 'content' columns.
        video_id: Identifier of the video the hit belongs to; compared with
            `==` against the 'video_id' column (the annotation says int, but
            the example call in this notebook passes a string id).
        group_id: 'group_id' of the hit within that video.
        window_increment: Number of groups to include on each side of the hit.

    Returns:
        The 'content' of groups group_id - window_increment through
        group_id + window_increment (inclusive, limited to groups that exist)
        joined with single spaces, in ascending group order.

    Raises:
        IndexError: If no row matches both `video_id` and `group_id`.
    """
    in_video = df.video_id == video_id
    if not (in_video & (df.group_id == group_id)).any():
        # Same exception type callers got from the earlier `.index[0]` lookup.
        raise IndexError(f"no row with video_id={video_id!r} and group_id={group_id!r}")

    lowest = group_id - window_increment
    highest = group_id + window_increment
    # between() is inclusive on both ends; the stable sort keeps ties in row order.
    window = df[in_video & df.group_id.between(lowest, highest)].sort_values('group_id', kind='mergesort')
    return ' '.join(window['content'].tolist())

In [35]:
# get_context(df, 'R6M9djUFJ60', group_id=26, window_increment=5)

### Vectorizing

In [38]:
# Instantiate the project's Vectorizor (imported from preprocessing) with its default settings.
vectorizer = Vectorizor()

In [39]:
# Vectorize all grouped records in batches; `docs` is what gets written to disk below.
# NOTE(review): a later cell reads d['vector'] from `results`, so this call presumably also
# adds a 'vector' key to the input dicts in place — confirm in Vectorizor.add_vector_batch.
docs = vectorizer.add_vector_batch(results)


Batches:   0%|                                                                                                               | 0/9 [00:00<?, ?it/s][A
Batches:  11%|███████████▍                                                                                           | 1/9 [00:00<00:04,  1.92it/s][A
Batches:  44%|█████████████████████████████████████████████▊                                                         | 4/9 [00:00<00:00,  7.81it/s][A
Batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 12.05it/s][A


In [47]:
# Destination for the vectorized documents.
# NOTE(review): hard-coded absolute path on one machine; adjust it before running elsewhere.
outpath = '/home/elastic/notebooks/vector_search_applications/data/podcast_transcripts/fivethirtyeight/five_with_vectors.json'

In [43]:
# Sanity check: every record must carry a 'vector' of length 384.
# NOTE(review): this inspects `results`, while the object saved to disk is `docs`;
# confirm both refer to the same records, or validate `docs` instead.
for d in results:
    assert len(d['vector']) == 384

In [48]:
# Write the vectorized documents to `outpath` as JSON with a 4-space indent,
# overwriting any existing file at that path.
with open(outpath, 'w') as json_file:
     json.dump(docs, json_file, indent=4)

In [174]:
# Collect the 'content' text of each vectorized document, in the same order as `docs`.
texts = [d['content'] for d in docs]