In [8]:
#standard libraries
import json
import os
import time
from collections import defaultdict
from typing import List, Dict, Tuple, Union, Callable
from math import ceil

#external libraries
import pandas as pd
import numpy as np
from rich import print
from torch import cuda
from tqdm.notebook import tqdm

#external files
from preprocessing import FileIO
import tiktoken # bad ass tokenizer library for use with OpenAI LLMs 
from llama_index.text_splitter import SentenceSplitter #one of the best on the market
from sentence_transformers import SentenceTransformer

def chunk_data(data, text_splitter, content_field='content'):
    '''
    Splits the text of every record in `data` into chunks.

    For each record, the value stored under `content_field` is passed to
    `text_splitter.split_text`. The per-record results are collected in a list
    that follows the order of `data`.
    '''
    chunked = []
    for record in tqdm(data, 'CHUNKING'):
        pieces = text_splitter.split_text(record[content_field])
        chunked.append(pieces)
    return chunked

def create_vectors(content_splits: List[List[str]],
                   model: SentenceTransformer,
                   device: Union[str, None]=None):
    '''
    Encodes every list of text chunks with `model` and pairs each chunk with its vector.

    Args:
        content_splits: one list of text chunks per source document.
        model: embedding model; only its `encode` method is used.
        device: device string handed to `model.encode`. When None (default),
                'cuda:0' is used if CUDA is available, otherwise 'cpu'.

    Returns:
        One list per source document, holding (chunk_text, vector) tuples,
        in the same order as `content_splits`.
    '''
    if device is None:
        # Previously hard-coded to 'cuda:0', which fails on CPU-only hosts.
        device = 'cuda:0' if cuda.is_available() else 'cpu'
    text_vector_tuples = []
    for chunk in tqdm(content_splits, 'VECTORS'):
        if not chunk:
            # Nothing to encode for this document; keep positions aligned with the corpus.
            text_vector_tuples.append([])
            continue
        vectors = model.encode(chunk, show_progress_bar=False, device=device)
        text_vector_tuples.append(list(zip(chunk, vectors)))
    return text_vector_tuples

def join(corpus, tuples, content_field='content', embedding_field='content_embedding'):
    '''
    Flattens the corpus into one doc per text chunk.

    Args:
        corpus: list of dicts; each must contain 'video_id' if it has any chunks.
        tuples: for each corpus entry (same order), a list of (chunk_text, vector)
                pairs; each vector must provide a `tolist()` method.
        content_field: key that held the full text in the corpus entry; in the output
                       doc it holds the chunk text instead.
        embedding_field: key under which the vector (converted to a list) is stored.

    Returns:
        A flat list of dicts. Each one carries the entry's metadata (every key except
        `content_field`), a 'doc_id' of the form '<video_id>_<chunk index>', the chunk
        text and its embedding.
    '''
    docs = []
    for i, d in enumerate(corpus):
        # Metadata is identical for every chunk of this entry, so build it once.
        metadata = {k: v for k, v in d.items() if k != content_field}
        for j, episode in enumerate(tuples[i]):
            doc = dict(metadata)
            video_id = doc['video_id']
            doc['doc_id'] = f'{video_id}_{j}'
            doc[content_field] = episode[0]
            doc[embedding_field] = episode[1].tolist()
            docs.append(doc)
    return docs

def create_dataset(corpus: List[dict],
                   embedding_model: SentenceTransformer,
                   text_splitter: SentenceSplitter,
                   file_outpath_prefix: str='./impact-theory-minilmL6',
                   content_field: str='content',
                   embedding_field: str='content_embedding',
                   device: str='cuda:0' if cuda.is_available() else 'cpu'
                   ) -> None:
    '''
    Given a raw corpus of data, this function creates a new dataset where each dataset
    doc contains episode metadata and its associated text chunk and vector representation.
    Output is directly saved to disk.

    Args:
        corpus: list of dicts holding the raw text under `content_field` plus metadata.
        embedding_model: model used to encode the text chunks.
        text_splitter: splitter used to chunk the text; its `chunk_size` attribute
                       becomes part of the output file name.
        file_outpath_prefix: output path prefix; the file written is
                             '<prefix>-<chunk_size>.parquet'.
        content_field: key of the raw text in each corpus entry (and of the chunk text
                       in each output doc).
        embedding_field: key of the embedding in each output doc.
        device: device string forwarded to the embedding step.

    Returns:
        None. The dataset is written through FileIO.save_as_parquet with overwrite=False.
    '''

    io = FileIO()

    chunk_size = text_splitter.chunk_size
    print(f'Creating dataset using chunk_size: {chunk_size}')
    start = time.perf_counter()
    ########################
    # START YOUR CODE HERE #
    ########################
    # content_field, embedding_field and device were previously accepted but ignored;
    # they are now forwarded to the steps that need them.
    content_splits = chunk_data(corpus, text_splitter, content_field)
    text_vector_tuples = create_vectors(content_splits, embedding_model, device=device)
    joined_docs = join(corpus, text_vector_tuples, content_field, embedding_field)
    ########################
    # END YOUR CODE HERE #
    ########################
    file_path = f'{file_outpath_prefix}-{chunk_size}.parquet'
    io.save_as_parquet(file_path=file_path, data=joined_docs, overwrite=False)
    elapsed = time.perf_counter() - start
    print(f'Total Time to process dataset of chunk_size ({chunk_size}): {round(elapsed/60, 2)} minutes')

In [9]:
# Tokenizer: tiktoken encoding for 'gpt-3.5-turbo'; its `encode` callable is given to the splitter.
encoding = tiktoken.encoding_for_model('gpt-3.5-turbo')

# Text splitter: chunk size of 256 with no overlap between consecutive chunks.
splitter = SentenceSplitter(
    chunk_size=256,
    chunk_overlap=0,
    tokenizer=encoding.encode,
)

# Embedding model, loaded from a local directory.
# NOTE(review): this is an absolute, machine-specific path -- the cell needs editing to run elsewhere.
model = SentenceTransformer(
    '/home/elastic/notebooks/vector_search_applications/models/new_ft_model/'
)

# Raw corpus, read through the project's FileIO helper.
data = FileIO().load_json('./data/impact_theory_data.json')

In [10]:
# Build the chunked + embedded dataset; the parquet file name is the prefix plus '-<chunk_size>.parquet'.
create_dataset(
    corpus=data,
    embedding_model=model,
    text_splitter=splitter,
    file_outpath_prefix='./impact-theory-newft',
)

CHUNKING:   0%|          | 0/384 [00:00<?, ?it/s]

VECTORS:   0%|          | 0/384 [00:00<?, ?it/s]

[32m2023-11-30 17:54:37.471[0m | [1mINFO    [0m | [36mpreprocessing[0m:[36msave_as_parquet[0m:[36m41[0m - [1mDataFrame saved as parquet file here: ./impact-theory-newft-256.parquet[0m


Bad pipe message: %s [b'\\v']
Bad pipe message: %s [b'/?\x1c?\xf5\x97q \xfc\x97\xc6\\f7 \xa5\xf0\xbc\xfb\xf35\x13\xbc\xc8\xca\xf3?\xf9\xfea\xecA\x17.$\x98\x9b{\xd2Q\xc8M\x85\xc6\xd0q2\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17']
Bad pipe message: %s [b"\x81Qe\x9f\xcb\x8ed\xb7]\x95I\x1e<\x03g\xd2\xc4\xf8\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x002\x00\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc0\x9c\xc0P\x00=\x00<\x005\x00"]
Bad pipe message: %s [b'\x9a\x00\x99\xc0\x07\xc0\x11\x00\x96\x00\x05\x00\xff\x01\x00\x00j\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\