In [1]:
import sys
sys.path.append('../../')

In [2]:
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv(), override=True)

In [18]:
from src.pipelines.pipeline import chunk_data, create_vectors, create_parent_chunks, join_docs, groupby_episode
from src.preprocessor.preprocessing import FileIO
from llama_index.text_splitter import SentenceSplitter
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import tiktoken

In [6]:
data_path = '../../answer_key/data/huberman_minilm-256.parquet'

In [8]:
data = FileIO.load_parquet(data_path)

Shape of data: (23905, 13)
Memory Usage: 2.37+ MB


### Set Constants

In [11]:
chunk_size = 256
#tokenizer
encoding = tiktoken.get_encoding(encoding_name='cl100k_base')
#text_splitter
splitter = SentenceSplitter(chunk_overlap=0, chunk_size=chunk_size, tokenizer=encoding.encode)
#model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device='cuda:0')

### Get Chunks

In [12]:
chunks = [d['content'] for d in data]

### Create Expanded Content

In [21]:
from itertools import groupby

def groupby_episode(data: list[dict], key_field: str='video_id') -> list[list[dict]]:
    '''
    Separates entire Impact Theory corpus into individual 
    lists of discrete episodes.
    '''
    episodes = []
    for _, group in groupby(data, lambda x: x[key_field]):
        episode = [chunk for chunk in group]
        episodes.append(episode)
    return episodes

In [45]:
def create_expanded_content(data: list[dict]=None, 
                            chunk_list: list[list[str]]=None, 
                            window_size: int=1,
                            num_episodes: int=193,
                            key_field: str='video_id'
                            ) -> list[list[str]]:
    '''
    Creates expanded content from original chunks of text, for use with 
    expanded content retrieval.  Takes in raw data in dict format or 
    accepts a list of chunked episodes already grouped. 
    
    Window size sets the number of chunks before and after the original chunk.  
    For example a window_size of 2 will return five joined chunks.  2 chunks 
    before original chunk, the original, and 2 chunks after the original.  
    
    Expanded content is grouped by podcast episode, and chunks are assumed 
    to be kept in order by which they will be joined as metadata in follow-on 
    processing.
    '''
    if not data and not chunk_list:
        raise ValueError("Either data or a chunk_list must be passed as an arg")
    if data:
        episodes = groupby_episode(data, key_field)
        assert len(episodes) == num_episodes, f'Number of grouped episodes does not equal num_episodes ({len(episodes)} != {num_episodes})'
        chunk_list = [[d['content'] for d in alist] for alist in episodes]
    expanded_contents = []
    for episode in tqdm(chunk_list):
        episode_container = []
        for i, chunk in enumerate(episode):
            start = max(0, i-window_size)
            end = i+window_size+1
            expanded_content = ' '.join(episode[start:end])
            episode_container.append(expanded_content)
        expanded_contents.append(episode_container)
    return expanded_contents

In [46]:
expanded = create_expanded_content(data, num_episodes=60)

AssertionError: Number of grouped episodes does not equal num_episodes (193 != 60)

### Create Vectors

In [25]:
vectors = create_vectors(chunks, model, 'cuda:0')

VECTORS:   0%|          | 0/193 [00:00<?, ?it/s]

### Join Meta

In [126]:
docs = join_docs(data, vectors, expanded)

In [127]:
len(docs)

23905