In [5]:
%load_ext autoreload
%autoreload 2

# standard libraries
import sys

sys.path.append("../")

import os
import tiktoken
import time
import torch
from typing import List, Tuple
from math import ceil

# external libraries
import pandas as pd
import numpy as np
from llama_index.text_splitter import SentenceSplitter  # one of the best on the market
from rich import print
from rich.pretty import pprint  # nifty library for pretty printing
from sentence_transformers import SentenceTransformer, losses, InputExample, models
from torch import cuda
from tqdm import tqdm

# external files
try:
    from preprocessing import FileIO
except ModuleNotFoundError:
    from src.preprocessor.preprocessing import FileIO

from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv(), override=True)

from src.database.weaviate_interface_v4 import WeaviateIndexer, WeaviateWCS
from src.database.database_utils import get_weaviate_client

import os
import time
import json
from typing import List
from tqdm import tqdm
from rich import (
    print,
)  # nice library that provides improved printing output (overrides default print function)

from src.database.properties_template import properties

# Weaviate credentials are read from the process environment (load_dotenv above is
# expected to have populated it from a .env file). Indexing os.environ directly means
# a missing variable raises KeyError right here.
api_key = os.environ["WEAVIATE_API_KEY"]
url = os.environ["WEAVIATE_ENDPOINT"]

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
def split_contents(
    corpus: list[dict], text_splitter: SentenceSplitter, content_field: str = "content"
) -> list[list[str]]:
    """
    Split the text of every document in a corpus into chunks.

    For each document, the value stored under `content_field` is handed to
    `text_splitter.split_text`; the resulting lists of chunks are returned in
    corpus order (one inner list per document). A progress bar is shown while
    the corpus is processed.

    Args
    ----
    corpus: documents, each a dict that contains `content_field`.
    text_splitter: splitter whose `split_text` method produces the chunks.
    content_field: key under which each document keeps its text.

    Example
    -------
    corpus = [
        {'title': 'This is a cool show',
         'content': 'There is so much good content on this show. This would normally be a really long block of content.'},
        {'title': 'Another Great Show',
         'content': 'The content here is really good as well. More content, blah, blah.'},
    ]

    output = split_contents(corpus, text_splitter, content_field="content")

    output >>> [
        ['There is so much good content on this show.', 'This would normally be a really long block of content.'],
        ['The content here is really good as well.', 'More content, blah, blah.'],
    ]
    """
    return [
        text_splitter.split_text(document[content_field]) for document in tqdm(corpus)
    ]


def encode_content_splits(
    content_splits: list[list[str]],
    model: SentenceTransformer,
    device: str = "cuda:0" if cuda.is_available() else "cpu",
) -> list[list[tuple[str, list[float]]]]:
    """
    Encode every text chunk as a vector embedding and pair it with its text.

    `content_splits` holds one list of text chunks per podcast episode. The
    result mirrors that structure: one list per episode, whose items are
    (text, vector) tuples in the same order as the input chunks.

    Args
    ----
    content_splits: one list of text chunks per episode.
    model: model whose `encode` method produces the embeddings.
    device: device the model is moved to before encoding.

    Example
    -------
    content_splits = [
        ['There is so much good content on this show.', 'This would normally be a really long block of content.'],
        ['The content here is really good as well.', 'More content, blah, blah.'],
    ]

    output = encode_content_splits(content_splits, model)

    output >>> [
        EPISODE 1 -> [('There is so much good content on this show.', [1.78036056e-02, -1.93265956e-02, ...]),
                      (text, vector), (text, vector), ...],
        EPISODE 2 -> [(text, vector), (text, vector), ...],
        EPISODE n -> [(text, vector), (text, vector), ...],
    ]
    """
    # Move the model to the requested device before any encoding happens.
    model.to(device)

    def _pair_with_embeddings(chunks: list[str]) -> list[tuple[str, list[float]]]:
        # encode() is called once per episode; .tolist() converts its result to plain lists.
        embeddings = model.encode(chunks).tolist()
        return list(zip(chunks, embeddings))

    return [_pair_with_embeddings(chunks) for chunks in tqdm(content_splits)]


def join_metadata(
    corpus: list[dict],
    text_vector_list: list[list[tuple[str, list]]],
    unique_id_field: str = "video_id",
    content_field: str = "content",
    embedding_field: str = "content_embedding",
) -> list[dict]:
    """
    Combine episode metadata from the original corpus with text/vector tuples.
    Creates a new dictionary for each text/vector combination.

    Args
    ----
    corpus: original documents; `corpus[i]` supplies the metadata for `text_vector_list[i]`.
    text_vector_list: per document, a list of (text chunk, embedding) tuples.
    unique_id_field: key in each document holding its unique identifier.
    content_field: key that holds the full text in the corpus documents; the
        full text is dropped and the same key receives the chunk text instead.
    embedding_field: key under which the chunk's embedding is stored.

    Returns
    -------
    A flat list with one dict per chunk. Each dict contains the document's
    metadata (everything except the full text), a "doc_id" of the form
    "<unique id>_<chunk index>", the chunk text and the chunk embedding.
    """
    joined_documents = []

    for i, doc in enumerate(corpus):
        # Metadata is identical for every chunk of a document, so build it once
        # per document and copy it per chunk.
        metadata = {key: value for key, value in doc.items() if key != content_field}
        for j, (text, vector) in enumerate(text_vector_list[i]):
            chunk_doc = dict(metadata)
            chunk_doc["doc_id"] = f"{doc[unique_id_field]}_{j}"
            chunk_doc[content_field] = text
            chunk_doc[embedding_field] = vector
            joined_documents.append(chunk_doc)

    return joined_documents


def create_dataset(
    corpus: list[dict],
    embedding_model: SentenceTransformer,
    text_splitter: SentenceSplitter,
    save_to_disk: bool,
    file_outpath: str = None,
    unique_id_field: str = "video_id",
    content_field: str = "content",
    embedding_field: str = "content_embedding",
    device: str = "cuda:0" if cuda.is_available() else "cpu",
) -> list[dict]:
    """
    Given a raw corpus of data, this function creates a new dataset where each dataset
    doc contains episode metadata and its associated text chunk and vector representation.

    The pipeline is: split each document's text into chunks, embed every chunk,
    then join each (chunk, embedding) pair with the metadata of its document.

    Args
    ----
    corpus: raw documents.
    embedding_model: model used to embed the text chunks.
    text_splitter: splitter used to chunk each document; its `chunk_size` is reported in the logs.
    save_to_disk: when True, the result is also written as parquet to `file_outpath`.
    file_outpath: destination path; required when `save_to_disk` is True.
    unique_id_field: key holding each document's unique identifier.
    content_field: key holding each document's text.
    embedding_field: key under which chunk embeddings are stored in the output.
    device: device the embedding model is moved to for encoding.

    Returns
    -------
    The joined documents (one dict per text chunk).

    Raises
    ------
    ValueError: if `save_to_disk` is True but `file_outpath` is empty or None.
    """
    if save_to_disk and not file_outpath:
        raise ValueError(
            "Saving to disk is enabled but file_outpath was left as a None value.\n\
            Enter a valid file_outpath or mark save_to_disk as False"
        )

    io = FileIO()

    chunk_size = text_splitter.chunk_size
    print(f"Creating dataset using chunk_size: {chunk_size}")
    start = time.perf_counter()

    # Forward the field names and device explicitly; previously these arguments
    # were accepted but never reached the helpers, so non-default values were ignored.
    content_splits = split_contents(corpus, text_splitter, content_field=content_field)
    text_vector_tuples = encode_content_splits(
        content_splits, embedding_model, device=device
    )
    joined_docs = join_metadata(
        corpus,
        text_vector_tuples,
        unique_id_field=unique_id_field,
        content_field=content_field,
        embedding_field=embedding_field,
    )

    if save_to_disk:
        # NOTE(review): overwrite=False presumably refuses to replace an existing file — confirm in FileIO.
        io.save_as_parquet(file_path=file_outpath, data=joined_docs, overwrite=False)

    # Elapsed time includes the optional save step.
    elapsed = time.perf_counter() - start
    print(
        f"Total Time to process dataset of chunk_size ({chunk_size}): {round(elapsed/60, 2)} minutes"
    )
    return joined_docs

In [12]:
# root folder on Google Colab is: /content/
root_folder = "../data/"
data_file = "huberman_labs.json"
data_path = os.path.join(root_folder, data_file)
# NOTE: a bare expression is only echoed when it is the last statement of a cell,
# so this line produces no output here.
data_path

# Load the raw corpus from the JSON file; `data` is what the indexing cell
# later passes to create_dataset.
data = FileIO.load_json(data_path)

In [13]:
def load_pretrained_model(
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
    device: str = None,
):
    """
    Loads sentence transformer modules and returns a pretrained
    model for finetuning.

    The model is assembled from a transformer module and a pooling module sized
    to the transformer's word-embedding dimension, then moved to `device`.

    Args
    ----
    model_name: model name or local path handed to `models.Transformer`.
    device: device to place the model on. When left as None, "cuda" is used if
        CUDA is available and "cpu" otherwise.

    Returns
    -------
    The assembled SentenceTransformer.
    """
    if device is None:
        # Previously "cuda" was hard-coded, which fails on machines without a GPU.
        device = "cuda" if cuda.is_available() else "cpu"

    word_embedding_model = models.Transformer(model_name_or_path=model_name)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    model.to(device)
    return model

In [14]:
# define the model you want to use
# Each entry is a model name or local path that is passed to load_pretrained_model.
model_names = [
    "../models/bge-base-finetuned-500",
]

# Short labels paired positionally with model_names (the indexing cell zips the two
# lists); each label is used to build the parquet file name and the collection name.
base = ["bge_finetuned_500"]

In [15]:
chunk_sizes = [512]
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo-0125")

# `model_name` is only bound once the loop below starts, so referencing it here
# raised NameError on a fresh kernel (it worked only with state left over from an
# earlier run). Use the first configured model explicitly instead.
# NOTE(review): if WeaviateWCS must be built with the same model as each collection,
# create one client per model inside the loop — confirm against WeaviateWCS.
client = WeaviateWCS(endpoint=url, api_key=api_key, model_name_or_path=model_names[0])

try:
    for chunk_size in chunk_sizes:
        for model_name, bas in zip(model_names, base):
            # Chunk length is measured in tokens of the gpt-3.5 encoding; no overlap between chunks.
            gpt35_txt_splitter = SentenceSplitter(
                chunk_size=chunk_size, tokenizer=encoding.encode, chunk_overlap=0
            )
            outpath = f"../data/huberman_{bas}_{chunk_size}"
            model = load_pretrained_model(model_name)
            create_dataset(
                data, model, gpt35_txt_splitter, save_to_disk=True, file_outpath=outpath
            )

            # Reload the dataset that was just written; the save step logs the
            # file name with a ".parquet" suffix appended to `outpath`.
            data_path = f"../data/huberman_{bas}_{chunk_size}.parquet"

            data_pqt = FileIO.load_parquet(data_path)

            collection_name = f"Huberman_{bas}_{chunk_size}"

            client.create_collection(
                collection_name=collection_name,
                properties=properties,
                description="Huberman Labs: 193 full-length transcripts",
            )

            indexer = WeaviateIndexer(client)

            batch_object = indexer.batch_index_data(data_pqt, collection_name)
finally:
    # Always release the connection, even if dataset creation or indexing fails part-way.
    client.close()

100%|██████████| 193/193 [00:18<00:00, 10.50it/s]
100%|██████████| 193/193 [06:37<00:00,  2.06s/it]
[32m2024-05-21 01:53:46.894[0m | [1mINFO    [0m | [36mpreprocessing[0m:[36msave_as_parquet[0m:[36m42[0m - [1mDataFrame saved as parquet file here: ../data/huberman_bge_finetuned_500_512.parquet[0m


Shape of data: (11602, 13)
Memory Usage: 1.15+ MB
Collection "Huberman_bge_finetuned_500_512" created


  indexer = WeaviateIndexer(client)
100%|██████████| 11602/11602 [00:19<00:00, 595.29it/s]


Processing finished in 0.58 minutes.
Batch job completed with zero errors.
