In [1]:
%load_ext autoreload
%autoreload 2

#load from local .env file
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

import sys
sys.path.append('../../')

#standard libraries
import json
import os
import time
from typing import List
from math import ceil

#external libraries
import pandas as pd
import numpy as np
from rich import print
from torch import cuda
from tqdm import tqdm
import tiktoken # bad ass tokenizer library for use with OpenAI LLMs 
from llama_index.text_splitter import SentenceSplitter #one of the best on the market
from sentence_transformers import SentenceTransformer

#external files
from src.preprocessor.preprocessing import FileIO
from src.evaluation.retrieval_evaluation import execute_evaluation
from src.database.weaviate_interface_v4 import WeaviateWCS, WeaviateIndexer
from src.database.database_utils import get_weaviate_client
from src.database.properties_template import properties
from src.pipelines.pipeline import (chunk_data, create_vectors, join_docs, 
                                    create_dataset, groupby_episode, create_parent_chunks,
                                    convert_raw_data)
from torch import cuda 
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, losses, InputExample, models

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
def load_pretrained_model(pretrained_model_name: str='sentence-transformers/all-MiniLM-L6-v2'):
    '''
    Assembles a SentenceTransformer for finetuning from a pretrained checkpoint.

    A transformer module is loaded for `pretrained_model_name` and paired with
    a pooling module sized to that transformer's word-embedding dimension.

    Args:
        pretrained_model_name: Model name or path handed to `models.Transformer`.

    Returns:
        A SentenceTransformer built from the [transformer, pooling] modules.
    '''
    transformer = models.Transformer(pretrained_model_name)
    embedding_dim = transformer.get_word_embedding_dimension()
    pooling = models.Pooling(embedding_dim)
    return SentenceTransformer(modules=[transformer, pooling])

In [3]:
def load_data(data_path: str='../../answer_key/data/qa_triplets.json',
              triplets: bool=True,
              batch_size: int=32,
              shuffle: bool=True):
    '''
    Loads training samples from disk and returns a Pytorch Dataloader.

    Every sample read from `data_path` must provide the keys 'anchor' and
    'positive'; when `triplets` is True it must also provide 'hard_negative'.
    Each sample becomes one InputExample whose texts follow that key order.

    Args:
        data_path: JSON file passed to `FileIO.load_json`.
        triplets: True builds (anchor, positive, hard_negative) examples,
            False builds (anchor, positive) pairs.
        batch_size: Batch size given to the DataLoader. Exposed because it is
            a tuning knob for the ranking loss that `train_model` defaults to.
        shuffle: Whether the DataLoader shuffles the examples.

    Returns:
        A DataLoader over the list of InputExample objects.

    Raises:
        KeyError: If a sample is missing one of the required keys.
    '''
    data = FileIO.load_json(data_path)
    # The hard negative is the only difference between the two example shapes.
    text_keys = ['anchor', 'positive', 'hard_negative'] if triplets else ['anchor', 'positive']
    train_examples = [InputExample(texts=[sample[key] for key in text_keys])
                      for sample in data]
    return DataLoader(train_examples, shuffle=shuffle, batch_size=batch_size)

In [4]:
def train_model(epochs: int, model, dataloader, loss_fx=losses.MultipleNegativesRankingLoss):
    '''
    Finetunes `model` on `dataloader` and returns the same model object.

    The loss is created by calling `loss_fx(model=model)`. Warmup lasts for
    10% of the total step count (batches per epoch * epochs), truncated to
    an integer.

    Args:
        epochs: Number of passes over `dataloader`.
        model: Object exposing `fit(train_objectives, epochs, warmup_steps)`.
        dataloader: Sized iterable of training batches.
        loss_fx: Loss factory that accepts the model as the keyword `model`.

    Returns:
        The `model` that was passed in, after `fit` has been called on it.
    '''
    objective = (dataloader, loss_fx(model=model))
    total_steps = len(dataloader) * epochs
    model.fit(
        train_objectives=[objective],
        epochs=epochs,
        warmup_steps=int(total_steps * 0.1),
    )
    return model

In [5]:
def train_pipe(pretrained_model_name: str, epochs: int, triplets: bool):
    '''
    Runs the whole finetuning flow: load the model, load the data, train.

    Args:
        pretrained_model_name: Checkpoint passed to `load_pretrained_model`.
        epochs: Number of training epochs passed to `train_model`.
        triplets: Forwarded to `load_data` to select triplet or pair examples.

    Returns:
        Whatever `train_model` returns for the loaded model and dataloader.
    '''
    # Arguments evaluate left to right, so the model is loaded before the data.
    return train_model(
        epochs,
        load_pretrained_model(pretrained_model_name),
        load_data(triplets=triplets),
    )

In [110]:
# Finetune the bge-small checkpoint for 2 epochs on anchor/positive pairs.
pretrained_model = 'BAAI/bge-small-en-v1.5'
ft_model = train_pipe(
    pretrained_model_name=pretrained_model,
    epochs=2,
    triplets=False,
)

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/16 [00:00<?, ?it/s]

Iteration:   0%|          | 0/16 [00:00<?, ?it/s]

In [111]:
def save_model(model, model_path: str, model_name: str):
    '''
    Saves `model` through its own `save` method and returns the save location.

    Args:
        model: Object exposing `save(path=..., model_name=...)`.
        model_path: Destination passed to `save` as `path`.
        model_name: Name passed to `save` as `model_name`.

    Returns:
        `model_path`, unchanged, so the caller can reuse it.
    '''
    save_kwargs = {'path': model_path, 'model_name': model_name}
    model.save(**save_kwargs)
    return model_path

In [112]:
# Persist the finetuned model; save_model returns the path it was given.
model_path = save_model(
    model=ft_model,
    model_path='../../models/bge-small-finetuned-256-couplets-2',
    model_name='bge-small-finetuned-256-couplets-2',
)

In [113]:
# Echo the save location returned by save_model.
model_path

'../../models/bge-small-finetuned-256-couplets-2'

### Set Constants

In [114]:
# Chunk size for this experiment run.
# NOTE(review): the build and evaluation cells below pass the literal 256
# instead of this constant — keep them in sync if the value changes.
chunk_size = 256

In [115]:
def build_index_dataset(model_path: str, 
                        chunk_size: int,
                        finetuned: bool=True,
                        outpath: str='../../answer_key/data/',
                        data_path: str='../../data/huberman_labs.json'):
    '''
    Creates a dataset with the model at `model_path`, then indexes it on Weaviate.

    The last path component of `model_path` names both the dataset output
    prefix and the Weaviate collection ('Huberman_<name>', with '-' replaced
    by '_' and '.' removed). A trailing path separator on `model_path` is
    tolerated.

    Args:
        model_path: Path or name of the SentenceTransformer model to load. It
            is also passed to `get_weaviate_client` as `model_name_or_path`.
        chunk_size: Chunk size given to the SentenceSplitter; token counts
            come from tiktoken's 'cl100k_base' encoding.
        finetuned: Currently unused; kept so existing callers keep working.
        outpath: Directory joined with the model name to form the
            `file_outpath_prefix` given to `create_dataset`.
        data_path: JSON corpus loaded through `FileIO.load_json`.

    Returns:
        Tuple of (collection_name, client), where `client` is the Weaviate
        client used for indexing.
    '''
    # normpath drops a trailing separator, so 'models/foo/' still yields 'foo'
    # rather than the empty string os.path.split would return.
    model_ext = os.path.basename(os.path.normpath(model_path))
    # Use the first GPU when one is present, otherwise run on CPU instead of failing.
    device = 'cuda:0' if cuda.is_available() else 'cpu'
    model = SentenceTransformer(model_path, device=device)
    encoding = tiktoken.get_encoding(encoding_name='cl100k_base')
    splitter = SentenceSplitter(chunk_overlap=0, chunk_size=chunk_size, tokenizer=encoding.encode)
    data = FileIO.load_json(data_path)
    docs = create_dataset(data, model, splitter,
                          file_outpath_prefix=os.path.join(outpath, model_ext),
                          overwrite_existing=True)
    client = get_weaviate_client(model_name_or_path=model_path)
    collection_name = f'Huberman_{model_ext}'.replace('-', '_').replace('.','')
    indexer = WeaviateIndexer(client)
    indexer.create_collection(collection_name, 
                              properties, 
                              description='Full index of 193 Huberman Labs episodes as of April 5, 2024')
    indexer.batch_index_data(docs, collection_name, properties=properties)
    return collection_name, client

### Create + Index Dataset

In [116]:
# Build the dataset and index it, reusing the chunk_size constant set above
# so the value cannot drift from a duplicated literal.
collection_name, client = build_index_dataset(model_path, chunk_size=chunk_size, finetuned=True)

CHUNKING:   0%|          | 0/193 [00:00<?, ?it/s]

VECTORS:   0%|          | 0/193 [00:00<?, ?it/s]

[32m2024-05-13 13:36:55.555[0m | [1mINFO    [0m | [36msrc.preprocessor.preprocessing[0m:[36msave_as_parquet[0m:[36m42[0m - [1mDataFrame saved as parquet file here: ../../answer_key/data/bge-small-finetuned-256-couplets-2-256.parquet[0m


Collection "Huberman_bge_small_finetuned_256_couplets_2" created


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 23905/23905 [00:33<00:00, 703.21it/s]


Processing finished in 1.15 minutes.
Batch job completed with zero errors.


  collection_name, client = build_index_dataset(model_path, chunk_size=256, finetuned=True)


### Measure Retrieval Performance

In [117]:
# Load the golden question set used to score retrieval.
golden_path = '../../data/golden_datasets/golden_256.json'
goldens = FileIO.load_json(golden_path)
# Echo the collection name currently bound in the kernel.
collection_name

'Huberman_bge_small_finetuned_256_couplets_2'

In [131]:
# NOTE(review): this cell rebinds model_path, client and collection_name,
# replacing the values produced by the build/index cell. Evaluation cells run
# after it query 'Huberman_bge_small_finetuned_256_2', not the collection
# that was just built.
model_path = '../../models/bge-small-finetuned-256-2/'
client = get_weaviate_client(model_name_or_path=model_path)
collection_name = 'Huberman_bge_small_finetuned_256_2'

In [134]:
# Score vector-only retrieval against the golden questions, without a reranker.
test = execute_evaluation(
    goldens,
    collection_name,
    client,
    reranker=None,
    retrieve_limit=5,
    chunk_size=256,
    search_type=['vector'],
)

Queries: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:10<00:00,  9.13it/s]


In [136]:
# Display the metrics dict returned by execute_evaluation.
test

{'n': 5,
 'top_k': 5,
 'alpha': 0.5,
 'Retriever': '../../models/bge-small-finetuned-256-2/',
 'Ranker': 'None',
 'chunk_size': 256,
 'query_props': ['content'],
 'total_misses': 14,
 'total_questions': 100,
 'vector_hit_rate': 0.86,
 'vector_mrr': 0.69}

In [127]:
# Saved evaluation metrics for the 'couplets-2' finetune.
# NOTE(review): this name is not assigned in any cell shown here; it is
# presumably left over from an earlier run — confirm before Restart & Run All.
bge_small_finetuned_couplets_2

{'n': 3,
 'top_k': 5,
 'alpha': 0.5,
 'Retriever': '../../models/bge-small-finetuned-256-couplets-2',
 'Ranker': 'None',
 'chunk_size': 256,
 'query_props': ['content'],
 'total_misses': 22,
 'total_questions': 100,
 'vector_hit_rate': 0.78,
 'vector_mrr': 0.68}

In [108]:
# Saved evaluation metrics for the 'finetuned-256-1' model.
# NOTE(review): this name is not assigned in any cell shown here; it is
# presumably left over from an earlier run — confirm before Restart & Run All.
bge_small_finetuned_1

{'n': 5,
 'top_k': 5,
 'alpha': 0.5,
 'Retriever': '../../models/bge-small-finetuned-256-1',
 'Ranker': 'None',
 'chunk_size': 256,
 'query_props': ['content'],
 'total_misses': 17,
 'total_questions': 100,
 'vector_hit_rate': 0.83,
 'vector_mrr': 0.66}

In [98]:
# Saved evaluation metrics for the 'finetuned-256-2' model.
# NOTE(review): this name is not assigned in any cell shown here; it is
# presumably left over from an earlier run — confirm before Restart & Run All.
bge_small_finetuned_2

{'n': 5,
 'top_k': 5,
 'alpha': 0.5,
 'Retriever': '../../models/bge-small-finetuned-256-2',
 'Ranker': 'None',
 'chunk_size': 256,
 'query_props': ['content'],
 'total_misses': 14,
 'total_questions': 100,
 'vector_hit_rate': 0.86,
 'vector_mrr': 0.69}

In [129]:
# Saved evaluation metrics for the 'finetuned-256-couplets' model.
# NOTE(review): this name is not assigned in any cell shown here; it is
# presumably left over from an earlier run — confirm before Restart & Run All.
bge_small_couplets

{'n': 5,
 'top_k': 5,
 'alpha': 0.5,
 'Retriever': '../../models/bge-small-finetuned-256-couplets',
 'Ranker': 'None',
 'chunk_size': 256,
 'query_props': ['content'],
 'total_misses': 15,
 'total_questions': 100,
 'vector_hit_rate': 0.85,
 'vector_mrr': 0.69}

In [50]:
# Saved evaluation metrics for the 'bge-small-finetuned-256' model.
# NOTE(review): this name is not assigned in any cell shown here, and it is
# spelled 'fintuned' — confirm it matches wherever the value was bound.
bge_small_fintuned_4

{'n': 5,
 'top_k': 5,
 'alpha': 0.5,
 'Retriever': '../../models/bge-small-finetuned-256',
 'Ranker': 'None',
 'chunk_size': 256,
 'query_props': ['content'],
 'total_misses': 17,
 'total_questions': 100,
 'vector_hit_rate': 0.83,
 'vector_mrr': 0.66}

In [38]:
# Saved evaluation metrics for the un-finetuned 'BAAI/bge-small-en-v1.5' model.
# NOTE(review): this name is not assigned in any cell shown here; it is
# presumably left over from an earlier run — confirm before Restart & Run All.
bge_small_baseline

{'n': 5,
 'top_k': 5,
 'alpha': 0.5,
 'Retriever': 'BAAI/bge-small-en-v1.5',
 'Ranker': 'None',
 'chunk_size': 256,
 'query_props': ['content'],
 'total_misses': 18,
 'total_questions': 100,
 'vector_hit_rate': 0.82,
 'vector_mrr': 0.62}

In [123]:
# Recorded metrics for the finetuned MiniLM model, kept for comparison.
ft = {
    'n': 5,
    'top_k': 5,
    'alpha': 0.5,
    'Retriever': '../../models/minilm-finetuned-256/',
    'Ranker': 'None',
    'chunk_size': 256,
    'query_props': ['content'],
    'total_misses': 19,
    'total_questions': 100,
    'vector_hit_rate': 0.81,
    'vector_mrr': 0.66,
}

In [124]:
# Recorded metrics for the un-finetuned MiniLM model, kept for comparison.
baseline = {
    'n': 5,
    'top_k': 5,
    'alpha': 0.5,
    'Retriever': 'sentence-transformers/all-MiniLM-L6-v2',
    'Ranker': 'None',
    'chunk_size': 256,
    'query_props': ['content'],
    'total_misses': 29,
    'total_questions': 100,
    'vector_hit_rate': 0.71,
    'vector_mrr': 0.59,
}

In [125]:
# Display the MiniLM baseline metrics recorded above.
baseline

{'n': 5,
 'top_k': 5,
 'alpha': 0.5,
 'Retriever': 'sentence-transformers/all-MiniLM-L6-v2',
 'Ranker': 'None',
 'chunk_size': 256,
 'query_props': ['content'],
 'total_misses': 29,
 'total_questions': 100,
 'vector_hit_rate': 0.71,
 'vector_mrr': 0.59}