In [32]:
%load_ext autoreload
%autoreload 2

import json
import os
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor, as_completed
from typing import List, Dict, Tuple

from dotenv import load_dotenv
load_dotenv('./.env', override=True)

import openai
import pandas as pd
from llama_index.text_splitter import SentenceSplitter
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

from preprocessing import FileIO, Splitters, Vectorizor

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# from llama_hub.file.pymu_pdf.base import PyMuPDFReader
# loader = PyMuPDFReader()
# docs = loader.load('./data/llama2.pdf')
# len(docs)

### 1.) Import Podcast Transcripts

In [3]:
# Root directory of the project. Defaults to the original location; set the
# VECTOR_SEARCH_ROOT environment variable (e.g. in ./.env, which is loaded at
# the top of the notebook) to run from a different checkout.
PROJECT_ROOT = os.environ.get('VECTOR_SEARCH_ROOT', '/home/elastic/notebooks/vector_search_applications')
# JSON corpus that the next cell loads and filters on the 'content' field.
data_path = os.path.join(PROJECT_ROOT, 'data', 'impact_theory_metadata.json')

In [4]:
%%time
with open(data_path) as f:
    data =  json.load(f)
    data = [d for d in data if d.get('content')]
len(data)

CPU times: user 30.6 ms, sys: 43.6 ms, total: 74.3 ms
Wall time: 73.7 ms


385

In [8]:
# data[384]

### 2a.) Split Text into Sentences - LlamaIndex

In [7]:
#TODO: Dig into why text_splitter is using NLTK tokenizer under the hood. 

In [15]:
# LlamaIndex splitter configured with a chunk size of 172 and an overlap of 20
# (units as defined by SentenceSplitter).
text_splitter = SentenceSplitter(
    chunk_size=172,
    chunk_overlap=20,
)
# Project helper whose split_corpus method is called in the next cell.
splitter = Splitters()

In [18]:
# Run text_splitter over every entry in `data`. The result is indexed by key,
# and each key holds a sized collection of chunks (it is consumed that way in
# the following cell). The recorded run processed 385 docs in about 21 s.
split_dict = splitter.split_corpus(data, text_splitter)

Docs: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 385/385 [00:21<00:00, 17.61it/s]


In [20]:
# Number of chunks stored under each key, then the total across the corpus.
lens = [len(chunks) for chunks in split_dict.values()]
sum(lens)

42863

### 2b.) Split Text into Sentences - SentenceSplitter

In [21]:
%%time
utils = Utilities()
transcripts = [d.get('content', '') for d in data]
print(len(transcripts))
transcripts = [text for text in transcripts if text]
print(len(transcripts))
split_sentences = [utils.sentence_splitter(text) for text in transcripts]

387
385
CPU times: user 5.09 s, sys: 35.2 ms, total: 5.13 s
Wall time: 5.13 s


In [24]:
# Summary statistics of how many pieces each transcript was split into.
pd.DataFrame(list(map(len, split_sentences))).describe()

Unnamed: 0,0
count,385.0
mean,857.015584
std,642.827887
min,1.0
25%,469.0
50%,668.0
75%,1056.0
max,5342.0


### 3.) Encode Chunks as Vectors

### 3a.) SentenceTransformers

In [8]:
# sbert = SentenceTransformer('all-MiniLM-L6-v2'). ##  35 seconds to encode all ImpactTheory 

# model = SentenceTransformer(model_path, device='cuda:0') ## 136 seconds to encode all ImpactTheory

In [38]:
# Root directory of the project. Defaults to the original location; set the
# VECTOR_SEARCH_ROOT environment variable to run from a different checkout.
PROJECT_ROOT = os.environ.get('VECTOR_SEARCH_ROOT', '/home/elastic/notebooks/vector_search_applications')
# The trailing '' component keeps the trailing slash of the original path.
model_path = os.path.join(PROJECT_ROOT, 'models', 'gte-base', '')
# Project wrapper that is later used to encode the chunks and join metadata.
vectorizer = Vectorizor(model_name_or_path=model_path)

In [35]:
%%time
# Encode the contents of split_dict with the model configured on `vectorizer`.
# This is the slow step of the notebook: the recorded run took roughly
# 2 min 16 s of wall time for 385 docs.
merged_dict = vectorizer.encode_from_dict(split_dict)

Docs: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 385/385 [02:16<00:00,  2.82it/s]

CPU times: user 2min 31s, sys: 877 ms, total: 2min 32s
Wall time: 2min 16s





In [107]:
# np.save('./gte_vectors.npy', gte_vectors, allow_pickle=False)

In [39]:
# Join the encoded output back onto the original corpus entries.
# Presumably this yields one record per chunk: the recorded length (42863)
# equals the chunk total computed earlier — TODO confirm.
docs = vectorizer.join_metadata(corpus=data, merged_dict=merged_dict)
len(docs)

42863

In [41]:
# Root directory of the project. Defaults to the original location; set the
# VECTOR_SEARCH_ROOT environment variable to run from a different checkout.
PROJECT_ROOT = os.environ.get('VECTOR_SEARCH_ROOT', '/home/elastic/notebooks/vector_search_applications')
# Persist the joined records as parquet, replacing any existing file
# (overwrite=True).
io = FileIO()
io.save_as_parquet(
    file_path=os.path.join(PROJECT_ROOT, 'data', 'impact_theory_GTE.parquet'),
    data=docs,
    overwrite=True,
)

2023-10-09 22:00:01.540 | INFO     | preprocessing:save_as_parquet:34 - DataFrame saved as parquet file here: /home/elastic/notebooks/vector_search_applications/data/impact_theory_GTE.parquet


### 3b.) OpenAI Ada Embeddings

In [104]:
from openai.embeddings_utils import get_embedding, cosine_similarity

# The key is read from the environment (populated from ./.env by load_dotenv);
# it must never be pasted into the notebook or echoed in a cell output.
openai.api_key = os.environ['OPENAI_API_KEY']
model = "text-embedding-ada-002"

# Get the cost first, before sending anything to the embeddings endpoint.
# NOTE(review): `Tokenizer` is not imported in any visible cell — confirm
# which module provides it.
tokenizer = Tokenizer(model_type="cl100k_base", price=0.001)

# cost = tokenizer.get_cost(text_chunks)

In [103]:
# Only confirm that a key is configured. Echoing openai.api_key itself writes
# the secret into the saved notebook output.
bool(openai.api_key)

True

#### Working around OpenAI rate limits

In [93]:
# Split text_chunks into groups of roughly 1 million tokens each so that every
# group can be costed (and later embedded) within the OpenAI rate limits.
# NOTE(review): `text_chunks` is not assigned in any visible cell — presumably
# the flattened chunks of split_dict; confirm before running on a fresh kernel.
CHUNKS_PER_GROUP = 6000  # the recorded runs show just under 900k tokens per 6000 chunks
# Iterate over the real corpus length instead of a hard-coded upper bound, so
# no chunks are skipped if the corpus grows and no empty groups are sent if it shrinks.
for num in range(0, len(text_chunks), CHUNKS_PER_GROUP):
    chunks = text_chunks[num:num + CHUNKS_PER_GROUP]
    cost = tokenizer.get_cost(chunks)
    

Total Tokens: 890642	Cost: 0.891
Total Tokens: 889149	Cost: 0.889
Total Tokens: 892516	Cost: 0.893
Total Tokens: 887084	Cost: 0.887
Total Tokens: 892144	Cost: 0.892
Total Tokens: 887583	Cost: 0.888
Total Tokens: 876077	Cost: 0.876
Total Tokens: 126477	Cost: 0.126


In [55]:
# openai.Embedding.create(text_chunks[:2], engine=model)

In [106]:
# Only confirm that a key is configured. Echoing openai.api_key itself writes
# the secret into the saved notebook output.
bool(openai.api_key)

True

In [56]:
# %%time
# embeddings = []
# for num in range(0,43000,6000):
#     chunks = text_chunks[num:num+6000]
#     results = openai.Embedding.create(input=chunks, engine=model)
#     embeddings.append(results)
#     time.sleep(60)

In [None]:
# Extract the raw embedding from every item of the response payload.
# NOTE(review): `results` is only assigned inside the commented-out batching
# loop, where it holds the response of the last batch only — confirm whether
# all batches collected in `embeddings` were meant to be used here.
vectors = [item['embedding'] for item in results['data']]
len(vectors)