In [1]:
%load_ext autoreload
%autoreload 2

import json
import os
from dotenv import load_dotenv
# load_dotenv('./.env', override=True)

from tqdm import tqdm
from collections import defaultdict
from typing import List, Dict, Tuple
import pandas as pd
import numpy as np

from preprocessing import FileIO, Vectorizor, Splitters
from sentence_transformers import SentenceTransformer
from llama_index.text_splitter import SentenceSplitter
from concurrent.futures import ProcessPoolExecutor, as_completed

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Step 1 -->  Import Podcast Transcripts

In [2]:
# Relative path to the podcast metadata/transcript JSON that the next cell loads.
data_path = './data/impact_theory_metadata.json'

In [3]:
# Load the transcript corpus; should see 385 unique podcast entries.
# The encoding is stated explicitly so the read does not depend on the
# platform's default locale (JSON text is UTF-8 by convention).
with open(data_path, encoding='utf-8') as f:
    data = json.load(f)
len(data)

385

In [4]:
# Rough statistics on transcript size, measured in whitespace-separated words
# of each entry's 'content' field.
lens = list(map(lambda entry: len(entry['content'].split()), data))
df = pd.DataFrame({'Lengths': lens})
df.describe()

Unnamed: 0,Lengths
count,385.0
mean,12884.654545
std,7741.43973
min,1819.0
25%,7891.0
50%,9899.0
75%,16860.0
max,48502.0


In [5]:
# Peek at what a data entry looks like (use the shortest transcript).
# The arg-min is taken positionally over the 'Lengths' column itself; handing the
# whole DataFrame to np.argmin only works through implicit flattening of a 2-D
# frame, which is fragile across numpy/pandas versions.
data[int(df['Lengths'].to_numpy().argmin())]

{'author': 'Tom Bilyeu',
 'title': '"Real Life SEX ROBOTS Are Coming..." - The Dangers Of Seductive AI | Mo Gawdat',
 'video_id': 'IK9lN__kBXs',
 'playlist_id': 'PL8qcvQ7Byc3OJ02hbWJbHWePh4XEg3cvo',
 'channel_id': 'UCnYMOamNKLGVlJgRUbamveA',
 'description': 'No description provided',
 'keywords': [],
 'length': 684,
 'publish_date': '07-08-2023',
 'thumbnail_url': 'https://i.ytimg.com/vi/IK9lN__kBXs/hq720.jpg',
 'views': 61134,
 'age_restricted': False,
 'content': "what are the near-term disruptions? The one that freaks me out, and every time I talk to a parent with a teenage boy, I'm like, your kid is like, sex robots are really gonna be a thing for them, like for real, for real. I worry if I grew up five years from now, I would not graduate from high school. I would just find a sex robot and go into oblivion. What are one, what do you think is the reality of that one in particular? And then I'd love to branch out to some others. So the word robot is interesting, but sex alternatives

## Step 2 -->  Split Text into Sentences - LlamaIndex

In [6]:
#TODO: Dig into why text_splitter is using NLTK tokenizer under the hood. 

In [6]:
# Chunking configuration (discussion on chunk size): the LlamaIndex sentence
# splitter is built from `chunk_size` with an overlap of 20, alongside the
# project's Splitters helper that drives it over the corpus.
chunk_size = 196
text_splitter = SentenceSplitter(chunk_overlap=20, chunk_size=chunk_size)
splitter = Splitters()

In [21]:
# Run `text_splitter` over the whole corpus. The result is indexed per document
# in later cells (e.g. split_dict[0]) and each entry supports len().
split_dict = splitter.split_corpus(data, text_splitter)

Docs: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 385/385 [00:21<00:00, 18.09it/s]


In [22]:
# Number of chunks produced for each document, then the corpus-wide total.
lens = [len(chunks) for chunks in split_dict.values()]
sum(lens)

37007

## Step 3 -->  Encode Chunks as Vectors (Transfer to Google Colab)

### 3a.) SentenceTransformers

In [9]:
# Alternative model choices, kept for reference with the timings recorded for them:
# sbert = SentenceTransformer('all-MiniLM-L6-v2')  ## 35 seconds to encode all ImpactTheory
# model = SentenceTransformer(model_path, device='cuda:0')  ## 136 seconds to encode all ImpactTheory
model = SentenceTransformer('thenlper/gte-base')

In [23]:
# Model identifiers. Only `base_model` is handed to the Vectorizor in this cell;
# `model_path` is assigned but not used here.
# NOTE(review): the parquet file written later in this notebook is named
# "impact_theory_gte_...", while this vectorizer is built from the MiniLM
# identifier — confirm which model is actually intended.
model_path = 'thenlper/gte-base'
base_model = 'sentence-transformers/all-MiniLM-L6-v2'
vectorizer = Vectorizor(model_name_or_path=base_model)

In [17]:
# CPU demonstration
%%time
vectors = []
from tqdm import tqdm
for sent in tqdm(split_dict[0]):
    vectors.append(model.encode(sent, device='cpu'))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 280/280 [00:52<00:00,  5.30it/s]


In [19]:
# Sanity check: the number of encoded vectors next to the number of chunks in the
# first document — the two values are expected to match.
len(vectors), len(split_dict[0])

(280, 280)

In [24]:
%%time
## GPU demonstration
merged_dict = vectorizer.encode_from_dict(split_dict, device='cpu')

Docs: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 385/385 [14:39<00:00,  2.29s/it]

CPU times: user 1h 7min 59s, sys: 6min 2s, total: 1h 14min 2s
Wall time: 14min 39s





In [17]:
# NOTE(review): scratch arithmetic. 385 matches the corpus size shown earlier; the
# meaning of 25 is not documented — confirm whether this cell is still needed.
385/25

15.4

In [107]:
# np.save('./gte_vectors.npy', gte_vectors, allow_pickle=False)

In [21]:
# Combine the corpus metadata in `data` with the encoded output in `merged_dict`
# via the project helper. Presumably this yields one record per chunk (the next
# cell filters the records by their 'video_id' key) — verify against Vectorizor.
docs = vectorizer.join_metadata(corpus=data, merged_dict=merged_dict, create_doc_id=True)
len(docs)

9055

In [22]:
# Spot-check: gather every record that belongs to one specific video and count them.
test = list(filter(lambda record: record['video_id'] == 'mrND5lSPEQU', docs))
len(test)

280

In [51]:
# Persist the records as parquet in the same directory as the input corpus.
# The location is derived from `data_path` rather than a machine-specific absolute
# path, so the notebook runs from any checkout of the project.
io = FileIO()
output_path = os.path.join(os.path.dirname(data_path), f'impact_theory_gte_{chunk_size}.parquet')
io.save_as_parquet(file_path=output_path, data=docs, overwrite=True)

2023-10-12 20:19:15.464 | INFO     | preprocessing:save_as_parquet:34 - DataFrame saved as parquet file here: /home/elastic/notebooks/vector_search_applications/data/impact_theory_gte_128.parquet
Bad pipe message: %s [b"q(\xeaMV\xab\xc0\x03\xd3\xaf\x94<\xc1\xbe\xd8\x1a\x8az\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x002\x00\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc0\x9c\xc0P\x00=\x00<\x005"]
Bad pipe message: %s [b'B\xa4\x83\x950y9\xd2\xbf\x87\x1b\xfc\x83\x9e\xd1\x8bx\x86\x00\x00>\xc0\x14\xc0\n\x009\x008\x007\x006\xc0\x0f\xc0\x05\x005\xc0\x13\xc0\t\x003\x002\x001']
Bad pipe message: %s [b'\xac\x143\xff\\\xfe\xa7\x7f\xf5\xf3M\xabz\x11=\r\x1a\xb0\x00\x00\xa2\xc0\x14\xc0\n\x009\x008\x007\x006\x00\x88\x00\x87\

### 3b.) OPTIONAL: OpenAI Ada Embeddings

In [104]:
# Imported in this cell (not the notebook's import cell) because the OpenAI section
# is optional. `import openai` is required: the `from openai.embeddings_utils ...`
# form alone does not bind the name `openai` that is used below.
import openai
from openai.embeddings_utils import get_embedding, cosine_similarity

# Read the key from the environment so it never appears in the notebook source.
openai.api_key = os.environ['OPENAI_API_KEY']
# NOTE(review): this rebinds `model`, which earlier cells use for the
# SentenceTransformer instance — re-run that cell before encoding locally again.
model = "text-embedding-ada-002"
#get cost first
# NOTE(review): `Tokenizer` is not imported anywhere in the visible notebook —
# confirm which module provides it and add the import.
tokenizer = Tokenizer(model_type="cl100k_base", price=0.001)

# cost = tokenizer.get_cost(text_chunks)

In [103]:
# Show only the key's prefix to confirm it is loaded. Cell outputs are stored in
# the notebook file, so echoing the full value would leak the secret.
f"{openai.api_key[:3]}..."

'sk-...'

#### Working around OpenAI rate limits

In [93]:
# Split text_chunks into groups of roughly 1 million tokens total per group and
# report the cost of each group. The loop bound comes from the data rather than a
# hard-coded 43000, so no trailing chunks are skipped (and no empty groups are
# priced) when the corpus size changes.
group_size = 6000
for num in range(0, len(text_chunks), group_size):
    chunks = text_chunks[num:num + group_size]
    cost = tokenizer.get_cost(chunks)

Total Tokens: 890642	Cost: 0.891
Total Tokens: 889149	Cost: 0.889
Total Tokens: 892516	Cost: 0.893
Total Tokens: 887084	Cost: 0.887
Total Tokens: 892144	Cost: 0.892
Total Tokens: 887583	Cost: 0.888
Total Tokens: 876077	Cost: 0.876
Total Tokens: 126477	Cost: 0.126


In [55]:
# openai.Embedding.create(text_chunks[:2], engine=model)

In [106]:
# Show only the key's prefix to confirm it is loaded. Cell outputs are stored in
# the notebook file, so echoing the full value would leak the secret.
f"{openai.api_key[:3]}..."

'sk-...'

In [56]:
# %%time
# embeddings = []
# for num in range(0,43000,6000):
#     chunks = text_chunks[num:num+6000]
#     results = openai.Embedding.create(input=chunks, engine=model)
#     embeddings.append(results)
#     time.sleep(60)

In [None]:
# Pull the raw embedding out of every item in the response's 'data' list.
# NOTE(review): `results` is only assigned inside the commented-out embedding loop
# earlier in this notebook, so this cell needs that loop to have been run.
vectors = [item['embedding'] for item in results['data']]
len(vectors)