In [1]:
%load_ext autoreload
%autoreload 2

import json
import os
from dotenv import load_dotenv
# load_dotenv('./.env', override=True)

from tqdm import tqdm
from collections import defaultdict
from typing import List, Dict, Tuple
import pandas as pd
import numpy as np

from preprocessing import FileIO, Vectorizor, Splitters, Utilities
from sentence_transformers import SentenceTransformer
from llama_index.text_splitter import SentenceSplitter
from concurrent.futures import ProcessPoolExecutor, as_completed
from tiktoken impo

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Step 1 -->  Import Podcast Transcripts

In [2]:
data_path = './data/impact_theory_data.json'

# Load the raw podcast corpus; the dataset should contain 385 unique podcast entries.
# The encoding is pinned to UTF-8 so the file decodes the same way on every
# platform instead of depending on the locale's default encoding.
with open(data_path, encoding='utf-8') as f:
    data = json.load(f)
len(data)

385

In [3]:
# Rough statistics on content length: whitespace-delimited word count per podcast.
lens = [len(entry['content'].split()) for entry in data]
df = pd.DataFrame(data=lens, columns=['Lengths'])
# Show the row(s) whose transcript is exactly 16860 words long.
df.loc[df['Lengths'].eq(16860)]

Unnamed: 0,Lengths
94,16860


In [5]:
# data[94]

In [7]:
#peek at what a data entry looks like (use shortest transcript)
# data[np.argmin(df)]

## Step 2 -->  Split Text into Sentences - LlamaIndex

In [8]:
from transformers import AutoTokenizer
import tiktoken # bad ass tokenizer library for use with OpenAI LLMs 
from llama_index.text_splitter import SentenceSplitter #one of the best on the market

# Tokenizer belonging to the embedding model of choice.
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# tiktoken encoding for use with ChatGPT-3.5-Turbo.
encoding = tiktoken.encoding_for_model('gpt-3.5-turbo')

# Three SentenceSplitters sharing one chunk size and zero overlap; they differ
# only in the tokenizer callable handed to them (library default, MiniLM, gpt-3.5).
chunk_size = 256
default_txt_splitter = SentenceSplitter(chunk_overlap=0, chunk_size=chunk_size)
allmini_txt_splitter = SentenceSplitter(
    chunk_overlap=0,
    chunk_size=chunk_size,
    tokenizer=tokenizer.encode,
)
gpt35_txt_splitter = SentenceSplitter(
    chunk_overlap=0,
    chunk_size=chunk_size,
    tokenizer=encoding.encode,
)

In [9]:
def assingnment_1_1(data, gpt35_txt_splitter, content_field='content'):
    """Split the text of every entry into chunks.

    Args:
        data: Iterable of mappings, each holding its text under ``content_field``.
        gpt35_txt_splitter: Object exposing ``split_text(text)``.
        content_field: Key of the text to split in each entry.

    Returns:
        A list with one ``split_text`` result per entry, in input order.
    """
    chunks_per_entry = []
    for entry in tqdm(data):
        text = entry[content_field]
        chunks_per_entry.append(gpt35_txt_splitter.split_text(text))
    return chunks_per_entry

In [14]:
# Run the gpt-3.5-turbo-tokenized splitter over every transcript; `splits` holds
# one split_text result per podcast entry, in the same order as `data`.
splits = assingnment_1_1(data, gpt35_txt_splitter)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 385/385 [00:30<00:00, 12.73it/s]


In [15]:
# Number of chunks produced for each episode. Uses `splits` (the result of
# assingnment_1_1) instead of `split_dict`, which is not defined anywhere in
# this notebook and raises NameError on a fresh kernel.
lens = [len(episode_chunks) for episode_chunks in splits]
pd.DataFrame(lens).describe()

Unnamed: 0,0
count,385.0
mean,69.215584
std,42.037964
min,10.0
25%,42.0
50%,53.0
75%,90.0
max,269.0


## Step 3 -->  Encode Chunks as Vectors (Transfer to Google Colab)

### 3a.) SentenceTransformers

In [19]:
# Checkpoint that is actually loaded below.
base_model = 'sentence-transformers/all-MiniLM-L6-v2'
# Second checkpoint name; it is not passed to SentenceTransformer in this cell.
model_path = 'thenlper/gte-base'
model = SentenceTransformer(base_model)

In [17]:
# # CPU demonstration
# %%time
# vectors = []
# from tqdm import tqdm
# for sent in tqdm(split_dict[0]):
#     vectors.append(model.encode(sent, device='cpu'))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 280/280 [00:52<00:00,  5.30it/s]


In [19]:
# len(vectors), len(split_dict[0])

(280, 280)

In [22]:
%%time
## GPU demonstration
def assignment_1_2(content_splits: List[List[str]],
                   model: "SentenceTransformer",
                   device: str = 'cuda:0') -> List[List[tuple]]:
    """Encode every chunk of every episode and pair each chunk with its vector.

    Args:
        content_splits: One list of text chunks per episode.
        model: Encoder exposing ``encode(sentences, show_progress_bar=..., device=...)``.
        device: Device string forwarded to ``model.encode``. Defaults to
            ``'cuda:0'`` (the previously hard-coded value); pass ``'cpu'`` to
            run on a machine without a GPU.

    Returns:
        One list per episode of ``(chunk_text, vector)`` tuples, in chunk order.
    """
    text_vector_tuples = []
    for episode_chunks in tqdm(content_splits):
        # One encode call per episode; the per-call progress bar is silenced
        # because tqdm already tracks progress across episodes.
        vectors = model.encode(episode_chunks, show_progress_bar=False, device=device)
        text_vector_tuples.append(list(zip(episode_chunks, vectors)))
    return text_vector_tuples

# Embed all episode chunks; `tvt` holds, per episode, a list of (chunk text, vector) tuples.
tvt = assignment_1_2(splits, model)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 385/385 [00:32<00:00, 11.81it/s]

CPU times: user 46.4 s, sys: 697 ms, total: 47 s
Wall time: 32.6 s





In [23]:
def join(corpus, tuples):
    """Flatten episodes into one record per chunk.

    For the episode at position ``i`` of ``corpus``, every item of ``tuples[i]``
    becomes its own dict: a copy of the episode's fields (minus ``'content'``)
    plus ``'doc_id'`` (``<video_id>_<chunk position>``), ``'content'``
    (``item[0]``) and ``'content_embedding'`` (``item[1]``).

    Returns:
        The flat list of chunk-level dicts, ordered by episode and then by chunk.
    """
    records = []
    for episode_idx, entry in enumerate(corpus):
        for chunk_idx, pair in enumerate(tuples[episode_idx]):
            # Fresh dict per chunk so records never share state; the
            # episode-level transcript is left out and replaced by the chunk text.
            record = {name: value for name, value in entry.items() if name != 'content'}
            video_id = record['video_id']
            record.update(
                doc_id=f'{video_id}_{chunk_idx}',
                content=pair[0],
                content_embedding=pair[1],
            )
            records.append(record)
    return records
# Build one record per chunk from the episode metadata and the (text, vector) pairs,
# then display how many chunk-level records were produced.
docs = join(data, tvt)
len(docs)

26648

In [24]:
# Display the chunk size currently in effect; it is interpolated into the parquet file name.
chunk_size

256

In [25]:
# Persist the chunk-level records as a parquet file; the chunk size is part of
# the file name, and an existing file at that path is overwritten.
io = FileIO()
io.save_as_parquet(
    file_path=f'/home/elastic/notebooks/vector_search_applications/practice_data/impact_theory_minilm_{chunk_size}.parquet',
    data=docs,
    overwrite=True,
)

[32m2023-11-11 01:36:15.547[0m | [1mINFO    [0m | [36mpreprocessing[0m:[36msave_as_parquet[0m:[36m38[0m - [1mDataFrame saved as parquet file here: /home/elastic/notebooks/vector_search_applications/practice_data/impact_theory_minilm_256.parquet[0m


## Step 3.1 -->  OPTIONAL: OpenAI Ada Embeddings

In [104]:
# `openai` is not imported by any other cell in this notebook, so assigning
# `openai.api_key` raised NameError on a fresh kernel; import it here first.
import openai
from openai.embeddings_utils import get_embedding, cosine_similarity

# The key is read from the environment so it never has to appear in the notebook.
openai.api_key = os.environ['OPENAI_API_KEY']
# NOTE(review): `model` and `tokenizer` are rebound here, replacing the
# SentenceTransformer model and Hugging Face tokenizer created in earlier cells.
model = "text-embedding-ada-002"
#get cost first
# NOTE(review): `Tokenizer` is not imported in any visible cell - confirm where it comes from.
tokenizer = Tokenizer(model_type="cl100k_base", price=0.001)

# cost = tokenizer.get_cost(text_chunks)

In [103]:
# Only confirm that a key is configured. Echoing `openai.api_key` writes the
# secret into the saved notebook output; the key that was exposed here must be rotated.
bool(openai.api_key)

True

#### Working around OpenAI rate limits

In [93]:
#split text_chunks into roughly 1 million tokens total per group
# NOTE(review): `text_chunks` is not defined in any visible cell - presumably a
# flat list of chunk strings; confirm before running on a fresh kernel.
# The upper bound is len(text_chunks) rather than a hard-coded 43000, so no chunk
# is silently skipped if the corpus grows and no empty trailing batch is costed.
batch_size = 6000
for num in range(0, len(text_chunks), batch_size):
    chunks = text_chunks[num:num+batch_size]
    cost = tokenizer.get_cost(chunks)
    

Total Tokens: 890642	Cost: 0.891
Total Tokens: 889149	Cost: 0.889
Total Tokens: 892516	Cost: 0.893
Total Tokens: 887084	Cost: 0.887
Total Tokens: 892144	Cost: 0.892
Total Tokens: 887583	Cost: 0.888
Total Tokens: 876077	Cost: 0.876
Total Tokens: 126477	Cost: 0.126


In [55]:
# openai.Embedding.create(text_chunks[:2], engine=model)

In [106]:
# Only confirm that a key is configured. Echoing `openai.api_key` writes the
# secret into the saved notebook output; the key that was exposed here must be rotated.
bool(openai.api_key)

True

In [56]:
# %%time
# embeddings = []
# for num in range(0,43000,6000):
#     chunks = text_chunks[num:num+6000]
#     results = openai.Embedding.create(input=chunks, engine=model)
#     embeddings.append(results)
#     time.sleep(60)

In [9]:
# Sample prose summary of the Ray Dalio episode, stored as a triple-quoted string
# (the literal starts and ends with a newline).
episode = '''
In the Impact Theory episode featuring Ray Dalio, a renowned billionaire investor and hedge fund manager, a wide range of topics is explored, with a central focus on the global economic landscape and the challenges it currently faces. The conversation between Tom Bilyeu and Ray Dalio delves into various aspects of economics, geopolitics, and the dynamics that shape our world today.

One of the primary concerns highlighted in the discussion is the vulnerability of the U.S. dollar. Ray Dalio expresses his unease regarding the ongoing international efforts by the BRICS nations (Brazil, Russia, India, China, and South Africa) to reduce their reliance on the dollar as the world's primary reserve currency. He emphasizes that this shift is not an outright attack on the dollar but rather a reflection of the changing economic landscape.

Dalio draws parallels between historical events where other reserve currencies, such as the British pound and the Dutch guilder, lost their dominance due to a combination of factors, including holding excessive amounts of the currency and concerns over potential sanctions. He explains that countries are now looking for alternatives to transact in, driven by a desire to avoid the risks associated with holding large amounts of dollar-denominated debt and the potential for sanctions.

The discussion also touches on the importance of financial responsibility for countries, suggesting that being financially strong is crucial in this evolving global environment. The idea of externalizing inflation through currency devaluation is explored, along with the complex interplay between interest rates, money printing, and fiscal responsibility.

Throughout the conversation, Ray Dalio emphasizes that understanding the historical context and the cyclical nature of economic events is crucial. He advocates for principles like strong family structures, quality education, and equal opportunities as foundational elements for a thriving middle class and a prosperous society. However, he acknowledges that achieving these principles in today's complex world is challenging, particularly when it comes to addressing issues like education inequality and poverty.

In summary, the Impact Theory episode featuring Ray Dalio provides valuable insights into the current state of the global economy and the challenges it faces. Dalio's expertise and historical perspective shed light on the potential vulnerabilities of the U.S. dollar and the importance of financial responsibility for nations. The conversation serves as a reminder of the complex interplay of economic forces, geopolitics, and social factors that shape our world, and it underscores the significance of addressing these challenges to ensure a prosperous future.
'''

In [10]:
# Whitespace-delimited word count of the sample summary.
len(episode.split())

406