<a href="https://colab.research.google.com/github/americanthinker/vector_search_applications_PUBLIC/blob/master/1_Preprocess_Data_Week1_COLAB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install libraries for this notebook

In [1]:
!pip install tqdm --quiet
!pip install pandas --quiet
!pip install sentence-transformers==2.2.2 --quiet
!pip install llama-index==0.8.41 --quiet
!pip install loguru==0.7.0 --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m62.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m72.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━

## Download preprocessing file (helper code) from public repo

In [2]:
!curl -o preprocessing.py https://raw.githubusercontent.com/americanthinker/vector_search_applications_PUBLIC/master/preprocessing.py

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 11558  100 11558    0     0  29316      0 --:--:-- --:--:-- --:--:-- 29260


## Download data from s3 bucket
Total download size should be about 25 MB

In [5]:
!curl -o impact_theory_data.json https://vector-search-applications-data.s3.us-west-2.amazonaws.com/impact_theory_data.json

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 25.3M  100 25.3M    0     0  43.8M      0 --:--:-- --:--:-- --:--:-- 43.9M


In [6]:
import json
import os

from tqdm import tqdm
from collections import defaultdict
from typing import List, Dict, Tuple
import pandas as pd
import numpy as np

from preprocessing import FileIO, Vectorizor, Splitters
from sentence_transformers import SentenceTransformer
from llama_index.text_splitter import SentenceSplitter
from concurrent.futures import ProcessPoolExecutor, as_completed

## Step 1 -->  Import Podcast Transcripts

In [7]:
data_path = '/content/impact_theory_data.json'

In [8]:
#should see 385 unique podcast entries
with open(data_path) as f:
    data =  json.load(f)
len(data)

385

In [9]:
#let's get some rough statistics on the content lengths of each podcast
lens = [len(d['content'].split()) for d in data]
df = pd.DataFrame(lens, columns=['Lengths'])
df.describe()

Unnamed: 0,Lengths
count,385.0
mean,12884.654545
std,7741.43973
min,1819.0
25%,7891.0
50%,9899.0
75%,16860.0
max,48502.0


In [10]:
#peek at what a data entry looks like (use shortest transcript)
data[np.argmin(df)]

{'author': 'Tom Bilyeu',
 'title': '"Real Life SEX ROBOTS Are Coming..." - The Dangers Of Seductive AI | Mo Gawdat',
 'video_id': 'IK9lN__kBXs',
 'playlist_id': 'PL8qcvQ7Byc3OJ02hbWJbHWePh4XEg3cvo',
 'channel_id': 'UCnYMOamNKLGVlJgRUbamveA',
 'description': 'No description provided',
 'keywords': [],
 'length': 684,
 'publish_date': '07-08-2023',
 'thumbnail_url': 'https://i.ytimg.com/vi/IK9lN__kBXs/hq720.jpg',
 'views': 61134,
 'age_restricted': False,
 'content': "what are the near-term disruptions? The one that freaks me out, and every time I talk to a parent with a teenage boy, I'm like, your kid is like, sex robots are really gonna be a thing for them, like for real, for real. I worry if I grew up five years from now, I would not graduate from high school. I would just find a sex robot and go into oblivion. What are one, what do you think is the reality of that one in particular? And then I'd love to branch out to some others. So the word robot is interesting, but sex alternatives

## Step 2 -->  Split Text into Sentences - LlamaIndex

In [None]:
#TODO: Dig into why text_splitter is using NLTK tokenizer under the hood.

In [11]:
#discussion on chunk size
chunk_size = 196
splitter = Splitters()
text_splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=20)

[nltk_data] Downloading package punkt to /tmp/llama_index...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [12]:
split_dict = splitter.split_corpus(data[:50], text_splitter)

Docs: 100%|██████████| 50/50 [00:08<00:00,  6.02it/s]


In [13]:
lens = [len(split_dict[key]) for key in split_dict]
sum(lens)

9055

## Step 3 -->  Encode Chunks as Vectors (Transfer to Google Colab)

### 3a.) SentenceTransformers

In [None]:
# sbert = SentenceTransformer('all-MiniLM-L6-v2'). ##  35 seconds to encode all ImpactTheory
# model = SentenceTransformer(model_path, device='cuda:0') ## 136 seconds to encode all ImpactTheory
model = SentenceTransformer('thenlper/gte-base')

In [14]:
model_path = 'thenlper/gte-base'
vectorizer = Vectorizor(model_name_or_path=model_path)

Downloading (…)a8668/.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)10cbba8668/README.md:   0%|          | 0.00/68.1k [00:00<?, ?B/s]

Downloading (…)cbba8668/config.json:   0%|          | 0.00/618 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/219M [00:00<?, ?B/s]

Downloading (…)668/onnx/config.json:   0%|          | 0.00/630 [00:00<?, ?B/s]

Downloading model.onnx:   0%|          | 0.00/436M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)/onnx/tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading (…)a8668/onnx/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/219M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)a8668/tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading (…)10cbba8668/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bba8668/modules.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

In [None]:
# CPU demonstration
%%time
vectors = []
from tqdm import tqdm
for sent in tqdm(split_dict[0]):
    vectors.append(model.encode(sent, device='cpu'))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 280/280 [00:52<00:00,  5.30it/s]


In [16]:
%%time
## GPU demonstration
merged_dict = vectorizer.encode_from_dict(split_dict, device='cuda:0')

Docs: 100%|██████████| 50/50 [01:43<00:00,  2.06s/it]

CPU times: user 1min 38s, sys: 1.17 s, total: 1min 39s
Wall time: 1min 43s





In [None]:
# np.save('./gte_vectors.npy', gte_vectors, allow_pickle=False)

In [17]:
docs = vectorizer.join_metadata(corpus=data, merged_dict=merged_dict, create_doc_id=True)
len(docs)

9055

In [18]:
test = [d for d in docs if d['video_id'] == 'mrND5lSPEQU']
len(test)

280

In [None]:
io = FileIO()
io.save_as_parquet(file_path=f'/home/elastic/notebooks/vector_search_applications/data/impact_theory_gte_{chunk_size}.parquet', data=docs, overwrite=True)

[32m2023-10-12 20:19:15.464[0m | [1mINFO    [0m | [36mpreprocessing[0m:[36msave_as_parquet[0m:[36m34[0m - [1mDataFrame saved as parquet file here: /home/elastic/notebooks/vector_search_applications/data/impact_theory_gte_128.parquet[0m
Bad pipe message: %s [b"q(\xeaMV\xab\xc0\x03\xd3\xaf\x94<\xc1\xbe\xd8\x1a\x8az\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x002\x00\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc0\x9c\xc0P\x00=\x00<\x005"]
Bad pipe message: %s [b'B\xa4\x83\x950y9\xd2\xbf\x87\x1b\xfc\x83\x9e\xd1\x8bx\x86\x00\x00>\xc0\x14\xc0\n\x009\x008\x007\x006\xc0\x0f\xc0\x05\x005\xc0\x13\xc0\t\x003\x002\x001']
Bad pipe message: %s [b'\xac\x143\xff\\\xfe\xa7\x7f\xf5\xf3M\xabz\x11=\r\x1a\xb0\x00\x00\xa2\xc0\x14\xc0\n\x009\x008\x007\x006\x00\x88\x00\x87\

## Step - 3.1 --> OPTIONAL: OpenAI Ada Embeddings

In [None]:
openai.api_key = os.environ['OPENAI_API_KEY']
model = "text-embedding-ada-002"
from openai.embeddings_utils import get_embedding, cosine_similarity
#get cost first
tokenizer = Tokenizer(model_type="cl100k_base", price=0.001)

# cost = tokenizer.get_cost(text_chunks)

In [None]:
openai.api_key

'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'

#### Working around OpenAI rate limits

In [None]:
#split text_chunks into roughly 1 million tokens total per group
for num in range(0,43000,6000):
    chunks = text_chunks[num:num+6000]
    cost = tokenizer.get_cost(chunks)


Total Tokens: 890642	Cost: 0.891
Total Tokens: 889149	Cost: 0.889
Total Tokens: 892516	Cost: 0.893
Total Tokens: 887084	Cost: 0.887
Total Tokens: 892144	Cost: 0.892
Total Tokens: 887583	Cost: 0.888
Total Tokens: 876077	Cost: 0.876
Total Tokens: 126477	Cost: 0.126


In [None]:
# openai.Embedding.create(text_chunks[:2], engine=model)

In [None]:
openai.api_key

'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'

In [None]:
# %%time
# embeddings = []
# for num in range(0,43000,6000):
#     chunks = text_chunks[num:num+6000]
#     results = openai.Embedding.create(input=chunks, engine=model)
#     embeddings.append(results)
#     time.sleep(60)

In [None]:
vectors = results['data']
vectors = [vec['embedding'] for vec in vectors]
len(vectors)