In [74]:
%load_ext autoreload
%autoreload 2

import json
import pandas as pd
from listennotes import podcast_api
from dotenv import load_dotenv
load_dotenv('./.env', override=True)
import os
import numpy as np
from sentence_transformers import SentenceTransformer, util
import torch
from typing import List, Dict, Tuple, Union
from tqdm import tqdm
import time
from math import ceil
from concurrent.futures import ThreadPoolExecutor, as_completed, ProcessPoolExecutor
from preprocessing import Utilities
import openai
from llama_index.text_splitter import SentenceSplitter
from llama_hub.file.pymu_pdf.base import PyMuPDFReader
from llama_index.vector_stores import OpensearchVectorStore, OpensearchVectorClient
# from test_folder.opensearch_interface import OpenSearchClient
# from test_folder.reranker import ReRanker
from tiktoken_functions import Tokenizer
import tiktoken

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
# loader = PyMuPDFReader()
# docs = loader.load('./data/llama2.pdf')
# len(docs)

### 1.) Import Podcast Transcript

In [2]:
podcast_folder = '/home/elastic/notebooks/podcast_transcripts/'
podcasts = [os.path.join(podcast_folder, file) for file in os.listdir(podcast_folder) if file.endswith('.txt')]
len(podcasts)

100

In [3]:
def load_podcasts(files: List[str]) -> List[str]:
    transcripts = []
    for file in files:
        with open(file) as f:
            text = f.read().strip()
            transcripts.append(text)
    return transcripts

In [3]:
%%time
with open('/home/elastic/notebooks/vector_search_applications/data/impact_theory_metadata.json') as f:
    data =  json.load(f)

CPU times: user 31.9 ms, sys: 44.1 ms, total: 76 ms
Wall time: 75.4 ms


### 2a.) Split Text into Sentences - LlamaIndex

In [4]:
from llama_index.text_splitter import SentenceSplitter
#TODO: Dig into why text_splitter is using NLTK tokenizer under the hood. 

In [31]:
text_splitter = SentenceSplitter(chunk_size=172, chunk_overlap=20)

In [32]:
text_chunks = []
# maintain relationship with source doc index, to help inject doc metadata in (3)
doc_idxs = []
for doc_idx, doc in enumerate(data):
    cur_text_chunks = text_splitter.split_text(doc.get('content', ''))
    text_chunks.extend(cur_text_chunks)
    doc_idxs.extend([doc_idx] * len(cur_text_chunks))

In [49]:
len(doc_idxs)

42863

### 2b.) Split Text into Sentences - SentenceSplitter

In [21]:
%%time
utils = Utilities()
transcripts = [d.get('content', '') for d in data]
print(len(transcripts))
transcripts = [text for text in transcripts if text]
print(len(transcripts))
split_sentences = [utils.sentence_splitter(text) for text in transcripts]

387
385
CPU times: user 5.09 s, sys: 35.2 ms, total: 5.13 s
Wall time: 5.13 s


In [24]:
pd.DataFrame([len(corpus) for corpus in split_sentences]).describe()

Unnamed: 0,0
count,385.0
mean,857.015584
std,642.827887
min,1.0
25%,469.0
50%,668.0
75%,1056.0
max,5342.0


### 3. Group Sentences into chunks

In [26]:
def grouper(text_sentences: List[List[str]], sent_chunk_length: int=8) -> List[List[str]]:
    if isinstance(text_sentences[0], str):
        text_sentences = [text_sentences]
    chunks = []
    for corpus in text_sentences:
        for x in range(0,len(corpus),sent_chunk_length):
            achunk = []
            achunk.append(' '.join(corpus[x:x+sent_chunk_length]))
            chunks.append(achunk)
    return [string for alist in chunks for string in alist]
        

In [27]:
chunks = grouper(split_sentences)

In [12]:
df = pd.DataFrame(chunks, columns=['Chunks'])
# df.to_parquet('./test_chunks.parquet')
df.loc[0,'Chunks']

"Hello and welcome to The Intelligence from The Economist. I'm Aure Ogunbiyi. And I'm Jason Palmer. Every weekday we provide a fresh perspective on the events shaping your world. China was one of the last countries to abandon pandemic lockdowns, and investors and analysts alike were waiting anxiously for its recovery. The comeback came, but things aren't going quite the way that many expected. And what's the best possible fate for a potato? Our correspondent argues passionately that the answer is a crisp, a potato chip."

In [66]:
#create docs
docs = df.Chunks.values.tolist()
#create ids
ids = df.index.tolist()
dicts = [{'content': doc, 'name': 'Intelligent Economist'} for doc in docs]

### 4. Encode Chunks as Vectors

### 4a.) SentenceTransformers

In [62]:
sbert = SentenceTransformer('all-MiniLM-L6-v2'). ##  35 seconds to encode all ImpactTheory 
model_path = '/home/elastic/notebooks/vector_search_applications/models/gte-base/' 
model = SentenceTransformer(model_path, device='cuda:0') ## 136 seconds to encode all ImpactTheory

In [72]:
%%time
gte_vectors = model.encode(sentences=text_chunks, show_progress_bar=True, device='cuda:0')

Batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 1340/1340 [02:14<00:00,  9.94it/s]


CPU times: user 2min 31s, sys: 1.06 s, total: 2min 32s
Wall time: 2min 16s


In [70]:
sbert_vectors = vectors.copy()

In [107]:
np.save('./gte_vectors.npy', gte_vectors, allow_pickle=False)

In [110]:
doc_idxs?

[0;31mType:[0m        list
[0;31mString form:[0m [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  <...> 6, 386, 386, 386, 386, 386, 386, 386, 386, 386, 386, 386, 386, 386, 386, 386, 386, 386, 386, 386]
[0;31mLength:[0m      42863
[0;31mDocstring:[0m  
Built-in mutable sequence.

If no argument is given, the constructor creates a new empty list.
The argument must be an iterable if specified.

### 4b.) OpenAI Ada Embeddings

In [104]:
openai.api_key = os.environ['OPENAI_API_KEY']
model = "text-embedding-ada-002"
from openai.embeddings_utils import get_embedding, cosine_similarity
#get cost first
tokenizer = Tokenizer(model_type="cl100k_base", price=0.001)

# cost = tokenizer.get_cost(text_chunks)

In [103]:
openai.api_key

'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'

#### Working around OpenAI rate limits

In [93]:
#split text_chunks into roughly 1 million tokens total per group
for num in range(0,43000,6000):
    chunks = text_chunks[num:num+6000]
    cost = tokenizer.get_cost(chunks)
    

Total Tokens: 890642	Cost: 0.891
Total Tokens: 889149	Cost: 0.889
Total Tokens: 892516	Cost: 0.893
Total Tokens: 887084	Cost: 0.887
Total Tokens: 892144	Cost: 0.892
Total Tokens: 887583	Cost: 0.888
Total Tokens: 876077	Cost: 0.876
Total Tokens: 126477	Cost: 0.126


In [105]:
openai.Embedding.create(text_chunks[:2], engine=model)

AuthenticationError: Incorrect API key provided: ["I'm no***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************y."]. You can find your API key at https://platform.openai.com/account/api-keys.

In [106]:
openai.api_key

'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'

In [97]:
%%time
embeddings = []
for num in range(0,43000,6000):
    chunks = text_chunks[num:num+6000]
    results = openai.Embedding.create(input=chunks, engine=model)
    embeddings.append(results)
    time.sleep(60)

InvalidRequestError: '$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.

In [None]:
vectors = results['data']
vectors = [vec['embedding'] for vec in vectors]
len(vectors)

In [36]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

In [40]:
client = QdrantClient('localhost', port=6333)
collection = 'test_collection'

In [69]:
client.recreate_collection(collection_name=collection, vectors_config=VectorParams(size=1536, distance=Distance.COSINE))

True

In [46]:
# model = SentenceTransformer('models/my-128dim-model')

In [71]:
client.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='test_collection')])

In [70]:
client.upload_collection(collection_name=collection, vectors=vectors, payload=dicts, ids=ids, batch_size=256)

In [67]:
client.delete_collection(collection)

True

### Search (Qdrant)

In [102]:
retriever = Retriever(model_type=model)
reranker = ReRanker()

In [108]:
query = 'GOP thoughts on antitrust law'

In [109]:
%%time
results = retriever.search(query, collection=collection, limit=25, return_all=False)
sorted_scores = reranker.rerank(results, query)

CPU times: user 40.4 ms, sys: 7.15 ms, total: 47.6 ms
Wall time: 245 ms


In [112]:
import urllib.request
urllib.request.urlretrieve("https://artifacts.opensearch.org/models/ml-models/amazon/gpt/GPT2_xl_sqg/1.0.0/generate.py", "generate.py")

('generate.py', <http.client.HTTPMessage at 0x7fadcaceb1c0>)

In [113]:
ls

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
 [0m[01;34mIRIS[0m/                            [01;34mpodcast_summary_demo[0m/
 [01;34mPC_Metrics[0m/                      [01;34mpodcast_transcripts[0m/
 [01;34m__pycache__[0m/                     podcast_vectors.npy
 bert_similarity.ipynb            preprocess_helpers.py
 calculating_similarities.ipynb   [01;34mqdrant_storage[0m/
 chunks.parquet                  'query_data(4).csv'
 [01;34mdata[0m/                            ray_data_practice.ipynb
 [01;34mdatasets[0m/                        reranker.py
 embeddings_helper.py             retrieval.py
 generate.py                      [01;34mroberta-stsb-cross-encoder[0m/
 install_kernel.sh                [01;34mspace[0m/
 [01;34mmodel

Bad pipe message: %s [b'\xf0{u\xbaQ_y\x16)(']
Bad pipe message: %s [b'\x88\xd3\xc9E\xc9\xf6 M\x96\xac"2{\xb6\x81y\x81g\x02\x19\x03"\xf54\xcc\xa0q\xc7\xa85\x19\xd9A\x8a\xdf\x00\x07\x9f\xbc\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00']
Bad pipe message: %s [b"U7A\xa4\x1f\xdbB\xd6`W\xeb\xbb\xfe\xdd\x13\xeb\xfc\xc9\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x00", b'\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc0\x9c\xc0P\x00=\x00<\x005\x00/\x00\x9a\x00\x99\xc0\x07\xc0\x11\x00\x96\x00\x05\x00\xff\x01\x00\x00j\x00\x00\x00\x0e\x00\x0c\x00\x00']
Bad pipe message: %s [b"\x19\xd0y\xa2\x8f.\xf7\x82v\x07\