In [9]:
# Reload edited modules automatically before each cell runs, so changes to the
# project-local .py files below are picked up without restarting the kernel.
%load_ext autoreload 
%autoreload 2

# Project-local modules.
# NOTE(review): `Vectorizor` is imported but not used in the visible cells.
from preprocessing import FileIO, Vectorizor
from opensearch_interface import OpenSearchClient
from index_templates import youtube_body

import os
# NOTE(review): `time` is only referenced by commented-out code in the visible cells.
import time
# Shadows the builtin print with rich's print for every later cell.
from rich import print
from dotenv import load_dotenv
# Read ./.env into the process environment; override=True lets values from the file
# replace variables that are already set. `load_env` keeps load_dotenv's return value.
# NOTE(review): this runs after the project imports above, so any environment lookups
# those modules perform at import time would not see values from .env — confirm.
load_env=load_dotenv('./.env', override=True)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Instantiate OpenSearch Client

In [10]:
# Model identifier handed to the client as `model_name_or_path`.
model_path = 'sentence-transformers/all-MiniLM-L6-v2'
# Read from the ST_MODEL_PATH environment variable; raises KeyError if it is unset.
# NOTE(review): presumably a local copy of the same model, but it is not used in the
# visible cells — the client below receives `model_path`. Confirm which is intended.
model_path_on_disk = os.environ['ST_MODEL_PATH']
# Client object used by every later cell for indexing and searching.
osclient = OpenSearchClient(model_name_or_path=model_path)

### Display Existing Indices

In [11]:
# List the indices that already exist (health, status, doc counts, store size)
# before creating a new one.
osclient.show_indexes()

health status index                              uuid                   pri rep docs.count docs.deleted store.size pri.store.size
yellow open   kw-impact-theory                   2MjMun4bQYOoeUpv5UsJxg   3   1      33164            0     29.4mb         29.4mb
yellow open   semantic-impact-theory-196         SY2nXyvmQ9i5LAS4hmn82g   3   1      37007            0    694.6mb        694.6mb
yellow open   kw-impact-theory-196               vsuHausxRb6EjysQriOX5w   3   1      37007            0     30.5mb         30.5mb
yellow open   security-auditlog-2023.10.21       Vj43Da3dTQm0mwBFNWHjCg   1   1          9            0    151.3kb        151.3kb
yellow open   security-auditlog-2023.10.22       YXYp6DkYT-aLgGxRZNGsUA   1   1       1704            0      1.6mb          1.6mb
yellow open   paul-graham3                       -74ZPvxoSMmtCPSzAI9o1A   1   1         18            0    768.2kb        768.2kb
yellow open   security-auditlog-2023.10.25       1Cn9t6VhT227XHl2KJJ-WQ   1   1        852

### Load saved data from previous Ingest/Split/Vectorize process

In [16]:
# Parquet file holding the saved output of the earlier ingest/split/vectorize run.
# The path is relative to the notebook's working directory.
data_path = './practice_data/impact_theory_minilm_196.parquet'

In [17]:
# Load the saved records; the loader reports the data's shape and memory usage.
# Later cells treat `data` as a sequence of dicts (data[0].keys(), d['doc_id']).
data = FileIO().load_parquet(data_path)

Shape of data: (37007, 16)
Memory Usage: 4.27+ MB


### Preview some data stats

In [18]:
# Use the first record as a sample of the schema: iterating a dict yields its keys,
# so sorting the record directly gives the alphabetized field names.
data_keys = sorted(data[0])
# Length of the stored embedding vector for that same record.
emb_dim = len(data[0]['content_embedding'])
# Horizontal rule separating the two parts of the printed summary.
line_break = '-' * 100
print(f'Data Keys: {data_keys}\n{line_break}\nEmbedding Dimension: {emb_dim}')

In [20]:
# Display every record whose doc_id matches the chunk used as the search target below.
list(filter(lambda record: record['doc_id'] == 'mJNM7iLAibU_4', data))

[{'author': 'Tom Bilyeu',
  'title': 'A Future Without Death | Bob Hariri on Impact Theory',
  'video_id': 'mJNM7iLAibU',
  'playlist_id': 'PL8qcvQ7Byc3OJ02hbWJbHWePh4XEg3cvo',
  'channel_id': 'UCnYMOamNKLGVlJgRUbamveA',
  'description': 'No description provided',
  'keywords': array([], dtype=object),
  'length': 2247,
  'publish_date': '02-27-2018',
  'thumbnail_url': 'https://i.ytimg.com/vi/mJNM7iLAibU/hq720.jpg',
  'views': 30633,
  'age_restricted': False,
  'episode_num': 61,
  'content': "It's true. Which is pretty crazy, so I'm really amped up to get you on and give you a chance to talk about some of the things that I know you've got going on in your life. And I think the most fun place to start with you is you've been absurdly successful by every worldly measure, but you're still super driven. What drives you? Like why the new company, Cellularity, why are you doing that? It's gonna take an incredible amount of time, energy, and effort. So what is the goal with the company and

### Data Indexing

In [4]:
# Name of the index that the indexing cell creates and the search cells query.
index_name = 'impact-theory-minilm-196'

In [5]:
# Review the index body (settings + field mappings) that is passed as `body_template`
# when the index is created.
youtube_body

{'settings': {'number_of_shards': 3,
  'refresh_interval': '30s',
  'index': {'knn': False}},
 'mappings': {'properties': {'title': {'type': 'text', 'index': 'true'},
   'unique_id': {'type': 'keyword', 'index': 'false'},
   'group_id': {'type': 'short', 'index': 'false'},
   'video_id': {'type': 'keyword', 'index': 'false'},
   'playlist_id': {'type': 'keyword', 'index': 'false'},
   'episode_url': {'type': 'keyword', 'index': 'false'},
   'episode_num': {'type': 'short', 'index': 'false'},
   'description': {'type': 'text', 'index': 'true'},
   'length': {'type': 'long', 'index': 'false'},
   'publish_date': {'type': 'keyword', 'index': 'false'},
   'views': {'type': 'long', 'index': 'false'},
   'thumbnail_url': {'type': 'keyword', 'index': 'false'},
   'content': {'type': 'text', 'index': 'true'},
   'doc_id': {'type': 'keyword', 'index': 'false'}}}}

In [21]:
%%time
# (%%time is a cell magic and must remain the first line of this cell.)
# Index every record in `data` under `index_name`, using `youtube_body` as the index body.
# chunk_size=1500 is presumably the number of documents sent per request — confirm in
# opensearch_interface.
# NOTE(review): `youtube_body` as displayed has 'knn': False and no embedding field, yet
# semantic_index=True is passed — presumably document_indexer adjusts the body; confirm.
osclient.document_indexer(index_name=index_name, data=data, chunk_size=1500, body_template=youtube_body, semantic_index=True)
# NOTE(review): a 30-second sleep was previously sketched here, presumably to wait out the
# index's '30s' refresh_interval; an explicit index refresh is the deterministic alternative.

[32m2023-10-21 21:47:45.022[0m | [1mINFO    [0m | [36mopensearch_interface[0m:[36mdocument_indexer[0m:[36m212[0m - [1mThe ** impact-theory-minilm-196 ** index was created[0m
[32m2023-10-21 21:47:45.023[0m | [1mINFO    [0m | [36mopensearch_interface[0m:[36mdocument_indexer[0m:[36m219[0m - [1mThe # of documents to be indexed = 37007[0m
 93%|██████████████████████████████████████████████████████████████████████████████████▉      | 34501/37007 [00:12<00:00, 2657.84Docs Indexed/s][32m2023-10-21 21:47:57.918[0m | [1mINFO    [0m | [36mopensearch_interface[0m:[36mdocument_indexer[0m:[36m235[0m - [1mNumber of docs indexed: 37007[0m
100%|█████████████████████████████████████████████████████████████████████████████████████████| 37007/37007 [00:12<00:00, 2870.34Docs Indexed/s]

CPU times: user 12.5 s, sys: 753 ms, total: 13.3 s
Wall time: 13.2 s





In [25]:
# Explicitly refresh the index so the documents just indexed are visible to search
# without waiting for the index's refresh_interval.
# NOTE(review): a response reporting fewer successful shards than total is presumably due
# to unassigned replica shards (indices show as 'yellow'), not a failure — confirm.
osclient.indices.refresh(index_name)

{'_shards': {'total': 6, 'successful': 3, 'failed': 0}}

## Test Search on New Index

In [22]:
# Test query: a verbatim excerpt from the content of the 'mJNM7iLAibU_4' chunk previewed
# earlier, so that chunk is the expected top hit for the searches below.
query = "Like why the new company, Cellularity, why are you doing that? It's gonna take an incredible amount of time, energy, and effort."

In [24]:
# Fetch a single document from the new index by its `_id` to spot-check what was stored.
# NOTE(review): this `_id` looks auto-generated at index time, so it will presumably differ
# whenever the index is rebuilt and this cell will not reproduce on a fresh run — confirm.
osclient.get(index_name, id='3No1VIsB_9uU36NHh3vr')

{'_index': 'impact-theory-minilm-196',
 '_id': '3No1VIsB_9uU36NHh3vr',
 '_version': 1,
 '_seq_no': 10943,
 '_primary_term': 1,
 'found': True,
 '_source': {'author': 'Tom Bilyeu',
  'title': 'A Future Without Death | Bob Hariri on Impact Theory',
  'video_id': 'mJNM7iLAibU',
  'playlist_id': 'PL8qcvQ7Byc3OJ02hbWJbHWePh4XEg3cvo',
  'channel_id': 'UCnYMOamNKLGVlJgRUbamveA',
  'description': 'No description provided',
  'keywords': [],
  'length': 2247,
  'publish_date': '02-27-2018',
  'thumbnail_url': 'https://i.ytimg.com/vi/mJNM7iLAibU/hq720.jpg',
  'views': 30633,
  'age_restricted': False,
  'episode_num': 61,
  'content': "It's true. Which is pretty crazy, so I'm really amped up to get you on and give you a chance to talk about some of the things that I know you've got going on in your life. And I think the most fun place to start with you is you've been absurdly successful by every worldly measure, but you're still super driven. What drives you? Like why the new company, Cellularit

Bad pipe message: %s [b'wB\x8d\xbb\t\x98\xddFb\xdd\x94\x06\xeap\xe9\xc6\xfc\xcb J\x08\xcd\xcaH^\xa5\xdcR4@\x0f\xbe\xd7\xe6\x97\x03\xe5\x8dIxZS5F\x9cJ\x92\x01<M\x0e\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08\x04\x08\x05\x08\x06\x04\x01\x05\x01\x06\x01\x00+\x00\x03\x02\x03\x04\x00-\x00\x02\x01\x01\x003\x00&\x00$\x00\x1d\x00 \x90\xf7\xd6w\xa1\x8d\xb96%{\xd8\xcf \xce']
Bad pipe message: %s [b"r$\x11\xce\xde\x93\xafE!5L\xb1\x16iXT\xe0!\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x002\x00\x9d\

### Keyword Search

In [23]:
# Run the test query through the client's keyword search on the new index.
# size=5 presumably caps the number of hits returned — confirm in opensearch_interface.
kw_response = osclient.keyword_search(query=query, index=index_name, size=5)
# `print` here is rich's print (imported in the setup cell), which pretty-prints the response.
print(kw_response)

### Vector Search

In [49]:
# Run the same test query through the client's vector search on the new index.
# size=5 presumably caps the number of hits returned — confirm in opensearch_interface.
vec_response = osclient.vector_search(query, index_name, size=5)
# `print` here is rich's print (imported in the setup cell), which pretty-prints the response.
print(vec_response)

### Hybrid Search

In [52]:
# Run the same test query through the client's hybrid search.
# NOTE(review): `index_name` is passed twice — presumably once as the keyword index and once
# as the vector index, with this single index serving both roles; confirm the parameter
# order in opensearch_interface. kw_size / vec_size presumably set hits taken from each side.
hyb_response = osclient.hybrid_search(query, index_name, index_name, kw_size=3, vec_size=3)
# `print` here is rich's print (imported in the setup cell), which pretty-prints the response.
print(hyb_response)