In [1]:
%load_ext autoreload 
%autoreload 2

from preprocessing import FileIO, Vectorizor
from opensearch_interface import OpenSearchClient
from index_templates import youtube_body

import os
import time
from rich import print
from dotenv import load_dotenv
load_env=load_dotenv('./.env', override=True)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Instantiate OpenSearch Client

In [2]:
model_path = 'sentence-transformers/all-MiniLM-L6-v2'
model_path_on_disk = os.environ['ST_MODEL_PATH']

In [3]:
os_endpoint = os.environ['OPENSEARCH_ENDPOINT']
os_username = os.environ['OPENSEARCH_USERNAME']
os_password = os.environ['OPENSEARCH_PASSWORD']

In [4]:
osclient = OpenSearchClient(hosts=os_endpoint,http_auth=(os_username, os_password))

### Display Existing Indices

In [13]:
# osclient.indices.delete(index_name)
osclient.show_indexes()

health status index                     uuid                   pri rep docs.count docs.deleted store.size pri.store.size
green  open   .opensearch-observability SszFhNsdQn6XzZRbNH_maQ   1   0          0            0       208b           208b
yellow open   impact-theory-minilm-196  lsZrOS9WTCihEe1RUWoQeA   3   1      37007            0    369.1mb        369.1mb
green  open   .opendistro_security      cDQQKOHYRYu-Wrev6dWoAw   1   0         10            0     52.7kb         52.7kb
green  open   .kibana_1                 EMgTmu59TJSqO6po7c6RNg   1   0          1            0      5.1kb          5.1kb



### Load saved data from previous Ingest/Split/Vectorize process

In [6]:
data_path = './practice_data/impact_theory_minilm_196.parquet'

In [7]:
data = FileIO().load_parquet(data_path)

Shape of data: (37007, 17)
Memory Usage: 4.55+ MB


### Preview some data stats

In [8]:
data_keys = sorted(list(data[0].keys()))
emb_dim = len(data[0]['content_embedding'])
line_break = '-'*100
print(f'Data Keys: {data_keys}\n{line_break}\nEmbedding Dimension: {emb_dim}')

In [9]:
# [d for d in data if d['doc_id'] == 'mJNM7iLAibU_4']

### Data Indexing

In [10]:
index_name = 'impact-theory-minilm-196'

In [11]:
#Review Indexing Body
youtube_body

{'settings': {'number_of_shards': 3,
  'refresh_interval': '30s',
  'index': {'knn': False}},
 'mappings': {'properties': {'title': {'type': 'text', 'index': 'true'},
   'unique_id': {'type': 'keyword', 'index': 'false'},
   'group_id': {'type': 'short', 'index': 'false'},
   'video_id': {'type': 'keyword', 'index': 'false'},
   'playlist_id': {'type': 'keyword', 'index': 'false'},
   'episode_url': {'type': 'keyword', 'index': 'false'},
   'episode_num': {'type': 'short', 'index': 'false'},
   'description': {'type': 'text', 'index': 'true'},
   'length': {'type': 'long', 'index': 'false'},
   'publish_date': {'type': 'keyword', 'index': 'false'},
   'views': {'type': 'long', 'index': 'false'},
   'thumbnail_url': {'type': 'keyword', 'index': 'false'},
   'content': {'type': 'text', 'index': 'true'},
   'doc_id': {'type': 'keyword', 'index': 'false'}}}}

In [12]:
%%time
osclient.document_indexer(index_name=index_name, data=data, chunk_size=5, body_template=youtube_body, semantic_index=True)
time.sleep(30)

[32m2023-11-02 21:24:05.235[0m | [1mINFO    [0m | [36mopensearch_interface[0m:[36mdocument_indexer[0m:[36m212[0m - [1mThe ** impact-theory-minilm-196 ** index was created[0m
[32m2023-11-02 21:24:05.237[0m | [1mINFO    [0m | [36mopensearch_interface[0m:[36mdocument_indexer[0m:[36m219[0m - [1mThe # of documents to be indexed = 37007[0m
 94%|███████████████████████████████████████████████████████████████████▍    | 34691/37007 [02:41<09:31,  4.05Docs Indexed/s][32m2023-11-02 21:26:46.717[0m | [1mINFO    [0m | [36mopensearch_interface[0m:[36mdocument_indexer[0m:[36m236[0m - [1mNumber of docs indexed: 37007[0m
100%|███████████████████████████████████████████████████████████████████████| 37007/37007 [02:41<00:00, 229.18Docs Indexed/s]


CPU times: user 17.6 s, sys: 1.44 s, total: 19.1 s
Wall time: 3min 12s


In [62]:
osclient.indices.refresh(index_name)

{'_shards': {'total': 6, 'successful': 3, 'failed': 0}}

In [57]:
osclient.indices.delete(index_name)

{'acknowledged': True}

## Test Search on New Index

In [14]:
query = "Like why the new company, Cellularity, why are you doing that? It's gonna take an incredible amount of time, energy, and effort."

### Keyword Search

In [17]:
kw_response = osclient.keyword_search(query=query, index=index_name, size=5)
print(kw_response)

### Vector Search

In [19]:
vec_response = osclient.vector_search(query, index_name, size=5)
print(vec_response)

### Hybrid Search

In [20]:
hyb_response = osclient.hybrid_search(query, index_name, index_name, kw_size=3, vec_size=3)
print(hyb_response)