In [62]:
%load_ext autoreload 
%autoreload 2

from preprocessing import FileIO, Vectorizor
from opensearch_interface import OpenSearchClient
from index_templates import youtube_body
from opensearchpy.helpers import parallel_bulk

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [63]:
# Directory of the locally stored gte-base model. The path is absolute and
# machine-specific, so adjust it when running the notebook elsewhere.
model_path = '/home/elastic/notebooks/vector_search_applications/models/gte-base/' 
# Project wrapper from opensearch_interface; every cluster call below goes through it.
osclient = OpenSearchClient(model_name_or_path=model_path)

In [64]:
# List the indexes already on the cluster before anything new is created.
osclient.show_indexes()

health status index                              uuid                   pri rep docs.count docs.deleted store.size pri.store.size
yellow open   semantic-538-testrun               DjBPg6CdQwKbOGhJrI4YIQ   3   1        284            0      2.9mb          2.9mb
yellow open   kw-impact-theory                   2MjMun4bQYOoeUpv5UsJxg   3   1      33164            0     29.4mb         29.4mb
yellow open   semantic-impact-theory-196         SY2nXyvmQ9i5LAS4hmn82g   3   1      37007            0    694.6mb        694.6mb
yellow open   kw-impact-theory-196               vsuHausxRb6EjysQriOX5w   3   1      37007            0     30.5mb         30.5mb
yellow open   security-auditlog-2023.10.11       Q2X02yykRwWSdsI1O_8dSw   1   1         19            0    177.9kb        177.9kb
yellow open   kw-full                            uNhdaqbnRVuyJci_L1Om8Q   3   1       6678            0     12.1mb         12.1mb
yellow open   climate-report                     bqGtrEySTQ-Wc6aCMTe_sw   1   1       1728

In [65]:
# Parquet file holding the records to index (absolute, machine-specific path).
# NOTE(review): the "gte_128" suffix presumably means gte embeddings over
# 128-token chunks -- confirm against the preprocessing step.
data_path = '/home/elastic/notebooks/vector_search_applications/data/impact_theory_gte_128.parquet'

In [66]:
# Load the parquet file. The shape and memory usage are printed on load, and the
# result is accessed as a sequence of dict records in the next cell.
data = FileIO().load_parquet(data_path)

Shape of data: (60380, 16)
Memory Usage: 6.97+ MB


In [67]:
# Field names of the first record, sorted alphabetically, as a quick schema check.
sorted(data[0].keys())

['age_restricted',
 'author',
 'channel_id',
 'content',
 'content_embedding',
 'description',
 'doc_id',
 'episode_num',
 'keywords',
 'length',
 'playlist_id',
 'publish_date',
 'thumbnail_url',
 'title',
 'video_id',
 'views']

In [69]:
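# Disabled debugging lookup of a single record by doc_id; `gold` must be defined before re-enabling.
# [d for d in data if d['doc_id'] == gold]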

### Keyword (KW) Indexing

In [70]:
# Name of the keyword-search index created in the next cell.
index_name = "kw-impact-theory-128"
# Display the index template (settings and field mappings) passed to the indexer.
youtube_body

{'settings': {'number_of_shards': 3,
  'refresh_interval': '30s',
  'index': {'knn': True}},
 'mappings': {'properties': {'title': {'type': 'text', 'index': 'true'},
   'unique_id': {'type': 'keyword', 'index': 'false'},
   'group_id': {'type': 'short', 'index': 'false'},
   'video_id': {'type': 'keyword', 'index': 'false'},
   'playlist_id': {'type': 'keyword', 'index': 'false'},
   'episode_url': {'type': 'keyword', 'index': 'false'},
   'episode_num': {'type': 'short', 'index': 'false'},
   'description': {'type': 'text', 'index': 'true'},
   'length': {'type': 'long', 'index': 'false'},
   'publish_date': {'type': 'keyword', 'index': 'false'},
   'views': {'type': 'long', 'index': 'false'},
   'thumbnail_url': {'type': 'keyword', 'index': 'false'},
   'content': {'type': 'text', 'index': 'true'},
   'doc_id': {'type': 'keyword', 'index': 'false'},
   'content_embedding': {'type': 'knn_vector', 'dimension': 768}}}}

In [71]:
# Create the keyword index from the youtube_body template and index every record
# in `data`; the log output reports the index creation and the document count.
osclient.document_indexer(index_name=index_name, data=data, body_template=youtube_body)

[32m2023-10-12 20:20:31.001[0m | [1mINFO    [0m | [36mopensearch_interface[0m:[36mdocument_indexer[0m:[36m212[0m - [1mThe ** kw-impact-theory-128 ** index was created[0m
[32m2023-10-12 20:20:31.002[0m | [1mINFO    [0m | [36mopensearch_interface[0m:[36mdocument_indexer[0m:[36m219[0m - [1mThe # of documents to be indexed = 60380[0m
 87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                 | 52762/60380 [00:04<00:00, 16570.61Docs Indexed/s][32m2023-10-12 20:20:35.181[0m | [1mINFO    [0m | [36mopensearch_interface[0m:[36mdocument_indexer[0m:[36m235[0m - [1mNumber of docs indexed: 60380[0m
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 60380/60380 [00:04<00:00, 14451.32Docs Indexed/s]


In [44]:
# Remove the security audit-log indexes. The wildcard targets every index whose
# name starts with "security-audit", so double-check the pattern before running.
osclient.indices.delete(index="security-audit*")

{'acknowledged': True}

In [60]:
# The template sets refresh_interval to 30s, so force a refresh to make the newly
# indexed documents visible to searches right away.
osclient.indices.refresh(index=index_name)

{'_shards': {'total': 6, 'successful': 3, 'failed': 0}}

In [80]:
# List the indexes again to check the state of the cluster after indexing.
osclient.show_indexes()

health status index                              uuid                   pri rep docs.count docs.deleted store.size pri.store.size
yellow open   semantic-538-testrun               DjBPg6CdQwKbOGhJrI4YIQ   3   1        284            0      2.9mb          2.9mb
yellow open   kw-impact-theory                   2MjMun4bQYOoeUpv5UsJxg   3   1      33164            0     29.4mb         29.4mb
yellow open   semantic-impact-theory-196         SY2nXyvmQ9i5LAS4hmn82g   3   1      37007            0    694.6mb        694.6mb
yellow open   kw-impact-theory-196               vsuHausxRb6EjysQriOX5w   3   1      37007            0     30.5mb         30.5mb
yellow open   security-auditlog-2023.10.11       Q2X02yykRwWSdsI1O_8dSw   1   1         19            0    177.9kb        177.9kb
yellow open   kw-full                            uNhdaqbnRVuyJci_L1Om8Q   3   1       6678            0     12.1mb         12.1mb
yellow open   climate-report                     bqGtrEySTQ-Wc6aCMTe_sw   1   1       1728

In [19]:
def keyword_search(query: str, index: str, size: int=10, return_raw: bool=False, exclude_phrase: str="Vishal"):
    """
    Run a keyword (match) query against the `content` field of an index.

    Args:
        query: Free-text query matched against the `content` field.
        index: Name of the index to search.
        size: Maximum number of hits to return.
        return_raw: If True, return the full search response; otherwise return
            only the list stored under response['hits']['hits'].
        exclude_phrase: Documents whose `content` matches this phrase
            (match_phrase) are filtered out. Defaults to "Vishal", the value
            that used to be hard-coded. Pass '' or None to search without the
            exclusion filter.

    Returns:
        The response of `osclient.search` when return_raw is True, else the
        list of hits taken from response['hits']['hits'].
    """
    # "must" scores documents on the query text; the optional "filter" clause
    # only removes documents and does not contribute to the score.
    bool_clause = {"must": {"match": {"content": query}}}
    if exclude_phrase:
        bool_clause["filter"] = {
            "bool": {"must_not": {"match_phrase": {"content": exclude_phrase}}}
        }

    body = {
        "_source": ['content','group_id','show_link','video_id','length','publish_date','thumbnail_url','title','views'],
        "size": size,
        "query": {"bool": bool_clause},
    }
    # Relies on the notebook-level `osclient` created in an earlier cell.
    response = osclient.search(body=body, index=index)
    if return_raw:
        return response
    return response['hits']['hits']

In [21]:
# Sample free-text query for trying out the keyword search.
query = "Does trump have support in iowa"

In [23]:
# osclient.keyword_search(query, index=index_name)

In [29]:
# Inspect the field mappings the cluster holds for the index.
# NOTE(review): the saved output is keyed by 'kw-impact-theory', not by the
# current index_name value ('kw-impact-theory-128'), so it appears to come from
# an earlier run -- re-run this cell to refresh it.
osclient.indices.get_mapping(index=index_name)

{'kw-impact-theory': {'mappings': {'properties': {'age_restricted': {'type': 'boolean'},
    'author': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'channel_id': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'content': {'type': 'text'},
    'description': {'type': 'text'},
    'episode_num': {'type': 'short', 'index': False},
    'episode_url': {'type': 'keyword', 'index': False},
    'group_id': {'type': 'short', 'index': False},
    'length': {'type': 'long', 'index': False},
    'playlist_id': {'type': 'keyword', 'index': False},
    'publish_date': {'type': 'keyword', 'index': False},
    'thumbnail_url': {'type': 'keyword', 'index': False},
    'title': {'type': 'text'},
    'unique_id': {'type': 'keyword', 'index': False},
    'video_id': {'type': 'keyword', 'index': False},
    'views': {'type': 'long', 'index': False}}}}}

### Semantic Indexing

In [74]:
# Name of the semantic-search index built from the same records.
sem_index = 'semantic-impact-theory-128'

In [20]:
# Drop any previous copy of the semantic index so the indexing cell can recreate it.
# ignore_unavailable=True keeps this cell from raising when the index does not
# exist yet (fresh cluster, or a Restart & Run All after it was already deleted).
osclient.indices.delete(index=sem_index, ignore_unavailable=True)

{'acknowledged': True}

In [75]:
%%time
# Index the same records into the semantic index, reusing the youtube_body template
# but passing semantic_index=True and chunk_size=1500.
# NOTE(review): semantic_index presumably controls whether the content_embedding
# vectors are indexed -- confirm in opensearch_interface.document_indexer.
osclient.document_indexer(index_name=sem_index, data=data, chunk_size=1500, body_template=youtube_body, semantic_index=True)

[32m2023-10-12 20:21:23.076[0m | [1mINFO    [0m | [36mopensearch_interface[0m:[36mdocument_indexer[0m:[36m212[0m - [1mThe ** semantic-impact-theory-128 ** index was created[0m
[32m2023-10-12 20:21:23.077[0m | [1mINFO    [0m | [36mopensearch_interface[0m:[36mdocument_indexer[0m:[36m219[0m - [1mThe # of documents to be indexed = 60380[0m
 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████        | 57001/60380 [00:39<00:03, 1005.59Docs Indexed/s][32m2023-10-12 20:22:02.597[0m | [1mINFO    [0m | [36mopensearch_interface[0m:[36mdocument_indexer[0m:[36m235[0m - [1mNumber of docs indexed: 60380[0m
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 60380/60380 [00:39<00:00, 1527.87Docs Indexed/s]

CPU times: user 37.9 s, sys: 1.47 s, total: 39.4 s
Wall time: 39.9 s





In [41]:
# Force a refresh so the semantic index's documents are visible to searches immediately.
osclient.indices.refresh(index=sem_index)

{'_shards': {'total': 6, 'successful': 3, 'failed': 0}}