In [48]:
%load_ext autoreload 
%autoreload 2

from reranker import ReRanker
from preprocessing import Utilities
from opensearch_interface import OpenSearchClient
from sentence_transformers import SentenceTransformer
from index_templates import youtube_body
from typing import List, Union, Dict
from reranker import ReRanker
import json
import os

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
osclient = OpenSearchClient()

In [3]:
osclient.info()

{'name': '7d5740afb0b1',
 'cluster_name': 'docker-cluster',
 'cluster_uuid': 'u62AiiEFR4yWUStAhyFveg',
 'version': {'distribution': 'opensearch',
  'number': '2.9.0',
  'build_type': 'tar',
  'build_hash': '1164221ee2b8ba3560f0ff492309867beea28433',
  'build_date': '2023-07-18T21:23:29.367080729Z',
  'build_snapshot': False,
  'lucene_version': '9.7.0',
  'minimum_wire_compatibility_version': '7.10.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'The OpenSearch Project: https://opensearch.org/'}

In [4]:
osclient.show_indexes()

health status index                        uuid                   pri rep docs.count docs.deleted store.size pri.store.size
yellow open   kw-538-testrun               hZARxQqgStO4-dexNIsR8A   1   1        284            0    307.6kb        307.6kb
green  open   .opensearch-observability    nN299E0QS9OvsRh_UcbJVQ   1   0          0            0       208b           208b
yellow open   .plugins-ml-config           IEeXrm-DRiOMm2qzo7PbqA   1   1          1            0      3.9kb          3.9kb
yellow open   test-kw-index                6EF4Q2xDT9Gz1wua5a2IpQ   3   1        158            0      5.6mb          5.6mb
yellow open   kw-full                      uNhdaqbnRVuyJci_L1Om8Q   3   1       6678            0     12.1mb         12.1mb
yellow open   semantic-full                ydvbifM0Rcu0DBz-rjCHzQ   3   1       6678            0     74.2mb         74.2mb
yellow open   security-auditlog-2023.09.04 ApPSasK0S7CadlbWroBRYA   1   1        908            0      1.7mb          1.7mb
yellow o

In [5]:
data_path = '/home/elastic/notebooks/vector_search_applications/data/podcast_transcripts/fivethirtyeight/five_with_vectors.json'

In [6]:
def json_data_loader(file_path: str):
    """Read the JSON document stored at ``file_path`` and return the parsed object.

    Args:
        file_path: Path to a JSON file on disk.

    Returns:
        Whatever the JSON document decodes to (e.g. a list or a dict).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Decode explicitly as UTF-8 rather than relying on the platform's default
    # text encoding, which can mis-decode non-ASCII text on non-UTF-8 locales.
    with open(file_path, encoding="utf-8") as f:
        return json.load(f)

In [7]:
data = json_data_loader(data_path)
len(data)
# Move each record's embedding from the 'vector' key to 'content_embedding',
# the field name the knn query in this notebook searches against.
for d in data:
    d['content_embedding'] = d.pop('vector')

In [18]:
list(data[0].keys())

['content',
 'group_id',
 'show_link',
 'video_id',
 'playlist_id',
 'channel_id',
 'description',
 'keywords',
 'length',
 'publish_date',
 'thumbnail_url',
 'title',
 'views',
 'content_embedding']

### Keyword (KW) Indexing

In [8]:
index_name = "kw-538-testrun"
youtube_body

{'settings': {'number_of_shards': 3,
  'refresh_interval': '30s',
  'index': {'knn': False}},
 'mappings': {'properties': {'title': {'type': 'text', 'index': 'true'},
   'group_id': {'type': 'short', 'index': 'false'},
   'video_id': {'type': 'keyword', 'index': 'false'},
   'playlist_id': {'type': 'keyword', 'index': 'false'},
   'episode_url': {'type': 'keyword', 'index': 'false'},
   'description': {'type': 'text', 'index': 'true'},
   'length': {'type': 'long', 'index': 'false'},
   'publish_date': {'type': 'keyword', 'index': 'false'},
   'views': {'type': 'long', 'index': 'false'},
   'thumbnail_url': {'type': 'keyword', 'index': 'false'},
   'content': {'type': 'text', 'index': 'true'}}}}

In [10]:
osclient.document_indexer(index_name=index_name, data=data, body_template=youtube_body)

[32m2023-09-07 23:21:13.770[0m | [1mINFO    [0m | [36mopensearch_interface[0m:[36mdocument_indexer[0m:[36m206[0m - [1mThe ** kw-538-testrun ** index was created[0m
[32m2023-09-07 23:21:13.771[0m | [1mINFO    [0m | [36mopensearch_interface[0m:[36mdocument_indexer[0m:[36m217[0m - [1mThe # of documents to be indexed = 284[0m


In [9]:
# NOTE(review): this cell carries execution count 9 but sits *below* the
# indexing cell (In [10]). Run top-to-bottom, it deletes the `index_name`
# index that the previous cell just created and populated, so the refresh and
# count cells that follow would no longer see it. It was evidently run before
# indexing; move it above the indexing cell (or remove it) before a Run All.
osclient.indices.delete(index_name)

{'acknowledged': True}

In [11]:
osclient.indices.refresh(index=index_name)

{'_shards': {'total': 6, 'successful': 3, 'failed': 0}}

In [12]:
osclient.show_indexes()

health status index                        uuid                   pri rep docs.count docs.deleted store.size pri.store.size
yellow open   kw-538-testrun               Wam6NsdMR7K5lE8N8ZBTRQ   3   1        284            0    351.1kb        351.1kb
green  open   .opensearch-observability    nN299E0QS9OvsRh_UcbJVQ   1   0          0            0       208b           208b
yellow open   .plugins-ml-config           IEeXrm-DRiOMm2qzo7PbqA   1   1          1            0      3.9kb          3.9kb
yellow open   test-kw-index                6EF4Q2xDT9Gz1wua5a2IpQ   3   1        158            0      5.6mb          5.6mb
yellow open   kw-full                      uNhdaqbnRVuyJci_L1Om8Q   3   1       6678            0     12.1mb         12.1mb
yellow open   semantic-full                ydvbifM0Rcu0DBz-rjCHzQ   3   1       6678            0     74.2mb         74.2mb
yellow open   security-auditlog-2023.09.04 ApPSasK0S7CadlbWroBRYA   1   1        908            0      1.7mb          1.7mb
yellow o

In [13]:
osclient.cat.count(index=index_name, format="json")

[{'epoch': '1694128901', 'timestamp': '23:21:41', 'count': '284'}]

In [19]:
def keyword_search(query: str, index: str, size: int=10, return_raw: bool=False):
    """Run a keyword ``match`` query on the ``content`` field of ``index``.

    Documents whose ``content`` matches the phrase "Vishal" are excluded by a
    ``must_not`` filter. The search is sent through the notebook-level
    ``osclient``.

    Args:
        query: Text matched against the ``content`` field.
        index: Name of the index to search.
        size: Value sent as the ``size`` of the search body.
        return_raw: When True, return the full search response instead of
            only the hits.

    Returns:
        The response from ``osclient.search`` if ``return_raw`` is True,
        otherwise ``response['hits']['hits']``.
    """
    returned_fields = ['content','group_id','show_link','video_id','length','publish_date','thumbnail_url','title','views']
    match_clause = {"match": {"content": query}}
    exclusion_filter = {"bool": {"must_not": {"match_phrase": {"content": "Vishal"}}}}
    body = {
        "_source": returned_fields,
        "size": size,
        "query": {"bool": {"must": match_clause, "filter": exclusion_filter}},
    }
    response = osclient.search(body=body, index=index)
    return response if return_raw else response['hits']['hits']

In [21]:
query = "Does trump have support in iowa"

In [23]:
# osclient.keyword_search(query, index=index_name)

In [25]:
osclient.indices.get_mapping(index=index_name)

{'kw-538-testrun': {'mappings': {'properties': {'channel_id': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'content': {'type': 'text'},
    'description': {'type': 'text'},
    'episode_url': {'type': 'keyword', 'index': False},
    'group_id': {'type': 'short', 'index': False},
    'length': {'type': 'long', 'index': False},
    'playlist_id': {'type': 'keyword', 'index': False},
    'publish_date': {'type': 'keyword', 'index': False},
    'show_link': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'thumbnail_url': {'type': 'keyword', 'index': False},
    'title': {'type': 'text'},
    'video_id': {'type': 'keyword', 'index': False},
    'views': {'type': 'long', 'index': False}}}}}

### Semantic Indexing

In [26]:
#upload data
utils = Utilities()

In [62]:
path = '/home/elastic/notebooks/vector_search_applications/data/podcast_transcripts/fivethirtyeight/'

In [74]:
# data = utils.json_data_loader(path)
# #be sure to change vector field name to "content_embedding"
# # [d.update(content_embedding=d['vector']) for d in data]
# # for d in data:
# #     del d['vector']

In [27]:
sem_index = 'semantic-538-testrun'
model = SentenceTransformer('all-minilm-l6-v2')
# osclient.indices.delete(sem_index)

In [29]:
osclient.indices.delete(sem_index)

{'acknowledged': True}

In [30]:
osclient.document_indexer(index_name=sem_index, data=data, body_template=youtube_body, semantic_index=True)

[32m2023-09-07 23:33:02.450[0m | [1mINFO    [0m | [36mopensearch_interface[0m:[36mdocument_indexer[0m:[36m206[0m - [1mThe ** semantic-538-testrun ** index was created[0m
[32m2023-09-07 23:33:02.451[0m | [1mINFO    [0m | [36mopensearch_interface[0m:[36mdocument_indexer[0m:[36m217[0m - [1mThe # of documents to be indexed = 284[0m


In [159]:
def vector_search(query: str, 
                  index: str, 
                  model: Union[str, SentenceTransformer], 
                  size: int=10,
                  k: int=10,
                  return_raw: bool=False
                  ) -> Union[Dict, List[dict]]:
    """Embed ``query`` and run a knn search against ``content_embedding``.

    The search is sent through the notebook-level ``osclient``.

    Args:
        query: Text to embed and search for.
        index: Name of the index to search.
        model: Either a ``SentenceTransformer`` instance or a model name/path
            string. A string is loaded with ``SentenceTransformer(model)`` on
            every call, so pass an instance when searching repeatedly.
        size: Value sent as the ``size`` of the search body.
        k: Value sent as ``k`` in the knn clause.
        return_raw: When True, return the full search response instead of
            only the hits.

    Returns:
        The response from ``osclient.search`` if ``return_raw`` is True,
        otherwise ``response['hits']['hits']``.
    """
    # Previously a str model (allowed by the signature) left query_embedding
    # unassigned and the function failed with UnboundLocalError.
    encoder = SentenceTransformer(model) if isinstance(model, str) else model
    query_embedding = encoder.encode(query).tolist()

    # NOTE(review): 'episode_id', 'episode_num' and 'mp3_url' are not among the
    # record keys listed earlier in this notebook, and 'video_id' (read by
    # cleanup()) is not requested here - confirm this field list.
    body = {
        "_source": ['title', 'episode_id', 'group_id', 'episode_num', 'episode_url', 'mp3_url', 'content'],
        "size": size,
        "query": {"knn": {"content_embedding": {"vector": query_embedding, "k": k}}},
    }
    response = osclient.search(body=body, index=index)
    return response if return_raw else response['hits']['hits']

In [73]:
query = 'which color to assign the state of ohio'

In [68]:
response = osclient.vector_search(query, sem_index, model)

### Reranking + Hybrid Search

In [50]:
reranker = ReRanker()

In [74]:
hybrid = osclient.hybrid_search(query, kw_index='kw-538-testrun', vec_index=sem_index, model=model)
final = reranker.rerank(hybrid, query, top_k=20)

In [71]:
def cleanup(results: List[dict]) -> List[dict]:
    """Remove repeated hits, keeping the first occurrence of each one.

    Two hits count as the same when the string
    ``f"{group_id}-{video_id}"`` built from their ``_source`` is equal.
    Input order is preserved and ``results`` itself is not modified.

    Args:
        results: Hits, each with a ``_source`` dict containing ``group_id``
            and ``video_id``.

    Returns:
        A new list holding the first hit seen for every distinct key.
    """
    seen_keys = set()
    deduped = []
    for hit in results:
        source = hit['_source']
        key = f"{source['group_id']}-{source['video_id']}"
        if key not in seen_keys:
            seen_keys.add(key)
            deduped.append(hit)
    return deduped

In [79]:
final = cleanup(final)

In [80]:
def display_results(results: List[dict], threshhold: float=0.0, include_indexes: bool=False):
    """Keep only the hits whose ``cross_score`` is strictly above ``threshhold``.

    Args:
        results: Hits, each carrying a ``cross_score`` entry (and an
            ``_index`` entry when ``include_indexes`` is True).
        threshhold: Exclusive lower bound on ``cross_score``.
        include_indexes: When True, also return the ``_index`` value of every
            hit that was kept.

    Returns:
        The filtered list, or a ``(filtered, indices)`` tuple when
        ``include_indexes`` is True.
    """
    kept = []
    for hit in results:
        if hit['cross_score'] > threshhold:
            kept.append(hit)
    if not include_indexes:
        return kept
    return kept, [hit['_index'] for hit in kept]

In [84]:
display_results(final, threshhold=-10, include_indexes=True)

([{'_index': 'kw-538-testrun',
   '_id': 'd4nzcYoB1G4yg5ZoISM4',
   '_score': 6.8077273,
   '_source': {'group_id': 2,
    'show_link': 'https://www.youtube.com/watch?v=0H9CfT351U0&list=PLAiITlQWhsRTcIw_C4SHr0y_WqogzR8lR',
    'length': 3849,
    'thumbnail_url': 'https://i.ytimg.com/vi/0H9CfT351U0/hq720.jpg',
    'title': 'Who Is The Likeliest GOP VP Candidate?',
    'publish_date': '08/14/2023',
    'content': "I think we can probably call Ohio a red state at this point, although I got a Politico alert earlier today saying that the Biden campaign suggested that both North Carolina and Ohio were in play for them in 2024. They can say what they want. We will find out. Either way, abortion rights advocates notched a win in Ohio. Last Tuesday, Ohioans voted by a 14-point margin not to raise the threshold to amend the Constitution to a 60% supermajority. Instead, it will remain a simple majority, making it likelier that Ohioans will pass an amendment to codify abortion rights into the sta