In [1]:

import pandas as pd
from settings import INPUT_DATA_PATH, EMBEDDING_MODEL, INDEX_NAME
from src.logs import logger
# Read the CSV located at INPUT_DATA_PATH into a DataFrame.
df = pd.read_csv(INPUT_DATA_PATH)


In [2]:
from sentence_transformers import SentenceTransformer

# Build the SentenceTransformer configured through EMBEDDING_MODEL.
embedding_model = SentenceTransformer(EMBEDDING_MODEL)

  from .autonotebook import tqdm as notebook_tqdm


# Building index

1. settings / mappings
2. ingest data code
3. synonyms: add volume to docker file
4. show UI index management -> explorer to see the documents
5. Query on UI: normal query, synonyms
6. Query on python: normal query, terms id, multi search, vector search

##### update policy
##### alias

In [3]:
# %reload_ext autoreload
# %autoreload 2
from src.utils import split_data_chunk

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/yuhsuanting/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yuhsuanting/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/yuhsuanting/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Flatten what split_data_chunk returns for every DataFrame row into one list.
full_data = [
    chunk
    for _, row in df.iterrows()
    for chunk in split_data_chunk(embedding_model=embedding_model, row_data=row)
]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [5]:
# Number of records accumulated in full_data.
len(full_data)

400

In [6]:
# Inspect the first record (the displayed value includes the full embedding array).
full_data[0]

{'product_description': "Nike Air Force 1 '07 part 1: doesnt get legendary designed turn head nike air force 1 07 cross hardwood comfort offcourt flair crisp leather upper look sleek fresh lustrous swoosh logo give almost iridescent look add perfect amount flash make shine consider slam dunk",
 'id': 'C0UZQQIYM7_1',
 'title': "Nike Air Force 1 '07",
 'subtitle': "Men's Shoes",
 'product_id': 'C0UZQQIYM7',
 'embedded_product_description': array([ 1.91667620e-02, -2.45630071e-02, -1.29681057e-03,  3.28077786e-02,
         5.56558589e-05, -8.99029523e-03, -6.80366829e-02, -5.00352122e-03,
        -2.24727613e-04, -3.03653460e-02,  2.57720277e-02,  6.81640133e-02,
         3.34987529e-02,  8.51920545e-02,  3.39937210e-02,  5.19116260e-02,
        -1.18538840e-02, -5.08263614e-03,  1.03366915e-02,  5.66853676e-04,
         1.10639744e-02,  8.06545187e-03,  8.24122876e-03,  4.02045362e-02,
         2.10054293e-02, -2.97298133e-02,  4.56043407e-02,  3.59699596e-03,
        -3.48684229e-02, -2

In [None]:
# INDEX_NAME = INDEX_NAME+"_1"
from src.elasticsearch_helper import ES_HELPER

# Create the index only when it is missing, then hand full_data to update_data.
es_helper = ES_HELPER()
index_already_there = es_helper.is_index_exists(INDEX_NAME)
if not index_already_there:
    es_helper.create_index(INDEX_NAME)
es_helper.update_data(index_name=INDEX_NAME, data=full_data)

In [None]:
# es_helper._delete_index(INDEX_NAME)

## Search on python

In [None]:
"""
GET /nike_product/_search
{
  "_source": ["product_id","title","subtitle","product_description"],
  "query": {
    "term": {
      "product_id": "C0UZQQIYM7"
    }
  }
}

"""
# Term query on product_id, returning only the listed _source fields.
# Query the configured INDEX_NAME (the index this notebook ingests into)
# instead of a hard-coded literal, so the search follows the settings.
resp = es_helper.client.search(
    index=INDEX_NAME,
    _source=["product_id", "title", "subtitle", "product_description"],
    query={
        "term": {
            "product_id": "C0UZQQIYM7"
        }
    },
)
print(resp)


In [None]:
# Full-text query over title, subtitle and product_description; the ^3 / ^2
# suffixes boost title and subtitle relative to the description.
# Query the configured INDEX_NAME (the index this notebook ingests into)
# instead of a hard-coded literal, so the search follows the settings.
resp = es_helper.client.search(
    index=INDEX_NAME,
    _source=["product_id", "title", "subtitle", "product_description"],
    query={
        "multi_match": {
            "query": "running shoes female",
            "fields": ["title^3", "subtitle^2", "product_description"],
            "analyzer": "custom_analyzer"
        }
    },
    size=3,
)

print(resp)


# Vector search similarity search + KNN

In [None]:
input_text = "running shoes female"
# Encode the query text with the same model used for the stored embeddings.
embedded_question = embedding_model.encode(input_text)

# Hybrid query made of two "should" clauses:
#   1. script_score over match_all: cosine similarity between the query vector
#      and the stored 'embedded_product_description' field; the + 1.0 shifts
#      the cosine value into a non-negative range.
#   2. a plain text match on title.
query_element = {
    "bool": {
        "should": [
            {
                "script_score": {
                    "query": {"match_all": {}},
                    "script": {
                        "source": """
                        double score = cosineSimilarity(params.query_vector, 'embedded_product_description') + 1.0;
                        return score;
                        """,
                        "params": {"query_vector": embedded_question},
                    },
                }
            },
            {"match": {"title": {"query": input_text, "boost": 1.0}}},
            # {
            #         "multi_match": {
            #             "query": input_text,
            #             "fields": [
            #                 "title^3",
            #                 # "content"
            #             ],
            #             "minimum_should_match": "1"
            #         }
            #     }
        ],
        "minimum_should_match": 1,
    }
}

# Score cut-off actually sent with the request. Keeping it in one variable
# avoids a defined-but-unused threshold that disagrees with the call.
min_score = 0.5
result = es_helper.client.search(
    index=INDEX_NAME,
    query=query_element,
    _source=["product_id", "title", "subtitle", "product_description"],
    min_score=min_score,
    size=10,
    request_cache=True,
    track_scores=True,
    explain=False,
)

In [None]:
# Print the response stored in `result` by the hybrid search cell.
print(result)

# Enterprise version

- Hosting vectorize model + ingest pipeline to facilitate the process, [documentation](https://www.elastic.co/docs/explore-analyze/machine-learning/nlp/ml-nlp-text-emb-vector-search-example)

# Common production update use case: setting up an alias, and how to achieve the same in Databricks

Hourly update on the index, full update on a weekly basis, and set up an alias



### Alias on ElasticSearch

In [None]:
# Alias name used by the alias cells that follow.
alias_name = "nike_product_alias"
# INDEX_NAME = "nike_product"

In [None]:
# Submit a single "add" action that attaches alias_name to INDEX_NAME.
es_helper.client.indices.update_aliases(actions=[{"add": {"index": INDEX_NAME, "alias": alias_name}}])

In [None]:
# Look up alias_name; the cell displays whatever get_alias returns.
es_helper.client.indices.get_alias(name=alias_name)

In [None]:
"""
POST _aliases
{
  "actions": [
    {
      "remove": {
        "index": "index1",
        "alias": "logs-non-existing"
      }
    },
    {
      "add": {
        "index": "index2",
        "alias": "logs-non-existing"
      }
    }
  ]
}
"""
# example code 
def redirect_alias(
    alias_name: str, index_name: str
) -> None:
    """Make ``alias_name`` point at ``index_name`` only.

    Every index that currently carries the alias gets a ``remove`` action and
    ``index_name`` gets an ``add`` action. All actions are sent in a single
    ``update_aliases`` call and the response is printed.

    Args:
        alias_name: Alias to (re)assign.
        index_name: Index the alias should point to afterwards.

    Raises:
        Any exception raised by the client calls. Failures are no longer
        swallowed: silently skipping the lookup would drop the ``remove``
        actions and leave the alias attached to several indices.
    """
    indices_client = es_helper.client.indices

    # An alias that does not exist yet has nothing to detach. Check for it
    # explicitly instead of hiding every lookup error behind a bare except.
    if indices_client.exists_alias(name=alias_name):
        current_indices = list(indices_client.get_alias(name=alias_name).keys())
    else:
        current_indices = []

    actions = [
        {"remove": {"index": current_index, "alias": alias_name}}
        for current_index in current_indices
    ]
    actions.append({"add": {"index": index_name, "alias": alias_name}})

    res = indices_client.update_aliases(actions=actions)
    print(res)

In [None]:
# Repoint alias_name at the "nike_product" index.
redirect_alias(alias_name, "nike_product")

In [None]:
# Look up alias_name again to see which indices it is attached to after the redirect.
es_helper.client.indices.get_alias(name=alias_name)

In [None]:
# Run the query held in query_element, addressed through the alias rather than the index name.
alias_search_kwargs = {
    "index": alias_name,  # INDEX_NAME,
    "query": query_element,
    "_source": ["product_id", "title", "subtitle", "product_description"],
    "min_score": 0.5,
    "size": 10,
    "request_cache": True,
    "track_scores": True,
    "explain": False,
}
es_helper.client.search(**alias_search_kwargs)

### Alias concept on Databricks

Why is it important? Versioning allows rollback, and if we update many documents at once we can experience delays or increased latency during the sync. So when we do a large sync, it is better to apply the alias concept.

* Upserting data doesn't block production, same as in Elasticsearch
* We can also create a new index, but then we need to redirect the search to use the new endpoint
    1. Create a metadata table that logs the latest index
    2. Upsert the data to the source table
    3. Create a new index and sync it
    4. Once the sync is done, update the metadata table
    5. When calling, we query the metadata table to get the latest completed index (so the program that calls the index search does not need to change)



# (Optional) if time available, how to perform hybrid search

https://www.databricks.com/blog/announcing-hybrid-search-general-availability-mosaic-ai-vector-search