In [1]:

import pandas as pd
from settings import INPUT_DATA_PATH, EMBEDDING_MODEL, INDEX_NAME
from src.logs import logger
# Load the raw input data from the CSV path configured in `settings`.
df = pd.read_csv(INPUT_DATA_PATH)


In [2]:
from sentence_transformers import SentenceTransformer

# Instantiate the sentence-transformers model named by EMBEDDING_MODEL in `settings`;
# later cells pass this object to the chunking helper.
embedding_model = SentenceTransformer(EMBEDDING_MODEL)

  from .autonotebook import tqdm as notebook_tqdm


# Building index

1. settings / mappings
2. ingest data code
3. synonyms: add a volume to the Docker file
4. show UI index management -> explorer to see the documents
5. Query on UI: normal query, synonyms
6. Query on python: normal query, terms id, multi search, vector search

##### update policy
##### alias

In [3]:
# %reload_ext autoreload
# %autoreload 2
from src.utils import split_data_chunk

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/yuhsuanting/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yuhsuanting/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/yuhsuanting/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Flatten the per-row results into a single list: every DataFrame row is handed
# to split_data_chunk together with the embedding model, and each item it
# yields is collected in row order.
full_data = [
    chunk
    for _, row in df.iterrows()
    for chunk in split_data_chunk(embedding_model=embedding_model, row_data=row)
]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [5]:
# Sanity check: number of items collected in full_data (shown as the cell output).
len(full_data)

400

In [9]:
from src.elasticsearch_helper import ES_HELPER

# Instantiate the project's Elasticsearch helper, create the index only when it
# does not exist yet, then hand the prepared items to update_data for that index.
es_helper = ES_HELPER()
index_missing = not es_helper.is_index_exists(INDEX_NAME)
if index_missing:
    es_helper.create_index(INDEX_NAME)
es_helper.update_data(index_name=INDEX_NAME, data=full_data)

2025-07-03 15:37:19,472 ES_DEMO INFO https://localhost:9200


In [None]:
# es_helper._delete_index(INDEX_NAME)

2025-07-03 15:36:27,067 ES_DEMO INFO index nike_product deleted


ObjectApiResponse({'acknowledged': True})

## Search in Python

In [None]:
"""
GET /nike_product/_search
{
  "_source": ["product_id","title","subtitle","product_description"],
  "query": {
    "term": {
      "product_id": "C0UZQQIYM7"
    }
  }
}

"""
# Exact-match lookup of one product by its product_id using a `term` query.
# The string above is the equivalent REST request for reference.
# The index comes from the shared INDEX_NAME setting (as in the indexing cell)
# rather than a hard-coded literal, so this cell keeps working if the
# configured index name changes.
resp = es_helper.client.search(
    index=INDEX_NAME,
    _source=["product_id", "title", "subtitle", "product_description"],
    query={
        "term": {
            "product_id": "C0UZQQIYM7"
        }
    }
)
print(resp)


{'took': 20, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 1, 'relation': 'eq'}, 'max_score': 5.588496, 'hits': [{'_index': 'nike_product', '_id': 'C0UZQQIYM7_1', '_score': 5.588496, '_source': {'product_description': "Nike Air Force 1 '07 part 1: doesnt get legendary designed turn head nike air force 1 07 cross hardwood comfort offcourt flair crisp leather upper look sleek fresh lustrous swoosh logo give almost iridescent look add perfect amount flash make shine consider slam dunk", 'title': "Nike Air Force 1 '07", 'subtitle': "Men's Shoes", 'product_id': 'C0UZQQIYM7'}}]}}


In [16]:
# Full-text search across several fields with a `multi_match` query.
# `^3` / `^2` boost matches in title and subtitle over product_description;
# the query text is analyzed with the index's `custom_analyzer`.
# The index comes from the shared INDEX_NAME setting (as in the indexing cell)
# rather than a hard-coded literal, so this cell keeps working if the
# configured index name changes.
resp = es_helper.client.search(
    index=INDEX_NAME,
    _source=["product_id", "title", "subtitle", "product_description"],
    query={
        "multi_match": {
            "query": "running shoes female",
            "fields": ["title^3", "subtitle^2", "product_description"],
            "analyzer": "custom_analyzer"
        }
    },
    size=3,  # only the top three hits are needed for the demo
)

print(resp)


{'took': 70, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 224, 'relation': 'eq'}, 'max_score': 12.409704, 'hits': [{'_index': 'nike_product', '_id': 'SJ4Z8YKC02_1', '_score': 12.409704, '_source': {'product_description': "Jordan Women's Paris Collective x LALA &ce part 1: check expectation redefine style collab tee parisian musical artist lala ce embroidered messaging sleeve speaks experience black woman symbolic graphic back showcase strength soft durable cotton slightly baggy fit comfort get going ease", 'title': "Jordan Women's Paris Collective x LALA &ce", 'subtitle': "Women's T-Shirt", 'product_id': 'SJ4Z8YKC02'}}, {'_index': 'nike_product', '_id': 'GGVJVI89ZW_1', '_score': 8.968395, '_source': {'product_description': 'Nike React Infinity 3 part 1: stay foot soft supportive cushioning built help keep run wider forefoot higher foam stack help shield recurring attrition giving peace mind pound pavement every day

# Vector search: similarity search + KNN

In [None]:
input_text = "running shoes female"

# Embed the query text so it can be compared against the stored vectors.
# This name was previously referenced without ever being defined, which made
# the cell fail with NameError on a fresh kernel. `.tolist()` turns the numpy
# vector into a plain list so it can be serialized into the script params.
# NOTE(review): assumes the `embedded_content` field was produced by this same
# `embedding_model` (and without extra query-side preprocessing) — confirm
# against split_data_chunk.
embedded_question = embedding_model.encode(input_text).tolist()

# Hybrid query: a vector-similarity clause plus a keyword match on the title.
query_element = {
    "bool": {
        "should": [
            {
                "script_score": {
                    "query": {"match_all": {}},
                    "script": {
                        # +1.0 shifts cosine similarity from [-1, 1] to [0, 2],
                        # keeping the script score non-negative.
                        "source": """
                        double score = cosineSimilarity(params.query_vector, 'embedded_content') + 1.0;
                        return score;
                        """,
                        "params": {"query_vector": embedded_question},
                    },
                }
            },
            {"match": {"title": {"query": input_text, "boost": 1.0}}},
            # {
            #         "multi_match": {
            #             "query": input_text,
            #             "fields": [
            #                 "title^3",
            #                 # "content"
            #             ],
            #             "minimum_should_match": "1"
            #         }
            #     }
        ],
        "minimum_should_match": 1,
    }
}

# Single source of truth for the score cut-off. Previously this variable was
# set to 0 but never used, while a literal 0.5 was passed to the search call;
# the effective value (0.5) is kept.
min_score = 0.5
result = es_helper.client.search(
    index=INDEX_NAME,
    query=query_element,
    _source=["product_id", "title", "subtitle", "product_description"],
    min_score=min_score,
    size=10,
    request_cache=True,
    track_scores=True,
    explain=False,
)

# Enterprise version

- Host the vectorization (embedding) model and use an ingest pipeline to streamline the process