In [3]:
import ast
import datetime
import json
import os
import time

from elasticsearch import Elasticsearch
import numpy as np
import pandas as pd
import pytrec_eval
from tqdm import tqdm

from query_functions import query_elasticsearch_hybrid

current_timestamp = datetime.datetime.now()


In [4]:
# Connect to local elastic
# The client targets plain HTTP on localhost:9200; no credentials are passed.

es = Elasticsearch('http://localhost:9200')
# ping() is this cell's displayed value (the recorded output is True).
es.ping()

True

# Load Data

In [5]:
def convert_to_dict(string):
    """Parse the string form of a dict literal back into a ``dict``.

    Parameters
    ----------
    string : str
        Text such as ``"{'a': 1}"``. Non-string input (for example the NaN
        that pandas uses for an empty CSV cell) is tolerated.

    Returns
    -------
    dict or None
        The parsed dictionary, or ``None`` when the input cannot be parsed
        or parses to something that is not a dictionary.
    """
    try:
        parsed = ast.literal_eval(string)
    except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
        # literal_eval documents all five of these for malformed input, e.g.
        # TypeError for an unhashable dict key such as "{[1]: 2}".
        return None

    # A valid literal that is not a dict (list, number, ...) is still not a
    # usable attribute mapping for the callers, so treat it as missing.
    return parsed if isinstance(parsed, dict) else None


In [6]:
def dict_to_string(my_dict):
  """Flatten a mapping into one ``"key value key value "`` string.

  Each key and each value is converted with ``str`` and followed by a single
  space, so a non-empty result ends with a trailing space. An empty mapping
  gives ``""``.
  """
  pieces = []
  for key, value in my_dict.items():
    pieces.extend((str(key), ' ', str(value), ' '))
  return "".join(pieces)

In [7]:
# Products

filename = "processed_data/df_prods.csv"

# Stop with a clear error when the file is missing: printing a message and
# carrying on would leave df_prods undefined (or stale from an earlier run)
# for the .head() call below and every later cell.
if not os.path.isfile(filename):
    raise FileNotFoundError(f"Cannot locate file: {filename}")

df_prods = pd.read_csv(filename)
# The attributes column is stored as text in the CSV; parse it back into dicts
# (unparseable cells become None).
df_prods['product_attributes'] = df_prods['product_attributes'].apply(convert_to_dict)

df_prods.head()

Unnamed: 0,product_uid,product_title,product_description,product_attributes
0,100001,Simpson Strong-Tie 12-Gauge Angle,"Not only do angles make joints stronger, they ...",{'Bullet01': 'Versatile connector for various ...
1,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,BEHR Premium Textured DECKOVER is an innovativ...,"{'Application Method': 'Brush,Roller,Spray', '..."
2,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,Update your bathroom with the Delta Vero Singl...,"{'Bath Faucet Type': 'Combo Tub and Shower', '..."
3,100006,Whirlpool 1.9 cu. ft. Over the Range Convectio...,Achieving delicious results is almost effortle...,"{'Appliance Type': 'Over the Range Microwave',..."
4,100007,Lithonia Lighting Quantum 2-Light Black LED Em...,The Quantum Adjustable 2-Light LED Black Emerg...,"{'Battery Power Type': 'Ni-Cad', 'Battery Size..."


In [8]:
# Flatten each attribute dict to text; rows whose attributes are None stay None.
attribute_strings = []
for attributes in df_prods['product_attributes']:
    attribute_strings.append(attributes if attributes is None else dict_to_string(attributes))
df_prods['product_attributes_string'] = attribute_strings

# One combined text field per product: title, description and attribute text
# joined by single spaces, with missing pieces replaced by ''.
title_text = df_prods['product_title'].fillna('')
description_text = df_prods['product_description'].fillna('')
attributes_text = df_prods['product_attributes_string'].fillna('')
df_prods['product_text_string'] = title_text + ' ' + description_text + ' ' + attributes_text

In [9]:
# Query

filename = "processed_data/df_queries.csv"

# Stop with a clear error when the file is missing: printing a message and
# carrying on would leave df_queries undefined (or stale from an earlier run)
# for the .head() call below and every later cell.
if not os.path.isfile(filename):
    raise FileNotFoundError(f"Cannot locate file: {filename}")

df_queries = pd.read_csv(filename)

df_queries.head()

Unnamed: 0,search_term,query_id,has_relevant_results
0,angle bracket,6e0a07626e48aee6f7ce9ec6cd753426d6acafded1598f...,1
1,l bracket,5863e75dfdc9ae5db3f6b4dbddf129d5568e085bf57711...,1
2,deck over,406b3569b2db043604fdb42a67f4ec49964a5ff07cddf0...,1
3,rain shower head,49b2dc56a0e1945c435c1579c07df519878619e3e8d59d...,1
4,shower only faucet,7620551bacb6cdddca5f33ec0943cea7971095a1e9be06...,1


In [10]:
# Relevance

filename = "processed_data/df_relevance.csv"

# Stop with a clear error when the file is missing: printing a message and
# carrying on would leave df_relevance undefined (or stale from an earlier
# run) for the .head() call below.
if not os.path.isfile(filename):
    raise FileNotFoundError(f"Cannot locate file: {filename}")

df_relevance = pd.read_csv(filename)

df_relevance.head()

Unnamed: 0,query_id,product_uid,relevance
0,6e0a07626e48aee6f7ce9ec6cd753426d6acafded1598f...,100001,3.0
1,5863e75dfdc9ae5db3f6b4dbddf129d5568e085bf57711...,100001,2.5
2,406b3569b2db043604fdb42a67f4ec49964a5ff07cddf0...,100002,3.0
3,49b2dc56a0e1945c435c1579c07df519878619e3e8d59d...,100005,2.33
4,7620551bacb6cdddca5f33ec0943cea7971095a1e9be06...,100005,2.67


# Embeddings

In [11]:
# Embeddings were run on Kaggle for GPU
# Only the saved array is loaded here; nothing in this notebook computes embeddings.

query_embeddings = np.load('processed_data/query_embeddings.npy')

In [12]:
# Attach the embeddings to the queries by position, one plain Python list per row.
df_queries['query_embedding'] = list(map(lambda embedding: embedding.tolist(), query_embeddings))
df_queries.head()

Unnamed: 0,search_term,query_id,has_relevant_results,query_embedding
0,angle bracket,6e0a07626e48aee6f7ce9ec6cd753426d6acafded1598f...,1,"[-0.058361075818538666, 0.026495283469557762, ..."
1,l bracket,5863e75dfdc9ae5db3f6b4dbddf129d5568e085bf57711...,1,"[-0.027898991480469704, -0.024725843220949173,..."
2,deck over,406b3569b2db043604fdb42a67f4ec49964a5ff07cddf0...,1,"[-0.07859756052494049, -0.00036610415554605424..."
3,rain shower head,49b2dc56a0e1945c435c1579c07df519878619e3e8d59d...,1,"[-0.020222697407007217, 0.06791018694639206, 0..."
4,shower only faucet,7620551bacb6cdddca5f33ec0943cea7971095a1e9be06...,1,"[-0.003950382117182016, 0.012035505846142769, ..."


# Hybrid Queries

In [13]:
# Elasticsearch index passed to query_elasticsearch_hybrid by the cells below.
index_name = 'products-embeddings'

## Sample Query

In [12]:
# A sample query

# The row labelled 0 in df_queries supplies the id, embedding and text.
search_query_id = df_queries.loc[0, 'query_id']
search_vector = df_queries.loc[0, 'query_embedding']
search_text = df_queries.loc[0, 'search_term']

results = query_elasticsearch_hybrid(
    es,
    index_name,
    search_text=search_text,
    search_vector=search_vector,
    num_results=10,
)

# One DataFrame row per returned hit.
raw_hits = results['hits']['hits']
hits = pd.DataFrame(raw_hits)

In [13]:

# Unpack each hit's _source document into flat columns on `hits`.
sources = list(hits['_source'])

product_uids = [source['product_uid'] for source in sources]
product_titles = [source['product_title'] for source in sources]
product_descriptions = [source['product_description'] for source in sources]
product_attributes = [source['product_attributes'] for source in sources]

# Only the first query_scores entry of a hit is shown; hits with none get None.
# NOTE(review): that first entry is not filtered by search_query_id, so the
# relevance shown can belong to a different query - confirm this is intended.
first_scores = [
    source['query_scores'][0] if len(source['query_scores']) > 0 else None
    for source in sources
]
query_id_list = [None if score is None else score['query_id'] for score in first_scores]
relevances = [None if score is None else score['relevance'] for score in first_scores]

hits['product_uid'] = product_uids
hits['product_title'] = product_titles
hits['product_description'] = product_descriptions
hits['product_attribute'] = product_attributes
hits['query_id'] = query_id_list
hits['relevance'] = relevances

hits.head()


Unnamed: 0,_index,_id,_score,_source,product_uid,product_title,product_description,product_attribute,query_id,relevance
0,products-embeddings,VEg05Y8BqeArwfGV1Foe,40.223774,"{'product_uid': 198519, 'product_title': 'Vera...",198519,Veranda White Vinyl Traditional Left/Right Ang...,The Traditional Vinyl White Left/Right Angle B...,"[{'name': 'Accessory type', 'value': 'Left/Rig...",45476ecfe43c98557ae68d2eec00a37d8f21c0fef3d913...,2.0
1,products-embeddings,gkg95Y8BqeArwfGVwOtd,34.11208,"{'product_uid': 174846, 'product_title': 'Marq...",174846,Marquee Railing Black Left Multi-Angle Bracket...,"Designed with a beautiful hammered-metal look,...","[{'name': 'Accessory type', 'value': 'Bracket'...",,
2,products-embeddings,_Eg95Y8BqeArwfGVMeH_,34.11095,"{'product_uid': 170398, 'product_title': 'Marq...",170398,Marquee Railing Black Right Multi-Angle Bracke...,"Designed with a beautiful hammered-metal look,...","[{'name': 'Accessory type', 'value': 'Bracket'...",,
3,products-embeddings,6UlG5Y8BqeArwfGVwXXR,33.91701,"{'product_uid': 223811, 'product_title': 'Marq...",223811,Marquee Railing White Left Multi-Angle Bracket...,"Designed with a beautiful hammered-metal look,...","[{'name': 'Accessory type', 'value': 'Bracket'...",,
4,products-embeddings,C0g15Y8BqeArwfGVuWlK,31.861212,"{'product_uid': 102005, 'product_title': 'Supe...",102005,Superstrut 90Ë 4-Hole Angle Bracket - Silver ...,The Superstrut 90-Degree 4-Hole Channel Bracke...,"[{'name': 'Bullet01', 'value': 'Use to support...",b39f9b993b97bb3eb11c9dc73220691215ab99971b52b7...,2.67


## Run the hybrid queries

In [14]:
# Filter down to queries that have results

has_results_mask = df_queries['has_relevant_results'].eq(1)
relevant_queries = df_queries.loc[has_results_mask]

In [15]:
# Boost settings handed to query_elasticsearch_hybrid as boost_values for the
# baseline hybrid run.
# NOTE(review): how each key is interpreted is defined in query_functions,
# which is not visible here.
boosts = {"title_boost": 8,
          "description_boost": 2,
          "attributes_boost": 1,
          "product_text_string_vector_boost": 1}

In [16]:
# Create query result dictionaries

filename = "query_runs/run_hybrid.json"

if os.path.isfile(filename):
    # Cached run: reuse the saved results instead of re-querying Elasticsearch.
    with open(filename, "r") as file:
        run_h = json.load(file)

    # Nothing was timed in this branch. NaN keeps `end_time - start_time` in
    # the evaluation cell from raising NameError on a fresh kernel or from
    # silently reusing the timings of a different run.
    start_time = end_time = float("nan")
else:
    run_h = {}

    start_time = time.time()

    for _, query_row in tqdm(relevant_queries.iterrows(), total=len(relevant_queries)):
        search_vector = query_row['query_embedding']
        search_query_id = query_row['query_id']
        search_text = query_row['search_term']

        results = query_elasticsearch_hybrid(es, index_name, search_text=search_text,
                                             search_vector=search_vector, num_results=10, boost_values=boosts)

        # Run entry for this query: {product_uid as str: score}. The raw hit
        # list is read directly, so no DataFrame is built per query and the
        # outer loop variables are not shadowed.
        run_h[search_query_id] = {
            str(hit['_source']['product_uid']): float(hit['_score'])
            for hit in results['hits']['hits']
        }

    end_time = time.time()

    with open(filename, "w") as file:
        json.dump(run_h, file)

100%|██████████| 11795/11795 [11:02<00:00, 17.80it/s]


## Evaluate Results

In [17]:
# Load ground truth
# qrel is the first argument given to pytrec_eval.RelevanceEvaluator below.
filename = "query_runs/qrel.json"

with open(filename, "r") as file:
    qrel = json.load(file)

In [18]:
# Place to store results and initialize an evaluator

# ranking_results collects one summary DataFrame per evaluated run.
ranking_results = []
# Measure names requested from pytrec_eval; each is averaged over queries later.
measures = {'map_cut_10', 'ndcg_cut_10', 'recip_rank'} 
evaluator = pytrec_eval.RelevanceEvaluator(qrel, measures)

In [19]:
# Evaluate hybrid queries

# Per-query scores: one row per measure, one column per query id.
per_query_scores = pd.DataFrame(evaluator.evaluate(run_h))
results_dict = {'mean '+measure: per_query_scores.loc[measure].mean() for measure in measures}

# Single summary row for this run, tagged with its name, duration and timestamp.
results_df = pd.DataFrame(results_dict, index=[0]).assign(
    run_name='hybrid',
    run_time=end_time - start_time,
    run_timestamp=current_timestamp,
)

ranking_results.append(results_df)

In [20]:
# Put the results into a dataframe, add to previous results

ranking_results = pd.concat(ranking_results)

# Rows saved by earlier runs come first, this session's rows after them.
previous_results = pd.read_csv('query_runs/query_results.csv')
all_results = pd.concat([previous_results, ranking_results])
all_results

Unnamed: 0,mean ndcg_cut_10,mean map_cut_10,mean recip_rank,run_name,run_time,run_timestamp
0,0.170037,0.112637,0.26115,textsearch,178.50484,2024-05-06 20:30:09.220148
1,0.217595,0.149411,0.31771,textsearch_boosted,207.407446,2024-05-06 20:30:09.220148
2,0.236762,0.159441,0.330878,vectorsearch,509.612543,2024-06-23 12:50:09.220148
3,0.155515,0.096555,0.241066,vectorsearch_multifield,622.987846,2024-06-23 12:50:09.220148
4,0.167586,0.10611,0.254867,vectorsearch_multifield_tuned,662.439398,2024-06-23 12:50:09.220148
0,0.238384,0.16287,0.324941,hybrid,662.608457,2024-06-23 13:25:22.725108


# Tune Vector Boosts

In [None]:
# Candidate configurations: the text boosts stay fixed, only the vector boost varies.
# NOTE(review): these use the key "vector_boost" while the final runs use
# "product_text_string_vector_boost" - confirm which key query_functions reads.
boost_list = [
    {"title_boost": 8,
     "description_boost": 2,
     "attributes_boost": 1,
     "vector_boost": vector_boost}
    for vector_boost in (10, 50, 100, 200)
]

In [None]:
# Run hybrid queries with boosts

measures = {'map_cut_10', 'ndcg_cut_10', 'recip_rank'}
evaluator = pytrec_eval.RelevanceEvaluator(qrel, measures)

for boosts in tqdm(boost_list):

    # The mean metrics are written back into `boosts` at the end of each pass.
    # Strip any left over from an earlier execution so that a re-run sends
    # only boost parameters to the query helper.
    boost_values = {key: value for key, value in boosts.items() if not key.startswith('mean ')}

    run = {}

    for _, query_row in tqdm(relevant_queries.iterrows(), total=len(relevant_queries)):
        search_vector = query_row['query_embedding']
        search_query_id = query_row['query_id']
        search_text = query_row['search_term']

        results = query_elasticsearch_hybrid(es, index_name, search_text, search_vector,
                                             num_results=10, boost_values=boost_values)

        # Run entry for this query: {product_uid as str: score}, read straight
        # from the raw hit list.
        run[search_query_id] = {
            str(hit['_source']['product_uid']): float(hit['_score'])
            for hit in results['hits']['hits']
        }

    # One row per measure, one column per query id; average across queries.
    per_query_scores = pd.DataFrame(evaluator.evaluate(run))
    for measure in ('map_cut_10', 'ndcg_cut_10', 'recip_rank'):
        boosts['mean '+measure] = per_query_scores.loc[measure].mean()


In [None]:
# One row per boost configuration, ordered by mean NDCG@10 in ascending order
# (the sort_values default), so the best configuration is the last row.
pd.DataFrame(boost_list).sort_values('mean ndcg_cut_10')

In [None]:
# Add two more vector boosts to try: 25 and 75.
boost_list.extend(
    {"title_boost": 8,
     "description_boost": 2,
     "attributes_boost": 1,
     "vector_boost": vector_boost}
    for vector_boost in (25, 75)
)

In [None]:
# Run hybrid queries with boosts

measures = {'map_cut_10', 'ndcg_cut_10', 'recip_rank'}
evaluator = pytrec_eval.RelevanceEvaluator(qrel, measures)

# Only the two most recently appended configurations are evaluated here.
for boosts in tqdm(boost_list[-2:]):

    # The mean metrics are written back into `boosts` at the end of each pass.
    # Strip any left over from an earlier execution so that a re-run sends
    # only boost parameters to the query helper.
    boost_values = {key: value for key, value in boosts.items() if not key.startswith('mean ')}

    run = {}

    for _, query_row in tqdm(relevant_queries.iterrows(), total=len(relevant_queries)):
        search_vector = query_row['query_embedding']
        search_query_id = query_row['query_id']
        search_text = query_row['search_term']

        results = query_elasticsearch_hybrid(es, index_name, search_text, search_vector,
                                             num_results=10, boost_values=boost_values)

        # Run entry for this query: {product_uid as str: score}, read straight
        # from the raw hit list.
        run[search_query_id] = {
            str(hit['_source']['product_uid']): float(hit['_score'])
            for hit in results['hits']['hits']
        }

    # One row per measure, one column per query id; average across queries.
    per_query_scores = pd.DataFrame(evaluator.evaluate(run))
    for measure in ('map_cut_10', 'ndcg_cut_10', 'recip_rank'):
        boosts['mean '+measure] = per_query_scores.loc[measure].mean()


In [None]:
# Same table ordered by vector boost, to read the metrics as the boost increases.
pd.DataFrame(boost_list).sort_values('vector_boost')

In [None]:
# Add two more vector boosts to try: 40 and 60.
boost_list.extend(
    {"title_boost": 8,
     "description_boost": 2,
     "attributes_boost": 1,
     "vector_boost": vector_boost}
    for vector_boost in (40, 60)
)

In [None]:
# Run hybrid queries with boosts

measures = {'map_cut_10', 'ndcg_cut_10', 'recip_rank'}
evaluator = pytrec_eval.RelevanceEvaluator(qrel, measures)

# Only the two most recently appended configurations are evaluated here.
for boosts in tqdm(boost_list[-2:]):

    # The mean metrics are written back into `boosts` at the end of each pass.
    # Strip any left over from an earlier execution so that a re-run sends
    # only boost parameters to the query helper.
    boost_values = {key: value for key, value in boosts.items() if not key.startswith('mean ')}

    run = {}

    for _, query_row in tqdm(relevant_queries.iterrows(), total=len(relevant_queries)):
        search_vector = query_row['query_embedding']
        search_query_id = query_row['query_id']
        search_text = query_row['search_term']

        results = query_elasticsearch_hybrid(es, index_name, search_text, search_vector,
                                             num_results=10, boost_values=boost_values)

        # Run entry for this query: {product_uid as str: score}, read straight
        # from the raw hit list.
        run[search_query_id] = {
            str(hit['_source']['product_uid']): float(hit['_score'])
            for hit in results['hits']['hits']
        }

    # One row per measure, one column per query id; average across queries.
    per_query_scores = pd.DataFrame(evaluator.evaluate(run))
    for measure in ('map_cut_10', 'ndcg_cut_10', 'recip_rank'):
        boosts['mean '+measure] = per_query_scores.loc[measure].mean()


In [None]:
# All evaluated configurations, ordered by mean NDCG@10 in ascending order
# (best configuration last).
pd.DataFrame(boost_list).sort_values('mean ndcg_cut_10')

In [21]:
# Boost settings for the boosted hybrid run: same text boosts as before, with
# the vector boost raised from 1 to 50.
boosts = {"title_boost": 8,
          "description_boost": 2,
          "attributes_boost": 1,
          "product_text_string_vector_boost": 50}

In [22]:
# Create query result dictionaries

filename = "query_runs/run_hybrid_boosted.json"

if os.path.isfile(filename):
    # Cached run: reuse the saved results instead of re-querying Elasticsearch.
    with open(filename, "r") as file:
        run_hb = json.load(file)

    # Nothing was timed in this branch. NaN keeps `end_time - start_time` in
    # the evaluation cell from raising NameError on a fresh kernel or from
    # silently reusing the timings of the earlier hybrid run.
    start_time = end_time = float("nan")
else:
    run_hb = {}

    start_time = time.time()

    for _, query_row in tqdm(relevant_queries.iterrows(), total=len(relevant_queries)):
        search_vector = query_row['query_embedding']
        search_query_id = query_row['query_id']
        search_text = query_row['search_term']

        results = query_elasticsearch_hybrid(es, index_name, search_text=search_text,
                                             search_vector=search_vector, num_results=10, boost_values=boosts)

        # Run entry for this query: {product_uid as str: score}. The raw hit
        # list is read directly, so no DataFrame is built per query and the
        # outer loop variables are not shadowed.
        run_hb[search_query_id] = {
            str(hit['_source']['product_uid']): float(hit['_score'])
            for hit in results['hits']['hits']
        }

    end_time = time.time()

    with open(filename, "w") as file:
        json.dump(run_hb, file)

100%|██████████| 11795/11795 [11:56<00:00, 16.47it/s]


## Evaluate Results

In [23]:
# Place to store results and initialize an evaluator

# Start a fresh list so only the boosted run's summary is appended below.
ranking_results = []
# Measure names requested from pytrec_eval; each is averaged over queries later.
measures = {'map_cut_10', 'ndcg_cut_10', 'recip_rank'} 
evaluator = pytrec_eval.RelevanceEvaluator(qrel, measures)

In [24]:
# Evaluate hybrid queries

# Per-query scores: one row per measure, one column per query id.
per_query_scores = pd.DataFrame(evaluator.evaluate(run_hb))
results_dict = {'mean '+measure: per_query_scores.loc[measure].mean() for measure in measures}

# Single summary row for this run, tagged with its name, duration and timestamp.
results_df = pd.DataFrame(results_dict, index=[0]).assign(
    run_name='hybrid_boosted',
    run_time=end_time - start_time,
    run_timestamp=current_timestamp,
)

ranking_results.append(results_df)

In [25]:
# Put the results into a dataframe, add to previous results
# ranking_results is rebound here from a list of frames to a single DataFrame,
# and all_results is extended in place of its previous value.

ranking_results = pd.concat(ranking_results)
all_results = pd.concat([all_results, ranking_results])
all_results

Unnamed: 0,mean ndcg_cut_10,mean map_cut_10,mean recip_rank,run_name,run_time,run_timestamp
0,0.170037,0.112637,0.26115,textsearch,178.50484,2024-05-06 20:30:09.220148
1,0.217595,0.149411,0.31771,textsearch_boosted,207.407446,2024-05-06 20:30:09.220148
2,0.236762,0.159441,0.330878,vectorsearch,509.612543,2024-06-23 12:50:09.220148
3,0.155515,0.096555,0.241066,vectorsearch_multifield,622.987846,2024-06-23 12:50:09.220148
4,0.167586,0.10611,0.254867,vectorsearch_multifield_tuned,662.439398,2024-06-23 12:50:09.220148
0,0.238384,0.16287,0.324941,hybrid,662.608457,2024-06-23 13:25:22.725108
0,0.25135,0.170474,0.342187,hybrid_boosted,716.31887,2024-06-23 13:25:22.725108


In [26]:
# Write the combined table back to the file it was read from; index=False
# leaves the row index out of the CSV.
all_results.to_csv('query_runs/query_results.csv', index=False)


# Queries with Results

In [28]:
# Count, over the same queries, how often text-only and hybrid search return
# no hits at all.
text_searches_without_hits = 0
hybrid_searches_without_hits = 0

for _, query_row in tqdm(relevant_queries.iterrows(), total=len(relevant_queries)):
    search_vector = query_row['query_embedding']
    search_query_id = query_row['query_id']
    search_text = query_row['search_term']

    # Text-only call: no search_vector is passed.
    text_results = query_elasticsearch_hybrid(
        es, index_name,
        search_text=search_text,
        num_results=10, boost_values=boosts,
    )
    text_hits = pd.DataFrame(text_results['hits']['hits'])
    text_searches_without_hits += int(len(text_hits) == 0)

    # Hybrid call: same text plus the query embedding.
    hybrid_results = query_elasticsearch_hybrid(
        es, index_name,
        search_text=search_text,
        search_vector=search_vector,
        num_results=10, boost_values=boosts,
    )
    hybrid_hits = pd.DataFrame(hybrid_results['hits']['hits'])
    hybrid_searches_without_hits += int(len(hybrid_hits) == 0)

In [None]:
# Fraction of the evaluated queries for which text-only search returned no hits.
text_searches_without_hits / len(relevant_queries)

In [None]:
# Fraction of the evaluated queries for which hybrid search returned no hits.
hybrid_searches_without_hits / len(relevant_queries)

# Sample Queries

In [2]:
import os

In [17]:
# Load ground truth
filename = "query_runs/qrel.json"

with open(filename, "r") as file:
    qrel = json.load(file)

# Each saved run is opened unconditionally, so a missing file raises
# FileNotFoundError here. Skipping it silently would leave run_t / run_v
# undefined, or leave run_h bound to the non-boosted hybrid run loaded earlier
# in the notebook, and the comparison below would fail or use the wrong data.

# Text
filename = "query_runs/run_boosted.json"

with open(filename, "r") as file:
    run_t = json.load(file)

# Vector
filename = "query_runs/run_vector.json"

with open(filename, "r") as file:
    run_v = json.load(file)

# Hybrid
filename = "query_runs/run_hybrid_boosted.json"

with open(filename, "r") as file:
    run_h = json.load(file)

In [16]:
ranking_results = []
# Only NDCG@10 is requested for the per-query comparison that follows.
measures = {'ndcg_cut_10'} 
evaluator = pytrec_eval.RelevanceEvaluator(qrel, measures)

In [57]:
# Per-query NDCG@10 for each run: transpose so queries are rows, then label the
# single score column with the run type.
text_scores = pd.DataFrame(evaluator.evaluate(run_t)).T.rename(
    columns={'ndcg_cut_10': 'ndcg_cut_10_text'})
vector_scores = pd.DataFrame(evaluator.evaluate(run_v)).T.rename(
    columns={'ndcg_cut_10': 'ndcg_cut_10_vector'})
hybrid_scores = pd.DataFrame(evaluator.evaluate(run_h)).T.rename(
    columns={'ndcg_cut_10': 'ndcg_cut_10_hybrid'})

# Left-join on query id, keeping every query present in the text run.
results_df = text_scores.merge(vector_scores, how='left', left_index=True, right_index=True)
results_df = results_df.merge(hybrid_scores, how='left', left_index=True, right_index=True)

results_df['h_minus_v'] = results_df['ndcg_cut_10_hybrid'] - results_df['ndcg_cut_10_vector']
results_df['v_minus_t'] = results_df['ndcg_cut_10_vector'] - results_df['ndcg_cut_10_text']

# Queries where text search already scored above zero, vector beat text and
# hybrid beat vector; show the ten largest hybrid-over-vector gains.
improved_each_step = (
    (results_df['h_minus_v']>0)
    & (results_df['v_minus_t']>0)
    & (results_df['ndcg_cut_10_text']>0)
)
results_df[improved_each_step].sort_values('h_minus_v', ascending=False).head(10)

Unnamed: 0,ndcg_cut_10_text,ndcg_cut_10_vector,ndcg_cut_10_hybrid,h_minus_v,v_minus_t
40ce92c92b5777f7f83c57b93db3f7e2ce8a5f09676a17c1493ea7bf6550ee2d,0.201515,0.247117,0.638788,0.391671,0.045602
b4f4a18e032207b9f91f899f366bdc8715369d8d70348da6ae12c80907c567a8,0.5,0.63093,1.0,0.36907,0.13093
57657f3064d298a9f70967f5d4b33f247c26f3e13c10304b012dcb020e0e0311,0.326386,0.432111,0.800767,0.368656,0.105725
012a4e265162adc17e6680e174116b9f55dd590f8cee25b5e3045dab12b860c5,0.149824,0.189057,0.544751,0.355694,0.039233
3b3fea9ac8be367e689cc0e80323275db96d7d9426cf02a32f1dfc543b577b93,0.083409,0.21561,0.567255,0.351645,0.132201
a9b59ad2657f5b2d9b9f1b8d5b41516475efa5540ff0ab6547032b292302ae96,0.237198,0.264068,0.613147,0.349079,0.02687
1629f47ede2d4729a23611ca2110c09e3ff916fcecb06c9bdbf7be666ed36c18,0.265826,0.468348,0.804889,0.336541,0.202522
f025d0011dd596a09a1a3c6ee6f3c227ead362e0a2ec953bea8239c7a8e6879d,0.192294,0.319394,0.638788,0.319394,0.1271
5620846ec4ac55e556f76c13b0473988ad34edc67444df10bf1b1fc1455b1efb,0.110046,0.136985,0.444755,0.30777,0.026939
18c245dedc8017569265ec2d7c1a64c57cdd16b7621ec612c19636a7e1c276e0,0.156607,0.221994,0.525914,0.30392,0.065387


In [62]:
sample_query_id = '18c245dedc8017569265ec2d7c1a64c57cdd16b7621ec612c19636a7e1c276e0'

# A sample text query

# Look the query up once and read both its text and its embedding from the match.
sample_rows = df_queries[df_queries['query_id']==sample_query_id]
search_text = sample_rows['search_term'].values[0]
search_vector = sample_rows['query_embedding'].values[0]

print(search_text)

house paint dark brown


In [63]:
# Boosts for the text-only sample search: the three text fields, no vector key.
t_boosts = {"title_boost": 8,
          "description_boost": 2,
          "attributes_boost": 1}

# Boosts for the hybrid sample search: the same text boosts plus the vector boost of 50.
h_boosts = {"title_boost": 8,
          "description_boost": 2,
          "attributes_boost": 1,
          "product_text_string_vector_boost": 50}

### Text Search

In [64]:
# Text-only search: no search_vector is passed and num_results is left at the
# helper's default.
results = query_elasticsearch_hybrid(
    es,
    'products-embeddings',
    search_text=search_text,
    boost_values=t_boosts,
)

raw_hits = results['hits']['hits']
hits = pd.DataFrame(raw_hits)

In [65]:

# Unpack each hit's _source document into flat columns on `hits`.
sources = list(hits['_source'])

product_uids = [source['product_uid'] for source in sources]
product_titles = [source['product_title'] for source in sources]
product_descriptions = [source['product_description'] for source in sources]
product_attributes = [source['product_attributes'] for source in sources]

# query_id: every query id the product has a score for (None when it has none).
query_id_list = [
    [score['query_id'] for score in source['query_scores']]
    if len(source['query_scores']) > 0 else None
    for source in sources
]
# relevance: only the scores recorded for sample_query_id, so an empty list
# means the product was scored for other queries but not this one.
relevances = [
    [score['relevance'] for score in source['query_scores'] if score['query_id'] == sample_query_id]
    if len(source['query_scores']) > 0 else None
    for source in sources
]

hits['product_uid'] = product_uids
hits['product_title'] = product_titles
hits['product_description'] = product_descriptions
hits['product_attribute'] = product_attributes
hits['query_id'] = query_id_list
hits['relevance'] = relevances

hits.head(10)


Unnamed: 0,_index,_id,_score,_source,product_uid,product_title,product_description,product_attribute,query_id,relevance
0,products-embeddings,bUg05Y8BqeArwfGV1VqC,125.69568,"{'product_uid': 198579, 'product_title': 'Rust...",198579,Rust-Oleum Stops Rust 12-oz. Protective Enamel...,The Rust-Oleum Stops Rust 12 oz Satin Dark Bro...,"[{'name': 'Application Method', 'value': 'Spra...",[18c245dedc8017569265ec2d7c1a64c57cdd16b7621ec...,[2.0]
1,products-embeddings,L0gw5Y8BqeArwfGVjhhu,121.83252,"{'product_uid': 160692, 'product_title': 'Marv...",160692,Marvy Uchida DecoColor Dark Brown Broad Point ...,DecoColor bold point oil-based paint markers a...,[],[ab76b8fd575a57e3ec2581916471cb7ace994a2a1e5f7...,[]
2,products-embeddings,_Eg75Y8BqeArwfGVMr3B,117.351234,"{'product_uid': 152689, 'product_title': 'Home...",152689,HomeSullivan Dark Brown Upholstered Daybed,"Perfect for the multifunctional space, this up...",[],,
3,products-embeddings,hEgx5Y8BqeArwfGVeyaf,115.94648,"{'product_uid': 168302, 'product_title': 'Euro...",168302,Eurostyle 24x30x0.75 in. Finishing End Panel i...,Use the Eurostyle 24x30 in. Finishing Panel to...,"[{'name': 'Bullet01', 'value': 'High quality m...",[5ea5ad8064f9f1dfe2fcfff5fddb3ff8125b4238f9665...,[]
4,products-embeddings,Tkg15Y8BqeArwfGVHl_o,115.63731,"{'product_uid': 201655, 'product_title': 'Arch...",201655,Architectural Mailboxes 5 in. Dark Aged Copper...,The Solid Cast Brass 5 in. Floating House Numb...,"[{'name': 'Background', 'value': 'No', 'name_v...",[a8f12e63719abf8826c27fe659612a01cda2eaa8885bc...,[]
5,products-embeddings,V0k_5Y8BqeArwfGVXQao,115.14169,"{'product_uid': 187177, 'product_title': 'Euro...",187177,Eurostyle 24x80x0.75 in. Replacement End Panel...,The Eurostyle 24x80 in. Replacement Panel repl...,"[{'name': 'Bullet01', 'value': 'High quality v...",,
6,products-embeddings,ZElG5Y8BqeArwfGVWW9B,114.149506,"{'product_uid': 222142, 'product_title': 'Leat...",222142,Leather-Look Chaise Lounger in Dark Brown,Stretch out in comfort on this gently contoure...,[],,
7,products-embeddings,ikcr5Y8BqeArwfGVd8i_,113.16471,"{'product_uid': 122097, 'product_title': 'Hamp...",122097,Hampton Bay Outdoor Dark Brown Solar LED Walk ...,The Hampton Bay Outdoor Solar LED Walk Lights ...,"[{'name': 'Adjustable Lamp Head', 'value': 'No...",[17ffef03f62e3eb0a720f0d6011b9f74329899fa1fa24...,[]
8,products-embeddings,bUlC5Y8BqeArwfGVszM-,112.70711,"{'product_uid': 206792, 'product_title': 'Vene...",206792,Venetian Worldwide Clive Microfiber Recliner i...,"Enjoy an afternoon nap, the big game or your f...",[],,
9,products-embeddings,TEg55Y8BqeArwfGV_ql7,112.70357,"{'product_uid': 141721, 'product_title': 'Dark...",141721,Dark Brown Upholstered Side Chair (Set of 2),The rich cherry finished rubberwood with dark ...,[],,


### Vector Search

In [66]:
# Vector-only search: only the query embedding is passed (no text, no boosts).
results = query_elasticsearch_hybrid(
    es,
    'products-embeddings',
    search_vector=search_vector,
)

raw_hits = results['hits']['hits']
hits = pd.DataFrame(raw_hits)

In [67]:

# Unpack each hit's _source document into flat columns on `hits`.
sources = list(hits['_source'])

product_uids = [source['product_uid'] for source in sources]
product_titles = [source['product_title'] for source in sources]
product_descriptions = [source['product_description'] for source in sources]
product_attributes = [source['product_attributes'] for source in sources]

# query_id: every query id the product has a score for (None when it has none).
query_id_list = [
    [score['query_id'] for score in source['query_scores']]
    if len(source['query_scores']) > 0 else None
    for source in sources
]
# relevance: only the scores recorded for sample_query_id, so an empty list
# means the product was scored for other queries but not this one.
relevances = [
    [score['relevance'] for score in source['query_scores'] if score['query_id'] == sample_query_id]
    if len(source['query_scores']) > 0 else None
    for source in sources
]

hits['product_uid'] = product_uids
hits['product_title'] = product_titles
hits['product_description'] = product_descriptions
hits['product_attribute'] = product_attributes
hits['query_id'] = query_id_list
hits['relevance'] = relevances

hits.head(10)


Unnamed: 0,_index,_id,_score,_source,product_uid,product_title,product_description,product_attribute,query_id,relevance
0,products-embeddings,20gw5Y8BqeArwfGVehZQ,0.72815,"{'product_uid': 159946, 'product_title': 'Glid...",159946,Glidden Premium 1-gal. #HDGWN13 Stewart House ...,The Glidden Premium 1-gal. Satin Latex Exterio...,[],[6e9c28e6c0af4f0383a87f0efc2505252e423cf3fee87...,[]
1,products-embeddings,DUgw5Y8BqeArwfGVjBhT,0.726483,"{'product_uid': 160596, 'product_title': 'BEHR...",160596,BEHR MARQUEE Home Decorators Collection #HDC-C...,BEHR MARQUEE Flat Exterior is a stain-blocking...,[],[3409ac7b187b4862d11b98bca5763b41cae807a86d468...,[]
2,products-embeddings,cUg05Y8BqeArwfGVdFQ0,0.7234,"{'product_uid': 194932, 'product_title': 'Glid...",194932,Glidden Premium 5-gal. #HDGWN13 Stewart House ...,The Glidden 5-gal. Flat Interior Paint provide...,[],[18c245dedc8017569265ec2d7c1a64c57cdd16b7621ec...,[2.33]
3,products-embeddings,rkgz5Y8BqeArwfGVm0a7,0.720089,"{'product_uid': 186643, 'product_title': 'BEHR...",186643,BEHR MARQUEE Home Decorators Collection #HDC-A...,BEHR MARQUEE Semi-Gloss Enamel Exterior is a s...,"[{'name': 'Bullet01', 'value': 'Protection for...",[beb53d2cda1cd4d698b0d94a9b8deb85eb48e19e1b36c...,[]
4,products-embeddings,WEcq5Y8BqeArwfGVWLVR,0.719881,"{'product_uid': 113895, 'product_title': 'BEHR...",113895,BEHR MARQUEE #MQ1-43 Piano Brown Paint,For the ultimate in durability and beauty on v...,[],[6e9c28e6c0af4f0383a87f0efc2505252e423cf3fee87...,[]
5,products-embeddings,dkgv5Y8BqeArwfGVwAuC,0.719701,"{'product_uid': 153916, 'product_title': 'BEHR...",153916,BEHR MARQUEE #S170-7 Dark Cherry Mocha Exterio...,"For a classic, cultivated look on your home's ...",[],[3409ac7b187b4862d11b98bca5763b41cae807a86d468...,[]
6,products-embeddings,ykg95Y8BqeArwfGVhufZ,0.719561,"{'product_uid': 173124, 'product_title': 'BEHR...",173124,BEHR Premium Plus #BNC-29 Dark Room Paint,"For tough, all-purpose paint with a touch of s...","[{'name': 'Bullet01', 'value': 'Ideal for fami...",,
7,products-embeddings,6Ugy5Y8BqeArwfGVzznU,0.719055,"{'product_uid': 179276, 'product_title': 'Glid...",179276,Glidden Premium 1-gal. #HDGWN13 Stewart House ...,The Glidden Premium 1-gal. Semi-Gloss Latex Ex...,[],[18c245dedc8017569265ec2d7c1a64c57cdd16b7621ec...,[2.0]
8,products-embeddings,z0g15Y8BqeArwfGVRWFd,0.718674,"{'product_uid': 203177, 'product_title': 'Glid...",203177,Glidden Premium 5-gal. #HDGR39D Ranch House Br...,The Glidden 5-gal. Semi-Gloss Latex Interior P...,[],[18c245dedc8017569265ec2d7c1a64c57cdd16b7621ec...,[2.67]
9,products-embeddings,A0lF5Y8BqeArwfGV2mcb,0.718509,"{'product_uid': 219997, 'product_title': 'BEHR...",219997,BEHR MARQUEE #BXC-45 Classic Brown Exterior Paint,Add dazzling impact and style to your homes ex...,"[{'name': 'Bullet01', 'value': 'Protection for...",,


### Hybrid Search

In [68]:
# Hybrid search: query text and embedding together, with the tuned boosts.
results = query_elasticsearch_hybrid(
    es,
    index_name,
    search_text=search_text,
    search_vector=search_vector,
    num_results=10,
    boost_values=h_boosts,
)

raw_hits = results['hits']['hits']
hits = pd.DataFrame(raw_hits)

In [69]:

# Unpack each hit's _source document into flat columns on `hits`.
sources = list(hits['_source'])

product_uids = [source['product_uid'] for source in sources]
product_titles = [source['product_title'] for source in sources]
product_descriptions = [source['product_description'] for source in sources]
product_attributes = [source['product_attributes'] for source in sources]

# query_id: every query id the product has a score for (None when it has none).
query_id_list = [
    [score['query_id'] for score in source['query_scores']]
    if len(source['query_scores']) > 0 else None
    for source in sources
]
# relevance: only the scores recorded for sample_query_id, so an empty list
# means the product was scored for other queries but not this one.
relevances = [
    [score['relevance'] for score in source['query_scores'] if score['query_id'] == sample_query_id]
    if len(source['query_scores']) > 0 else None
    for source in sources
]

hits['product_uid'] = product_uids
hits['product_title'] = product_titles
hits['product_description'] = product_descriptions
hits['product_attribute'] = product_attributes
hits['query_id'] = query_id_list
hits['relevance'] = relevances

hits.head(10)


Unnamed: 0,_index,_id,_score,_source,product_uid,product_title,product_description,product_attribute,query_id,relevance
0,products-embeddings,20gw5Y8BqeArwfGVehZQ,148.02765,"{'product_uid': 159946, 'product_title': 'Glid...",159946,Glidden Premium 1-gal. #HDGWN13 Stewart House ...,The Glidden Premium 1-gal. Satin Latex Exterio...,[],[6e9c28e6c0af4f0383a87f0efc2505252e423cf3fee87...,[]
1,products-embeddings,G0gx5Y8BqeArwfGV2CwN,146.89905,"{'product_uid': 171355, 'product_title': 'Glid...",171355,Glidden Premium 5-gal. #HDGWN13 Stewart House ...,The Glidden 5-gal. Flat Latex Exterior Paint c...,[],[18c245dedc8017569265ec2d7c1a64c57cdd16b7621ec...,[1.67]
2,products-embeddings,6Ugy5Y8BqeArwfGVzznU,143.38411,"{'product_uid': 179276, 'product_title': 'Glid...",179276,Glidden Premium 1-gal. #HDGWN13 Stewart House ...,The Glidden Premium 1-gal. Semi-Gloss Latex Ex...,[],[18c245dedc8017569265ec2d7c1a64c57cdd16b7621ec...,[2.0]
3,products-embeddings,cUg05Y8BqeArwfGVdFQ0,142.18007,"{'product_uid': 194932, 'product_title': 'Glid...",194932,Glidden Premium 5-gal. #HDGWN13 Stewart House ...,The Glidden 5-gal. Flat Interior Paint provide...,[],[18c245dedc8017569265ec2d7c1a64c57cdd16b7621ec...,[2.33]
4,products-embeddings,r0g05Y8BqeArwfGVuVhP,141.48108,"{'product_uid': 197507, 'product_title': 'Glid...",197507,Glidden Premium 1-gal. #HDGWN13 Stewart House ...,Add a smooth sheen to indoor surfaces such as ...,[],[18c245dedc8017569265ec2d7c1a64c57cdd16b7621ec...,[3.0]
5,products-embeddings,Z0g05Y8BqeArwfGVFU6l,141.01993,"{'product_uid': 191297, 'product_title': 'Glid...",191297,Glidden Premium 1-gal. #HDGR39D Ranch House Br...,"Featuring a stain-resistant, high-coverage lat...",[],[18c245dedc8017569265ec2d7c1a64c57cdd16b7621ec...,[2.0]
6,products-embeddings,z0g15Y8BqeArwfGVRWFd,137.15631,"{'product_uid': 203177, 'product_title': 'Glid...",203177,Glidden Premium 5-gal. #HDGR39D Ranch House Br...,The Glidden 5-gal. Semi-Gloss Latex Interior P...,[],[18c245dedc8017569265ec2d7c1a64c57cdd16b7621ec...,[2.67]
7,products-embeddings,wUlA5Y8BqeArwfGVuRm1,135.13614,"{'product_uid': 195767, 'product_title': 'BEHR...",195767,BEHR Premium Plus #M140-7 Dark Crimson Paint,For a paint that's as versatile as it is beaut...,"[{'name': 'Bullet01', 'value': 'Ideal for fami...",,
8,products-embeddings,qkgz5Y8BqeArwfGV7UvB,129.95815,"{'product_uid': 189614, 'product_title': 'Ralp...",189614,Ralph Lauren #RL1320 Country House Interior Paint,Ralph Lauren Paint represents a tradition of e...,"[{'name': 'Bullet01', 'value': 'Superior color...",[ff07e37c6afef6c3ac509ef4e5d65d2fb22c4ee63b08e...,[]
9,products-embeddings,cUg45Y8BqeArwfGVh5VG,129.4676,"{'product_uid': 130700, 'product_title': 'BEHR...",130700,BEHR MARQUEE #MQ2-51 Pasha Brown Paint,Take a new twist on neutrals with BEHR MARQUEE...,[],,
