In [23]:
import ast
import datetime
import json
import os
import time

from elasticsearch import Elasticsearch
import numpy as np
import pandas as pd
import pytrec_eval
# import torch
from tqdm import tqdm

from query_functions import query_elasticsearch_hybrid

# Captured once when this cell runs; reused as the `run_timestamp` value for
# every results row recorded later in the notebook.
current_timestamp = datetime.datetime.now()


In [24]:
# Client for the Elasticsearch node running on this machine

es = Elasticsearch(hosts='http://localhost:9200')
es.ping()  # cell output: whether the node answered

True

# Load Data

In [25]:
def convert_to_dict(string):
    """Parse the string representation of a dict back into a dict.

    Args:
        string: Text such as "{'a': 1}". Non-string input (for example a NaN
            read from an empty CSV cell) is tolerated.

    Returns:
        The parsed dict, or None when the input is not a valid literal or
        does not evaluate to a dict.
    """
    try:
        parsed = ast.literal_eval(string)
    except (SyntaxError, ValueError, TypeError):
        # TypeError covers literals that parse but cannot be built,
        # e.g. a dict with an unhashable key such as "{[1]: 2}".
        return None  # Handle cases where the string is not a valid dictionary representation
    # Callers iterate the result with .items(), so anything that is not a
    # dict (a bare number, list, string, ...) is treated as "no attributes".
    return parsed if isinstance(parsed, dict) else None


In [26]:
def dict_to_string(my_dict):
    """Flatten a dict into one string of the form 'key value key value '.

    Every key and value is converted with str() and followed by a single
    space, so a non-empty result always ends with a trailing space. An empty
    dict yields an empty string.
    """
    return "".join(f"{key!s} {value!s} " for key, value in my_dict.items())

In [27]:
# Products

filename = "processed_data/df_prods.csv"

# Fail fast with a clear error. Merely printing a message would let the cell
# continue into a NameError on a fresh kernel, or silently reuse a stale
# df_prods left over from an earlier run.
if not os.path.isfile(filename):
    raise FileNotFoundError(f"Cannot locate file: {filename}")

df_prods = pd.read_csv(filename)
# product_attributes is stored in the CSV as the text of a dict; parse it back.
df_prods['product_attributes'] = df_prods['product_attributes'].apply(convert_to_dict)

df_prods.head()

Unnamed: 0,product_uid,product_title,product_description,product_attributes
0,100001,Simpson Strong-Tie 12-Gauge Angle,"Not only do angles make joints stronger, they ...",{'Bullet01': 'Versatile connector for various ...
1,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,BEHR Premium Textured DECKOVER is an innovativ...,"{'Application Method': 'Brush,Roller,Spray', '..."
2,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,Update your bathroom with the Delta Vero Singl...,"{'Bath Faucet Type': 'Combo Tub and Shower', '..."
3,100006,Whirlpool 1.9 cu. ft. Over the Range Convectio...,Achieving delicious results is almost effortle...,"{'Appliance Type': 'Over the Range Microwave',..."
4,100007,Lithonia Lighting Quantum 2-Light Black LED Em...,The Quantum Adjustable 2-Light LED Black Emerg...,"{'Battery Power Type': 'Ni-Cad', 'Battery Size..."


In [28]:
# Flatten each attribute dict into a single string; rows without attributes stay None.
df_prods['product_attributes_string'] = [
    None if attrs is None else dict_to_string(attrs)
    for attrs in df_prods['product_attributes']
]

# One combined text field: title, description and flattened attributes,
# separated by single spaces, with missing pieces treated as empty strings.
title_text, description_text, attributes_text = (
    df_prods[name].fillna('')
    for name in ('product_title', 'product_description', 'product_attributes_string')
)
df_prods['product_text_string'] = title_text + ' ' + description_text + ' ' + attributes_text

In [29]:
# Query

filename = "processed_data/df_queries.csv"

# Fail fast with a clear error. Merely printing a message would let the cell
# continue into a NameError on a fresh kernel, or silently reuse a stale
# df_queries left over from an earlier run.
if not os.path.isfile(filename):
    raise FileNotFoundError(f"Cannot locate file: {filename}")

df_queries = pd.read_csv(filename)

df_queries.head()

Unnamed: 0,search_term,query_id,has_relevant_results
0,angle bracket,6e0a07626e48aee6f7ce9ec6cd753426d6acafded1598f...,1
1,l bracket,5863e75dfdc9ae5db3f6b4dbddf129d5568e085bf57711...,1
2,deck over,406b3569b2db043604fdb42a67f4ec49964a5ff07cddf0...,1
3,rain shower head,49b2dc56a0e1945c435c1579c07df519878619e3e8d59d...,1
4,shower only faucet,7620551bacb6cdddca5f33ec0943cea7971095a1e9be06...,1


In [30]:
# Relevance

filename = "processed_data/df_relevance.csv"

# Fail fast with a clear error. Merely printing a message would let the cell
# continue into a NameError on a fresh kernel, or silently reuse a stale
# df_relevance left over from an earlier run.
if not os.path.isfile(filename):
    raise FileNotFoundError(f"Cannot locate file: {filename}")

df_relevance = pd.read_csv(filename)

df_relevance.head()

Unnamed: 0,query_id,product_uid,relevance
0,6e0a07626e48aee6f7ce9ec6cd753426d6acafded1598f...,100001,3.0
1,5863e75dfdc9ae5db3f6b4dbddf129d5568e085bf57711...,100001,2.5
2,406b3569b2db043604fdb42a67f4ec49964a5ff07cddf0...,100002,3.0
3,49b2dc56a0e1945c435c1579c07df519878619e3e8d59d...,100005,2.33
4,7620551bacb6cdddca5f33ec0943cea7971095a1e9be06...,100005,2.67


# Create Embeddings

In [None]:
# torch is imported here, not in the top import cell (where it is commented
# out), so the rest of the notebook still runs on machines without it.
# Without this import the device check below raised NameError.
import torch
from sentence_transformers import SentenceTransformer

device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

# NOTE(review): trust_remote_code=True executes code shipped with the model
# repository; keep it only for models you trust.
# Pass the chosen device explicitly so the model really runs on the device
# reported above.
model = SentenceTransformer("Snowflake/snowflake-arctic-embed-m",
                            trust_remote_code=True,
                            device=device)

print(model)

In [None]:
# Embeddings were computed on Kaggle for GPU access; this cell only reads them back.

# Product Embeddings
prod_cols = ['product_title', 'product_description',
             'product_attributes_string', 'product_text_string']

for col in tqdm(prod_cols):
    embedding_col = col + '_embedding'

    print(f'reading {col}')

    # Forward slash, matching the other data paths in this notebook. The
    # previous backslash-separated path only resolved on Windows and relied
    # on an unrecognised escape sequence inside the string literal.
    p_embeddings = (
        pd.read_csv(f'processed_data/df_prods_{col}_embeddings.csv')
        .set_index('Unnamed: 0')
        .rename_axis(None)
    )
    p_embeddings.columns = [embedding_col]

    print(f'evaling {col} embeddings')
    # Cells are expected to hold the text of a list of floats. Anything that
    # is not a string (a missing embedding) becomes None for every column,
    # not just product_attributes_string, so one empty cell cannot abort the load.
    p_embeddings[embedding_col] = p_embeddings[embedding_col].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) else None
    )

    # Drop any copy left by a previous run of this cell first; otherwise a
    # re-run would create suffixed `_x` / `_y` duplicate columns.
    df_prods = (
        df_prods
        .drop(columns=[embedding_col], errors='ignore')
        .merge(p_embeddings, how='left', left_index=True, right_index=True)
    )


In [None]:
# Release the last per-column embeddings frame; its contents were already
# merged into df_prods by the loading loop.
del p_embeddings

In [31]:
# Precomputed query embeddings, stored as a NumPy array; each entry along the
# first axis becomes the (plain Python list) embedding of the matching row.
query_embeddings = np.load('processed_data/query_embeddings.npy')
df_queries['query_embedding'] = query_embeddings.tolist()
df_queries.head()

Unnamed: 0,search_term,query_id,has_relevant_results,query_embedding
0,angle bracket,6e0a07626e48aee6f7ce9ec6cd753426d6acafded1598f...,1,"[-0.058361075818538666, 0.026495283469557762, ..."
1,l bracket,5863e75dfdc9ae5db3f6b4dbddf129d5568e085bf57711...,1,"[-0.027898991480469704, -0.024725843220949173,..."
2,deck over,406b3569b2db043604fdb42a67f4ec49964a5ff07cddf0...,1,"[-0.07859756052494049, -0.00036610415554605424..."
3,rain shower head,49b2dc56a0e1945c435c1579c07df519878619e3e8d59d...,1,"[-0.020222697407007217, 0.06791018694639206, 0..."
4,shower only faucet,7620551bacb6cdddca5f33ec0943cea7971095a1e9be06...,1,"[-0.003950382117182016, 0.012035505846142769, ..."


# Load Into Index

In [None]:
# es.indices.delete(index='products-embeddings', ignore=[400, 404])

In [38]:
# Initialize the index

index_name = 'products-embeddings'

# Field mapping for the product index: raw text fields, nested attribute
# entries, one 768-dim dense vector per embedded text source, and the nested
# per-query relevance judgements.
mapping = {
    "properties": {
        "product_uid": {"type": "integer"},
        "product_title": {"type": "text"},
        "product_description": {"type": "text"},
        "product_attributes": {
            "type": "nested",
            "properties": {
                field: {"type": "text"}
                for field in ("name", "value", "name_value")
            },
        },
        **{
            f"{source}_vector": {"type": "dense_vector", "dims": 768}
            for source in (
                "product_title",
                "product_description",
                "product_attributes_string",
                "product_text_string",
            )
        },
        "query_scores": {
            "type": "nested",
            "properties": {
                "query_id": {"type": "text"},
                "relevance": {"type": "float"},
            },
        },
    }
}

# es.indices.create(index=index_name, mappings=mapping)

In [None]:
# Create documents and index them right in
# Builds one document per product row, shaped to match the fields of `mapping`.
# NOTE(review): both sinks at the bottom of the loop (the list append and the
# es.index call) are commented out, so as written each tmp_doc is built and
# then discarded.
for index, row in tqdm(df_prods.iterrows(), total=len(df_prods)):
    
    # Relevance judgements recorded for this product, one entry per judged query.
    query_scores = []
    tmp_query = df_relevance[(df_relevance['product_uid']==row['product_uid'])]
    if len(tmp_query)>0:
        # NOTE(review): presumably meant to blank out missing values before
        # they reach the document - confirm this replace call does that.
        tmp_query = tmp_query.replace(pd.NA, '', regex=True)
        for index_q, row_q in tmp_query.iterrows():
            query_scores.append({'query_id': row_q['query_id'],
                                 'relevance': row_q['relevance']})

    # Expand the attribute dict into a list of {name, value, name_value}
    # entries; rows with no attribute dict get an empty list.
    product_attributes = []
    tmp_attr = row['product_attributes']
    if not pd.isnull(tmp_attr):
        for k in tmp_attr.keys():
            product_attributes.append({'name': k,
                                       'value': tmp_attr[k],
                                       'name_value': str(k) + ' ' + str(tmp_attr[k])})

    tmp_doc = {
        'product_uid': row['product_uid'],
        'product_title': row['product_title'],
        'product_description': row['product_description'],
        'product_attributes': product_attributes,
        'query_scores': query_scores
    }

    # A vector field is attached only when the embedding is an actual list;
    # any other value leaves the field out of the document.
    if isinstance(row['product_title_embedding'], list):
        tmp_doc['product_title_vector'] = row['product_title_embedding']

    if isinstance(row['product_description_embedding'], list):
        tmp_doc['product_description_vector'] = row['product_description_embedding']

    if isinstance(row['product_attributes_string_embedding'], list):
        tmp_doc['product_attributes_string_vector'] = row['product_attributes_string_embedding']

    if isinstance(row['product_text_string_embedding'], list):
        tmp_doc['product_text_string_vector'] = row['product_text_string_embedding']

    #product_document_list.append(tmp_doc)
    # es.index(index=index_name, document=tmp_doc)

# KNN Queries

## Sample Query

In [39]:
# A sample query: search with the embedding of the query in row 0

search_vector = df_queries.loc[0, 'query_embedding']

results = query_elasticsearch_hybrid(
    es,
    index_name,
    search_vector=search_vector,
    num_results=10,
)

hits = pd.DataFrame(results['hits']['hits'])

In [40]:

# Pull the fields of interest out of each hit's `_source` into flat columns.
sources = hits['_source'].tolist()

product_uids = [src['product_uid'] for src in sources]
product_titles = [src['product_title'] for src in sources]
product_descriptions = [src['product_description'] for src in sources]
product_attributes = [src['product_attributes'] for src in sources]

# Only the first stored query score of each product is shown; products
# without any stored score get None in both columns.
first_scores = [
    src['query_scores'][0] if len(src['query_scores']) > 0 else None
    for src in sources
]
query_id_list = [None if score is None else score['query_id'] for score in first_scores]
relevances = [None if score is None else score['relevance'] for score in first_scores]

flattened_columns = {
    'product_uid': product_uids,
    'product_title': product_titles,
    'product_description': product_descriptions,
    'product_attribute': product_attributes,
    'query_id': query_id_list,
    'relevance': relevances,
}
for column_name, column_values in flattened_columns.items():
    hits[column_name] = column_values

hits.head()


Unnamed: 0,_index,_id,_score,_source,product_uid,product_title,product_description,product_attribute,query_id,relevance
0,products-embeddings,jUg85Y8BqeArwfGV_d7n,0.693194,"{'product_uid': 168743, 'product_title': 'Lock...",168743,LockState Angle L Bracket for 600 lb.,LS-3320 angle l bracket for LS-600S. Used to m...,"[{'name': 'Bullet01', 'value': 'Satin finish',...",,
1,products-embeddings,gkg95Y8BqeArwfGVwOtd,0.679624,"{'product_uid': 174846, 'product_title': 'Marq...",174846,Marquee Railing Black Left Multi-Angle Bracket...,"Designed with a beautiful hammered-metal look,...","[{'name': 'Accessory type', 'value': 'Bracket'...",,
2,products-embeddings,tUgx5Y8BqeArwfGVJCDR,0.678604,"{'product_uid': 165217, 'product_title': 'Vera...",165217,Veranda Vinyl Wicker Premier Rail Left-Right A...,The Veranda Vinyl White Left/Right Angle Brack...,"[{'name': 'Accessory type', 'value': 'Left/Rig...",45476ecfe43c98557ae68d2eec00a37d8f21c0fef3d913...,3.0
3,products-embeddings,_Eg95Y8BqeArwfGVMeH_,0.678497,"{'product_uid': 170398, 'product_title': 'Marq...",170398,Marquee Railing Black Right Multi-Angle Bracke...,"Designed with a beautiful hammered-metal look,...","[{'name': 'Accessory type', 'value': 'Bracket'...",,
4,products-embeddings,FUg55Y8BqeArwfGVdaJb,0.677744,"{'product_uid': 137713, 'product_title': 'Vera...",137713,Veranda 3-1/2 in. x 3-1/2 in. x 3 in. Vinyl Tr...,The Veranda Vinyl Stair Angle Brackets (4-Pack...,[],,


## Run the vector queries

In [41]:
# Keep only the queries whose has_relevant_results flag equals 1

has_results_mask = df_queries['has_relevant_results'].eq(1)
relevant_queries = df_queries.loc[has_results_mask]

In [42]:
# Create query result dictionaries

filename = "query_runs/run_vector.json"

if os.path.isfile(filename):
    with open(filename, "r") as file:
        run_vu = json.load(file)
    # A cached run carries no timing information. Mark the duration as
    # unknown (NaN) so the evaluation cell neither fails on undefined names
    # nor records a stale duration left over from another run.
    start_time = end_time = float('nan')
else:
    start_time = time.time()
    run_vu = {}

    # Distinct loop names: the inner loop used to rebind the outer loop's
    # `index` / `row` variables.
    for _, query_row in tqdm(relevant_queries.iterrows(), total=len(relevant_queries)):
        search_vector = query_row['query_embedding']
        search_query_id = query_row['query_id']

        results = query_elasticsearch_hybrid(es, index_name, search_vector=search_vector, num_results=10)
        hits = pd.DataFrame(results['hits']['hits'])

        # Per query: {product_uid as string: score}, the structure later
        # handed to evaluator.evaluate.
        run_vu[search_query_id] = {
            str(hit_row['_source']['product_uid']): hit_row['_score']
            for _, hit_row in hits.iterrows()
        }

    end_time = time.time()

    with open(filename, "w") as file:
        json.dump(run_vu, file)

100%|██████████| 11795/11795 [08:29<00:00, 23.15it/s]


## Evaluate results

In [43]:
# Load ground truth (relevance judgements) from its JSON file
filename = "query_runs/qrel.json"

with open(filename) as qrel_file:
    qrel = json.loads(qrel_file.read())

In [44]:
# Accumulator for per-run summary frames, plus the evaluator shared by the runs below

ranking_results = list()
measures = set(['map_cut_10', 'ndcg_cut_10', 'recip_rank'])
evaluator = pytrec_eval.RelevanceEvaluator(qrel, measures)

In [45]:
# Evaluate vector queries

# One column per query, one row per measure; average each measure over queries.
per_query_scores = pd.DataFrame(evaluator.evaluate(run_vu))
results_dict = {
    'mean ' + measure: per_query_scores.loc[measure].mean()
    for measure in measures
}

# Single-row summary for this run, tagged with its name, duration and timestamp.
results_df = pd.DataFrame(results_dict, index=[0]).assign(
    run_name='vectorsearch',
    run_time=end_time - start_time,
    run_timestamp=current_timestamp,
)

ranking_results.append(results_df)

In [46]:
# Put the results into a dataframe, add to previous results

# Bind the concatenated frame to its own name. Rebinding `ranking_results`
# (a list of frames) to a DataFrame made this cell, and the evaluation cell
# that appends to the list, fail when re-run.
ranking_results_df = pd.concat(ranking_results)

all_results = pd.read_csv('query_runs/query_results.csv')
all_results = pd.concat([all_results, ranking_results_df])
all_results

Unnamed: 0,mean ndcg_cut_10,mean map_cut_10,mean recip_rank,run_name,run_time,run_timestamp
0,0.170037,0.112637,0.26115,textsearch,178.50484,2024-05-06 20:30:09.220148
1,0.217595,0.149411,0.31771,textsearch_boosted,207.407446,2024-05-06 20:30:09.220148
0,0.236762,0.159441,0.330878,vectorsearch,509.612543,2024-06-23 11:49:18.153651


## Run the multi-field vector queries

In [47]:
# Create query result dictionaries

# Equal weight for the title, description and attribute vectors.
boosts = {"product_title_vector_boost": 1,
          "product_description_vector_boost": 1,
          "product_attributes_string_vector_boost": 1}

filename = "query_runs/run_multi_vector.json"

if os.path.isfile(filename):
    with open(filename, "r") as file:
        run_vu_m = json.load(file)
    # A cached run carries no timing information. Mark the duration as
    # unknown (NaN) so the evaluation cell does not record the duration of
    # whichever run happened to set start_time / end_time last.
    start_time = end_time = float('nan')
else:
    start_time = time.time()
    run_vu_m = {}

    # Distinct loop names: the inner loop used to rebind the outer loop's
    # `index` / `row` variables.
    for _, query_row in tqdm(relevant_queries.iterrows(), total=len(relevant_queries)):
        search_vector = query_row['query_embedding']
        search_query_id = query_row['query_id']

        results = query_elasticsearch_hybrid(es, index_name, search_vector=search_vector,
                                             num_results=10, boost_values=boosts)
        hits = pd.DataFrame(results['hits']['hits'])

        # Per query: {product_uid as string: score}, the structure later
        # handed to evaluator.evaluate.
        run_vu_m[search_query_id] = {
            str(hit_row['_source']['product_uid']): hit_row['_score']
            for _, hit_row in hits.iterrows()
        }

    end_time = time.time()

    with open(filename, "w") as file:
        json.dump(run_vu_m, file)

100%|██████████| 11795/11795 [10:22<00:00, 18.93it/s]


In [48]:
# Evaluate multi-field vector queries

# One column per query, one row per measure; average each measure over queries.
per_query_scores = pd.DataFrame(evaluator.evaluate(run_vu_m))
results_dict = {
    'mean ' + measure: per_query_scores.loc[measure].mean()
    for measure in measures
}

# Single-row summary for this run, tagged with its name, duration and timestamp.
results_df = pd.DataFrame(results_dict, index=[0]).assign(
    run_name='vectorsearch_multifield',
    run_time=end_time - start_time,
    run_timestamp=current_timestamp,
)

all_results = pd.concat([all_results, results_df])
all_results

Unnamed: 0,mean ndcg_cut_10,mean map_cut_10,mean recip_rank,run_name,run_time,run_timestamp
0,0.170037,0.112637,0.26115,textsearch,178.50484,2024-05-06 20:30:09.220148
1,0.217595,0.149411,0.31771,textsearch_boosted,207.407446,2024-05-06 20:30:09.220148
0,0.236762,0.159441,0.330878,vectorsearch,509.612543,2024-06-23 11:49:18.153651
0,0.155515,0.096555,0.241066,vectorsearch_multifield,622.987846,2024-06-23 11:49:18.153651


## Tune Multi-Vector Query

In [21]:
# Every (title, description, attributes) boost combination drawn from
# {1, 2, 4}, skipping the combinations where all three boosts are equal.
boost_list = [
    {"product_title_vector_boost": t,
     "product_description_vector_boost": d,
     "product_attributes_string_vector_boost": a}
    for t in [1,2,4]
    for d in [1,2,4]
    for a in [1,2,4]
    if not ((a==t) & (t==d))
]

In [22]:
# Run vector queries with boosts

measures = set(['map_cut_10', 'ndcg_cut_10', 'recip_rank'])
evaluator = pytrec_eval.RelevanceEvaluator(qrel, measures)

for boosts in tqdm(boost_list):

    # Per query: {product_uid as string: score} for this boost combination.
    run = {}
    for _, query_row in relevant_queries.iterrows():
        results = query_elasticsearch_hybrid(es, index_name,
                                             search_vector=query_row['query_embedding'],
                                             num_results=10, boost_values=boosts)
        hits = pd.DataFrame(results['hits']['hits'])
        run[query_row['query_id']] = {
            str(hit_row['_source']['product_uid']): hit_row['_score']
            for _, hit_row in hits.iterrows()
        }

    # One column per query, one row per measure.
    per_query_scores = pd.DataFrame(evaluator.evaluate(run))

    # The mean of each measure is written back onto the boost dict itself, so
    # boost_list doubles as the results table. Fixed order keeps the columns stable.
    for measure in ('map_cut_10', 'ndcg_cut_10', 'recip_rank'):
        boosts['mean ' + measure] = per_query_scores.loc[measure].mean()


100%|██████████| 24/24 [3:22:31<00:00, 506.31s/it]  


In [23]:
# Tabulate every boost combination with its recorded means, ordered by
# 'mean ndcg_cut_10' (sort_values default order).
pd.DataFrame(boost_list).sort_values('mean ndcg_cut_10')

Unnamed: 0,product_title_vector_boost,product_description_vector_boost,product_attributes_string_vector_boost,mean map_cut_10,mean ndcg_cut_10,mean recip_rank
1,1,1,4,0.075718,0.124099,0.201496
4,1,2,4,0.077206,0.124977,0.20424
7,1,4,4,0.079669,0.129329,0.209868
6,1,4,2,0.081967,0.131827,0.213443
10,2,1,4,0.082701,0.132682,0.213545
3,1,2,2,0.08399,0.134429,0.218259
15,2,4,4,0.084087,0.134496,0.218088
12,2,2,4,0.084512,0.135192,0.217375
0,1,1,2,0.084664,0.13537,0.217767
5,1,4,1,0.088154,0.144403,0.226614


In [24]:
# Extend the grid with a title boost of 8 against the same description and
# attribute boosts. Note: this appends to boost_list each time the cell runs.
boost_list.extend(
    {"product_title_vector_boost": t,
     "product_description_vector_boost": d,
     "product_attributes_string_vector_boost": a}
    for t in [8]
    for d in [1,2,4]
    for a in [1,2,4]
    if not ((a==t) & (t==d))
)

In [31]:
# Run vector queries with boosts

measures = set(['map_cut_10', 'ndcg_cut_10', 'recip_rank'])
evaluator = pytrec_eval.RelevanceEvaluator(qrel, measures)

# Only the entries from position 24 onward are evaluated here.
for boosts in tqdm(boost_list[24:]):

    # Per query: {product_uid as string: score} for this boost combination.
    run = {}
    for _, query_row in relevant_queries.iterrows():
        results = query_elasticsearch_hybrid(es, index_name,
                                             search_vector=query_row['query_embedding'],
                                             num_results=10, boost_values=boosts)
        hits = pd.DataFrame(results['hits']['hits'])
        run[query_row['query_id']] = {
            str(hit_row['_source']['product_uid']): hit_row['_score']
            for _, hit_row in hits.iterrows()
        }

    # One column per query, one row per measure.
    per_query_scores = pd.DataFrame(evaluator.evaluate(run))

    # The mean of each measure is written back onto the boost dict itself, so
    # boost_list doubles as the results table. Fixed order keeps the columns stable.
    for measure in ('map_cut_10', 'ndcg_cut_10', 'recip_rank'):
        boosts['mean ' + measure] = per_query_scores.loc[measure].mean()


100%|██████████| 9/9 [1:15:59<00:00, 506.56s/it]


In [32]:
# Same table as before, now including the title-boost-8 combinations appended
# to boost_list; ordered by 'mean ndcg_cut_10' (sort_values default order).
pd.DataFrame(boost_list).sort_values('mean ndcg_cut_10')

Unnamed: 0,product_title_vector_boost,product_description_vector_boost,product_attributes_string_vector_boost,mean map_cut_10,mean ndcg_cut_10,mean recip_rank
1,1,1,4,0.075718,0.124099,0.201496
4,1,2,4,0.077206,0.124977,0.20424
7,1,4,4,0.079669,0.129329,0.209868
6,1,4,2,0.081967,0.131827,0.213443
10,2,1,4,0.082701,0.132682,0.213545
3,1,2,2,0.08399,0.134429,0.218259
15,2,4,4,0.084087,0.134496,0.218088
12,2,2,4,0.084512,0.135192,0.217375
0,1,1,2,0.084664,0.13537,0.217767
5,1,4,1,0.088154,0.144403,0.226614


In [49]:
# Create query result dictionary for the tuned multi-vector

# Boost combination selected for the tuned run.
boosts = {"product_title_vector_boost": 8,
          "product_description_vector_boost": 4,
          "product_attributes_string_vector_boost": 1}

filename = "query_runs/run_multi_vector_tuned.json"

if os.path.isfile(filename):
    with open(filename, "r") as file:
        run_vu_m_tuned = json.load(file)
    # A cached run carries no timing information. Mark the duration as
    # unknown (NaN) so the evaluation cell does not record the duration of
    # whichever run happened to set start_time / end_time last.
    start_time = end_time = float('nan')
else:
    start_time = time.time()
    run_vu_m_tuned = {}

    # Distinct loop names: the inner loop used to rebind the outer loop's
    # `index` / `row` variables.
    for _, query_row in tqdm(relevant_queries.iterrows(), total=len(relevant_queries)):
        search_vector = query_row['query_embedding']
        search_query_id = query_row['query_id']

        results = query_elasticsearch_hybrid(es, index_name, search_vector=search_vector,
                                             num_results=10, boost_values=boosts)
        hits = pd.DataFrame(results['hits']['hits'])

        # Per query: {product_uid as string: score}, the structure later
        # handed to evaluator.evaluate.
        run_vu_m_tuned[search_query_id] = {
            str(hit_row['_source']['product_uid']): hit_row['_score']
            for _, hit_row in hits.iterrows()
        }

    end_time = time.time()

    with open(filename, "w") as file:
        json.dump(run_vu_m_tuned, file)

100%|██████████| 11795/11795 [11:02<00:00, 17.81it/s]


In [50]:
# Evaluate tuned multi-field vector queries

# One column per query, one row per measure; average each measure over queries.
per_query_scores = pd.DataFrame(evaluator.evaluate(run_vu_m_tuned))
results_dict = {
    'mean ' + measure: per_query_scores.loc[measure].mean()
    for measure in measures
}

# Single-row summary for this run, tagged with its name, duration and timestamp.
results_df = pd.DataFrame(results_dict, index=[0]).assign(
    run_name='vectorsearch_multifield_tuned',
    run_time=end_time - start_time,
    run_timestamp=current_timestamp,
)

all_results = pd.concat([all_results, results_df])
all_results

Unnamed: 0,mean ndcg_cut_10,mean map_cut_10,mean recip_rank,run_name,run_time,run_timestamp
0,0.170037,0.112637,0.26115,textsearch,178.50484,2024-05-06 20:30:09.220148
1,0.217595,0.149411,0.31771,textsearch_boosted,207.407446,2024-05-06 20:30:09.220148
0,0.236762,0.159441,0.330878,vectorsearch,509.612543,2024-06-23 11:49:18.153651
0,0.155515,0.096555,0.241066,vectorsearch_multifield,622.987846,2024-06-23 11:49:18.153651
0,0.167586,0.10611,0.254867,vectorsearch_multifield_tuned,662.439398,2024-06-23 11:49:18.153651


# Sample Queries

In [1]:
# Boost values passed as boost_values to the sample text query below
boosts = dict(
    title_boost=8,
    description_boost=2,
    attributes_boost=1,
)

In [12]:
sample_query_id = '083a28c9e216c858cafb3f3a08004fc9f9afa893b5a24612625487bb63d82e0e'

# A sample text query: look up the search term stored for the chosen query id

sample_query_mask = df_queries['query_id'] == sample_query_id
search_text = df_queries.loc[sample_query_mask, 'search_term'].values[0]
print(search_text)

results = query_elasticsearch_hybrid(
    es,
    'products-embeddings',
    search_text=search_text,
    boost_values=boosts,
)

hits = pd.DataFrame(results['hits']['hits'])

real flame gel fuel


In [13]:

# Pull the fields of interest out of each hit's `_source` into flat columns.
sources = hits['_source'].tolist()

product_uids = [src['product_uid'] for src in sources]
product_titles = [src['product_title'] for src in sources]
product_descriptions = [src['product_description'] for src in sources]
product_attributes = [src['product_attributes'] for src in sources]

# All stored query scores of a product are kept, as parallel lists of query
# ids and relevances; products without any stored score get None.
query_id_list = [
    [score['query_id'] for score in src['query_scores']] if len(src['query_scores']) > 0 else None
    for src in sources
]
relevances = [
    [score['relevance'] for score in src['query_scores']] if len(src['query_scores']) > 0 else None
    for src in sources
]

flattened_columns = {
    'product_uid': product_uids,
    'product_title': product_titles,
    'product_description': product_descriptions,
    'product_attribute': product_attributes,
    'query_id': query_id_list,
    'relevance': relevances,
}
for column_name, column_values in flattened_columns.items():
    hits[column_name] = column_values

hits.head(10)


Unnamed: 0,_index,_id,_score,_source,product_uid,product_title,product_description,product_attribute,query_id,relevance
0,products-embeddings,WElF5Y8BqeArwfGV7WjO,295.25946,"{'product_uid': 220338, 'product_title': 'Real...",220338,Real Flame Porter 50 in. Ventless Gel Fuel Fir...,The Porter Fireplace features distinct craftsm...,"[{'name': 'Area Heated (Sq. Ft.)', 'value': '1...",,
1,products-embeddings,Y0g45Y8BqeArwfGVhpWE,294.44052,"{'product_uid': 130661, 'product_title': 'Real...",130661,Real Flame Chateau 41 in. Ventless Gel Fuel Fi...,The Chateau Fireplace features the clean lines...,"[{'name': 'Area Heated (Sq. Ft.)', 'value': '1...",,
2,products-embeddings,aUgx5Y8BqeArwfGVeSbg,292.37097,"{'product_uid': 168245, 'product_title': 'Real...",168245,Real Flame Ashley 48 in. Gel Fuel Fireplace in...,Best-selling gel fireplace. The handsome pilla...,"[{'name': 'Area Heated (Sq. Ft.)', 'value': '2...",[c632e8a28835563a0de6fd31a562d80b4f7653a024b6c...,[2.0]
3,products-embeddings,KEgy5Y8BqeArwfGVRTGN,290.45102,"{'product_uid': 174255, 'product_title': 'Real...",174255,Real Flame Ashley 48 in. Gel Fuel Fireplace in...,Best-selling fireplace. The handsome pillars w...,"[{'name': 'Area Heated (Sq. Ft.)', 'value': '2...",[c632e8a28835563a0de6fd31a562d80b4f7653a024b6c...,[2.33]
4,products-embeddings,TEcs5Y8BqeArwfGVPdW0,290.45102,"{'product_uid': 127790, 'product_title': 'Real...",127790,Real Flame Ashley 48 in. Gel Fuel Fireplace in...,Best-selling fireplace. This handsome pillar w...,"[{'name': 'Area Heated (Sq. Ft.)', 'value': '2...",[2b77823854286393e74a6c020bcb4061ea0237d23d60a...,"[2.67, 2.67, 2.67]"
5,products-embeddings,4Ug95Y8BqeArwfGVAt6i,286.56256,"{'product_uid': 168910, 'product_title': 'Real...",168910,Real Flame Chateau 41 in. Corner Ventless Gel ...,The Chateau Corner Fireplace features the clea...,"[{'name': 'Area Heated (Sq. Ft.)', 'value': '1...",,
6,products-embeddings,LUct5Y8BqeArwfGVbumA,286.56256,"{'product_uid': 137119, 'product_title': 'Real...",137119,Real Flame Chateau 41 in. Corner Ventless Gel ...,The Chateau Corner Fireplace features the clea...,"[{'name': 'Area Heated (Sq. Ft.)', 'value': '1...",[2b77823854286393e74a6c020bcb4061ea0237d23d60a...,"[2.67, 2.33]"
7,products-embeddings,50g15Y8BqeArwfGV32ti,280.71036,"{'product_uid': 104458, 'product_title': 'Real...",104458,Real Flame Silverton 48 in. Gel Fuel Fireplace...,Curl up by the comforting glow of this Real Fl...,[],,
8,products-embeddings,tUg75Y8BqeArwfGVB7o5,279.90768,"{'product_uid': 151041, 'product_title': 'Real...",151041,Real Flame Chateau 41 in. Corner Ventless Gel ...,The Chateau Corner Fireplace features the clea...,"[{'name': 'Area Heated (Sq. Ft.)', 'value': '1...",,
9,products-embeddings,KElD5Y8BqeArwfGV9Uf-,279.46896,"{'product_uid': 211843, 'product_title': 'Real...",211843,Real Flame 15 in. 2-Can Outdoor Gel Fuel Conve...,Convert your existing outdoor fire pit or fire...,"[{'name': 'Assembled Depth (in.)', 'value': '1...",,


In [14]:
# A sample vector query: search with the stored embedding of the same sample query

sample_query_mask = df_queries['query_id'] == sample_query_id
search_vector = df_queries.loc[sample_query_mask, 'query_embedding'].values[0]
results = query_elasticsearch_hybrid(
    es,
    'products-embeddings',
    search_vector=search_vector,
)
hits = pd.DataFrame(results['hits']['hits'])

In [15]:

# Pull the fields of interest out of each hit's `_source` into flat columns.
sources = hits['_source'].tolist()

product_uids = [src['product_uid'] for src in sources]
product_titles = [src['product_title'] for src in sources]
product_descriptions = [src['product_description'] for src in sources]
product_attributes = [src['product_attributes'] for src in sources]

# All stored query scores of a product are kept, as parallel lists of query
# ids and relevances; products without any stored score get None.
query_id_list = [
    [score['query_id'] for score in src['query_scores']] if len(src['query_scores']) > 0 else None
    for src in sources
]
relevances = [
    [score['relevance'] for score in src['query_scores']] if len(src['query_scores']) > 0 else None
    for src in sources
]

flattened_columns = {
    'product_uid': product_uids,
    'product_title': product_titles,
    'product_description': product_descriptions,
    'product_attribute': product_attributes,
    'query_id': query_id_list,
    'relevance': relevances,
}
for column_name, column_values in flattened_columns.items():
    hits[column_name] = column_values

hits.head(10)


Unnamed: 0,_index,_id,_score,_source,product_uid,product_title,product_description,product_attribute,query_id,relevance
0,products-embeddings,w0cr5Y8BqeArwfGVO8Tk,0.809501,"{'product_uid': 120490, 'product_title': 'Real...",120490,Real Flame 13 oz. 18.5 lb. Gel Fuel Cans (16-P...,Real Flame is the leading brand of gel fuel in...,[],[083a28c9e216c858cafb3f3a08004fc9f9afa893b5a24...,[3.0]
1,products-embeddings,qEgw5Y8BqeArwfGVCw-0,0.804812,"{'product_uid': 156148, 'product_title': 'Real...",156148,Real Flame 13 oz. 24 lb. Gel Fuel Cans (24-Pack),Real Flame is the leading brand of gel fuel in...,[],[083a28c9e216c858cafb3f3a08004fc9f9afa893b5a24...,[3.0]
2,products-embeddings,LUgw5Y8BqeArwfGVcRYw,0.803864,"{'product_uid': 159600, 'product_title': 'Real...",159600,Real Flame 13 oz. 15 lb. Gel Fuel Cans (12-Pack),Real Flame is the leading brand of gel fuel in...,[],[083a28c9e216c858cafb3f3a08004fc9f9afa893b5a24...,[3.0]
3,products-embeddings,mEg65Y8BqeArwfGV6rjQ,0.776211,"{'product_uid': 149940, 'product_title': 'Real...",149940,Real Flame 24 in. Oak Convert to Gel Fireplace...,Convert your existing gas or wood-burning fire...,[],,
4,products-embeddings,Mkgw5Y8BqeArwfGV9B6A,0.775063,"{'product_uid': 163869, 'product_title': 'Real...",163869,Real Flame Fresno 72 in. Media Console Gel Fue...,Enjoy the beauty of a Real Flame fireplace. Bu...,[],[083a28c9e216c858cafb3f3a08004fc9f9afa893b5a24...,[2.67]
5,products-embeddings,50g15Y8BqeArwfGV32ti,0.769471,"{'product_uid': 104458, 'product_title': 'Real...",104458,Real Flame Silverton 48 in. Gel Fuel Fireplace...,Curl up by the comforting glow of this Real Fl...,[],,
6,products-embeddings,20cs5Y8BqeArwfGVttxg,0.76487,"{'product_uid': 131293, 'product_title': 'Real...",131293,Real Flame Hawthorne 75 in. Media Console Gel ...,The Hawthorne Gel Fireplace features mission i...,[],[07006d6c8e46f85476905aec60a75ee91bbc86d05fed6...,[2.33]
7,products-embeddings,vkcu5Y8BqeArwfGVKvTm,0.763937,"{'product_uid': 142622, 'product_title': 'Real...",142622,Real Flame 18 in. Oak Convert to Gel Fireplace...,Convert your existing gas or wood-burning fire...,[],[083a28c9e216c858cafb3f3a08004fc9f9afa893b5a24...,[2.0]
8,products-embeddings,EEg65Y8BqeArwfGV47iV,0.76364,"{'product_uid': 149669, 'product_title': 'Real...",149669,Real Flame Chateau 41 in. Ventless Gel Fuel Fi...,The Chateau Fireplace features the clean lines...,[],,
9,products-embeddings,KElD5Y8BqeArwfGV9Uf-,0.763134,"{'product_uid': 211843, 'product_title': 'Real...",211843,Real Flame 15 in. 2-Can Outdoor Gel Fuel Conve...,Convert your existing outdoor fire pit or fire...,"[{'name': 'Assembled Depth (in.)', 'value': '1...",,
