In [1]:
import ast
import datetime
import json
import os
import time

from elasticsearch import Elasticsearch
import numpy as np
import pandas as pd
import pytrec_eval
from tqdm import tqdm

from query_functions import query_elasticsearch_hybrid

# Captured once, when this import cell runs. The evaluation cells further down
# write this same value into the `run_timestamp` column of every results row.
current_timestamp = datetime.datetime.now()


In [2]:
# Create a client for the Elasticsearch node on localhost and ping it; the
# result of the ping is displayed as the cell output.

es = Elasticsearch(hosts='http://localhost:9200')
es.ping()

True

# Load Data

In [3]:
def convert_to_dict(string):
    """Parse the text form of a Python dict literal back into a dict.

    Args:
        string: Text such as "{'a': 1}". Non-string values (for example the
            NaN that pandas uses for an empty CSV cell) are accepted and
            yield None.

    Returns:
        The parsed dict, or None when the input cannot be parsed or parses to
        something other than a dict.
    """
    try:
        parsed = ast.literal_eval(string)
    except (SyntaxError, ValueError, TypeError):
        # literal_eval raises TypeError as well as SyntaxError/ValueError for
        # malformed input, e.g. a dict literal with an unhashable key.
        return None

    # A valid literal of another type (list, number, ...) is not a dictionary
    # representation either; callers go on to call .items() on the result.
    return parsed if isinstance(parsed, dict) else None


In [4]:
def dict_to_string(my_dict):
    """Flatten a mapping into one string of space-separated keys and values.

    Each pair is rendered as "<key> <value> " in the mapping's iteration
    order, so a non-empty result ends with a trailing space and an empty
    mapping gives "".
    """
    pieces = [str(key) + ' ' + str(value) + ' ' for key, value in my_dict.items()]
    return "".join(pieces)

In [5]:
# Products

filename = "processed_data/df_prods.csv"

# Stop here, with the path in the message, when the file is missing. Printing
# and carrying on only defers the failure to the `df_prods` lookups below: a
# NameError on a fresh kernel, or silent reuse of a stale frame otherwise.
if not os.path.isfile(filename):
    raise FileNotFoundError('Cannot locate file: ' + filename)

df_prods = pd.read_csv(filename)
# The attributes column holds the text form of a dict; parse it back into a
# dict (convert_to_dict gives None for values it cannot parse).
df_prods['product_attributes'] = df_prods['product_attributes'].apply(convert_to_dict)

df_prods.head()

Unnamed: 0,product_uid,product_title,product_description,product_attributes
0,100001,Simpson Strong-Tie 12-Gauge Angle,"Not only do angles make joints stronger, they ...",{'Bullet01': 'Versatile connector for various ...
1,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,BEHR Premium Textured DECKOVER is an innovativ...,"{'Application Method': 'Brush,Roller,Spray', '..."
2,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,Update your bathroom with the Delta Vero Singl...,"{'Bath Faucet Type': 'Combo Tub and Shower', '..."
3,100006,Whirlpool 1.9 cu. ft. Over the Range Convectio...,Achieving delicious results is almost effortle...,"{'Appliance Type': 'Over the Range Microwave',..."
4,100007,Lithonia Lighting Quantum 2-Light Black LED Em...,The Quantum Adjustable 2-Light LED Black Emerg...,"{'Battery Power Type': 'Ni-Cad', 'Battery Size..."


In [6]:
# Flatten each attribute dict into "key value key value ..." text. Rows whose
# attributes are None stay None.
df_prods['product_attributes_string'] = [
    None if attrs is None else dict_to_string(attrs)
    for attrs in df_prods['product_attributes']
]

# Join title, description and attribute text with single spaces, treating a
# missing piece as an empty string.
title_text = df_prods['product_title'].fillna('')
description_text = df_prods['product_description'].fillna('')
attributes_text = df_prods['product_attributes_string'].fillna('')
df_prods['product_text_string'] = title_text + ' ' + description_text + ' ' + attributes_text

In [7]:
# Summary statistics of the character length of each product's combined text.
pd.Series(list(map(len, df_prods['product_text_string']))).describe()

count    124603.000000
mean       1504.848262
std         904.592258
min         123.000000
25%         888.000000
50%        1297.000000
75%        1890.000000
max       11606.000000
dtype: float64

In [8]:
# Query

filename = "processed_data/df_queries.csv"

# Stop here, with the path in the message, when the file is missing. Printing
# and carrying on only defers the failure to the `df_queries` lookups below: a
# NameError on a fresh kernel, or silent reuse of a stale frame otherwise.
if not os.path.isfile(filename):
    raise FileNotFoundError('Cannot locate file: ' + filename)

df_queries = pd.read_csv(filename)

df_queries.head()

Unnamed: 0,search_term,query_id,has_relevant_results
0,angle bracket,6e0a07626e48aee6f7ce9ec6cd753426d6acafded1598f...,1
1,l bracket,5863e75dfdc9ae5db3f6b4dbddf129d5568e085bf57711...,1
2,deck over,406b3569b2db043604fdb42a67f4ec49964a5ff07cddf0...,1
3,rain shower head,49b2dc56a0e1945c435c1579c07df519878619e3e8d59d...,1
4,shower only faucet,7620551bacb6cdddca5f33ec0943cea7971095a1e9be06...,1


In [9]:
# Relevance

filename = "processed_data/df_relevance.csv"

# Stop here, with the path in the message, when the file is missing. Printing
# and carrying on only defers the failure to the `df_relevance.head()` call
# below: a NameError on a fresh kernel, or a stale frame shown otherwise.
if not os.path.isfile(filename):
    raise FileNotFoundError('Cannot locate file: ' + filename)

df_relevance = pd.read_csv(filename)

df_relevance.head()

Unnamed: 0,query_id,product_uid,relevance
0,6e0a07626e48aee6f7ce9ec6cd753426d6acafded1598f...,100001,3.0
1,5863e75dfdc9ae5db3f6b4dbddf129d5568e085bf57711...,100001,2.5
2,406b3569b2db043604fdb42a67f4ec49964a5ff07cddf0...,100002,3.0
3,49b2dc56a0e1945c435c1579c07df519878619e3e8d59d...,100005,2.33
4,7620551bacb6cdddca5f33ec0943cea7971095a1e9be06...,100005,2.67


# Embeddings

In [10]:
# The embeddings were computed on Kaggle (for the GPU) and saved as .npy files.
# The code that produced them is kept below, commented out, for reference.

# docs = df_prods['product_text_string'].tolist()
# prod_embeddings = model.encode(docs)
# np.save('/kaggle/working/prod_embeddings.npy', prod_embeddings)

# prompt = 'Represent this sentence for searching relevant passages: '
# docs = [prompt + str(x) for x in df_queries['search_term']]
# query_embeddings = model.encode(docs)
# np.save('/kaggle/working/query_embeddings.npy', query_embeddings)

# Load the saved product and query embedding arrays, in that order.
prod_embeddings, query_embeddings = (
    np.load(npy_path)
    for npy_path in ('processed_data/prod_embeddings.npy',
                     'processed_data/query_embeddings.npy')
)

In [11]:
# Attach each query's embedding to its row as a plain Python list.
df_queries['query_embedding'] = query_embeddings.tolist()
df_queries.head()

Unnamed: 0,search_term,query_id,has_relevant_results,query_embedding
0,angle bracket,6e0a07626e48aee6f7ce9ec6cd753426d6acafded1598f...,1,"[-0.058361075818538666, 0.026495283469557762, ..."
1,l bracket,5863e75dfdc9ae5db3f6b4dbddf129d5568e085bf57711...,1,"[-0.027898991480469704, -0.024725843220949173,..."
2,deck over,406b3569b2db043604fdb42a67f4ec49964a5ff07cddf0...,1,"[-0.07859756052494049, -0.00036610415554605424..."
3,rain shower head,49b2dc56a0e1945c435c1579c07df519878619e3e8d59d...,1,"[-0.020222697407007217, 0.06791018694639206, 0..."
4,shower only faucet,7620551bacb6cdddca5f33ec0943cea7971095a1e9be06...,1,"[-0.003950382117182016, 0.012035505846142769, ..."


# Hybrid Queries

In [12]:
# Name of the Elasticsearch index that the hybrid queries below are sent to.
index_name = 'products-embeddings'

## Sample Query

In [13]:
# Try the hybrid query on the row labelled 0 of df_queries before running the
# full set.

search_query_id = df_queries.loc[0, 'query_id']
search_vector = df_queries.loc[0, 'query_embedding']
search_text = df_queries.loc[0, 'search_term']

results = query_elasticsearch_hybrid(
    es,
    index_name,
    search_text=search_text,
    search_vector=search_vector,
    num_results=10,
)

# One row per returned hit; the stored document sits in the `_source` column.
hits = pd.DataFrame(results['hits']['hits'])

In [14]:

# Pull the fields of interest out of every hit's `_source` document so they
# can be shown as ordinary columns next to the score.
sources = list(hits['_source'])

product_uids = [src['product_uid'] for src in sources]
product_titles = [src['product_title'] for src in sources]
product_descriptions = [src['product_description'] for src in sources]
product_attributes = [src['product_attributes'] for src in sources]

# Only the first entry of `query_scores` is used; a document with an empty
# list gets None for both the query id and the relevance.
query_id_list = [
    src['query_scores'][0]['query_id'] if len(src['query_scores']) > 0 else None
    for src in sources
]
relevances = [
    src['query_scores'][0]['relevance'] if len(src['query_scores']) > 0 else None
    for src in sources
]

extracted_columns = {
    'product_uid': product_uids,
    'product_title': product_titles,
    'product_description': product_descriptions,
    'product_attribute': product_attributes,
    'query_id': query_id_list,
    'relevance': relevances,
}
for column_name, column_values in extracted_columns.items():
    hits[column_name] = column_values

hits.head()


Unnamed: 0,_index,_id,_score,_source,product_uid,product_title,product_description,product_attribute,query_id,relevance
0,products-embeddings,o5EPFo8B20Z26XKW68KM,40.223774,"{'product_uid': 198519, 'product_title': 'Vera...",198519,Veranda White Vinyl Traditional Left/Right Ang...,The Traditional Vinyl White Left/Right Angle B...,"[{'name': 'Accessory type', 'value': 'Left/Rig...",45476ecfe43c98557ae68d2eec00a37d8f21c0fef3d913...,2.0
1,products-embeddings,0ZIVFo8B20Z26XKWTVM4,34.11208,"{'product_uid': 174846, 'product_title': 'Marq...",174846,Marquee Railing Black Left Multi-Angle Bracket...,"Designed with a beautiful hammered-metal look,...","[{'name': 'Accessory type', 'value': 'Bracket'...",,
2,products-embeddings,S5IUFo8B20Z26XKW9Ep8,34.11095,"{'product_uid': 170398, 'product_title': 'Marq...",170398,Marquee Railing Black Right Multi-Angle Bracke...,"Designed with a beautiful hammered-metal look,...","[{'name': 'Accessory type', 'value': 'Bracket'...",,
3,products-embeddings,OJIaFo8B20Z26XKWbt6W,33.91701,"{'product_uid': 223811, 'product_title': 'Marq...",223811,Marquee Railing White Left Multi-Angle Bracket...,"Designed with a beautiful hammered-metal look,...","[{'name': 'Accessory type', 'value': 'Bracket'...",,
4,products-embeddings,WpEQFo8B20Z26XKWi9H2,31.861212,"{'product_uid': 102005, 'product_title': 'Supe...",102005,Superstrut 90Ë 4-Hole Angle Bracket - Silver ...,The Superstrut 90-Degree 4-Hole Channel Bracke...,"[{'name': 'Bullet01', 'value': 'Use to support...",b39f9b993b97bb3eb11c9dc73220691215ab99971b52b7...,2.67


## Run the hybrid queries

In [15]:
# Keep only the queries flagged as having relevant results.

relevant_queries = df_queries.loc[df_queries['has_relevant_results'].eq(1)]

In [16]:
# Weights handed to the hybrid query as `boost_values`.
boosts = dict(
    title_boost=8,
    description_boost=2,
    attributes_boost=1,
    vector_boost=1,
)

In [17]:
# Create query result dictionaries

filename = "query_runs/run_hybrid.json"

if os.path.isfile(filename):
    # A saved run exists: reuse it instead of querying Elasticsearch again.
    with open(filename, "r") as file:
        run_h = json.load(file)

    # Nothing is timed on this path. The evaluation cell computes
    # `end_time - start_time`, so bind both to NaN: the run time is then
    # recorded as missing instead of the cell failing with a NameError on a
    # fresh kernel.
    start_time = end_time = float('nan')
else:
    run_h = {}

    start_time = time.time()

    # The query row has its own name so it cannot be confused with the
    # per-hit variable used in the comprehension below.
    for _, query_row in tqdm(relevant_queries.iterrows(), total=len(relevant_queries)):
        search_vector = query_row['query_embedding']
        search_query_id = query_row['query_id']
        search_text = query_row['search_term']

        results = query_elasticsearch_hybrid(es, index_name, search_text=search_text,
                                             search_vector=search_vector, num_results=10, boost_values=boosts)

        # product_uid (as a string) -> score, for the hits of this query.
        run_h[search_query_id] = {
            str(hit['_source']['product_uid']): float(hit['_score'])
            for hit in results['hits']['hits']
        }

    end_time = time.time()

    # Save the run so later executions of this cell take the cached branch.
    with open(filename, "w") as file:
        json.dump(run_h, file)

100%|██████████| 11795/11795 [07:19<00:00, 26.85it/s]


## Evaluate Results

In [18]:
# Read the ground-truth relevance judgements from the qrel JSON file.
filename = "query_runs/qrel.json"

with open(filename, mode="r") as qrel_file:
    qrel = json.load(qrel_file)

In [19]:
# The measures to compute, an evaluator built from the ground truth for those
# measures, and a list that collects one summary frame per evaluated run.

measures = {
    'map_cut_10',
    'ndcg_cut_10',
    'recip_rank',
}
evaluator = pytrec_eval.RelevanceEvaluator(qrel, measures)
ranking_results = []

In [20]:
# Score the hybrid run and reduce it to one summary row.

# `.loc[measure]` picks the row of values for one measure; its mean is stored
# under the key "mean <measure>".
per_query_scores = pd.DataFrame(evaluator.evaluate(run_h))
results_dict = {
    'mean ' + measure: per_query_scores.loc[measure].mean()
    for measure in measures
}

# Single-row frame holding the averaged measures plus the run's metadata.
results_df = pd.DataFrame(results_dict, index=[0]).assign(
    run_name='hybrid',
    run_time=end_time - start_time,
    run_timestamp=current_timestamp,
)

ranking_results.append(results_df)

In [21]:
# Put the results into a dataframe, add to previous results

# `ranking_results` stays a list and the combined frame gets its own name, so
# this cell can be run again: pd.concat rejects a bare DataFrame as its first
# argument.
new_results = pd.concat(ranking_results, ignore_index=True)

all_results = pd.read_csv('query_runs/query_results.csv')
# ignore_index gives the combined table a unique 0..n-1 index instead of
# repeating the row labels of the pieces.
all_results = pd.concat([all_results, new_results], ignore_index=True)
all_results

Unnamed: 0,mean ndcg_cut_10,mean map_cut_10,mean recip_rank,run_name,run_time,run_timestamp
0,0.170037,0.112637,0.26115,textsearch,178.50484,2024-05-06 20:30:09.220148
1,0.217595,0.149411,0.31771,textsearch_boosted,207.407446,2024-05-06 20:30:09.220148
2,0.245862,0.165978,0.342546,vectorsearch,306.276232,2024-05-06 20:39:12.634220
0,0.23893,0.163333,0.325697,hybrid,439.463497,2024-05-06 20:48:41.077788


# Tune Vector Boosts

In [22]:
# Candidate settings: the field boosts stay fixed while `vector_boost` varies.
boost_list = [
    {"title_boost": 8,
     "description_boost": 2,
     "attributes_boost": 1,
     "vector_boost": vector_boost}
    for vector_boost in [10, 50, 100, 200]
]

In [None]:
# Run hybrid queries with boosts

measures = {'map_cut_10', 'ndcg_cut_10', 'recip_rank'}
evaluator = pytrec_eval.RelevanceEvaluator(qrel, measures)

for boost_setting in tqdm(boost_list):

    # The entries of boost_list also receive the metric values written at the
    # end of this loop. Send only the boost keys to the query, so that running
    # the cell again does not pass earlier metric values as `boost_values`
    # (how the query helper treats unknown keys is not visible here).
    query_boosts = {key: value for key, value in boost_setting.items()
                    if not key.startswith('mean ')}

    run = {}

    for _, query_row in tqdm(relevant_queries.iterrows(), total=len(relevant_queries)):
        search_vector = query_row['query_embedding']
        search_query_id = query_row['query_id']
        search_text = query_row['search_term']

        results = query_elasticsearch_hybrid(es, index_name, search_text, search_vector,
                                             num_results=10, boost_values=query_boosts)

        # product_uid (as a string) -> score, for the hits of this query.
        run[search_query_id] = {
            str(hit['_source']['product_uid']): float(hit['_score'])
            for hit in results['hits']['hits']
        }

    # `.loc[measure]` picks the row of values for one measure; store its mean
    # on the setting itself so boost_list can be tabulated afterwards.
    per_query_scores = pd.DataFrame(evaluator.evaluate(run))
    # sorted() fixes the key order (map, ndcg, recip_rank); iterating the set
    # directly would give an arbitrary column order.
    for measure in sorted(measures):
        boost_setting['mean ' + measure] = per_query_scores.loc[measure].mean()


In [None]:
# Tabulate the settings tried so far, ordered by mean NDCG@10 (ascending, so
# the best setting is the last row).
pd.DataFrame(boost_list).sort_values(by='mean ndcg_cut_10', ascending=True)

In [None]:
# Queue two more candidates, with vector_boost 25 and 75.
boost_list.extend(
    {"title_boost": 8,
     "description_boost": 2,
     "attributes_boost": 1,
     "vector_boost": vector_boost}
    for vector_boost in [25, 75]
)

In [None]:
# Run hybrid queries with boosts

measures = {'map_cut_10', 'ndcg_cut_10', 'recip_rank'}
evaluator = pytrec_eval.RelevanceEvaluator(qrel, measures)

# Only the last two entries of boost_list are evaluated here.
for boost_setting in tqdm(boost_list[-2:]):

    # The entries of boost_list also receive the metric values written at the
    # end of this loop. Send only the boost keys to the query, so that running
    # the cell again does not pass earlier metric values as `boost_values`
    # (how the query helper treats unknown keys is not visible here).
    query_boosts = {key: value for key, value in boost_setting.items()
                    if not key.startswith('mean ')}

    run = {}

    for _, query_row in tqdm(relevant_queries.iterrows(), total=len(relevant_queries)):
        search_vector = query_row['query_embedding']
        search_query_id = query_row['query_id']
        search_text = query_row['search_term']

        results = query_elasticsearch_hybrid(es, index_name, search_text, search_vector,
                                             num_results=10, boost_values=query_boosts)

        # product_uid (as a string) -> score, for the hits of this query.
        run[search_query_id] = {
            str(hit['_source']['product_uid']): float(hit['_score'])
            for hit in results['hits']['hits']
        }

    # `.loc[measure]` picks the row of values for one measure; store its mean
    # on the setting itself so boost_list can be tabulated afterwards.
    per_query_scores = pd.DataFrame(evaluator.evaluate(run))
    # sorted() fixes the key order (map, ndcg, recip_rank); iterating the set
    # directly would give an arbitrary column order.
    for measure in sorted(measures):
        boost_setting['mean ' + measure] = per_query_scores.loc[measure].mean()


In [None]:
# Tabulate the settings tried so far, ordered by vector_boost (ascending).
pd.DataFrame(boost_list).sort_values(by='vector_boost', ascending=True)

In [None]:
# Queue two more candidates, with vector_boost 40 and 60.
boost_list.extend(
    {"title_boost": 8,
     "description_boost": 2,
     "attributes_boost": 1,
     "vector_boost": vector_boost}
    for vector_boost in [40, 60]
)

In [None]:
# Run hybrid queries with boosts

measures = {'map_cut_10', 'ndcg_cut_10', 'recip_rank'}
evaluator = pytrec_eval.RelevanceEvaluator(qrel, measures)

# Only the last two entries of boost_list are evaluated here.
for boost_setting in tqdm(boost_list[-2:]):

    # The entries of boost_list also receive the metric values written at the
    # end of this loop. Send only the boost keys to the query, so that running
    # the cell again does not pass earlier metric values as `boost_values`
    # (how the query helper treats unknown keys is not visible here).
    query_boosts = {key: value for key, value in boost_setting.items()
                    if not key.startswith('mean ')}

    run = {}

    for _, query_row in tqdm(relevant_queries.iterrows(), total=len(relevant_queries)):
        search_vector = query_row['query_embedding']
        search_query_id = query_row['query_id']
        search_text = query_row['search_term']

        results = query_elasticsearch_hybrid(es, index_name, search_text, search_vector,
                                             num_results=10, boost_values=query_boosts)

        # product_uid (as a string) -> score, for the hits of this query.
        run[search_query_id] = {
            str(hit['_source']['product_uid']): float(hit['_score'])
            for hit in results['hits']['hits']
        }

    # `.loc[measure]` picks the row of values for one measure; store its mean
    # on the setting itself so boost_list can be tabulated afterwards.
    per_query_scores = pd.DataFrame(evaluator.evaluate(run))
    # sorted() fixes the key order (map, ndcg, recip_rank); iterating the set
    # directly would give an arbitrary column order.
    for measure in sorted(measures):
        boost_setting['mean ' + measure] = per_query_scores.loc[measure].mean()


In [None]:
# Tabulate every setting tried, ordered by mean NDCG@10 (ascending, so the
# best setting is the last row).
pd.DataFrame(boost_list).sort_values(by='mean ndcg_cut_10', ascending=True)

In [23]:
# Weights for the boosted hybrid run: the same field boosts as before, with
# vector_boost set to 50.
boosts = dict(
    title_boost=8,
    description_boost=2,
    attributes_boost=1,
    vector_boost=50,
)

In [24]:
# Create query result dictionaries

filename = "query_runs/run_hybrid_boosted.json"

if os.path.isfile(filename):
    # A saved run exists: reuse it instead of querying Elasticsearch again.
    with open(filename, "r") as file:
        run_hb = json.load(file)

    # Nothing is timed on this path. The evaluation cell computes
    # `end_time - start_time`, so bind both to NaN: the run time is then
    # recorded as missing instead of the cell failing with a NameError, or
    # reporting the timings left behind by an earlier run in this session.
    start_time = end_time = float('nan')
else:
    run_hb = {}

    start_time = time.time()

    # The query row has its own name so it cannot be confused with the
    # per-hit variable used in the comprehension below.
    for _, query_row in tqdm(relevant_queries.iterrows(), total=len(relevant_queries)):
        search_vector = query_row['query_embedding']
        search_query_id = query_row['query_id']
        search_text = query_row['search_term']

        results = query_elasticsearch_hybrid(es, index_name, search_text=search_text,
                                             search_vector=search_vector, num_results=10, boost_values=boosts)

        # product_uid (as a string) -> score, for the hits of this query.
        run_hb[search_query_id] = {
            str(hit['_source']['product_uid']): float(hit['_score'])
            for hit in results['hits']['hits']
        }

    end_time = time.time()

    # Save the run so later executions of this cell take the cached branch.
    with open(filename, "w") as file:
        json.dump(run_hb, file)

100%|██████████| 11795/11795 [08:41<00:00, 22.63it/s]


## Evaluate Results

In [25]:
# The measures to compute, an evaluator built from the ground truth for those
# measures, and a fresh list that collects one summary frame per evaluated run.

measures = {
    'map_cut_10',
    'ndcg_cut_10',
    'recip_rank',
}
evaluator = pytrec_eval.RelevanceEvaluator(qrel, measures)
ranking_results = []

In [26]:
# Score the boosted hybrid run and reduce it to one summary row.

# `.loc[measure]` picks the row of values for one measure; its mean is stored
# under the key "mean <measure>".
per_query_scores = pd.DataFrame(evaluator.evaluate(run_hb))
results_dict = {
    'mean ' + measure: per_query_scores.loc[measure].mean()
    for measure in measures
}

# Single-row frame holding the averaged measures plus the run's metadata.
results_df = pd.DataFrame(results_dict, index=[0]).assign(
    run_name='hybrid_boosted',
    run_time=end_time - start_time,
    run_timestamp=current_timestamp,
)

ranking_results.append(results_df)

In [27]:
# Put the results into a dataframe, add to previous results

# `ranking_results` stays a list and the combined frame gets its own name, so
# this cell can be run again: pd.concat rejects a bare DataFrame as its first
# argument.
new_results = pd.concat(ranking_results, ignore_index=True)

all_results = pd.concat([all_results, new_results], ignore_index=True)
# Running this cell (or the evaluation cell before it) more than once appends
# the same run again. Keep one row per (run_name, run_timestamp), preferring
# the most recently added copy.
all_results = (
    all_results
    .drop_duplicates(subset=['run_name', 'run_timestamp'], keep='last')
    .reset_index(drop=True)
)
all_results

Unnamed: 0,mean ndcg_cut_10,mean map_cut_10,mean recip_rank,run_name,run_time,run_timestamp
0,0.170037,0.112637,0.26115,textsearch,178.50484,2024-05-06 20:30:09.220148
1,0.217595,0.149411,0.31771,textsearch_boosted,207.407446,2024-05-06 20:30:09.220148
2,0.245862,0.165978,0.342546,vectorsearch,306.276232,2024-05-06 20:39:12.634220
0,0.23893,0.163333,0.325697,hybrid,439.463497,2024-05-06 20:48:41.077788
0,0.256408,0.174505,0.34937,hybrid_boosted,521.287253,2024-05-06 20:48:41.077788


In [28]:
# Write the combined results table back to the results CSV, leaving out the
# index column.
all_results.to_csv(path_or_buf='query_runs/query_results.csv', index=False)
