In [1]:
import ast
import datetime
import json
import os
import time

from elasticsearch import Elasticsearch
import numpy as np
import pandas as pd
import pytrec_eval
from tqdm import tqdm

from query_functions import query_elasticsearch_rrf, query_elasticsearch_rrf_multi, query_elasticsearch_hybrid

current_timestamp = datetime.datetime.now()

In [2]:
# Connect to local elastic

es = Elasticsearch('http://localhost:9200')
es.ping()

True

# Load Data

In [3]:
def convert_to_dict(string):
    try:
        return ast.literal_eval(string)
    except (SyntaxError, ValueError):
        return None  # Handle cases where the string is not a valid dictionary representation


In [4]:
def dict_to_string(my_dict):
  result_str = ""
  for key, value in my_dict.items():
    result_str += str(key) + ' ' + str(value) + ' '
  return result_str

In [5]:
# Products

filename = "processed_data/df_prods.csv"

if os.path.isfile(filename):
    df_prods = pd.read_csv(filename)
    df_prods['product_attributes'] = df_prods['product_attributes'].apply(convert_to_dict)

else:
    print('Cannot locate file')

df_prods.head()

Unnamed: 0,product_uid,product_title,product_description,product_attributes
0,100001,Simpson Strong-Tie 12-Gauge Angle,"Not only do angles make joints stronger, they ...",{'Bullet01': 'Versatile connector for various ...
1,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,BEHR Premium Textured DECKOVER is an innovativ...,"{'Application Method': 'Brush,Roller,Spray', '..."
2,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,Update your bathroom with the Delta Vero Singl...,"{'Bath Faucet Type': 'Combo Tub and Shower', '..."
3,100006,Whirlpool 1.9 cu. ft. Over the Range Convectio...,Achieving delicious results is almost effortle...,"{'Appliance Type': 'Over the Range Microwave',..."
4,100007,Lithonia Lighting Quantum 2-Light Black LED Em...,The Quantum Adjustable 2-Light LED Black Emerg...,"{'Battery Power Type': 'Ni-Cad', 'Battery Size..."


In [6]:
df_prods['product_attributes_string'] = [dict_to_string(x) if x is not None else x for x in df_prods['product_attributes']]
df_prods['product_text_string'] = df_prods['product_title'].fillna('') + ' ' + df_prods['product_description'].fillna('') + ' ' + df_prods['product_attributes_string'].fillna('') 

In [7]:
# Query

filename = "processed_data/df_queries.csv"

if os.path.isfile(filename):
    df_queries = pd.read_csv(filename)
    
else:
    print('Cannot locate file')

df_queries.head()

Unnamed: 0,search_term,query_id,has_relevant_results
0,angle bracket,6e0a07626e48aee6f7ce9ec6cd753426d6acafded1598f...,1
1,l bracket,5863e75dfdc9ae5db3f6b4dbddf129d5568e085bf57711...,1
2,deck over,406b3569b2db043604fdb42a67f4ec49964a5ff07cddf0...,1
3,rain shower head,49b2dc56a0e1945c435c1579c07df519878619e3e8d59d...,1
4,shower only faucet,7620551bacb6cdddca5f33ec0943cea7971095a1e9be06...,1


In [8]:
# Relevance

filename = "processed_data/df_relevance.csv"

if os.path.isfile(filename):
    df_relevance = pd.read_csv(filename)
    
else:
    print('Cannot locate file')

df_relevance.head()

Unnamed: 0,query_id,product_uid,relevance
0,6e0a07626e48aee6f7ce9ec6cd753426d6acafded1598f...,100001,3.0
1,5863e75dfdc9ae5db3f6b4dbddf129d5568e085bf57711...,100001,2.5
2,406b3569b2db043604fdb42a67f4ec49964a5ff07cddf0...,100002,3.0
3,49b2dc56a0e1945c435c1579c07df519878619e3e8d59d...,100005,2.33
4,7620551bacb6cdddca5f33ec0943cea7971095a1e9be06...,100005,2.67


# Embeddings

In [9]:
# Embeddings were run on Kaggle for GPU

query_embeddings = np.load('processed_data/query_embeddings.npy')

In [10]:
df_queries['query_embedding'] = [embedding.tolist() for embedding in query_embeddings]
df_queries.head()

Unnamed: 0,search_term,query_id,has_relevant_results,query_embedding
0,angle bracket,6e0a07626e48aee6f7ce9ec6cd753426d6acafded1598f...,1,"[-0.058361075818538666, 0.026495283469557762, ..."
1,l bracket,5863e75dfdc9ae5db3f6b4dbddf129d5568e085bf57711...,1,"[-0.027898991480469704, -0.024725843220949173,..."
2,deck over,406b3569b2db043604fdb42a67f4ec49964a5ff07cddf0...,1,"[-0.07859756052494049, -0.00036610415554605424..."
3,rain shower head,49b2dc56a0e1945c435c1579c07df519878619e3e8d59d...,1,"[-0.020222697407007217, 0.06791018694639206, 0..."
4,shower only faucet,7620551bacb6cdddca5f33ec0943cea7971095a1e9be06...,1,"[-0.003950382117182016, 0.012035505846142769, ..."


# RRF Queries

In [11]:
# Filter down to queries that have results

relevant_queries = df_queries[df_queries['has_relevant_results']==1]

In [12]:
# Create query result dictionaries from RRF queries

filename = "query_runs/run_rrf.json"

if os.path.isfile(filename):
    with open(filename, "r") as file:
        run_r = json.load(file)
else:
    run_r = {}

    start_time = time.time()

    boost_values = {
        "title_boost": 8,
        "description_boost": 2,
        "attributes_boost": 1,
        "product_text_string_vector_boost": 1,
        }
    num_query_results = 50
    k = 60

    for index, row in tqdm(relevant_queries.iterrows(), total = len(relevant_queries)):
        search_term = row['search_term']
        search_vector = row['query_embedding']
        search_query_id = row['query_id']

        results = query_elasticsearch_rrf(es, 'products-embeddings',
                                          search_text=search_term,
                                          search_vector=search_vector,
                                          num_results=10,
                                          num_query_results=num_query_results,
                                          k=k,
                                          boost_values = boost_values,)
 
        query_doc_dict = {}
        for index, row in results.iterrows():
            query_doc_dict[str(row['product_uid'])] = row['rrf_score']
        
        run_r[search_query_id] = query_doc_dict

    end_time = time.time()

    with open(filename, "w") as file:
        json.dump(run_r, file)

100%|██████████| 11795/11795 [1:07:06<00:00,  2.93it/s]


## Evaluate Results

In [13]:
# Load ground truth
filename = "query_runs/qrel.json"

with open(filename, "r") as file:
    qrel = json.load(file)

In [14]:
# Place to store results and initialize an evaluator

ranking_results = []
measures = {'map_cut_10', 'ndcg_cut_10', 'recip_rank'} 
evaluator = pytrec_eval.RelevanceEvaluator(qrel, measures)

In [15]:
# Evaluate rrf queries

results_df = pd.DataFrame(evaluator.evaluate(run_r))
results_dict = {}
for measure in measures:
    results_dict['mean '+measure] = results_df.loc[measure].mean()
results_df = pd.DataFrame(results_dict, index=[0])

results_df['run_name'] = 'rrf'
results_df['run_time'] = end_time - start_time
results_df['run_timestamp'] = current_timestamp

ranking_results.append(results_df)

In [16]:
# Put the results into a dataframe, add to previous results

ranking_results = pd.concat(ranking_results)

all_results = pd.read_csv('query_runs/query_results.csv')
all_results = pd.concat([all_results, ranking_results])
all_results

Unnamed: 0,mean ndcg_cut_10,mean map_cut_10,mean recip_rank,run_name,run_time,run_timestamp
0,0.170037,0.112637,0.26115,textsearch,178.50484,2024-05-06 20:30:09.220148
1,0.217595,0.149411,0.31771,textsearch_boosted,207.407446,2024-05-06 20:30:09.220148
2,0.236762,0.159441,0.330878,vectorsearch,509.612543,2024-06-23 12:50:09.220148
3,0.155515,0.096555,0.241066,vectorsearch_multifield,622.987846,2024-06-23 12:50:09.220148
4,0.167586,0.10611,0.254867,vectorsearch_multifield_tuned,662.439398,2024-06-23 12:50:09.220148
5,0.238384,0.16287,0.324941,hybrid,662.608457,2024-06-23 13:25:22.725108
6,0.25135,0.170474,0.342187,hybrid_boosted,716.31887,2024-06-23 13:25:22.725108
0,0.260202,0.174498,0.361499,rrf,4026.238381,2024-06-25 14:29:42.651482


In [12]:
# Create query result dictionaries from RRF queries

filename = "query_runs/run_rrf_multi.json"

if os.path.isfile(filename):
    with open(filename, "r") as file:
        run_rm = json.load(file)
else:
    run_rm = {}

    start_time = time.time()

    boost_values = {
        "title_boost": 8,
        "description_boost": 2,
        "attributes_boost": 1,
        "product_title_vector_boost": 1,
        "product_description_vector_boost": 1,
        "product_attributes_string_vector_boost": 1}
    num_query_results = 50
    k = 60

    for index, row in tqdm(relevant_queries.iterrows(), total = len(relevant_queries)):
        search_term = row['search_term']
        search_vector = row['query_embedding']
        search_query_id = row['query_id']

        results = query_elasticsearch_rrf_multi(es, 'products-embeddings',
                                          search_text=search_term,
                                          search_vector=search_vector,
                                          num_results=10,
                                          num_query_results=num_query_results,
                                          k=k,
                                          boost_values = boost_values,)
 
        query_doc_dict = {}
        for index, row in results.iterrows():
            query_doc_dict[str(row['product_uid'])] = row['rrf_score']
        
        run_rm[search_query_id] = query_doc_dict

    end_time = time.time()

    with open(filename, "w") as file:
        json.dump(run_rm, file)

100%|██████████| 11795/11795 [1:56:53<00:00,  1.68it/s] 


## Evaluate Results

In [13]:
# Load ground truth
filename = "query_runs/qrel.json"

with open(filename, "r") as file:
    qrel = json.load(file)

In [14]:
# Place to store results and initialize an evaluator

ranking_results = []
measures = {'map_cut_10', 'ndcg_cut_10', 'recip_rank'} 
evaluator = pytrec_eval.RelevanceEvaluator(qrel, measures)

In [15]:
# Evaluate rrf queries

results_df = pd.DataFrame(evaluator.evaluate(run_rm))
results_dict = {}
for measure in measures:
    results_dict['mean '+measure] = results_df.loc[measure].mean()
results_df = pd.DataFrame(results_dict, index=[0])

results_df['run_name'] = 'rrf_multi'
results_df['run_time'] = end_time - start_time
results_df['run_timestamp'] = current_timestamp

ranking_results.append(results_df)

In [16]:
# Put the results into a dataframe, add to previous results

ranking_results = pd.concat(ranking_results)

all_results = pd.read_csv('query_runs/query_results.csv')
all_results = pd.concat([all_results, ranking_results])
all_results

Unnamed: 0,mean ndcg_cut_10,mean map_cut_10,mean recip_rank,run_name,run_time,run_timestamp
0,0.170037,0.112637,0.26115,textsearch,178.50484,2024-05-06 20:30:09.220148
1,0.217595,0.149411,0.31771,textsearch_boosted,207.407446,2024-05-06 20:30:09.220148
2,0.236762,0.159441,0.330878,vectorsearch,509.612543,2024-06-23 12:50:09.220148
3,0.155515,0.096555,0.241066,vectorsearch_multifield,622.987846,2024-06-23 12:50:09.220148
4,0.167586,0.10611,0.254867,vectorsearch_multifield_tuned,662.439398,2024-06-23 12:50:09.220148
5,0.238384,0.16287,0.324941,hybrid,662.608457,2024-06-23 13:25:22.725108
6,0.25135,0.170474,0.342187,hybrid_boosted,716.31887,2024-06-23 13:25:22.725108
0,0.226787,0.148293,0.327549,rrf_multi,7013.143374,2024-06-25 15:52:18.000973


In [17]:
all_results.to_csv('query_runs/query_results.csv', index=False)