In [1]:
import ast
import datetime
import json
import os
import time

from elasticsearch import Elasticsearch
import numpy as np
import pandas as pd
import pytrec_eval
import torch
from tqdm import tqdm

from query_functions import query_elasticsearch_rrf, query_elasticsearch_hybrid

current_timestamp = datetime.datetime.now()


In [2]:
# Connect to local elastic

es = Elasticsearch('http://localhost:9200') 
es.ping()

True

# Load Data

In [3]:
def convert_to_dict(string):
    try:
        return ast.literal_eval(string)
    except (SyntaxError, ValueError):
        return None  # Handle cases where the string is not a valid dictionary representation


In [4]:
def dict_to_string(my_dict):
  result_str = ""
  for key, value in my_dict.items():
    result_str += str(key) + ' ' + str(value) + ' '
  return result_str

In [5]:
# Products

filename = "processed_data/df_prods.csv"

if os.path.isfile(filename):
    df_prods = pd.read_csv(filename)
    df_prods['product_attributes'] = df_prods['product_attributes'].apply(convert_to_dict)

else:
    print('Cannot locate file')

df_prods.head()

Unnamed: 0,product_uid,product_title,product_description,product_attributes
0,100001,Simpson Strong-Tie 12-Gauge Angle,"Not only do angles make joints stronger, they ...",{'Bullet01': 'Versatile connector for various ...
1,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,BEHR Premium Textured DECKOVER is an innovativ...,"{'Application Method': 'Brush,Roller,Spray', '..."
2,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,Update your bathroom with the Delta Vero Singl...,"{'Bath Faucet Type': 'Combo Tub and Shower', '..."
3,100006,Whirlpool 1.9 cu. ft. Over the Range Convectio...,Achieving delicious results is almost effortle...,"{'Appliance Type': 'Over the Range Microwave',..."
4,100007,Lithonia Lighting Quantum 2-Light Black LED Em...,The Quantum Adjustable 2-Light LED Black Emerg...,"{'Battery Power Type': 'Ni-Cad', 'Battery Size..."


In [6]:
df_prods['product_attributes_string'] = [dict_to_string(x) if x is not None else x for x in df_prods['product_attributes']]
df_prods['product_text_string'] = df_prods['product_title'].fillna('') + ' ' + df_prods['product_description'].fillna('') + ' ' + df_prods['product_attributes_string'].fillna('') 

In [7]:
# Query

filename = "processed_data/df_queries.csv"

if os.path.isfile(filename):
    df_queries = pd.read_csv(filename)
    
else:
    print('Cannot locate file')

df_queries.head()

Unnamed: 0,search_term,query_id,has_relevant_results
0,angle bracket,6e0a07626e48aee6f7ce9ec6cd753426d6acafded1598f...,1
1,l bracket,5863e75dfdc9ae5db3f6b4dbddf129d5568e085bf57711...,1
2,deck over,406b3569b2db043604fdb42a67f4ec49964a5ff07cddf0...,1
3,rain shower head,49b2dc56a0e1945c435c1579c07df519878619e3e8d59d...,1
4,shower only faucet,7620551bacb6cdddca5f33ec0943cea7971095a1e9be06...,1


In [8]:
# Relevance

filename = "processed_data/df_relevance.csv"

if os.path.isfile(filename):
    df_relevance = pd.read_csv(filename)
    
else:
    print('Cannot locate file')

df_relevance.head()

Unnamed: 0,query_id,product_uid,relevance
0,6e0a07626e48aee6f7ce9ec6cd753426d6acafded1598f...,100001,3.0
1,5863e75dfdc9ae5db3f6b4dbddf129d5568e085bf57711...,100001,2.5
2,406b3569b2db043604fdb42a67f4ec49964a5ff07cddf0...,100002,3.0
3,49b2dc56a0e1945c435c1579c07df519878619e3e8d59d...,100005,2.33
4,7620551bacb6cdddca5f33ec0943cea7971095a1e9be06...,100005,2.67


# Create Embeddings

In [9]:
from sentence_transformers import SentenceTransformer

device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

model = SentenceTransformer("/kaggle/input/homedepot_finetune/transformers/hd-fintune-testdata/1/finetuned", trust_remote_code=True)

print(model)

In [10]:
# Embeddings were run on Kaggle for access GPU

# Product Embeddings
prod_cols = ['product_text_string']

for col in tqdm(prod_cols):

    print(f'reading {col}') 

    p_embeddings = pd.read_csv(f'processed_data\df_prods_{col}_ft_embeddings.csv')
    p_embeddings.set_index('Unnamed: 0', drop=True, inplace=True)
    p_embeddings.index.name = None
    p_embeddings.columns = [col+'_embedding']

    print(f'evaling {col} embeddings')
    if col =='product_attributes_string':
        p_embeddings[col+'_embedding'] = p_embeddings[col+'_embedding'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else None)
    else:
        p_embeddings[col+'_embedding'] = p_embeddings[col+'_embedding'].apply(ast.literal_eval)

    df_prods = df_prods.merge(p_embeddings, how = 'left', left_index=True, right_index=True)


In [12]:
del p_embeddings

In [13]:
query_embeddings = np.load('processed_data/df_queries_ft_embeddings.npy')
df_queries['query_embedding'] = [embedding.tolist() for embedding in query_embeddings]
df_queries.head()

Unnamed: 0,search_term,query_id,has_relevant_results,query_embedding
0,angle bracket,6e0a07626e48aee6f7ce9ec6cd753426d6acafded1598f...,1,"[-0.29237082600593567, 0.2657821476459503, -0...."
1,l bracket,5863e75dfdc9ae5db3f6b4dbddf129d5568e085bf57711...,1,"[0.18135201930999756, -0.363744854927063, -0.2..."
2,deck over,406b3569b2db043604fdb42a67f4ec49964a5ff07cddf0...,1,"[-0.5091783404350281, -0.25584468245506287, 0...."
3,rain shower head,49b2dc56a0e1945c435c1579c07df519878619e3e8d59d...,1,"[0.20482461154460907, 0.24850228428840637, 0.2..."
4,shower only faucet,7620551bacb6cdddca5f33ec0943cea7971095a1e9be06...,1,"[0.22215023636817932, 0.3353525698184967, 0.01..."


# Load Into Index

In [14]:
# Initialize the index

index_name = 'products-finetune-embeddings'

mapping = {
    "properties": {
        "product_uid": {
            "type": "integer"
        },
        "product_title": {
            "type": "text"
        },
        "product_description": {
            "type": "text"
        },
        "product_attributes": {
            "type": "nested",
            "properties": {
                "name": {
                    "type": "text"
                },
                "value": {
                    "type": "text"
                },
                "name_value": {
                    "type": "text"
                },
            }
        },
        "product_text_string_vector": {
            "type": "dense_vector",
            "dims": 768
        },
        "query_scores": {
            "type": "nested",
            "properties": {
                "query_id": {
                    "type": "text"
                },
                "relevance": {
                    "type": "float"
                },
            }
        }
    }
}

#es.indices.create(index=index_name, mappings=mapping)

In [16]:
# Create documents and index them right in
for index, row in tqdm(df_prods.iterrows(), total=len(df_prods)):
    
    query_scores = []
    tmp_query = df_relevance[(df_relevance['product_uid']==row['product_uid'])]
    if len(tmp_query)>0:
        tmp_query = tmp_query.replace(pd.NA, '', regex=True)
        for index_q, row_q in tmp_query.iterrows():
            query_scores.append({'query_id': row_q['query_id'],
                                 'relevance': row_q['relevance']})

    product_attributes = []
    tmp_attr = row['product_attributes']
    if not pd.isnull(tmp_attr):
        for k in tmp_attr.keys():
            product_attributes.append({'name': k,
                                       'value': tmp_attr[k],
                                       'name_value': str(k) + ' ' + str(tmp_attr[k])})

    tmp_doc = {
        'product_uid': row['product_uid'],
        'product_title': row['product_title'],
        'product_description': row['product_description'],
        'product_attributes': product_attributes,
        'query_scores': query_scores
    }

    # if isinstance(row['product_title_embedding'], list):
    #     tmp_doc['product_title_vector'] = row['product_title_embedding']

    # if isinstance(row['product_description_embedding'], list):
    #     tmp_doc['product_description_vector'] = row['product_description_embedding']

    # if isinstance(row['product_attributes_string_embedding'], list):
    #     tmp_doc['product_attributes_string_vector'] = row['product_attributes_string_embedding']

    if isinstance(row['product_text_string_embedding'], list):
        tmp_doc['product_text_string_vector'] = row['product_text_string_embedding']

    #product_document_list.append(tmp_doc)
    es.index(index=index_name, document=tmp_doc)

# KNN Queries

## Sample Query

In [17]:
# A sample query

search_vector = df_queries['query_embedding'][0]

results = query_elasticsearch_hybrid(es, index_name, search_vector=search_vector, num_results=10)

hits = pd.DataFrame(results['hits']['hits'])

In [18]:

product_uids = []
product_titles = []
product_descriptions = []
product_attributes = []
query_id_list = []
relevances = []
for h in hits['_source']:
    product_uids.append(h['product_uid'])
    product_titles.append(h['product_title'])
    product_descriptions.append(h['product_description'])
    product_attributes.append(h['product_attributes'])
    
    if len(h['query_scores'])>0:
        query_id_list.append(h['query_scores'][0]['query_id'])
        relevances.append(h['query_scores'][0]['relevance'])
    else:
        query_id_list.append(None)
        relevances.append(None)

hits['product_uid'] = product_uids
hits['product_title'] = product_titles
hits['product_description'] = product_descriptions
hits['product_attribute'] = product_attributes
hits['query_id'] = query_id_list
hits['relevance'] = relevances

hits.head()


Unnamed: 0,_index,_id,_score,_source,product_uid,product_title,product_description,product_attribute,query_id,relevance
0,products-finetune-embeddings,1S7LnZABMcp9Mdda8abR,0.84119,"{'product_uid': 168743, 'product_title': 'Lock...",168743,LockState Angle L Bracket for 600 lb.,LS-3320 angle l bracket for LS-600S. Used to m...,"[{'name': 'Bullet01', 'value': 'Satin finish',...",,
1,products-finetune-embeddings,_S3AnZABMcp9Mddaxl18,0.828835,"{'product_uid': 101370, 'product_title': 'Ever...",101370,Everbilt 3 in. Zinc-Plated Corner Brace (4-Pack),The Everbilt 3 in. Corner Braces (4-Pack) are ...,"[{'name': 'Assembled Depth (in.)', 'value': '0...",6e0a07626e48aee6f7ce9ec6cd753426d6acafded1598f...,3.0
2,products-finetune-embeddings,uS7KnZABMcp9MddagnkH,0.827651,"{'product_uid': 146187, 'product_title': 'Ever...",146187,Everbilt 2 in. Galvanized Corner Brace (4-Pack),The Everbilt 3/5 in. x 2 in. Corner Braces (4-...,"[{'name': 'Assembled Depth (in.)', 'value': '0...",,
3,products-finetune-embeddings,CC_QnZABMcp9MddanzDH,0.827163,"{'product_uid': 220186, 'product_title': 'Ever...",220186,Everbilt 4 in. Zinc-Plated Corner Brace (2-Pack),The Everbilt 4 in. Corner Braces (2-Pack) are ...,"[{'name': 'Assembled Depth (in.)', 'value': '0...",,
4,products-finetune-embeddings,Ci3AnZABMcp9MddasVz7,0.8266,"{'product_uid': 100739, 'product_title': 'Ever...",100739,Everbilt 1 in. Zinc-Plated Corner Brace (20-Pack),The Everbilt 1 in. Corner Braces (20-Pack) are...,"[{'name': 'Assembled Depth (in.)', 'value': '0...",6e0a07626e48aee6f7ce9ec6cd753426d6acafded1598f...,3.0


## Run the vector queries

In [19]:
# Filter down to queries that have results

relevant_queries = df_queries[df_queries['has_relevant_results']==1]

In [20]:
# Create query result dictionaries

filename = "query_runs/run_vector_finetune.json"

if os.path.isfile(filename):
    with open(filename, "r") as file:
        run_vu = json.load(file)
else:
    start_time = time.time()
    run_vu = {}

    for index,row in tqdm(relevant_queries.iterrows(), total=len(relevant_queries)):
        search_vector = row['query_embedding']
        search_query_id = row['query_id']

        results = query_elasticsearch_hybrid(es, index_name, search_vector=search_vector, num_results=10)
        hits = pd.DataFrame(results['hits']['hits'])

        query_doc_dict = {}
        for index, row in hits.iterrows():
            query_doc_dict[str(row['_source']['product_uid'])] = row['_score']
        
        run_vu[search_query_id] = query_doc_dict

    end_time = time.time() 

    with open(filename, "w") as file:
        json.dump(run_vu, file)

100%|██████████| 11795/11795 [03:23<00:00, 57.98it/s]


## Evaluate results

In [21]:
# Load ground truth
filename = "query_runs/qrel.json"

with open(filename, "r") as file:
    qrel = json.load(file)

In [22]:
# Place to store results and initialize an evaluator

ranking_results = []
measures = {'map_cut_10', 'ndcg_cut_10', 'recip_rank'} 
evaluator = pytrec_eval.RelevanceEvaluator(qrel, measures)

In [23]:
# Evaluate vector queries

results_df = pd.DataFrame(evaluator.evaluate(run_vu))
results_dict = {}
for measure in measures:
    results_dict['mean '+measure] = results_df.loc[measure].mean()
results_df = pd.DataFrame(results_dict, index=[0])

results_df['run_name'] = 'vectorsearch_finetune'
results_df['run_time'] = end_time - start_time
results_df['run_timestamp'] = current_timestamp

ranking_results.append(results_df)

In [24]:
# Put the results into a dataframe, add to previous results

ranking_results = pd.concat(ranking_results)

all_results = pd.read_csv('query_runs/query_results.csv')
all_results = pd.concat([all_results, ranking_results])
all_results

Unnamed: 0,mean ndcg_cut_10,mean map_cut_10,mean recip_rank,run_name,run_time,run_timestamp
0,0.170037,0.112637,0.26115,textsearch,178.50484,2024-05-06 20:30:09.220148
1,0.217595,0.149411,0.31771,textsearch_boosted,207.407446,2024-05-06 20:30:09.220148
2,0.236762,0.159441,0.330878,vectorsearch,509.612543,2024-06-23 12:50:09.220148
3,0.155515,0.096555,0.241066,vectorsearch_multifield,622.987846,2024-06-23 12:50:09.220148
4,0.167586,0.10611,0.254867,vectorsearch_multifield_tuned,662.439398,2024-06-23 12:50:09.220148
5,0.238384,0.16287,0.324941,hybrid,662.608457,2024-06-23 13:25:22.725108
6,0.25135,0.170474,0.342187,hybrid_boosted,716.31887,2024-06-23 13:25:22.725108
7,0.260202,0.170474,0.342187,rrf,4026.238381,2024-06-25 14:29:42.651482
8,0.226787,0.148293,0.327549,rrf_multi,7013.143374,2024-06-25 15:52:18.000973
0,0.275745,0.18204,0.38229,vectorsearch_finetune,203.419367,2024-07-10 14:36:51.463666


## Run the RRF queries with finetuned vectors

In [25]:
# Create query result dictionaries from RRF queries

filename = "query_runs/run_rrf_finetune.json"

if os.path.isfile(filename):
    with open(filename, "r") as file:
        run_r = json.load(file)
else:
    run_r = {}

    start_time = time.time()

    boost_values = {
        "title_boost": 8,
        "description_boost": 2,
        "attributes_boost": 1,
        "product_text_string_vector_boost": 1,
        }
    num_query_results = 50
    k = 60

    for index, row in tqdm(relevant_queries.iterrows(), total = len(relevant_queries)):
        search_term = row['search_term']
        search_vector = row['query_embedding']
        search_query_id = row['query_id']

        results = query_elasticsearch_rrf(es, index_name,
                                          search_text=search_term,
                                          search_vector=search_vector,
                                          num_results=10,
                                          num_query_results=num_query_results,
                                          k=k,
                                          boost_values = boost_values,)
 
        query_doc_dict = {}
        for index, row in results.iterrows():
            query_doc_dict[str(row['product_uid'])] = row['rrf_score']
        
        run_r[search_query_id] = query_doc_dict

    end_time = time.time()

    with open(filename, "w") as file:
        json.dump(run_r, file)

100%|██████████| 11795/11795 [19:35<00:00, 10.03it/s]


In [26]:
# Evaluate vector queries

results_df = pd.DataFrame(evaluator.evaluate(run_r))
results_dict = {}
for measure in measures:
    results_dict['mean '+measure] = results_df.loc[measure].mean()
results_df = pd.DataFrame(results_dict, index=[0])

results_df['run_name'] = 'rrf_finetune'
results_df['run_time'] = end_time - start_time
results_df['run_timestamp'] = current_timestamp

all_results = pd.concat([all_results, results_df])
all_results

Unnamed: 0,mean ndcg_cut_10,mean map_cut_10,mean recip_rank,run_name,run_time,run_timestamp
0,0.170037,0.112637,0.26115,textsearch,178.50484,2024-05-06 20:30:09.220148
1,0.217595,0.149411,0.31771,textsearch_boosted,207.407446,2024-05-06 20:30:09.220148
2,0.236762,0.159441,0.330878,vectorsearch,509.612543,2024-06-23 12:50:09.220148
3,0.155515,0.096555,0.241066,vectorsearch_multifield,622.987846,2024-06-23 12:50:09.220148
4,0.167586,0.10611,0.254867,vectorsearch_multifield_tuned,662.439398,2024-06-23 12:50:09.220148
5,0.238384,0.16287,0.324941,hybrid,662.608457,2024-06-23 13:25:22.725108
6,0.25135,0.170474,0.342187,hybrid_boosted,716.31887,2024-06-23 13:25:22.725108
7,0.260202,0.170474,0.342187,rrf,4026.238381,2024-06-25 14:29:42.651482
8,0.226787,0.148293,0.327549,rrf_multi,7013.143374,2024-06-25 15:52:18.000973
0,0.275745,0.18204,0.38229,vectorsearch_finetune,203.419367,2024-07-10 14:36:51.463666


In [27]:
all_results.to_csv('query_runs/query_results.csv', index=False)