### Imports

In [1]:
import pandas as pd
import psycopg2
import time
import os

from io import StringIO
from PIL import Image
# from transformers import CLIPModel, CLIPProcessor, AutoTokenizer, AutoModel
import sys
# Add the parent directory of 'code' to sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

print("Python search paths:", sys.path)  # Debugging line
from utils.db_connection import create_db_connection
from utils.generate_embeddings import generate_ollama_embeddings

Python search paths: ['/Users/bilge.ince/.pyenv/versions/3.10.4/lib/python310.zip', '/Users/bilge.ince/.pyenv/versions/3.10.4/lib/python3.10', '/Users/bilge.ince/.pyenv/versions/3.10.4/lib/python3.10/lib-dynload', '', '/Users/bilge.ince/.local/lib/python3.10/site-packages', '/Users/bilge.ince/.pyenv/versions/3.10.4/lib/python3.10/site-packages', '/Users/bilge.ince/projects']


In [2]:
conn = create_db_connection()

In [3]:
text_embeddings = generate_ollama_embeddings("summer outfit for thailand")
# 10.5s

In [None]:
with conn.cursor() as cursor:
    cursor.execute(
        f"""EXPLAIN ANALYSE SELECT img_id, 1-(embedding <=> '{text_embeddings}') AS score FROM products_embeddings_ollama ORDER BY embedding <=> '{text_embeddings}' LIMIT 10;"""
    )  # Create the database
    # 13.2 s
    cursor.fetchall()

[(43666, 0.45893446733961896),
 (13510, 0.4566795844783741),
 (39720, 0.4514953570806024),
 (43655, 0.4487756464496022),
 (30139, 0.44834972554618724),
 (30142, 0.44834972554618724),
 (30146, 0.44729700056278787),
 (30141, 0.44729700056278787),
 (22666, 0.44379710463770283),
 (24845, 0.4427767332231144)]

In [18]:
cursor.execute("""CREATE TABLE IF NOT EXISTS products_embeddings_pgvector (img_id INTEGER PRIMARY KEY REFERENCES products(img_id) ON DELETE CASCADE,
            embedding vector(4096));""")

In [None]:
cursor.execute("""CREATE INDEX ON products_embeddings_pgvector USING hnsw (embedding vector_l2_ops)
WITH (m = 16, ef_construction = 64);""")

In [None]:
# Hybrid search over Images in S3 bucket
hybrid_search_query = """WITH filtered_products AS (
            -- First get all men's products
            SELECT img_id, productdisplayname
            FROM products 
            WHERE gender = '{selected_gender}'
        )
        SELECT 
            result.key as id,
            fp.productdisplayname as description,
            result.distance as score
        FROM filtered_products fp
        CROSS JOIN LATERAL aidb.retrieve_key('{st.session_state.text_retriever_name}', '{text_query}', 40) AS result
        WHERE result.key = CONCAT(fp.img_id, '.jpg')
        ORDER BY score ASC LIMIT 5;"""

# Benchmarking Pgvector vs Vchord

In [3]:
conn = create_db_connection() # Connect to the database
conn.autocommit = True  # Enable autocommit for creating the database
cursor = conn.cursor()

## Pgvector

In [27]:
cursor.execute("""
        CREATE TABLE IF NOT EXISTS benchmark_embeddings_pgvec(
            img_id INTEGER PRIMARY KEY REFERENCES products(img_id) ON DELETE CASCADE,
            embedding vector(384));""")



cursor.execute("""CREATE INDEX ON benchmark_embeddings_pgvec USING hnsw (embedding vector_l2_ops)
WITH (m = 16, ef_construction = 64);""")



In [4]:
import time

from transformers import AutoTokenizer, AutoModel
# Load the model and processor
text_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
text_model = AutoModel.from_pretrained("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

## Embedding Generation in batches

In [None]:
# Initialize timing variables for overall function performance tracking
function_start_time = time.time()

# Fetch data from the database
fetch_start = time.time()
cursor = conn.cursor()
cursor.execute("SELECT img_id, productdisplayname FROM products;")
result = cursor.fetchall()
fetch_end = time.time()

batch_size = 1000
total_rows_inserted = 0

for i in range(0, len(result), batch_size):
    batch_data = [(row[0], row[1]) for row in result[i:i+batch_size] if row[1]]
    if not batch_data:
        continue

    batch_ids, batch_texts = zip(*batch_data)
    batch_texts = [text for text in batch_texts if text]

    if not batch_texts:
        continue

    text_inputs = text_tokenizer(list(batch_texts), return_tensors="pt", padding=True, truncation=True)
    text_model_output = text_model(**text_inputs)
    text_embeddings = text_model_output.last_hidden_state.mean(dim=1).squeeze().detach().cpu().numpy().tolist()

    with conn.cursor() as cursor:
        for idx, embedding in enumerate(text_embeddings):
            cursor.execute(
                "INSERT INTO benchmark_embeddings_pgvec (img_id, embedding) VALUES (%s, %s)",
                (batch_ids[idx], embedding)
            )
            total_rows_inserted += 1

function_end_time = time.time()
total_time = function_end_time - function_start_time

print(f"Total Rows: {total_rows_inserted}")
print(f"Total function execution time: {total_time} seconds")
print(f"Fetching time: {fetch_end - fetch_start} seconds")
# Total Rows: 44433
# Total function execution time: 309.5544641017914 seconds
# Embedding Insertion time: 308.15446400642395 seconds ~ 5 minutes
# Model loading time: 1.4021379947662354 seconds
# Fetching time: 0.04068493843078613 seconds


Total Rows: 44433
Total function execution time: 309.5544641017914 seconds
Model loading time: 1.4021379947662354 seconds
Fetching time: 0.04068493843078613 seconds


## QPS Measurement

In [5]:
def measure_qps(QUERY, iterations=100):
    conn = create_db_connection() # Connect to the database
    conn.autocommit = True  # Enable autocommit for creating the database
    cursor = conn.cursor()
    start_time = time.time()
    for _ in range(iterations):
        cursor.execute(QUERY)  # Create the database
        cursor.fetchall()
    elapsed_time = time.time() - start_time
    qps = iterations / elapsed_time
    cursor.close()
    conn.close()

    return qps

### Generate Text Embedding for the Sample Text

In [5]:
sample_text = "summer outfit for thailand"
text_inputs = text_tokenizer(sample_text, return_tensors="pt", padding=True, truncation=True)
text_model_output = text_model(**text_inputs)
text_embeddings = text_model_output.last_hidden_state.mean(dim=1).squeeze().detach().cpu().numpy().tolist()

In [11]:
print(text_embeddings)

[0.09742061048746109, 0.49918133020401, -0.03261876106262207, 0.2645748257637024, 0.18288983404636383, 0.4575275778770447, 0.14676101505756378, 0.3150811493396759, -0.517536997795105, -0.17801591753959656, 0.2541957497596741, -0.5055475831031799, 0.0705580860376358, 0.4711122512817383, 0.6484673619270325, -0.19533346593379974, 0.261547327041626, 0.186905637383461, 0.3899439871311188, -0.4635244905948639, 0.21711668372154236, -0.07681413739919662, 0.25348740816116333, 0.12105589359998703, -0.03783831745386124, -0.5198584198951721, 0.48314517736434937, 0.4928768575191498, 0.07294569164514542, -0.21590955555438995, -0.2320374697446823, -0.021451549604535103, -0.22492942214012146, 0.2875055968761444, 0.15024526417255402, 0.23689620196819305, -0.3735382556915283, -0.08480574190616608, 0.07344142347574234, 0.6083762049674988, -0.23195263743400574, -0.2714826762676239, -0.19037440419197083, -0.11485058814287186, 0.2223883420228958, 0.4921382963657379, 0.0323888435959816, -0.23294684290885925,

## PGConf DE Experiments

### Partial Index on pgvector

In [None]:
# QUERY = f"""SELECT img_id, 1-(embedding <-> '{text_embeddings}') AS score FROM benchmark_embeddings_pgvec ORDER BY embedding <-> '{text_embeddings}' LIMIT 10;"""
QUERY = f"""SELECT img_id,gender, productdisplayname , 1-(embeddings <-> '{text_embeddings}') AS score FROM product_vector WHERE gender='Men' ORDER BY score LIMIT 10""";
 
conn = create_db_connection() # Connect to the database
cursor = conn.cursor()


cursor.execute(QUERY)  # Create the database
cursor.fetchall()
# print(f"Queries Per Second (QPS): {qps:.2f}")

### QPS Measurement for pgvector

In [None]:
# QUERY = f"""SELECT img_id, 1-(embedding <-> '{text_embeddings}') AS score FROM benchmark_embeddings_pgvec ORDER BY embedding <-> '{text_embeddings}' LIMIT 10;"""
QUERY = f"""SELECT img_id,gender, productdisplayname , 1-(embeddings <-> '{text_embeddings}') AS score FROM product_vector WHERE gender='Men' ORDER BY score LIMIT 10""";
 
# Run the benchmark
iterations = 100
qps = measure_qps(QUERY, iterations)
print(f"Queries Per Second (QPS): {qps:.2f}")

[('14170', 'Men', 'Belmonte Men Rough Finish Black Belts', -5.024000802028648),
 ('17031', 'Men', 'Gas Men Black Wave Flip Flops', -5.010693557505351),
 ('25794', 'Men', 'Arrow Men Navy Blue Suspenders', -4.990658163298269),
 ('49867', 'Men', 'Q&Q Men Black Digital Watch', -4.980545930470298),
 ('53598', 'Men', 'Q&Q Men Black Digital Watch', -4.980545930470298),
 ('44126', 'Men', 'Boss Men After Shave', -4.9755864960828715),
 ('55869',
  'Women',
  'Colorbar All Fired Up 2 Nail Lacquer 83V',
  -4.973074579651758),
 ('40779', 'Men', 'Quiksilver Men Brown Flip Flops', -4.973047756339448),
 ('38003', 'Men', 'Q&Q Men Black Watch', -4.969948213950109),
 ('17380',
  'Men',
  'Reid & Taylor Men Check Multi Muffler',
  -4.9677183847536375)]

In [5]:
conn = create_db_connection() # Connect to the database
conn.autocommit = True  # Enable autocommit for creating the database
cursor = conn.cursor()

In [10]:
text_embeddings = generate_ollama_embeddings("Party dress for summer")

In [72]:
QUERY = f"""SELECT img_id, 1-(embedding <-> '{text_embeddings}') AS score FROM benchmark_embeddings_pgvec ORDER BY embedding <-> '{text_embeddings}' LIMIT 1000;"""
start_time = time.time()
cursor.execute(QUERY)  # Create the database
cursor.fetchall()
time.time() - start_time

0.007653951644897461

## Vchord

In [12]:
cursor.execute("""
        CREATE TABLE IF NOT EXISTS benchmark_embeddings_vchord_llama(
            img_id INTEGER PRIMARY KEY REFERENCES products_pgconf(img_id) ON DELETE CASCADE,
            embedding vector(4096));
    """)

cursor.execute("""CREATE INDEX ON benchmark_embeddings_vchord_llama USING vchordrq (embedding vector_l2_ops) WITH (options = $$
residual_quantization = false
[build.internal]
lists = [1000]
spherical_centroids = true
$$);""")

In [None]:
# Initialize timing variables for overall function performance tracking
function_start_time = time.time()

# Fetch data from the database
fetch_start = time.time()
cursor = conn.cursor()
cursor.execute("SELECT img_id, productdisplayname FROM products_pgconf;")
result = cursor.fetchall()
fetch_end = time.time()

batch_size = 1000
total_rows_inserted = 0

for i in range(0, len(result), batch_size):
    batch_data = [(row[0], row[1]) for row in result[i:i+batch_size] if row[1]]
    if not batch_data:
        continue

    batch_ids, batch_texts = zip(*batch_data)
    batch_texts = [text for text in batch_texts if text]

    if not batch_texts:
        continue

    text_embeddings = []
    for text in batch_texts:
        try:
            embedding = generate_ollama_embeddings(text)
            text_embeddings.append(embedding)
        except Exception as e:
            print(f"Embedding generation failed for text: {text[:30]}..., error: {e}")
            continue

    with conn.cursor() as cursor:
        for idx, embedding in enumerate(text_embeddings):
            cursor.execute(
                "INSERT INTO benchmark_embeddings_vchord_llama (img_id, embedding) VALUES (%s, %s)",
                (batch_ids[idx], embedding)
            )
            total_rows_inserted += 1

function_end_time = time.time()
total_time = function_end_time - function_start_time

print(f"Total Rows: {total_rows_inserted}")
print(f"Total function execution time: {total_time} seconds")
print(f"Fetching time: {fetch_end - fetch_start} seconds")

# Total Rows: 44433
# Total function execution time: 249.79474711418152 seconds
# Embedding Insertion time: 248.98874711990356 seconds ~ 4 minutes
# Model loading time: 1.8063030242919922 seconds
# Fetching time: 0.04993581771850586 seconds

### QPS Vchord

In [None]:
conn = create_db_connection() # Connect to the database
conn.autocommit = True  # Enable autocommit for creating the database
cursor = conn.cursor()

In [8]:
QUERY = f""" SELECT img_id, productdisplayname FROM products_pgconf LIMIT 10;"""
cursor.execute(QUERY)  # Create the database
cursor.fetchall()

[(15970, 'Turtle Check Men Navy Blue Shirt'),
 (39386, 'Peter England Men Party Blue Jeans'),
 (59263, 'Titan Women Silver Watch'),
 (21379, 'Manchester United Men Solid Black Track Pants'),
 (53759, 'Puma Men Grey T-shirt'),
 (1855, 'Inkfruit Mens Chain Reaction T-shirt'),
 (30805, 'Fabindia Men Striped Green Shirt'),
 (26960, 'Jealous 21 Women Purple Shirt'),
 (29114, 'Puma Men Pack of 3 Socks'),
 (30039, 'Skagen Men Black Watch')]

In [11]:
QUERY = f"""SELECT img_id, 1-(embedding <-> '{text_embeddings}') AS score FROM benchmark_embeddings_vchord_llama ORDER BY embedding <-> '{text_embeddings}' LIMIT 10;"""
# Run the benchmark
iterations = 100
# qps = measure_qps(QUERY, iterations)
# print(f"Queries Per Second (QPS): {qps:.2f}")
cursor.execute(QUERY)  # Create the database
cursor.fetchall()


[(57042, 0.01935138300493011),
 (57043, 0.01935138300493011),
 (38329, -0.0010694507205859516),
 (5772, -0.008079552569072446),
 (9519, -0.012507186675762894),
 (27010, -0.01214790815632738),
 (17620, -0.01648822192917687),
 (9453, -0.01655278010958927),
 (27891, -0.02644831285075555),
 (41322, -0.02745206428701752)]

In [73]:
QUERY = f"""SELECT img_id, 1-(embedding <-> '{text_embeddings}') AS score FROM benchmark_embeddings_vchord2 ORDER BY embedding <-> '{text_embeddings}' LIMIT 1000;"""
start_time = time.time()
cursor.execute(QUERY)  # Create the database
cursor.fetchall()
time.time() - start_time

0.01081395149230957

In [68]:
QUERY = f"""EXPLAIN ANALYZE SELECT img_id, 1-(embedding <-> '{text_embeddings}') AS score FROM benchmark_embeddings_vchord2 ORDER BY embedding <-> '{text_embeddings}' LIMIT 10;"""
cursor.execute(QUERY)  # Create the database
cursor.fetchall()

[('Limit  (cost=0.00..2.18 rows=10 width=20) (actual time=1.370..1.914 rows=10 loops=1)',),
 ('  ->  Index Scan using benchmark_embeddings_vchord2_embedding_idx on benchmark_embeddings_vchord2  (cost=0.00..9667.58 rows=44433 width=20) (actual time=1.367..1.888 rows=10 loops=1)',),
 ("        Order By: (embedding <-> '[0.09742063,0.4991813,-0.032618724,0.2645749,0.18288991,0.45752755,0.14676102,0.31508112,-0.51753694,-0.17801586,0.25419584,-0.50554764,0.070558235,0.47111228,0.6484674,-0.19533338,0.26154727,0.18690552,0.38994405,-0.46352467,0.21711674,-0.07681413,0.25348747,0.12105579,-0.037838425,-0.5198583,0.4831451,0.49287698,0.07294557,-0.21590959,-0.23203763,-0.021451745,-0.22492945,0.2875056,0.15024522,0.23689626,-0.37353808,-0.084805526,0.07344134,0.6083763,-0.23195252,-0.27148265,-0.19037423,-0.11485064,0.22238834,0.4921384,0.032388713,-0.232947,-0.46515298,0.47612762,0.50035065,-0.49673364,-0.31666833,-0.38258243,0.104849204,0.0826336,-0.042856835,0.16741274,-0.20582621,-0.44907

In [69]:
QUERY = f"""EXPLAIN ANALYZE SELECT img_id, 1-(embedding <-> '{text_embeddings}') AS score FROM benchmark_embeddings_pgvec ORDER BY embedding <-> '{text_embeddings}' LIMIT 10;"""
cursor.execute(QUERY)  # Create the database
cursor.fetchall()

[('Limit  (cost=72.60..80.78 rows=10 width=20) (actual time=1.579..1.607 rows=10 loops=1)',),
 ('  ->  Index Scan using benchmark_embeddings_pgvec_embedding_idx on benchmark_embeddings_pgvec  (cost=72.60..36398.18 rows=44433 width=20) (actual time=1.576..1.602 rows=10 loops=1)',),
 ("        Order By: (embedding <-> '[0.09742063,0.4991813,-0.032618724,0.2645749,0.18288991,0.45752755,0.14676102,0.31508112,-0.51753694,-0.17801586,0.25419584,-0.50554764,0.070558235,0.47111228,0.6484674,-0.19533338,0.26154727,0.18690552,0.38994405,-0.46352467,0.21711674,-0.07681413,0.25348747,0.12105579,-0.037838425,-0.5198583,0.4831451,0.49287698,0.07294557,-0.21590959,-0.23203763,-0.021451745,-0.22492945,0.2875056,0.15024522,0.23689626,-0.37353808,-0.084805526,0.07344134,0.6083763,-0.23195252,-0.27148265,-0.19037423,-0.11485064,0.22238834,0.4921384,0.032388713,-0.232947,-0.46515298,0.47612762,0.50035065,-0.49673364,-0.31666833,-0.38258243,0.104849204,0.0826336,-0.042856835,0.16741274,-0.20582621,-0.44907

In [None]:
EXPLAIN ANALYZE SELECT id, 1-(embeddings <-> '[0.09742061048746109, 0.49918133020401, -0.03261876106262207, 0.2645748257637024, 0.18288983404636383, 0.4575275778770447, 0.14676101505756378, 0.3150811493396759, -0.517536997795105, -0.17801591753959656, 0.2541957497596741, -0.5055475831031799, 0.0705580860376358, 0.4711122512817383, 0.6484673619270325, -0.19533346593379974, 0.261547327041626, 0.186905637383461, 0.3899439871311188, -0.4635244905948639, 0.21711668372154236, -0.07681413739919662, 0.25348740816116333, 0.12105589359998703, -0.03783831745386124, -0.5198584198951721, 0.48314517736434937, 0.4928768575191498, 0.07294569164514542, -0.21590955555438995, -0.2320374697446823, -0.021451549604535103, -0.22492942214012146, 0.2875055968761444, 0.15024526417255402, 0.23689620196819305, -0.3735382556915283, -0.08480574190616608, 0.07344142347574234, 0.6083762049674988, -0.23195263743400574, -0.2714826762676239, -0.19037440419197083, -0.11485058814287186, 0.2223883420228958, 0.4921382963657379, 0.0323888435959816, -0.23294684290885925, -0.46515294909477234, 0.47612762451171875, 0.5003504157066345, -0.4967336058616638, -0.3166685104370117, -0.3825823664665222, 0.10484935343265533, 0.0826335996389389, -0.04285689815878868, 0.16741278767585754, -0.20582620799541473, -0.44907599687576294, -0.19572339951992035, 0.1849067509174347, -0.17568647861480713, 0.17995236814022064, -0.8226770162582397, -0.18839490413665771, -0.2881605327129364, -0.1557323932647705, 0.04663849622011185, -0.3750746548175812, -0.03378496691584587, -0.20285938680171967, 0.39729684591293335, 0.14579446613788605, -0.08095365017652512, 0.24584898352622986, -0.2795751988887787, 0.07985680550336838, -0.050914112478494644, -0.23840458691120148, 0.2677540183067322, -0.13852332532405853, 0.29585808515548706, 0.00037054717540740967, 0.2931496798992157, -0.07576090097427368, 0.08842658996582031, -0.205800399184227, -0.18899250030517578, 0.05351603031158447, 0.0211434755474329, 0.03480493277311325, -0.0491352379322052, 0.19342274963855743, -0.05919945612549782, 0.012879066169261932, -0.18481504917144775, 0.0847981721162796, 0.024114975705742836, 0.6506922841072083, 0.2830871641635895, -0.011874520219862461, 0.0783054381608963, 0.37222686409950256, -0.30468398332595825, -0.1249045804142952, 0.27261027693748474, 0.04294919967651367, 0.10253135114908218, 0.09791062772274017, -0.40885213017463684, 0.2269219607114792, -0.1324959248304367, -0.31449365615844727, -0.5837432742118835, -0.11458013951778412, -0.31908485293388367, 0.0749528631567955, -0.22645001113414764, 0.3527763783931732, 0.03525138646364212, -0.2134314328432083, 0.20275351405143738, -0.30526605248451233, -0.20725658535957336, 0.03189234808087349, 0.257747083902359, 0.06441027671098709, -0.2354181855916977, 0.07375039160251617, 0.29976436495780945, 0.14016787707805634, 0.24771352112293243, 0.11568445712327957, 0.23281040787696838, -0.6697320938110352, -0.14333605766296387, -0.22851133346557617, -0.07686376571655273, -0.2929718792438507, -0.4832497239112854, 0.18909795582294464, 0.23915229737758636, 0.06444760411977768, 0.3612528443336487, -0.28061744570732117, -0.34963110089302063, 0.1341032236814499, -0.12405551224946976, 0.14210402965545654, -0.042316678911447525, -0.08450962603092194, -0.1054842621088028, 0.0445297434926033, 0.2207842320203781, 0.3338475525379181, 0.03512226417660713, -0.10965047776699066, 0.2541898190975189, -0.12353336811065674, -0.05699590966105461, -0.008258900605142117, -0.6047337651252747, -0.21312212944030762, -0.8504300117492676, -0.2758425176143646, 0.07257810980081558, -0.1454063355922699, 0.37744468450546265, 0.12360472977161407, -0.26687148213386536, 0.13603074848651886, -0.1675504744052887, 0.33096781373023987, 0.12106738239526749, 0.41872525215148926, 0.16633501648902893, -0.11639005690813065, -0.5440191030502319, -0.04950413480401039, -0.13078364729881287, 0.02122735045850277, 0.3775106966495514, -0.05092832073569298, 0.8121853470802307, -0.30769094824790955, -0.2251589298248291, -0.10606713593006134, 0.6708601713180542, 0.16386376321315765, -0.07680841535329819, -0.23243831098079681, 0.2773345708847046, -0.19003595411777496, 0.25410205125808716, 0.08244002610445023, -0.2042507827281952, -0.29764920473098755, 0.08964954316616058, -0.2617628574371338, -0.14929327368736267, 0.16900435090065002, 0.03733310475945473, -0.4650423228740692, 0.6575135588645935, 0.10821462422609329, 0.17851734161376953, -0.09836733341217041, -0.13090252876281738, 0.5817781686782837, 0.08921795338392258, 0.33738213777542114, 0.027142737060785294, -0.19084562361240387, 0.3151596188545227, 0.2705049216747284, 0.49612754583358765, -0.17094863951206207, 0.11387448757886887, -0.26544225215911865, -0.0679672509431839, -0.2990885078907013, -0.11352521926164627, -0.31062254309654236, -0.03342089429497719, 0.1407465785741806, 0.1589237004518509, -0.12846295535564423, 0.7312979102134705, -0.33030885457992554, 0.06577388197183609, -0.023090535774827003, 0.07473407685756683, 0.21322713792324066, -0.26613375544548035, -0.21842963993549347, 0.16432499885559082, -0.3307705223560333, 0.6663755178451538, 0.12737169861793518, 0.042011115700006485, 0.6635159254074097, 0.23050925135612488, 0.3029860854148865, 0.5909188389778137, -0.2663329243659973, -0.2242400348186493, -0.2894763648509979, -0.35085517168045044, 0.31769174337387085, -0.14334890246391296, -0.057607702910900116, -0.35041096806526184, -0.03047189675271511, 0.07677770406007767, -0.49500247836112976, 0.25333085656166077, 0.4992930591106415, -0.6854051351547241, -0.11764056980609894, -0.035133134573698044, -0.09737343341112137, -0.12262594699859619, 0.13834674656391144, 0.404145747423172, -0.6062812805175781, -0.0983092412352562, -0.007142066955566406, 0.07905064523220062, -0.6069747805595398, 0.03486069664359093, -0.48754364252090454, 0.18549667298793793, -0.47465118765830994, 0.005454686004668474, 0.20652072131633759, -0.33504706621170044, -0.04989254102110863, -0.01778552122414112, 0.28419211506843567, 0.01871858723461628, -0.002907342044636607, 0.628482460975647, -0.11766643077135086, 0.0694955363869667, -0.07737861573696136, -0.2639698088169098, 0.057474713772535324, -0.19631923735141754, 0.12694032490253448, 0.6070435643196106, 0.4372637867927551, -0.14021356403827667, -0.6632922291755676, -0.010569900274276733, -0.025874817743897438, 0.14566883444786072, 0.4095064699649811, 0.04999594762921333, -0.19702740013599396, 0.34836164116859436, 0.6217816472053528, 0.5985726714134216, 0.08155538141727448, -0.23967719078063965, -0.5381646156311035, -0.2985968291759491, -0.23037953674793243, 0.4289696514606476, -0.13720981776714325, -0.2953493893146515, 0.04264624044299126, -0.6227273344993591, 0.09450741112232208, 0.3377222716808319, 0.25773605704307556, 0.2412605583667755, 0.48334187269210815, 0.012269268743693829, -0.3037653863430023, -0.1127106100320816, -0.4580943286418915, 0.1458543837070465, 0.5470634698867798, -0.15304161608219147, -0.00611581327393651, 0.010426853783428669, -0.4795088469982147, 0.03731180354952812, 0.24375446140766144, -0.09793317317962646, -0.026712587103247643, 0.5632129311561584, 0.4737153947353363, -0.010395048186182976, 0.10017663985490799, -0.1694290190935135, 0.44417962431907654, -0.07020236551761627, -0.30777058005332947, 0.021759631112217903, 0.3220398724079132, 0.14791782200336456, -0.18612386286258698, -0.3011143207550049, -0.2600925862789154, -0.1899072825908661, 0.058319222182035446, 0.2052849978208542, -0.027517450973391533, 0.04485439509153366, -0.01156635768711567, 0.10609831660985947, -0.42378607392311096, -0.31804895401000977, -0.3698336184024811, -0.11658240854740143, -0.08494962006807327, -0.14628545939922333, 0.49972718954086304, -0.12965838611125946, -0.27054521441459656, 0.07990990579128265, 0.2490413635969162, 0.5580453276634216, -0.1816195398569107, -0.3293829560279846, -0.24719223380088806, -0.345032274723053, 0.08064467459917068, -0.1954721212387085, -0.5179261565208435, 0.3804040253162384, -0.22377589344978333, -0.2558828294277191, 0.23088569939136505, -0.01315454114228487, -0.14998462796211243, 0.0770958811044693, 0.5944129824638367, 0.28078633546829224, -0.4612378776073456, -0.8002312779426575, 0.17086894810199738]') AS score FROM pgconf_vector_vector ORDER BY score LIMIT 10;

Results for 'outfit for a beach party':

Querying similar catalog took 1.2464 seconds.

Number of elements retrieved: 10

Vchord outputs were very correct like bikinis and dresses

pgvector outputs were very similar too!

Results for 'outfit for galentine day':

Querying similar catalog took 1.1322 seconds.

Number of elements retrieved: 10
pgvector - vchord same output but vchord was faster!
Avirate Black Polka Dot Dress
Vero Moda Women Beige Dress
Elle Yellow Vintage Shift Dress

Results for 'men outfit for a valentine day':

Querying similar catalog took 1.1899 seconds.

Number of elements retrieved: 10

Vchord - pgvector same!
Park Avenue Men Cream Tie
HUGO Men Fragrance Gift Set
Parx Men Lavender Tie
Hakashi Men Lavender Silk Tie
Park Avenue Men Lavender Tie
Reid & Taylor Men Cream Tie
Park Avenue Men Lavender Tie
Playboy Men Pack of 2 Innerwear Vests
Reid & Taylor Men Cream Tie
Rasasi Men Pour Homme Perfume

In [None]:
# Initialize timing variables for overall function performance tracking
function_start_time = time.time()

# Fetch data from the database
fetch_start = time.time()
cursor = conn.cursor()
cursor.execute("SELECT img_id, productdisplayname FROM products_pgconf;")
result = cursor.fetchall()
fetch_end = time.time()

batch_size = 1000
total_rows_inserted = 0

for i in range(0, len(result), batch_size):
    batch_data = [(row[0], row[1]) for row in result[i:i+batch_size] if row[1]]
    if not batch_data:
        continue

    batch_ids, batch_texts = zip(*batch_data)
    batch_texts = [text for text in batch_texts if text]

    if not batch_texts:
        continue

    text_inputs = text_tokenizer(list(batch_texts), return_tensors="pt", padding=True, truncation=True)
    text_model_output = text_model(**text_inputs)
    text_embeddings = text_model_output.last_hidden_state.mean(dim=1).squeeze().detach().cpu().numpy().tolist()

    with conn.cursor() as cursor:
        for idx, embedding in enumerate(text_embeddings):
            cursor.execute(
                "INSERT INTO benchmark_embeddings_vchord_llama (img_id, embedding) VALUES (%s, %s)",
                (batch_ids[idx], embedding)
            )
            total_rows_inserted += 1

function_end_time = time.time()
total_time = function_end_time - function_start_time

print(f"Total Rows: {total_rows_inserted}")
print(f"Total function execution time: {total_time} seconds")
print(f"Fetching time: {fetch_end - fetch_start} seconds")

# Total Rows: 44433
# Total function execution time: 249.79474711418152 seconds
# Embedding Insertion time: 248.98874711990356 seconds ~ 4 minutes
# Model loading time: 1.8063030242919922 seconds
# Fetching time: 0.04993581771850586 seconds