# Finding Similar Shoes Using Vector Search in Db2

In [None]:
import pandas as pd
import random
from faker import Faker
import os
from dotenv import dotenv_values, load_dotenv
from ibm_watsonx_ai import APIClient, Credentials
from ibm_watsonx_ai.foundation_models import Embeddings
import csv
from IPython.display import Image, display
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import ast
import matplotlib.image as mpimg
from utils import *
from typing import List, Tuple, Any
import ibm_db
import ibm_db_dbi as dbi

pd.set_option('display.max_colwidth', None)  # Show full column content

# Enable Db2 Magic Commands Extensions for Jupyter Notebook
if not os.path.isfile('db2.ipynb'):
    os.system('wget https://raw.githubusercontent.com/IBM/db2-jupyter/master/db2.ipynb')


%run db2.ipynb

Db2 Extensions Loaded. Version: 2024-09-16


## Setting up wx.ai API for embedding generation

In [None]:
load_dotenv(os.getcwd()+"/.env", override=True)

#credentials = Credentials(
#                url = "https://us-south.ml.cloud.ibm.com",
#                api_key = os.getenv("WATSONX_APIKEY", "")
#                )
#
#client = APIClient(credentials)
#
#project_id = os.getenv("WATSONX_PROJECT", "")
#client.set.default_project(project_id)
#
#embeddings = Embeddings(
#    model_id=client.foundation_models.EmbeddingModels.MULTILINGUAL_E5_LARGE,
#    credentials=credentials,
#    project_id=project_id,
#)

## Setting up Db2 Connection

In [None]:
db2creds = dotenv_values('.env')
db2creds


In [None]:
%sql CONNECT CREDENTIALS db2creds

# Setting up a Shoes Table at Db2

In [None]:
%sql DROP TABLE SQ_SHOES

# Create the table
sql = get_create_sq_shoes_table_sql()
%sql {sql}

In [None]:
#%%capture output
#sql = f'''"IMPORT FROM 'shoes-vectors.csv' OF DEL skipcount 1 INSERT INTO SQ_SHOES"'''
_ = ! db2 "connect to testdb"

#output = %system db2 {sql}
#print(output)

In [38]:
shoes = pd.read_csv('shoes.csv')
#shoes['EMBEDDING'] = shoes['EMBEDDING'].apply(json.loads)
print(shoes)

          SKU            PRODUCT_NAME      BRAND  CLASS     TYPE   MATERIAL  \
0    ZEN-5999       Zentrax X Walking    Zentrax  Women  Walking  Synthetic   
1    RUN-4723   RunXpress Max Walking  RunXpress  Women  Walking  Synthetic   
2    ZEN-8968     Zentrax Pro Running    Zentrax    Men  Running  Synthetic   
3    RUN-7569     RunXpress X Walking  RunXpress    Men  Walking  Synthetic   
4    RUN-1083     RunXpress X Walking  RunXpress  Women  Walking  Synthetic   
..        ...                     ...        ...    ...      ...        ...   
495  ZEN-7398       Zentrax X Walking    Zentrax    Men  Walking       Knit   
496  ZEN-4049    Zentrax Flex Running    Zentrax    Men  Running  Synthetic   
497  RUN-6703  RunXpress Flex Running  RunXpress  Women  Running  Synthetic   
498  ZEN-2650     Zentrax Max Running    Zentrax  Women  Running  Synthetic   
499  RUN-3313     RunXpress X Walking  RunXpress  Women  Walking  Synthetic   

     COLOR WEATHER_RESISTANCE ARCH_SUPPORT  SIZE   

In [None]:
connection = f"DATABASE={os.getenv('database')};HOSTNAME={os.getenv('hostname')};PORT={os.getenv('port')};PROTOCOL=TCPIP;UID={os.getenv('uid')};PWD={os.getenv('pwd')};"
conn = ibm_db.connect(connection, "", "")

sql_insert = "INSERT INTO SQ_SHOES(SKU,PRODUCT_NAME,BRAND,CLASS,TYPE,MATERIAL,COLOR,WEATHER_RESISTANCE,ARCH_SUPPORT,SIZE,PRICE,RATING,STORE_ID,CITY) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)"

preparedStmt = None
try:
    preparedStmt = ibm_db.prepare(dbConnection, sql_insert)
except Exception:
    print(Exception.tostring())
  
if preparedStmt is False:
    print("\nERROR: Unable to prepare the SQL statement specified.\n")
    ibm_db.close(conn)
else:
      
  #pmValues: List[Tuple[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]]
  #pmValues = []
  #for row in shoes.itertuples():
  #    pmValues.insert(len(pmValues), (row.SKU, row.PRODUCT_NAME, row.BRAND, row.CLASS, row.TYPE, row.MATERIAL,
  #                  row.COLOR, row.WEATHER_RESISTANCE, row.ARCH_SUPPORT, row.SIZE, row.PRICE,
  #                  row.RATING, int(row.STORE_ID), row.CITY))
  tuple_of_tuples = tuple([tuple(x) for x in shoes.values])
  print(tuple_of_tuples)

try:
  ibm_db.execute_many(preparedStmt, tuple(tuple_of_tuples))
  print(returnCode)
except Exception:
  print(Exception)
  
ibm_db.close(conn)

  


AttributeError: type object 'Exception' has no attribute 'tostring'

In [None]:
%sql USING shoes APPEND TABLE SQ_SHOES

## Searching for a Running Shoe of Size `12`, for `Men`

In [None]:
%sql SELECT count(*) FROM SQ_SHOES

In [None]:
sql = get_men_shoes_sql()
df_shoe_search = %sql {sql}

sku_list = df_shoe_search['SKU'].tolist()
display_sku_images(sku_list)

df_shoe_search.head(6)

### My chosen shoe is available at the Ottawa location

In [None]:
my_choice_sku = 'ZEN-2061'

## Searching for a similar shoes at the Toronto Location

In [None]:
sql = get_similar_shoes_sql()
df_shoes_results = %sql {sql}
sku_list = df_shoes_results['SKU'].tolist()

display_sku_images(sku_list)
df_shoes_results.head(3)

## Comparing the Search Results with my Preferred Shoe

In [None]:
sku_list = df_shoes_results.iloc[[0, 2]]['SKU'].tolist()
sku_list.append(my_choice_sku)

sku_sql_in = ', '.join(f"'{sku}'" for sku in sku_list)

sql = get_distance_for_sku_list_sql(sku_sql_in)

df_result = %sql {sql}

cols_show = ['SKU', 'TYPE', 'MATERIAL', 'COLOR', 'WEATHER_RESISTANCE', 'ARCH_SUPPORT']
df_result[cols_show].transpose()

## Looking under the hood

### 1. original table without a `VECTOR` column

In [None]:
%sql DROP TABLE SQ_SHOES

# Create the table
sql = get_create_sq_shoes_table_data_only()
%sql {sql}

sql = f'''"IMPORT FROM 'shoes.csv' OF DEL skipcount 1 INSERT INTO SQ_SHOES"'''
_ = ! db2 "connect to TPCDS"

output = %system db2 {sql}

%sql SELECT * FROM SQ_SHOES FETCH FIRST 3 ROWS ONLY

### 2. Adding a `VECTOR` column

In [None]:
%%sql
ALTER TABLE SQ_SHOES
ADD COLUMN EMBEDDING VECTOR(1024, FLOAT32);

In [None]:
sql = f'''"DESCRIBE TABLE SQ_SHOES"'''
_ = ! db2 "connect to TPCDS"

%system db2 {sql}

In [None]:
%sql SELECT * FROM SQ_SHOES FETCH FIRST 3 ROWS ONLY

### 1. selected shoe features for generating shoe vectors

In [None]:
embedding_cols = ['TYPE', 'MATERIAL', 'COLOR', 'WEATHER_RESISTANCE', 'ARCH_SUPPORT']
shoe_sample = %sql SELECT * FROM SQ_SHOES WHERE SKU = :my_choice_sku
shoe_sample[embedding_cols]

### 2. combined the text features of each shoe as follows

In [None]:
# Combine all columns into a single string for each row, including column names
shoe_sample['combined'] = shoe_sample.apply(
    lambda row: ' [SEP] '.join([f"{col_name}: {row[col_name]}" for col_name in embedding_cols]), 
    axis=1
)

shoe_sample.iloc[0]['combined']

### 3. generated embedding vectors for the shoe text obtained in the previous step. For generating embedding vectors, used `watsonx.ai`

In [None]:

row_combined = shoe_sample['combined'].tolist()
print('input text: ', row_combined[0])

shoe_vectors = embeddings.embed_documents(texts=row_combined)
print('generated embedding vector: ', shoe_vectors[0])

shoe_sample['embedding'] = shoe_vectors
shoe_sample['embedding'] = shoe_sample['embedding'].apply(lambda x: '[' + ', '.join(map(str, x)) + ']')

### 4. stored vectors in Db2 in a Vector column

In [None]:
%sql SELECT EMBEDDING FROM SQ_SHOES WHERE SKU = :my_choice_sku

In [None]:
update_sql = f""" 
UPDATE SQ_SHOES SET EMBEDDING = VECTOR('{shoe_vectors[0]}', 1024, FLOAT32) WHERE SKU = :my_choice_sku
"""

%sql {update_sql}

In [None]:
%sql SELECT EMBEDDING FROM SQ_SHOES WHERE SKU = :my_choice_sku

### 5. Loading the vectors of all the shoes - to save demo time

In [None]:
%sql DROP TABLE SQ_SHOES

# Create the table
sql = get_create_sq_shoes_table_sql()
%sql {sql}

sql = f'''"IMPORT FROM 'shoes-vectors.csv' OF DEL skipcount 1 INSERT INTO SQ_SHOES"'''
_ = ! db2 "connect to TPCDS"

output = %system db2 {sql}

### 6. Finding matching shoes using the `VECTOR_DISTANCE` function

In [None]:
sql = f"""
SELECT 
    SKU, 
    PRODUCT_NAME, 
    BRAND, 
    TYPE, 
    MATERIAL, 
    COLOR, 
    WEATHER_RESISTANCE, 
    ARCH_SUPPORT, 
    PRICE, 
    RATING,
    VECTOR_DISTANCE(
        (SELECT EMBEDDING FROM SQ_SHOES WHERE SKU = :my_choice_sku), 
        EMBEDDING, 
        EUCLIDEAN
    ) AS DISTANCE
FROM 
    SQ_SHOES
WHERE 
    SKU <> :my_choice_sku
    AND CITY = 'Toronto'
    AND SIZE = 12
    AND CLASS = 'Men'
ORDER BY 
    DISTANCE ASC
FETCH FIRST 5 ROWS ONLY
"""

top_shoes = %sql {sql}
top_shoes.head()

### 7. Visualizing vector search

In [None]:

sku_list = top_shoes['SKU'].tolist()
sku_list.append(my_choice_sku)
sku_sql_in = ', '.join(f"'{sku}'" for sku in sku_list)
sql = get_similarity_query(sku_sql_in)

top_matching_vectors = %sql {sql}

plot_similarity_tsne(top_matching_vectors, my_choice_sku)

In [None]:
%sql DROP TABLE SQ_SHOES

In [None]:
%sql CONNECT RESET