# Create a retailer names table and vectorize the names

## Define catalog and schemas to use & table to create

In [0]:
# Notebook parameters as (widget name, default value) pairs; each becomes a text widget.
for widget_name, default_value in (
    ("catalog", "amitabh_arora_catalog"),
    ("schema", "demo_vectorization"),
):
    dbutils.widgets.text(widget_name, default_value)

In [0]:
# Read the target catalog and schema from the notebook widgets and create them if missing.
# Both identifiers are backtick-quoted, matching the other cells in this notebook, so that
# names containing special characters (e.g. hyphens) are parsed as a single identifier.
catalog = dbutils.widgets.get("catalog")
spark.sql(f"CREATE CATALOG IF NOT EXISTS `{catalog}`")

schema = dbutils.widgets.get("schema")
spark.sql(f"CREATE SCHEMA IF NOT EXISTS `{catalog}`.`{schema}`")

In [0]:
# Unqualified name of the table that holds the retailer names.
retailer_names_tbl = "retailer_names"
# Cells below build the fully qualified name inline as `{catalog}`.`{schema}`.`{retailer_names_tbl}`.

## Create dummy dataset

In [0]:
# Base sample: twenty retailer names in their canonical spelling.
retailer_names = [
    "Walmart", "Amazon", "Costco", "Target",
    "The Home Depot", "Walgreens", "Kroger", "CVS",
    "Lowe's", "Best Buy", "Macy's", "Nordstrom",
    "Sears", "Sam's Club", "Aldi", "IKEA",
    "Publix", "Dollar General", "Trader Joe's", "7-Eleven",
]

In [0]:
# Names containing "AMAZ"/"Amaz" in various spellings, including exact repeats;
# appended in place to the existing list.
amazon_like_names = [
    "AMAZILES MARIA GONCALVES",
    "AMAZING - STADIUM",
    "AMAZING STYLE SLU",
    "AMAZON EU SARL",
    "AMAZON EU SARL",
    "AMAZON EU SARL   **IA**",
    "AMAZON EU SARL -CZE-",
    "AMAZON EU SARL -POLEN-",
    "AMAZON EU SARL -POLEN-",
    "AMAZON EU SARL / BUY   **IA**",
    "AMAZON EU SARL / MUTTER *IA*",
    "AMAZON EU SARL SUCURSAL ESPANA",
    "AMAZON EU SARL SUCURSAL ESPANA",
    "AMAZON EU SARL/MEDIA EU SARL",
    "AMAZON EU SARL/MEDIA EU SARL",
    "AMAZON FOOTWEAR",
    "AMAZON PAN EU PARENT",
    "AMAZON SERVS VAREJO DO BR LTDA",
    "AMAZON SMART HOME SERVICES",
    "AMAZON VINE PROMO",
    "AMAZON.COM LLC",
    "AMAZON.COM LLC",
    "AMAZON.COM.CA INC",
    "AMAZONE",
    "ARIZONA/AMAZONE",
    "Amazon Japan K.K.",
]
retailer_names.extend(amazon_like_names)

In [0]:
# Spelling and casing variants of the same retailer name (one repeated), appended in place.
kohls_variants = ["KOHL's", "Kohl's", "Kohls", "Kohls"]
retailer_names.extend(kohls_variants)

In [0]:
# Two spellings of the same retailer name, with and without the apostrophe, appended in place.
macys_variants = ["Macy's", "Macys"]
retailer_names.extend(macys_variants)

## Store dataset in a table

In [0]:
# Wrap every name in a 1-tuple so each becomes one row of a single-column DataFrame,
# then (re)create the target table from it.
name_rows = [(name,) for name in retailer_names]
df = spark.createDataFrame(name_rows, ["cust_name"])

target_table = f"`{catalog}`.`{schema}`.`{retailer_names_tbl}`"
df.write.mode("overwrite").saveAsTable(target_table)

## Display the stored names table

In [0]:
# Select every row of the names table and render the result in the notebook.
names_table = f"`{catalog}`.`{schema}`.`{retailer_names_tbl}`"
query = f"""
  select * from {names_table}
"""
stored_names = spark.sql(query)
stored_names.display()

## Vectorize names and save them as an embeddings column
We will use a Databricks-hosted embedding model suitable for this task.

In [0]:
# Build a query that pairs each name with the result of ai_query() on the embedding endpoint.
embedding_endpoint = "databricks-gte-large-en"
source_table = f"`{catalog}`.`{schema}`.`{retailer_names_tbl}`"

query = f"""
  SELECT
    cust_name,
    ai_query('{embedding_endpoint}', cust_name) AS vector_embedding
  FROM {source_table}
"""
df = spark.sql(query)

In [0]:
# Overwrite the names table with the names-plus-embeddings result; mergeSchema is
# enabled on the write so the additional column is accepted.
(
    df.write
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(f"`{catalog}`.`{schema}`.`{retailer_names_tbl}`")
)

In [0]:
# Load the table back and render it in the notebook.
embedded_names = spark.table(f"`{catalog}`.`{schema}`.`{retailer_names_tbl}`")
display(embedded_names)

## Define dot_product UDF

In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
import numpy as np

def dot_product(vec1, vec2):
    """Return the dot product of two numeric vectors as a Python float.

    Args:
        vec1: Iterable of numbers (e.g. list or array), or None.
        vec2: Iterable of numbers (e.g. list or array), or None.

    Returns:
        ``float(np.dot(vec1, vec2))``, or ``None`` when either argument is ``None``.
    """
    # A None argument yields None rather than an exception from np.dot, so one
    # missing vector does not abort the caller's whole computation.
    if vec1 is None or vec2 is None:
        return None
    return float(np.dot(vec1, vec2))

# Register as a regular UDF for Spark DataFrames/sql
# Wrap the Python function as a DataFrame UDF returning a double, then expose the
# same UDF to SQL under the name "dot_product".
dot_product_udf = udf(f=dot_product, returnType=DoubleType())
spark.udf.register(name="dot_product", f=dot_product_udf)

## Define vector_norm UDF

In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
import numpy as np

def vector_norm(vec, ord=2):
    """Return the norm of a numeric vector as a Python float.

    Args:
        vec: Iterable of numbers (e.g. list or array), or None.
        ord: Order of the norm, passed through to ``np.linalg.norm``; defaults to 2.

    Returns:
        ``float(np.linalg.norm(vec, ord))``, or ``None`` when ``vec`` is ``None``.
    """
    # A None vector yields None rather than an exception from np.linalg.norm, so one
    # missing vector does not abort the caller's whole computation.
    if vec is None:
        return None
    return float(np.linalg.norm(vec, ord))

# Wrap the Python function as a DataFrame UDF returning a double.
vector_norm_udf = udf(f=vector_norm, returnType=DoubleType())

# Expose the same UDF to SQL under the name "vector_norm".
spark.udf.register(name="vector_norm", f=vector_norm_udf)

## Set default catalog and schema

In [0]:
# Switch the session's current catalog, then its current schema, to the widget-selected ones.
for use_statement in (f"USE CATALOG `{catalog}`", f"USE SCHEMA `{schema}`"):
    spark.sql(use_statement)

## Test the embeddings

In [0]:
%sql
-- Find stored names whose embedding is close to the embedding of a search term.
WITH
-- Embed the search term once.
query_embedding AS (
  SELECT
    ai_query('databricks-gte-large-en', 'kohls') AS emb
),
-- Score every stored name against the search term.
scored AS (
  SELECT
    t.cust_name,
    t.vector_embedding,
    q.emb,
    -- Cosine similarity = dot product divided by the product of the two vector norms.
    -- Dividing by the norms keeps the score in [-1, 1] whether or not the endpoint
    -- returns unit-length vectors.
    dot_product(t.vector_embedding, q.emb)
      / (vector_norm(t.vector_embedding) * vector_norm(q.emb)) AS cosine_sim
  FROM
    -- Unqualified on purpose: resolved against the catalog and schema selected by the
    -- USE CATALOG / USE SCHEMA cell, so the query follows the notebook widgets.
    retailer_names t
    CROSS JOIN query_embedding q
)
SELECT
  cust_name,
  vector_embedding,
  emb,
  cosine_sim
FROM
  scored
WHERE
  -- Similarity threshold; lower it to admit looser matches.
  cosine_sim > 0.99
ORDER BY
  cosine_sim DESC