# Visualizing Vector Embedings 


### Install required binaries. 

In [33]:
!pip install -U  boto3 psycopg2-binary pgvector




### Import required libraries

In [34]:
import json, pandas as pd
import boto3

### Load sample product catalog data into dataframe.

In [35]:
df = pd.read_csv("./product_catalog.csv", sep="|")
df.head(5)

Unnamed: 0,p_category,p_name,p_description
0,Fruit,Apple,"Juicy and crisp apple, perfect for snacking or..."
1,Fruit,Banana,"Sweet and creamy banana, a nutritious addition..."
2,Fruit,Mango,"Exotic and flavorful mango, delicious eaten fr..."
3,Fruit,Orange,"Refreshing and citrusy orange, packed with vit..."
4,Fruit,Pineapple,"Fresh and tropical pineapple, known for its sw..."


### Review unique product categories in the sample data.

In [36]:
categories = sorted(df["p_category"].unique())
categories

['Electronics', 'Fruit', 'Furniture', 'Sport']

### Creating Amazon Bedrock client. Later we will be using this client to invoke one of the text embedding model.

In [37]:
def create_beddrock_client(region):
        bedrock_client = boto3.client("bedrock-runtime", region_name='us-east-1')
        return bedrock_client
    

bedrock_client = create_beddrock_client('us-east-1')


### Declaring function to create text embeddings. We are passing bedrock client and data to this function.

In [38]:
def create_description_embedding( desc,bedrock_client):
        payload = {"inputText": f"{desc}"}
        body = json.dumps(payload)
        model = "amazon.titan-embed-text-v1"
        accept = "application/json"
        contentType = "application/json"

        response = bedrock_client.invoke_model(
           body=body, modelId=model, accept=accept, contentType=contentType
        )
        response_body = json.loads(response.get("body").read())

        embeddings = response_body.get("embedding")
        return  embeddings



### Generating embeddings for each product (description) 

In [39]:

all_records = []

for records in df['p_description']:
    embedded_data = create_description_embedding(records,bedrock_client)
    all_records.append(embedded_data)
    


### Displaying top 2 records from the list and adding one new column 'embed' to it. This new column holds the embeddings generated in previous step.

In [40]:
#all_records[1]

df.head(2)
df.insert(2,'p_embeddings',all_records)
df.head(2)


Unnamed: 0,p_category,p_name,p_embeddings,p_description
0,Fruit,Apple,"[-0.41796875, 0.7578125, -0.16308594, 0.045898...","Juicy and crisp apple, perfect for snacking or..."
1,Fruit,Banana,"[0.8515625, 0.036376953, 0.31835938, 0.1318359...","Sweet and creamy banana, a nutritious addition..."


In [41]:
len(df)

60

### Connect with Aurora PostgreSQL database. Create vector extension. Create new table and load embedded product catalog data into PostgreSQL table.

In [42]:
import psycopg2
from pgvector.psycopg2 import register_vector

client = boto3.client('secretsmanager')

response = client.get_secret_value(
    SecretId='aupg-vector-secret'
)
database_secrets = json.loads(response['SecretString'])

dbhost = database_secrets['host']
dbport = database_secrets['port']
dbuser = database_secrets['username']
dbpass = database_secrets['password']

dbconn = psycopg2.connect(host=dbhost, user=dbuser, password=dbpass, port=dbport)
dbconn.set_session(autocommit=True)

cur = dbconn.cursor()
cur.execute("create extension if not exists vector;")
register_vector(dbconn)
cur.execute("drop table if exists product_catalog;")
cur.execute("""create table if not exists product_catalog(
               p_id serial primary key,  
               p_category varchar(15),
               p_name varchar(50),
               p_description text,
               p_embeddings vector(1536));""")


for index, row in df.iterrows():
     cur.execute("""INSERT INTO product_catalog (p_category,p_name, p_description,p_embeddings)  values(%s, %s, %s, %s);""",( row.p_category, row.p_name, row.p_description,row.p_embeddings))


cur.execute("""CREATE INDEX ON product_catalog 
               USING ivfflat (p_embeddings vector_l2_ops) WITH (lists = 100);""")
cur.execute("vacuum analyze product_catalog;")

cur.close()
dbconn.close()
print ("Data loaded successfully!")


Data loaded successfully!


### Now read the records back from the PostgreSQL table for visualization.

In [43]:
with psycopg2.connect("host='{}' port={} user={} password={}".format(dbhost, dbport, dbuser, dbpass)) as conn:
    sql = "select p_category,p_name,p_description,p_embeddings from product_catalog ;"
    df_data = pd.read_sql_query(sql, conn)

 

df.head(3)


pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.



Unnamed: 0,p_category,p_name,p_embeddings,p_description
0,Fruit,Apple,"[-0.41796875, 0.7578125, -0.16308594, 0.045898...","Juicy and crisp apple, perfect for snacking or..."
1,Fruit,Banana,"[0.8515625, 0.036376953, 0.31835938, 0.1318359...","Sweet and creamy banana, a nutritious addition..."
2,Fruit,Mango,"[0.6328125, 0.73046875, 0.3046875, -0.72265625...","Exotic and flavorful mango, delicious eaten fr..."


### There are few algorithms which supports dimensionality reduction. We are going to use PCA (Principal Component Analysis) in this demo.

In [44]:
from sklearn.decomposition import PCA

### Embeddings generated by Amazon Bedrock model had 1536 dimentions. however, It is not posssible to visualize them. Hence we are reducing them to 3 dimention embeddings so later we can visualize these embeddings.

In [45]:
pca = PCA(n_components=3)
vis_dims = pca.fit_transform(df_data['p_embeddings'].to_list())
vis_dims


array([[ 3.56268566e-01,  1.15016430e+01,  4.42933646e+00],
       [-3.54746662e-01,  1.01054964e+01,  2.81543477e+00],
       [ 1.71470682e-01,  1.17202911e+01,  4.28195533e+00],
       [ 8.32021309e-01,  1.09130511e+01,  3.71712425e+00],
       [-8.17363944e-04,  1.10186798e+01,  3.62481896e+00],
       [ 2.79766486e-02,  1.08029233e+01,  4.68497927e+00],
       [-7.09940170e-01,  1.07609600e+01,  4.35195849e+00],
       [-1.22574021e+00,  1.04870281e+01,  2.45737229e+00],
       [ 6.01603944e-01,  1.08489812e+01,  3.92227170e+00],
       [ 4.41474741e-01,  1.21007592e+01,  3.99378858e+00],
       [ 6.16746435e+00, -1.75437462e+00, -1.42068579e+00],
       [ 7.77806047e+00, -1.92270712e+00, -3.04329989e+00],
       [ 4.61780955e+00, -1.55658909e+00, -1.08092755e+00],
       [ 7.80199931e+00, -2.24629252e+00, -3.94739974e+00],
       [ 5.68970398e+00, -3.78039493e+00, -1.62332420e+00],
       [ 6.07675432e+00, -1.72189904e+00, -4.19152178e+00],
       [ 5.92949794e+00, -2.49914325e+00

#### Keeping it all together. Adding new column in previous list, adding new column 'pca_embed' which will have 3 dimentioal embeddings. 

In [46]:
df_data['pca_embed'] = vis_dims.tolist()
df_data.head(5)

Unnamed: 0,p_category,p_name,p_description,p_embeddings,pca_embed
0,Fruit,Apple,"Juicy and crisp apple, perfect for snacking or...","[-0.41796875, 0.7578125, -0.16308594, 0.045898...","[0.35626856571655474, 11.501643004047386, 4.42..."
1,Fruit,Banana,"Sweet and creamy banana, a nutritious addition...","[0.8515625, 0.036376953, 0.31835938, 0.1318359...","[-0.3547466621907463, 10.105496442467032, 2.81..."
2,Fruit,Mango,"Exotic and flavorful mango, delicious eaten fr...","[0.6328125, 0.73046875, 0.3046875, -0.72265625...","[0.17147068159548648, 11.720291050641865, 4.28..."
3,Fruit,Orange,"Refreshing and citrusy orange, packed with vit...","[0.921875, 0.69921875, 0.29101562, 0.061523438...","[0.8320213087523731, 10.913051113510148, 3.717..."
4,Fruit,Pineapple,"Fresh and tropical pineapple, known for its sw...","[0.33984375, 0.70703125, 0.24707031, -0.605468...","[-0.0008173639438334911, 11.01867977558647, 3...."


### Now we are simply creating a 3D scattered graph to visualize 3 dimentional embeddings from above list. This data is color coded to 'category'. This help in showing how embeddings are clustered based on their semantic meaning.

In [47]:
import plotly.graph_objs as go
import numpy as np
fig = go.Figure()

for i, cat in enumerate(categories):
    sub_matrix = np.array(df_data[df_data["p_category"] == cat]["pca_embed"].to_list())
    x = sub_matrix[:, 0]
    y = sub_matrix[:, 1]
    z = sub_matrix[:, 2]

    fig.add_trace(
        go.Scatter3d(
            x=x,
            y=y,
            z=z,
            mode="markers",
            marker=dict(size=5, color=i, colorscale="Viridis", opacity=0.8),
            name=cat,
        )
    )

fig.update_layout(
    autosize=False,
    title="3D Scatter Plot of Categories",
    width=800,
    height=500,
    margin=dict(l=50, r=50, b=100, t=100, pad=10),
    scene=dict(
        xaxis=dict(title="x"),
        yaxis=dict(title="y"),
        zaxis=dict(title="z"),
    ),
)

fig.show()

#### Note - In some cases, you might observe that 3D graph is not generating. In that case, restart the kernel and run the notebook again.