<a href="https://colab.research.google.com/github/anshupandey/Working_with_Large_Language_models/blob/main/WWL_C7_Gemini_Embedding_model_Vectorization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Using transformer based vectorizer from Vertex AI

### Install Vertex AI SDK for Python


In [19]:
! pip3 install --upgrade --user --quiet google-cloud-aiplatform

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.1/5.1 MB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
[0m

### Authenticate your notebook environment (Colab only)

If you are running this notebook on Google Colab, run the cell below to authenticate your environment.


In [20]:
import sys

if "google.colab" in sys.modules:
    from google.colab import auth
    auth.authenticate_user()

### Set Google Cloud project information and initialize Vertex AI SDK

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [21]:
PROJECT_ID = "jrproject-402905"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}
MODEL_ID = "text-embedding-preview-0409"  # @param {type:"string"}
import vertexai
vertexai.init(project=PROJECT_ID, location=LOCATION)

In [22]:
from vertexai.language_models import TextEmbeddingModel

text_embedding_model = TextEmbeddingModel.from_pretrained(MODEL_ID)
out = text_embedding_model.get_embeddings(["Hello world"])
print(out)

[TextEmbedding(values=[0.013551234267652035, -0.008901020511984825, -0.04676566272974014, 0.00038600710104219615, -0.00909352581948042, -0.0086405323818326, 0.06041082739830017, 0.024694429710507393, 0.0258732121437788, 0.0540466271340847, -0.03776068240404129, -0.0014106014277786016, 0.030819091945886612, -0.015377873554825783, -0.0128002455458045, -0.028744326904416084, -0.007652867119759321, 0.012698023580014706, -0.11359784007072449, 0.0105278380215168, 0.0050337123684585094, -0.0012501087039709091, -0.029953207820653915, -0.05981229618191719, -0.015305274166166782, -0.003599149640649557, 0.006672598887234926, 0.031088395044207573, 0.021406792104244232, 0.036884136497974396, -0.036871105432510376, 0.045773014426231384, 0.0024617472663521767, -0.03195348009467125, 0.009668494574725628, 0.012424473650753498, -0.05078602582216263, 0.021258793771266937, 0.01449132151901722, -0.057637784630060196, -0.02721235901117325, 0.03704335168004036, 0.0014064606511965394, 0.008559545502066612, 0.

In [23]:
len(out[0].values)

768

In [24]:
doc1 = "Python is an amazing programming language and works quite well."
vec1 = text_embedding_model.get_embeddings([doc1])[0].values
print(len(vec1))

768


In [25]:
doc2 = "An easy programming language like python makes it easy to code and develop solutions"
vec2 = text_embedding_model.get_embeddings([doc2])[0].values
print(len(vec2))

768


In [26]:
doc3 = "Manila is a city with bustling streets, amazing night life and a vibrant culture"
vec3 = text_embedding_model.get_embeddings([doc3])[0].values
print(len(vec3))

768


In [27]:
# cosine similarity check
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity([vec1], [vec2])

array([[0.74369858]])

In [28]:
cosine_similarity([vec1], [vec3])

array([[0.29274314]])

In [29]:
cosine_similarity([vec2], [vec3])

array([[0.27190266]])

In [30]:
import pandas as pd
df = pd.read_csv("https://raw.githubusercontent.com/anshupandey/Working_with_Large_Language_models/main/GroceryDataset.csv")
df.shape

(1757, 8)

In [31]:
df.head()

Unnamed: 0,Sub Category,Price,Discount,Rating,Title,Currency,Feature,Product Description
0,Bakery & Desserts,$56.99,No Discount,Rated 4.3 out of 5 stars based on 265 reviews.,"David’s Cookies Mile High Peanut Butter Cake, ...",$,"""10"""" Peanut Butter Cake\nCertified Kosher OU-...",A cake the dessert epicure will die for!Our To...
1,Bakery & Desserts,$159.99,No Discount,Rated 5 out of 5 stars based on 1 reviews.,"The Cake Bake Shop 8"" Round Carrot Cake (16-22...",$,Spiced Carrot Cake with Cream Cheese Frosting ...,"Due to the perishable nature of this item, ord..."
2,Bakery & Desserts,$44.99,No Discount,Rated 4.1 out of 5 stars based on 441 reviews.,"St Michel Madeleine, Classic French Sponge Cak...",$,100 count\nIndividually wrapped\nMade in and I...,Moist and buttery sponge cakes with the tradit...
3,Bakery & Desserts,$39.99,No Discount,Rated 4.7 out of 5 stars based on 9459 reviews.,"David's Cookies Butter Pecan Meltaways 32 oz, ...",$,Butter Pecan Meltaways\n32 oz 2-Pack\nNo Prese...,These delectable butter pecan meltaways are th...
4,Bakery & Desserts,$59.99,No Discount,Rated 4.5 out of 5 stars based on 758 reviews.,"David’s Cookies Premier Chocolate Cake, 7.2 lb...",$,"""10"" Four Layer Chocolate Cake\nCertified Kosh...",A cake the dessert epicure will die for!To the...


In [13]:
# drop rows where product description is missing
df = df[df["Product Description"].notna()]
df.shape

(1715, 8)

In [17]:
import time
vectors = []

num_docs = 40
for i in range(0,df.shape[0],num_docs):
  docs = df["Product Description"][i:i+num_docs].tolist()
  vec = text_embedding_model.get_embeddings(docs)
  time.sleep(10)
  vectors.extend(vec)

In [18]:
len(vectors)

1715