In [None]:
## Lesson 3: Visualizing Embeddings

In [None]:
#### Project environment setup

- Load credentials and the relevant Python libraries.

In [None]:
# `authenticate` is a helper from the local `utils` module; its two return
# values are bound here and later passed to `vertexai.init` as the
# credentials and the project ID.
from utils import authenticate
credentials, PROJECT_ID = authenticate() #Get credentials and project ID

In [None]:
# Location value handed to `vertexai.init` further down.
REGION = 'us-central1'

In [None]:
#### Enter project details

In [None]:
# Bring in the Vertex AI Python SDK and point it at the project, region and
# credentials set up in the earlier cells.
import vertexai

vertexai.init(
    project=PROJECT_ID,
    location=REGION,
    credentials=credentials,
)

In [None]:
## Embeddings capture meaning

In [None]:
# Seven short headlines in three loose topic groups (animals, food,
# programming); the list keeps them in this order.
input_text_lst_news = [
    # Animals
    "Missing flamingo discovered at swimming pool",
    "Sea otter spotted on surfboard by beach",
    "Baby panda enjoys boat ride",
    # Food
    "Breakfast themed food truck beloved by all!",
    "New curry restaurant aims to please!",
    # Programming
    "Python developers are wonderful people",
    "TypeScript, C++ or Java? All are great!",
]

# Individual names for each headline, in list order.
in_1, in_2, in_3, in_4, in_5, in_6, in_7 = input_text_lst_news

In [None]:
import numpy as np
from vertexai.language_models import TextEmbeddingModel

# Load the pretrained text-embedding model; `embedding_model` is reused by
# the later cells that call `get_embeddings`.
embedding_model = TextEmbeddingModel.from_pretrained(
    "textembedding-gecko@001")

In [None]:
- Get embeddings for all pieces of text.
- Store them in a 2D NumPy array (one row for each embedding).

In [None]:
# One request per headline: each call is given a single-item list, and the
# `.values` of its first (only) result is kept as that headline's vector.
embeddings = [
    embedding_model.get_embeddings([headline])[0].values
    for headline in input_text_lst_news
]

# Stack the vectors so that each row is one headline's embedding.
embeddings_array = np.array(embeddings)

In [None]:
# Report the array's dimensions, then the raw values.
print(f"Shape: {embeddings_array.shape}")
print(embeddings_array)

In [None]:
#### Reduce embeddings from 768 to 2 dimensions for visualization
- We'll use principal component analysis (PCA).
- You can learn more about PCA in [this video](https://www.coursera.org/learn/unsupervised-learning-recommenders-reinforcement-learning/lecture/73zWO/reducing-the-number-of-features-optional) from the Machine Learning Specialization.

In [None]:
from sklearn.decomposition import PCA

# Perform PCA for 2D visualization.
# `random_state` is pinned so that a fresh "Restart & Run All" reproduces the
# same projection: scikit-learn documents this parameter as the seed used by
# PCA's randomized and arpack SVD solvers, and the default
# `svd_solver="auto"` is allowed to choose the randomized one.
PCA_model = PCA(n_components = 2, random_state = 0)
PCA_model.fit(embeddings_array)
# Project every embedding onto the two fitted components.
new_values = PCA_model.transform(embeddings_array)

In [None]:
# Report the dimensions of the PCA output, then its values.
print(f"Shape: {new_values.shape}")
print(new_values)

In [None]:
import matplotlib.pyplot as plt
import mplcursors
# IPython magic selecting the ipympl (interactive widget) matplotlib backend.
%matplotlib ipympl

# `plot_2D` is a helper from the local `utils` module; it is given the first
# and second PCA columns and the original texts.
# NOTE(review): `plt` and `mplcursors` are not referenced in this cell —
# confirm whether they are needed here before removing them.
from utils import plot_2D
plot_2D(new_values[:,0], new_values[:,1], input_text_lst_news)

In [None]:
#### Embeddings and Similarity
- Plot a heat map to compare the embeddings of sentences that are similar and sentences that are dissimilar.

In [None]:
# Four sentences for the similarity comparison: 1 and 2 are about a power
# plant, 3 and 4 about desert vegetation, while 1 and 4 share the words
# "desert" and "plant".
# Note: these names replace the in_1..in_4 headlines assigned earlier.
# Note: each triple-quoted literal spans two source lines, so the string
# itself contains the line break and the second line's leading spaces.
in_1 = """He couldn’t desert
          his post at the power plant."""

in_2 = """The power plant needed
          him at the time."""

in_3 = """Cacti are able to
          withstand dry environments."""

in_4 = """Desert plants can
          survive droughts."""

# Same order as the names above; later cells index into this order.
input_text_lst_sim = [in_1, in_2, in_3, in_4]

In [None]:
# Embed each comparison sentence with its own request; the `.values` of the
# single returned result is that sentence's vector.
# Note: this rebinds `embeddings` and `embeddings_array` from the earlier
# headline example.
embeddings = [
    embedding_model.get_embeddings([sentence])[0].values
    for sentence in input_text_lst_sim
]

# One row per sentence.
embeddings_array = np.array(embeddings)

In [None]:
from utils import plot_heatmap

# Pass the sentences themselves as the `y_labels` argument.
y_labels = input_text_lst_sim

# Draw the heatmap of the raw embedding values.
plot_heatmap(
    embeddings_array,
    y_labels=y_labels,
    title="Embeddings Heatmap",
)

In [None]:
Note: the heat map won't show everything because there are 768 columns to show.  To adjust the heat map with your mouse:
- Hover your mouse over the heat map. Buttons will appear on the left of the heat map. Click on the button that has a vertical and a horizontal double arrow (it looks like a pair of axes).
- Left click and drag to move the heat map left and right.
- Right click and drag up to zoom in.
- Right click and drag down to zoom out.

In [None]:
#### Compute cosine similarity
- The `cosine_similarity` function expects a 2D array, which is why we'll wrap each embedding list inside another list.
- You can verify that sentences 1 and 2 have a higher similarity than sentences 1 and 4, even though sentences 1 and 4 both contain the words "desert" and "plant".

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def compare(embeddings, idx1, idx2):
    """Return the cosine similarity between two entries of `embeddings`.

    Args:
        embeddings: Indexable collection of embedding vectors.
        idx1: Index of the first vector.
        idx2: Index of the second vector.

    Returns:
        Whatever `cosine_similarity` returns for the two inputs; each
        vector is wrapped in a list so it is passed as a one-row 2D input.
    """
    first_row = [embeddings[idx1]]
    second_row = [embeddings[idx2]]
    return cosine_similarity(first_row, second_row)

In [None]:
# Sentences 1 and 2, each on its own line, followed by their similarity.
print(in_1, in_2, sep="\n")
print(compare(embeddings, 0, 1))

In [None]:
# Sentences 1 and 4, each on its own line, followed by their similarity.
print(in_1, in_4, sep="\n")
print(compare(embeddings, 0, 3))