### Getting Started With Text Embeddings
A basic introduction to the history of text embeddings and how they actually work.

In [1]:
import os

import vertexai

# Notebook configuration. Fill these in directly, or export the environment
# variables named below before starting the kernel.
PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "")
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "")

# `credentials` must be a google.auth.credentials.Credentials object, never a
# string. Leaving it as None lets the SDK resolve credentials on its own;
# assign a Credentials object here to use an explicit identity instead.
CREDENTIALS = None

vertexai.init(
    # A blank setting is passed as None so the SDK uses its own default
    # instead of receiving an empty string.
    project=PROJECT_ID or None,
    location=LOCATION or None,
    credentials=CREDENTIALS,
)

In [2]:
from vertexai.language_models import TextEmbeddingModel

In [None]:
# Load the pretrained text-embedding model once; the cells below reuse this handle.
embedding_model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")

In [None]:
# Embed a single word. The input is a list of texts, here with just one entry.
embedding = embedding_model.get_embeddings(["life"])

In [None]:
# Take the vector of the first (and only) result, then show its length
# and its first ten components.
vector = embedding[0].values
print(f"Length = {len(vector)}", vector[:10], sep="\n")

Create an embedding for a full sentence.

In [None]:
# Same call as for the single word, but now the one input text is a whole sentence.
embedding = embedding_model.get_embeddings(["What is the meaning of life?"])

In [None]:
# Inspect the sentence embedding the same way as the word embedding:
# its length, then its first ten components.
vector = embedding[0].values
print(f"Length = {len(vector)}", vector[:10], sep="\n")

#### Similarity

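- Calculate the cosine similarity between the embeddings of two sentences. Cosine similarity can range from -1 to 1 in general; for the sentence embeddings used here the scores typically fall between 0 and 1.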
- Try out your own sentences and check if the similarity calculations match your intuition.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity # similarity score (cosine of the angle between two vectors); higher means more similar, so this is not a distance

In [None]:
# Embed three sentences, one request each: two questions about life and
# an unrelated one about salad.
emb_1, emb_2, emb_3 = (
    embedding_model.get_embeddings([sentence])
    for sentence in (
        "What is the meaning of life?",
        "How does one spend their time well on Earth?",
        "Would you like a salad?",
    )
)

# Wrap each vector in an outer list so it is a one-row list of lists.
vec_1, vec_2, vec_3 = ([result[0].values] for result in (emb_1, emb_2, emb_3))

- Note: we wrap each embedding (a Python list) in another list because the `cosine_similarity` function expects a 2D input: either a 2D numpy array or a list of lists.
```Python
vec_1 = [emb_1[0].values]
```

In [None]:
# Pairwise similarity scores, printed one comparison per line.
print(cosine_similarity(vec_1, vec_2))  # sentence 1 vs sentence 2
print(cosine_similarity(vec_2, vec_3))  # sentence 2 vs sentence 3
print(cosine_similarity(vec_1, vec_3))  # sentence 1 vs sentence 3

In [None]:
# Two sentences that share the words "kids", "play" and "park" but use them differently.
in_1, in_2 = (
    "The kids play in the park.",
    "The play was for kids in the park.",
)

In [None]:
# The shared words of each sentence, in the order they appear in it.
# ("pp" presumably stands for "pre-processed".)
in_pp_1 = "kids play park".split()
in_pp_2 = "play kids park".split()

In [None]:
# Embed the words of the first list in one request and keep only each result's vector.
embeddings_1 = [
    word_result.values
    for word_result in embedding_model.get_embeddings(in_pp_1)
]

In [None]:
import numpy as np

# Stack the per-word vectors along a new leading axis (one row per word)
# and report the resulting shape.
emb_array_1 = np.stack(embeddings_1, axis=0)
print(emb_array_1.shape)

In [None]:
# Repeat for the second word list: embed, keep the vectors, stack, report the shape.
embeddings_2 = [
    word_result.values
    for word_result in embedding_model.get_embeddings(in_pp_2)
]
emb_array_2 = np.stack(embeddings_2, axis=0)
print(emb_array_2.shape)

In [None]:
# Average over axis 0 (the word axis) to collapse the rows into a single vector.
emb_1_mean = np.mean(emb_array_1, axis=0)
print(emb_1_mean.shape)

In [None]:
# Same averaging over the word axis for the second word list.
emb_2_mean = np.mean(emb_array_2, axis=0)

In [None]:
# First four components of each averaged vector, one vector per line.
# Both lists hold the same three words, only in a different order.
print(emb_1_mean[:4], emb_2_mean[:4], sep="\n")

In [None]:
# Show the two original sentences again, one per line.
print(in_1, in_2, sep="\n")

In [None]:
# Embed each full sentence in its own request, this time without splitting it into words.
embedding_1, embedding_2 = (
    embedding_model.get_embeddings([sentence]) for sentence in (in_1, in_2)
)

In [None]:
# Takeaway: a sentence embedding captures the meaning of the sentence as a
# whole; it is not just an aggregate of the embeddings of the individual words.
# Compare these values with the averaged word vectors printed earlier.
vector_1, vector_2 = embedding_1[0].values, embedding_2[0].values
print(vector_1[:4])
print(vector_2[:4])