In [None]:
# Thirty short sentences that all contain the word "tree": ten for each of
# three senses of the word (plants, data structures, genealogy). The order is
# botanic first, then programming, then family.
sentences = [
    # Botanic context
    *(
        "I planted a maple tree in the backyard.",
        "The tree provides shade during hot summer days.",
        "We sat under the old oak tree.",
        "The tree blossoms in the spring.",
        "The apple tree bears fruit every autumn.",
        "The tree's roots were deep in the ground.",
        "The tree's leaves turned yellow and fell off.",
        "The pine tree stood tall in the forest.",
        "The willow tree hung over the pond.",
        "The tree was cut down to make room for new construction.",
    ),
    # Computer programming context
    *(
        "The binary tree is a fundamental data structure in computer science.",
        "Each node in the tree stores a piece of data.",
        "The tree structure allows efficient search and sort operations.",
        "The tree is traversed in a pre-order, in-order, or post-order manner.",
        "A balanced binary tree offers optimal performance.",
        "The tree's root node has no parent.",
        "Each node in the tree has a link to its parent and children.",
        "The tree's leaf nodes have no children.",
        "A tree in computer science is not necessarily rooted.",
        "The tree algorithm was implemented recursively.",
    ),
    # Family tree context
    *(
        "My family tree traces back to the 16th century.",
        "I am researching my family tree.",
        "My family tree has branches all over the world.",
        "The family tree shows our genealogy.",
        "I found an interesting ancestor in our family tree.",
        "My family tree is quite complex.",
        "Our family tree includes several notable individuals.",
        "The family tree reveals our heritage.",
        "I discovered distant relatives through the family tree.",
        "The family tree helps us understand our roots.",
    ),
]


In [None]:
import os
from sentence_transformers import SentenceTransformer, util
from visualization_utils import openai_plot_2D, plot_embeddings

# Folder handed to SentenceTransformer as `cache_folder`; it is None when the
# TRANSFORMERS_CACHE environment variable is not set.
transformers_cache = os.getenv('TRANSFORMERS_CACHE')
print(transformers_cache)

# Pretrained checkpoints are listed at
# https://www.sbert.net/docs/pretrained_models.html
# Other checkpoints that can be swapped in for the one below:
#   sentence-transformers/all-MiniLM-L6-v2
#   sentence-transformers/distiluse-base-multilingual-cased-v1
#   sentence-transformers/all-MiniLM-L12-v2
model = SentenceTransformer(
    'sentence-transformers/all-mpnet-base-v2',
    cache_folder=transformers_cache,
)

In [None]:
from pprint import pprint
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

# Encode every sentence into a vector with the sentence-transformers model.
embeddings = model.encode(sentences)

# Length (L2 norm) of each embedding, one value per sentence, rounded to
# 5 decimals so the printout is easy to scan.
vectors_module = np.linalg.norm(embeddings, axis=1).round(5)

# If every value printed here is 1.0 the embeddings are unit vectors. That
# matters because, for unit vectors, cosine similarity is just the dot product.
pprint(vectors_module)

In [None]:

# Pairwise similarity matrices: entry (i, j) compares sentence i with sentence j.
# Rows and columns are labelled with the sentence's position in `sentences`.
# (scikit-learn equivalent: pd.DataFrame(cosine_similarity(embeddings)))
df = pd.DataFrame(
    util.cos_sim(embeddings, embeddings),
    index=range(len(sentences)),
    columns=range(len(sentences)),
)

# Raw dot-product scores, kept for comparison; not displayed by this cell.
dfdot = pd.DataFrame(util.dot_score(embeddings, embeddings))

df


In [None]:
# Plot the sentence-transformers embeddings with plot_embeddings, a helper
# imported from the local visualization_utils module.
# NOTE(review): the helper's source is not visible here; the returned object
# exposes .interactive(), so it is presumably an Altair chart -- confirm.
chart  = plot_embeddings(sentences, embeddings)
# Last expression of the cell, so the notebook renders its value.
chart.interactive()


In [None]:

# Second view of the same sentence-transformers embeddings, drawn with
# openai_plot_2D (imported from the local visualization_utils module).
# NOTE(review): show_labels=False presumably hides per-point text labels;
# verify against the helper's definition.
openai_plot_2D(sentences, embeddings, show_labels=False)

In [None]:
import os
from dotenv import load_dotenv, find_dotenv
# Load variables from the nearest .env file into the process environment.
_ = load_dotenv(find_dotenv())

from openai import AzureOpenAI

# Azure OpenAI client; key and endpoint both come from the environment.
client = AzureOpenAI(
    azure_endpoint=os.environ.get("AZURE_ENDPOINT"),
    api_version="2023-12-01-preview",
    api_key=os.environ.get("OPENAI_API_KEY"),
)

embeddings_ada = []

# A single request embeds the whole list of sentences.
# NOTE(review): on Azure, `model` presumably has to match the deployment name;
# confirm the deployment is called "text-embedding-ada-002".
response = client.embeddings.create(
    model="text-embedding-ada-002",
    input=sentences,
)

# Collect one vector per returned item, in the order the API returned them.
embeddings_ada.extend(item.embedding for item in response.data)

# Same chart as for the sentence-transformers model, now with ada-002 vectors.
chart = plot_embeddings(sentences, embeddings_ada)
chart.interactive()


In [None]:
# 2-D view of the ada-002 embeddings via the openai_plot_2D helper from
# visualization_utils; unlike the other plots in this notebook, this call
# passes show_labels=True.
openai_plot_2D(sentences, embeddings_ada, show_labels=True)


In [None]:
# Switch from Azure to the public OpenAI API to try the text-embedding-3 models.
from openai import OpenAI

# Give the key to the constructor instead of patching `client.api_key`
# afterwards. A bare OpenAI() reads OPENAI_API_KEY (the variable this notebook
# uses for the Azure key) and raises when it is unset, even though this cell
# only needs OPENAI_API_KEY_NOT_AZURE. Indexing os.environ fails with a clear
# KeyError if that variable is missing.
client = OpenAI(api_key=os.environ['OPENAI_API_KEY_NOT_AZURE'])

# A single request embeds the whole list of sentences at the model's default size.
response_te3_large = client.embeddings.create(
    input=sentences,
    model="text-embedding-3-large"
)

# One vector per returned item, in the order the API returned them.
embeddings_te3_large = [item.embedding for item in response_te3_large.data]

openai_plot_2D(sentences, embeddings_te3_large, show_labels=False)


In [None]:
# Same model as before, but ask the API for shortened 512-dimensional vectors.
from openai import OpenAI

# Give the key to the constructor instead of patching `client.api_key`
# afterwards. A bare OpenAI() reads OPENAI_API_KEY (the variable this notebook
# uses for the Azure key) and raises when it is unset, even though this cell
# only needs OPENAI_API_KEY_NOT_AZURE. Indexing os.environ fails with a clear
# KeyError if that variable is missing.
client = OpenAI(api_key=os.environ['OPENAI_API_KEY_NOT_AZURE'])

# `dimensions=512` is the only difference from the full-size request.
response_te3_large_512 = client.embeddings.create(
    input=sentences,
    dimensions=512,
    model="text-embedding-3-large"
)

# One vector per returned item, in the order the API returned them.
embeddings_te3_large_512 = [item.embedding for item in response_te3_large_512.data]

openai_plot_2D(sentences, embeddings_te3_large_512, show_labels=False)

In [None]:
# Reduce much further: request only 128 dimensions from the same model.
from openai import OpenAI

# Give the key to the constructor instead of patching `client.api_key`
# afterwards. A bare OpenAI() reads OPENAI_API_KEY (the variable this notebook
# uses for the Azure key) and raises when it is unset, even though this cell
# only needs OPENAI_API_KEY_NOT_AZURE. Indexing os.environ fails with a clear
# KeyError if that variable is missing.
client = OpenAI(api_key=os.environ['OPENAI_API_KEY_NOT_AZURE'])

# `dimensions=128` is the only difference from the full-size request.
response_te3_large_128 = client.embeddings.create(
    input=sentences,
    dimensions=128,
    model="text-embedding-3-large"
)

# One vector per returned item, in the order the API returned them.
embeddings_te3_large_128 = [item.embedding for item in response_te3_large_128.data]

openai_plot_2D(sentences, embeddings_te3_large_128, show_labels=False)

In [None]:
# Draw the 128-dimension text-embedding-3-large vectors with plot_embeddings
# (helper imported from the local visualization_utils module), for comparison
# with the earlier charts produced by the same helper.
chart  = plot_embeddings(sentences, embeddings_te3_large_128)

# Last expression of the cell, so the notebook renders its value.
chart.interactive()

In [None]:
import random
# One last look at the local sentence-transformers model, for comparison with
# the OpenAI plots.

# Encode the sentences, in their original order, with the local model.
embeddings_bert = model.encode(sentences)

# Plot the embeddings in 2D. The sentences must be passed too: every other call
# in this notebook is openai_plot_2D(sentences, embeddings, show_labels=...),
# and the earlier plot of this model used show_labels=False.
openai_plot_2D(sentences, embeddings_bert, show_labels=False)

