In [None]:
# Example sentences that all use the word "tree", grouped by the sense in
# which the word is used (ten sentences per sense). The groups are flattened
# below into `sentences`, preserving this order: botanic, programming, family.
sentences_by_context = {
    "botanic": [
        "I planted a maple tree in the backyard.",
        "The tree provides shade during hot summer days.",
        "We sat under the old oak tree.",
        "The tree blossoms in the spring.",
        "The apple tree bears fruit every autumn.",
        "The tree's roots were deep in the ground.",
        "The tree's leaves turned yellow and fell off.",
        "The pine tree stood tall in the forest.",
        "The willow tree hung over the pond.",
        "The tree was cut down to make room for new construction.",
    ],
    "computer programming": [
        "The binary tree is a fundamental data structure in computer science.",
        "Each node in the tree stores a piece of data.",
        "The tree structure allows efficient search and sort operations.",
        "The tree is traversed in a pre-order, in-order, or post-order manner.",
        "A balanced binary tree offers optimal performance.",
        "The tree's root node has no parent.",
        "Each node in the tree has a link to its parent and children.",
        "The tree's leaf nodes have no children.",
        "A tree in computer science is not necessarily rooted.",
        "The tree algorithm was implemented recursively.",
    ],
    "family tree": [
        "My family tree traces back to the 16th century.",
        "I am researching my family tree.",
        "My family tree has branches all over the world.",
        "The family tree shows our genealogy.",
        "I found an interesting ancestor in our family tree.",
        "My family tree is quite complex.",
        "Our family tree includes several notable individuals.",
        "The family tree reveals our heritage.",
        "I discovered distant relatives through the family tree.",
        "The family tree helps us understand our roots.",
    ],
}

# Flat list consumed by the rest of the notebook (dicts keep insertion order).
sentences = [
    sentence
    for context_sentences in sentences_by_context.values()
    for sentence in context_sentences
]


In [None]:
import os
from sentence_transformers import SentenceTransformer, util

# Pretrained model catalogue: https://www.sbert.net/docs/pretrained_models.html
#
# Other checkpoints that can be swapped in below for comparison:
#   sentence-transformers/all-MiniLM-L6-v2
#   sentence-transformers/distiluse-base-multilingual-cased-v1
#   sentence-transformers/all-MiniLM-L12-v2

# Optional cache directory taken from the environment; None when the variable
# is unset. Printed so the reader can see which value is in effect.
transformers_cache = os.getenv('TRANSFORMERS_CACHE')
print(transformers_cache)

# Load the sentence-embedding model used by the following cells.
model = SentenceTransformer(
    'sentence-transformers/all-mpnet-base-v2',
    cache_folder=transformers_cache,
)

In [None]:
from pprint import pprint
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

# Encode every sentence into its embedding vector with the model loaded above.
embeddings = model.encode(sentences)

# Euclidean norm of each embedding (axis=1 -> one value per row).
# Cosine similarity is cos(a, b) = (a . b) / (|a| * |b|), so a plain dot
# product equals the cosine similarity only when every vector has norm 1.
# Print the norms to see whether that holds for this model.
vectors_module = np.linalg.norm(embeddings, axis=1)
pprint(vectors_module)

In [None]:

# Pairwise similarity matrices: entry (i, j) compares sentence i with sentence j.
# `util.cos_sim` and `util.dot_score` return torch tensors. Convert them to
# NumPy before building the DataFrames so every cell is a plain float rather
# than a tensor object, which renders as `tensor(...)` and breaks numeric ops.
# (scikit-learn alternative: pd.DataFrame(cosine_similarity(embeddings)))
df = pd.DataFrame(util.cos_sim(embeddings, embeddings).cpu().numpy())
dfdot = pd.DataFrame(util.dot_score(embeddings, embeddings).cpu().numpy())

# Label rows and columns with the sentence position in `sentences`.
df.columns = range(len(sentences))
df.index = range(len(sentences))

df


In [None]:
import numpy as np
import umap
import altair as alt

# declare a function to plot the embeddings
def plot_embeddings(sentences, embeddings, n_neighbors=2, random_state=None):
    """Project embeddings to 2-D with UMAP and return an Altair scatter chart.

    Args:
        sentences: Sequence of texts, one per embedding. Each text is shown
            as the tooltip of its point.
        embeddings: 2-D array-like (a NumPy array or a list of equal-length
            vectors) with one row per sentence.
        n_neighbors: Passed to ``umap.UMAP``. Defaults to 2, the value this
            notebook has always used.
        random_state: Passed to ``umap.UMAP``. Set an integer to make the
            layout reproducible between runs; ``None`` keeps the library
            default.

    Returns:
        An ``alt.Chart`` (1100x600) with one circle per sentence at its UMAP
        coordinates. Call ``.interactive()`` on it to enable pan and zoom.

    Raises:
        ValueError: If ``embeddings`` is not 2-D or its row count differs
            from ``len(sentences)``.
    """
    texts = list(sentences)
    vectors = np.asarray(embeddings)

    # Validate up front so a mismatch is reported clearly instead of
    # surfacing as an obscure error from UMAP or pandas.
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be 2-D (n_sentences, n_dims), got {vectors.ndim}-D input"
        )
    if len(texts) != vectors.shape[0]:
        raise ValueError(
            f"got {len(texts)} sentences but {vectors.shape[0]} embeddings"
        )

    # UMAP reduces the embedding dimension to a plottable 2-D layout.
    reducer = umap.UMAP(n_neighbors=n_neighbors, random_state=random_state)
    umap_embeds = reducer.fit_transform(vectors)

    # One row per sentence: 2-D coordinates plus the text for the tooltip.
    df_plot = pd.DataFrame({'x': umap_embeds[:, 0], 'y': umap_embeds[:, 1], 'text': texts})

    # Scatter plot; the sentence text appears when hovering over a point.
    return alt.Chart(df_plot, width=1100, height=600).mark_circle(size=60).encode(
        x='x',
        y='y',
        tooltip=['text']
    )


In [None]:
# Build the 2-D scatter plot for the sentence-transformers embeddings and
# display it with pan/zoom enabled.
chart = plot_embeddings(sentences=sentences, embeddings=embeddings)

chart.interactive()

In [None]:
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

import openai

# Fail fast with a clear message when the Azure OpenAI settings are missing.
# Otherwise the client would be configured with None and the failure would
# only surface later, inside the first request.
missing_vars = [
    name for name in ("OPENAI_API_KEY", "OPENAI_API_BASE") if not os.environ.get(name)
]
if missing_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_vars)
        + ". Define them in the environment or in a local .env file."
    )

openai.api_type = "azure"
openai.api_key = os.environ["OPENAI_API_KEY"]
openai.api_base = os.environ["OPENAI_API_BASE"]
openai.api_version = "2023-05-15"

# Request one embedding per sentence and collect the vectors in input order.
# NOTE(review): with api_type "azure", `engine` is presumably the deployment
# name; confirm the deployment is actually called "text-embedding-ada-002".
embeddings_ada = []
for sentence in sentences:
    response = openai.Embedding.create(
        input=sentence,
        engine="text-embedding-ada-002"
    )
    embeddings_ada.append(response['data'][0]['embedding'])

# Now we can plot the embeddings
chart  = plot_embeddings(sentences, embeddings_ada)
chart.interactive()



In [None]:
# Pairwise cosine similarity between the ada-002 embeddings. Row and column
# labels are the sentence positions in `sentences`. Rebinds `df`, replacing
# the matrix computed for the sentence-transformers embeddings.
df = pd.DataFrame(
    cosine_similarity(embeddings_ada),
    index=range(len(sentences)),
    columns=range(len(sentences)),
)
# Display the full similarity matrix.
df