In [13]:
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

load_dotenv()

True

In [14]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-large", dimensions=32)

documents = [
    "The quick brown fox jumps over the lazy dog.",
    "A journey of a thousand miles begins with a single step.",
    "To be or not to be, that is the question.",
    "All that glitters is not gold.",
    "The only thing we have to fear is fear itself.",
    "In the end, we will remember not the words of our enemies, but the silence of our friends.",
    "The greatest glory in living lies not in never falling, but in rising every time we fall.",
    "The future belongs to those who believe in the beauty of their dreams.",
    "Life is what happens when you're busy making other plans.",
    "The purpose of our lives is to be happy."]

In [16]:
query = "What is the purpose of life?"

document_embeddings = embeddings.embed_documents(documents)
query_embedding = embeddings.embed_query(query)

In [30]:
scores = cosine_similarity(
    [query_embedding],
    document_embeddings 
)[0]

indexs, score = sorted(list(enumerate(scores)), key=lambda x: x[1])[-1]
print(f"Query: {query}")
print(documents[indexs])
print(f"Score: {score}")

Query: What is the purpose of life?
The purpose of our lives is to be happy.
Score: 0.6772683836862361


In [24]:
scores = cosine_similarity(
    [query_embedding],
    document_embeddings 
)[0]
scores = list(enumerate(scores))
scores = sorted(scores, key=lambda x: x[1], reverse=True)
print("Top 3 most similar documents:")
for i, score in scores[:3]:
    print(f"Document: {documents[i]}\nScore: {score}\n")

Top 3 most similar documents:
Document: The purpose of our lives is to be happy.
Score: 0.6772683836862361

Document: To be or not to be, that is the question.
Score: 0.5383528423622682

Document: The only thing we have to fear is fear itself.
Score: 0.37163735462843417



In [19]:
cosine_similarities = cosine_similarity(
    np.array(query_embedding).reshape(1, -1),
    np.array(document_embeddings)
)
similarities = cosine_similarities.flatten()
similarity_scores = list(zip(documents, similarities))
similarity_scores.sort(key=lambda x: x[1], reverse=True)    
print("Top 3 most similar documents:")
for doc, score in similarity_scores[:3]:
    print(f"Document: {doc}\nScore: {score:.4f}\n")
print("Top 3 least similar documents:")


Top 3 most similar documents:
Document: The purpose of our lives is to be happy.
Score: 0.6773

Document: To be or not to be, that is the question.
Score: 0.5384

Document: The only thing we have to fear is fear itself.
Score: 0.3716

Top 3 least similar documents:
