In [5]:
import os
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
import warnings
warnings.filterwarnings('ignore')

# Load variables from a local .env file (if one exists) into os.environ.
load_dotenv()

# Fail fast with a readable message when the key is missing. Re-assigning
# os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY') does nothing when
# the key is set and raises an opaque "TypeError: str expected, not NoneType"
# when it is not, so check for it explicitly instead.
if not os.getenv('OPENAI_API_KEY'):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

# Shared embedding client used by the cells below.
embedding_model = OpenAIEmbeddings(model='text-embedding-3-large')

In [6]:
# Embed one piece of text and inspect the resulting vector.
text = "Hello, I am learning embeddings"
embedding_txt = embedding_model.embed_query(text)

# One print call, one line per field; len() here is the vector's dimensionality.
print(
    f"input text: {text}",
    f"embedding length: {len(embedding_txt)}",
    f"embeddings: {embedding_txt}",
    sep="\n",
)

input text: Hello, I am learning embeddings
embedding length: 3072
embeddings: [-0.03266306221485138, 0.04774260148406029, -0.01857120357453823, -0.0234539695084095, 0.03302474692463875, -0.025290224701166153, 0.019739728420972824, 0.013702348805963993, -0.019211111590266228, 0.009591644629836082, -0.010141129605472088, -0.025916218757629395, -0.03513922169804573, 0.004573245998471975, 0.024052143096923828, 0.020560478791594505, 0.02416343241930008, 0.02057439088821411, -0.0049071102403104305, -0.03269088268280029, 0.021854203194379807, -0.04966232180595398, -0.025137202814221382, -0.005327918566763401, -0.0031925777439028025, 0.004813211038708687, 0.008437030017375946, 0.013535416685044765, -0.03703112155199051, 0.024052143096923828, -0.002803069306537509, -0.033386435359716415, 0.01983710564672947, 0.0048201666213572025, 0.010968835093080997, 0.00445500249043107, 0.04201126471161842, 0.013945791870355606, -0.03483317792415619, 0.017402678728103638, 0.03653032332658768, 0.051164709031

### Embed Documents

In [9]:
# Example 1: Finding similar sentences
# Two sentences about a cat, one about a dog, two about Python programming.
sentences = [
    "The cat sat on the mat",
    "A feline rested on the rug",
    "The dog played in the yard",
    "I love programming in Python",
    "Python is my favorite programming language",
]

# embed_documents takes the whole list in one call; embedding_sent[i] is the
# vector for sentences[i].
embedding_sent = embedding_model.embed_documents(sentences)

# NOTE: len(embedding_sent) is the number of vectors (one per sentence), not
# the dimensionality of each vector, despite the "embedding length" label.
print(
    f"input text: {sentences}",
    f"embedding length: {len(embedding_sent)}",
    f"embeddings: {embedding_sent}",
    sep="\n",
)

input text: ['The cat sat on the mat', 'A feline rested on the rug', 'The dog played in the yard', 'I love programming in Python', 'Python is my favorite programming language']
embedding length: 5
embeddings: [[-0.026963824406266212, -0.001286483253352344, -0.003552714828401804, 0.026718435809016228, 0.03083229623734951, -0.012406534515321255, -0.005571753717958927, 0.0067084780894219875, 0.037587687373161316, 0.014073730446398258, -0.006798694375902414, -0.008494758978486061, -0.05459164083003998, -0.035047195851802826, 0.00621048454195261, 0.011756977997720242, -0.03319956734776497, 0.03680821880698204, 0.010226910933852196, -0.021680761128664017, -0.015502755530178547, -0.0376165546476841, -0.004478333052247763, 0.009303096681833267, -0.01592135801911354, -0.02683391235768795, -0.012817920185625553, 0.016036834567785263, -0.002378099597990513, 0.025635840371251106, 0.016787433996796608, 0.03435433655977249, -0.006621870677918196, 0.0010898119071498513, -0.01015473809093237, -0.00335

In [8]:
# Different OpenAI embedding models
# Each row: (model name, dimensions, description, cost per 1M tokens, use case).
models_comparison = {
    name: {
        "dimensions": dims,
        "description": summary,
        "cost_per_1m_tokens": cost,
        "use_case": use_case,
    }
    for name, dims, summary, cost, use_case in (
        (
            "text-embedding-3-small",
            1536,
            "Good balance of performance and cost",
            0.02,
            "General purpose, cost-effective",
        ),
        (
            "text-embedding-3-large",
            3072,
            "Highest quality embeddings",
            0.13,
            "When accuracy is critical",
        ),
        (
            "text-embedding-ada-002",
            1536,
            "Previous generation model",
            0.10,
            "Legacy applications",
        ),
    )
}

# Display comparison: one stanza per model, separated by a blank line.
print("📊 OpenAI Embedding Models Comparison:\n")
for model_name, details in models_comparison.items():
    print(
        f"Model: {model_name}",
        f"🔑 Dimensions: {details['dimensions']}",
        f"💰 Cost: ${details['cost_per_1m_tokens']}/1M tokens",
        f"📝 Description: {details['description']}",
        f"🎯 Use case: {details['use_case']}\n",
        sep="\n",
    )


📊 OpenAI Embedding Models Comparison:

Model: text-embedding-3-small
🔑 Dimensions: 1536
💰 Cost: $0.02/1M tokens
📝 Description: Good balance of performance and cost
🎯 Use case: General purpose, cost-effective

Model: text-embedding-3-large
🔑 Dimensions: 3072
💰 Cost: $0.13/1M tokens
📝 Description: Highest quality embeddings
🎯 Use case: When accuracy is critical

Model: text-embedding-ada-002
🔑 Dimensions: 1536
💰 Cost: $0.1/1M tokens
📝 Description: Previous generation model
🎯 Use case: Legacy applications



### Cosine Similarity with OpenAI Embeddings

In [11]:
import numpy as np

def cosine_similarity(v1, v2):
    """Return the cosine of the angle between two vectors.

    Args:
        v1: First vector (any 1-D sequence of numbers accepted by numpy).
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (||v1|| * ||v2||)``. If either vector has zero norm
        the cosine is undefined; ``0.0`` is returned in that case instead of
        the NaN that a 0/0 division would produce.
    """
    dot_product = np.dot(v1, v2)
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)

    # A zero-length vector has no direction. Without this guard the division
    # yields NaN, and numpy's warning may be suppressed by a warnings filter.
    if denominator == 0:
        return 0.0

    return dot_product / denominator

In [20]:
sentences = [
    "The cat sat on the mat",
    "A feline rested on the rug",
    "The dog played in the yard",
    "I love programming in Python",
    "Python is my favorite programming language",
]

# Score every ordered pair of sentences. This includes each sentence paired
# with itself and both (a, b) and (b, a). embedding_sent[i] is expected to be
# the vector for sentences[i].
for i, first in enumerate(sentences):
    for j, second in enumerate(sentences):
        result = cosine_similarity(embedding_sent[i], embedding_sent[j])
        print(
            f"text 1: {first}",
            f"text 2: {second}",
            f"similarity score: {result}",
            "_" * 80,
            sep="\n",
        )

text 1: The cat sat on the mat
text 2: The cat sat on the mat
similarity score: 1.0
________________________________________________________________________________
text 1: The cat sat on the mat
text 2: A feline rested on the rug
similarity score: 0.530197739961889
________________________________________________________________________________
text 1: The cat sat on the mat
text 2: The dog played in the yard
similarity score: 0.33271521312995117
________________________________________________________________________________
text 1: The cat sat on the mat
text 2: I love programming in Python
similarity score: 0.11049137767293883
________________________________________________________________________________
text 1: The cat sat on the mat
text 2: Python is my favorite programming language
similarity score: 0.1505469502228837
________________________________________________________________________________
text 1: A feline rested on the rug
text 2: The cat sat on the mat
similarity sco

### Example 2: Semantic Search

In [41]:
import numpy as np

def cosine_similarity2(v1, v2):
    """Return the cosine of the angle between two vectors.

    Same formula as ``cosine_similarity``; this name is kept so existing
    references keep working.

    Args:
        v1: First vector (any 1-D sequence of numbers accepted by numpy).
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (||v1|| * ||v2||)``. If either vector has zero norm
        the cosine is undefined; ``0.0`` is returned in that case instead of
        the NaN that a 0/0 division would produce.
    """
    dot_product = np.dot(v1, v2)
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)

    # A zero-length vector has no direction. Without this guard the division
    # yields NaN, which would also poison any ranking built on the score.
    if denominator == 0:
        return 0.0

    return dot_product / denominator

def semantic_search(query, documents, embedding_model, top_k=3):
    """Rank ``documents`` by cosine similarity to ``query``.

    Args:
        query: Text to search for.
        documents: Sequence of documents to rank. It is passed as-is to
            ``embedding_model.embed_documents``.
        embedding_model: Object exposing ``embed_query(text)`` and
            ``embed_documents(list_of_texts)``, each returning numeric vectors.
        top_k: Maximum number of results to return. ``None`` returns every
            document. Defaults to 3.

    Returns:
        A list of ``(score, document)`` tuples, highest score first. Documents
        with equal scores keep their input order. A document or query whose
        embedding has zero norm gets a score of ``0.0``.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    # A negative slice bound would silently drop results from the end instead
    # of limiting the count, so reject it explicitly.
    if top_k is not None and top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    # Nothing to rank: skip the embedding calls entirely.
    if len(documents) == 0:
        return []

    query_embedding = embedding_model.embed_query(query)
    doc_embedding = embedding_model.embed_documents(documents)

    # The query norm is the same for every document, so compute it once.
    query_norm = np.linalg.norm(query_embedding)

    similarity = []
    for i, document in enumerate(documents):
        denominator = query_norm * np.linalg.norm(doc_embedding[i])
        if denominator == 0:
            # Cosine is undefined for a zero vector; score it 0.0 rather than
            # NaN so the ordering below stays well-defined.
            score = 0.0
        else:
            score = np.dot(query_embedding, doc_embedding[i]) / denominator
        similarity.append((score, document))

    # Sort on the score only. Sorting the raw tuples would compare the
    # documents themselves whenever two scores tie, which fails for
    # unorderable documents and reorders ties arbitrarily.
    similarity.sort(key=lambda pair: pair[0], reverse=True)

    return similarity[:top_k]

### Call 1

In [43]:
# Example - Semantic Search: rank a small corpus against a LangChain question.
documents = [
    "LangChain is a framework for developing applications powered by language models",
    "Python is a high-level programming language",
    "Machine learning is a subset of artificial intelligence",
    "Embeddings convert text into numerical vectors",
    "The weather today is sunny and warm",
]

query = "What is Langchain?"
embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")

# Keep only the three best matches.
result = semantic_search(
    query=query,
    documents=documents,
    embedding_model=embedding_model,
    top_k=3,
)

# One line per hit, score rounded to three decimals.
for score, doc in result:
    print(f"score: {score:.3f} | {doc}")

score: 0.628 | LangChain is a framework for developing applications powered by language models
score: 0.127 | Embeddings convert text into numerical vectors
score: 0.120 | Machine learning is a subset of artificial intelligence


### Call 2

In [None]:
# Second query against the same `documents` and `embedding_model` as Call 1.
query = "What is Embeddings?"

result = semantic_search(
    query=query,
    documents=documents,
    embedding_model=embedding_model,
    top_k=3,
)

# One line per hit, score rounded to three decimals.
for score, doc in result:
    print(f"score: {score:.3f} | {doc}")

score: 0.631 | Embeddings convert text into numerical vectors
score: 0.218 | LangChain is a framework for developing applications powered by language models
score: 0.193 | Machine learning is a subset of artificial intelligence
