### OpenAI's Text Embeddings

In [25]:
# Loading the ENV File and Setting up the API Key
import os
from dotenv import find_dotenv, load_dotenv
# load_dotenv returns a boolean telling whether any variable was set, so report
# the real outcome instead of always claiming the ENV file was loaded.
env_loaded = load_dotenv(find_dotenv(), override=True)
if env_loaded:
    print("ENV File is Loaded!")
else:
    print("ENV File was not found or defined no variables")
print()

# Only report whether the key is present; the key itself is never printed.
api_key = os.environ.get("OPENAI_API_KEY")
if api_key:
    print("API Key loaded successfully")
else:
    print("API Key Not Found, recheck your ENV File")

ENV File is Loaded!

API Key loaded successfully


In [26]:
# Create the OpenAI API client used by the embedding cells below.
# NOTE(review): OpenAI() is called with no arguments, so it presumably picks up
# OPENAI_API_KEY from the environment loaded in the previous cell — confirm.
from openai import OpenAI
client = OpenAI()

In [None]:
text = """This is a random text
OpenAI o3 is a GPT Model developed by OpenAI as a successor to OpenAI o1."""
# The sample text spans two lines on purpose.

text = text.replace("\n", " ")
# After the replace, the text sits on a single line.

# Request an embedding for the text from the "text-embedding-3-small" model.
embedding = client.embeddings.create(
    input=text,
    model="text-embedding-3-small",
)

# Print the whole response object, not just the vector.
print(embedding)

CreateEmbeddingResponse(data=[Embedding(embedding=[0.002379980869591236, 0.018024902790784836, 0.056888289749622345, -0.034199781715869904, 0.001613954664207995, -0.03987833112478256, -0.0008471254841424525, 0.01355400774627924, 0.0023831927683204412, -0.01649606227874756, 0.022033290937542915, -0.014684578403830528, -0.027185099199414253, 0.0034398913849145174, -0.026851067319512367, -0.014273461885750294, -0.00880046933889389, -0.09507076442241669, -0.030756676569581032, 0.010284343734383583, 0.014915832318365574, 0.003764288267120719, 0.01121578086167574, -0.02351074293255806, 0.009333636611700058, -0.012275691144168377, 0.01175537146627903, 0.036152586340904236, 0.001181960804387927, -0.051389601081609726, 0.035253267735242844, -0.016354741528630257, -0.0856664627790451, 0.004021236207336187, -0.014144987799227238, -0.002137486357241869, -0.0008535491651855409, 0.01564813405275345, -0.0022322358563542366, 0.051595158874988556, 0.0022354477550834417, -0.016149181872606277, 0.0061924

In [None]:
print(embedding.data[0].embedding)  # the embedding vector itself: a list of floating-point numbers

[0.002379980869591236, 0.018024902790784836, 0.056888289749622345, -0.034199781715869904, 0.001613954664207995, -0.03987833112478256, -0.0008471254841424525, 0.01355400774627924, 0.0023831927683204412, -0.01649606227874756, 0.022033290937542915, -0.014684578403830528, -0.027185099199414253, 0.0034398913849145174, -0.026851067319512367, -0.014273461885750294, -0.00880046933889389, -0.09507076442241669, -0.030756676569581032, 0.010284343734383583, 0.014915832318365574, 0.003764288267120719, 0.01121578086167574, -0.02351074293255806, 0.009333636611700058, -0.012275691144168377, 0.01175537146627903, 0.036152586340904236, 0.001181960804387927, -0.051389601081609726, 0.035253267735242844, -0.016354741528630257, -0.0856664627790451, 0.004021236207336187, -0.014144987799227238, -0.002137486357241869, -0.0008535491651855409, 0.01564813405275345, -0.0022322358563542366, 0.051595158874988556, 0.0022354477550834417, -0.016149181872606277, 0.006192447151988745, 0.036101195961236954, -0.012314232997

### Embedding the Dataset for Similarity Searches

In [None]:
import numpy as np
import pandas as pd

df = pd.read_csv("words.csv")
# Shuffle the rows. The fixed seed keeps the shuffled order (and every output
# that depends on it) identical on each Restart & Run All.
df = df.sample(frac=1, random_state=42)
df

Unnamed: 0,text
24,sandwich
0,fox
28,bear
7,hare
14,hamster
40,water
17,wolf
6,rabbit
11,deer
10,cappuccino


### Estimating Embedding Costs With Tiktoken

In [37]:
# Pull the "text" column out as a plain Python list so each word can be tokenized below.
words = list(df['text'])
words


['sandwich',
 'fox',
 'bear',
 'hare',
 'hamster',
 'water',
 'wolf',
 'rabbit',
 'deer',
 'cappuccino',
 'raccoon',
 'guinea pig',
 'badger',
 'burger',
 'blue',
 'coyote',
 'purple',
 'lizard',
 'salad',
 'dog',
 'bird',
 'yellow',
 'pizza',
 'brown',
 'soda',
 'milk',
 'red',
 'gray',
 'coffee',
 'turtle',
 'cat',
 'squirrel',
 'snake',
 'tea',
 'orange',
 'pasta',
 'fish',
 'opossum',
 'ferret',
 'black',
 'green',
 'white']

In [48]:
import tiktoken
import pandas as pd

# Look up the tokenizer that tiktoken associates with the embedding model.
enc = tiktoken.encoding_for_model("text-embedding-3-small")

# Add up the number of tokens each word encodes to.
total_tokens = sum(len(enc.encode(word)) for word in words)
print(f"Total Tokens: {total_tokens}")

# The total token count is what the embedding price is based on.
# At the time of writing, 'text-embedding-3-small' costs $0.020 per 1M tokens.

Total Tokens: 62


In [56]:
# Cost calculation: price of one token multiplied by the number of tokens.
# 0.020 is the USD price per 1,000,000 tokens, so the division gives the price of a single token.
per_token = 0.020/1_000_000
print(f"It will cost ${per_token:.10f} per token")

# total_tokens is the token count computed in the tiktoken cell.
cost = per_token * total_tokens
print(f"Total Cost for calculating our total tokens will be: ${cost:.10f}")

It will cost $0.0000000200 per token
Total Cost for calculating our total tokens will be: $0.0000012400


In [None]:
# Helper used by the cells below to embed one piece of text per call.
def get_embedding(text:str,
                  model:str="text-embedding-3-small",
                  client=None):
    """Return the embedding vector for ``text``.

    Parameters:
    text (str): Text to embed. Newlines are replaced with spaces before the request.
    model (str): Name of the embedding model to call. The default,
        "text-embedding-3-small", produces a 1536-dimension float vector.
    client: Optional object exposing ``embeddings.create(input=..., model=...)``.
        When omitted, a single ``OpenAI()`` client is created on first use and
        reused by later calls, instead of building a new client for every text.

    Returns:
    The ``embedding`` attribute of the first item in the response's ``data``
    (a list of floats when the OpenAI client is used).
    """
    if client is None:
        # Cache the default client on the function object so that embedding a
        # whole column does not construct one client per row. Re-running this
        # cell redefines the function and therefore resets the cache.
        client = getattr(get_embedding, "_shared_client", None)
        if client is None:
            client = OpenAI()
            get_embedding._shared_client = client

    text = text.replace("\n", " ")
    response = client.embeddings.create(
        input=text,
        model=model
    )
    return response.data[0].embedding

In [30]:
# Embed every word (one get_embedding call per row) and keep the vector next to its text.
df["embedding"] = df["text"].apply(get_embedding)
df

Unnamed: 0,text,embedding
24,sandwich,"[-0.005681606009602547, -0.04887460917234421, ..."
0,fox,"[-0.03349510580301285, 0.0016073230654001236, ..."
28,bear,"[0.046153146773576736, -0.01701897382736206, -..."
7,hare,"[0.018302442505955696, -0.022041788324713707, ..."
14,hamster,"[0.03870227187871933, -0.027436034753918648, 0..."
40,water,"[0.0030297308694571257, 0.017433997243642807, ..."
17,wolf,"[-0.0030021839775145054, -0.026638340204954147..."
6,rabbit,"[0.006233165040612221, -0.0157419815659523, 9...."
11,deer,"[0.060789555311203, -0.03292562812566757, 0.00..."
10,cappuccino,"[-0.02482164278626442, -0.030885322019457817, ..."


In [None]:
# Save the embeddings to disk so they can be reloaded later instead of paying to recompute them.
df.to_csv("words_embedding.csv", index=False)

### Performing Semantic Searches

In [65]:
import pandas as pd
# Reload the embeddings saved to "words_embedding.csv" instead of recomputing them.
df = pd.read_csv("words_embedding.csv")
df

Unnamed: 0,text,embedding
0,sandwich,"[-0.005681606009602547, -0.04887460917234421, ..."
1,fox,"[-0.03349510580301285, 0.0016073230654001236, ..."
2,bear,"[0.046153146773576736, -0.01701897382736206, -..."
3,hare,"[0.018302442505955696, -0.022041788324713707, ..."
4,hamster,"[0.03870227187871933, -0.027436034753918648, 0..."
5,water,"[0.0030297308694571257, 0.017433997243642807, ..."
6,wolf,"[-0.0030021839775145054, -0.026638340204954147..."
7,rabbit,"[0.006233165040612221, -0.0157419815659523, 9...."
8,deer,"[0.060789555311203, -0.03292562812566757, 0.00..."
9,cappuccino,"[-0.02482164278626442, -0.030885322019457817, ..."


In [66]:
# Convert the embedding column (stored in the CSV as text such as "[0.1, -0.2, ...]")
# into NumPy arrays.
import ast


def _parse_embedding(value):
    """Turn one stored embedding into a NumPy array.

    Strings are parsed with ast.literal_eval, which only accepts Python
    literals. This replaces the previous eval() call, which would execute any
    code found in the file. Values that are not strings (for example arrays
    left over from an earlier run of this cell) are passed through, so the
    cell can be re-run safely.
    """
    if isinstance(value, str):
        return np.array(ast.literal_eval(value))
    return np.asarray(value)


df["embedding"] = df["embedding"].apply(_parse_embedding)

In [67]:
# Embed the query with the same helper used for the dataset, so the query and
# the stored words are embedded the same way.
search_term = 'red'
search_term_vector = get_embedding(search_term)

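The three main similarity metrics in semantic search:

- Cosine Similarity: measures the angle between two vectors, ignoring their magnitudes. Range: -1 to +1 (higher means more similar). Used by: OpenAI, Pinecone, Chroma, FAISS.

- Dot Product Similarity: measures raw alignment with no normalization, so vector magnitude affects the score. Range: -∞ to +∞ (higher means more similar). Used by: Google Vertex AI, some Hugging Face APIs.

- Euclidean Distance (L2 Distance): measures the straight-line distance between two vectors. Range: 0 to ∞ (lower means more similar).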

In [68]:
import numpy as np

def cosine_similarity(vector_x, vector_y):
    """
    Return the cosine of the angle between two 1-D vectors.

    Parameters:
    vector_x (array-like): First vector.
    vector_y (array-like): Second vector; must have as many elements as vector_x.

    Returns:
    float: The dot product of the vectors divided by the product of their norms.

    Raises:
    ValueError: If either input is not one-dimensional, if the lengths differ,
        or if either vector has zero norm.
    """
    first, second = np.array(vector_x), np.array(vector_y)

    # Guard clauses: reject anything that is not a pair of equal-length 1-D vectors.
    if not (first.ndim == 1 and second.ndim == 1):
        raise ValueError("Both vectors must be one-dimensional.")
    if len(first) != len(second):
        raise ValueError("Vectors must be of the same dimensions.")

    numerator = np.dot(first, second)
    magnitude_first = np.linalg.norm(first)
    magnitude_second = np.linalg.norm(second)

    # A zero-magnitude vector has no direction, so the ratio below would be undefined.
    if not (magnitude_first != 0 and magnitude_second != 0):
        raise ValueError("One of the vectors is zero; cosine similarity is not defined.")

    return numerator / (magnitude_first * magnitude_second)

In [69]:
# Score every row's embedding against the search term with cosine_similarity,
# then show the 10 rows with the highest scores.
df["similarities"] = df["embedding"].apply(lambda x: cosine_similarity(x, search_term_vector))
df.sort_values("similarities", ascending=False).head(10)

Unnamed: 0,text,embedding,similarities
26,red,"[-0.02211996167898178, -0.010933708399534225, ...",0.999999
14,blue,"[-0.0011275681899860501, -0.016529185697436333...",0.641011
16,purple,"[0.02476203814148903, -0.03475233167409897, -0...",0.565341
41,white,"[0.0032487320713698864, -0.02826567552983761, ...",0.547562
34,orange,"[-0.02592204324901104, -0.00554656470194459, -...",0.544668
8,deer,"[0.060789555311203, -0.03292562812566757, 0.00...",0.504744
27,gray,"[0.004851989448070526, -0.02022680640220642, -...",0.504005
39,black,"[0.011193670332431793, -0.012095698155462742, ...",0.485174
23,brown,"[-0.0307017732411623, -0.007146547082811594, 0...",0.481941
1,fox,"[-0.03349510580301285, 0.0016073230654001236, ...",0.473315
