In [None]:
#Semantic Search App

In [1]:
import os
from openai import OpenAI
import pandas as pd
import numpy as np
from typing import List, Optional
from scipy import spatial
import csv

In [2]:
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "<your key>"))

In [3]:
#dataset

text = """
Word
Pen
Pencil
Car
Auto
Vehicle
"""

words_list = text.split()

with open('dataset.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=' ', quotechar=' ', quoting=csv.QUOTE_MINIMAL)
    for word in words_list:
        spamwriter.writerow(word)

In [4]:
words_dataframe = pd.read_csv("dataset.csv")
words_dataframe

Unnamed: 0,W o r d
0,P e n
1,P e n c i l
2,C a r
3,A u t o
4,V e h i c l e


In [5]:
def get_embedding(text):
    embedding_response = client.embeddings.create(
        input = [text], 
        model="text-embedding-3-small"
    )
    return embedding_response.data[0].embedding

In [6]:
key = words_dataframe.keys()[0]
words_dataframe["embedding"] = words_dataframe[key].apply(lambda w: get_embedding(w))

words_dataframe

Unnamed: 0,W o r d,embedding
0,P e n,"[0.05212598666548729, 0.005622430704534054, -0..."
1,P e n c i l,"[0.06025363504886627, 0.01388348825275898, -0...."
2,C a r,"[0.08718986809253693, 0.015374479815363884, 0...."
3,A u t o,"[-0.021905595436692238, 0.0021016716491431, -0..."
4,V e h i c l e,"[0.03211624547839165, -0.031236981973052025, 0..."


In [7]:
words_dataframe.to_csv("dataset_embeddings.csv", index=False)

In [8]:
df = pd.read_csv("dataset_embeddings.csv")
df

Unnamed: 0,W o r d,embedding
0,P e n,"[0.05212598666548729, 0.005622430704534054, -0..."
1,P e n c i l,"[0.06025363504886627, 0.01388348825275898, -0...."
2,C a r,"[0.08718986809253693, 0.015374479815363884, 0...."
3,A u t o,"[-0.021905595436692238, 0.0021016716491431, -0..."
4,V e h i c l e,"[0.03211624547839165, -0.031236981973052025, 0..."


In [9]:
df["embedding"] = df["embedding"].apply(eval).apply(np.array)
df

Unnamed: 0,W o r d,embedding
0,P e n,"[0.05212598666548729, 0.005622430704534054, -0..."
1,P e n c i l,"[0.06025363504886627, 0.01388348825275898, -0...."
2,C a r,"[0.08718986809253693, 0.015374479815363884, 0...."
3,A u t o,"[-0.021905595436692238, 0.0021016716491431, -0..."
4,V e h i c l e,"[0.03211624547839165, -0.031236981973052025, 0..."


In [10]:
search_word = input("Enter a a word to search: ")
search_vector = get_embedding(search_word)
print(search_vector)

Enter a a word to search:  cat


[0.025525547564029694, -0.023443017154932022, -0.016090169548988342, 0.03939357399940491, 0.020976554602384567, -0.02636321447789669, 0.0018542088801041245, 0.030621349811553955, -0.015950558707118034, 0.005302309989929199, 0.02216324955224991, -0.00016297042020596564, 0.010424288921058178, 0.003088893834501505, 0.029853489249944687, 0.006329032592475414, -0.021383754909038544, -0.01069769449532032, -0.030528275296092033, 0.05756627768278122, 0.03408835828304291, 0.04581568390130997, 0.020487917587161064, -0.04663008078932762, -0.006846757140010595, 0.038044001907110214, -0.009278315119445324, 0.04402400925755501, 0.05179568752646446, -0.013495732098817825, 0.003350664395838976, -0.04316307231783867, -0.01131430920213461, -0.029062360525131226, -0.022942744195461273, 0.01775386743247509, 0.017544452100992203, -0.028061814606189728, -0.015741141512989998, 0.013763319700956345, -0.03725287318229675, -0.00877804309129715, 0.045792412012815475, 0.011407383717596531, 0.009464463219046593, -

In [11]:
def distances_from_embeddings(
    query_embedding: List[float],
    embeddings: List[List[float]],
    distance_metric="cosine",
) -> List[List]:
    distance_metrics = {
        "cosine": spatial.distance.cosine,
        "L1": spatial.distance.cityblock,
        "L2": spatial.distance.euclidean,
        "Linf": spatial.distance.chebyshev,
    }
    distances = [
        distance_metrics[distance_metric](query_embedding, embedding)
        for embedding in embeddings
    ]
    return distances

In [13]:
distances_result = distances_from_embeddings(
    search_vector,
    df["embedding"],
    "cosine",
)

distances_result

[0.7566929070465483,
 0.7700464029284367,
 0.6729896873706137,
 0.7568889530301204,
 0.7837764418407377]

In [14]:
distances_result = distances_from_embeddings(
    search_vector,
    df["embedding"],
    "L1"
)

distances_result

[38.107831702901876,
 38.863292441057794,
 35.922479439162544,
 37.64340366337501,
 39.19055610107307]

In [None]:
distances_result = distances_from_embeddings(
    search_vector,
    df["embedding"],
    "L1"
)

distances_result