## open ai embedding

In [1]:
import os
import numpy as np
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
load_dotenv()
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

In [3]:
embeddings=OpenAIEmbeddings(model="text-embedding-3-small")

In [4]:
## single text embedding
text = "Hello, I am Learning about embedding"
text_embedding = embeddings.embed_query(text)
print(f"Text: {text}")
print(f"Embedding length: {len(text_embedding)}")
print(text_embedding)

Text: Hello, I am Learning about embedding
Embedding length: 1536
[-0.0004855790757574141, -0.030097153037786484, 0.035780616104602814, 0.0005166386254131794, 0.0035189171321690083, -0.0007336181006394327, 0.03093707375228405, 0.0052810003980994225, -0.04319991171360016, 0.03751645237207413, 0.017932303249835968, -0.02067604474723339, -0.02171194553375244, -0.056526653468608856, 0.0077902632765471935, 0.02476365678012371, 0.0005980059504508972, 0.02838931418955326, 0.02574356459081173, 0.07777664065361023, 0.02791335992515087, -0.014768603257834911, -0.03552863746881485, 0.06025030091404915, 0.006866350304335356, -0.06836953014135361, 0.03421276435256004, 0.06086624413728714, -0.005120015703141689, -0.0032669410575181246, 0.01646244339644909, -0.031665004789829254, -0.05294299125671387, -0.011261934414505959, -0.013991676270961761, 0.012052860110998154, 0.0052740010432899, 0.032140959054231644, -0.023447781801223755, 0.03396078571677208, 0.03519267216324806, -0.014390639029443264, -0.0

In [5]:
## multiple embedding
sentences = [
    "Hello, I am Learning about embedding",
    "Machine learning is fascinating",
    "Python is great for data science",
    "I love coding in python",
    "Embeddings convert text to vectors"
]

sentences_embedding = embeddings.embed_documents(sentences)
print(f"Text: {sentences}")
print(f"Embedding length: {len(sentences_embedding)}")
for i in range(len(sentences_embedding)):
    print(sentences_embedding[i])


Text: ['Hello, I am Learning about embedding', 'Machine learning is fascinating', 'Python is great for data science', 'I love coding in python', 'Embeddings convert text to vectors']
Embedding length: 5
[-0.0004731746739707887, -0.030087262392044067, 0.035740867257118225, 0.0005138449487276375, 0.003496769582852721, -0.0007600532262586057, 0.03101086989045143, 0.005328244064003229, -0.043213702738285065, 0.037476133555173874, 0.017954397946596146, -0.02068324387073517, -0.021690817549824715, -0.05650807544589043, 0.007801697123795748, 0.02474152483046055, 0.0006240482907742262, 0.028393978253006935, 0.025735104456543922, 0.07775108516216278, 0.02790418639779091, -0.01479173731058836, -0.03551696240901947, 0.06028647720813751, 0.0067871264182031155, -0.06834706664085388, 0.034201521426439285, 0.06090221554040909, -0.005079849157482386, -0.003246625419706106, 0.016415050253272057, -0.031682588160037994, -0.052897606045007706, -0.011265230365097523, -0.013980081304907799, 0.01204190216958

## Cosine Similairty

In [6]:
## Cosine Similarity
def cosine_similarity(vec1, vec2):
    dot_product=np.dot(vec1, vec2)
    norm_a=np.linalg.norm(vec1)
    norm_b=np.linalg.norm(vec2)
    return dot_product/(norm_a * norm_b)

In [7]:
## Calculating the similarity
for i in range(len(sentences)):
    for j in range(i+1, len(sentences)):
        similarity = cosine_similarity(sentences_embedding[i], sentences_embedding[j])
        print(f"{sentences[i]} and {sentences[j]} have {similarity:.3f} similarity")

Hello, I am Learning about embedding and Machine learning is fascinating have 0.354 similarity
Hello, I am Learning about embedding and Python is great for data science have 0.182 similarity
Hello, I am Learning about embedding and I love coding in python have 0.285 similarity
Hello, I am Learning about embedding and Embeddings convert text to vectors have 0.468 similarity
Machine learning is fascinating and Python is great for data science have 0.392 similarity
Machine learning is fascinating and I love coding in python have 0.317 similarity
Machine learning is fascinating and Embeddings convert text to vectors have 0.256 similarity
Python is great for data science and I love coding in python have 0.601 similarity
Python is great for data science and Embeddings convert text to vectors have 0.142 similarity
I love coding in python and Embeddings convert text to vectors have 0.139 similarity


## Semantic Search

In [8]:
documents = [
    "Python is a high-level programming language that is widely used in data science, web development, and automation.",
    "Machine learning is a subset of artificial intelligence that focuses on training models from data to make predictions or decisions.",
    "Natural Language Processing (NLP) is a field of AI that focuses on the interaction between computers and human language.",
    "Embeddings are numerical vector representations of text that allow semantic comparisons between words, sentences, or documents.",
    "A neural network is a computing system inspired by the human brain's network of neurons, commonly used in deep learning.",
    "Data science involves collecting, analyzing, and interpreting large amounts of data to extract insights.",
    "Semantic search improves search results by understanding the contextual meaning of queries rather than relying on exact keyword matches.",
    "OpenAI provides APIs for language models, embeddings, and other AI tools to enable advanced natural language understanding.",
    "Version control systems like Git help track changes in code and collaborate efficiently with other developers.",
    "Cloud computing allows users to access computing resources over the internet, providing flexibility, scalability, and cost efficiency."
]
query = "What is Neural Network?"

In [9]:
def semantic_search(query, documnets, embeddings_models, top_k=3):
    # embed query and document
    query_embedding=embeddings_models.embed_query(query)
    doc_embedding = embeddings_models.embed_documents(documents)

    # store similarity values
    similarities=[]

    for i, doc_emb in enumerate(doc_embedding):
        similarity=cosine_similarity(query_embedding, doc_emb)
        similarities.append((similarity, documents[i]))
    
    # sort by similarity
    similarities.sort(reverse=True)
    return similarities[:top_k]

In [10]:
results = semantic_search(query, documents, embeddings)
results

[(np.float64(0.6840187536981414),
  "A neural network is a computing system inspired by the human brain's network of neurons, commonly used in deep learning."),
 (np.float64(0.4025651095930529),
  'Natural Language Processing (NLP) is a field of AI that focuses on the interaction between computers and human language.'),
 (np.float64(0.33420356012284536),
  'Machine learning is a subset of artificial intelligence that focuses on training models from data to make predictions or decisions.')]