##### OpenAI Embeddings

In [1]:
from langchain_openai import OpenAIEmbeddings

In [3]:
import os
from dotenv import load_dotenv

In [5]:
# Pull variables from a local .env file into the process environment.
load_dotenv()

# os.getenv returns None when the variable is absent, and assigning None into
# os.environ raises an unhelpful TypeError. Check explicitly and fail with an
# actionable message instead. When the key is present it already lives in
# os.environ, so no re-assignment is needed.
if not os.getenv('OPENAI_API_KEY'):
    raise RuntimeError(
        'OPENAI_API_KEY is not set. Add it to your .env file or export it '
        'in the environment before running this notebook.'
    )

In [6]:
# Shared embedding client for the cells below (text-embedding-3-small model)
embeddings = OpenAIEmbeddings(model='text-embedding-3-small')

In [9]:
# Embed a single sentence with embed_query, then inspect the vector:
# first its length, then the raw values.
single_text = "Langchain and RAG are amazing framwork and projects to work on"
embedding = embeddings.embed_query(single_text)

print(f'Dimension: {len(embedding)}')
print(embedding)

Dimension: 1536
[-0.03409690782427788, -0.020163342356681824, 0.009740980342030525, -0.003601590171456337, 0.009796603582799435, -0.033512864261865616, 0.004119579680263996, -0.006135913543403149, -0.015435386449098587, -0.0350981205701828, 0.021859845146536827, 0.01695111393928528, -0.024529749527573586, 0.04772454500198364, 0.008732813410460949, -0.013300852850079536, 0.004435935523360968, -0.05092287063598633, 0.03796270862221718, 0.07230991870164871, -0.03234478458762169, 0.0011411409359425306, 0.005037359427660704, 0.033095695078372955, -0.01743781566619873, -0.02123408578336239, -0.001485308283008635, 0.05428806319832802, -0.0025377999991178513, -0.021247992292046547, 0.0004919160273857415, -0.03345724195241928, 0.0035911607556045055, 0.020844724029302597, -0.008864917792379856, 0.022374358028173447, -0.0035807315725833178, -0.014906967990100384, -0.013092267327010632, 0.02458537183701992, 0.021609541028738022, 0.019328996539115906, 0.0031374855898320675, 0.0030314542818814516, 0

In [10]:
# Batch input: several sentences that will be embedded in one call
multiple_texts = [
    "Python is a programming language",
    "Langchain is a framwork for LLM applications",
    "Embeddings convert text to numbers",
    "Vectors can be compared for similarity",
]

In [12]:
# embed_documents takes the whole list and returns one vector per input text
multiple_embedding = embeddings.embed_documents(multiple_texts)

# Show every vector first, then only the vector of the first sentence
print(multiple_embedding, multiple_embedding[0], sep='\n')

[[-0.010949756018817425, -0.02036002092063427, 0.01881033554673195, -0.0028623314574360847, 0.015659986063838005, -0.0266709141433239, 0.0005158186540938914, 0.037233248353004456, -0.0016529136337339878, 0.01300920732319355, 0.02157326228916645, -0.024713415652513504, -0.009456144645810127, 0.0018568197265267372, 0.003955777734518051, 0.01548666600137949, -0.032951220870018005, 0.02979067713022232, -0.02720107138156891, 0.010378818958997726, -0.0014171472284942865, -0.00994042120873928, -0.05391276627779007, 0.01541529968380928, 0.036866217851638794, -0.042861055582761765, 0.005500366445630789, 0.03629527986049652, -0.019462835043668747, 0.0010934462770819664, 0.012958231382071972, -0.03233950212597847, -0.03645840659737587, 0.051261987537145615, -0.03117723949253559, -0.04506324231624603, 0.04583808407187462, -0.010434893891215324, 0.06839010119438171, -0.01505846343934536, 0.004029694013297558, -0.03914996609091759, 0.03134036436676979, -0.00037818204145878553, -0.0020530791953206062

In [13]:
# Display the embedding vector of the first text as the cell's output
multiple_embedding[0]

[-0.010949756018817425,
 -0.02036002092063427,
 0.01881033554673195,
 -0.0028623314574360847,
 0.015659986063838005,
 -0.0266709141433239,
 0.0005158186540938914,
 0.037233248353004456,
 -0.0016529136337339878,
 0.01300920732319355,
 0.02157326228916645,
 -0.024713415652513504,
 -0.009456144645810127,
 0.0018568197265267372,
 0.003955777734518051,
 0.01548666600137949,
 -0.032951220870018005,
 0.02979067713022232,
 -0.02720107138156891,
 0.010378818958997726,
 -0.0014171472284942865,
 -0.00994042120873928,
 -0.05391276627779007,
 0.01541529968380928,
 0.036866217851638794,
 -0.042861055582761765,
 0.005500366445630789,
 0.03629527986049652,
 -0.019462835043668747,
 0.0010934462770819664,
 0.012958231382071972,
 -0.03233950212597847,
 -0.03645840659737587,
 0.051261987537145615,
 -0.03117723949253559,
 -0.04506324231624603,
 0.04583808407187462,
 -0.010434893891215324,
 0.06839010119438171,
 -0.01505846343934536,
 0.004029694013297558,
 -0.03914996609091759,
 0.03134036436676979,
 -0.00

##### Cosine Similarity with OpenAI Embeddings

In [15]:
import numpy as np

In [16]:
def cosine_similarity(vec1,vec2):
    '''Return the cosine similarity between two vectors.

    Computed as ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``.

    Args:
        vec1: First vector (any 1-D sequence of numbers accepted by numpy).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The similarity as a numpy float in ``[-1, 1]``. If either vector has
        zero length (norm 0) the similarity is undefined; ``0.0`` is returned
        instead of ``nan`` so results stay sortable and comparable.
    '''
    dot_product = np.dot(vec1,vec2)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)

    # Guard against 0/0, which would yield nan and a RuntimeWarning.
    if norm_product == 0:
        return np.float64(0.0)

    return dot_product/norm_product

In [17]:
# Score every unordered pair of sentences exactly once: the inner loop only
# walks the texts that come after position i.
for i, left_text in enumerate(multiple_texts):
    for j, right_text in enumerate(multiple_texts[i+1:], start=i+1):
        similarity = cosine_similarity(multiple_embedding[i],multiple_embedding[j])
        print(f"'{left_text}' vs '{right_text}'\nsimilarity {similarity:.3f}\n")

'Python is a programming language' vs 'Langchain is a framwork for LLM applications'
similarity 0.134

'Python is a programming language' vs 'Embeddings convert text to numbers'
similarity 0.143

'Python is a programming language' vs 'Vectors can be compared for similarity'
similarity 0.147

'Langchain is a framwork for LLM applications' vs 'Embeddings convert text to numbers'
similarity 0.140

'Langchain is a framwork for LLM applications' vs 'Vectors can be compared for similarity'
similarity 0.115

'Embeddings convert text to numbers' vs 'Vectors can be compared for similarity'
similarity 0.257



In [19]:
## Example - Semantic Search --> retrieve the sentences most similar to a query
# Small corpus to search over
documents = [
    "Langchain is a framwork for developing applications powered by language models",
    "Python is a High-Level programming language",
    "Machine learning is a subset of artificial intelligence",
    "Embeddings convert text into numerical vectors",
    "The Weather today is sunny and warm",
]

# Question whose closest documents we want to find
query = "What is Langchain?"

In [23]:
def sementice_search(query,documents,embedding_model,top_k=3):
    '''Rank documents by cosine similarity to a query and return the best matches.

    Args:
        query: Text to search for; embedded via ``embedding_model.embed_query``.
        documents: Texts to rank; embedded in a single call to
            ``embedding_model.embed_documents``.
        embedding_model: Object exposing ``embed_query`` and ``embed_documents``.
        top_k: Maximum number of results to return (default 3).

    Returns:
        A list of ``(similarity, document)`` tuples ordered from most to least
        similar, truncated to ``top_k`` entries. An empty list when
        ``documents`` is empty (no embedding calls are made in that case).
    '''
    documents = list(documents)
    if not documents:
        # Nothing to rank: skip the embedding calls entirely.
        return []

    query_embedding = embedding_model.embed_query(query)
    doc_embeddings = embedding_model.embed_documents(documents)

    # Pair each document with its similarity score to the query
    similarities = [
        (cosine_similarity(query_embedding,doc_emb), doc)
        for doc, doc_emb in zip(documents, doc_embeddings)
    ]

    # Sort on the score only, so a tie never falls back to comparing the
    # documents themselves; tied entries keep their input order.
    similarities.sort(key=lambda pair: pair[0], reverse=True)
    return similarities[:top_k]


In [24]:
# Run the search for the current query and show the raw (score, document) tuples
results = sementice_search(
    query=query,
    documents=documents,
    embedding_model=embeddings,
)
print(results)

[(np.float64(0.5966289544536071), 'Langchain is a framwork for developing applications powered by language models'), (np.float64(0.14669387712873092), 'Python is a High-Level programming language'), (np.float64(0.1010465537332217), 'Embeddings convert text into numerical vectors')]


In [25]:
# One line per hit: score rounded to three decimals, then the document text
for hit in results:
    print('{:.3f} | {}'.format(*hit))

0.597 | Langchain is a framwork for developing applications powered by language models
0.147 | Python is a High-Level programming language
0.101 | Embeddings convert text into numerical vectors


In [26]:
# Ask a different question against the same documents and embedding model
query = 'What is Embeddings?'
results = sementice_search(query,documents,embeddings)

# One line per hit: score rounded to three decimals, then the document text
for hit in results:
    print('{:.3f} | {}'.format(*hit))


0.623 | Embeddings convert text into numerical vectors
0.258 | Langchain is a framwork for developing applications powered by language models
0.252 | Machine learning is a subset of artificial intelligence
