## Creating First Embeddings

In [6]:
from langchain_huggingface import HuggingFaceEmbeddings

# Initialize a simple embedding model (no API key needed).
#
# Model: sentence-transformers/all-MiniLM-L6-v2
#   A sentence-transformers model that maps sentences and paragraphs to a
#   384-dimensional dense vector space; it can be used for tasks such as
#   clustering or semantic search.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Bare last expression: the cell displays the configured model's repr.
embeddings

HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_encode_kwargs={}, multi_process=False, show_progress=False)

In [7]:
# Embed a single piece of text with the query-embedding entry point.
text = "Hello i am learning about embeddings"
embding = embeddings.embed_query(text)

# One print call with a newline separator emits the same three lines as three
# separate prints: the input text, the vector length, and the raw vector.
print(f"text: {text}", f"embedding length: {len(embding)}", embding, sep="\n")

text: Hello i am learning about embeddings
embedding length: 384
[-0.00793467741459608, -0.09170758724212646, -0.004743022844195366, -0.004440774209797382, 0.010724031366407871, 0.0731835812330246, 0.014126702211797237, 0.016582047566771507, 0.0470489040017128, -0.027279920876026154, 0.027608409523963928, 0.060199227184057236, 0.048200931400060654, 0.003347194055095315, -0.05125788226723671, 0.02134580723941326, 0.04995120316743851, 0.07593204826116562, -0.0768774077296257, -0.0082407770678401, -0.029051505029201508, -0.050096023827791214, -0.003639490343630314, -0.0949680283665657, 0.018798956647515297, -0.023366272449493408, -0.02464318834245205, 0.048010218888521194, 0.10514663904905319, -0.05323879420757294, 0.039965711534023285, -0.047687653452157974, -0.016098104417324066, 0.061894550919532776, -0.07510850578546524, 0.11249525845050812, 0.0443122535943985, -0.006981412880122662, -0.08516158908605576, -0.0019317783880978823, 0.02246932126581669, 0.019911324605345726, -0.0291814990

In [8]:
# Embed several sentences in one call; the result holds one vector per sentence.
sentences = [
    "The cat sat on the mat",
    "A feline rested on the rug",
    "The dog played in the yard",
    "I love programming in Python",
    "Python is my favorite programming language",
]
embding2 = embeddings.embed_documents(sentences)

# NOTE: len(embding2) counts the returned vectors (one per sentence), not the
# dimensionality of an individual vector, despite the "embedding length" label.
print(f"text: {sentences}", f"embedding length: {len(embding2)}", embding2, sep="\n")

text: ['The cat sat on the mat', 'A feline rested on the rug', 'The dog played in the yard', 'I love programming in Python', 'Python is my favorite programming language']
embedding length: 5
[[0.1304018646478653, -0.011870092712342739, -0.028117021545767784, 0.05123866721987724, -0.05597447603940964, 0.030191542580723763, 0.0301612988114357, 0.024698395282030106, -0.018370550125837326, 0.05876676365733147, -0.02495318278670311, 0.0601542592048645, 0.039831746369600296, 0.033230509608983994, -0.061311349272727966, -0.049373116344213486, -0.05486350879073143, -0.04007606953382492, 0.056429121643304825, 0.039156582206487656, -0.03473709151148796, -0.013247676193714142, 0.031966209411621094, -0.06349921226501465, -0.06017857789993286, 0.07823451608419418, -0.02830391190946102, -0.04744282737374306, 0.04035929590463638, -0.006630908697843552, -0.0667409598827362, -0.004191359970718622, -0.025311656296253204, 0.053341712802648544, 0.01742810197174549, -0.09792360663414001, 0.0060612857341766

In [11]:
import numpy as np
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    Computed as ``dot(vec1, vec2) / (norm(vec1) * norm(vec2))``.

    Args:
        vec1: First vector (a list or array accepted by ``np.dot``).
        vec2: Second vector, of the same length as ``vec1``.

    Returns:
        The similarity as a NumPy float. When either vector has zero norm the
        cosine is mathematically undefined; ``0.0`` is returned in that case
        instead of dividing by zero (which would produce ``nan`` together with
        a RuntimeWarning).
    """
    dot = np.dot(vec1, vec2)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)

    # Guard the division: a zero-length vector has no direction to compare.
    if norm_product == 0:
        return np.float64(0.0)

    return dot / norm_product


In [16]:
# Score every unordered pair of sentences exactly once: the inner index always
# starts one past the outer index, so no pair repeats and no sentence is
# compared with itself.
for i in range(len(sentences)):
    for j in range(i + 1, len(sentences)):
        similarity = cosine_similarity(embding2[i], embding2[j])
        # Single print: the pair on one line, the score on the next, then a blank line.
        print(f"{sentences[i]} vs {sentences[j]}\nSimilarity: {similarity} \n")

The cat sat on the mat vs A feline rested on the rug
Similarity: 0.5643377313623638 

The cat sat on the mat vs The dog played in the yard
Similarity: 0.18758601860648283 

The cat sat on the mat vs I love programming in Python
Similarity: 0.020349816875755627 

The cat sat on the mat vs Python is my favorite programming language
Similarity: 0.003496191401958436 

A feline rested on the rug vs The dog played in the yard
Similarity: 0.27530901596251256 

A feline rested on the rug vs I love programming in Python
Similarity: 0.0652239488698381 

A feline rested on the rug vs Python is my favorite programming language
Similarity: 0.06269834843048634 

The dog played in the yard vs I love programming in Python
Similarity: 0.12161602094837093 

The dog played in the yard vs Python is my favorite programming language
Similarity: 0.09844986986053741 

I love programming in Python vs Python is my favorite programming language
Similarity: 0.8773668054630241 



### Example - Semantic search
#### Test semantic search


In [17]:
# Small corpus of example sentences that the semantic search below ranks
# against a query.
documents = [
    "Langchain is a framework for developing application powered by language models",
    "python is a high-level programming language",
    "Machine learning is a subset of artificial intelligence",
    "Embeddings convert text into numerical vectors",
    "The weather today is sunny and warm"
]

In [24]:
# Natural-language question to match against `documents`.
query = "what are embeddings ?"

In [25]:
def semantic_search(query, documents, embeddings_model, top_k=3):
    """Rank ``documents`` by cosine similarity to ``query``.

    Args:
        query: Text passed to ``embeddings_model.embed_query``.
        documents: Indexable collection of texts passed to
            ``embeddings_model.embed_documents``; each score is paired with
            the document at the same position.
        embeddings_model: Object providing ``embed_query`` and
            ``embed_documents``.
        top_k: Maximum number of results to return (default 3).

    Returns:
        A list of ``(similarity, document)`` tuples sorted in descending
        order and truncated to at most ``top_k`` entries.
    """
    # Embed the query and every document.
    query_vector = embeddings_model.embed_query(query)
    document_vectors = embeddings_model.embed_documents(documents)

    # Pair each document with its similarity to the query.
    scored = [
        (cosine_similarity(query_vector, vector), documents[idx])
        for idx, vector in enumerate(document_vectors)
    ]

    # Tuples compare element-wise: ordering is by similarity first, falling
    # back to the document itself only when two scores are equal.
    return sorted(scored, reverse=True)[:top_k]


In [26]:
# Rank `documents` against `query`; top_k is left at its default of 3.
results = semantic_search(query=query, documents=documents, embeddings_model=embeddings)

In [27]:
# Bare expression: display the ranked (similarity, document) tuples.
results

[(np.float64(0.5660611954539129),
  'Embeddings convert text into numerical vectors'),
 (np.float64(0.22014129943933583),
  'Machine learning is a subset of artificial intelligence'),
 (np.float64(0.17860666444083748),
  'Langchain is a framework for developing application powered by language models')]