## What Are Embeddings?
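Think of embeddings as a way to translate words into the language computers understand: numbers. Each word or sentence becomes a list of numbers (a vector), and texts with similar meanings get vectors that sit close together.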

In [None]:
import numpy as np
import matplotlib.pyplot as plt


In [None]:
# Toy 2-D embedding table with hand-picked coordinates; real embedding
# models produce vectors with hundreds of dimensions.
word_embeddings = {
    # cats
    "cat": [0.8, 0.6], "kitten": [0.75, 0.65],
    # dogs
    "dog": [0.7, 0.3], "puppy": [0.65, 0.35],
    # vehicles
    "car": [-0.5, 0.2], "truck": [-0.45, 0.15],
}

In [None]:
# Draw every toy embedding as a labelled point on a 2-D plane.
fig, ax = plt.subplots(figsize=(8, 6))

for word, coords in word_embeddings.items():
    point = (coords[0], coords[1])
    # One scatter call per word so each point gets its own colour.
    ax.scatter(*point, s=100)
    # Nudge the label a few points up and right so it does not cover the marker.
    ax.annotate(word, point, xytext=(5, 5), textcoords='offset points')

ax.set(
    xlabel='Dimension 1',
    ylabel='Dimension 2',
    title='Simplified Word Embeddings in 2D Space',
)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Measuring Similarity Between Vectors (Embeddings)

In [None]:
# Cosine similarity between two vectors (embeddings)
def cosine_similarity(vec1, vec2):
    """
    Return the cosine of the angle between two vectors.

    - Result close to 1: Very similar
    - Result close to 0: Not related
    - Result close to -1: Opposite meanings

    Args:
        vec1: First vector (a sequence of numbers or a NumPy array).
        vec2: Second vector, with the same length as ``vec1``.

    Returns:
        The cosine similarity, nominally in the range [-1, 1]. When either
        vector has zero norm (all zeros, or empty) the angle is undefined,
        so 0.0 is returned instead of NaN.

    Raises:
        ValueError: If the two vectors have incompatible shapes
            (raised by ``np.dot``).
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # Guard the division: a zero-norm vector would otherwise give NaN plus a
    # RuntimeWarning, and NaN scores break any later sorting or comparison.
    if denominator == 0:
        return np.float64(0.0)
    return np.dot(vec1, vec2) / denominator



In [None]:
# Three hand-written 3-D vectors: two related animals and one vehicle.
cat_vector = [0.8, 0.6, 0.3]
kitten_vector = [0.75, 0.65, 0.35]
car_vector = [-0.5, 0.2, 0.1]

# Score both pairs first, then report them side by side.
cat_kitten_similarity = cosine_similarity(cat_vector, kitten_vector)
cat_car_similarity = cosine_similarity(cat_vector, car_vector)

print("cat vs kitten similarity=", cat_kitten_similarity)
print("cat vs car similarity=", cat_car_similarity)

### Creating Your First Embeddings with HuggingFaceEmbeddings

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

# Create the embedding model wrapper for the all-MiniLM-L6-v2
# sentence-transformers model (no API key needed).
hf_embeddings=HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
# Bare last expression so the notebook displays the configured object.
hf_embeddings


In [None]:
# Embed a single sentence with the Hugging Face embedding model.
text = "Hello, I am learning about embeddings!"

embedding = hf_embeddings.embed_query(text)
print(f"Text: {text}")
print(f"Embedding length : {len(embedding)}")
# Show only a short preview: printing every dimension floods the cell output
# without adding information.
print(f"First 5 values: {embedding[:5]}")


In [None]:
# Embed a list of sentences in a single call.
# NOTE(review): the first two sentences are identical — presumably on purpose,
# so that their embeddings can be compared; confirm.
sentences = [
    "The cat sat on the mat",
    "The cat sat on the mat",
    "The dog played in the yard",
    "I love programming in Python",
    "Python is my favorite programming language"
]

embedded_sentences = hf_embeddings.embed_documents(sentences)

# Preview the first two embeddings instead of dumping the full vectors.
print(embedded_sentences[0][:5])
print(embedded_sentences[1][:5])

## Similarity between sentence pairs

In [None]:
# Visit every unordered pair of sentences exactly once (i < j).
pair_indices = [
    (i, j)
    for i in range(len(sentences))
    for j in range(i + 1, len(sentences))
]

for i, j in pair_indices:
    similarity = cosine_similarity(embedded_sentences[i], embedded_sentences[j])

    print(f"'{sentences[i]}' vs '{sentences[j]}'")
    print(f"Similarity: {similarity:.3f}\n")

## Vector search 

In [None]:
# Vector search: embed the query and every document, then retrieve the most
# similar documents using a similarity measure such as cosine similarity.
# Semantic search: match on meaning rather than on exact keywords.
query = "What is Langchain?"

# Small corpus to search over.
documents = [
    "LangChain is a framework for developing applications powered by language models",
    "Python is a high-level programming language",
    "Machine learning is a subset of artificial intelligence",
    "Embeddings convert text into numerical vectors",
    "The weather today is sunny and warm",
]

In [60]:
def vector_search(query, documents, embeddings_models, top_k=3):
    """Simple semantic search: rank documents by similarity to a query.

    Args:
        query: Text to search for.
        documents: Iterable of documents to rank.
        embeddings_models: Object providing ``embed_query(text)`` and
            ``embed_documents(texts)``.
        top_k: Maximum number of results to return. ``None`` returns every
            document.

    Returns:
        A list of ``(similarity, document)`` tuples, highest similarity
        first, with at most ``top_k`` entries. The list is empty when there
        are no documents or ``top_k`` is zero or negative.
    """
    documents = list(documents)
    # Nothing to rank: skip the embedding calls entirely. A negative top_k
    # would otherwise slice from the end and silently drop results.
    if not documents or (top_k is not None and top_k <= 0):
        return []

    ## embed query and documents
    query_embedding = embeddings_models.embed_query(query)
    doc_embeddings = embeddings_models.embed_documents(documents)

    ## Calculate the similarity score of every document against the query
    similarities = [
        (cosine_similarity(query_embedding, doc_emb), doc)
        for doc_emb, doc in zip(doc_embeddings, documents)
    ]

    ## Sort on the score only. Sorting whole tuples compares the documents
    ## themselves on tied scores, which fails for documents that cannot be
    ## ordered; with a key, tied documents keep their input order.
    similarities.sort(key=lambda scored: scored[0], reverse=True)
    return similarities[:top_k]

In [61]:
# Rank the documents against the current query (default top_k).
results = vector_search(query, documents, embeddings_models=hf_embeddings)
results

[(np.float64(0.560072625106014),
  'Embeddings convert text into numerical vectors'),
 (np.float64(0.2350027453874689),
  'Machine learning is a subset of artificial intelligence'),
 (np.float64(0.17217240684423185),
  'LangChain is a framework for developing applications powered by language models')]

In [None]:
# Report each hit as "score | document", with the score rounded to 3 decimals.
print(f"\n🔎 Semantic Search Results for: '{query}'")
result_lines = [f"Score: {score:.3f} | {doc}" for score, doc in results]
for line in result_lines:
    print(line)

In [62]:
# Second search: same documents, different question.
# NOTE(review): this rebinds `query` and `results`, so re-running earlier
# cells afterwards will show this query's values — confirm that is intended.
query = "What is Embeddings?"
results = vector_search(query, documents, embeddings_models=hf_embeddings)
results

[(np.float64(0.560072625106014),
  'Embeddings convert text into numerical vectors'),
 (np.float64(0.2350027453874689),
  'Machine learning is a subset of artificial intelligence'),
 (np.float64(0.17217240684423185),
  'LangChain is a framework for developing applications powered by language models')]

### Choosing a HuggingFaceEmbeddings Model: Use Cases

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
import time

# Comparison table of popular sentence-transformers models.
# Each row: (model name, embedding size, description, use case).
models = {
    name: {"size": size, "description": description, "use_case": use_case}
    for name, size, description, use_case in [
        (
            "all-MiniLM-L6-v2",
            384,
            "Fast and efficient, good quality",
            "General purpose, real-time applications",
        ),
        (
            "all-mpnet-base-v2",
            768,
            "Best quality, slower than MiniLM",
            "When quality matters more than speed",
        ),
        (
            "all-MiniLM-L12-v2",
            384,
            "Slightly better than L6, bit slower",
            "Good balance of speed and quality",
        ),
        (
            "multi-qa-MiniLM-L6-cos-v1",
            384,
            "Optimized for question-answering",
            "Q&A systems, semantic search",
        ),
        (
            "paraphrase-multilingual-MiniLM-L12-v2",
            384,
            "Supports 50+ languages",
            "Multilingual applications",
        ),
    ]
}

print("📊 Popular Open Source Embedding Models:\n")
for model_name, info in models.items():
    # One print per model; the trailing "\n" on the last line leaves a blank
    # line between entries.
    summary_lines = [
        f"Model: sentence-transformers/{model_name}",
        f"  📏 Embedding size: {info['size']} dimensions",
        f"  📝 Description: {info['description']}",
        f"  🎯 Use case: {info['use_case']}\n",
    ]
    print("\n".join(summary_lines))
