### Embeddings
![image.png](attachment:image.png)

In [2]:
#### Cosine similarity

import numpy as np

def cosine_similarity(vec1, vec2):

    dot_product = np.dot(vec1, vec2)

    norm_1 = np.linalg.norm(vec1)
    norm_2 = np.linalg.norm(vec2)

    return dot_product / (norm_1 * norm_2)


vec1 = [0.8,0.6,0.9,0.5]
vec2 = [0.7,0.6,0.8,0.55]

result = cosine_similarity(vec1, vec2)

print(result)

0.9965648979951588


In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
import warnings
warnings.filterwarnings('ignore')

embeddings_hf = HuggingFaceEmbeddings(model="sentence-transformers/all-MiniLM-L6-v2")

In [11]:
text = "Hello, I am learning embeddings"

embedding = embeddings_hf.embed_query(text)

print(f"input text: {text}")
print(f"embedding length: {len(embedding)}")
print(f"embeddings: {embedding}")

input text: Hello, I am learning embeddings
embedding length: 384
embeddings: [-0.01584898866713047, -0.11090175062417984, 0.03238550201058388, -0.015212000347673893, -0.003252527443692088, 0.05293719097971916, 0.029464062303304672, 0.021722912788391113, 0.029988732188940048, -0.027794089168310165, 0.028750218451023102, 0.044066172093153, 0.032790906727313995, 0.020626069977879524, -0.0603734627366066, -0.000455527740996331, 0.06386067718267441, 0.053981125354766846, -0.0771317109465599, -0.015008224174380302, -0.056320782750844955, -0.030005132779479027, 0.04447673261165619, -0.07640555500984192, 0.03385816141963005, -0.020521001890301704, -0.007360861636698246, 0.07030625641345978, 0.11591470241546631, -0.07865741103887558, 0.0540429912507534, -0.05686982348561287, -0.019562896341085434, 0.0679268091917038, -0.0449586883187294, 0.1063663586974144, 0.023690320551395416, -0.011288436129689217, -0.05472814291715622, 0.00464883865788579, 0.02011788822710514, 0.03533409535884857, -0.00515

In [None]:
sentences = [
    "Knowledge grows when shared with curious minds",
    "Discipline today creates freedom for tomorrow",
    "Data without context leads to confusion",
    "Innovation thrives where failure feels safe",
    "Small consistent actions build massive results",
    "Clean data is the foundation of intelligence"
]

embedding_sent = embeddings_hf.embed_documents(sentences)

print(f"input text: {sentences}")
print(f"embedding length: {len(embedding)}")
print(f"embeddings: {embedding_sent}")

input text: ['Knowledge grows when shared with curious minds', 'Discipline today creates freedom for tomorrow', 'Data without context leads to confusion', 'Innovation thrives where failure feels safe', 'Small consistent actions build massive results', 'Clean data is the foundation of intelligence']
embedding length: 384
embeddings: [[0.06215032935142517, 0.007896987721323967, -0.013816650956869125, 0.06508704274892807, 0.032566286623477936, -0.011093550361692905, 0.08774979412555695, -0.026661375537514687, 0.06071711331605911, 0.04730943590402603, 0.018508726730942726, 0.05764250457286835, 0.0018013354856520891, 0.03365438058972359, -0.029482556506991386, 0.037788212299346924, -0.03983409330248833, -0.06168409436941147, -0.11482202261686325, -0.06947097182273865, -0.02472805790603161, -0.00044400180922821164, 0.013626075349748135, -0.01994127780199051, -0.020003791898489, -0.01568247377872467, 0.0333840548992157, -0.06986796855926514, 0.07318110018968582, -0.035917460918426514, 0.11477

In [12]:
models = {
    "all-MiniLM-L6-v2": {
        "size": 384,
        "description": "Fast and efficient, good quality",
        "use_case": "General purpose, real-time applications"
    },
    "all-mpnet-base-v2": {
        "size": 768,
        "description": "Best quality, slower than MiniLM",
        "use_case": "When quality matters more than speed"
    },
    "all-MiniLM-L12-v2": {
        "size": 384,
        "description": "Slightly better than L6, bit slower",
        "use_case": "Good balance of speed and quality"
    },
    "multi-qa-MiniLM-L6-cos-v1": {
        "size": 384,
        "description": "Optimized for question-answering",
        "use_case": "Q&A systems, semantic search"
    },
    "paraphrase-multilingual-MiniLM-L12-v2": {
        "size": 384,
        "description": "Supports 50+ languages",
        "use_case": "Multilingual applications"
    }
}

print("📊 Popular Open Source Embedding Models:\n")

for model_name, info in models.items():
    print(f"Model: sentence-transformers/{model_name}")
    print(f"🔑 Embedding size: {info['size']} dimensions")
    print(f"📝 Description: {info['description']}")
    print(f"🎯 Use case: {info['use_case']}\n")


📊 Popular Open Source Embedding Models:

Model: sentence-transformers/all-MiniLM-L6-v2
🔑 Embedding size: 384 dimensions
📝 Description: Fast and efficient, good quality
🎯 Use case: General purpose, real-time applications

Model: sentence-transformers/all-mpnet-base-v2
🔑 Embedding size: 768 dimensions
📝 Description: Best quality, slower than MiniLM
🎯 Use case: When quality matters more than speed

Model: sentence-transformers/all-MiniLM-L12-v2
🔑 Embedding size: 384 dimensions
📝 Description: Slightly better than L6, bit slower
🎯 Use case: Good balance of speed and quality

Model: sentence-transformers/multi-qa-MiniLM-L6-cos-v1
🔑 Embedding size: 384 dimensions
📝 Description: Optimized for question-answering
🎯 Use case: Q&A systems, semantic search

Model: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
🔑 Embedding size: 384 dimensions
📝 Description: Supports 50+ languages
🎯 Use case: Multilingual applications



### OpenAI Embeddings

In [13]:
import os
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
load_dotenv
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')


TypeError: str expected, not NoneType

In [14]:
os.getenv('OPENAI_API_KEY')