In [7]:
import os
import warnings
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from sklearn.metrics.pairwise import cosine_similarity

# Silence every warning for the rest of the notebook session.
warnings.filterwarnings(action='ignore')

# Read key/value pairs from a .env file into the process environment.
load_dotenv()

# load_dotenv() has already copied the .env entries into os.environ, so the
# values do not need to be re-assigned. The previous
# `os.environ[name] = os.getenv(name)` form raised
# "TypeError: str expected, not NoneType" as soon as one variable was unset,
# which stopped the notebook even for cells that never use that variable.
ENV_VAR_NAMES = (
    'OPENAI_API_KEY',
    'HF_TOKEN',
    'LANGSMITH_PROJECT',
    'LANGSMITH_API_KEY',
    'LANGSMITH_ENDPOINT',
    'LANGSMITH_TRACING',
)

# Names that are still unset after loading .env.
missing_env_vars = [name for name in ENV_VAR_NAMES if os.getenv(name) is None]

if missing_env_vars:
    # print() rather than warnings.warn(): this cell silences all warnings.
    print(
        "Environment variables not set (check your .env file): "
        + ", ".join(missing_env_vars)
    )

OpenAI Embeddings

In [4]:
# OpenAI embedding client configured for the "text-embedding-3-large" model.
embeddings_openai = OpenAIEmbeddings(
    model="text-embedding-3-large",
)

# Sample sentence; the HuggingFace cell below embeds this same string.
text = 'My name is Bhagwat Chate'

# The OpenAI calls below are disabled and kept for reference only
# (presumably to avoid API usage while experimenting — confirm before deleting).

# Embed the sample sentence and print the length of the returned vector:
# query_result = embeddings_openai.embed_query(text=text)
# print(len(query_result))

# Same call, requesting a 1024-dimensional vector.
# NOTE(review): confirm embed_query accepts `dimensions` in the installed
# langchain_openai version; `dimensions` is documented as a constructor
# argument of OpenAIEmbeddings.
# query_result = embeddings_openai.embed_query(text=text, dimensions=1024)
# print(len(query_result))

In [5]:
# HuggingFace embedding model "all-MiniLM-L6-v2".
embeddings_hf = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
)

# Embed the sample sentence (`text` is defined in the OpenAI cell above) and
# print the vector length — 384 in the recorded output.
query_result = embeddings_hf.embed_query(text)
print(len(query_result))

384


In [22]:
# Small corpus to rank against a query.
docs = [
    "India is great",
    "India own 1983,2011 ODI world cup",
    "India own T20 world cup in 2007, 2025",
]

# Two example queries; only query1 is embedded in this cell.
query1 = "who is the 2011 odi world cup winner"
query2 = "For Nuclear power what is the policy of Republic of India"

# One embedding vector per document, plus a single vector for the query.
docs_embed = embeddings_hf.embed_documents(docs)
query_embed = embeddings_hf.embed_query(query1)

# The query vector is wrapped in a list so it is treated as a single row.
# The displayed result holds one score per document, in `docs` order.
cosine_similarity(X=[query_embed], Y=docs_embed)

array([[0.29896229, 0.68161931, 0.43833797]])

#### Meaning:
* 0.2989 = similarity between the query and docs[0] ("India is great")
* 0.6816 = similarity between the query and docs[1] ("India own 1983,2011 ODI world cup")
* 0.4383 = similarity between the query and docs[2] ("India own T20 world cup in 2007, 2025")

#### How to interpret these values (rough rule of thumb; exact thresholds vary by embedding model):
* +1        → perfect match in meaning/direction
* 0.7+      → strong semantic similarity 
* 0.5–0.7   → moderate similarity
* 0.3–0.5   → weak relevance
* < 0.3     → not very related

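Cosine similarity ranges from -1 (vectors point in opposite directions) through 0 (orthogonal, i.e. unrelated) to +1 (vectors point in the same direction).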
