In [41]:
import os
import faiss
import warnings
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from sklearn.metrics.pairwise import cosine_similarity
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

warnings.filterwarnings('ignore')
load_dotenv()

os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN')

os.environ['LANGSMITH_PROJECT'] = os.getenv("LANGSMITH_PROJECT")
os.environ['LANGSMITH_API_KEY'] = os.getenv("LANGSMITH_API_KEY")
os.environ['LANGSMITH_ENDPOINT'] = os.getenv("LANGSMITH_ENDPOINT")
os.environ['LANGSMITH_TRACING'] = os.getenv("LANGSMITH_TRACING")

OpenAI Embeddings

In [42]:
embeddings_openai = OpenAIEmbeddings(model="text-embedding-3-large")

text = 'My name is Bhagwat Chate'

# query_result = embeddings_openai.embed_query(text=text)
# print(len(query_result))

# query_result = embeddings_openai.embed_query(text=text, dimensions=1024)
# print(len(query_result))

In [43]:
embeddings_hf = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
query_result = embeddings_hf.embed_query(text)
print(len(query_result))

384


In [44]:
docs = ["India is great", 
        "India own 1983,2011 ODI world cup", 
        "India own T20 world cup in 2007, 2025"]
query1 = "who is the 2011 odi world cup winner"
query2 = "For Nuclear power what is the policy of Republic of India"

docs_embed = embeddings_hf.embed_documents(docs)

query_embed = embeddings_hf.embed_query(query1)

cosine_similarity([query_embed], docs_embed)

array([[0.29896229, 0.68161931, 0.43833797]])

#### Meaning:
* 0.2989 = similarity between query and doc[0] ("India is great")
* 0.6816 = similarity between query and doc[1] ("India own 1983,2011 ODI world cup")
* 0.4383 = similarity between query and doc[2] ("India own T20 world cup in 2007, 2025")

#### How to interpret these values:
* +1        → perfect match in meaning/direction
* 0.7+      → strong semantic similarity 
* 0.5–0.7   → moderate similarity
* 0.3–0.5   → weak relevance
* < 0.3     → not very related

Cosine similarity ranges from -1 to +1


In [51]:
index = faiss.IndexFlatL2(384)

| Feature               | `Flat`                | `IVF` (Inverted File Index)        | `HNSW` (Graph-based Index)          |
| --------------------- | --------------------- | ---------------------------------- | ----------------------------------- |
| Type of Search     | Exact                 | Approximate (cluster-based)        | Approximate (graph-based traversal) |
| Speed               | Slow (linear scan)    | Fast (search only in top clusters) | Very Fast (graph walk)              |


| Dataset Size              | Recommended Index                 |
| ------------------------- | --------------------------------- |
| UPTO 1L                     | `IndexFlatL2` or `IndexFlatIP`    |
| UPTO 1M                  | `IndexIVFFlat` or `IndexHNSWFlat` |
| > 1M                      | `IndexIVFPQ` or `IndexHNSWFlat`   |


In [55]:
vector_store = FAISS(embedding_function=embeddings_hf,
                     index=index,
                     docstore=InMemoryDocstore(),
                     index_to_docstore_id={})

In [53]:
docs

['India is great',
 'India own 1983,2011 ODI world cup',
 'India own T20 world cup in 2007, 2025']

In [56]:
vector_store.add_texts(['India is great',
 'India own 1983,2011 ODI world cup',
 'India own T20 world cup in 2007, 2025'])

['e868b3ea-516c-4390-b120-04981ec4ea25',
 '02a6f4f4-b765-49b0-a245-90a032391448',
 '3febb929-8e6d-4fff-b6a1-be84966b26b6']

means that your three input texts were successfully embedded and stored in the FAISS vector store, and they were each assigned a unique UUID-based document ID internally by LangChain.

In [57]:
vector_store.index_to_docstore_id

{0: 'e868b3ea-516c-4390-b120-04981ec4ea25',
 1: '02a6f4f4-b765-49b0-a245-90a032391448',
 2: '3febb929-8e6d-4fff-b6a1-be84966b26b6'}

In [58]:
results = vector_store.similarity_search("About which country we are talking here?", k=1)
results

[Document(id='e868b3ea-516c-4390-b120-04981ec4ea25', metadata={}, page_content='India is great')]

Example: 2

In [74]:
# from uuid import uuid4
from langchain_core.documents import Document

document_1 = Document(
    page_content="I had chocolate chip pancakes and scrambled eggs for breakfast this morning.",
    metadata={"source": "tweet"},
)

document_2 = Document(
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
    metadata={"source": "news"},
)

document_3 = Document(
    page_content="Building an exciting new project with LangChain - come check it out!",
    metadata={"source": "tweet"},
)

document_4 = Document(
    page_content="Robbers broke into the city bank and stole $1 million in cash.",
    metadata={"source": "news"},
)

document_5 = Document(
    page_content="Wow! That was an amazing movie. I can't wait to see it again.",
    metadata={"source": "tweet"},
)

document_6 = Document(
    page_content="Is the new iPhone worth the price? Read this review to find out.",
    metadata={"source": "website"},
)

document_7 = Document(
    page_content="The top 10 soccer players in the world right now.",
    metadata={"source": "website"},
)

document_8 = Document(
    page_content="LangGraph is the best framework for building stateful, agentic applications!",
    metadata={"source": "tweet"},
)

document_9 = Document(
    page_content="The stock market is down 500 points today due to fears of a recession.",
    metadata={"source": "news"},
)

document_10 = Document(
    page_content="I have a bad feeling I am going to get deleted :(",
    metadata={"source": "tweet"},
)

documents = [
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
    document_6,
    document_7,
    document_8,
    document_9,
    document_10,
]

In [75]:
index2 = faiss.IndexFlatIP(384)

vector_store2 = FAISS.from_documents(
    documents=documents,
    embedding=embeddings_hf
)

vector_store2.add_documents(documents=documents)

['91fd5d2e-d433-468e-b720-a8004d8412e2',
 '1bb8c2b6-ff08-40a1-9842-1a27854091af',
 '3bfa8a9d-57ee-4d4a-aff2-7624dcffff11',
 '3b6eea5f-2f75-4b40-936f-0a7de0d27aa6',
 'a8a75944-88f5-41b3-9fa0-ae65a39f9d3b',
 '3b2f6980-b024-4a77-9990-3e61e2f004c4',
 '0e2c119d-0782-4bf8-9c5c-50ecd02ebc8c',
 '2c27f9e3-57f2-4ecf-a3cc-feab411245a4',
 '9f6183b5-f7a2-4715-9315-973e6e5712cb',
 '54e7081f-4271-4d64-a3ef-53e969ab90d9']

In [78]:
vector_store2.similarity_search(
    "LangChain provides abstractions to make working with LLMs easy",
    k=3 #hyperparameter
)

[Document(id='5712654a-e14b-48dd-8d63-10587f2cae54', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='3bfa8a9d-57ee-4d4a-aff2-7624dcffff11', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='0fea67ef-53e8-4aa3-a68f-b2aba29f8a19', metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic applications!')]

In [82]:
result = vector_store2.similarity_search(
    "LangChain provides abstractions to make working with LLMs easy",
    k=3, #hyperparameter
    filter={'source':{"$eq":"tweet"}}
)

In [83]:
result[0].page_content

'Building an exciting new project with LangChain - come check it out!'