### Load HuggingFace API Key

In [1]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process environment.
# Without this call the `load_dotenv` import above is unused and HF_TOKEN is only
# visible when it was already exported in the shell that launched the kernel.
load_dotenv()

hf_token = os.getenv("HF_TOKEN")
if hf_token is None:
    # os.environ only accepts strings: assigning None raises an opaque TypeError,
    # so fail early with an actionable message instead.
    raise RuntimeError(
        "HF_TOKEN is not set. Export it in your shell or add it to a .env file "
        "before running this notebook."
    )

# Expose the token under the variable name this notebook configures for Hugging Face access.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_token

### Specify a location where HuggingFace models will be downloaded

In [2]:

# Directory where downloaded Hugging Face models are stored.
# NOTE(review): this absolute path is machine-specific — adjust it when running elsewhere.
hf_cache_dir = "/home/abhishek/ad-workspace/huggingface"
os.environ["HF_HOME"] = hf_cache_dir

### Load HuggingFace all-MiniLM-L6-v2 embedding model

In [3]:
from langchain_huggingface import HuggingFaceEmbeddings

# Model identifier of the embedding model used for both the documents and the queries.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model=embedding_model_name)

### Documents

In [4]:
# Toy corpus for the retrieval demo, grouped by theme for readability.
state_capital_facts = [
    "Kolkata is the capital of West Bengal",
    "Dispur is the capital of Assam",
    "Mumbai is the capital of Maharashtra",
    "Chennai is the capital of Tamil Nadu",
    "Kohima is the capital of Nagaland",
]
general_facts = [
    "Bengaluru is known as the Silicon Valley of India",
    "Hyderabad is famous for its IT industry and biryani",
    "The Ganges is one of the longest rivers in India",
    "The Himalayas are the highest mountain range in the world",
]

# Keep this exact order: later cells pair each sentence with an id by position.
documents = [
    *state_capital_facts,
    *general_facts,
    "New Delhi is the capital of India",
]

In [5]:
from langchain_core.documents import Document

# Wrap every raw sentence in a LangChain Document; no metadata is attached.
docs = [
    Document(page_content=sentence)
    for sentence in documents
]

In [6]:
# Display the wrapped documents to check the conversion.
docs

[Document(metadata={}, page_content='Kolkata is the capital of West Bengal'),
 Document(metadata={}, page_content='Dispur is the capital of Assam'),
 Document(metadata={}, page_content='Mumbai is the capital of Maharashtra'),
 Document(metadata={}, page_content='Chennai is the capital of Tamil Nadu'),
 Document(metadata={}, page_content='Kohima is the capital of Nagaland'),
 Document(metadata={}, page_content='Bengaluru is known as the Silicon Valley of India'),
 Document(metadata={}, page_content='Hyderabad is famous for its IT industry and biryani'),
 Document(metadata={}, page_content='The Ganges is one of the longest rivers in India'),
 Document(metadata={}, page_content='The Himalayas are the highest mountain range in the world'),
 Document(metadata={}, page_content='New Delhi is the capital of India')]

In [7]:
from uuid import uuid4

# One random UUID string per document; these become the document ids in the vector store.
uuids = [str(uuid4()) for _ in docs]

In [8]:
# Display the generated ids (one per document).
uuids

['eba4eed4-02a7-49a5-a5d5-0637bdd2c737',
 'acd4b0ac-cc5b-4454-afa0-f5ec3b49fe0d',
 '2bf75e48-54da-4a0a-8ec5-ec12c94c460e',
 'a6ab30c2-e902-4445-bbd6-96bdaedfa411',
 'f7ccc3e8-bd27-474e-8e25-fc28ba4e701e',
 '9f4bfc1a-9a61-49c6-a5c2-286165b9b7f8',
 '8b66ccf0-3542-48af-93f7-0811e971fb62',
 '975c9d04-0adc-49e5-8ce6-1a79f11dda0d',
 '68f0598a-92fe-4ed5-8866-f1efda225ada',
 'c5f39ceb-5b27-4cb1-8984-85b420ef014a']

### Get text embeddings of the documents

In [9]:
# Embed the raw sentences directly; the result holds one embedding vector per sentence.
vector_embeddings = embeddings.embed_documents(documents)

In [10]:
# Sanity check: report how many vectors were produced and how long each one is.
vector_count = len(vector_embeddings)
print("No. of embedding vectors : ", vector_count)

vector_dim = len(vector_embeddings[0])
print("Dimension of each embedding vector : ", vector_dim)

No. of embedding vectors :  10
Dimension of each embedding vector :  384


## FAISS vector store

### Initialization

In [11]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

# Embed a throwaway query only to discover the model's output dimension,
# which the flat L2 index requires at construction time.
embedding_dim = len(embeddings.embed_query("hello world"))
index = faiss.IndexFlatL2(embedding_dim)

# Start from an empty store: a fresh in-memory docstore and no index-to-id mappings yet.
vector_store = FAISS(
    index=index,
    embedding_function=embeddings,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

### Add documents

In [12]:
# Add the documents to the store under the pre-generated ids; the call's return
# value (the ids) is what this cell displays.
# NOTE(review): presumably re-running this cell with the same ids fails or duplicates entries — confirm.
vector_store.add_documents(documents=docs, ids=uuids)

['eba4eed4-02a7-49a5-a5d5-0637bdd2c737',
 'acd4b0ac-cc5b-4454-afa0-f5ec3b49fe0d',
 '2bf75e48-54da-4a0a-8ec5-ec12c94c460e',
 'a6ab30c2-e902-4445-bbd6-96bdaedfa411',
 'f7ccc3e8-bd27-474e-8e25-fc28ba4e701e',
 '9f4bfc1a-9a61-49c6-a5c2-286165b9b7f8',
 '8b66ccf0-3542-48af-93f7-0811e971fb62',
 '975c9d04-0adc-49e5-8ce6-1a79f11dda0d',
 '68f0598a-92fe-4ed5-8866-f1efda225ada',
 'c5f39ceb-5b27-4cb1-8984-85b420ef014a']

### Query

In [13]:
# Natural-language question used for the similarity search in the following cells.
query = "What is the capital of India?"

### Similarity Search

In [14]:
# Retrieve the stored documents most similar to the query, limited to the top three.
top_k = 3
similar_docs = vector_store.similarity_search(query, k=top_k)

In [15]:
# Display the retrieved documents in the order the search returned them.
similar_docs

[Document(id='c5f39ceb-5b27-4cb1-8984-85b420ef014a', metadata={}, page_content='New Delhi is the capital of India'),
 Document(id='2bf75e48-54da-4a0a-8ec5-ec12c94c460e', metadata={}, page_content='Mumbai is the capital of Maharashtra'),
 Document(id='a6ab30c2-e902-4445-bbd6-96bdaedfa411', metadata={}, page_content='Chennai is the capital of Tamil Nadu')]

In [16]:
# Text of the first (top-ranked) result.
similar_docs[0].page_content

'New Delhi is the capital of India'