### Load HuggingFace API Key

In [1]:
import os
from dotenv import load_dotenv

os.environ["HUGGINGFACEHUB_API_TOKEN"]=os.getenv("HF_TOKEN")

### Specify a location where HuggingFace models will be downloaded

In [2]:

os.environ["HF_HOME"] = "/home/abhishek/ad-workspace/huggingface"

### Load HuggingFace all-MiniLM-L6-v2 embedding model

In [3]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model="sentence-transformers/all-MiniLM-L6-v2")

### Documents

In [4]:
documents = [
    "Kolkata is the capital of West Bengal",
    "Dispur is the capital of Assam",
    "Mumbai is the capital of Maharashtra",
    "Chennai is the capital of Tamil Nadu",
    "Kohima is the capital of Nagaland",
    "Bengaluru is known as the Silicon Valley of India",
    "Hyderabad is famous for its IT industry and biryani",
    "The Ganges is one of the longest rivers in India",
    "The Himalayas are the highest mountain range in the world",
    "New Delhi is the capital of India"
]

In [14]:
from langchain_core.documents import Document

docs = [Document(page_content=text) for text in documents]

In [15]:
docs

[Document(metadata={}, page_content='Kolkata is the capital of West Bengal'),
 Document(metadata={}, page_content='Dispur is the capital of Assam'),
 Document(metadata={}, page_content='Mumbai is the capital of Maharashtra'),
 Document(metadata={}, page_content='Chennai is the capital of Tamil Nadu'),
 Document(metadata={}, page_content='Kohima is the capital of Nagaland'),
 Document(metadata={}, page_content='Bengaluru is known as the Silicon Valley of India'),
 Document(metadata={}, page_content='Hyderabad is famous for its IT industry and biryani'),
 Document(metadata={}, page_content='The Ganges is one of the longest rivers in India'),
 Document(metadata={}, page_content='The Himalayas are the highest mountain range in the world'),
 Document(metadata={}, page_content='New Delhi is the capital of India')]

In [16]:
from uuid import uuid4

uuids = [str(uuid4()) for _ in range(len(docs))]

In [17]:
uuids

['72f21f61-4c65-4d63-af94-43d87fe812e7',
 '4ac7fb3c-c927-44ff-bab8-567e5afdec83',
 'caa06e32-e9e4-4bac-a303-58d685af9802',
 'a02e63cd-7722-4357-89dd-740a51f3d355',
 'bbaefe37-a452-43a8-86dc-c07024795c6a',
 '67dc6ae8-54c3-428a-88b4-5388714b734a',
 '3955e148-7e11-4f77-ae2f-0630d55d832d',
 'a336bbdd-ed1e-4084-b2ea-2d7f7a03a289',
 'da4f5a96-b9da-44a9-8909-6aa4ed8344a2',
 '3883f0f3-05ea-4083-806d-01e9e9129454']

### Get Text embeddings of documents

In [19]:
vector_embeddings=embeddings.embed_documents(documents)

In [6]:
print("No. of embedding vectors : ", len(vector_embeddings))
print("Dimension of each embedding vector : ", len(vector_embeddings[0]))

No. of embedding vectors :  10
Dimension of each embedding vector :  384


## Chroma vector store

### Initialization

In [20]:
from langchain_chroma import Chroma

vector_store = Chroma(
    collection_name="india_facts_chroma",
    embedding_function=embeddings,
    persist_directory="/home/abhishek/ad-workspace/chroma_db",
)


### Add documents

In [21]:
vector_store.add_documents(documents=docs, ids=uuids)

['72f21f61-4c65-4d63-af94-43d87fe812e7',
 '4ac7fb3c-c927-44ff-bab8-567e5afdec83',
 'caa06e32-e9e4-4bac-a303-58d685af9802',
 'a02e63cd-7722-4357-89dd-740a51f3d355',
 'bbaefe37-a452-43a8-86dc-c07024795c6a',
 '67dc6ae8-54c3-428a-88b4-5388714b734a',
 '3955e148-7e11-4f77-ae2f-0630d55d832d',
 'a336bbdd-ed1e-4084-b2ea-2d7f7a03a289',
 'da4f5a96-b9da-44a9-8909-6aa4ed8344a2',
 '3883f0f3-05ea-4083-806d-01e9e9129454']

### Query

In [22]:
query = "What is the capital of Assam?"

### Similarity Search

In [23]:
similar_docs = vector_store.similarity_search(query,k=3)

In [24]:
similar_docs

[Document(id='4ac7fb3c-c927-44ff-bab8-567e5afdec83', metadata={}, page_content='Dispur is the capital of Assam'),
 Document(id='3883f0f3-05ea-4083-806d-01e9e9129454', metadata={}, page_content='New Delhi is the capital of India'),
 Document(id='bbaefe37-a452-43a8-86dc-c07024795c6a', metadata={}, page_content='Kohima is the capital of Nagaland')]

In [25]:
similar_docs[0].page_content

'Dispur is the capital of Assam'