### Load HuggingFace API Key

In [1]:
import os
from dotenv import load_dotenv

os.environ["HUGGINGFACEHUB_API_TOKEN"]=os.getenv("HF_TOKEN")

### Specify a location where HuggingFace models will be downloaded

In [2]:

os.environ["HF_HOME"] = "/home/abhishek/ad-workspace/huggingface"

### Load HuggingFace all-MiniLM-L6-v2 embedding model

In [3]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model="sentence-transformers/all-MiniLM-L6-v2")

### Documents

In [4]:
documents = [
    "Kolkata is the capital of West Bengal",
    "Dispur is the capital of Assam",
    "Mumbai is the capital of Maharashtra",
    "Chennai is the capital of Tamil Nadu",
    "Kohima is the capital of Nagaland",
    "Bengaluru is known as the Silicon Valley of India",
    "Hyderabad is famous for its IT industry and biryani",
    "The Ganges is one of the longest rivers in India",
    "The Himalayas are the highest mountain range in the world",
    "New Delhi is the capital of India"
]

In [5]:
from langchain_core.documents import Document

docs = [Document(page_content=text) for text in documents]

In [6]:
docs

[Document(metadata={}, page_content='Kolkata is the capital of West Bengal'),
 Document(metadata={}, page_content='Dispur is the capital of Assam'),
 Document(metadata={}, page_content='Mumbai is the capital of Maharashtra'),
 Document(metadata={}, page_content='Chennai is the capital of Tamil Nadu'),
 Document(metadata={}, page_content='Kohima is the capital of Nagaland'),
 Document(metadata={}, page_content='Bengaluru is known as the Silicon Valley of India'),
 Document(metadata={}, page_content='Hyderabad is famous for its IT industry and biryani'),
 Document(metadata={}, page_content='The Ganges is one of the longest rivers in India'),
 Document(metadata={}, page_content='The Himalayas are the highest mountain range in the world'),
 Document(metadata={}, page_content='New Delhi is the capital of India')]

In [7]:
from uuid import uuid4

uuids = [str(uuid4()) for _ in range(len(docs))]

In [8]:
uuids

['c8cc3ddd-d2ca-472d-a704-961c1a6ea6ca',
 '82204918-b4c5-421e-aa88-326c5f2052ab',
 '84e07636-e25d-49f1-89de-58f98fc7acec',
 '334fc348-b3f2-47f1-a1de-3e3638f45ad6',
 '54b4a646-5676-4963-958b-76264439013b',
 '616064eb-3e47-44a0-9cca-db50502a8623',
 '6ba332c4-7678-4c39-91eb-1ee44357725d',
 '13b131d6-3874-4cf2-8481-e89e161952ea',
 '963585be-0d56-4b60-b4ad-c45757eb73ba',
 '6a35fbb3-f361-450a-9f2a-312b18d5a9b5']

### Get Text embeddings of documents

In [9]:
vector_embeddings=embeddings.embed_documents(documents)

In [10]:
print("No. of embedding vectors : ", len(vector_embeddings))
print("Dimension of each embedding vector : ", len(vector_embeddings[0]))

No. of embedding vectors :  10
Dimension of each embedding vector :  384


## Pinecone vector store

Note: Create a new [Pinecone](https://docs.pinecone.io/docs/overview) account, or sign into your existing one, and create an API key

### Credentials

In [11]:
from pinecone import Pinecone

pinecone_api_key = os.environ.get("PINECONE_API_KEY")

pc = Pinecone(api_key=pinecone_api_key)

### Initialization

In [12]:
from pinecone import ServerlessSpec

index_name = "langchain-test-index"  # change if desired

if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

index = pc.Index(index_name)

In [13]:
from langchain_pinecone import PineconeVectorStore

vector_store = PineconeVectorStore(index=index, embedding=embeddings)

### Add documents

In [14]:
vector_store.add_documents(documents=docs, ids=uuids)

['c8cc3ddd-d2ca-472d-a704-961c1a6ea6ca',
 '82204918-b4c5-421e-aa88-326c5f2052ab',
 '84e07636-e25d-49f1-89de-58f98fc7acec',
 '334fc348-b3f2-47f1-a1de-3e3638f45ad6',
 '54b4a646-5676-4963-958b-76264439013b',
 '616064eb-3e47-44a0-9cca-db50502a8623',
 '6ba332c4-7678-4c39-91eb-1ee44357725d',
 '13b131d6-3874-4cf2-8481-e89e161952ea',
 '963585be-0d56-4b60-b4ad-c45757eb73ba',
 '6a35fbb3-f361-450a-9f2a-312b18d5a9b5']

### Query

In [15]:
query = "What is the capital of Tamil Nadu?"

### Similarity Search

In [16]:
similar_docs = vector_store.similarity_search(query,k=3)

In [17]:
similar_docs

[Document(id='334fc348-b3f2-47f1-a1de-3e3638f45ad6', metadata={}, page_content='Chennai is the capital of Tamil Nadu'),
 Document(id='6a35fbb3-f361-450a-9f2a-312b18d5a9b5', metadata={}, page_content='New Delhi is the capital of India'),
 Document(id='c8cc3ddd-d2ca-472d-a704-961c1a6ea6ca', metadata={}, page_content='Kolkata is the capital of West Bengal')]

In [18]:
similar_docs[0].page_content

'Chennai is the capital of Tamil Nadu'