# Vector Databases

- Vector Store
- ChromaDB
- Pinecone

Install the required packages by running the command below in Anaconda Prompt (on Windows) or a terminal (on Linux or macOS):

pip install chromadb llama-index-vector-stores-chroma pinecone-client llama-index-vector-stores-pinecone

## Setup

In [1]:
import os

In [2]:
from dotenv import load_dotenv, find_dotenv

# Locate the .env file by searching upward from the current working directory
# rather than hard-coding a machine-specific absolute path, so the notebook
# runs unchanged on other machines.
load_dotenv(find_dotenv(usecwd=True))

True

In [3]:
# Indexing with [] (not .get) raises KeyError right here if a credential is missing.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]

## Download Data


In [2]:
# -p makes the cell safe to re-run: no error if data/ already exists.
!mkdir -p data
# Fetch the paper from arXiv and save it as data/transformers.pdf.
!wget "https://arxiv.org/pdf/1706.03762" -O 'data/transformers.pdf'

--2024-06-11 12:31:22--  https://arxiv.org/pdf/1706.03762
Resolving arxiv.org (arxiv.org)... 151.101.3.42, 151.101.67.42, 151.101.131.42, ...
Connecting to arxiv.org (arxiv.org)|151.101.3.42|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2215244 (2.1M) [application/pdf]
Saving to: ‘data/transformers.pdf’


2024-06-11 12:31:23 (4.07 MB/s) - ‘data/transformers.pdf’ saved [2215244/2215244]



## Load Data

In [4]:
from pathlib import Path
from llama_index.readers.file import PDFReader

In [5]:
# Parse the downloaded PDF into a list of documents with LlamaIndex's PDFReader.
loader = PDFReader()
documents = loader.load_data(file=Path("data") / "transformers.pdf")

In [6]:
# Count the document objects the reader produced.
len(documents)

15

# VectorStoreIndex

In [7]:
from llama_index.core import VectorStoreIndex

In [8]:
# Build a VectorStoreIndex directly from the loaded documents.
# No storage_context is passed here; the index is persisted explicitly in a later cell.
index = VectorStoreIndex.from_documents(documents)

Save index to local disk

In [9]:
# Show the working directory that the relative paths (./data, ./index) resolve against.
%pwd

'/home/santhosh/Projects/courses/Pinnacle/RAG systems using LlamaIndex/Module 3 - Components of LlamaIndex/Notebooks'

In [10]:
# -p keeps the cell re-runnable when ./index already exists.
!mkdir -p index
# Write the index's storage context to ./index so it can be reloaded later.
index.storage_context.persist(persist_dir="./index")

In [11]:
# List the kernel namespace; `index` should appear in the table.
# NOTE: %whos prints variable values, so the API keys held in globals show up
# in the output. Clear or redact this cell's output before sharing the notebook.
%whos

Variable           Type                Data/Info
------------------------------------------------
NamespaceMagics    MetaHasTraits       <class 'IPython.core.magi<...>mespace.NamespaceMagics'>
OPENAI_API_KEY     str                 <redacted>
PDFReader          ABCMeta             <class 'llama_index.reade<...>ile.docs.base.PDFReader'>
PINECONE_API_KEY   str                 <redacted>
Path               type                <class 'pathlib.Path'>
VectorStoreIndex   ABCMeta             <class 'llama_index.core.<...>e.base.VectorStoreIndex'>
documents          list                n=15
find_dotenv        function            <function find_dotenv at 0x6ffe9b375620>
get_ipython        function            <function get_ipython at 0x6fffe9f8a0c0>
index              VectorStoreIndex    <llama_index.core.indices<...>object at 0x6ffe93852a10>
json               module              <module 'json' from '/hom<...>on3.11/json/__in

In [12]:
# Remove the in-memory index so the following cells can show it being restored from ./index.
del index

In [13]:
# List the namespace again; `index` should no longer appear after the `del`.
# NOTE: %whos prints variable values, so the API keys held in globals show up
# in the output. Clear or redact this cell's output before sharing the notebook.
%whos

Variable           Type             Data/Info
---------------------------------------------
NamespaceMagics    MetaHasTraits    <class 'IPython.core.magi<...>mespace.NamespaceMagics'>
OPENAI_API_KEY     str              <redacted>
PDFReader          ABCMeta          <class 'llama_index.reade<...>ile.docs.base.PDFReader'>
PINECONE_API_KEY   str              <redacted>
Path               type             <class 'pathlib.Path'>
VectorStoreIndex   ABCMeta          <class 'llama_index.core.<...>e.base.VectorStoreIndex'>
documents          list             n=15
find_dotenv        function         <function find_dotenv at 0x6ffe9b375620>
get_ipython        function         <function get_ipython at 0x6fffe9f8a0c0>
json               module           <module 'json' from '/hom<...>on3.11/json/__init__.py'>
load_dotenv        function         <function load_dotenv at 0x6ffe9b375760>
loader             PDFReader        <llama_i

In [14]:
# Demonstrate that the in-memory index is gone. The NameError is caught and
# printed so that "Restart & Run All" does not stop at this cell.
try:
    index
except NameError as err:
    print(f"NameError: {err}")

NameError: name 'index' is not defined

In [15]:
from llama_index.core import StorageContext, load_index_from_storage

In [16]:
# Rebuild the storage context from the files persisted under ./index.
storage_context = StorageContext.from_defaults(persist_dir="./index")

In [17]:
# Recreate the index object from the persisted storage context.
index = load_index_from_storage(storage_context)

In [18]:
# List the namespace once more; `index` should be back after loading from storage.
# NOTE: %whos prints variable values, so the API keys held in globals show up
# in the output. Clear or redact this cell's output before sharing the notebook.
%whos

Variable                  Type                Data/Info
-------------------------------------------------------
NamespaceMagics           MetaHasTraits       <class 'IPython.core.magi<...>mespace.NamespaceMagics'>
OPENAI_API_KEY            str                 <redacted>
PDFReader                 ABCMeta             <class 'llama_index.reade<...>ile.docs.base.PDFReader'>
PINECONE_API_KEY          str                 <redacted>
Path                      type                <class 'pathlib.Path'>
StorageContext            type                <class 'llama_index.core.<...>_context.StorageContext'>
VectorStoreIndex          ABCMeta             <class 'llama_index.core.<...>e.base.VectorStoreIndex'>
documents                 list                n=15
find_dotenv               function            <function find_dotenv at 0x6ffe9b375620>
get_ipython               function            <function get_ipython at 0x6fffe9f8a0c0>
i

# Using ChromaDB

In [21]:
import chromadb
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext

In [22]:
# Read the paper again, this time through SimpleDirectoryReader.
# This rebinds `documents`, replacing the list loaded earlier with PDFReader.
documents = SimpleDirectoryReader(
    input_files=["./data/transformers.pdf"],
).load_data()

In [23]:
# Create a Chroma client whose data is persisted under ./chroma_db.
db = chromadb.PersistentClient(path="./chroma_db")

In [24]:
# Fetch the "quickstart" collection, creating it if it does not exist yet.
chroma_collection = db.get_or_create_collection("quickstart")

In [25]:
# Wrap the Chroma collection as a LlamaIndex vector store, then hand it to a
# StorageContext so an index built with that context stores its vectors in Chroma.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [26]:
# Build the index with the Chroma-backed storage context.
# NOTE(review): the collection is persistent, so re-running this cell presumably
# inserts the same chunks again — confirm, and clear the collection if needed.
index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)

# Using Pinecone Vector DB

In [32]:
from pinecone import Pinecone, PodSpec

In [None]:
# Create the Pinecone client with the API key read from the environment in the Setup section.
pc = Pinecone(api_key=PINECONE_API_KEY)

In [34]:
# Create the "quickstart" index only when it does not exist yet, so re-running
# this cell does not fail on an already-existing index.
if "quickstart" not in pc.list_indexes().names():
    pc.create_index(
        name="quickstart",
        # NOTE(review): must equal the embedding model's vector size; 1536 presumably
        # matches the default OpenAI embedding model used by LlamaIndex — confirm.
        dimension=1536,
        metric="cosine",
        spec=PodSpec(environment="gcp-starter"),
    )

In [35]:
# Open a handle to the "quickstart" Pinecone index.
pinecone_index = pc.Index("quickstart")

In [36]:
# Load the paper with SimpleDirectoryReader for the Pinecone example (rebinds `documents`).
documents = SimpleDirectoryReader(
    input_files=["./data/transformers.pdf"],
).load_data()

In [37]:
from llama_index.vector_stores.pinecone import PineconeVectorStore
# Wrap the Pinecone index handle in LlamaIndex's vector-store adapter (rebinds `vector_store`).
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)

In [38]:
# Storage context backed by the Pinecone vector store (rebinds `storage_context`).
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [39]:
# Build the index through the Pinecone-backed storage context; the progress bar
# in the output reports the vectors being upserted.
index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)

Upserted vectors:   0%|          | 0/15 [00:00<?, ?it/s]

https://docs.llamaindex.ai/en/stable/understanding/storing/storing.html