# Chroma Vector Store

#### Creating a Chroma Index

In [2]:
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [3]:
import chromadb

In [4]:
chroma_client = chromadb.Client()
chroma_collection = chroma_client.create_collection("quickstart")

INFO:chromadb.telemetry.posthog:Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
INFO:chromadb:Running Chroma using direct local API.
Running Chroma using direct local API.
Running Chroma using direct local API.
INFO:numexpr.utils:Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.
NumExpr defaulting to 8 threads.
NumExpr defaulting to 8 threads.
Using embedded DuckDB without persistence: data will be transient
Using embedded DuckDB without persistence: data will be transien

  from .autonotebook import tqdm as notebook_tqdm


INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
Load pretrained SentenceTransformer: all-MiniLM-L6-v2
Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cpu
Use pytorch device: cpu
Use pytorch device: cpu


In [5]:
from llama_index import GPTVectorStoreIndex, StorageContext
from llama_index.vector_stores import ChromaVectorStore

In [6]:
from llama_index.data_structs.node import Node

nodes = [
    Node("Michael Jordan is a retired professional basketball player, widely regarded as one of the greatest basketball players of all time.", extra_info={
        "category": "Sports",
        "country": "United States",
    }),
    Node("Angelina Jolie is an American actress, filmmaker, and humanitarian. She has received numerous awards for her acting and is known for her philanthropic work.", extra_info={
        "category": "Entertainment",
        "country": "United States",
    }),
    Node("Elon Musk is a business magnate, industrial designer, and engineer. He is the founder, CEO, and lead designer of SpaceX, Tesla, Inc., Neuralink, and The Boring Company.", extra_info={
        "category": "Business",
        "country": "United States",
    }),
    Node("Rihanna is a Barbadian singer, actress, and businesswoman. She has achieved significant success in the music industry and is known for her versatile musical style.", extra_info={
        "category": "Music",
        "country": "Barbados",
    }),
    Node("Cristiano Ronaldo is a Portuguese professional footballer who is considered one of the greatest football players of all time. He has won numerous awards and set multiple records during his career.", extra_info={
        "category": "Sports",
        "country": "Portugal",
    })
]

In [8]:
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [9]:
index = GPTVectorStoreIndex(nodes, storage_context=storage_context)

INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 211 tokens
> [build_index_from_nodes] Total embedding token usage: 211 tokens
> [build_index_from_nodes] Total embedding token usage: 211 tokens


In [10]:
from llama_index.indices.vector_store.retrievers import VectorIndexAutoRetriever
from llama_index.vector_stores.types import MetadataInfo, VectorStoreInfo


vector_store_info = VectorStoreInfo(
    content_info='brief biography of celebrities',
    metadata_info=[
        MetadataInfo(
            name='category', 
            type='str', 
            description='Category of the celebrity, one of [Sports, Entertainment, Business, Music]'),
        MetadataInfo(name='country', type='str', description='Country of the celebrity, one of [United States, Barbados, Portugal]'),
    ]
)
retriever = VectorIndexAutoRetriever(index, vector_store_info=vector_store_info)

In [11]:
retriever.retrieve('Tell me about two celebrities from United States')

INFO:llama_index.indices.vector_store.auto_retriever.auto_retriever:Using query str: celebrities
Using query str: celebrities
Using query str: celebrities
INFO:llama_index.indices.vector_store.auto_retriever.auto_retriever:Using filters: {'country': 'United States'}
Using filters: {'country': 'United States'}
Using filters: {'country': 'United States'}
INFO:llama_index.indices.vector_store.auto_retriever.auto_retriever:Using top_k: 2
Using top_k: 2
Using top_k: 2
INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 3 tokens
> [retrieve] Total embedding token usage: 3 tokens
> [retrieve] Total embedding token usage: 3 tokens


[NodeWithScore(node=Node(text='Angelina Jolie is an American actress, filmmaker, and humanitarian. She has received numerous awards for her acting and is known for her philanthropic work.', doc_id='7389c8ad-2feb-4cf3-a8da-6c0f08c0f222', embedding=None, doc_hash='1171ef7bb1b89283a1012fecb0ea7a831a0e38c2ed6fac9fbfdd62ad64063934', extra_info={'category': 'Entertainment', 'country': 'United States'}, node_info={}, relationships={}), score=0.3262841090927262),
 NodeWithScore(node=Node(text='Michael Jordan is a retired professional basketball player, widely regarded as one of the greatest basketball players of all time.', doc_id='da13fc89-72cb-401e-8445-9b9680372fbf', embedding=None, doc_hash='44c17458239bdba3c72f8ed6ac12e096b4c7965f6b5154d74b7a10484dad16a4', extra_info={'category': 'Sports', 'country': 'United States'}, node_info={}, relationships={}), score=0.3734403491142674)]

In [12]:
retriever.retrieve('Tell me about Sports celebrities from United States')

INFO:llama_index.indices.vector_store.auto_retriever.auto_retriever:Using query str: Sports celebrities
Using query str: Sports celebrities
Using query str: Sports celebrities
INFO:llama_index.indices.vector_store.auto_retriever.auto_retriever:Using filters: {'category': 'Sports', 'country': 'United States'}
Using filters: {'category': 'Sports', 'country': 'United States'}
Using filters: {'category': 'Sports', 'country': 'United States'}
INFO:llama_index.indices.vector_store.auto_retriever.auto_retriever:Using top_k: 2
Using top_k: 2
Using top_k: 2
INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 2 tokens
> [retrieve] Total embedding token usage: 2 tokens
> [retrieve] Total embedding token usage: 2 tokens


[NodeWithScore(node=Node(text='Michael Jordan is a retired professional basketball player, widely regarded as one of the greatest basketball players of all time.', doc_id='da13fc89-72cb-401e-8445-9b9680372fbf', embedding=None, doc_hash='44c17458239bdba3c72f8ed6ac12e096b4c7965f6b5154d74b7a10484dad16a4', extra_info={'category': 'Sports', 'country': 'United States'}, node_info={}, relationships={}), score=0.3328886457329614)]