https://docs.trychroma.com/docs/overview/getting-started

In [1]:
import chromadb

# Initialize the ChromaDB client.
client = chromadb.Client()
# Use get_or_create_collection so this cell can be re-run in the same kernel:
# create_collection raises once a collection named "test_collection" exists.
collection = client.get_or_create_collection(name="test_collection")

In [2]:
# Seed the collection with two short fruit-themed documents, keyed by id.
fruit_docs = {
    "id1": "This is a document about pineapple",
    "id2": "This is a document about oranges",
}
collection.add(
    ids=list(fruit_docs),
    documents=list(fruit_docs.values()),
)


In [3]:
# Fetch everything currently stored in the collection and keep the response
# in `all_docs`; the bare name on the last line renders it as the cell output.
all_docs = collection.get()
all_docs

{'ids': ['id1', 'id2'],
 'embeddings': None,
 'documents': ['This is a document about pineapple',
  'This is a document about oranges'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [None, None]}

In [5]:
# Query the Collection
results = collection.query(
    n_results=2,  # how many results to return
    query_texts=["This is a query document about florida"],  # Chroma will embed this for you
)
print(results.keys())
print(results)
# Print the three fields of interest, one per line.
print(*(results[name] for name in ("ids", "documents", "distances")), sep="\n")

# Florida is most semantically similar to oranges,
# as Florida is famous for orange production (it's one of the largest orange-producing states in the U.S.)



dict_keys(['ids', 'embeddings', 'documents', 'uris', 'included', 'data', 'metadatas', 'distances'])
{'ids': [['id2', 'id1']], 'embeddings': None, 'documents': [['This is a document about oranges', 'This is a document about pineapple']], 'uris': None, 'included': ['metadatas', 'documents', 'distances'], 'data': None, 'metadatas': [[None, None]], 'distances': [[1.1462137699127197, 1.3015384674072266]]}
[['id2', 'id1']]
[['This is a document about oranges', 'This is a document about pineapple']]
[[1.1462137699127197, 1.3015384674072266]]


In [5]:
# Query the Collection
# Check the distances between the query and each document.
results = collection.query(
    n_results=2,  # how many results to return
    query_texts=["This is a query document about hawaii"],  # Chroma will embed this for you
)
print(results.keys())
print(results)
# Print the three fields of interest, one per line.
print(*(results[name] for name in ("ids", "documents", "distances")), sep="\n")

# Hawaii is semantically most similar to pineapple:
# Hawaii is historically known for large pineapple plantations.


dict_keys(['ids', 'embeddings', 'documents', 'uris', 'included', 'data', 'metadatas', 'distances'])
{'ids': [['id1', 'id2']], 'embeddings': None, 'documents': [['This is a document about pineapple', 'This is a document about oranges']], 'uris': None, 'included': ['metadatas', 'documents', 'distances'], 'data': None, 'metadatas': [[None, None]], 'distances': [[1.0404009819030762, 1.2430799007415771]]}
[['id1', 'id2']]
[['This is a document about pineapple', 'This is a document about oranges']]
[[1.0404009819030762, 1.2430799007415771]]


In [33]:
# Inspect Results
# Build the view from the live `results` object instead of a hand-typed
# literal, so the displayed values always match the most recent query.
# The key selection and order mirror the literal this cell used to contain.
{
    field: results[field]
    for field in (
        "documents",
        "ids",
        "distances",
        "uris",
        "data",
        "metadatas",
        "embeddings",
    )
}



{'documents': [['This is a document about pineapple',
   'This is a document about oranges']],
 'ids': [['id1', 'id2']],
 'distances': [[1.0404009819030762, 1.243080496788025]],
 'uris': None,
 'data': None,
 'metadatas': [[None, None]],
 'embeddings': None}

In [34]:
# Delete all data
# Look the ids up at delete time rather than reusing the earlier `all_docs`
# snapshot, which goes stale as soon as more documents are added.
stored_ids = collection.get()["ids"]
if stored_ids:  # nothing to delete when the collection is already empty
    collection.delete(ids=stored_ids)
collection.get()

{'ids': [],
 'embeddings': None,
 'documents': [],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': []}

### Using Metadata

In [6]:
# Add two hardware-themed documents, each tagged with a reference URL as metadata.
# Maps id -> (document text, reference url).
hardware_docs = {
    "id3": (
        "This is a document about cpu",
        "https://en.wikipedia.org/wiki/Central_processing_unit",
    ),
    "id4": (
        "This is a document about gpu",
        "https://en.wikipedia.org/wiki/Graphics_processing_unit",
    ),
}
collection.add(
    ids=list(hardware_docs),
    documents=[text for text, _ in hardware_docs.values()],
    metadatas=[{"url": url} for _, url in hardware_docs.values()],
)

In [7]:
# Show the collection's full contents after adding the cpu/gpu documents with metadata.
# NOTE(review): the saved output still lists id1/id2 even though an earlier cell
# deletes them — looks like the cells were run out of order; confirm with
# Restart Kernel -> Run All.
collection.get()

{'ids': ['id1', 'id2', 'id3', 'id4'],
 'embeddings': None,
 'documents': ['This is a document about pineapple',
  'This is a document about oranges',
  'This is a document about cpu',
  'This is a document about gpu'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [None,
  None,
  {'url': 'https://en.wikipedia.org/wiki/Central_processing_unit'},
  {'url': 'https://en.wikipedia.org/wiki/Graphics_processing_unit'}]}

In [None]:
# Ask for the two stored documents closest to a "core"-themed query.
results = collection.query(
    n_results=2,  # how many results to return
    query_texts=["This is a query document about core"],  # Chroma will embed this for you
)
results

{'ids': [['id3', 'id4']],
 'embeddings': None,
 'documents': [['This is a document about cpu',
   'This is a document about gpu']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[{'url': 'https://en.wikipedia.org/wiki/Central_processing_unit'},
   {'url': 'https://en.wikipedia.org/wiki/Graphics_processing_unit'}]],
 'distances': [[0.9372293949127197, 1.2746198177337646]]}