In [1]:
from pprint import pprint

import chromadb

print(chromadb.__version__)

1.3.7


### Create client

In [2]:
# Create a Chroma client with default settings; every later cell talks to
# the vector store through this object.
chroma_client = chromadb.Client()

chroma_client

<chromadb.api.client.Client at 0x219a5867a10>

### Create collection
* Collections are where we store our embeddings, documents, and any additional metadata.

In [3]:
# Descriptive metadata stored alongside the collection itself.
collection_metadata = {
    "title": "random documents",
    "description": "This store contains the embeddings of random string",
}

# Create the collection that will hold our documents, embeddings and metadata.
collection = chroma_client.create_collection(
    name="First_collection",
    metadata=collection_metadata,
)
collection

Collection(name=First_collection)

In [4]:
# Inspect the metadata attached to the collection.
collection.metadata

{'description': 'This store contains the embeddings of random string',
 'title': 'random documents'}

In [5]:
# Inspect the collection's current name.
collection.name

'First_collection'

### We can use the modify() method to change the name and metadata of the collection we created

In [7]:
# New metadata that takes the place of the values supplied at creation time.
modified_metadata = {
    "description": "This store contains the random string (modified)",
    "title": "random documents (modified)",
}

# Rename the collection and swap in the new metadata in a single call.
collection.modify(name="First_modified_collection", metadata=modified_metadata)

In [8]:
# Confirm that the collection now reports the modified name and metadata.
collection.name, collection.metadata

('First_modified_collection',
 {'description': 'This store contains the random string (modified)',
  'title': 'random documents (modified)'})

### Add documents to collection

In [9]:
# One short sentence per vendor; these are the documents stored in the collection.
docs = [
    "Claude 4.5 is latest conversational AI model from Anthropic.",
    "Gemini is latest conversational AI model from google.",
    "Llama-4 is latest conversational AI model from Meta.",
    "Mixtral 8x7B is latest conversational AI model from Mistral AI.",
    "GPT-5 is latest conversational AI model from OpenAI",
]

# ids and metadatas are matched to `docs` by position, so keep all three
# sequences in the same order and of the same length.
doc_ids = ["anthropic", "google", "meta", "mistral", "openai"]
doc_metadatas = [{"version": version} for version in (4.5, 3, 4, 2, 5)]

# No embeddings are passed here, only the raw documents.
collection.add(documents=docs, ids=doc_ids, metadatas=doc_metadatas)

C:\Users\Lenovo\.cache\chroma\onnx_models\all-MiniLM-L6-v2\onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:12<00:00, 6.68MiB/s]


* embeddings: The embeddings to add. If None, embeddings are computed from the documents or images using the embedding function set for the collection. Optional.
* metadatas: The metadata to associate with the embeddings. When querying, you can filter on this metadata. Optional.
* documents: The documents to associate with the embeddings. Optional.
* images: The images to associate with the embeddings. Optional.
* uris: The URIs of the images to associate with the embeddings. Optional.

In [10]:
# Number of records currently stored in the collection.
collection.count()

5

In [11]:
# Fetch specific records by id; no `include` is given, so the default fields are returned.
collection.get(ids=['anthropic', 'openai'])

{'ids': ['anthropic', 'openai'],
 'embeddings': None,
 'documents': ['Claude 4.5 is latest conversational AI model from Anthropic.',
  'GPT-5 is latest conversational AI model from OpenAI'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [{'version': 4.5}, {'version': 5}]}

* `include`: the parameter that controls which fields are included in the result. For get(), the default is ['documents', 'metadatas'].

In [12]:
# Fetch the same two records, this time asking for embeddings and documents
# (metadatas are not requested here).
result = collection.get(ids=['anthropic', 'openai'], include=['embeddings', 'documents'])

In [13]:
# Display the fetched records, including their embedding vectors.
result

{'ids': ['anthropic', 'openai'],
 'embeddings': array([[ 8.84437840e-03, -1.74004920e-02,  3.94670758e-04,
         -1.40978023e-03,  3.81029211e-02,  1.25021103e-03,
          3.79684567e-02,  1.46647310e-02,  6.99088955e-03,
          1.76980924e-02, -2.36694794e-02, -1.79002751e-02,
         -9.04824287e-02,  2.53535882e-02,  2.34696325e-02,
          3.05134766e-02,  1.16425231e-02, -2.79733371e-02,
         -8.66195112e-02, -4.14507240e-02, -1.02093639e-02,
          1.12054832e-02,  4.51803245e-02, -2.50555971e-03,
         -4.33832668e-02, -4.37491909e-02, -6.64262474e-03,
         -8.30394477e-02,  5.00220284e-02, -7.41148228e-03,
          3.42607461e-02,  8.40155631e-02,  6.30196929e-02,
         -1.35988444e-02, -8.93824920e-02,  2.67263632e-02,
         -1.14643248e-02, -2.48200893e-02, -1.42621167e-03,
          4.70247716e-02, -2.13839952e-02, -5.04222959e-02,
         -5.14766723e-02, -2.06556451e-02,  8.03897902e-02,
          3.15873623e-02, -5.92645854e-02, -1.6362590

In [15]:
# Length of the first embedding vector, i.e. the dimensionality of the embeddings.
len(result['embeddings'][0])

384

In [16]:
# Take a quick look at up to 3 records from the collection.
# Note: this rebinds `result`, replacing the value from the earlier get() call.
result = collection.peek(limit=3)

In [17]:
# Display the records returned by peek().
result

{'ids': ['anthropic', 'google', 'meta'],
 'embeddings': array([[ 0.00884438, -0.01740049,  0.00039467, ...,  0.06249589,
         -0.03616665, -0.05505916],
        [-0.08895399, -0.09540191,  0.02952658, ...,  0.07311269,
         -0.01206857, -0.03972095],
        [ 0.00442572, -0.04778809, -0.0161633 , ...,  0.07723493,
         -0.03395461, -0.00226511]], shape=(3, 384)),
 'documents': ['Claude 4.5 is latest conversational AI model from Anthropic.',
  'Gemini is latest conversational AI model from google.',
  'Llama-4 is latest conversational AI model from Meta.'],
 'uris': None,
 'included': ['metadatas', 'documents', 'embeddings'],
 'data': None,
 'metadatas': [{'version': 4.5}, {'version': 3}, {'version': 4}]}

## Query vector store | Search relevant documents

### 4.1 Search relevant documents
* query_texts - Query based on text
* query_embeddings - Query based on embeddings value

The default distance metric is L2 (squared Euclidean distance), so a lower distance means the document is closer to the query.

In [18]:
# Ask for the 3 stored documents closest to a natural-language question.
# `pprint` is imported in the notebook's first cell together with the other imports.
question = "Which is the latest AI model from Anthropic?"

results = collection.query(
    query_texts=[question],  # Chroma embeds this text for us using the collection's embedding function
    n_results=3,
)

pprint(results)

{'data': None,
 'distances': [[0.5319048166275024, 0.9215655326843262, 0.9596002101898193]],
 'documents': [['Claude 4.5 is latest conversational AI model from Anthropic.',
                'Mixtral 8x7B is latest conversational AI model from Mistral '
                'AI.',
                'Llama-4 is latest conversational AI model from Meta.']],
 'embeddings': None,
 'ids': [['anthropic', 'mistral', 'meta']],
 'included': ['metadatas', 'documents', 'distances'],
 'metadatas': [[{'version': 4.5}, {'version': 2}, {'version': 4}]],
 'uris': None}


In [19]:
# Documents come back as a list of lists; here the outer list has a single
# entry because only one query text was submitted.
results['documents']

[['Claude 4.5 is latest conversational AI model from Anthropic.',
  'Mixtral 8x7B is latest conversational AI model from Mistral AI.',
  'Llama-4 is latest conversational AI model from Meta.']]

* `include`: the parameter that controls which fields are included in the result. For query(), the default is ["metadatas", "documents", "distances"].

In [20]:
# Same question as before, but explicitly choose the returned fields:
# embeddings are requested and metadatas are left out.
returned_fields = ['embeddings', 'documents', 'distances']

results = collection.query(
    query_texts=["Which is the latest AI model from Anthropic?"],
    n_results=3,
    include=returned_fields,
)

pprint(results)

{'data': None,
 'distances': [[0.5319048166275024, 0.9215655326843262, 0.9596002101898193]],
 'documents': [['Claude 4.5 is latest conversational AI model from Anthropic.',
                'Mixtral 8x7B is latest conversational AI model from Mistral '
                'AI.',
                'Llama-4 is latest conversational AI model from Meta.']],
 'embeddings': [array([[ 0.00884438, -0.01740049,  0.00039467, ...,  0.06249589,
        -0.03616665, -0.05505916],
       [-0.03611682, -0.08049719,  0.03304322, ..., -0.02273243,
        -0.01788436,  0.00234854],
       [ 0.00442572, -0.04778809, -0.0161633 , ...,  0.07723493,
        -0.03395461, -0.00226511]], shape=(3, 384))],
 'ids': [['anthropic', 'mistral', 'meta']],
 'included': ['embeddings', 'documents', 'distances'],
 'metadatas': None,
 'uris': None}


### Filter documents using the "where" and "where_document" clauses

Both clauses are supported by the query(), get() and delete() methods of a collection.

#### "where" clause
Metadata filtering supports the following operators:

* $eq - equal to (string, int, float)
* $ne - not equal to (string, int, float)
* $gt - greater than (int, float)
* $gte - greater than or equal to (int, float)
* $lt - less than (int, float)
* $lte - less than or equal to (int, float)

In [21]:
# Only consider documents whose "version" metadata equals 4.5.
# NOTE(review): the saved output for this cell also lists a record with
# version 4, which an $eq 4.5 filter should not match; the output may be
# stale. Re-run the cell to confirm.
version_filter = {"version": {"$eq": 4.5}}

results = collection.query(
    query_texts=["Which is the latest AI model from Anthropic?"],
    n_results=3,
    where=version_filter,
)

pprint(results)

{'data': None,
 'distances': [[0.5319048166275024, 0.9596002101898193]],
 'documents': [['Claude 4.5 is latest conversational AI model from Anthropic.',
                'Llama-4 is latest conversational AI model from Meta.']],
 'embeddings': None,
 'ids': [['anthropic', 'meta']],
 'included': ['metadatas', 'documents', 'distances'],
 'metadatas': [[{'version': 4.5}, {'version': 4}]],
 'uris': None}


In [22]:
# Only consider documents whose "version" metadata is strictly greater than 3.
version_filter = {"version": {"$gt": 3}}

results = collection.query(
    query_texts=["Which is the latest AI model from Anthropic?"],
    n_results=3,
    where=version_filter,
)

pprint(results)

{'data': None,
 'distances': [[0.5319048166275024, 0.9596002101898193, 1.0497297048568726]],
 'documents': [['Claude 4.5 is latest conversational AI model from Anthropic.',
                'Llama-4 is latest conversational AI model from Meta.',
                'GPT-5 is latest conversational AI model from OpenAI']],
 'embeddings': None,
 'ids': [['anthropic', 'meta', 'openai']],
 'included': ['metadatas', 'documents', 'distances'],
 'metadatas': [[{'version': 4.5}, {'version': 4}, {'version': 5}]],
 'uris': None}


* The following inclusion operators are supported:

* $in - a value is in predefined list (string, int, float, bool)
* $nin - value is not in predefined list (string, int, float, bool)

In [23]:
# Only consider documents whose "version" metadata is one of the listed values.
allowed_versions = [3, 4]
version_filter = {"version": {"$in": allowed_versions}}

results = collection.query(
    query_texts=["Which is the latest AI model from Anthropic?"],
    n_results=3,
    where=version_filter,
)

pprint(results)

{'data': None,
 'distances': [[0.9596002101898193, 0.999941349029541]],
 'documents': [['Llama-4 is latest conversational AI model from Meta.',
                'Gemini is latest conversational AI model from google.']],
 'embeddings': None,
 'ids': [['meta', 'google']],
 'included': ['metadatas', 'documents', 'distances'],
 'metadatas': [[{'version': 4}, {'version': 3}]],
 'uris': None}
