In [2]:
pip install chromadb

# What is Chroma DB?
# Chroma DB is a database optimized for storing, indexing, and querying vectors 
#(usually embeddings). It is built to handle high-dimensional data like word embeddings, 
#image embeddings, or any form of data transformed into a vector format. This makes it ideal for tasks 
#where you need to search, compare, or retrieve data based on similarity in a vector space

Collecting chromadb
  Using cached chromadb-0.5.11-py3-none-any.whl (603 kB)
Collecting mmh3>=4.0.1
  Using cached mmh3-5.0.1-cp39-cp39-win_amd64.whl (39 kB)
Collecting fastapi>=0.95.2
  Using cached fastapi-0.115.0-py3-none-any.whl (94 kB)
Collecting kubernetes>=28.1.0
  Using cached kubernetes-31.0.0-py2.py3-none-any.whl (1.9 MB)
Collecting importlib-resources
  Using cached importlib_resources-6.4.5-py3-none-any.whl (36 kB)
Collecting chroma-hnswlib==0.7.6
  Using cached chroma_hnswlib-0.7.6-cp39-cp39-win_amd64.whl (151 kB)
Collecting build>=1.0.3
  Using cached build-1.2.2-py3-none-any.whl (22 kB)
Collecting bcrypt>=4.0.1
  Using cached bcrypt-4.2.0-cp39-abi3-win_amd64.whl (151 kB)
Note: you may need to restart the kernel to use updated packages.


Installing collected packages: mmh3, kubernetes, importlib-resources, fastapi, chroma-hnswlib, build, bcrypt, chromadb
  Attempting uninstall: bcrypt
    Found existing installation: bcrypt 3.2.0
    Uninstalling bcrypt-3.2.0:
      Successfully uninstalled bcrypt-3.2.0
Successfully installed bcrypt-4.2.0 build-1.2.2 chroma-hnswlib-0.7.6 chromadb-0.5.11 fastapi-0.115.0 importlib-resources-6.4.5 kubernetes-31.0.0 mmh3-5.0.1


In [5]:
import chromadb

# Chroma client built with the library defaults (no arguments passed).
client = chromadb.Client()

# Use get_or_create_collection instead of create_collection so this cell can be
# re-run in a live kernel: create_collection fails once "my_collection" already
# exists, whereas this call hands back the existing collection in that case.
collection = client.get_or_create_collection(name="my_collection")

In [6]:
# Store two short text documents in the Chroma collection, one id per document.
# Only raw text and ids are supplied here; no embeddings or metadata are passed.
collection.add(
    ids=["id1", "id2"],
    documents=["This is Aslam", "This is Virat"],
)

C:\Users\ahame\.cache\chroma\onnx_models\all-MiniLM-L6-v2\onnx.tar.gz: 100%|█████| 79.3M/79.3M [00:23<00:00, 3.51MiB/s]


In [7]:
# Fetch the contents of the collection without passing any ids or filters,
# keep the result in `all_docs`, and display it as the cell output.
all_docs = collection.get()
all_docs       # display the retrieved records

{'ids': ['id1', 'id2'],
 'embeddings': None,
 'metadatas': [None, None],
 'documents': ['This is Aslam', 'This is Virat'],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents']}

In [8]:
# Look up a specific record by id; `ids` is given as a list (here a single id).
getDataById = collection.get(ids=['id1'])
getDataById         # display the record stored under 'id1'

{'ids': ['id1'],
 'embeddings': None,
 'metadatas': [None],
 'documents': ['This is Aslam'],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents']}

In [9]:
# Semantic search: request the 2 stored documents closest to the query text.
# The displayed result lists ids, documents, metadatas and distances per query.
results = collection.query(
    n_results=2,
    query_texts=["query is about india"],
)
results

{'ids': [['id2', 'id1']],
 'distances': [[1.813114881515503, 1.8394901752471924]],
 'metadatas': [[None, None]],
 'embeddings': None,
 'documents': [['This is Virat', 'This is Aslam']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

In [10]:
# Delete every document currently in the collection.
# The ids are read from the collection at delete time rather than taken from
# the `all_docs` snapshot made in an earlier cell, so this cell still clears
# everything even if documents were added or removed after that snapshot.
current_ids = collection.get()["ids"]
if current_ids:  # skip on an already-empty collection instead of passing an empty id list
    collection.delete(ids=current_ids)
collection.get()  # show what is left in the collection (expected: nothing)

{'ids': [],
 'embeddings': None,
 'metadatas': [],
 'documents': [],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents']}

In [11]:
# Store the two documents in the Chroma collection again, this time attaching
# one metadata dict (a single "url" entry) to each document.
# NOTE(review): the url values contain spaces, so they look like placeholders
# rather than working links — confirm before relying on them.
collection.add(
    ids=["id1", "id2"],
    documents=["This is Aslam", "This is Virat"],
    metadatas=[
        {"url": "https://en.wikipedia.org/wiki/This is Aslam"},
        {"url": "https://en.wikipedia.org/wiki/This is Virat"},
    ],
)

In [12]:
# Run the semantic query for the 2 closest documents once more; the displayed
# result now includes the "url" metadata stored alongside each document.
results = collection.query(
    n_results=2,
    query_texts=["query is about india"],
)
results

{'ids': [['id2', 'id1']],
 'distances': [[1.813114881515503, 1.8394901752471924]],
 'metadatas': [[{'url': 'https://en.wikipedia.org/wiki/This is Virat'},
   {'url': 'https://en.wikipedia.org/wiki/This is Aslam'}]],
 'embeddings': None,
 'documents': [['This is Virat', 'This is Aslam']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}