In [4]:
import chromadb
from chromadb.utils import embedding_functions
import uuid

In [6]:
# In-memory Chroma client for quick prototyping; nothing is persisted, so the
# data lives only as long as this client object.
client = chromadb.Client()

# Sentence-transformers embedding function handed to the collection below.
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-mpnet-base-v2",
)

# Create the collection. The client also offers get_collection,
# get_or_create_collection and delete_collection.
collection = client.create_collection(
    "test_collection",
    embedding_function=sentence_transformer_ef,
)

# Seed the collection with two documents. Chroma tokenizes, embeds and indexes
# the texts itself; precomputed embeddings can be supplied instead.
collection.add(
    ids=["doc1", "doc2"],  # one unique id per document
    documents=[
        "This is water, juice, and blood",
        "This is rose, apple",
    ],
    metadatas=[  # queries can filter on these
        {"source": "notion"},
        {"source": "google-docs"},
    ],
)

# Querying is done in the next cell (entries can also be fetched by id with .get).

Using embedded DuckDB without persistence: data will be transient
  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)a8e1d/.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<00:00, 348kB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 151kB/s]
Downloading (…)b20bca8e1d/README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 5.78MB/s]
Downloading (…)0bca8e1d/config.json: 100%|██████████| 571/571 [00:00<00:00, 312kB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 54.1kB/s]
Downloading (…)e1d/data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 625kB/s]
Downloading pytorch_model.bin: 100%|██████████| 438M/438M [00:13<00:00, 31.7MB/s] 
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 21.7kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 141kB/s]
Downloading (…)a8e1d/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 6.42MB/s]
Downloading (…)o

In [3]:
# Fetch the two stored documents most similar to a single query text.
# Optional filters, left disabled here:
#   where={"metadata_field": "is_equal_to_this"}
#   where_document={"$contains": "search_string"}
results = collection.query(n_results=2, query_texts=["This is a query document"])

In [4]:
# Bare last expression so the notebook renders the query response dict
# (ids, documents, metadatas, distances).
# NOTE(review): the saved output lists 'This is document1' / 'This is document2',
# which no cell in this notebook adds - presumably stale output from an earlier
# run; confirm with Restart Kernel -> Run All.
results

{'ids': [['doc1', 'doc2']],
 'embeddings': None,
 'documents': [['This is document1', 'This is document2']],
 'metadatas': [[{'source': 'notion'}, {'source': 'google-docs'}]],
 'distances': [[0.9026352167129517, 1.0358158349990845]]}

In [5]:
# Append two more documents. No metadatas are passed for these entries, so
# query results report None for their metadata.
collection.add(
    ids=["doc3", "doc4"],  # one unique id per document
    documents=[
        "This is cat dog, monkey",
        "This is boba tea, noodle, rice",
    ],
)


In [9]:
# Query with a negatively phrased sentence and display the two closest matches.
# In the saved output the animal and food documents (doc3, doc4) are the ones
# returned, even though the query text asks to avoid them.
results = collection.query(
    n_results=2,
    query_texts=["Do not give me anything related to food or animal?"],
)
results

{'ids': [['doc3', 'doc4']],
 'embeddings': None,
 'documents': [['This is cat dog, monkey', 'This is boba tea, noodle, rice']],
 'metadatas': [[None, None]],
 'distances': [[1.5173072814941406, 1.537183403968811]]}

{'ids': ['doc3'],
 'embeddings': None,
 'documents': ['This is cat dog, monkey'],
 'metadatas': [None]}

In [1]:
from db.vector_store import get_collection



# Rebind `collection` to whatever db.vector_store.get_collection() returns; the
# cells below call .add / .query on this object.
# NOTE(review): the helper's embedding function and persistence settings are not
# visible here - presumably a Chroma collection; confirm in db/vector_store.
collection = get_collection()

Using embedded DuckDB without persistence: data will be transient
  from .autonotebook import tqdm as notebook_tqdm


In [6]:
def get_id(length: int = 8) -> str:
    """Return a random lowercase-hex identifier for a new document.

    The id is the first ``length`` hex digits of a freshly generated UUID4.
    With the default of 8 this yields the same kind of value as
    ``str(uuid.uuid4())[:8]``.

    Args:
        length: Number of hex characters to keep, between 1 and 32 inclusive.

    Returns:
        A string of ``length`` characters drawn from ``0-9a-f``.

    Raises:
        ValueError: If ``length`` is outside the range 1..32.
    """
    if not 1 <= length <= 32:
        raise ValueError(f"length must be between 1 and 32, got {length}")
    # `.hex` has no hyphens, so ids longer than 8 characters stay purely hex.
    # Short ids are only practically unique: 8 hex digits carry 32 random bits.
    return uuid.uuid4().hex[:length]

# Store two short sentences, each under a freshly generated random id.
collection.add(
    documents=[
        "python code is really hard to learn",
        "anaconda is a package manager",
    ],
    ids=[get_id() for _ in range(2)],
)

In [11]:
# Ask for the single stored document most similar to the query text; the
# response dict is the cell's displayed value.
collection.query(n_results=1, query_texts=["learn how to program"])

{'ids': [['a95fbffc']],
 'embeddings': None,
 'documents': [['python code is really hard to learn']],
 'metadatas': [[None]],
 'distances': [[1.0571759939193726]]}