In [None]:
%reload_ext autoreload
%autoreload 2

# Where Filtering
This notebook demonstrates how to use `where` (metadata) and `where_document` (document content) filters to restrict the data returned from `get` or `query`.

In [None]:
import chromadb

In [None]:
# Create the Chroma client that the cells below use to manage collections
client = chromadb.Client()

In [None]:
# Start from a clean slate: remove every collection the client currently
# knows about (not just the first one listed), so the collections created
# below never collide with leftovers from a previous run of this notebook.
collections = client.list_collections()
for existing in collections:
    client.delete_collection(existing.name)

In [None]:
# Create a new chroma collection to hold the example records added below
collection_name = "filter_example_collection"
collection = client.create_collection(name=collection_name)

In [None]:
# Populate the collection with eight records. The embeddings and the "status"
# metadata alternate between two values; every record has its own document text.
collection.add(
    embeddings=[
        list(vector)
        for _ in range(4)
        for vector in ((1.1, 2.3, 3.2), (4.5, 6.9, 4.4))
    ],
    metadatas=[
        {"status": status}
        for _ in range(4)
        for status in ("read", "unread")
    ],
    documents=[
        "A document that discusses domestic policy",
        "A document that discusses international affairs",
        "A document that discusses kittens",
        "A document that discusses dogs",
        "A document that discusses chocolate",
        "A document that is sixth that discusses government",
        "A document that discusses dogs affairs",
        "A document that discusses global affairs",
    ],
    ids=[f"id{number}" for number in range(1, 9)],
)

In [None]:
# NOTE(review): presumably builds the index needed by the `$hybrid` filters
# used in the following cells -- confirm against the collection API.
collection.build_index()

In [None]:
# Fetch documents and embeddings using a `$hybrid` document filter built from
# the terms "affairs" and "dogs".
# NOTE(review): the exact matching/ranking semantics of `$hybrid` are not
# visible in this notebook -- confirm before relying on the output.
collection.get(
    include=["documents", "embeddings"],
    where_document={"$hybrid":{ "$hybrid_terms": ["affairs", "dogs"]}},
)

In [None]:
# Get documents whose text contains "affairs".
# Note: only the document content is filtered here; no `where` filter on the
# "status" metadata is applied, so both "read" and "unread" records can match.
collection.get(where_document={"$contains": "affairs"})

In [None]:
# Get documents whose text mentions either "global affairs" or "domestic policy"
collection.get(
    where_document={
        "$or": [
            {"$contains": "global affairs"},
            {"$contains": "domestic policy"},
        ]
    }
)

In [None]:
# Request up to 5 results for the query vector [0, 0, 0], using a `$hybrid`
# document filter on the terms "affairs" and "dogs" with a `$hybrid_weight`
# of 0.5, and return documents, distances and embeddings.
# NOTE(review): an earlier version of this comment said the cell outputs 3 docs
# because only 3 docs are about affairs; that described a plain "affairs"
# filter. Confirm the expected result count for the hybrid filter.
collection.query(
    query_embeddings=[[0, 0, 0]], 
    where_document={"$hybrid":{ "$hybrid_terms": ["affairs", "dogs"], "$hybrid_weight": 0.5}}, 
    n_results=5,
    n_buckets=4,
    include=["documents", "distances", "embeddings"]
)

In [None]:
# Request up to 5 results for the query vector [0, 0, 0], restricted by a
# `$not_contains` document filter on the phrase "domestic policy"
collection.query(query_embeddings=[[0, 0, 0]], where_document={"$not_contains": "domestic policy"}, n_results=5)

# Where Filtering With Logical Operators
This section demonstrates how one can use the logical operators in `where` filtering.

Chroma currently supports the `$and` and `$or` operators.

> Note: Logical operators can be nested

In [None]:
# Or Logical Operator Filtering
client = chromadb.Client()
collection = client.get_or_create_collection("test-where-list")

# Three articles, each tagged with its author in the metadata
collection.add(
    ids=["1", "2", "3"],
    documents=["Article by john", "Article by Jack", "Article by Jill"],
    metadatas=[{"author": name} for name in ("john", "jack", "jill")],
)

# Match records whose author is "john" or "jack"
collection.get(
    where={"$or": [{"author": name} for name in ("john", "jack")]}
)


In [None]:
# And Logical Operator Filtering
collection = client.get_or_create_collection("test-where-list")

# Upsert the same three ids so that each record also carries a "category"
collection.upsert(
    ids=["1", "2", "3"],
    documents=["Article by john", "Article by Jack", "Article by Jill"],
    metadatas=[
        {"author": "john", "category": "chroma"},
        {"author": "jack", "category": "ml"},
        {"author": "jill", "category": "lifestyle"},
    ],
)

# Both conditions must hold for a record to be returned
collection.get(
    where={
        "$and": [
            {"category": "chroma"},
            {"author": "john"},
        ]
    }
)

In [None]:
# `$and` filter that matches nothing: the records upserted above pair the
# "chroma" category with author "john", never with "jill"
collection.get(where={"$and": [{"category": "chroma"}, {"author": "jill"}]})

In [None]:
# Combined And and Or Logical Operator Filtering:
# category must be "chroma" AND the author must be "john" OR "jack"
collection.get(
    where={
        "$and": [
            {"category": "chroma"},
            {"$or": [{"author": "john"}, {"author": "jack"}]},
        ]
    }
)

In [None]:
# Combine a document-content filter with the nested metadata filter:
# the text must contain "Article" and the metadata must satisfy the $and/$or
collection.get(
    where_document={"$contains": "Article"},
    where={
        "$and": [
            {"category": "chroma"},
            {"$or": [{"author": "john"}, {"author": "jack"}]},
        ]
    },
)

In [None]:
from rank_bm25 import BM25Okapi
import numpy as np

# Example corpus to rank with BM25
documents = [
    "A document that discusses international affairs",
    "A document that discusses global affairs",
    "A document that discusses dogs",
    "A document that discusses dogs affairs",
    "A document that discusses domestic policy",
    "A document that discusses kittens",
    "A document that discusses chocolate",
    "A document that is sixth that discusses government",
]

# Lower-case each document and split it on single spaces to get its tokens
tokenized_docs = [text.lower().split(" ") for text in documents]
bm25 = BM25Okapi(tokenized_docs)

# Score every document against the query terms
query = ["dogs", "affairs"]
scores = bm25.get_scores(query)

# Document indices ordered from the highest score to the lowest
results = np.argsort(scores)[::-1]

# Keep only the documents that received a positive score, best first
ranked_documents = []
for idx in results:
    if scores[idx] > 0:
        ranked_documents.append(documents[idx])

for doc in ranked_documents:
    print(doc)

In [None]:
import sqlite3

# Full-text search over the example corpus with SQLite FTS5, using an
# in-memory database and the trigram tokenizer.
documents = [
    'A document that discusses international affairs',
    'A document that discusses global affairs',
    'A document that discusses dogs',
    'A document that discusses dogs affairs',
    'A document that discusses domestic policy',
    'A document that discusses kittens',
    'A document that discusses chocolate',
    'A document that is sixth that discusses government'
]

# FTS5 query string: match rows containing either term
search_terms = 'dogs OR affairs'

conn = sqlite3.connect(':memory:')
try:
    c = conn.cursor()

    c.execute("CREATE VIRTUAL TABLE docs USING fts5(content, tokenize='trigram')")

    # One single-column row per document
    c.executemany(
        'INSERT INTO docs (content) VALUES (?)',
        [(doc,) for doc in documents],
    )
    conn.commit()

    # Bind the search string as a parameter instead of formatting it into the
    # SQL text, so quotes inside the search string cannot break the statement.
    c.execute(
        "SELECT content FROM docs WHERE docs MATCH ? ORDER BY rank",
        (search_terms,),
    )
    results = c.fetchall()
finally:
    # Release the connection even if one of the statements above fails
    conn.close()

# Each row is a 1-tuple; unpack it so the text itself is printed rather than
# the tuple representation.
for (content,) in results:
    print(f'Content: {content}')


In [None]:
# Ranked result lists (best match first), keyed by the query that produced them
rankings = {
    'query1': ['id7', 'id4', 'id8', 'id2'],
    'query2': ['id3', 'id5', 'id7', 'id1', 'id6'],
}


def result_func(ranking, q):
    """Return the ranked list of document ids stored under query ``q``."""
    return ranking[q]


def rank_func(results, d):
    """Return the 1-based position of document ``d`` within ``results``.

    Raises:
        ValueError: if ``d`` does not occur in ``results``.
    """
    return results.index(d) + 1  # list.index is 0-based; ranks start from 1


def reciprocal_rank_fusion(queries, d, k, result_func, rank_func, rankings):
    """Compute the reciprocal rank fusion score of document ``d``.

    For every query in ``queries`` whose result list contains ``d``, the term
    ``1 / (k + rank)`` is added to the score; queries that did not return
    ``d`` contribute nothing.

    Args:
        queries: iterable of query keys to fuse.
        d: document id to score.
        k: constant added to the rank in the denominator.
        result_func: callable ``(rankings, q)`` returning the results of ``q``.
        rank_func: callable ``(results, d)`` returning the rank of ``d``.
        rankings: container handed to ``result_func``.

    Returns:
        The fused score, or ``0`` if no query returned ``d``.
    """
    score = 0
    for q in queries:
        results = result_func(rankings, q)  # look the results up once per query
        if d in results:
            score += 1.0 / (k + rank_func(results, d))
    return score


k = 5       # constant used in the 1 / (k + rank) term
TOP_N = 5   # number of fused results to display
queries = list(rankings.keys())

# Every document id that appears in at least one ranking
combined_ids = set(doc_id for query_results in rankings.values() for doc_id in query_results)

id_scores = {doc: reciprocal_rank_fusion(queries, doc, k, result_func, rank_func, rankings) for doc in combined_ids}

# Highest score first. Ties are broken by document id: the ids come from a
# set, whose iteration order for strings can differ between interpreter runs,
# so sorting on the score alone would make the output non-reproducible.
sorted_documents = sorted(id_scores.items(), key=lambda item: (-item[1], item[0]))

top_documents = sorted_documents[:TOP_N]

for doc, score in top_documents:
    print(f"{doc}: {score}")