In [2]:
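# Chroma needs a recent SQLite (>= 3.35). If the interpreter's bundled sqlite3
# is older (typical for Python < 3.10), uncomment the lines below to swap in
# the pysqlite3 package before importing chromadb:
# !pip install pysqlite3-binary
# __import__('pysqlite3')
# import sys
# sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')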

In [3]:
import os

import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction


# Settings for the embedding model and the on-disk vector store.
# The OpenAI key is taken from the environment so that a real credential is
# never saved inside the notebook file.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before running this notebook."
    )
model_name = "text-embedding-3-small"
db_path = "vectorstore"


# Persistent client: the collection data is kept under `db_path`.
client = chromadb.PersistentClient(path=db_path)

# Embedding function used by the collection for documents and query texts.
embedding_fn = OpenAIEmbeddingFunction(
    api_key=api_key,
    model_name=model_name,
)

# Reuse "my_collection" if it already exists, otherwise create it configured
# for cosine distance.
collection = client.get_or_create_collection(
    name="my_collection",
    embedding_function=embedding_fn,
    metadata={"hnsw:space": "cosine"},
)

In [4]:
import json

# Read the article records that the following cells index.
# The encoding is pinned to UTF-8 because the titles include non-ASCII
# (Cyrillic) text and the platform default encoding is not UTF-8 everywhere.
with open('articles.json', 'r', encoding='utf-8') as f:
    articles = json.load(f)

In [5]:
# Build the parallel lists Chroma expects: one document, one metadata dict and
# one id per article. Each article must provide 'title' and 'date' keys.
texts = [article['title'] for article in articles]
meta = [{'timestamp': article['date']} for article in articles]
# Ids are the positions in `articles`, so they are identical on every run.
ids = [str(i) for i in range(len(articles))]

# upsert instead of add: the store is persistent and the ids are fixed, so
# re-running this cell must update the existing records rather than collide
# with them.
collection.upsert(
    ids=ids,
    metadatas=meta,
    documents=texts,
)

In [8]:
# Nearest-neighbour search for the query text, limited to records whose
# 'timestamp' metadata is greater than the cutoff.
search_text = "Apple AI"
min_timestamp = 1412137600  # presumably Unix epoch seconds, like time.time()
timestamp_filter = {"timestamp": {"$gt": min_timestamp}}

collection.query(
    n_results=5,
    where=timestamp_filter,
    query_texts=search_text,
)

Number of requested results 5 is greater than number of elements in index 4, updating n_results = 4


{'ids': [['3', '2', '0', '1']],
 'distances': [[0.3533566369244252,
   0.382343716566358,
   0.792960077643678,
   0.9688457418429988]],
 'metadatas': [[{'timestamp': 1728154800},
   {'timestamp': 1728154800},
   {'timestamp': 1560519810},
   {'timestamp': 1728154800}]],
 'embeddings': None,
 'documents': [['Apple Intelligence: ChatGPT coming to iPhones in AI overhaul',
   'Introducing Apple Intelligence for iPhone, iPad, and Mac',
   'Как искусственный интеллект применяется в металлургии',
   'ПМЭФ-2024: Общий объем подписанных соглашений Самарской области превысил 100 млрд рублей']],
 'uris': None,
 'data': None}

In [7]:
# Number of records currently stored in the collection.
collection.count()

4

## Как работать с категориями

In [9]:
import time

# Demo record carrying several kinds of metadata: an integer timestamp, a
# numeric score and boolean category flags that the next cell filters on.
test_metadata = {
    "timestamp": int(time.time()),
    "relevancy": 6,
    "cat1": True,
    "cat2": False,
    "cat3": True,
}

# upsert instead of add: the id is fixed and the store is persistent, so
# re-running this cell refreshes the record rather than colliding with the
# copy written by an earlier run.
collection.upsert(
    ids=["test_id"],
    metadatas=[test_metadata],
    documents=["hello world!"],
)

In [11]:
# Fetch the demo record by id, but only if its metadata satisfies every
# condition below (combined with $and): relevancy above 5, cat1 set, and a
# timestamp strictly inside the given window.
category_filter = {
    "$and": [
        {"relevancy": {"$gt": 5}},
        {"cat1": True},
        {"timestamp": {"$gt": 1412137600}},
        {"timestamp": {"$lt": 1812137600}},
    ]
}

collection.get(ids="test_id", where=category_filter)

{'ids': ['test_id'],
 'embeddings': None,
 'metadatas': [{'cat1': True,
   'cat2': False,
   'cat3': True,
   'relevancy': 6,
   'timestamp': 1718126909}],
 'documents': ['hello world!'],
 'uris': None,
 'data': None}