# 不同向量库的对比
 - MetadataFilter
 - BM25
## MetadataFilter

## Metadata Query Operators

| **运算符** | **描述**                                                   | **支持的值类型**         | **示例**                           |
|------------|-----------------------------------------------------------|--------------------------|-------------------------------------|
| `$eq`      | 匹配元数据值等于指定值的向量。                             | Number, string, boolean  | `{ "author": { "$eq": "john" } }`  |
| `$ne`      | 匹配元数据值不等于指定值的向量。                           | Number, string, boolean  | `{ "author": { "$ne": "jack" } }`  |
| `$gt`      | 匹配元数据值大于指定值的向量。                             | Number                   | `{ "age": { "$gt": 30 } }`         |
| `$gte`     | 匹配元数据值大于或等于指定值的向量。                       | Number                   | `{ "age": { "$gte": 30 } }`        |
| `$lt`      | 匹配元数据值小于指定值的向量。                             | Number                   | `{ "age": { "$lt": 30 } }`         |
| `$lte`     | 匹配元数据值小于或等于指定值的向量。                       | Number                   | `{ "age": { "$lte": 30 } }`        |
| `$in`      | 匹配元数据值在指定数组中的向量。                           | String, number           | `{ "author": { "$in": ["john", "jill"] } }` |
| `$nin`     | 匹配元数据值不在指定数组中的向量。                         | String, number           | `{ "author": { "$nin": ["jack"] } }` |
| `$exists`  | 匹配具有指定元数据字段的向量。                             | Boolean                  | `{ "author": { "$exists": true } }` |


In [8]:
import chromadb

from chromadb.utils import embedding_functions

# Embedding function backed by the "all-MiniLM-L6-v2" sentence-transformers model;
# the collection uses it for both the stored documents and the query text.
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

client = chromadb.Client()
collection = client.get_or_create_collection("test-where-list", embedding_function=sentence_transformer_ef)

# upsert (rather than add) keeps this cell re-runnable: get_or_create_collection may
# hand back a collection that already holds ids "1"-"3" from an earlier execution.
collection.upsert(
    documents=["Article by john", "Article by Jack", "Article by Jill"],
    metadatas=[{"author": "john"}, {"author": "jack"}, {"author": "jill"}],
    ids=["1", "2", "3"],
)

# Metadata filter shared by the similarity query and the plain lookup below:
# keep only records whose `author` is one of the listed values.
author_filter = {'author': {'$in': ['john', 'jill']}}

query = ["Give me articles by john"]
# n_results is deliberately larger than the collection; Chroma reports that it
# lowers the value to the number of stored elements.
res = collection.query(query_texts=query, where=author_filter, n_results=10)
print(res)

# get() applies the same filter without any query text, so the result carries
# no distances.
res_get = collection.get(where=author_filter)
print(res_get)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Number of requested results 10 is greater than number of elements in index 3, updating n_results = 3


{'ids': [['1', '3']], 'distances': [[0.2882421016693115, 1.0175083875656128]], 'metadatas': [[{'author': 'john'}, {'author': 'jill'}]], 'embeddings': None, 'documents': [['Article by john', 'Article by Jill']], 'uris': None, 'data': None, 'included': ['metadatas', 'documents', 'distances']}
{'ids': ['1', '3'], 'embeddings': None, 'metadatas': [{'author': 'john'}, {'author': 'jill'}], 'documents': ['Article by john', 'Article by Jill'], 'uris': None, 'data': None, 'included': ['metadatas', 'documents']}


In [9]:
import pprint

# Pretty-print the similarity-query result, keeping the dict's own key order.
pprint.pprint(res, sort_dicts=False)

{'ids': [['1', '3']],
 'distances': [[0.2882421016693115, 1.0175083875656128]],
 'metadatas': [[{'author': 'john'}, {'author': 'jill'}]],
 'embeddings': None,
 'documents': [['Article by john', 'Article by Jill']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}


In [10]:
# Pretty-print the get() result, keeping the dict's own key order.
pprint.pprint(res_get, sort_dicts=False)

{'ids': ['1', '3'],
 'embeddings': None,
 'metadatas': [{'author': 'john'}, {'author': 'jill'}],
 'documents': ['Article by john', 'Article by Jill'],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents']}


In [11]:
# Re-write the three records so that each one also carries an `article_type` field.
article_metadatas = [
    {"author": "john", "article_type": "blog"},
    {"author": "jack", "article_type": "social"},
    {"author": "jill", "article_type": "paper"},
]
collection.upsert(
    documents=["Article by john", "Article by Jack", "Article by Jill"],
    metadatas=article_metadatas,
    ids=["1", "2", "3"],
)

# Both conditions must hold: the author is john or jill AND the article is a blog.
blog_by_author_filter = {
    "$and": [
        {"author": {'$in': ['john', 'jill']}},
        {"article_type": {"$eq": "blog"}},
    ]
}
collection.query(query_texts=query, where=blog_by_author_filter, n_results=3)

{'ids': [['1']],
 'distances': [[0.2882421016693115]],
 'metadatas': [[{'article_type': 'blog', 'author': 'john'}]],
 'embeddings': None,
 'documents': [['Article by john']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

## BM25

In [1]:
from pymilvus.model.sparse.bm25.tokenizers import build_default_analyzer
from pymilvus.model.sparse import BM25EmbeddingFunction

# Default English analyzer; it is applied to the first corpus sentence below so the
# resulting tokens can be inspected.
analyzer = build_default_analyzer(language="en")

# Small reference corpus, reused by later cells.
corpus = [
    "Artificial intelligence was founded as an academic discipline in 1956.",
    "Alan Turing was the first person to conduct substantial research in AI.",
    "Born in Maida Vale, London, Turing was raised in southern England.",
]

tokens = analyzer(corpus[0])
print(f"tokens: {tokens}")


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/blackink/nltk_data...


tokens: ['artifici', 'intellig', 'found', 'academ', 'disciplin', '1956']


[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# Wrap the analyzer in a BM25 embedding function, then fit it on `corpus`
# before any documents or queries are encoded.
bm25_ef = BM25EmbeddingFunction(analyzer=analyzer)
bm25_ef.fit(corpus)


In [3]:
# Paraphrases of the corpus sentences, used as the documents to encode.
docs = [
    "The field of artificial intelligence was established as an academic subject in 1956.",
    "Alan Turing was the pioneer in conducting significant research in artificial intelligence.",
    "Originating in Maida Vale, London, Turing grew up in the southern regions of England.",
    "In 1956, artificial intelligence emerged as a scholarly field.",
    "Turing, originally from Maida Vale, London, was brought up in the south of England.",
]

# Encode every document with the fitted BM25 function and show the result.
docs_embeddings = bm25_ef.encode_documents(docs)
print("Embeddings:", docs_embeddings)

# Report the embedding dimension next to the shape of the first document's row.
first_doc_shape = next(iter(docs_embeddings)).shape
print("Sparse dim:", bm25_ef.dim, first_doc_shape)


Embeddings: <Compressed Sparse Row sparse array of dtype 'float32'
	with 24 stored elements and shape (5, 21)>
  Coords	Values
  (0, 0)	1.0208816528320312
  (0, 1)	1.0208816528320312
  (0, 3)	1.0208816528320312
  (0, 5)	1.0208816528320312
  (1, 0)	0.960698664188385
  (1, 1)	0.960698664188385
  (1, 6)	0.960698664188385
  (1, 7)	0.960698664188385
  (1, 10)	0.960698664188385
  (1, 12)	0.960698664188385
  (2, 7)	0.907216489315033
  (2, 15)	0.907216489315033
  (2, 16)	0.907216489315033
  (2, 17)	0.907216489315033
  (2, 19)	0.907216489315033
  (2, 20)	0.907216489315033
  (3, 0)	1.089108943939209
  (3, 1)	1.089108943939209
  (3, 5)	1.089108943939209
  (4, 7)	0.960698664188385
  (4, 15)	0.960698664188385
  (4, 16)	0.960698664188385
  (4, 17)	0.960698664188385
  (4, 20)	0.960698664188385
Sparse dim: 21 (21,)


In [4]:
# Two natural-language questions to encode as BM25 query vectors.
queries = [
    "When was artificial intelligence founded",
    "Where was Alan Turing born?",
]

# Queries go through encode_queries (not encode_documents) on the fitted function.
query_embeddings = bm25_ef.encode_queries(queries)
print("Embeddings:", query_embeddings)

# Report the embedding dimension next to the shape of the first query's row.
first_query_shape = next(iter(query_embeddings)).shape
print("Sparse dim:", bm25_ef.dim, first_query_shape)


Embeddings: <Compressed Sparse Row sparse array of dtype 'float32'
	with 6 stored elements and shape (2, 21)>
  Coords	Values
  (0, 0)	0.5108256340026855
  (0, 1)	0.5108256340026855
  (0, 2)	0.5108256340026855
  (1, 6)	0.5108256340026855
  (1, 7)	0.115543894469738
  (1, 14)	0.5108256340026855
Sparse dim: 21 (21,)
