In [1]:
# refer to the previous tutorial hello_milvus, https://milvus.io/docs/v2.3.x/example_code.md
# In this tutorail, we demonstrate the basic operations of PyMilvus, a Python SDK of Milvus.

# 1. connect to Milvus
# 2. create a collection
# 3. insert data
# 4. create index
# 5. search, query, and hybrid search on entities
# 6. delete entities by PK
# 7. drop collection

# compared to quickStart.ipynb, it will involve more detailed operations

In [2]:
import time
import numpy as np
from pymilvus import MilvusClient, FieldSchema, CollectionSchema, DataType

In [3]:
fmt = "\n=== {:30} ===\n"
search_latency_fmt = "search latency = {:.4f}s"
num_entities, dim = 3000, 8

In [6]:
# --------------- 1. Connect to Milvus Server
HOST = '10.10.10.250'
PORT = 19530
DB_NAME = 'testdb'
URL="http://"+HOST+':'+str(PORT)
# if you deployed the standalone milvus, and connect to the database server
# if you deployed in your local machine, use "http://localhost:19530"
client = MilvusClient(URL)

if DB_NAME not in client.list_databases():
    client.create_database(DB_NAME)
client.using_database(DB_NAME)
client.list_databases()

['default', 'testdb']

In [7]:
# ---------------- 2. Create a collection with customized schema
# We are going to create a collection with 3 fields.
# +-+------------+------------+------------------+------------------------------+
# | | field name | field type | other attributes |       field description      |
# +-+------------+------------+------------------+------------------------------+
# |1|    "pk"    |   VarChar  |  is_primary=True |      "primary field"         |
# | |            |            |   auto_id=False  |                              |
# +-+------------+------------+------------------+------------------------------+
# |2|  "random"  |    Double  |                  |      "a double field"        |
# +-+------------+------------+------------------+------------------------------+
# |3|"embeddings"| FloatVector|     dim=8        |  "float vector with dim 8"   |
# +-+------------+------------+------------------+------------------------------+
COLLECTION_NAME = 'demo_collection'
schema = MilvusClient.create_schema(
    auto_id=False,
    enable_dynamic_field=True,
)
schema.add_field(field_name="pk", datatype=DataType.VARCHAR,
                 is_primary=True, auto_id=False, max_length=100)
schema.add_field(field_name="random", datatype=DataType.DOUBLE)
schema.add_field(field_name="embeddings",
                 datatype=DataType.FLOAT_VECTOR, dim=dim)

if client.has_collection(COLLECTION_NAME):
    client.drop_collection(COLLECTION_NAME)
res = client.create_collection(
    collection_name=COLLECTION_NAME,
    schema=schema
)
res

In [8]:
# ---------------- 3. Insert data
# We are going to insert 3000 rows of data into 'demo_collection'.add()
# Data to be inserted must be organized in fields.
# The insert() returns:
# - either automatically generated primary keys if auto_id = True in the schema
# - or user-defined primary keys if auto_id = False in the schema
rng = np.random.default_rng(seed=2025)
entities = [
   # provide the pk field because `auto_id` is set to False
    [str(i) for i in range(num_entities)],
    rng.random(num_entities).tolist(),  # field random, only supports list
    rng.random((num_entities, dim)),    # field embeddings, supports numpy.ndarray and list
]
data = [
    {"pk": x, "random"  : y, "embeddings": z} 
    for x, y, z in zip(*entities)
]

res = client.insert(data = data, collection_name=COLLECTION_NAME)
res['insert_count']

3000

In [9]:
# flush and check the number of entities
client.flush(collection_name=COLLECTION_NAME)
client.get_collection_stats(COLLECTION_NAME)

{'row_count': 3000}

In [10]:
# ---------------- 4. create index 
# We are going to create an IVF_FLAT index for this collection.
# create_index() can only be applied to `FloatVector` and `BinaryVector` fields.
# details of index can be found in https://milvus.io/docs/index-vector-fields.md

index_params = MilvusClient.prepare_index_params()
index_params.add_index(
    field_name="embeddings",
    metric_type="L2",
    index_type="IVF_FLAT",
    index_name="vector_index",
    params={"nlist": 128}
)

client.create_index(
    collection_name=COLLECTION_NAME,
    index_params = index_params,
    sync = False # Whether to wait for index creation to complete before returning. Defaults to True.
)

In [11]:
# --------------------- 5. search, query, and hybrid search
# After data were inserted into Milvus and indexed, you can perform:
# - search based on vector similarity
# - query based on scalar filtering(boolean, int, etc.)
# - hybrid search based on vector similarity and scalar filtering.

# Before conducting a search or a query, you need to load the data into memory.
# Milvus loads the index files and the raw data of all fields into memory for rapid response to searches and queries.
# Entities inserted after a collection load are automatically indexed and loaded.

client.load_collection(collection_name=COLLECTION_NAME)
res = client.get_load_state(collection_name=COLLECTION_NAME)
print(fmt.format("Load collection"))
print(res)

print(fmt.format("Start searching based on vector similarity"))
vectors_to_search = entities[-1][-2:]
print(vectors_to_search)

search_params = {
    "metric_type": "L2",
    "params": {"nprobe": 10},
}
start_time = time.time()

# refer to the doc https://milvus.io/docs/single-vector-search.md
res = client.search(
    collection_name=COLLECTION_NAME,
    data=vectors_to_search,
    limit=4,
    search_params=search_params,
    output_fields=["random", "pk"]
)
end_time = time.time()

for hits in res:
    for hit in hits:
        entity = hit['entity']
        print(f"hit distance: {hit['distance']}, pk field: {entity.get('pk')} random field: {entity.get('random')}")
print(search_latency_fmt.format(end_time - start_time))



=== Load collection                ===

{'state': <LoadState: Loaded>}

=== Start searching based on vector similarity ===

[[0.2656976  0.171993   0.91092544 0.14604513 0.25858877 0.95456009
  0.5787366  0.91869466]
 [0.84256713 0.74070046 0.85305628 0.41972725 0.29071471 0.72756499
  0.56489305 0.57036048]]
hit distance: 0.0, pk field: 2998 random field: 0.33275249632421944
hit distance: 0.14918643236160278, pk field: 657 random field: 0.5011751600348467
hit distance: 0.15586762130260468, pk field: 1236 random field: 0.9941086782691402
hit distance: 0.16213689744472504, pk field: 2705 random field: 0.429570630790552
hit distance: 0.0, pk field: 2999 random field: 0.6066943264600407
hit distance: 0.06192314624786377, pk field: 1277 random field: 0.14939943687845592
hit distance: 0.1049613207578659, pk field: 1544 random field: 0.4428134410010268
hit distance: 0.1197763979434967, pk field: 598 random field: 0.46292542343899024
search latency = 2.0846s


In [12]:
# query based on scalar filtering (boolean, int, etc.)
# expr details can be found in https://milvus.io/docs/boolean.md
start_time = time.time()
res = client.query(
    collection_name=COLLECTION_NAME,
    filter=f"random > 0.8",
    output_fields=["random", "pk"],
)
end_time = time.time()
print(f"query result:\n{res[0]}")
print(search_latency_fmt.format(end_time - start_time))

query result:
{'random': 0.9944578051677608, 'pk': '0'}
search latency = 0.0118s


In [13]:
# pagination
r1 = client.query(
    collection_name=COLLECTION_NAME,
    filter=f"random > 0.8",
    output_fields=["random", "pk"],
    limit=4,
)

r2 = client.query(
    collection_name=COLLECTION_NAME,
    filter=f"random > 0.8",
    output_fields=["random", "pk"],
    limit=3,
    offset=1
)
print(f"query pagination(limit=4):\n\t{r1}")
print(f"query pagination(offset=1, limit=3):\n\t{r2}")

query pagination(limit=4):
	data: ["{'random': 0.9944578051677608, 'pk': '0'}", "{'random': 0.9032464289430996, 'pk': '1000'}", "{'random': 0.9653262466962503, 'pk': '1002'}", "{'random': 0.9492585570925737, 'pk': '1007'}"] 
query pagination(offset=1, limit=3):
	data: ["{'pk': '1000', 'random': 0.9032464289430996}", "{'pk': '1002', 'random': 0.9653262466962503}", "{'pk': '1007', 'random': 0.9492585570925737}"] 


In [14]:
# hybrid search 
start_time = time.time()
res = client.search(
    collection_name=COLLECTION_NAME,
    data=vectors_to_search,
    filter=f"random > 0.8",
    limit=4,
    search_params=search_params,
    output_fields=["random", "pk"]
)
end_time = time.time()

for hits in res:
    for hit in hits:
        entity = hit['entity']
        print(f"hit distance: {hit['distance']}, pk field: {entity.get('pk')} random field: {entity.get('random')}")
print(search_latency_fmt.format(end_time - start_time))

hit distance: 0.15586762130260468, pk field: 1236 random field: 0.9941086782691402
hit distance: 0.1722332090139389, pk field: 796 random field: 0.8013339188414864
hit distance: 0.19888439774513245, pk field: 137 random field: 0.9970659791038791
hit distance: 0.2118156999349594, pk field: 1520 random field: 0.8977844898616876
hit distance: 0.17695319652557373, pk field: 659 random field: 0.9506619402274943
hit distance: 0.18178263306617737, pk field: 58 random field: 0.9310743903641379
hit distance: 0.19831611216068268, pk field: 2453 random field: 0.8160436876255385
hit distance: 0.22429069876670837, pk field: 2938 random field: 0.9848352771777965
search latency = 0.0037s


In [15]:
# ---------------- 6. delete entities by PK
# You can delete entities by their PK values using boolean expressions.
filtering = f'pk in["{3}","{1000}"]'
res = client.query(
    collection_name=COLLECTION_NAME,
    filter=filtering,
    output_fields=["pk", "random"],
)
print(f"query before delete by expr=`{filtering}` -> result: \n-{res[0]}\n-{res[1]}\n")

res = client.delete(
    collection_name=COLLECTION_NAME,
    filter = filtering
)

print(res)

res = client.query(
    collection_name=COLLECTION_NAME,
    filter=filtering,
    output_fields=["pk", "random"],
)
print(f"query after delete by expr=`{filtering}` -> result: {res}\n")

query before delete by expr=`pk in["3","1000"]` -> result: 
-{'random': 0.9032464289430996, 'pk': '1000'}
-{'random': 0.8372552761899896, 'pk': '3'}

{'delete_count': 2}
query after delete by expr=`pk in["3","1000"]` -> result: data: ["{'pk': '1000', 'random': 0.9032464289430996}", "{'pk': '3', 'random': 0.8372552761899896}"] 



In [16]:
# ---------------- 7. drop collection
client.drop_collection(COLLECTION_NAME)
client.close()