In [1]:
# refer to 
from pymilvus import MilvusClient
from pymilvus import model

In [2]:
HOST = '10.10.10.247'
PORT = 19530
DB_NAME = 'testdb'
URL=HOST+':'+str(PORT)
# if you deployed the standalone milvus, and connect to the database server
# if you deployed in your local machine, use "http://localhost:19530"
client = MilvusClient("http://10.10.10.247:19530")

In [3]:
# create a local milvus vector database
# DB_FILE = './milvus_demo.db'
# client = MilvusClient(DB_FILE)
client.list_databases()

['default', 'testdb']

In [4]:
# create the database and shift to the database
# otherwise, the default database "default" will be used
if DB_NAME not in client.list_databases():
    client.create_database(DB_NAME)
client.using_database(DB_NAME)
client.list_databases()

['default', 'testdb']

In [5]:
# we use collection to store vectors and their associate metadata
# when create a collection, we can define schema and index params to configure vector spaces such as dimensionality, index types and distant metrics
# the details parameters can be found in https://milvus.io/api-reference/pymilvus/v2.5.x/MilvusClient/Collections/create_collection.md

# in below setup, only basic function and parameters are focused.
# the primary key and vector fields use their default names ("id" and "vector")
# the primary key fields accepsts integer type and vector fields accepts float type
# the metric type (vector distance definition) is set to its default value ("COSINE")

COLLECTION_NAME = 'simple_demo_collection'
if client.has_collection(COLLECTION_NAME):
    client.drop_collection(COLLECTION_NAME)
client.create_collection(
    collection_name=COLLECTION_NAME,
    dimension=768,
)

In [6]:
docs = [
    "Artificial intelligence was founded as an academic discipline in 1956.",
    "Alan Turing was the first person to conduct substantial research in AI.",
    "Born in Maida Vale, London, Turing was raised in southern England.",
]

In [7]:
# generate vectors for the documents
# pre-requisite is to download pymilvus[model] first
embedding_fn = model.DefaultEmbeddingFunction()
vectors = embedding_fn.encode_documents(docs)
print("Dim:", embedding_fn.dim, vectors[0].shape)

# Each record is named "entity"m entity has id, vector representation, raw text, and a subject label that we use to demo meta data.
# the usage is similar to the NoSQL database like mongodb.
data = [
    {"id":i, "vector": vectors[i], "text": docs[i], "subject":"history"}
    for i in range(len(docs))
]

print("Data has", len(data), "entities, each with fields: ", data[0].keys())
print("Vector dim:", len(data[0]["vector"]))

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


Dim: 768 (768,)
Data has 3 entities, each with fields:  dict_keys(['id', 'vector', 'text', 'subject'])
Vector dim: 768


In [8]:
# if you couldn't downlaod the model module. 
# use random vectors to represent th text and finish the example.
import random
vectors = [[random.uniform(-1, 1) for _ in range(768)] for _ in docs]
data = [
    {"id": i, "vector": vectors[i], "text": docs[i], "subject": "history"}
    for i in range(len(vectors))
]

print("Data has", len(data), "entities, each with fields: ", data[0].keys())
print("Vector dim:", len(data[0]["vector"]))

Data has 3 entities, each with fields:  dict_keys(['id', 'vector', 'text', 'subject'])
Vector dim: 768


In [9]:
# insert into database
res = client.insert(collection_name=COLLECTION_NAME, data = data)
print(res)

{'insert_count': 3, 'ids': [0, 1, 2]}


In [10]:
# ---------------- simply semantic vector search
query_vectors = embedding_fn.encode_queries(["Who is Alan Turing ?"])
res = client.search(
    collection_name=COLLECTION_NAME,
    data = query_vectors,
    limit=2,
    output_fields=["text", "subject"],
)
res

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


data: ["[{'id': 0, 'distance': 0.0020332501735538244, 'entity': {'text': 'Artificial intelligence was founded as an academic discipline in 1956.', 'subject': 'history'}}, {'id': 2, 'distance': -0.0006970130489207804, 'entity': {'text': 'Born in Maida Vale, London, Turing was raised in southern England.', 'subject': 'history'}}]"] 

In [11]:
# -------------- vector search with Metadata filtering

# conduct the vector search while considering the values of the metadata.


# insert more docs in another subject
docs = [
    "Machine learning has been used for drug design.",
    "Computational synthesis with AI algorithms predicts molecular properties.",
    "DDR1 is involved in cancers and fibrosis.",
]
vectors = embedding_fn.encode_documents(docs)
data = [
    {"id": 3 + i, "vector": vectors[i], "text": docs[i], "subject": "biology"}
    for i in range(len(vectors))
]

res = client.insert(collection_name=COLLECTION_NAME, data=data)
res

{'insert_count': 3, 'ids': [3, 4, 5], 'cost': 0}

In [12]:
# semantic search with metadata filtering
# filter expression details can be found https://milvus.io/docs/boolean.md
res = client.search(
    collection_name=COLLECTION_NAME,
    data = embedding_fn.encode_queries(["tell me AI related information"]),
    filter="subject == 'biology'",
    limit=2,
    output_fields=["text", "subject"],
)
res

data: ['[]'] 

In [13]:
# retrieve all entities whose scalar field has a particular value

res = client.query(
    collection_name=COLLECTION_NAME,
    filter="subject == 'history'",
    output_fields=["text", "subject"],
)
res

data: ["{'text': 'Artificial intelligence was founded as an academic discipline in 1956.', 'subject': 'history', 'id': 0}", "{'text': 'Alan Turing was the first person to conduct substantial research in AI.', 'subject': 'history', 'id': 1}", "{'text': 'Born in Maida Vale, London, Turing was raised in southern England.', 'subject': 'history', 'id': 2}"] 

In [14]:
# retrieve entity by primary key "id"
res = client.query(
    collection_name=COLLECTION_NAME,
    ids = [0,2],
    output_fields=["text", "subject"],
)
res

data: ["{'text': 'Artificial intelligence was founded as an academic discipline in 1956.', 'subject': 'history', 'id': 0}", "{'text': 'Born in Maida Vale, London, Turing was raised in southern England.', 'subject': 'history', 'id': 2}"] 

In [15]:
# ----------------- delete entities

# delete by primary key "id"
res = client.delete(
    collection_name=COLLECTION_NAME,
    ids = [0,2],
)
res

{'delete_count': 2, 'cost': 0}

In [16]:
# delete entities by the filter expression
res = client.delete(
    collection_name=COLLECTION_NAME,
    filter="subject == 'history'",
)
res

{'delete_count': 3, 'cost': 0}

In [17]:
# check all entities
res = client.query(
    collection_name=COLLECTION_NAME,
    limit=10,
    output_fields=["text", "subject"],
)
res

data: ["{'text': 'Machine learning has been used for drug design.', 'subject': 'biology', 'id': 3}", "{'text': 'Computational synthesis with AI algorithms predicts molecular properties.', 'subject': 'biology', 'id': 4}", "{'text': 'DDR1 is involved in cancers and fibrosis.', 'subject': 'biology', 'id': 5}"] 