In [None]:
# https://milvus.io/docs/example_code.md
# https://www.youtube.com/watch?v=kh1CSlQF788&list=PLPg7_faNDlT5Fb8WN8r1PzzQTNzdechnS&index=5
# YouTube Playlist https://www.youtube.com/playlist?list=PLPg7_faNDlT5Fb8WN8r1PzzQTNzdechnS

# Run this command in a terminal:
#sudo pip install pymilvus


In [1]:
import random
import numpy as np

In [2]:
from pprint import pprint

In [3]:
from milvus import Milvus, IndexType, MetricType, Status

ImportError: cannot import name 'Milvus' from 'milvus' (/Users/binod/Documents/software/anaconda/anaconda3/lib/python3.7/site-packages/milvus/__init__.py)

In [4]:
#Imports a PyMilvus package:
from pymilvus import (
    connections,
    utility,
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
)

In [5]:
# Register the "default" connection alias against the server at localhost:19530.
connections.connect(alias="default", host="localhost", port="19530")

In [6]:
# Create the introductory collection: an explicit INT64 primary key,
# one DOUBLE scalar field and an 8-dimensional float vector field.
fields = [
    FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=False),
    FieldSchema(name="random", dtype=DataType.DOUBLE),
    FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=8),
]
schema = CollectionSchema(
    fields=fields,
    description="hello_milvus is the simplest demo to introduce the APIs",
)
hello_milvus = Collection(name="hello_milvus", schema=schema)

In [7]:
# Insert 3000 rows, supplied as one inner list per schema field (pk, random, embeddings).
import random

num_rows = 3000
entities = [
    list(range(num_rows)),  # field pk
    [float(random.randrange(-20, -10)) for _ in range(num_rows)],  # field random
    [[random.random() for _ in range(8)] for _ in range(num_rows)],  # field embeddings
]
insert_result = hello_milvus.insert(data=entities)
# Flush once the last entity has been inserted so no growing segments stay in memory.
hello_milvus.flush()

In [8]:
# Build an IVF_FLAT index (L2 metric, nlist=128) on the "embeddings" vector field.
index = dict(
    index_type="IVF_FLAT",
    metric_type="L2",
    params=dict(nlist=128),
)
hello_milvus.create_index(field_name="embeddings", index_params=index)


Status(code=0, message=)

In [9]:
# Load the collection into memory, then run a similarity search that uses the
# last two inserted embeddings as the query vectors.
hello_milvus.load()
vectors_to_search = entities[-1][-2:]
search_params = dict(metric_type="L2", params=dict(nprobe=10))
result = hello_milvus.search(
    data=vectors_to_search,
    anns_field="embeddings",
    param=search_params,
    limit=3,
    output_fields=["random"],
)


In [11]:
# Query by scalar filter: fetch the entities whose "random" value is above -14.
result = hello_milvus.query(
    expr="random > -14",
    output_fields=["random", "embeddings"],
)

In [12]:
# Hybrid search: the same vector search as before, restricted by a boolean
# expression on the scalar "random" field.
result = hello_milvus.search(
    data=vectors_to_search,
    anns_field="embeddings",
    param=search_params,
    limit=3,
    expr="random > -12",
    output_fields=["random"],
)


In [None]:
# Deletes entities by their primary keys.
# entities[0] is the whole pk column and entities[1] the whole "random" column, so
# interpolating them directly would embed two complete Python lists in the
# expression. Index into the pk column to name the first two primary keys instead.
expr = f"pk in [{entities[0][0]}, {entities[0][1]}]"
hello_milvus.delete(expr)


In [13]:
# Remove the introductory collection now that the walkthrough is done with it.
utility.drop_collection(collection_name="hello_milvus")


In [29]:
# Create the "demo" collection: INT64 primary key, a VARCHAR word (max 50 chars)
# and a 100-dimensional float vector.
fields = [
    FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=False),
    FieldSchema(name="words", dtype=DataType.VARCHAR, max_length=50),
    FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=100),
]
schema = CollectionSchema(
    fields=fields,
    description="Simple Demo for storing and retrieve embedding and matching",
)
demo = Collection(name="demo", schema=schema)

In [32]:
# Build an IVF_FLAT index (L2 metric, nlist=128) on demo's "embeddings" field.
index = dict(
    index_type="IVF_FLAT",
    metric_type="L2",
    params=dict(nlist=128),
)
demo.create_index(field_name="embeddings", index_params=index)



Status(code=0, message=)

In [28]:
# Drops the collection:
# NOTE(review): this cell was executed as In [28], i.e. before the cell that creates
# "demo" (In [29]). In its current position a top-to-bottom run drops "demo" right
# before the later cells that insert into and load it - confirm the intended position.
utility.drop_collection("demo")



In [15]:
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

# Define the example sentences
sentences = [
    "The puppy loyal to its owner and loved to chase its tail",
    "The dog was loyal and protective of its owner, has lovely tail"
]




In [16]:
# Tokenise each sentence: lower-case it, then split on whitespace.
word_lists = list(map(lambda text: text.lower().split(), sentences))

In [17]:
# Train a Word2Vec model on the tokenised example sentences (min_count=1 is passed
# explicitly; the other training parameters are left at gensim's defaults).
model = Word2Vec(word_lists, min_count=1)

# Get the embeddings for "puppy" and "dog"
embedding_1 = model.wv['puppy']
embedding_2 = model.wv['dog']

# Print the embeddings
print("Embedding for 'puppy': ", embedding_1)
print("Embedding for 'dog': ", embedding_2)




Embedding for 'puppy':  [ 8.1693530e-03 -4.4424967e-03  8.9852214e-03  8.2536135e-03
 -4.4353902e-03  3.0241191e-04  4.2745322e-03 -3.9266776e-03
 -5.5602933e-03 -6.5135299e-03 -6.6985562e-04 -2.9506299e-04
  4.4629988e-03 -2.4744093e-03 -1.7139514e-04  2.4623820e-03
  4.8689600e-03 -3.1117946e-05 -6.3400054e-03 -9.2621362e-03
  2.6943893e-05  6.6615203e-03  1.4670712e-03 -8.9669479e-03
 -7.9381913e-03  6.5520909e-03 -3.7867506e-03  6.2539754e-03
 -6.6802753e-03  8.4803579e-03 -6.5157204e-03  3.2886867e-03
 -1.0567167e-03 -6.7876372e-03 -3.2876898e-03 -1.1616098e-03
 -5.4721525e-03 -1.2111445e-03 -7.5635971e-03  2.6450334e-03
  9.0694204e-03 -2.3773636e-03 -9.7476749e-04  3.5129883e-03
  8.6645139e-03 -5.9219110e-03 -6.8874871e-03 -2.9331418e-03
  9.1473395e-03  8.6730585e-04 -8.6782174e-03 -1.4478377e-03
  9.4779180e-03 -7.5491238e-03 -5.3589093e-03  9.3162311e-03
 -8.9734495e-03  3.8262166e-03  6.6430384e-04  6.6604549e-03
  8.3128037e-03 -2.8500976e-03 -3.9914288e-03  8.8984985e-03


In [18]:
embedding_1.shape

(100,)

In [36]:
# Two rows for the "demo" collection, supplied as one inner list per schema field.
entities = [
    [10, 20],  # field pk
    ["puppy", "dog"],  # field words
    [embedding_1, embedding_2],  # field embeddings
]
insert_result = demo.insert(data=entities)
# Flush once the last entity has been inserted so no growing segments stay in memory.
demo.flush()

In [37]:
insert_result

(insert count: 2, delete count: 0, upsert count: 0, timestamp: 442736533841641476, success count: 2, err count: 0)

In [38]:
# Loads the collection to memory so that it can be searched.
demo.load()



In [39]:
entities

[[10, 20],
 ['puppy', 'dog'],
 [array([ 8.1693530e-03, -4.4424967e-03,  8.9852214e-03,  8.2536135e-03,
         -4.4353902e-03,  3.0241191e-04,  4.2745322e-03, -3.9266776e-03,
         -5.5602933e-03, -6.5135299e-03, -6.6985562e-04, -2.9506299e-04,
          4.4629988e-03, -2.4744093e-03, -1.7139514e-04,  2.4623820e-03,
          4.8689600e-03, -3.1117946e-05, -6.3400054e-03, -9.2621362e-03,
          2.6943893e-05,  6.6615203e-03,  1.4670712e-03, -8.9669479e-03,
         -7.9381913e-03,  6.5520909e-03, -3.7867506e-03,  6.2539754e-03,
         -6.6802753e-03,  8.4803579e-03, -6.5157204e-03,  3.2886867e-03,
         -1.0567167e-03, -6.7876372e-03, -3.2876898e-03, -1.1616098e-03,
         -5.4721525e-03, -1.2111445e-03, -7.5635971e-03,  2.6450334e-03,
          9.0694204e-03, -2.3773636e-03, -9.7476749e-04,  3.5129883e-03,
          8.6645139e-03, -5.9219110e-03, -6.8874871e-03, -2.9331418e-03,
          9.1473395e-03,  8.6730585e-04, -8.6782174e-03, -1.4478377e-03,
          9.4779180e-

In [None]:
# Similarity search on "demo", using the two inserted embeddings as query vectors.
vectors_to_search = entities[-1][-2:]
search_params = dict(metric_type="L2", params=dict(nprobe=10))
result = demo.search(
    data=vectors_to_search,
    anns_field="embeddings",
    param=search_params,
    limit=3,
    output_fields=["words"],
)



In [40]:
# Create "demo_2" with the same layout as "demo": INT64 primary key, a VARCHAR word
# (max 50 chars) and a 100-dimensional float vector.
fields = [
    FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=False),
    FieldSchema(name="words", dtype=DataType.VARCHAR, max_length=50),
    FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=100),
]
schema = CollectionSchema(
    fields=fields,
    description="Simple Demo for storing and retrieve embedding and matching",
)
demo_2 = Collection(name="demo_2", schema=schema)

In [41]:
# Insert the two word embeddings into "demo_2", one inner list per schema field.
data = [
    [1, 2],  # field pk
    ["puppy", "dog"],  # field words
    [embedding_1, embedding_2],  # field embeddings
]
insert_result = demo_2.insert(data=data)
# Flush once the last entity has been inserted so no growing segments stay in memory.
demo_2.flush()

In [43]:
# Build an IVF_FLAT index (L2 metric, nlist=128) on demo_2's "embeddings" field.
index = dict(
    index_type="IVF_FLAT",
    metric_type="L2",
    params=dict(nlist=128),
)
demo_2.create_index(field_name="embeddings", index_params=index)




Status(code=0, message=)

In [44]:
# Loads the collection to memory.
# All search and query operations within Milvus are executed in memory. 
# Load the collection to memory before conducting a vector similarity search.
demo_2.load()



In [45]:
data[-1]

[array([ 8.1693530e-03, -4.4424967e-03,  8.9852214e-03,  8.2536135e-03,
        -4.4353902e-03,  3.0241191e-04,  4.2745322e-03, -3.9266776e-03,
        -5.5602933e-03, -6.5135299e-03, -6.6985562e-04, -2.9506299e-04,
         4.4629988e-03, -2.4744093e-03, -1.7139514e-04,  2.4623820e-03,
         4.8689600e-03, -3.1117946e-05, -6.3400054e-03, -9.2621362e-03,
         2.6943893e-05,  6.6615203e-03,  1.4670712e-03, -8.9669479e-03,
        -7.9381913e-03,  6.5520909e-03, -3.7867506e-03,  6.2539754e-03,
        -6.6802753e-03,  8.4803579e-03, -6.5157204e-03,  3.2886867e-03,
        -1.0567167e-03, -6.7876372e-03, -3.2876898e-03, -1.1616098e-03,
        -5.4721525e-03, -1.2111445e-03, -7.5635971e-03,  2.6450334e-03,
         9.0694204e-03, -2.3773636e-03, -9.7476749e-04,  3.5129883e-03,
         8.6645139e-03, -5.9219110e-03, -6.8874871e-03, -2.9331418e-03,
         9.1473395e-03,  8.6730585e-04, -8.6782174e-03, -1.4478377e-03,
         9.4779180e-03, -7.5491238e-03, -5.3589093e-03,  9.31623

In [46]:
import numpy as np


# Negative indices count from the end of the array; a negative slice start
# keeps the trailing elements.
arr = np.array([1, 2, 3, 4])

for tail_view in (arr[-1], arr[-2], arr[-2:]):
    print(tail_view)

4
3
[3 4]


In [49]:
data

[[1, 2],
 ['puppy', 'dog'],
 [array([ 8.1693530e-03, -4.4424967e-03,  8.9852214e-03,  8.2536135e-03,
         -4.4353902e-03,  3.0241191e-04,  4.2745322e-03, -3.9266776e-03,
         -5.5602933e-03, -6.5135299e-03, -6.6985562e-04, -2.9506299e-04,
          4.4629988e-03, -2.4744093e-03, -1.7139514e-04,  2.4623820e-03,
          4.8689600e-03, -3.1117946e-05, -6.3400054e-03, -9.2621362e-03,
          2.6943893e-05,  6.6615203e-03,  1.4670712e-03, -8.9669479e-03,
         -7.9381913e-03,  6.5520909e-03, -3.7867506e-03,  6.2539754e-03,
         -6.6802753e-03,  8.4803579e-03, -6.5157204e-03,  3.2886867e-03,
         -1.0567167e-03, -6.7876372e-03, -3.2876898e-03, -1.1616098e-03,
         -5.4721525e-03, -1.2111445e-03, -7.5635971e-03,  2.6450334e-03,
          9.0694204e-03, -2.3773636e-03, -9.7476749e-04,  3.5129883e-03,
          8.6645139e-03, -5.9219110e-03, -6.8874871e-03, -2.9331418e-03,
          9.1473395e-03,  8.6730585e-04, -8.6782174e-03, -1.4478377e-03,
          9.4779180e-03

In [58]:
len(data)

3

In [59]:
len(data[0])

2

In [60]:
vectors_to_search = data[-1][-2:]
vectors_to_search

[array([ 8.1693530e-03, -4.4424967e-03,  8.9852214e-03,  8.2536135e-03,
        -4.4353902e-03,  3.0241191e-04,  4.2745322e-03, -3.9266776e-03,
        -5.5602933e-03, -6.5135299e-03, -6.6985562e-04, -2.9506299e-04,
         4.4629988e-03, -2.4744093e-03, -1.7139514e-04,  2.4623820e-03,
         4.8689600e-03, -3.1117946e-05, -6.3400054e-03, -9.2621362e-03,
         2.6943893e-05,  6.6615203e-03,  1.4670712e-03, -8.9669479e-03,
        -7.9381913e-03,  6.5520909e-03, -3.7867506e-03,  6.2539754e-03,
        -6.6802753e-03,  8.4803579e-03, -6.5157204e-03,  3.2886867e-03,
        -1.0567167e-03, -6.7876372e-03, -3.2876898e-03, -1.1616098e-03,
        -5.4721525e-03, -1.2111445e-03, -7.5635971e-03,  2.6450334e-03,
         9.0694204e-03, -2.3773636e-03, -9.7476749e-04,  3.5129883e-03,
         8.6645139e-03, -5.9219110e-03, -6.8874871e-03, -2.9331418e-03,
         9.1473395e-03,  8.6730585e-04, -8.6782174e-03, -1.4478377e-03,
         9.4779180e-03, -7.5491238e-03, -5.3589093e-03,  9.31623

# performs a vector similarity search:
Euclidean distance (L2) <br>
vectors_to_search -> Vectors to search with. <br>
embeddings -> Name of the field to search on. <br>
search_params -> metric_type -> the metric used to measure the similarity of vectors. <br>
                 params -> nprobe indicates the number of cluster units to search. <br>
                This parameter is available only when index_type is set to IVF_FLAT, IVF_SQ8, or IVF_PQ.<br>
<br>
limit -> Number of the most similar results to return.<br>
output_fields -> Name of the field to return. Vector field is not supported in current release.<br>





In [61]:
# Search "demo_2" with both stored embeddings as query vectors and return the
# matching "words" values.
vectors_to_search = data[-1][-2:]
search_params = dict(metric_type="L2", params=dict(nprobe=10))
result = demo_2.search(
    data=vectors_to_search,
    anns_field="embeddings",
    param=search_params,
    limit=3,
    output_fields=["words"],
)
result


<pymilvus.orm.search.SearchResult at 0x7ff6fcdb7e90>

In [68]:
# get the IDs of all returned hits
result[0].ids

[1, 2]

In [69]:
# get the distances to the query vector from all returned hits
result[0].distances

[0.0, 0.005495465360581875]

In [70]:
# get the value of an output field specified in the search request.
hit = result[0][0]
hit.entity.get('words')

'puppy'

In [66]:
result[0].distances

[0.0, 0.005495465360581875]

In [71]:
from sklearn.metrics.pairwise import cosine_similarity

In [72]:
# Cosine similarity between the two word vectors. Each vector is reshaped to a
# single-row 2-D array before the call, and the one resulting value is picked out.
similarity_matrix = cosine_similarity(embedding_1.reshape(1, -1), embedding_2.reshape(1, -1))
similarity = similarity_matrix[0][0]

print("Cosine similarity : ", similarity)

Cosine similarity :  0.11131189


In [None]:
# Performs a vector query (scalar filtering) on demo_2.
# demo_2's schema only defines pk / words / embeddings; the "random" field used
# here originally belongs to the hello_milvus collection, so filter on the
# primary key and return fields that exist in this collection.
result = demo_2.query(expr="pk > 0", output_fields=["words", "embeddings"])

In [None]:
# Release the collection loaded in Milvus to reduce memory consumption when the search is completed.
demo_2.release()

In [85]:
# Create "demo_3": INT64 primary key, a VARCHAR word (max 50 chars) and a
# 300-dimensional float vector.
fields = [
    FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=False),
    FieldSchema(name="words", dtype=DataType.VARCHAR, max_length=50),
    FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=300),
]
schema = CollectionSchema(
    fields=fields,
    description="Simple Demo for storing and retrieve embedding from Google",
)
demo_3 = Collection(name="demo_3", schema=schema)

In [76]:
import gensim.downloader as api

# Load a pre-trained Word2Vec model using the Gensim downloader
model = api.load("word2vec-google-news-300")

In [79]:
dog = model['dog']
dog.shape

(300,)

In [80]:
puppy = model['puppy']

In [87]:
king = model['king']
queen = model['queen']
book = model['book']
lion = model['lion']
jungle = model['jungle']
tiger = model['tiger']
study = model['study']

In [88]:
# Insert nine word embeddings into "demo_3", one inner list per schema field.
data = [
    [1, 2, 3, 4, 5, 6, 7, 8, 9],  # field pk
    ["puppy", "dog", "king", "queen", "book", "lion", "jungle", "tiger", "study"],  # field words
    [puppy, dog, king, queen, book, lion, jungle, tiger, study],  # field embeddings
]
insert_result = demo_3.insert(data=data)
# Flush once the last entity has been inserted so no growing segments stay in memory.
demo_3.flush()

In [89]:
# Build an IVF_FLAT index (L2 metric, nlist=128) on demo_3's "embeddings" field.
index = dict(
    index_type="IVF_FLAT",
    metric_type="L2",
    params=dict(nlist=128),
)
demo_3.create_index(field_name="embeddings", index_params=index)




Status(code=0, message=)

In [90]:
demo_3.load()

In [91]:
# Similarity search on "demo_3": the three stored words closest to 'dog'.
data = [dog]
search_params = dict(metric_type="L2", params=dict(nprobe=10))
result = demo_3.search(
    data=data,
    anns_field="embeddings",
    param=search_params,
    limit=3,
    output_fields=["words"],
)




In [92]:
result[0].ids

[2, 1, 6]

vector similarity search: <br><br>
A vector similarity search in Milvus calculates the distance between query vector(s) and vectors in the <br>collection with specified similarity metrics, and returns the most similar results. <br>
https://milvus.io/docs/search.md

In [100]:
# Same search as before, widened to the five stored words closest to 'dog'.
data = [dog]
search_params = dict(metric_type="L2", params=dict(nprobe=10))
result = demo_3.search(
    data=data,
    anns_field="embeddings",
    param=search_params,
    limit=5,
    output_fields=["words"],
)





In [101]:
result[0].ids

[2, 1, 6, 8, 5]

In [102]:
result[0].distances

[0.0, 3.7864410877227783, 11.244926452636719, 11.9305419921875, 13.925182342529297]

In [108]:
# Print the "words" output field of every hit returned for the first query vector.
# Iterating the hits directly (rather than indexing 0..4) keeps the cell working
# when the search returns fewer than five results.
for hit in result[0]:
    print(hit.entity.get('words'))
    

dog
puppy
lion
tiger
book


Performs a vector query:<br><br>
Unlike a vector similarity search, a vector query retrieves vectors via scalar filtering based on boolean <br> expression. Milvus supports many data types in the scalar fields and a variety of boolean expressions. The <br>boolean expression filters on scalar fields or the primary key field, and it retrieves all results that <br>match the filters. <br>
https://milvus.io/docs/query.md <br>

For example<br>
<br>
res = collection.query( <br>
  expr = "book_id in [2,4,6,8]", <br>
  offset = 0, <br>
  limit = 10, <br>
  output_fields = ["book_id", "book_intro"], <br>
  consistency_level="Strong"<br>
)<br>

In [116]:
# Scalar-filtered query: every entity whose primary key is greater than 5.
result = demo_3.query(
    expr="pk > 5",
    output_fields=["pk", "words", "embeddings"],
)

In [117]:
result[0]

{'words': 'lion',
 'embeddings': [0.21289062,
  -0.0045776367,
  -0.23632812,
  0.045654297,
  0.13476562,
  -0.13671875,
  -0.014099121,
  -0.122558594,
  0.038085938,
  -0.0021209717,
  -0.15722656,
  0.00592041,
  -0.1875,
  -0.09863281,
  -0.46484375,
  -0.032714844,
  -0.25195312,
  0.015014648,
  -0.23730469,
  -0.10546875,
  -0.064453125,
  -0.21972656,
  0.38085938,
  -0.067871094,
  0.16796875,
  0.13183594,
  -0.36523438,
  0.09082031,
  0.19433594,
  0.0021209717,
  -0.125,
  -0.10107422,
  -0.020507812,
  0.1796875,
  -0.06298828,
  -0.100097656,
  -0.3828125,
  0.2734375,
  0.18945312,
  0.26171875,
  0.21484375,
  -0.203125,
  0.15625,
  0.21582031,
  0.106933594,
  -0.03125,
  0.24121094,
  -0.012023926,
  0.32421875,
  -0.18652344,
  -0.111328125,
  0.037109375,
  0.051757812,
  -0.16796875,
  -0.05444336,
  -0.0028839111,
  -0.029663086,
  -0.12011719,
  0.3671875,
  -0.029174805,
  -0.09863281,
  0.016113281,
  -0.091308594,
  -0.22558594,
  0.16699219,
  -0.18066406,

In [120]:
result[0]["pk"]

6

In [129]:
len(result)

4

In [130]:
# Print the primary key and word of every row returned by the query.
# Iterates the rows directly and avoids rebinding the builtin `id` at notebook scope.
for row in result:
    print(row["pk"], row["words"])

6 lion
7 jungle
8 tiger
9 study


In [133]:
# Sort the query rows alphabetically by their "words" value and print pk + word.
# Iterates the rows directly and avoids rebinding the builtin `id` at notebook scope.
sorted_res = sorted(result, key=lambda row: row['words'])
for row in sorted_res:
    print(row["pk"], row["words"])

7 jungle
6 lion
9 study
8 tiger


Hybrid Search <br>
By specifying the boolean expression, you can filter the scalar field of the entities during the vector search. <br>

In [None]:
# Performs a hybrid search: a vector similarity search restricted by a boolean
# expression on a scalar field.
# The original call targeted hello_milvus, which an earlier cell drops and whose
# "random" field and 8-dim vectors do not match the vectors in scope here. Run the
# search against demo_3 instead: nearest neighbours of 'dog' among rows with pk > 5.
result = demo_3.search([dog], "embeddings", search_params, limit=3, expr="pk > 5", output_fields=["words"])



In [134]:
demo_3.schema

{'auto_id': False, 'description': 'Simple Demo for storing and retrieve embedding from Google', 'fields': [{'name': 'pk', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': False}, {'name': 'words', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 50}}, {'name': 'embeddings', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 300}}]}

In [135]:
demo_3.description

'Simple Demo for storing and retrieve embedding from Google'

In [136]:
demo_3.primary_field

{'name': 'pk', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': False}

In [137]:
demo_3.indexes

[<pymilvus.orm.index.Index at 0x7ff6cfbbfbd0>]

In [138]:
# list all collections
utility.list_collections()

['demo_2', 'demo_4', 'demo_3', 'hello_milvus', 'demo']

In [147]:
# Insert data in a collection whose primary keys are generated by Milvus.
# The original cell opened "demo_5" without a schema and the recorded run failed in
# describe_collection - presumably because no such collection had been created.
# The payload also carries no pk column, so the collection is defined here with
# auto_id=True on the primary key, matching the two columns that are inserted.
demo_5 = Collection(
    "demo_5",
    CollectionSchema(
        [
            FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=True),
            FieldSchema(name="words", dtype=DataType.VARCHAR, max_length=50),
            FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=300),
        ],
        "Simple Demo for inserting embeddings with auto-generated primary keys",
    ),
)
data = [
    ['puppy','dog','king','queen','book','lion','jungle','tiger','study'],  # field words
    [puppy, dog, king, queen, book, lion, jungle, tiger, study],  # field embeddings
]

insert_result = demo_5.insert(data)
# After final entity is inserted, it is best to call flush to have no growing segments left in memory
demo_5.flush()

Unexpected error: [describe_collection], invalid literal for int() with base 10: 'null', <Time: {'RPC start': '2023-07-09 21:57:12.866331', 'Exception': '2023-07-09 21:57:12.874240'}>


MilvusException: <MilvusException: (code=1, message=Unexpected error, message=<invalid literal for int() with base 10: 'null'>)>

In [146]:
demo_3

<Collection>:
-------------
<name>: demo_3
<description>: Simple Demo for storing and retrieve embedding from Google
<schema>: {'auto_id': False, 'description': 'Simple Demo for storing and retrieve embedding from Google', 'fields': [{'name': 'pk', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': False}, {'name': 'words', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 50}}, {'name': 'embeddings', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 300}}]}