In [1]:
import tensorflow_hub as hub
from pymilvus import CollectionSchema, FieldSchema, DataType, Collection, utility, connections


2024-01-02 23:11:56.137900: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:

# Connect to Milvus server
connections.connect(
  alias="default",
  user='username',
  password='password',
  host='localhost',
  port='19530'
)


In [3]:

# Define PrimaryKey field
primary_field = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True)

# Define field schema for document vectors
doc_name_field = FieldSchema(name="document_name", dtype=DataType.VARCHAR, max_length=200,
  # The default value will be used if this field is left empty during data inserts or upserts.
  # The data type of `default_value` must be the same as that specified in `dtype`.
  default_value="Unknown")
doc_vector_field = FieldSchema(name="document_vectors", dtype=DataType.FLOAT_VECTOR, dim=512)

# Create a collection schema
schema = CollectionSchema(fields=[primary_field, doc_name_field, doc_vector_field], description="Collection for storing document vectors")

# Specify collection parameters
collection_name = "document_embeddings"
shards_num = 2

# Create a collection
collection = Collection(name=collection_name, schema=schema, using='default', shards_num=shards_num)



In [4]:

# Load model
# From modal available locally
model_path = 'universal-sentence-encoder_4'
model = hub.load(model_path)

In [5]:

def embed(input):
    return model(input)

def embed_and_add_to_collection(document_data, collection_name):
    collection = Collection(collection_name)
    embeddings = embed([content for _, content in document_data])
    
    # Convert embeddings to list
    vectors = [embedding.numpy().tolist() for embedding in embeddings]
    
    # Get document names
    doc_names = [doc_name for doc_name, _ in document_data]
    
    data = [
        doc_names,
        vectors
    ]

    status = collection.insert(data)
    print(status)
    

In [6]:
# Load collection of word documents along with their names
document_data = [
    ('document1.txt', open('document1.txt').read()),
    ('document2.txt', open('document2.txt').read()),
    ('document3.txt', open('document3.txt').read()),
    ('document4.txt', open('document4.txt').read()),
    ('document5.txt', open('document5.txt').read()),
]

In [7]:

# Represent documents as vectors

# Embed each document in the collection
embed_and_add_to_collection(document_data, collection_name)


(insert count: 5, delete count: 0, upsert count: 0, timestamp: 446754647392911366, success count: 5, err count: 0)


In [9]:
# Build Index
index_params = {
  "metric_type":"COSINE",
  "index_type":"IVF_FLAT",
  "params":{
    "nlist": 2048
  }
}

# Get an existing collection.
collection = Collection(collection_name)      
collection.create_index(
  field_name="document_vectors", 
  index_params=index_params
)

utility.index_building_progress(collection_name)

{'total_rows': 0, 'indexed_rows': 0, 'pending_index_rows': 0}

In [28]:

# Load target document
target_document = open('document3.txt').read()

# Embed target document
target_document_embedding = embed([target_document])[0]

# Convert to numpy array
target_document_list = [target_document_embedding.numpy().tolist()]



In [29]:

# Search for similar documents
top_k = 5
search_param = {
    'nprobe': 16
}

# Search in collection
collection = Collection(collection_name)
collection.load()

# Prepare search parameters
search_params = {
    "metric_type": "COSINE",
    "params": {"nprobe": 16}
}

results = collection.search(data=target_document_list, 
                            anns_field="document_vectors", 
                            param=search_params, 
                            limit=top_k, 
                            expr=None, 
                            output_fields=['document_name'], 
                            consistency_level="Strong")

In [30]:
# Get top hit
hit = results[0][0]


# Print the top 1 hit document name and distance
print(f"Document name: {hit.entity.get('document_name')}, Distance: {hit.distance}")


Document name: document3.txt, Distance: 1.000000238418579


In [31]:
# Assuming 'result' is of type pymilvus.client.abstract.Hits
hits = results[0]  # Assuming the hits are in the first element of the Hits object

# Iterate through the hits and print document names and distances
for hit in hits:
    document_name = hit.entity.get('document_name')
    distance = hit.distance
    sim_percent = round(distance * 100, 2)
    print(f"Document Name: {document_name}, Distance: {distance}, Similarity: {sim_percent} %")


Document Name: document3.txt, Distance: 1.000000238418579, Similarity: 100.0 %
Document Name: document5.txt, Distance: 0.1426936239004135, Similarity: 14.27 %
Document Name: document4.txt, Distance: 0.137556254863739, Similarity: 13.76 %
Document Name: document1.txt, Distance: 0.10334134101867676, Similarity: 10.33 %
Document Name: document2.txt, Distance: 0.04267469048500061, Similarity: 4.27 %
