In [2]:
from pymilvus import connections, utility
from pymilvus import Collection, DataType, FieldSchema, CollectionSchema
import time
from os import getenv
from dotenv import load_dotenv, find_dotenv

In [22]:
from helper import *

  from .autonotebook import tqdm as notebook_tqdm


In [23]:
# Split the text file "output.txt" into chunks.
# NOTE(review): `text_splitter` is presumably supplied by the `helper` star
# import; its chunking rules are not visible in this notebook — confirm there.
chunks = text_splitter("output.txt")

Total chunks: 2886


In [24]:
# Compute one embedding per chunk with the project helper.
# NOTE(review): the collection schema in this notebook declares a 384-dim
# vector field, so the helper presumably returns 384-dim vectors — confirm.
embeddings = generate_embeddings(chunks)

In [3]:
# Locate the .env file, then load it; the result of the load call is left as
# the final expression so it is shown as the cell output.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

True

In [4]:
# Endpoint of the Milvus/Zilliz instance this notebook connects to.
milvus_uri = "https://in03-c326a50a933fb09.api.gcp-us-west1.zillizcloud.com"
# Access token is read from the "milvus_key" environment variable so it is not
# stored in the notebook. NOTE(review): getenv returns None when the variable
# is unset, in which case the connection is attempted with token=None.
milvus_key = getenv("milvus_key")

In [5]:
# Announce the target before connecting, so that a slow or failing connection
# attempt still shows which endpoint was being contacted.
print(f"Connecting to DB: {milvus_uri}")

# Register the connection under the "default" alias, which the disconnect
# cell at the end of this notebook refers to.
connections.connect(alias="default", uri=milvus_uri, token=milvus_key)

Connecting to DB: https://in03-c326a50a933fb09.api.gcp-us-west1.zillizcloud.com


In [8]:
# Name of the Milvus collection; used when (re)creating the collection and in
# the progress messages printed by the insert cell.
collection_name = "Chums"

In [30]:
# (Re)create the collection from scratch. An existing collection with the same
# name is dropped first, so re-running this cell discards previously stored data.
TEXT_MAX_LENGTH = 8192  # max_length of the VARCHAR field holding a chunk
EMBEDDING_DIM = 384     # NOTE(review): must equal the embedding size produced by generate_embeddings — confirm

if utility.has_collection(collection_name):
    utility.drop_collection(collection_name)

# The chunk text itself is the primary key (auto_id is off).
# NOTE(review): identical chunks would therefore share a primary key — confirm
# this is acceptable for the source text.
text_field = FieldSchema(
    name="text",
    dtype=DataType.VARCHAR,
    max_length=TEXT_MAX_LENGTH,
    description="text",
    is_primary=True,
    auto_id=False,
)

# Non-primary vector field; is_primary/auto_id are left at their defaults.
embedding_field = FieldSchema(
    name="embedding",
    dtype=DataType.FLOAT_VECTOR,
    dim=EMBEDDING_DIM,
    description="embedding",
)

schema = CollectionSchema(
    fields=[text_field, embedding_field],
    description="collection description",
)

print(f"Creating collection: {collection_name}")
collection = Collection(name=collection_name, schema=schema)
print(f"Schema: {schema}")
print("Success!")

Creating collection: Chums
Schema: {'auto_id': False, 'description': 'collection description', 'fields': [{'name': 'text', 'description': 'text', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 8192}, 'is_primary': True, 'auto_id': False}, {'name': 'embedding', 'description': 'embedding', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'enable_dynamic_field': False}
Success!


In [32]:
# Insert the data in fixed-size batches, pausing briefly after each one.
# This keeps individual requests small and throttles the request rate; it is a
# temporary measure until a better bulk-insert approach is adopted.
BATCH_SIZE = 200    # rows sent per insert call
PAUSE_SECONDS = 1   # delay after each batch

# Column-based insert pairs chunks[k] with embeddings[k]; refuse to write
# anything if the two columns are not the same length.
if len(chunks) != len(embeddings):
    raise ValueError(
        f"chunks ({len(chunks)}) and embeddings ({len(embeddings)}) differ in length"
    )

print(f"Inserting data into collection: {collection_name}")

for start in range(0, len(chunks), BATCH_SIZE):
    stop = start + BATCH_SIZE  # slicing clamps the final, shorter batch

    # Columns are given in schema order: text first, then embedding.
    ins_res = collection.insert([chunks[start:stop], embeddings[start:stop]])

    print(ins_res)

    time.sleep(PAUSE_SECONDS)

print("Success!")

Inserting data into collection: Chums
(insert count: 200, delete count: 0, upsert count: 0, timestamp: 449284590454702083, success count: 200, err count: 0)
(insert count: 200, delete count: 0, upsert count: 0, timestamp: 449284590979252228, success count: 200, err count: 0)
(insert count: 200, delete count: 0, upsert count: 0, timestamp: 449284591988244481, success count: 200, err count: 0)
(insert count: 200, delete count: 0, upsert count: 0, timestamp: 449284592538746881, success count: 200, err count: 0)
(insert count: 200, delete count: 0, upsert count: 0, timestamp: 449284592971284484, success count: 200, err count: 0)
(insert count: 200, delete count: 0, upsert count: 0, timestamp: 449284593364762625, success count: 200, err count: 0)
(insert count: 200, delete count: 0, upsert count: 0, timestamp: 449284593797300225, success count: 200, err count: 0)
(insert count: 200, delete count: 0, upsert count: 0, timestamp: 449284594622791682, success count: 200, err count: 0)
(insert co

' print(f"Inserting data into collection: {collection_name}")\n\nins_res = collection.insert([chunks, embeddings])\n\nprint(ins_res)\n    \nprint("Success!") '

In [33]:
# Flush the collection once all batches have been inserted.
# NOTE(review): presumably this makes the inserted rows durable before the
# index is built — confirm against the pymilvus documentation.
print("Flushing data...")
collection.flush()

Flushing data...


In [34]:
# Build the vector index on the "embedding" field.
print("Creating index...")

# AUTOINDEX with the L2 metric; no extra build parameters are supplied.
index_params = {
    "index_type": "AUTOINDEX",
    "metric_type": "L2",
    "params": {},
}

collection.create_index(
    field_name="embedding",
    index_params=index_params,
)

print("Success!")


Creating index...
Success!


In [35]:
# Load the collection so it can be searched.
# NOTE(review): the next cell calls collection.load() again with timing; one
# of the two is likely redundant — confirm which should remain.

print("Loading data...")
collection.load()

print("Success!")

Loading data...
Success!


In [9]:
# Measure how long loading the collection takes. perf_counter is used instead
# of time.time() because it is monotonic and intended for interval timing, so
# a system clock adjustment cannot distort the reported duration.
print("Loading collection...")
load_started = time.perf_counter()
collection.load()
elapsed = time.perf_counter() - load_started
print(f"Succeed in {round(elapsed, 4)} seconds!")


Loading collection...
Succeed in 4.4058 seconds!


In [40]:
# Embed the query text with the same helper that embedded the chunks; the
# question is passed as a single-element list.
emb1 = generate_embeddings(["What is Rstudio?"])

In [43]:
# Nearest-neighbour lookup for the query embedding.
print("Searching for similar embeddings...")

# NOTE(review): "nprobe" is normally an IVF-family search parameter, while the
# index created in this notebook is AUTOINDEX — confirm it has any effect here.
search_params = {
    "metric_type": "L2",
    "params": {"nprobe": 16},
}

# Return the 5 closest rows, with no filter expression, and include the
# stored "text" field in each hit.
query_res = collection.search(
    data=emb1,
    anns_field="embedding",
    param=search_params,
    limit=5,
    expr=None,
    output_fields=["text"],
)

# One result set per query vector; only one query was sent, so show the first.
print(query_res[0])

Searching for similar embeddings...
["id: menu, as shown in Figure 4-10. This book assumes that you use the default Start menu setup. If you choose some other location, you may need to modify procedures later in the book. FIGURE 4-8: The setup process begins by telling you that you’re installing RStudio. FIGURE 4-9: Specify an installation location for RStudio. CHAPTER 4 Installing an R Distribution 5516. Choose Start Menu Configuration (if necessary) and then click Install. You see an Installing dialog box that tells you about the installation progress. After the process completes, you see a completion dialog box appear in its place. 17. Click Finish. You’re ready to begin using RStudio. FIGURE 4-10: Define the Start menu informa- tion as needed. A WORD ABOUT THE SCREENSHOTS As you work your way through the book, you’ll use an IDE of your choice to open the R files containing the book’s source code. Every screenshot that contains IDE-specific information relies on RStudio because RStu

In [44]:
# Close the connection that was registered under the "default" alias.
connections.disconnect(alias="default")