In [1]:
# Print the installed versions of the packages named below, one grep per package.
!pip freeze | grep langchain
!pip freeze | grep openai
!pip freeze | grep pypdf
!pip freeze | grep tiktoken

langchain==0.0.291
openai==0.28.0
pypdf==3.16.0
tiktoken==0.5.1


In [2]:
from langchain.embeddings import DeterministicFakeEmbedding

# Size of each embedding vector; also reported in the ingestion cell's log line.
vector_dim = 768
# Fake embedding model sized to vector_dim.
# NOTE(review): presumably returns the same vector for the same text (going by
# the class name) — confirm against the LangChain implementation.
embeddings = DeterministicFakeEmbedding(size=vector_dim)

In [3]:
from langchain.vectorstores import Milvus

# Connection settings handed to the Milvus vector store below.
# NOTE(review): empty user/password with secure=False looks like a local,
# unauthenticated instance — confirm before pointing this at a shared server.
DEFAULT_MILVUS_CONNECTION = {
    "host": "localhost",
    "port": "19530",
    "user": "",
    "password": "",
    "secure": False,
}

# Name of the collection that the ingestion and search cells operate on.
collection_name = "our_20231011_collection"

# Vector store wrapper used by every later cell (ingestion and paginated search).
vector_store = Milvus(
    embedding_function=embeddings,
    collection_name=collection_name,
    connection_args=DEFAULT_MILVUS_CONNECTION,
    consistency_level="Session",
)

In [4]:
from langchain.docstore.document import Document


# Number of synthetic chunks to generate for the ingestion test.
NUM_CHUNKS = 10000

# One Document per chunk index: the text embeds the index, and the same index
# is stored under the 'chunk' metadata key so search hits can be traced back.
documents = [
    Document(
        page_content=f"my text for chunk {num_c}",
        metadata={'chunk': num_c},
    )
    for num_c in range(NUM_CHUNKS)
]

In [9]:
%%time
import random
from pymilvus import Collection
import time

# Seeds Python's global RNG; nothing else in this cell draws from `random`.
random.seed(42)

# Partition that receives every chunk ingested by this cell.
kb_id = "my_kb_15"

# Fail fast with an explanatory message: the first document is needed below to
# initialise the collection, so an empty list cannot be ingested.
if not documents:
    raise ValueError(
        "`documents` is empty: at least one chunk is required to initialise "
        "the collection and ingest into it"
    )

print(f"Ingesting {len(documents)} chunks of {vector_dim} dimension")

# If the collection hasn't been initialized yet, perform all steps to do so.
# The first document's embedding and metadata are passed to the store's
# (private) `_init` helper.
# NOTE(review): `_init` is a private LangChain method — confirm it still exists
# with this signature when upgrading langchain.
if not isinstance(vector_store.col, Collection):
    print("Creating a new collection")
    vector_store._init(
        embeddings.embed_documents([documents[0].page_content]),
        [documents[0].metadata]
    )


# If the current collection doesn't have a partition for this kb_id, create it now.
if not vector_store.col.has_partition(partition_name=kb_id):
    print("Creating a new partition")
    vector_store.col.create_partition(partition_name=kb_id)

# Time only the insertion itself; the surrounding %%time covers the whole cell.
start_time = time.perf_counter()
_ = vector_store.add_documents(
    documents,
    partition_name=kb_id,
)
print(f"vector_store.add_documents execution_time={time.perf_counter()-start_time}s")

Ingesting 10000 chunks of 768 dimension
Creating a new collection
Creating a new partition
vector_store.add_documents execution_time=109.3074236249995s
CPU times: user 3.04 s, sys: 244 ms, total: 3.28 s
Wall time: 1min 53s


In [61]:
# Page size, page budget and the query shared by every page request.
k = 5
MAX_PAGES = 10
query = "my query"

# Search parameters sent with each request; only the paging offset is set and
# it is advanced by one page (k results) after every request.
# NOTE(review): confirm whether passing `param` replaces the vector store's
# default search parameters rather than being merged with them.
search_params = {'offset': 0}

# Scores and chunk ids of every hit, in the order the pages returned them.
scores, chunks_ids = [], []

curr_page = 0
while curr_page < MAX_PAGES:
    print(f"curr_page={curr_page+1} with offset={search_params['offset']}")
    output = vector_store.similarity_search_with_score(
        query=query,
        k=k,
        param=search_params,
        partition_names=[kb_id],
    )

    # Each hit is a (document, score) pair; print it with its global rank.
    for i, chunk in enumerate(output):
        text = chunk[0].page_content[:100].replace('\n', ' ')
        print(f"\t{search_params['offset'] + i+1}) chunk_id={chunk[0].metadata['chunk']}, score={chunk[1]}, text={text}...")

    # Record this page's hits for the uniqueness/ordering checks in later cells.
    scores.extend(hit_score for _, hit_score in output)
    chunks_ids.extend(hit_doc.metadata['chunk'] for hit_doc, _ in output)

    curr_page += 1
    search_params['offset'] += k

curr_page=1 with offset=0
	1) chunk_id=464, score=1276.47021484375, text=my text for chunk 464...
	2) chunk_id=7775, score=1315.8236083984375, text=my text for chunk 7775...
	3) chunk_id=8206, score=1325.511962890625, text=my text for chunk 8206...
	4) chunk_id=9671, score=1326.6737060546875, text=my text for chunk 9671...
	5) chunk_id=1401, score=1329.3077392578125, text=my text for chunk 1401...
curr_page=2 with offset=5
	6) chunk_id=4236, score=1330.72705078125, text=my text for chunk 4236...
	7) chunk_id=3705, score=1332.610595703125, text=my text for chunk 3705...
	8) chunk_id=5903, score=1333.90966796875, text=my text for chunk 5903...
	9) chunk_id=1145, score=1336.4095458984375, text=my text for chunk 1145...
	10) chunk_id=6258, score=1342.9508056640625, text=my text for chunk 6258...
curr_page=3 with offset=10
	11) chunk_id=4290, score=1349.108154296875, text=my text for chunk 4290...
	12) chunk_id=981, score=1350.4874267578125, text=my text for chunk 981...
	13) chunk_id=3000,

In [62]:
from collections import Counter

# Chunk ids returned more than once across the collected pages, with their
# occurrence counts (limited to the four most frequent ids).
repeated_chunks = [
    (chunk_id, hits)
    for chunk_id, hits in Counter(chunks_ids).most_common(4)
    if hits > 1
]

# Every chunk should appear on exactly one page; on failure the message lists
# the offending ids instead of raising a bare AssertionError.
assert len(chunks_ids) == len(set(chunks_ids)), (
    f"duplicate chunk ids across pages (chunk_id, count): {repeated_chunks}"
)

AssertionError: 

In [63]:
# Scores collected across pages must never decrease from one hit to the next;
# on failure, report the position and the two out-of-order scores.
for i, score in enumerate(scores[1:], start=1):
    assert scores[i-1] <= score, f"{i} {scores[i-1]}, {scores[i]}"

AssertionError: 15 1354.885498046875, 1346.193359375

In [64]:
# Total number of hits collected by the pagination loop (one id per returned hit).
len(chunks_ids)

50