In [1]:
!pip freeze | grep langchain
!pip freeze | grep openai
!pip freeze | grep pypdf
!pip freeze | grep tiktoken

langchain==0.0.291
openai==0.28.0
pypdf==3.16.0
tiktoken==0.5.1


In [2]:
from langchain.embeddings import DeterministicFakeEmbedding

# Length of every embedding vector; passed to the embedding model below and
# reported in the ingestion log message.
vector_dim = 768
# Stand-in embedding model producing vectors of length `vector_dim`.
# NOTE(review): the class name suggests the same text always maps to the same
# vector — confirm against the langchain documentation.
embeddings = DeterministicFakeEmbedding(size=vector_dim)

In [3]:
from langchain.vectorstores import Milvus

# Connection settings handed to the Milvus vector store: a server on
# localhost:19530, blank user/password, and secure=False.
DEFAULT_MILVUS_CONNECTION = dict(
    host="localhost",
    port="19530",
    user="",
    password="",
    secure=False,
)

# Name of the collection that receives every chunk of this experiment.
collection_name = "our_20231011_collection_no_partitions"

# Vector store bound to that collection; texts are embedded with `embeddings`.
vector_store = Milvus(
    connection_args=DEFAULT_MILVUS_CONNECTION,
    collection_name=collection_name,
    embedding_function=embeddings,
    consistency_level="Session",
)

In [4]:
from langchain.docstore.document import Document


# How many synthetic chunks to generate.
NUM_CHUNKS = 10_000

# One Document per chunk index. The index appears in the text and is also stored
# under the 'chunk' metadata key so that search hits can be traced back to it.
documents = [
    Document(
        page_content=f"my text for chunk {chunk_idx}",
        metadata={'chunk': chunk_idx},
    )
    for chunk_idx in range(NUM_CHUNKS)
]

In [5]:
%%time
import random
from pymilvus import Collection
import time

# Seed Python's global RNG so anything relying on `random` is reproducible.
random.seed(42)

print(f"Ingesting {len(documents)} chunks of {vector_dim} dimension")

# Fail fast, with an explanation, when there is nothing to ingest.
if len(documents) == 0:
    raise ValueError("`documents` is empty: nothing to ingest into the vector store")

# No explicit collection initialisation is performed here.
# NOTE(review): `add_documents` is expected to create the collection on first
# insert — confirm for the pinned langchain version.

# Time only the insert call itself. The returned value is bound to a descriptive
# name rather than `_`, which IPython uses for the last cell output.
start_time = time.perf_counter()
inserted_ids = vector_store.add_documents(documents)
print(f"vector_store.add_documents execution_time={time.perf_counter()-start_time}s")

Ingesting 10000 chunks of 768 dimension
vector_store.add_documents execution_time=22.75188058299682s
CPU times: user 3.1 s, sys: 163 ms, total: 3.27 s
Wall time: 22.8 s


In [6]:
# Page through the search results for `query`, `k` hits at a time, by advancing
# the 'offset' search parameter. The score and chunk id of every hit are
# collected so that the pages can be checked afterwards.
k = 5
MAX_PAGES = 10
query = "my query"
search_params = {'offset': 0}

scores = []
chunks_ids = []
curr_page = 0
while curr_page < MAX_PAGES:
    page_offset = search_params['offset']
    print(f"curr_page={curr_page+1} with offset={page_offset}")
    output = vector_store.similarity_search_with_score(
        query=query,
        k=k,
        param=search_params,
    )
    # Each hit is a (document, score) pair; ranks keep counting across pages.
    for rank, (doc, score) in enumerate(output, start=page_offset + 1):
        chunk_id = doc.metadata['chunk']
        text = doc.page_content[:100].replace('\n', ' ')
        print(f"\t{rank}) chunk_id={chunk_id}, score={score}, text={text}...")
        scores.append(score)
        chunks_ids.append(chunk_id)
    curr_page += 1
    search_params['offset'] = page_offset + k

curr_page=1 with offset=0
	1) chunk_id=1417, score=1282.739501953125, text=my text for chunk 1417...
	2) chunk_id=7990, score=1294.706787109375, text=my text for chunk 7990...
	3) chunk_id=5123, score=1296.0478515625, text=my text for chunk 5123...
	4) chunk_id=8755, score=1299.026123046875, text=my text for chunk 8755...
	5) chunk_id=6416, score=1307.1116943359375, text=my text for chunk 6416...
curr_page=2 with offset=5
	6) chunk_id=9620, score=1313.47900390625, text=my text for chunk 9620...
	7) chunk_id=7775, score=1315.8236083984375, text=my text for chunk 7775...
	8) chunk_id=8851, score=1321.05029296875, text=my text for chunk 8851...
	9) chunk_id=4001, score=1321.920166015625, text=my text for chunk 4001...
	10) chunk_id=8206, score=1325.511962890625, text=my text for chunk 8206...
curr_page=3 with offset=10
	11) chunk_id=598, score=1326.9158935546875, text=my text for chunk 598...
	12) chunk_id=1218, score=1328.949462890625, text=my text for chunk 1218...
	13) chunk_id=1401, s

In [7]:
from collections import Counter

# Paging must never return the same chunk twice. The most repeated ids are put
# into the assertion message, because a bare expression that is not the last
# line of a cell is never displayed.
repeated_ids = [
    (chunk_id, hits)
    for chunk_id, hits in Counter(chunks_ids).most_common(4)
    if hits > 1
]
assert len(chunks_ids) == len(set(chunks_ids)), (
    f"duplicate chunk ids across pages (id, count): {repeated_ids}"
)

In [8]:
# Scores must be non-decreasing in retrieval order, including across page
# boundaries; on failure the message shows the position and the offending pair.
# NOTE(review): presumably a distance-like score where smaller means closer —
# confirm the collection's metric.
for i, (prev_score, next_score) in enumerate(zip(scores, scores[1:]), start=1):
    assert prev_score <= next_score, f"{i} {prev_score}, {next_score}"

In [9]:
# Total number of hits gathered over all pages.
len(chunks_ids)

50