In [3]:
!pip install pymilvus

Collecting pymilvus
  Using cached pymilvus-2.6.9-py3-none-any.whl.metadata (6.8 kB)
Collecting setuptools>69 (from pymilvus)
  Downloading setuptools-82.0.0-py3-none-any.whl.metadata (6.6 kB)
Collecting grpcio!=1.68.0,!=1.68.1,!=1.69.0,!=1.70.0,!=1.70.1,!=1.71.0,!=1.72.1,!=1.73.0,>=1.66.2 (from pymilvus)
  Downloading grpcio-1.78.1-cp313-cp313-win_amd64.whl.metadata (3.9 kB)
Collecting orjson>=3.10.15 (from pymilvus)
  Using cached orjson-3.11.7-cp313-cp313-win_amd64.whl.metadata (43 kB)
Collecting protobuf>=5.27.2 (from pymilvus)
  Downloading protobuf-6.33.5-cp310-abi3-win_amd64.whl.metadata (593 bytes)
Collecting pandas>=1.2.4 (from pymilvus)
  Downloading pandas-3.0.1-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting cachetools>=5.0.0 (from pymilvus)
  Downloading cachetools-7.0.1-py3-none-any.whl.metadata (5.6 kB)
Collecting numpy>=1.26.0 (from pandas>=1.2.4->pymilvus)
  Downloading numpy-2.4.2-cp313-cp313-win_amd64.whl.metadata (6.6 kB)
Collecting tzdata (from pandas>=1.2.4-


[notice] A new release of pip is available: 25.2 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
from pymilvus import connections, utility

# Connect to Milvus
connections.connect(
    alias="default",
    host="localhost",
    port="19530"
)

print("✅ Connected to Milvus")

# Optional: Check existing collections
collections = utility.list_collections()
print("Existing collections:", collections)


✅ Connected to Milvus
Existing collections: []


In [5]:
from pymilvus import FieldSchema, CollectionSchema, DataType, Collection

fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),

    FieldSchema(name="contract_id", dtype=DataType.INT64),

    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=1024),

    FieldSchema(name="clause_type", dtype=DataType.VARCHAR, max_length=100),
    FieldSchema(name="contract_type", dtype=DataType.VARCHAR, max_length=100),
    FieldSchema(name="text_chunk", dtype=DataType.VARCHAR, max_length=5000),
]



In [6]:
schema = CollectionSchema(fields, description="Policy Clause Embeddings")

collection = Collection(
    name="legal_policy_vectors",
    schema=schema
)

index_params = {
    "metric_type": "COSINE",
    "index_type": "IVF_FLAT",
    "params": {"nlist": 128}
}

collection.create_index(
    field_name="embedding",
    index_params=index_params
)

collection.load()

In [8]:
from sentence_transformers import SentenceTransformer

# Load BGE-M3
model = SentenceTransformer("BAAI/bge-m3")

# Confirm dimension
test_embedding = model.encode("test sentence", normalize_embeddings=True)
print("Embedding dimension:", len(test_embedding))

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Loading weights: 100%|██████████| 391/391 [00:00<00:00, 584.05it/s, Materializing param=pooler.dense.weight]                               


Embedding dimension: 1024


In [9]:
def chunk_text(text, chunk_size=500):
    chunks = []
    current_chunk = ""

    for sentence in text.split(". "):
        if len(current_chunk) + len(sentence) <= chunk_size:
            current_chunk += sentence + ". "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + ". "

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

In [None]:
import os

def insert_document(file_path, clause_type, contract_type):
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    chunks = chunk_text(text)

    # IMPORTANT: normalize for cosine similarity
    embeddings = model.encode(
        chunks,
        normalize_embeddings=True
    )

    data = [
        embeddings.tolist(),
        [os.path.basename(file_path)] * len(chunks),
        [clause_type] * len(chunks),
        [contract_type] * len(chunks),
        chunks
    ]

    collection.insert(data)
    collection.flush()

    print(f"✅ Inserted {len(chunks)} chunks from {file_path}")

In [10]:
from pymilvus import utility

utility.drop_collection("legal_policy_vectors")

SchemaNotReadyException: <SchemaNotReadyException: (code=1, message=Collection 'legal_policy_vectors' not exist, or you can pass in schema to create one.)>