## Milvus Vector DB for RAG 

### Create a collection of legal docs

In [None]:
# pip install pymilvus
# pip install milvus-cli

In [None]:
from pymilvus import connections

# Open the connection registered under the "default" alias.
# Host/port below are for a local Docker standalone Milvus.
connections.connect("default", host="127.0.0.1", port="19530")

In [None]:
from pymilvus import db

# 1. Create the database only if it is not there yet, so this cell works on a
#    fresh Milvus instance and can also be re-run without manual edits.
if "rag_db" not in db.list_database():
    db.create_database("rag_db")

# 2. Switch to that database
db.using_database("rag_db")


In [None]:
from pymilvus import FieldSchema, CollectionSchema, DataType, Collection

# 3. Describe the legal-document fields, then create the collection.

# Primary key supplied by the caller (auto_id is disabled).
doc_id_field = FieldSchema(
    name="doc_id", dtype=DataType.INT64, is_primary=True, auto_id=False
)
# Scalar metadata stored alongside each vector.
title_field = FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=256)
jurisdiction_field = FieldSchema(
    name="jurisdiction", dtype=DataType.VARCHAR, max_length=64
)
# Date kept as an integer, e.g. YYYYMMDD.
date_field = FieldSchema(name="date", dtype=DataType.INT64)
# 768-dimensional float vector holding the document embedding.
embedding_field = FieldSchema(
    name="embedding", dtype=DataType.FLOAT_VECTOR, dim=768
)

fields = [
    doc_id_field,
    title_field,
    jurisdiction_field,
    date_field,
    embedding_field,
]
schema = CollectionSchema(
    fields=fields,
    description="Legal document embeddings with metadata",
)

# Create the collection from the schema above.
collection = Collection(name="legal_docs_2", schema=schema)

In [None]:
import numpy as np

# Placeholder embeddings: two random 768-value vectors
# (in reality, generate them from a model like OpenAI/LegalBERT).
embedding1, embedding2 = (np.random.rand(768).tolist() for _ in range(2))

# Quick sanity check: vector lengths, then the first 5 values of each.
print(f"Embedding1 Length: {len(embedding1)}")
print(f"Embedding2 Length: {len(embedding2)}")
print(f"Embedding1 Sample: {embedding1[:5]}")
print(f"Embedding2 Sample: {embedding2[:5]}")

# Column-oriented payload: one list per field, two documents in total.
doc_ids = [1, 2]
titles = ["Employment Law Case", "Tax Dispute"]
jurisdictions = ["California", "New York"]
dates = [20230901, 20230715]
embeddings = [embedding1, embedding2]
docs = [doc_ids, titles, jurisdictions, dates, embeddings]

# Build an IVF_FLAT index with the COSINE metric on the vector field.
index_params = {
    "index_type": "IVF_FLAT",
    "metric_type": "COSINE",
    "params": {"nlist": 128},
}
collection.create_index("embedding", index_params=index_params)

# Insert the two documents, then flush the collection.
collection.insert(docs)
collection.flush()



In [None]:
# collection.insert(docs)


In [None]:
# Load the collection before running the search in the next cell.
collection.load()

In [None]:
# Use document 1's own embedding as the query, i.e. "find docs similar to doc 1".
query_vector = embedding1

# Search settings: COSINE metric with nprobe set to 10.
search_params = {"metric_type": "COSINE", "params": {"nprobe": 10}}
# Filter expression: search only in the California jurisdiction.
jurisdiction_filter = 'jurisdiction == "California"'
# Metadata fields to return with every hit.
returned_fields = ["doc_id", "title", "jurisdiction", "date"]

results = collection.search(
    data=[query_vector],
    anns_field="embedding",
    param=search_params,
    limit=3,
    expr=jurisdiction_filter,
    output_fields=returned_fields,
)

# One query vector was sent, so the hits for it are in results[0].
for hit in results[0]:
    entity = hit.entity
    print(f"doc_id={entity.get('doc_id')}, "
          f"title={entity.get('title')}, "
          f"jurisdiction={entity.get('jurisdiction')}, "
          f"score={hit.distance}")
