# Start Milvus

In [None]:
#curl -sfL https://raw.githubusercontent.com/milvus-io/milvus/master/scripts/standalone_embed.sh -o standalone_embed.sh

#bash standalone_embed.sh start


# Set API KEY

In [None]:
import os
from dotenv import load_dotenv

# Read key/value pairs from a local .env file into the process environment.
load_dotenv()
import openai

# Hand the key to the OpenAI client; the value is None when the variable is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [None]:
from pymilvus import connections,utility,Collection,CollectionSchema, FieldSchema,DataType
from langchain.vectorstores import Milvus

In [None]:
from langchain.document_loaders import PyPDFLoader ,PyPDFDirectoryLoader
from langchain.text_splitter import CharacterTextSplitter

In [None]:
from langchain.document_loaders.image import UnstructuredImageLoader
from langchain.document_loaders import UnstructuredFileLoader

# Connect to milvus

In [None]:
# Open a pymilvus connection to the Milvus server on this machine.
connections.connect(
    host="localhost",
    port="19530",
)

# Load Files

In [None]:
pdf_folder_path="/Users/satyak/iceberg/milvus/class/invoices/"

In [None]:
loader = PyPDFDirectoryLoader(pdf_folder_path)
pages = loader.load_and_split()

In [None]:
pages[0]

# Split documents

In [None]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0) 
docs = text_splitter.split_documents(pages)

In [None]:
docs[0]

# Embedding

In [None]:
from langchain.embeddings import OpenAIEmbeddings

# Embedding model handed to the vector store when documents are indexed.
embeddings = OpenAIEmbeddings()

# Collection (Index)

In [None]:
COLLECTION_NAME = "invoices"

In [None]:
if utility.has_collection(COLLECTION_NAME):
    utility.drop_collection(COLLECTION_NAME)

In [None]:
print(utility.list_collections())

In [None]:
# Index settings passed to the vector store: an IVF_FLAT index using the
# L2 metric, built with nlist=16384.
index_params = dict(
    index_type="IVF_FLAT",
    metric_type="L2",
    params={"nlist": 16384},
)

In [None]:
# Search settings. Milvus search parameters name the metric with the key
# "metric_type" (the same key index_params uses); the previous key "metric"
# is not a recognised search parameter. L2 matches the metric in index_params.
search_params = {"metric_type": "L2", "offset": 0}

# Load documents

In [None]:
# Embed the documents and store them in the collection.
#
# `docs` (the chunks produced by text_splitter in the split step) is indexed
# here instead of the unsplit `pages`; previously the split output was never
# used, so the initial load was chunked differently from files added later
# through insert_data(), which does run text_splitter.
# NOTE(review): switch back to `pages` if whole-page documents were intended.
vector_db = Milvus.from_documents(
    docs,
    embeddings,
    connection_args={"host": "127.0.0.1", "port": "19530"},
    collection_name=COLLECTION_NAME,  # custom collection name
    search_params=search_params,  # reuse the notebook-level search settings
    index_params=index_params,
)

# Query documents

In [None]:
query = "08/22/2025 Invoices"
docs = vector_db.similarity_search(query)

In [None]:
len(docs)

# Document metadata info

In [None]:
# Print the text of every hit, one after another.
for hit in docs:
    print(hit.page_content)

## Milvus collection schema and collection info

In [None]:
print(f"Default collection name - {vector_db.collection_name}")
print(f"Default search params - {vector_db.search_params}")
print(f"Default index params - {vector_db.index_params}") #HNSW

In [None]:
vector_db.collection_name

In [None]:
print(utility.list_collections())

In [None]:
collection = Collection(COLLECTION_NAME)  

In [None]:
collection.schema.fields

# Query a collection

In [None]:
# Fetch key, page number and source path for rows whose pk is non-negative.
res = collection.query(
    expr="pk >= 0",
    output_fields=["pk", "page", "source"],
)
for row in res:
    print(row["pk"], row["source"], row["page"])

In [None]:
# Second similarity search. The query text lives in `query`; the previous
# code passed an undefined name (`search_terms`) and raised NameError.
query = "Sep, 2022"
docs = vector_db.similarity_search(query)

In [None]:
# 

In [None]:
# Fetch the stored fields for rows whose pk is non-negative and print each
# row's key together with its embedding vector.
res = collection.query(
    expr="pk >= 0",
    output_fields=["pk", "vector", "source", "page", "text"],
)
for row in res:
    print(row["pk"], row["vector"])

# delete a file 

In [None]:
# List each row's key next to the file it came from, to see which
# sources are currently stored.
res = collection.query(
    expr="pk >= 0",
    output_fields=["pk", "vector", "source", "page", "text"],
)
for row in res:
    print(row["pk"], row["source"])

In [None]:
def delete_file(file):
    """Delete every row of the collection whose ``source`` equals *file*.

    Args:
        file: Source path exactly as stored in the ``source`` field.

    Returns:
        The number of rows submitted for deletion (0 when nothing matched).
    """
    # Escape backslashes and single quotes so a path containing them cannot
    # end the quoted literal in the filter expression early.
    quoted = file.replace("\\", "\\\\").replace("'", "\\'")
    rows = collection.query(
        expr="source == '" + quoted + "'",
        # Only the key and the path are used below, so the embedding
        # vector is no longer fetched.
        output_fields=["pk", "source"],
    )
    if not rows:
        return 0

    for row in rows:
        print("Deleting:", row["pk"], row["source"])

    # Send one delete request covering all matching keys instead of one
    # request per row.
    expr = "pk in [" + ", ".join(str(row["pk"]) for row in rows) + "]"
    print(expr)
    collection.delete(expr)
    return len(rows)


In [None]:
file='/Users/satyak/iceberg/milvus/class/invoices/Invoice_2023_08.pdf'

In [None]:
delete_file(file)

# Check file exists

In [None]:
def check_file_data(file):
    """Report whether the collection holds any row whose ``source`` is *file*.

    Prints one line per matching row, or a single "not exists" line when
    nothing matches.

    Args:
        file: Source path exactly as stored in the ``source`` field.

    Returns:
        True when at least one row matches, otherwise False.
    """
    # Escape backslashes and single quotes so a path containing them cannot
    # end the quoted literal in the filter expression early.
    quoted = file.replace("\\", "\\\\").replace("'", "\\'")
    rows = collection.query(
        expr="source == '" + quoted + "'",
        # Only the key and the path are printed, so the embedding vector
        # is no longer fetched.
        output_fields=["pk", "source"],
    )
    if not rows:
        print("File Not exists:", file)
        return False

    for row in rows:
        print("File exists:", row["pk"], row["source"])
    return True



In [None]:
check_file_data(file)

In [None]:
file='/Users/satyak/iceberg/milvus/class/invoices/Invoice_2022_041.pdf'

In [None]:
check_file_data(file)

# Insert a file

In [None]:
def insert_data(file):
    """Load one PDF, split it into chunks and add them to the collection.

    Args:
        file: Path of the PDF to index.

    Returns:
        The Milvus vector store the chunks were written through, or None
        when the PDF produced no chunks. (Previously the store was bound to
        a local name and discarded.)
    """
    print("Inserting")
    chunks = text_splitter.split_documents(PyPDFLoader(file).load())
    if not chunks:
        # Nothing to embed or store; skip the round trip to Milvus.
        print("No content to insert:", file)
        return None

    return Milvus.from_documents(
        documents=chunks,
        embedding=embeddings,
        connection_args={"host": "127.0.0.1", "port": "19530"},
        collection_name=COLLECTION_NAME,  # custom collection name
        # Milvus search parameters name the metric "metric_type", not "metric".
        search_params={"metric_type": "L2", "offset": 0},
        index_params=index_params,
    )


In [None]:
file='/Users/satyak/iceberg/milvus/class/invoices/Invoice_2023_08.pdf'

In [None]:
insert_data(file)


In [None]:
check_file_data(file)

# Upsert a file (delete and insert)

In [None]:
def upsert_file_data(file):
    """Replace the stored rows for *file* with a freshly indexed copy.

    Rows already stored for the path are deleted first; the file is then
    inserted whether or not anything was deleted.
    """
    already_indexed = check_file_data(file)
    if already_indexed:
        delete_file(file)
    insert_data(file)

In [None]:
# Re-index the currently selected file: any rows already stored for it are
# deleted, then the file is inserted again.
upsert_file_data(file)