In [1]:
import fitz  # PyMuPDF

# Source document and chunking granularity.
pdf_path = "IPL_Teams.pdf"
CHUNK_SIZE = 500  # characters per chunk

text_chunks = []   # chunk texts, in reading order
page_numbers = []  # 1-based page number of each chunk (parallel to text_chunks)

# Open the PDF in a context manager so the file handle is released as soon as
# extraction finishes, instead of staying open for the whole kernel session.
with fitz.open(pdf_path) as doc:
    for page_num, page in enumerate(doc, start=1):
        text = page.get_text()

        # Fixed-width, non-overlapping character windows; a page with no
        # extractable text produces no chunks.
        chunks = [text[i:i + CHUNK_SIZE] for i in range(0, len(text), CHUNK_SIZE)]

        text_chunks.extend(chunks)
        page_numbers.extend([page_num] * len(chunks))

print(f"Total Chunks: {len(text_chunks)}")


Total Chunks: 585


In [2]:
from sentence_transformers import SentenceTransformer

# Small, fast sentence-embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Embed every chunk (384-dim vectors), then convert the result to plain
# Python lists
chunk_vectors = model.encode(sentences=text_chunks, show_progress_bar=True)
embeddings = chunk_vectors.tolist()


  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 19/19 [00:05<00:00,  3.19it/s]


In [3]:
from pymilvus import connections, FieldSchema, CollectionSchema, Collection, DataType

# Open the "default" connection to a Milvus server on localhost:19530
connections.connect(alias="default", host="localhost", port="19530")

# Collection layout: auto-generated primary key, chunk text, page number,
# and the 384-dim embedding vector
id_field = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True)
text_field = FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=1000)
page_field = FieldSchema(name="page", dtype=DataType.INT64)
embedding_field = FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=384)
fields = [id_field, text_field, page_field, embedding_field]

schema = CollectionSchema(fields, description="Storing IPL PDF content")

# Instantiate the collection with this schema
collection = Collection(name="ipl_teams_search", schema=schema)

# HNSW index over the embedding field, using the cosine metric
hnsw_index = {
    "index_type": "HNSW",
    "metric_type": "COSINE",
    "params": {"M": 8, "efConstruction": 64},
}
collection.create_index("embedding", hnsw_index)

collection.load()


In [4]:
# The three column lists must stay row-aligned: row i is
# (text_chunks[i], page_numbers[i], embeddings[i]).
assert len(text_chunks) == len(page_numbers) == len(embeddings), (
    "text_chunks, page_numbers and embeddings must have the same length"
)

# Column-ordered data for the non-auto fields of the schema; the "id" primary
# key is declared with auto_id=True, so it is not supplied here.
entities = [
    text_chunks,     # text
    page_numbers,    # page
    embeddings       # embedding
]

collection.insert(entities)

# Flush so the inserted rows are sealed and persisted rather than left only
# in the insert buffer.
collection.flush()

# NOTE: re-running this cell inserts the same rows again under new auto IDs.
print("Data inserted into Milvus!")


Data inserted into Milvus!


In [7]:
query = "What is Chennai  Super  Kings ?"
query_vec = model.encode([query])[0]

# Semantic search combined with a scalar filter on the page field
search_params = {"metric_type": "COSINE", "params": {"ef": 64}}
page_filter = "page >= 1 && page <= 5"  # optional filter

results = collection.search(
    data=[query_vec],
    anns_field="embedding",
    param=search_params,
    limit=3,
    expr=page_filter,
    output_fields=["text", "page"],
)

# Only one query vector was submitted, so its hits are in results[0]
for hit in results[0]:
    page = hit.entity.get("page")
    snippet = hit.entity.get("text")[:300]
    print(f"Score: {hit.score:.4f} | Page: {page}")
    print(f"Text: {snippet}...\n")


Score: 0.6961 | Page: 2
Text: Hyderabad.[2] India Cements acquired the rights to the franchise for 10 years. Former
ICC  Chairman  N.  Srinivasan  was  the  de  facto  owner  of  the  Chennai  Super  Kings,  by
means of his position as the vice-chairman and managing director of India Cements Ltd.
The franchisee was transferred t...

Score: 0.6939 | Page: 1
Text: Chennai Super Kings
T20 kit
Chennai  Super  Kings  (CSK)  is  an  Indian  professional  T20  cricket  franchise  based  in
Chennai, Tamil Nadu. The team competes in the Indian Premier League (IPL) and was
one of the eight franchises incorporated when the league was established in 2008. The
team  pla...

Score: 0.5361 | Page: 3
Text: x wickets as they chased down the
target of 193 with two balls to spare with skipper Dhoni scoring an unbeaten 54 from 29
balls.[17]  Thus,  with  seven  wins  from  14  matches,  Chennai  finished  with  the  same
number of points as three other teams with two semi-final spots at stake. Chennai got