In [1]:
# Step 1: Install required libraries
!pip install sentence-transformers pymupdf pymilvus

# Step 2: Extract text from the PDF
import fitz  # PyMuPDF

# Path of the PDF to process. It is a relative path, so it is looked up from
# the notebook's current working directory; if the file is not there the
# extraction step fails with "FileNotFoundError: no such file: 'sample.pdf'".
pdf_path = "sample.pdf"

def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A single string with the text of all pages in page order. No
        separator is inserted between pages. Empty if the document has no
        pages.

    Raises:
        FileNotFoundError: If no file exists at ``pdf_path`` (raised by
            ``fitz.open``).
    """
    # The context manager closes the document (and its underlying file
    # handle) even if text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        # join() assembles the result once instead of re-copying the growing
        # string for every page.
        return "".join(page.get_text("text") for page in doc)

# Run the extraction on the configured PDF and keep the full text for the
# later steps.
extracted_text = extract_text_from_pdf(pdf_path)

# Sanity check: echo only the leading 1000 characters so a large document
# does not flood the cell output.
print(extracted_text[0:1000])

# Step 3: Generate embeddings using PyTorch
from sentence_transformers import SentenceTransformer

# Load the pretrained sentence-embedding model by name. The Milvus schema in
# Step 4 declares dim=384 for the stored vectors, so a different model may
# require adjusting that value.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Split text into sections
# Simple sentence split on ". ". Fragments that are empty or whitespace-only
# (e.g. from an empty document or repeated separators) are dropped so they
# are not embedded and stored as meaningless rows.
text_sections = [
    section for section in extracted_text.split(". ") if section.strip()
]

def generate_embeddings(text_sections):
    """Encode the given text sections with the module-level ``model``.

    Args:
        text_sections: The strings to embed.

    Returns:
        The result of ``model.encode`` called with ``convert_to_tensor=True``.
    """
    return model.encode(text_sections, convert_to_tensor=True)

# Embed every section in a single call.
embeddings = generate_embeddings(text_sections)

# Report how many vectors came back, then peek at the first two of them.
print(f"Generated {len(embeddings)} embeddings.")
print(embeddings[0:2])

# Step 4: Store embeddings in Milvus
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection

# Connect to Milvus
connections.connect("default", host="localhost", port="19530")

# Define the schema for the collection
# Upper bound for the stored text. Milvus enforces the VARCHAR max_length on
# the encoded byte length, not on the number of characters.
max_text_bytes = 2000
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=384),  # Adjust dimensions to match your model
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=max_text_bytes),  # Store corresponding text
]

schema = CollectionSchema(fields, description="Document section embeddings")
collection = Collection(name="document_embeddings", schema=schema)

# Truncate on the UTF-8 byte length so multi-byte text cannot exceed the
# VARCHAR limit; errors="ignore" drops a character cut in half by the slice.
text_data = [
    section.encode("utf-8")[:max_text_bytes].decode("utf-8", errors="ignore")
    for section in text_sections
]

# Insert embeddings and text into Milvus
# The primary key is declared with auto_id=True, so Milvus generates the ids
# itself: only the non-primary columns are passed, in schema order. Supplying
# an explicit id column here is rejected as a column/schema mismatch.
insert_result = collection.insert([embeddings.cpu().numpy().tolist(), text_data])
ids = list(insert_result.primary_keys)  # ids actually assigned by Milvus

# Persist the inserted rows before reporting success.
collection.flush()

# A vector field must have an index before the collection can be loaded.
# NOTE(review): IVF_FLAT with L2 is a generic default; confirm the metric
# against how the vectors will be searched.
if not collection.has_index():
    collection.create_index(
        field_name="embedding",
        index_params={
            "index_type": "IVF_FLAT",
            "metric_type": "L2",
            "params": {"nlist": 128},
        },
    )

# Load the collection into memory
collection.load()

print("Embeddings successfully stored in Milvus!")

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 227.1/227.1 kB 9.9 MB/s eta 0:00:00
Collecting pymupdf
  Downloading PyMuPDF-1.24.10-cp311-none-macosx_11_0_arm64.whl (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting pymilvus
  Downloading pymilvus-2.4.6-py3-none-any.whl (197 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m197.8/197.8 kB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
Collecting PyMuPDFb==1.24.10 (from pymupdf)
  Downloading PyMuPDFb-1.24.10-py3-none-macosx_11_0_arm64.whl (15.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 15.1/15.1 MB 18.8 MB/s eta 0:00:00
Collecting setuptools>69 (from pymilvus)
  Downloading setuptools-74.1.2-py3-none

FileNotFoundError: no such file: 'sample.pdf'