In [None]:
# @title ###### Licensed to the Apache Software Foundation (ASF), Version 2.0 (the "License")

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License

# Embedding Ingestion and Vector Search with Apache Beam and Milvus

<table align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/apache/beam/blob/master/examples/notebooks/beam-ml/milvus_vector_ingestion_and_search.ipynb"><img src="https://raw.githubusercontent.com/google/or-tools/main/tools/colab_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/apache/beam/blob/master/examples/notebooks/beam-ml/milvus_vector_ingestion_and_search.ipynb"><img src="https://raw.githubusercontent.com/google/or-tools/main/tools/github_32px.png" />View source on GitHub</a>
  </td>
</table>

This notebook demonstrates a complete RAG (Retrieval-Augmented Generation) workflow using Apache Beam and [Milvus](https://milvus.io/). 

The example shows both ingestion and search phases:

**Part 1: Vector Ingestion** - Using the Apache Beam [Milvus I/O connector](https://beam.apache.org/documentation/io/built-in/milvus/) to:
- Process documents into chunks
- Generate embeddings
- Store vectors and metadata in Milvus

**Part 2: Vector Search** - Using the Apache Beam [Milvus enrichment transform](https://beam.apache.org/documentation/transforms/python/elementwise/enrichment-milvus) to:
- Perform vector similarity search
- Execute hybrid search (vector + keyword)
- Apply metadata filters

## Use Case: Technical Documentation Search

This example implements a technical documentation search system that can:
- Ingest technical articles and documentation
- Perform semantic search to find relevant content
- Use hybrid search for improved accuracy
- Filter results by metadata (tags, categories, etc.)

## Before you begin
Set up your environment and download dependencies.

### Install Apache Beam
To use the Milvus I/O connector and enrichment transform, install Apache Beam version 2.67.0 or later.

In [2]:
# Disable tokenizers parallelism to prevent deadlocks when forking processes
# This avoids the "huggingface/tokenizers: The current process just got forked" warning.
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
# Install required packages
!pip install rich sentence_transformers llama_index --quiet
!pip install apache_beam[milvus,gcp,test,interactive]>=2.67.0 --quiet
!pip install pymilvus --quiet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
# Standard library imports
from collections import defaultdict
from math import ceil
from typing import List, Dict, Any
import tempfile

# Third-party imports
import apache_beam as beam
from apache_beam.ml.rag.types import Chunk, Content, Embedding
from apache_beam.transforms.enrichment import Enrichment
from apache_beam.ml.transforms.base import MLTransform
import numpy as np
import pandas as pd
from pymilvus import DataType, CollectionSchema, FieldSchema, Function, FunctionType, MilvusClient, RRFRanker
from pymilvus.milvus_client import IndexParams
from rich import print_json
from sentence_transformers import SentenceTransformer
from torch import cuda
from llama_index.core.text_splitter import SentenceSplitter

# Apache Beam ML/RAG imports
from apache_beam.ml.rag.ingestion.milvus_search import MilvusVectorWriterConfig, MilvusWriteConfig
from apache_beam.ml.rag.utils import MilvusConnectionConfig
from apache_beam.ml.rag.embeddings.huggingface import HuggingfaceTextEmbeddings
from apache_beam.ml.rag.utils import MilvusConnectionConfig
from apache_beam.ml.rag.enrichment.milvus_search import (
    HybridSearchParameters, 
    KeywordSearchMetrics, 
    KeywordSearchParameters,
    MilvusCollectionLoadParameters, 
    MilvusSearchEnrichmentHandler,
    MilvusSearchParameters, 
    SearchStrategy, 
    VectorSearchMetrics, 
    VectorSearchParameters
)
from apache_beam.ml.rag.ingestion.jdbc_common import WriteConfig 
from apache_beam.ml.rag.test_utils import MilvusTestHelpers

# Part 1: Vector Ingestion

In this section, we'll demonstrate how to use Apache Beam's Milvus I/O connector to ingest documents, generate embeddings, and store them in Milvus.

## Collect Sample Data

The following content represents technical documentation that would typically be processed in a RAG system.

In [5]:
corpus = [
  {
    "id": "1",
    "title": "Apache Beam: Unified Model for Batch and Streaming Data",
    "keywords": ["Apache Beam", "stream processing", "batch processing", "data pipelines", "SDK"],
    "tags": ["Data Engineering", "Open Source", "Streaming", "Batch", "Big Data"],
    "category": "framework",
    "content": (
      "Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. "
      "Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. "
      "Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. "
      "The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. "
      "Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. "
      "Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. "
      "Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. "
      "It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. "
      "Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. "
      "This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. "
      "The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. "
      "The Beam model is based on a unified programming model that decouples pipeline logic from execution. "
      "This makes it easier to reason about time and state in both batch and streaming pipelines. "
      "Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. "
      "Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. "
      "Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. "
      "Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures."
    )
  },
  {
    "id": "2",
    "title": "Google Cloud Dataflow: Run Apache Beam in the Cloud",
    "keywords": ["Google Cloud", "Dataflow", "Apache Beam", "serverless", "stream and batch"],
    "tags": ["Cloud Computing", "Data Pipelines", "Google Cloud", "Serverless", "Enterprise"],
    "category": "cloud-service",
    "content": (
      "Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. "
      "It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. "
      "Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. "
      "Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. "
      "Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. "
      "Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. "
      "With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. "
      "It's a key component for architects building scalable, cloud-native data platforms. "
      "Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. "
      "Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. "
      "Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments. "
      "For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. "
      "It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. "
      "Dataflow's streaming engine significantly improves performance and reduces costs by decoupling compute and state management. "
      "In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. "
      "Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing."
    )
  },
  {
    "id": "3",
    "title": "Apache Kafka: Distributed Streaming Platform",
    "keywords": ["Apache Kafka", "distributed systems", "event streaming", "message queue", "real-time"],
    "tags": ["Streaming", "Messaging", "Distributed Systems", "Open Source", "Real-time"],
    "category": "streaming-platform",
    "content": (
      "Apache Kafka is a distributed streaming platform that enables you to build real-time data pipelines and streaming applications. "
      "Originally developed by LinkedIn and later open-sourced as an Apache project, Kafka handles high-throughput, fault-tolerant streaming of data between systems. "
      "Kafka is built around the concept of distributed commit logs and provides a unified platform for handling all real-time data feeds. "
      "The platform excels at three key capabilities: publishing and subscribing to streams of records, storing streams of records durably and reliably, and processing streams of records in real-time. "
      "Kafka's distributed architecture allows it to scale horizontally across multiple servers, providing both high availability and fault tolerance. "
      "Key components include producers (which send data to Kafka), consumers (which read data from Kafka), brokers (Kafka servers), and Kafka Connect for integration with external systems. "
      "The platform supports various messaging patterns including pub-sub, point-to-point, and request-reply, making it versatile for different use cases. "
      "Kafka is widely used for building data lakes, stream processing applications, event sourcing architectures, and real-time analytics systems. "
      "It integrates seamlessly with popular stream processing frameworks like Apache Beam, Apache Flink, and Apache Storm. "
      "Common use cases include activity tracking, metrics collection, log aggregation, stream processing, event sourcing, and commit log services. "
      "Kafka's ecosystem includes Kafka Streams for building streaming applications directly, Schema Registry for managing data schemas, and various connectors for integration. "
      "The platform is designed to handle millions of messages per second with low latency, making it suitable for mission-critical applications."
    )
  }
]

print(f"Created corpus with {len(corpus)} documents")

Created corpus with 3 documents


## Exploratory Data Analysis (EDA)

### Average Words/Tokens per Document

In [6]:
contents = [c['content'] for c in corpus]
content_lengths = [len(content.split(" ")) for content in contents]
df = pd.DataFrame(content_lengths, columns=['# Words'])
df.describe()

Unnamed: 0,# Words
count,3.0
mean,277.0
std,35.0
min,242.0
25%,259.5
50%,277.0
75%,294.5
max,312.0


In [7]:
mean_word_count = ceil(np.mean(content_lengths))
token_to_word_ratio = 1.3
approx_token_count = ceil(mean_word_count * token_to_word_ratio)
print(f'The mean word count for each document is about {mean_word_count} words, which corresponds to a rough token count of {approx_token_count} tokens.')

The mean word count for each document is about 277 words, which corresponds to a rough token count of 361 tokens.


## Document Processing and Chunking

### Text Splitting Strategy

We'll use sentence splitting as the chunking strategy for optimal semantic coherence. The chunk size is constrained by our embedding model's token limit.

In [8]:
# The `chunk_size` parameter is constrained by the embedding model we're using.
# Since we're using `sentence-transformers/all-MiniLM-L6-v2`, which has a maximum token limit of ~384 tokens,
# we need to ensure chunk sizes stay well within that limit.
chunk_size = 256
llama_txt_splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=20)

In [9]:
def split_contents(corpus: list[dict], text_splitter: SentenceSplitter, content_field: str='content') -> list[list[str]]:
    result = []
    for doc in corpus:
        split = text_splitter.split_text(doc[content_field])
        result.append(split)
    return result

In [10]:
content_splits = split_contents(corpus, llama_txt_splitter, "content")

## Create Chunks for Ingestion

We'll convert our processed documents into `Chunk` objects that can be used with Apache Beam's ML transforms.

In [11]:
def create_chunks_from_corpus(corpus: list[dict], content_splits: list[list[str]]) -> list[Chunk]:
    """Convert corpus and content splits into Chunk objects."""
    chunks = []
    
    for doc_idx, splits in enumerate(content_splits):
        doc = corpus[doc_idx]
        
        for chunk_idx, chunk_text in enumerate(splits):
            chunk_id = f"{doc['id']}_{chunk_idx + 1}"
            
            # Create chunk with metadata from original document
            chunk = Chunk(
                id=chunk_id,
                content=Content(text=chunk_text),
                metadata={
                    "title": doc["title"],
                    "keywords": doc["keywords"],
                    "tags": doc["tags"],
                    "category": doc["category"],
                    "doc_id": doc["id"],
                    "chunk_index": chunk_idx
                }
            )
            chunks.append(chunk)
    
    return chunks

In [12]:
chunks = create_chunks_from_corpus(corpus, content_splits)
print(f"Created {len(chunks)} chunks from {len(corpus)} documents")

Created 6 chunks from 3 documents


## Milvus Vector Ingestion Pipeline

Now we'll create an Apache Beam pipeline that:
1. Takes our `Chunk` objects
2. Generates embeddings using HuggingFace
3. Writes the vectors and metadata to Milvus

### Setup Milvus Database

We'll use a test container for this demonstration. In production, you would connect to your Milvus cluster.

In [13]:
db = None

In [14]:
if not db:
    db = MilvusTestHelpers.start_db_container()

Pulling image testcontainers/ryuk:0.8.1
Container started: be33f4bf998f
Waiting for container <Container: be33f4bf998f> with image testcontainers/ryuk:0.8.1 to be ready ...
Pulling image milvusdb/milvus:latest
Container started: b72a3b645acc
Waiting for container <Container: b72a3b645acc> with image milvusdb/milvus:latest to be ready ...
Waiting for container <Container: b72a3b645acc> with image milvusdb/milvus:latest to be ready ...
Waiting for container <Container: b72a3b645acc> with image milvusdb/milvus:latest to be ready ...
Waiting for container <Container: b72a3b645acc> with image milvusdb/milvus:latest to be ready ...
Waiting for container <Container: b72a3b645acc> with image milvusdb/milvus:latest to be ready ...
Waiting for container <Container: b72a3b645acc> with image milvusdb/milvus:latest to be ready ...
Waiting for container <Container: b72a3b645acc> with image milvusdb/milvus:latest to be ready ...
Waiting for container <Container: b72a3b645acc> with image milvusdb/milv

In [15]:
# Configure connection parameters for both ingestion and search.
milvus_connection_config = MilvusConnectionConfig(
    uri=db.uri, 
    user=db.user, 
    password=db.password, 
    db_name=db.db_id
)

### Configure Milvus Ingestion

We configure the Milvus I/O connector with:
- Connection settings
- Collection name and write batch size
- Embedding model for generating vectors

### Create Collection Manually

Before using the Apache Beam I/O connector, let's create the Milvus collection with proper schema and indexes using the pymilvus client directly.

In [16]:
# Collection name for storing our vectors
collection_name = "beam_technical_docs"

# Create Milvus client
client = MilvusClient(
    uri=db.uri,
    user=db.user,
    password=db.password,
    db_name=db.db_id
)

# Define collection schema with fields that match our RAG use case
fields = [
    FieldSchema(name="id", dtype=DataType.VARCHAR, max_length=100, is_primary=True),
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=384),  # sentence-transformers/all-MiniLM-L6-v2 dimension
    FieldSchema(name="content", dtype=DataType.VARCHAR, max_length=2000),
    # Metadata fields from our chunks
    FieldSchema(name="metadata", dtype=DataType.JSON)   # Complete metadata as JSON
]

schema = CollectionSchema(fields=fields, description="Technical documentation collection for RAG")

# Create collection if it doesn't exist
if client.has_collection(collection_name):
    print(f"Collection '{collection_name}' already exists")
    client.drop_collection(collection_name)
    print(f"Dropped existing collection '{collection_name}'")

client.create_collection(
    collection_name=collection_name,
    schema=schema
)

print(f"Created collection '{collection_name}' with schema:")
for field in fields:
    print(f"  - {field.name}: {field.dtype} {'(primary)' if field.is_primary else ''}")

# Create index for the vector field for efficient similarity search
index_params = IndexParams()
index_params.add_index(
    field_name="embedding",
    index_type="IVF_FLAT",
    metric_type="COSINE",
    params={"nlist": 128}
)

client.create_index(
    collection_name=collection_name,
    index_params=index_params
)

print(f"Created COSINE similarity index for 'embedding' field")
print(f"Collection '{collection_name}' is ready for ingestion!")

Created collection 'beam_technical_docs' with schema:
  - id: 21 (primary)
  - embedding: 101 
  - content: 21 
  - metadata: 23 
Created COSINE similarity index for 'embedding' field
Collection 'beam_technical_docs' is ready for ingestion!


In [17]:
# Configure write settings (collection already created above).
write_config = MilvusWriteConfig(
    collection_name=collection_name,
    write_config=WriteConfig(write_batch_size=100)
)

# Configure Milvus writer
milvus_config = MilvusVectorWriterConfig(
    connection_params=milvus_connection_config,
    write_config=write_config
)

# Configure embedding model
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
huggingface_embedder = HuggingfaceTextEmbeddings(model_name=model_name)

### Run Ingestion Pipeline

This pipeline will:
1. Create chunks from our documents
2. Generate embeddings for each chunk
3. Write everything to Milvus

In [18]:
print("Starting Milvus ingestion pipeline...")

with beam.Pipeline() as p:
    _ = (
        p
        | "Create Chunks" >> beam.Create(chunks)
        | "Generate Embeddings" >> MLTransform(write_artifact_location=tempfile.mkdtemp())
          .with_transform(huggingface_embedder)
        | "Write to Milvus" >> milvus_config.create_write_transform()
    )

print("Ingestion pipeline completed!")

Starting Milvus ingestion pipeline...




Ingestion pipeline completed!


### Verify Ingestion

Let's verify that our data was successfully ingested into Milvus.

In [19]:
# Create a Milvus client to verify data
client = MilvusClient(**milvus_connection_config.__dict__)

# Check if collection exists and get entity count
if client.has_collection(collection_name):
    stats = client.get_collection_stats(collection_name)
    print(f"Collection '{collection_name}' exists with {stats['row_count']} entities")
else:
    print(f"Collection '{collection_name}' not found")

Collection 'beam_technical_docs' exists with 6 entities


# Part 2: Vector Search and Enrichment

Now that we have ingested our documents into Milvus, we'll demonstrate different search capabilities using Apache Beam's enrichment transform.

## Prepare Search Infrastructure

We'll create helper classes and configure the enrichment transform for different types of searches.

In [20]:
class FormatAndPrintResults(beam.PTransform):
    """Transform to format and display search results."""
    
    def expand(self, pcoll):
        return pcoll | beam.Map(self.format_and_print)
    
    @staticmethod
    def format_and_print(chunk):
        # Create a clean structure to display
        formatted_result = {
            "query": chunk.content.text,
            "query_embedding_dimensions": FormatAndPrintResults.get_embedding_count(chunk),
            "results": []
        }
        
        # Extract the enrichment data
        enrichment_data = chunk.metadata.get('enrichment_data', defaultdict(list))
        
        # Format each result with its distance score
        for i in range(len(enrichment_data.get('id', []))):
            result = {
                "id": enrichment_data['id'][i],
                "distance": round(enrichment_data['distance'][i], 4),
                "fields": enrichment_data['fields'][i] if i < len(enrichment_data.get('fields', [])) else {}
            }
            formatted_result["results"].append(result)
        
        # Sort by distance in descending order (highest/best first)
        formatted_result["results"] = sorted(formatted_result["results"], key=lambda x: x["distance"], reverse=True)

        # Print the formatted JSON
        print_json(data=formatted_result)
        
        # Return the original chunk for further processing if needed
        return chunk

    @staticmethod
    def get_embedding_count(chunk):
        if chunk.embedding and chunk.embedding.dense_embedding:
            return len(chunk.embedding.dense_embedding)
        return 0

In [21]:
# Set up embedding model for query processing
model = SentenceTransformer(model_name)

def get_default_device():
    return "cuda:0" if cuda.is_available() else "cpu"

def encode_embedding(text, device=get_default_device()):
    return list(map(float, model.encode(text, device=device)))

## Vector Similarity Search

First, let's perform a basic vector similarity search to find documents most similar to our query.

In [22]:
# Example query for vector search
query_text = "What is a distributed data processing framework?"
query_embedding = encode_embedding(query_text)

# Configure vector search parameters
search_parameters = MilvusSearchParameters(
    collection_name=collection_name,
    search_strategy=VectorSearchParameters(limit=3, anns_field="embedding"),
    output_fields=["metadata"]
)

collection_load_parameters = MilvusCollectionLoadParameters()

milvus_handler = MilvusSearchEnrichmentHandler(
    connection_parameters=milvus_connection_config,
    search_parameters=search_parameters,
    collection_load_parameters=collection_load_parameters
)

print("Running vector similarity search...")

with beam.Pipeline() as p:
    _ = (
        p
        | "Create Query" >> beam.Create([
            Chunk(
                content=Content(text=query_text),
                embedding=Embedding(dense_embedding=query_embedding)
            )
        ])
        | "Vector Search" >> Enrichment(milvus_handler)
        | "Format Results" >> FormatAndPrintResults()
    )

Running vector similarity search...


## Filtered Search (Metadata Filtering)

Now let's perform a search with metadata filtering to find documents in a specific category.

In [23]:
# Filtered search - only looking for framework-related documents
filtered_query = "How to build data pipelines?"
filtered_embedding = encode_embedding(filtered_query)

# Configure search with category filter
filtered_search_parameters = MilvusSearchParameters(
    collection_name=collection_name,
    search_strategy=VectorSearchParameters(
        filter="metadata['category'] == 'framework'",  # Filter for framework documents
        limit=2,
        anns_field="embedding"
    ),
    output_fields=["metadata"]
)

filtered_handler = MilvusSearchEnrichmentHandler(
    connection_parameters=milvus_connection_config,
    search_parameters=filtered_search_parameters,
    collection_load_parameters=collection_load_parameters
)

print("Running filtered search (category='framework')...")

with beam.Pipeline() as p:
    _ = (
        p
        | "Create Filtered Query" >> beam.Create([
            Chunk(
                content=Content(text=filtered_query),
                embedding=Embedding(dense_embedding=filtered_embedding)
            )
        ])
        | "Filtered Search" >> Enrichment(filtered_handler)
        | "Format Filtered Results" >> FormatAndPrintResults()
    )

Running filtered search (category='framework')...


## Batch Search Processing

Finally, let's demonstrate processing multiple queries in a single pipeline, which is useful for batch enrichment scenarios.

In [24]:
# Multiple queries to process
batch_queries = [
    "What is stream processing?",
    "How to handle real-time data?",
    "What are the benefits of cloud services?",
    "How to build distributed systems?"
]

def create_query_chunk(query_text: str) -> Chunk:
    """Create a chunk with embedded query for search."""
    embedding = encode_embedding(query_text)
    return Chunk(
        content=Content(text=query_text),
        embedding=Embedding(dense_embedding=embedding),
        metadata={"query_type": "batch_search"}
    )

# Configure search for batch processing
batch_search_parameters = MilvusSearchParameters(
    collection_name=collection_name,
    search_strategy=VectorSearchParameters(limit=2, anns_field="embedding"),
    output_fields=["metadata"]
)

batch_handler = MilvusSearchEnrichmentHandler(
    connection_parameters=milvus_connection_config,
    search_parameters=batch_search_parameters,
    collection_load_parameters=collection_load_parameters
)

print("Running batch search processing...")

with beam.Pipeline() as p:
    _ = (
        p
        | "Create Batch Queries" >> beam.Create(batch_queries)
        | "Convert to Chunks" >> beam.Map(create_query_chunk)
        | "Batch Search" >> Enrichment(batch_handler)
        | "Format Batch Results" >> FormatAndPrintResults()
    )

Running batch search processing...


## Performance Considerations

When using Apache Beam with Milvus in production, consider:

### Ingestion Performance
- **Batch Size**: Adjust `write_batch_size` in `MilvusWriteConfig` based on your data size and Milvus cluster capacity
- **Parallel Processing**: Use Apache Beam's natural parallelization for large document collections
- **Embedding Efficiency**: Consider using GPU-enabled workers for embedding generation

### Search Performance
- **Collection Loading**: Ensure collections are loaded into memory for faster searches
- **Index Types**: Choose appropriate index types (IVF_FLAT, HNSW) based on your accuracy/speed requirements
- **Batch Searches**: Process multiple queries together to amortize connection overhead

### Resource Management
- **Connection Pooling**: Configure connection parameters for production workloads
- **Memory Usage**: Monitor memory usage during embedding generation and vector storage
- **Network Optimization**: Place compute close to your Milvus cluster when possible

## Cleanup

Don't forget to clean up resources when you're done.

In [26]:
# Clean up the test database container
if db:
    MilvusTestHelpers.stop_db_container(db)
    db = None
    print("Milvus test container stopped")

Milvus test container stopped


## Resources
- [Apache Beam Milvus I/O Documentation](https://beam.apache.org/documentation/io/built-in/milvus/)
- [Apache Beam ML Transforms](https://beam.apache.org/releases/pydoc/current/apache_beam.ml.transforms.html)
- [Milvus Documentation](https://milvus.io/docs)
- [Apache Beam RAG Package](https://beam.apache.org/releases/pydoc/current/apache_beam.ml.rag.html)