In [1]:
# @title ###### Licensed to the Apache Software Foundation (ASF), Version 2.0 (the "License")

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License

In [2]:
# Standard library imports
from collections import defaultdict
from math import ceil
from typing import List

# Third-party imports
import apache_beam as beam
from apache_beam.ml.rag.types import Chunk, Content, Embedding
from apache_beam.transforms.enrichment import Enrichment
import numpy as np
import pandas as pd
from pymilvus import DataType, CollectionSchema, FieldSchema, Function, FunctionType, MilvusClient, RRFRanker
from pymilvus.milvus_client import IndexParams
from rich import print_json
from sentence_transformers import SentenceTransformer
from torch import cuda

# Local application imports
from llama_index.core.text_splitter import SentenceSplitter
from apache_beam.ml.rag.enrichment.milvus_search import (
    HybridSearchNamespace,
    HybridSearchParameters, 
    KeywordSearchMetrics, 
    KeywordSearchParameters,
    MilvusCollectionLoadParameters, 
    MilvusConnectionParameters, 
    MilvusSearchEnrichmentHandler,
    MilvusSearchParameters, 
    SearchStrategy, 
    VectorSearchMetrics, 
    VectorSearchParameters
)
from apache_beam.ml.rag.enrichment.milvus_search_it_test import MilvusEnrichmentTestHelper

  from .autonotebook import tqdm as notebook_tqdm


# Collect Data

This content has been paraphrased from publicly available information on the internet using a large language model (OpenAI’s GPT-4) and is provided for informational purposes only.

The third data point, related to Google Beam, was intentionally included to illustrate the importance of metadata filtering (filtered search) in Milvus. For example, when a user searches for the term “Beam”, the vector database retrieval engine may confuse Apache Beam with Google Beam unless a metadata filter is applied.

In [3]:
corpus = [
  {
    "id": "1",
    "title": "Apache Beam: Unified Model for Batch and Streaming Data",
    "keywords": ["Apache Beam", "stream processing", "batch processing", "data pipelines", "SDK"],
    "tags": ["Data Engineering", "Open Source", "Streaming", "Batch", "Big Data"],
    "content": (
      "Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. "
      "Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. "
      "Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. "
      "The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. "
      "Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. "
      "Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. "
      "Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. "
      "It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. "
      "Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. "
      "This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. "
      "The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. "
      "The Beam model is based on a unified programming model that decouples pipeline logic from execution. "
      "This makes it easier to reason about time and state in both batch and streaming pipelines. "
      "Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. "
      "Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. "
      "Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. "
      "Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures."
    )
  },
  {
    "id": "2",
    "title": "Google Cloud Dataflow: Run Apache Beam in the Cloud",
    "keywords": ["Google Cloud", "Dataflow", "Apache Beam", "serverless", "stream and batch"],
    "tags": ["Cloud Computing", "Data Pipelines", "Google Cloud", "Serverless", "Enterprise"],
    "content": (
      "Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. "
      "It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. "
      "Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. "
      "Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. "
      "Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. "
      "Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. "
      "With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. "
      "It’s a key component for architects building scalable, cloud-native data platforms. "
      "Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. "
      "Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. "
      "Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments. "
      "For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. "
      "It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. "
      "Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. "
      "In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. "
      "Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing."
    )
  },
  {
    "id": "3",
    "title": "Google Beam: 3D Communication Powered by AI",
    "keywords": ["Google Beam", "Project Starline", "3D video", "AI communication", "real-time meetings"],
    "tags": ["AI", "Communication", "3D Technology", "Remote Work", "Enterprise Tech"],
    "content": (
      "Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. "
      "Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. "
      "This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. "
      "Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. "
      "Powered by Google AI, Beam represents a significant leap in communication technology. "
      "Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. "
      "Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. "
      "Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. "
      "It’s a promising step toward more human and effective remote interactions."
    )
  }
]

# Exploratory Data Analysis (EDA)

## Average Words/Tokens per Doc

In [4]:
# Rough size check: count the whitespace-separated words in each document's content.
# `split()` (no argument) is used so runs of whitespace or newlines never produce
# empty "words" that would inflate the count.
contents = [entry['content'] for entry in corpus]
content_lengths = [len(content.split()) for content in contents]
df = pd.DataFrame(content_lengths, columns=['# Words'])
df.describe()

Unnamed: 0,# Words
count,3.0
mean,253.666667
std,72.85831
min,172.0
25%,224.5
50%,277.0
75%,294.5
max,312.0


In [5]:
# Rule-of-thumb factor used here to turn a word count into an approximate token count.
token_to_word_ratio = 1.3
mean_word_count = ceil(np.mean(content_lengths))
approx_token_count = ceil(token_to_word_ratio * mean_word_count)
print(
    f'The mean word count for each video is about {mean_word_count} words, '
    f'which corresponds to a rough token count of {approx_token_count} tokens.'
)

The mean word count for each video is about 254 words, which corresponds to a rough token count of 331 tokens.


# Preprocess Data

## Chunking

We'll use sentence splitting as the chunking strategy for simplicity.<br>
Ideally, we would pass a tokenizer here — preferably the same one used by the retriever — to ensure consistency.<br>
However, in this example, we are not using a tokenizer.

In [6]:
# The `chunk_size` parameter is constrained by the embedding model we’re using.
# NOTE(review): the model card for `sentence-transformers/all-MiniLM-L6-v2` documents that
# input longer than 256 word pieces is truncated; 384 is the model's embedding dimension,
# not its token limit — confirm against the model card.
# Each document in our dataset contains approximately 331 tokens (estimated above), so a
# chunk size of 256 splits each document into only a few chunks while keeping most of a
# document's context together.
# NOTE(review): `SentenceSplitter` is constructed without an explicit tokenizer here, so its
# token counts may not match the embedding model's tokenizer exactly — verify before relying
# on the chunk size as a hard limit.
chunk_size = 256
llama_txt_splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=20)

In [7]:
def split_contents(corpus: list[dict], text_splitter: SentenceSplitter, content_field: str='content') -> list[list[str]]:
    """Split the text of every corpus entry into chunks.

    Args:
        corpus: Documents to split; each entry must contain `content_field`.
        text_splitter: Splitter whose `split_text` method is applied to each entry's text.
        content_field: Key of the text to split in each entry.

    Returns:
        One list of chunk strings per corpus entry, in corpus order.

    Raises:
        KeyError: If an entry does not contain `content_field`.
    """
    # Use the splitter that was passed in rather than the notebook-level
    # `llama_txt_splitter`, so callers can actually swap splitters.
    return [text_splitter.split_text(entry[content_field]) for entry in corpus]

In [8]:
content_splits = split_contents(corpus, llama_txt_splitter, "content")

In [9]:
print_json(data=content_splits)

## Embedding Generation

In [10]:
# Let's choose `sentence-transformers/all-MiniLM-L6-v2` as our embedding generator here.
# It gives a good balance between embedding generation speed, accuracy, and being free to use.
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

In [11]:
def get_default_device():
    """Return the torch device string: "cuda:0" when CUDA is available, otherwise "cpu"."""
    if cuda.is_available():
        return "cuda:0"
    return "cpu"

In [12]:
def encode_embedding(chunk, device=get_default_device()):
    """Embed `chunk` with the notebook-level `model` and return the vector as a list of Python floats."""
    vector = model.encode(chunk, device=device)
    return [float(component) for component in vector]

In [13]:
def encode_content_splits(content_splits: list[list[str]],
                          model: SentenceTransformer,
                          device: str = get_default_device()
                         ) -> list[list[tuple[str,list]]]:
    """Embed every chunk of every document.

    Args:
        content_splits: One list of chunk strings per document.
        model: Embedding model whose `encode` method is called on each chunk.
        device: Device string forwarded to `model.encode`.

    Returns:
        For each document, a list of `(chunk_text, embedding)` tuples where the
        embedding is a list of Python floats, in the same order as the input.
    """
    # Encode with the `model` argument. Previously this argument was ignored and
    # the notebook-level model was always used, whatever the caller passed in.
    return [
        [(chunk, list(map(float, model.encode(chunk, device=device)))) for chunk in split]
        for split in content_splits
    ]

In [14]:
text_vector_tuples = encode_content_splits(content_splits, model)

## Joining Metadata

In [15]:
def join_metadata(corpus: list[dict], 
                  text_vector_list: list[list[tuple[str, list]]],
                  unique_id_field: str='id',
                  content_field: str='content',
                  embedding_field: str='content_embedding'
                 ) -> list[dict]:
    """Produce one flat record per chunk, carrying over its source document's metadata.

    Each record is a shallow copy of the source document in which `content_field`
    holds the chunk text, `embedding_field` holds the chunk embedding, and the
    document's `unique_id_field` is replaced by a `doc_id` of the form
    "<document id>_<1-based chunk position>".
    """
    records = []
    for doc_index, chunk_pairs in enumerate(text_vector_list):
        source = corpus[doc_index]
        for position, (chunk_text, embedding) in enumerate(chunk_pairs, start=1):
            record = dict(source)
            record.update({content_field: chunk_text, embedding_field: embedding})
            record["doc_id"] = f"{record[unique_id_field]}_{position}"
            record.pop(unique_id_field)
            records.append(record)
    return records

In [16]:
docs = join_metadata(corpus, text_vector_tuples)

# Milvus Sink I/O

This could be delegated to the Beam Milvus Sink I/O once it is implemented. For now, we will use the pymilvus client directly for indexing.

## Setup Milvus

In [17]:
db = None

In [18]:
if not db:
    db = MilvusEnrichmentTestHelper.start_milvus_search_db_container()

Pulling image testcontainers/ryuk:0.8.1
Container started: de7765122cc3
Waiting for container <Container: de7765122cc3> with image testcontainers/ryuk:0.8.1 to be ready ...
Pulling image milvusdb/milvus:v2.5.10
Container started: b2f20de0fef3
Waiting for container <Container: b2f20de0fef3> with image milvusdb/milvus:v2.5.10 to be ready ...
Waiting for container <Container: b2f20de0fef3> with image milvusdb/milvus:v2.5.10 to be ready ...
Waiting for container <Container: b2f20de0fef3> with image milvusdb/milvus:v2.5.10 to be ready ...
Waiting for container <Container: b2f20de0fef3> with image milvusdb/milvus:v2.5.10 to be ready ...
Waiting for container <Container: b2f20de0fef3> with image milvusdb/milvus:v2.5.10 to be ready ...
Waiting for container <Container: b2f20de0fef3> with image milvusdb/milvus:v2.5.10 to be ready ...
Waiting for container <Container: b2f20de0fef3> with image milvusdb/milvus:v2.5.10 to be ready ...
Waiting for container <Container: b2f20de0fef3> with image milvu

In [19]:
milvus_connection_parameters = MilvusConnectionParameters(uri=db.address,
    user=db.user, password=db.password, db_id=db.id)

In [20]:
# Build the raw pymilvus client from the same connection parameters the Beam handler uses later.
# NOTE(review): this forwards every attribute of the parameters object (including `db_id` and
# `kwargs`, per its repr shown further down) as keyword arguments — confirm MilvusClient
# interprets them as intended.
client = MilvusClient(**milvus_connection_parameters.__dict__)

In [21]:
client.get_server_version()

'2.5.10'

## Define Schema

### Define Fields

In [22]:
# Collection fields. `vector` holds the dense embedding (dimension taken from the model),
# `sparse_vector` is the output of the BM25 function defined in the next section, and
# `combined_text` is that function's input (hence `enable_analyzer=True`, presumably
# required for BM25 — verify against the Milvus full-text search docs).
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=model.get_sentence_embedding_dimension()),
    FieldSchema(name="sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR),
    FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=256),
    FieldSchema(name="content", dtype=DataType.VARCHAR, max_length=65279),
    # max_length is the sum of the `title` and `content` limits (65279 + 256 = 65535).
    # NOTE(review): the indexing cell joins title and content with ". ", which adds two
    # characters beyond this sum — confirm inputs cannot reach both limits at once.
    FieldSchema(name="combined_text", dtype=DataType.VARCHAR, max_length=65279+256, enable_analyzer=True),
    FieldSchema(name="doc_id", dtype=DataType.VARCHAR, max_length=100),
    FieldSchema(name="keywords", dtype=DataType.ARRAY, element_type=DataType.VARCHAR, max_length=100, max_capacity=64),
    FieldSchema(name="tags", dtype=DataType.ARRAY, element_type=DataType.VARCHAR, max_length=100, max_capacity=32),
]

## Define Functions for Processing

In [23]:
bm25_function = Function(
    name="content_bm25_emb",
    input_field_names=["combined_text"],
    output_field_names=["sparse_vector"],
    function_type=FunctionType.BM25)

functions = [bm25_function]

In [24]:
schema = CollectionSchema(fields=fields,functions=functions)
schema

{'auto_id': True, 'description': '', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'vector', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}, {'name': 'sparse_vector', 'description': '', 'type': <DataType.SPARSE_FLOAT_VECTOR: 104>, 'is_function_output': True}, {'name': 'title', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 256}}, {'name': 'content', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 65279}}, {'name': 'combined_text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 65535, 'enable_analyzer': True}}, {'name': 'doc_id', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 100}}, {'name': 'keywords', 'description': '', 'type': <DataType.ARRAY: 22>, 'params': {'max_length': 100, 'max_capacity': 64}, 'element_type': <DataType.VARCHAR: 21>}, {'name': 'tags', 'descrip

## Define Indices

In [25]:
index_params = IndexParams()

### Define Dense Vector Index

In [26]:
index_params.add_index(
    field_name="vector",
    index_name="dense_vector_ivf_flat",
    index_type="IVF_FLAT",
    metric_type=VectorSearchMetrics.COSINE.value,
    params={"nlist": 1024})

### Define Sparse Vector Index

In [27]:
index_params.add_index(
    field_name="sparse_vector",
    index_name="sparse_inverted_index",
    index_type="SPARSE_INVERTED_INDEX",
    metric_type=KeywordSearchMetrics.BM25.value,
    params={"inverted_index_algo": "DAAT_MAXSCORE", "bm25_k1": 1.2, "bm25_b": 0.75})

In [28]:
index_params

[{'field_name': 'vector', 'index_type': 'IVF_FLAT', 'index_name': 'dense_vector_ivf_flat', 'nlist': 1024, 'metric_type': 'COSINE'},
 {'field_name': 'sparse_vector', 'index_type': 'SPARSE_INVERTED_INDEX', 'index_name': 'sparse_inverted_index', 'inverted_index_algo': 'DAAT_MAXSCORE', 'bm25_k1': 1.2, 'bm25_b': 0.75, 'metric_type': 'BM25'}]

## Create Collection

In [29]:
collection_name = "beam_minilm_256"

In [30]:
client.drop_collection(collection_name=collection_name)

In [31]:
client.create_collection(collection_name=collection_name, schema=schema, index_params=index_params)

In [32]:
client.has_collection(collection_name)

True

## Index Data

### Index

In [33]:
# Map each chunk record onto the collection's field names. `combined_text` joins the
# title and the chunk text; it is the input field of the BM25 function.
data_ready_to_index = [
    {
        "vector": doc["content_embedding"],
        "content": doc["content"],
        "doc_id": doc["doc_id"],
        "title": doc["title"],
        "keywords": doc["keywords"],
        "tags": doc["tags"],
        "combined_text": f"{doc['title']}. {doc['content']}",
    }
    for doc in docs
]

In [34]:
client.insert(collection_name=collection_name, data=data_ready_to_index)

{'insert_count': 5, 'ids': [458732056809570355, 458732056809570356, 458732056809570357, 458732056809570358, 458732056809570359], 'cost': 0}

### Check the Indexed Data

In [35]:
# Sanity check: query the collection directly by dense-vector similarity.
query_embedding = model.encode("What is apache beam")

search_results = client.search(
    collection_name=collection_name,
    anns_field="vector",
    data=[query_embedding],
    output_fields=["title", "content", "doc_id", "keywords", "tags"],
    limit=5,
)

# One inner list of hits per query vector; print every hit followed by a separator line.
for hit in (hit for hits in search_results for hit in hits):
    print(hit, "---", sep="\n")

# Milvus Beam Enrichment Handler

## Prep for Milvus Beam Enrichment Handler

In [36]:
class FormatAndPrintResults(beam.PTransform):
    """Print each enriched chunk's search hits as JSON and pass the chunk through unchanged."""

    def expand(self, pcoll):
        return pcoll | beam.Map(self.format_and_print)

    @staticmethod
    def format_and_print(chunk):
        """Print the query text, its embedding size and its hits; return `chunk` as-is.

        Hits are read from `chunk.metadata['enrichment_data']` as parallel lists
        under the keys 'id', 'distance' and 'fields', and are printed sorted by
        descending distance.
        """
        query_text = chunk.content.text
        embedding_size = FormatAndPrintResults.get_embedding_count(chunk)

        # A chunk without enrichment data simply yields an empty result list.
        enrichment = chunk.metadata.get('enrichment_data', defaultdict(list))

        hits = []
        for position, hit_id in enumerate(enrichment.get('id', [])):
            hits.append({
                "id": hit_id,
                "distance": round(enrichment['distance'][position], 4),
                # 'fields' may be shorter than 'id'; fall back to an empty mapping.
                "fields": (
                    enrichment['fields'][position]
                    if position < len(enrichment.get('fields', []))
                    else {}
                ),
            })

        # Largest distance value first.
        hits.sort(key=lambda hit: hit["distance"], reverse=True)

        print_json(data={
            "query": query_text,
            "query_embedding": embedding_size,
            "results": hits,
        })

        # Hand the untouched chunk to any downstream step.
        return chunk

    @staticmethod
    def get_embedding_count(chunk):
        """Return the length of the chunk's dense (preferred) or sparse embedding, or None if it has neither."""
        embedding = chunk.embedding
        if not embedding:
            return None
        if embedding.dense_embedding:
            return len(embedding.dense_embedding)
        if embedding.sparse_embedding:
            # NOTE(review): if `sparse_embedding` is an (indices, values) pair, this counts
            # the pair itself rather than its entries — confirm against the Embedding type.
            return len(embedding.sparse_embedding)
        return None

In [37]:
milvus_connection_parameters

MilvusConnectionParameters(uri='http://localhost:32778', user='', password='', db_id='default', token='', timeout=None, kwargs={})

## Vector Search

In [38]:
query = encode_embedding("what is beam?")

In [39]:
search_parameters = MilvusSearchParameters(
    collection_name=collection_name,
    search_strategy=VectorSearchParameters(limit=10, anns_field="vector"),
    output_fields=["title","keywords","tags", "content"])

In [40]:
collection_load_parameters = MilvusCollectionLoadParameters() 

In [41]:
milvus_handler = MilvusSearchEnrichmentHandler(
    connection_parameters=milvus_connection_parameters,
    search_parameters=search_parameters,
    collection_load_parameters=collection_load_parameters)

In [42]:
with beam.Pipeline() as p:
    _ = (
        p
        | "Create" >> beam.Create([Chunk(content=Content(),embedding=Embedding(dense_embedding=query))])
        | "Enrich W/ Milvus Vector Search" >> Enrichment(milvus_handler)
        | "Format and Print Results" >> FormatAndPrintResults())



## Keyword Search

In [43]:
query = "what is beam?"

In [44]:
search_parameters = MilvusSearchParameters(
    collection_name=collection_name,
    search_strategy=KeywordSearchParameters(limit=10,anns_field="sparse_vector"),
    output_fields=["title","keywords","tags", "content"])

In [45]:
collection_load_parameters = MilvusCollectionLoadParameters() 

In [46]:
milvus_handler = MilvusSearchEnrichmentHandler(
    connection_parameters=milvus_connection_parameters,
    search_parameters=search_parameters,
    collection_load_parameters=collection_load_parameters)

In [47]:
with beam.Pipeline() as p:
    _ = (
        p
        | "Create" >> beam.Create([Chunk(content=Content(text=query))])
        | "Enrich W/ Milvus Keyword Search" >> Enrichment(milvus_handler)
        | "Format and Print Results" >> FormatAndPrintResults()
    )

## Hybrid Search

Let’s choose a deliberate query that illustrates the benefits of hybrid search:

Query: "real-time data processing system"

This query demonstrates hybrid search advantages because:

* Dense vector (semantic) contribution: Will understand the conceptual relationship between "real-time processing" and "streaming" (found in docs #1 and #2)
* Sparse vector (keyword) contribution: Will match exact terms like "data" and "processing" (found in docs #1 and #2)
* Hybrid advantage: Document #1 about Apache Beam should rank highest since it contains more specific technical details about real-time processing capabilities like "event time," "triggers," and "stateful processing" - even though the exact phrase "real-time data processing" doesn't appear in any document

In [48]:
query = "real-time data processing system"
query_embedding = encode_embedding(query)

In [49]:
hybrid_search_namespace = HybridSearchNamespace(
    vector=VectorSearchParameters(limit=10,anns_field="vector"),
    keyword=KeywordSearchParameters(limit=10,anns_field="sparse_vector"),
    hybrid=HybridSearchParameters(ranker=RRFRanker(3),limit=2))

In [50]:
search_parameters = MilvusSearchParameters(
    collection_name=collection_name,
    search_strategy=hybrid_search_namespace,
    output_fields=["title","keywords","tags", "content"])

In [51]:
collection_load_parameters = MilvusCollectionLoadParameters() 

In [52]:
milvus_handler = MilvusSearchEnrichmentHandler(
    connection_parameters=milvus_connection_parameters,
    search_parameters=search_parameters,
    collection_load_parameters=collection_load_parameters)

In [53]:
chunk = Chunk(
    content=Content(text=query),
    embedding=Embedding(dense_embedding=query_embedding)
)

chunks = [chunk]

In [54]:
# Run the hybrid (dense + keyword) enrichment for the prepared chunks and print the results.
# The step label names the hybrid search; it previously carried the keyword-search label
# copied from the cell above.
with beam.Pipeline() as p:
    _ = (
        p
        | "Create" >> beam.Create(chunks)
        | "Enrich W/ Milvus Hybrid Search" >> Enrichment(milvus_handler)
        | "Format and Print Results" >> FormatAndPrintResults()
    )

## Filtered Search (Metadata Filtering)

### Searching for Apache Beam

In [55]:
query = encode_embedding("what is beam?")

In [56]:
vector_search_parameters = VectorSearchParameters(
    filter="ARRAY_CONTAINS(keywords, 'data pipelines')",
    limit=10,
    anns_field="vector")

In [57]:
# Reuse the filtered strategy built in the previous cell (`vector_search_parameters`)
# instead of repeating its filter expression inline, so the two cannot drift apart.
search_parameters = MilvusSearchParameters(
    collection_name=collection_name,
    search_strategy=vector_search_parameters,
    output_fields=["title","keywords","tags", "content"])

In [58]:
collection_load_parameters = MilvusCollectionLoadParameters() 

In [59]:
milvus_handler = MilvusSearchEnrichmentHandler(
    connection_parameters=milvus_connection_parameters,
    search_parameters=search_parameters,
    collection_load_parameters=collection_load_parameters)

In [60]:
with beam.Pipeline() as p:
    _ = (
        p
        | "Create" >> beam.Create([Chunk(content=Content(),embedding=Embedding(dense_embedding=query))])
        | "Enrich W/ Milvus Vector Search" >> Enrichment(milvus_handler)
        | "Format and Print Results" >> FormatAndPrintResults())

### Searching for Google Beam

In [61]:
query = encode_embedding("what is beam?")

In [62]:
search_parameters = MilvusSearchParameters(
    collection_name=collection_name,
    search_strategy=VectorSearchParameters(filter="ARRAY_CONTAINS(tags, 'Remote Work')",limit=10,anns_field="vector"),
    output_fields=["title","keywords","tags", "content"])

In [63]:
collection_load_parameters = MilvusCollectionLoadParameters() 

In [64]:
milvus_handler = MilvusSearchEnrichmentHandler(
    connection_parameters=milvus_connection_parameters,
    search_parameters=search_parameters,
    collection_load_parameters=collection_load_parameters)

In [65]:
with beam.Pipeline() as p:
    _ = (
        p
        | "Create" >> beam.Create([Chunk(content=Content(),embedding=Embedding(dense_embedding=query))])
        | "Enrich W/ Milvus Vector Search" >> Enrichment(milvus_handler)
        | "Format and Print Results" >> FormatAndPrintResults())

# Cleanup

In [66]:
# Release the collection, then drop it from the server.
client.release_collection(collection_name=collection_name)
client.drop_collection(collection_name=collection_name)
# Stop the Milvus container started in the setup section and clear the handle, so the
# `if not db:` guard in the start cell launches a fresh container on the next run.
MilvusEnrichmentTestHelper.stop_milvus_search_db_container(db)
db = None