# 🔎 OpenSearch Search Examples

This notebook demonstrates how to run different types of search queries over
a local OpenSearch index containing arXiv papers. The queries include:

1. BM25 full-text search
2. Dense vector (semantic) search using BGE embeddings
3. Hybrid search combining both approaches

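Requirements:
- OpenSearch running at localhost:9200
- An `arxiv-papers` index already populated with documents that have a `chunk_text` text field and an `embedding` vector field
- Local embedding API available at localhost:8000
- The `opensearch-py` and `requests` Python packages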

In [None]:
from opensearchpy import OpenSearch
import requests

In [None]:
# Name of the OpenSearch index that every search cell below queries.
INDEX_NAME = "arxiv-papers"

# Endpoint of the local embedding API; get_embedding() POSTs the query text here.
EMBEDDING_API_URL = "http://localhost:8000/v1/embeddings"

In [None]:
# Build the OpenSearch client used by every search cell below.
# Target: the node listening on localhost:9200.
opensearch_hosts = [{"host": "localhost", "port": 9200}]

# Compression is requested; certificate verification and SSL warnings are
# switched off, matching a local development setup.
client = OpenSearch(
    hosts=opensearch_hosts,
    http_compress=True,
    verify_certs=False,
    ssl_show_warn=False,
)

In [None]:
def get_embedding(text, timeout=30.0):
    """
    Query the local embedding API to obtain a dense vector
    using the BGE model.

    Args:
        text: The string to embed. It is sent as a one-element ``input`` list.
        timeout: Seconds to wait for the API before giving up, so that an
            unreachable or stalled server cannot hang the kernel.

    Returns:
        The ``embedding`` entry of the first item in the response's ``data``
        list (presumably a list of floats -- confirm against the API).

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    payload = {"input": [text]}
    headers = {"Content-Type": "application/json"}
    response = requests.post(
        EMBEDDING_API_URL, json=payload, headers=headers, timeout=timeout
    )
    # Fail loudly on HTTP errors instead of hitting a confusing KeyError /
    # JSON decode error when the error body is parsed below.
    response.raise_for_status()
    return response.json()["data"][0]["embedding"]

# 1. BM25 Full-Text Search

In [None]:
query_text = "retrieval augmented generation"

# Lexical search: match the query against the chunk text, keep the top five.
bm25_query = {
    "size": 5,
    "query": {
        "match": {"chunk_text": query_text},
    },
}

response = client.search(index=INDEX_NAME, body=bm25_query)

print("🔎 Top BM25 Results:\n")
for hit in response["hits"]["hits"]:
    print("📄 Score:", round(hit["_score"], 2))
    # Show only the first 300 characters of each chunk to keep output short.
    snippet = hit["_source"]["chunk_text"][:300]
    print(snippet, "...\n")

# 2. Dense Vector Search (k-NN)

In [None]:
# Embed the same query text that the BM25 cell used.
embedding = get_embedding(query_text)

# Nearest-neighbour lookup on the `embedding` field: k=5, five hits returned.
knn_query = {
    "size": 5,
    "query": {
        "knn": {
            "embedding": {"vector": embedding, "k": 5},
        },
    },
}

response = client.search(index=INDEX_NAME, body=knn_query)

print("📐 Top Vector Search Results:\n")
for hit in response["hits"]["hits"]:
    print("📄 Score:", round(hit["_score"], 2))
    # Show only the first 300 characters of each chunk to keep output short.
    snippet = hit["_source"]["chunk_text"][:300]
    print(snippet, "...\n")

# 3. Hybrid Search: BM25 + Vector Similarity

In [None]:
# Hybrid ranking: the inner `match` query selects candidates and supplies a
# BM25 relevance score (`_score`); the script adds the shifted cosine
# similarity between the query vector and each document's `embedding`.
#
# In a script_score query the script's return value *is* the final score, so
# `_score` must be referenced explicitly -- otherwise the BM25 relevance is
# discarded and the ranking is purely vector-based.
#
# NOTE(review): the two terms are on different scales (cosine + 1.0 lies in
# [0, 2]; BM25 has no fixed upper bound), so BM25 may dominate the sum.
# Consider weighting or normalising the terms if the ranking looks lexical-only.
hybrid_query = {
    "size": 5,
    "query": {
        "script_score": {
            "query": {"match": {"chunk_text": query_text}},
            "script": {
                "source": "_score + cosineSimilarity(params.query_vector, doc['embedding']) + 1.0",
                "params": {"query_vector": embedding},
            },
        }
    },
}

response = client.search(index=INDEX_NAME, body=hybrid_query)

print("🧪 Top Hybrid Search Results:\n")
for hit in response["hits"]["hits"]:
    print("📄 Score:", round(hit["_score"], 2))
    # Show only the first 300 characters of each chunk to keep output short.
    print(hit["_source"]["chunk_text"][:300], "...\n")