In [2]:
import json
import os
import time

import requests

# Configuration
# Connection settings can be overridden through environment variables so that
# real credentials never need to be written into the notebook. The fallbacks
# are the values this notebook has always used for the local demo cluster.
OPENSEARCH_URL = os.environ.get("OPENSEARCH_URL", "http://localhost:19200")
AUTH = (
    os.environ.get("OPENSEARCH_USER", "admin"),
    os.environ.get("OPENSEARCH_PASSWORD", "OpenSearch@2024"),
)
# Passed as `verify=` to requests; only matters when OPENSEARCH_URL is https.
VERIFY_SSL = False

def run_request(method, endpoint, body=None, params=None):
    """
    Send one HTTP request to the OpenSearch cluster and print the outcome.

    Args:
        method: HTTP verb; one of "GET", "POST", "PUT" or "DELETE".
        endpoint: Path appended to OPENSEARCH_URL after a "/" separator
            (e.g. "books/_search").
        body: Optional JSON-serialisable request body.
        params: Optional dict of query-string parameters.

    Returns:
        The requests.Response once a response was received (whatever its
        status code), or None when the method is unsupported or the request
        itself failed (connection refused, timeout, ...). Callers that go on
        to use the response should check for None first.
    """
    supported_methods = ("GET", "POST", "PUT", "DELETE")
    if method not in supported_methods:
        # Previously this fell through every branch and surfaced as a
        # confusing "local variable 'response' referenced before assignment".
        print(f"Error: unsupported HTTP method {method!r} (expected one of {supported_methods})")
        return None

    url = f"{OPENSEARCH_URL}/{endpoint}"
    headers = {"Content-Type": "application/json"}
    try:
        # requests.request dispatches on the verb, replacing four identical branches.
        response = requests.request(
            method,
            url,
            auth=AUTH,
            verify=VERIFY_SSL,
            params=params,
            json=body,
            headers=headers,
        )
    except requests.RequestException as e:
        # Transport-level failure: report it and signal "no response" explicitly.
        print(f"Error: {e}")
        return None

    print(f"--- {method} {endpoint} ---")
    print(f"Status: {response.status_code}")
    try:
        print(json.dumps(response.json(), indent=2))
    except ValueError:
        # The body is not valid JSON (empty or plain text): show it verbatim.
        print(response.text)
    print("-" * 50)
    return response

# Confirms that this cell ran to the end; no request is sent to the cluster here.
print("✅ Setup Complete")

✅ Setup Complete


## 1. Search Options (Pagination, Sorting, Highlighting)

Standard search features that enhance the user experience.

*   **Pagination**: `from` and `size`.
*   **Sorting**: `sort` array.
*   **Highlighting**: `highlight` object to show where matches occurred.

In [3]:
# Setup: start from a clean "books" index with an explicit mapping
index_name = "books"
run_request("DELETE", index_name)  # answers 404 when the index does not exist yet
run_request(
    "PUT",
    index_name,
    {
        "mappings": {
            "properties": {
                "title": {"type": "text"},
                "author": {"type": "keyword"},
                "price": {"type": "float"},
                "description": {"type": "text"},
            }
        }
    },
)

# Sample documents; they are stored under the IDs 1..N in list order
docs = [
    {
        "title": "The Art of Search",
        "author": "Alice",
        "price": 10.50,
        "description": "A deep dive into search engines.",
    },
    {
        "title": "Advanced OpenSearch",
        "author": "Bob",
        "price": 25.00,
        "description": "Mastering OpenSearch features.",
    },
    {
        "title": "Search Patterns",
        "author": "Alice",
        "price": 15.75,
        "description": "Common patterns in search UI.",
    },
    {
        "title": "Data Structures",
        "author": "Charlie",
        "price": 30.00,
        "description": "Algorithms and data structures.",
    },
]

# One request per document, each with a forced refresh
for doc_id, doc in enumerate(docs, start=1):
    run_request("POST", f"{index_name}/_doc/{doc_id}?refresh=true", doc)

# Demo: Sort by Price DESC, Highlight 'search', Page 1 (Size 2)
query = {
    "from": 0,
    "size": 2,
    "query": {"match": {"description": "search"}},
    "sort": [{"price": {"order": "desc"}}],
    "highlight": {"fields": {"description": {}}},
}

run_request("GET", f"{index_name}/_search", query)

--- DELETE books ---
Status: 404
{
  "error": {
    "root_cause": [
      {
        "type": "index_not_found_exception",
        "reason": "no such index [books]",
        "index": "books",
        "resource.id": "books",
        "resource.type": "index_or_alias",
        "index_uuid": "_na_"
      }
    ],
    "type": "index_not_found_exception",
    "reason": "no such index [books]",
    "index": "books",
    "resource.id": "books",
    "resource.type": "index_or_alias",
    "index_uuid": "_na_"
  },
  "status": 404
}
--------------------------------------------------
--- PUT books ---
Status: 200
{
  "acknowledged": true,
  "shards_acknowledged": true,
  "index": "books"
}
--------------------------------------------------
--- POST books/_doc/1?refresh=true ---
Status: 201
{
  "_index": "books",
  "_id": "1",
  "_version": 1,
  "result": "created",
  "forced_refresh": true,
  "_shards": {
    "total": 2,
    "successful": 1,
    "failed": 0
  },
  "_seq_no": 0,
  "_primary_term": 1


<Response [200]>

## 2. Keyword Search (BM25 Configuration)

OpenSearch uses BM25 by default. You can tune `k1` (saturation) and `b` (length normalization).

*   **k1**: Controls how quickly term frequency saturation is reached. Default 1.2.
*   **b**: Controls how much document length affects the score. Default 0.75.

**Note:** Changing these requires closing the index or setting them at creation time.

In [None]:
# Create index with custom BM25 settings
bm25_index = "bm25_tuned"
run_request("DELETE", bm25_index)  # start clean so the settings below always apply
bm25_resp = run_request("PUT", bm25_index, {
    "settings": {
        "index": {
            "similarity": {
                "my_custom_bm25": {
                    "type": "BM25",
                    "k1": 1.5,  # Higher saturation cap
                    "b": 0.5    # Less penalty for long fields
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "content": {
                "type": "text",
                # Field-level reference to the similarity defined in the settings above
                "similarity": "my_custom_bm25"
            }
        }
    }
})

# Only report success when the cluster actually accepted the request;
# run_request returns None when the request could not be sent at all.
if bm25_resp is not None and bm25_resp.ok:
    print("Index created with custom BM25 similarity.")
else:
    print("Index creation failed; see the response printed above.")

## 3. Search Pipelines

Search pipelines allow you to intercept and modify requests (before they hit the shard) and responses (before they return to the user).

**Common Processors:**
*   `filter_query`: Enforce security or business logic (e.g., only show public docs).
*   `rename_field`: Rename fields in the response.
*   `script`: Modify the search request using an inline script.

In [None]:
# 1. Create a Search Pipeline
#    Request side: add a filter so only visibility == "public" matches.
#    Response side: rename the "price" field to "cost".
pipeline_id = "my_security_pipeline"
pipeline_body = {
    "request_processors": [
        {
            "filter_query": {
                "tag": "enforce_public",
                "description": "Only show public documents",
                "query": {"term": {"visibility": "public"}},
            }
        }
    ],
    "response_processors": [
        {"rename_field": {"field": "price", "target_field": "cost"}}
    ],
}

run_request("PUT", f"_search/pipeline/{pipeline_id}", pipeline_body)

# 2. Index Data (Some public, some private)
pipeline_index = "pipeline_demo"
run_request("DELETE", pipeline_index)
run_request(
    "PUT",
    pipeline_index,
    {"mappings": {"properties": {"visibility": {"type": "keyword"}}}},
)
run_request(
    "POST",
    f"{pipeline_index}/_doc/1?refresh=true",
    {"title": "Public Doc", "visibility": "public", "price": 100},
)
run_request(
    "POST",
    f"{pipeline_index}/_doc/2?refresh=true",
    {"title": "Private Doc", "visibility": "private", "price": 200},
)

# 3. Search WITHOUT Pipeline (Sees everything)
print("\n--- Search WITHOUT Pipeline ---")
run_request("GET", f"{pipeline_index}/_search")

# 4. Search WITH Pipeline (Filtered & Renamed)
#    The pipeline is selected per request via the search_pipeline query parameter.
print("\n--- Search WITH Pipeline ---")
run_request("GET", f"{pipeline_index}/_search?search_pipeline={pipeline_id}")

## 4. Asynchronous Search

Asynchronous search is meant for heavy queries that might otherwise time out: run them in the background and poll for the results.

**Flow:**
1.  `POST _plugins/_asynchronous_search` -> Returns an ID.
2.  `GET _plugins/_asynchronous_search/<ID>` -> Poll for status/results.
3.  `DELETE _plugins/_asynchronous_search/<ID>` -> Cleanup.

In [None]:
# 1. Submit Async Search
async_body = {
    "size": 10,
    "query": {
        "match_all": {}
    }
}
# Ask the cluster to keep the result after the search completes. Without this,
# a search that finishes within the initial wait returns its hits inline and no
# ID, so there would be nothing to poll or delete in steps 2 and 3.
async_params = {"keep_on_completion": "true"}

print("--- Submitting Async Search ---")
resp = run_request("POST", "_plugins/_asynchronous_search", async_body, async_params)
# run_request returns None when the request could not be sent at all.
search_id = resp.json().get("id") if resp is not None else None

if search_id:
    print(f"\nSearch ID: {search_id}")
    # 2. Poll for Results
    print("--- Polling Results ---")
    run_request("GET", f"_plugins/_asynchronous_search/{search_id}")

    # 3. Delete
    print("--- Deleting Async Search ---")
    run_request("DELETE", f"_plugins/_asynchronous_search/{search_id}")
else:
    print("No async search ID returned; skipping poll and delete.")

## 5. SQL and PPL

Query OpenSearch using familiar SQL or the pipe-based PPL (Piped Processing Language).

**SQL Endpoint:** `_plugins/_sql`
**PPL Endpoint:** `_plugins/_ppl`

In [None]:
# Both plugins take a JSON body with a single "query" string; the same filter
# (price > 15, returning title and price) is expressed once per language.
sql_query = dict(query=f"SELECT title, price FROM {index_name} WHERE price > 15")
ppl_query = dict(query=f"source={index_name} | where price > 15 | fields title, price")

# 1. SQL Query
print("--- SQL Query ---")
run_request("POST", "_plugins/_sql", sql_query)

# 2. PPL Query
print("\n--- PPL Query ---")
run_request("POST", "_plugins/_ppl", ppl_query)

## 6. Cross-Cluster Search (CCS)

Search across multiple clusters as if they were one.

**Setup (Conceptual):**
1.  Configure remote cluster seeds in `_cluster/settings`.
2.  Search using `remote_cluster:index_name`.

```json
PUT _cluster/settings
{
  "persistent": {
    "cluster.remote": {
      "cluster_two": {
        "seeds": ["192.168.1.10:9300"]
      }
    }
  }
}
```

**Query:**
```json
GET /cluster_two:my_index/_search
{
  "query": { "match_all": {} }
}
```

## 7. User Behavior Insights (UBI)

UBI is a standard for collecting user interaction data (clicks, queries) to improve search relevance.

**Key Components:**
*   **Schema**: Defines how to store events.
*   **Instrumentation**: Client-side (JS) or Server-side collection.
*   **Analysis**: Using the data to train Learning to Rank models.

**Workflow:**
1.  User searches -> Log Query ID & Results.
2.  User clicks result -> Log Click Event linked to Query ID.
3.  Analyze Click-Through Rate (CTR) per query.

## 8. Learning to Rank (LTR)

Learning to Rank uses machine learning to re-rank search results based on features (BM25 score, recency, click data).

**Steps:**
1.  **Feature Logging**: Log features for documents (e.g., "How much did this doc match 'apple'?").
2.  **Training**: Use XGBoost/RankLib to train a model using judgment lists (human or UBI data).
3.  **Upload Model**: Upload the trained model to OpenSearch.
4.  **Rescore**: Use the model in a `rescore` query.

```json
GET /_search
{
  "query": { "match": { "text": "apple" } },
  "rescore": {
    "window_size": 100,
    "query": {
      "rescore_query": {
        "sltr": {
          "params": { "keywords": "apple" },
          "model": "my_ltr_model"
        }
      }
    }
  }
}
```

## 9. Search Relevance Workbench (API & UI)

The **Search Relevance Workbench** allows you to experiment with queries and pipelines to improve search quality.
While primarily a UI tool in OpenSearch Dashboards, it is backed by a powerful API that lets you automate experiments.

**Key Concepts:**
*   **Query Set**: A collection of queries (e.g., "tv", "laptop") to test.
*   **Search Configuration**: A template defining *how* to search (e.g., "boost title by 10").
*   **Experiment**: A comparison between two configurations using a query set.

**Prerequisite**: Enable the workbench backend.
```json
PUT _cluster/settings
{
  "persistent" : {
    "plugins.search_relevance.workbench_enabled" : true
  }
}
```

In [4]:
# 1. Enable the Workbench Plugin
print("--- Enabling Workbench ---")
run_request("PUT", "_cluster/settings", {
    "persistent": {
        "plugins.search_relevance.workbench_enabled": True
    }
})

# 2. Create a Query Set
# This defines WHAT users are searching for.
query_set_body = {
    "name": "Electronics Queries",
    "description": "Common search terms for electronics",
    "sampling": "manual",
    "querySetQueries": [
        { "queryText": "tv" },
        { "queryText": "laptop" }
    ]
}

print("\n--- Creating Query Set ---")
qs_resp = run_request("PUT", "_plugins/_search_relevance/query_sets", query_set_body)
# run_request returns None when the request could not be sent; in that case
# leave query_set_id as None so the later cells, which test it, skip cleanly
# instead of this cell failing with an AttributeError.
query_set_id = qs_resp.json().get("query_set_id") if qs_resp is not None else None
print(f"Query Set ID: {query_set_id}")

--- Enabling Workbench ---
--- PUT _cluster/settings ---
Status: 200
{
  "acknowledged": true,
  "persistent": {
    "plugins": {
      "search_relevance": {
        "workbench_enabled": "true"
      }
    }
  },
  "transient": {}
}
--------------------------------------------------

--- Creating Query Set ---
--- PUT _plugins/_search_relevance/query_sets ---
Status: 200
{
  "query_set_id": "c9a91669-1f74-4031-bba4-7b0f1bc70d6c",
  "query_set_result": "CREATED"
}
--------------------------------------------------
Query Set ID: c9a91669-1f74-4031-bba4-7b0f1bc70d6c


In [5]:
# 3. Create Search Configurations
# The "query" field must be a JSON *string*. It is built with json.dumps from a
# plain dict rather than hand-escaped, which yields exactly the same text and
# cannot drift into invalid JSON. %SearchText% is the template placeholder.

# Config A: Standard Match
config_a_body = {
    "name": "config_standard",
    "query": json.dumps({"query": {"match": {"description": "%SearchText%"}}}),
    "index": index_name
}

# Config B: Boosted Title
config_b_body = {
    "name": "config_boosted",
    "query": json.dumps({
        "query": {
            "bool": {
                "should": [
                    {"match": {"description": "%SearchText%"}},
                    {"match": {"title": {"query": "%SearchText%", "boost": 10}}}
                ]
            }
        }
    }),
    "index": index_name
}

# Always define the IDs so later cells can test them even when creation is
# skipped or fails.
config_a_id = None
config_b_id = None

if query_set_id:
    print("\n--- Creating Search Config A ---")
    resp_a = run_request("PUT", "_plugins/_search_relevance/search_configurations", config_a_body)
    # run_request returns None when the request could not be sent.
    config_a_id = resp_a.json().get("search_configuration_id") if resp_a is not None else None

    print("\n--- Creating Search Config B ---")
    resp_b = run_request("PUT", "_plugins/_search_relevance/search_configurations", config_b_body)
    config_b_id = resp_b.json().get("search_configuration_id") if resp_b is not None else None

    print(f"Config IDs: {config_a_id}, {config_b_id}")


--- Creating Search Config A ---
--- PUT _plugins/_search_relevance/search_configurations ---
Status: 200
{
  "search_configuration_id": "536ffc5a-f661-42dd-9984-729ef4cf806a",
  "search_configuration_result": "CREATED"
}
--------------------------------------------------

--- Creating Search Config B ---
--- PUT _plugins/_search_relevance/search_configurations ---
Status: 200
{
  "search_configuration_id": "8d1909bb-2628-4858-b5af-8192f4b5f3a9",
  "search_configuration_result": "CREATED"
}
--------------------------------------------------
Config IDs: 536ffc5a-f661-42dd-9984-729ef4cf806a, 8d1909bb-2628-4858-b5af-8192f4b5f3a9


In [6]:
# 4. Run Comparison Experiment
if query_set_id and config_a_id and config_b_id:
    experiment_body = {
        "querySetId": query_set_id,
        "searchConfigurationList": [config_a_id, config_b_id],
        "size": 5,
        "type": "PAIRWISE_COMPARISON"
    }

    print("\n--- Running Experiment ---")
    exp_resp = run_request("PUT", "_plugins/_search_relevance/experiments", experiment_body)
    # run_request returns None when the request could not be sent.
    experiment_id = exp_resp.json().get("experiment_id") if exp_resp is not None else None

    if experiment_id:
        print(f"Experiment ID: {experiment_id}")
        time.sleep(1) # Wait for async execution

        print("\n--- Fetching Results ---")
        results = run_request("GET", f"_plugins/_search_relevance/experiments/{experiment_id}")
        # The GET answers with a search-style envelope: the experiment document
        # sits at hits.hits[0]._source, not at a top-level "_source" key, so
        # reading the top level always produced None.
        hits = results.json().get("hits", {}).get("hits", []) if results is not None else []
        status = hits[0].get("_source", {}).get("status") if hits else None
        print(f"Status: {status}")


--- Running Experiment ---
--- PUT _plugins/_search_relevance/experiments ---
Status: 200
{
  "experiment_id": "d0931782-b99c-4063-ac5e-e16f2d3198d9",
  "experiment_result": "CREATED"
}
--------------------------------------------------
Experiment ID: d0931782-b99c-4063-ac5e-e16f2d3198d9

--- Fetching Results ---
--- GET _plugins/_search_relevance/experiments/d0931782-b99c-4063-ac5e-e16f2d3198d9 ---
Status: 200
{
  "took": 1,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 1,
      "relation": "eq"
    },
    "max_score": 1.0,
    "hits": [
      {
        "_index": ".plugins-search-relevance-experiment",
        "_id": "d0931782-b99c-4063-ac5e-e16f2d3198d9",
        "_score": 1.0,
        "_source": {
          "id": "d0931782-b99c-4063-ac5e-e16f2d3198d9",
          "timestamp": "2025-11-29T00:14:34.971Z",
          "type": "PAIRWISE_COMPARISON",
          "status": "COMPLETED",