aws · kukushking · Oct 7, 2021 · Aug 31, 2021 · Sep 10, 2021 · Sep 14, 2021
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -215,6 +215,10 @@ or
 ``./deploy-base.sh``
 ``./deploy-databases.sh``
 
+* [OPTIONAL] Deploy the CloudFormation template `opensearch.yaml` (only needed if you want to test Amazon OpenSearch Service). This step can take about 15 minutes.
+
+``./deploy-opensearch.sh``
+
 * Go to the `EC2 -> SecurityGroups` console, open the `aws-data-wrangler-*` security group and configure to accept your IP from any TCP port.
   - Alternatively run:
 
@@ -244,7 +248,7 @@ or
 
 ``pytest -n 8 tests/test_db.py``
 
-* To run all data lake test functions for all python versions (Only if Amazon QuickSight is activated):
+* To run all data lake test functions for all Python versions (only if Amazon QuickSight is activated and the Amazon OpenSearch template is deployed):
 
 ``./test.sh``
 

diff --git a/README.md b/README.md
@@ -136,6 +136,7 @@ FROM "sampleDB"."sampleTable" ORDER BY time DESC LIMIT 3
   - [026 - Amazon Timestream](https://github.com/awslabs/aws-data-wrangler/blob/main/tutorials/026%20-%20Amazon%20Timestream.ipynb)
   - [027 - Amazon Timestream 2](https://github.com/awslabs/aws-data-wrangler/blob/main/tutorials/027%20-%20Amazon%20Timestream%202.ipynb)
   - [028 - Amazon DynamoDB](https://github.com/awslabs/aws-data-wrangler/blob/main/tutorials/028%20-%20DynamoDB.ipynb)
+  - [031 - OpenSearch](https://github.com/awslabs/aws-data-wrangler/blob/main/tutorials/031%20-%20OpenSearch.ipynb)
 - [**API Reference**](https://aws-data-wrangler.readthedocs.io/en/2.11.0/api.html)
   - [Amazon S3](https://aws-data-wrangler.readthedocs.io/en/2.11.0/api.html#amazon-s3)
   - [AWS Glue Catalog](https://aws-data-wrangler.readthedocs.io/en/2.11.0/api.html#aws-glue-catalog)

diff --git a/awswrangler/__init__.py b/awswrangler/__init__.py
@@ -17,6 +17,7 @@
     emr,
     exceptions,
     mysql,
+    opensearch,
     postgresql,
     quicksight,
     redshift,
@@ -38,6 +39,7 @@
     "data_api",
     "dynamodb",
     "exceptions",
+    "opensearch",
     "quicksight",
     "s3",
     "sts",

diff --git a/awswrangler/opensearch/__init__.py b/awswrangler/opensearch/__init__.py
@@ -0,0 +1,17 @@
+"""Utilities Module for Amazon OpenSearch."""
+
+from awswrangler.opensearch._read import search, search_by_sql
+from awswrangler.opensearch._utils import connect
+from awswrangler.opensearch._write import create_index, delete_index, index_csv, index_df, index_documents, index_json
+
+# Public names of this package; each one is imported from a private submodule above.
+__all__ = [
+    "connect",
+    "create_index",
+    "delete_index",
+    "index_csv",
+    "index_documents",
+    "index_df",
+    "index_json",
+    "search",
+    "search_by_sql",
+]
diff --git a/awswrangler/opensearch/_read.py b/awswrangler/opensearch/_read.py
@@ -0,0 +1,169 @@
+"""Amazon OpenSearch Read Module (PRIVATE)."""
+
+from typing import Any, Collection, Dict, List, Mapping, Optional, Union
+
+import pandas as pd
+from opensearchpy import OpenSearch
+from opensearchpy.helpers import scan
+
+from awswrangler.opensearch._utils import _get_distribution
+
+
+def _resolve_fields(row: Mapping[str, Any]) -> Mapping[str, Any]:
+    """Flatten nested dictionaries into a single level keyed by dotted paths.
+
+    A value that is itself a ``dict`` is expanded recursively and each of its
+    entries is stored under ``"<parent>.<child>"``; every other value is copied
+    unchanged. A nested ``dict`` without entries contributes no keys.
+    """
+    flattened: Dict[str, Any] = {}
+    for name, value in row.items():
+        if not isinstance(value, dict):
+            flattened[name] = value
+            continue
+        for child_name, child_value in _resolve_fields(value).items():
+            flattened[f"{name}.{child_name}"] = child_value
+    return flattened
+
+
+def _hit_to_row(hit: Mapping[str, Any]) -> Mapping[str, Any]:
+    """Convert one search hit into a flat row.
+
+    The ``_source`` document is flattened with ``_resolve_fields`` and merged
+    into the row; any other key starting with an underscore (for example
+    ``_id``) is copied as is. Keys without a leading underscore are dropped.
+    Keys are processed in the iteration order of ``hit``, so a later key
+    overwrites an earlier one of the same name.
+    """
+    flat_row: Dict[str, Any] = {}
+    for key, value in hit.items():
+        if key == "_source":
+            flat_row.update(_resolve_fields(value))
+            continue
+        if key.startswith("_"):
+            flat_row[key] = value
+    return flat_row
+
+
+def _search_response_to_documents(response: Mapping[str, Any]) -> List[Mapping[str, Any]]:
+    """Return the hits of a search response as a list of flat rows.
+
+    Parameters
+    ----------
+    response : Mapping[str, Any]
+        Search response; the hits are read from ``response["hits"]["hits"]``.
+
+    Returns
+    -------
+    List[Mapping[str, Any]]
+        One row per hit, built with ``_hit_to_row``. Empty when the response
+        carries no hits.
+    """
+    # When `filter_path` is used, a search that matches nothing can come back
+    # without any "hits" key at all (empty fields are filtered out), so a
+    # missing or empty level must yield no rows instead of raising KeyError.
+    hits = (response.get("hits") or {}).get("hits") or []
+    return [_hit_to_row(hit) for hit in hits]
+
+
+def _search_response_to_df(response: Union[Mapping[str, Any], Any]) -> pd.DataFrame:
+    """Build a DataFrame holding one row per hit of a search response."""
+    documents = _search_response_to_documents(response)
+    return pd.DataFrame(documents)
+
+
+def search(
+    client: OpenSearch,
+    index: Optional[str] = "_all",
+    search_body: Optional[Dict[str, Any]] = None,
+    doc_type: Optional[str] = None,
+    is_scroll: Optional[bool] = False,
+    filter_path: Optional[Union[str, Collection[str]]] = None,
+    **kwargs: Any,
+) -> pd.DataFrame:
+    """Run a query DSL search and return the matching documents as a pandas DataFrame.
+
+    Parameters
+    ----------
+    client : OpenSearch
+        Instance of opensearchpy.OpenSearch to use.
+    index : str, optional
+        A comma-separated list of index names to search.
+        Use `_all` or an empty string to perform the operation on all indices.
+    search_body : Dict[str, Any], optional
+        The search definition using the
+        [Query DSL](https://opensearch.org/docs/opensearch/query-dsl/full-text/).
+    doc_type : str, optional
+        Name of the document type (for Elasticsearch versions 5.x and earlier).
+        Only forwarded when it is set.
+    is_scroll : bool, optional
+        Retrieve a large number of results from a single search request using
+        [scroll](https://opensearch.org/docs/opensearch/rest-api/scroll/),
+        for example for machine learning jobs.
+        Scroll search contexts consume a lot of memory, so avoid the scroll
+        operation for frequent user queries.
+    filter_path : Union[str, Collection[str]], optional
+        Reduces the size of the OpenSearch Service response
+        (default: ['hits.hits._id','hits.hits._source']).
+        When `is_scroll=True`, `_scroll_id` and `_shards` are always added in front.
+    **kwargs :
+        KEYWORD arguments forwarded to
+        [opensearchpy.OpenSearch.search](https://opensearch-py.readthedocs.io/en/latest/api.html#opensearchpy.OpenSearch.search),
+        or to [opensearchpy.helpers.scan](https://opensearch-py.readthedocs.io/en/master/helpers.html#scan)
+        if `is_scroll=True`.
+
+    Returns
+    -------
+    pandas.DataFrame
+        One row per hit; nested `_source` fields become dotted column names.
+
+    Examples
+    --------
+    Searching an index using query DSL
+
+    >>> import awswrangler as wr
+    >>> client = wr.opensearch.connect(host='DOMAIN-ENDPOINT')
+    >>> df = wr.opensearch.search(
+    ...         client=client,
+    ...         index='movies',
+    ...         search_body={
+    ...           "query": {
+    ...             "match": {
+    ...               "title": "wind"
+    ...             }
+    ...           }
+    ...         }
+    ...      )
+
+    """
+    if doc_type:
+        kwargs["doc_type"] = doc_type
+
+    response_filter = ["hits.hits._id", "hits.hits._source"] if filter_path is None else filter_path
+
+    if not is_scroll:
+        response = client.search(index=index, body=search_body, filter_path=response_filter, **kwargs)
+        return _search_response_to_df(response)
+
+    # Scrolling needs these two response fields in addition to whatever the caller asked for.
+    requested = [response_filter] if isinstance(response_filter, str) else list(response_filter)
+    scroll_filter = ["_scroll_id", "_shards", *requested]
+    hits = scan(client, index=index, query=search_body, filter_path=scroll_filter, **kwargs)
+    return pd.DataFrame([_hit_to_row(hit) for hit in hits])
+
+
+def search_by_sql(client: OpenSearch, sql_query: str, **kwargs: Any) -> pd.DataFrame:
+    """Run a [SQL query](https://opensearch.org/docs/search-plugins/sql/index/) and return the result as a DataFrame.
+
+    Parameters
+    ----------
+    client : OpenSearch
+        Instance of opensearchpy.OpenSearch to use.
+    sql_query : str
+        SQL query.
+    **kwargs :
+        KEYWORD arguments forwarded as request URL parameters (e.g.: filter_path, etc.).
+        `size` and `fetch_size` are not sent as URL parameters; their value is sent as
+        `fetch_size` in the request body instead. `format` is always set to `json`.
+
+    Returns
+    -------
+    pandas.DataFrame
+        Results as Pandas DataFrame
+
+    Examples
+    --------
+    Searching an index using SQL query
+
+    >>> import awswrangler as wr
+    >>> client = wr.opensearch.connect(host='DOMAIN-ENDPOINT')
+    >>> df = wr.opensearch.search_by_sql(
+    ...         client=client,
+    ...         sql_query='SELECT * FROM my-index LIMIT 50'
+    ...      )
+
+    """
+    # The SQL plugin endpoint differs between the "opensearch" distribution and anything else.
+    sql_endpoint = "/_plugins/_sql" if _get_distribution(client) == "opensearch" else "/_opendistro/_sql"
+
+    kwargs["format"] = "json"
+    request_body = {"query": sql_query}
+    for size_alias in ("size", "fetch_size"):
+        if size_alias in kwargs:
+            # Not recognized as a URL parameter, so it is moved into the body.
+            request_body["fetch_size"] = kwargs.pop(size_alias)
+
+    response = client.transport.perform_request(
+        "POST",
+        sql_endpoint,
+        headers={"Content-Type": "application/json"},
+        body=request_body,
+        params=kwargs,
+    )
+    return _search_response_to_df(response)
diff --git a/awswrangler/opensearch/_utils.py b/awswrangler/opensearch/_utils.py
@@ -0,0 +1,108 @@
+"""Amazon OpenSearch Utils Module (PRIVATE)."""
+
+import logging
+import re
+from typing import Any, Optional
+
+import boto3
+from opensearchpy import OpenSearch, RequestsHttpConnection
+from requests_aws4auth import AWS4Auth
+
+from awswrangler import _utils, exceptions
+
+_logger: logging.Logger = logging.getLogger(__name__)
+
+
+def _get_distribution(client: OpenSearch) -> Any:
+    """Return the ``version.distribution`` value of ``client.info()``, or "elasticsearch" when absent."""
+    version_info = client.info().get("version", {})
+    return version_info.get("distribution", "elasticsearch")
+
+
+def _get_version(client: OpenSearch) -> Any:
+    """Return the ``version.number`` value of ``client.info()``, or None when absent."""
+    version_info = client.info().get("version", {})
+    return version_info.get("number")
+
+
+def _get_version_major(client: OpenSearch) -> Any:
+    """Return the part of the version number before the first dot as an int, or None if no version is reported."""
+    version = _get_version(client)
+    if not version:
+        return None
+    major, *_ = version.split(".")
+    return int(major)
+
+
+def _strip_endpoint(endpoint: str) -> str:
+    """Remove every ``http://`` / ``https://`` occurrence, then surrounding whitespace and slashes."""
+    without_scheme = re.sub(r"https?://", "", endpoint)
+    trimmed = without_scheme.strip()
+    return trimmed.strip("/")
+
+
+def connect(
+    host: str,
+    port: Optional[int] = 443,
+    boto3_session: Optional[boto3.Session] = None,
+    region: Optional[str] = None,
+    username: Optional[str] = None,
+    password: Optional[str] = None,
+) -> OpenSearch:
+    """Create a secure connection to the specified Amazon OpenSearch domain.
+
+    Note
+    ----
+    We use [opensearch-py](https://github.com/opensearch-project/opensearch-py), an OpenSearch low-level python client.
+
+    The username and password are mandatory if the OS Cluster uses
+    [Fine Grained Access Control](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/fgac.html).
+    If fine grained access control is disabled, session access key and secret keys are used.
+
+    Parameters
+    ----------
+    host : str
+        Amazon OpenSearch domain, for example: my-test-domain.us-east-1.es.amazonaws.com.
+        A leading ``http://`` / ``https://`` and surrounding slashes are removed.
+    port : int
+        OpenSearch Service only accepts connections over port 80 (HTTP) or 443 (HTTPS).
+    boto3_session : boto3.Session, optional
+        Boto3 Session. The default boto3 Session will be used if boto3_session receives None.
+    region : str, optional
+        AWS region of the Amazon OS domain. If not provided will be extracted from boto3_session.
+    username : str, optional
+        Fine-grained access control username. Mandatory if OS Cluster uses Fine Grained Access Control.
+    password : str, optional
+        Fine-grained access control password. Mandatory if OS Cluster uses Fine Grained Access Control.
+
+    Returns
+    -------
+    opensearchpy.OpenSearch
+        OpenSearch low-level client.
+        https://github.com/opensearch-project/opensearch-py/blob/main/opensearchpy/client/__init__.py
+
+    Raises
+    ------
+    ValueError
+        If ``port`` is neither 80 nor 443.
+    exceptions.InvalidArgument
+        If no username/password pair is given and the session credentials
+        lack an access key or a secret key.
+    """
+    # NOTE: `boto3_session` defaults to None rather than `boto3.Session()`. A default value is
+    # evaluated once, when the module is imported, which would create a session as an import
+    # side effect and share that single object between all calls.
+    # NOTE(review): this relies on the `_utils` session helpers resolving None to the default
+    # session, as the parameter documentation states - confirm against `_utils`.
+    valid_ports = {80, 443}
+
+    if port not in valid_ports:
+        raise ValueError(f"results: port must be one of {valid_ports}")
+
+    if username and password:
+        # Basic auth; used only when BOTH values are provided.
+        http_auth = (username, password)
+    else:
+        # Otherwise sign requests (SigV4, service "es") with the session credentials.
+        if region is None:
+            region = _utils.get_region_from_session(boto3_session=boto3_session)
+        creds = _utils.get_credentials_from_session(boto3_session=boto3_session)
+        if creds.access_key is None or creds.secret_key is None:
+            raise exceptions.InvalidArgument(
+                "One of IAM Role or AWS ACCESS_KEY_ID and SECRET_ACCESS_KEY must be "
+                "given. Unable to find ACCESS_KEY_ID and SECRET_ACCESS_KEY in boto3 "
+                "session."
+            )
+        http_auth = AWS4Auth(creds.access_key, creds.secret_key, region, "es", session_token=creds.token)
+    try:
+        es = OpenSearch(
+            host=_strip_endpoint(host),
+            port=port,
+            http_auth=http_auth,
+            use_ssl=True,
+            verify_certs=True,
+            connection_class=RequestsHttpConnection,
+            timeout=30,
+            max_retries=10,
+            retry_on_timeout=True,
+        )
+    except Exception:
+        _logger.error("Error connecting to Opensearch cluster. Please verify authentication details")
+        # Bare `raise` re-raises the active exception with its traceback unchanged.
+        raise
+    return es