From 0a83644b8c92a15775e47b15584db61bfe97c289 Mon Sep 17 00:00:00 2001 From: Assaf Mentzer Date: Tue, 31 Aug 2021 16:09:44 -0400 Subject: [PATCH 01/41] [skip ci] elasticsearch support init: structure and skeleton code --- awswrangler/elasticsearch/__init__.py | 6 + awswrangler/elasticsearch/_read.py | 3 + awswrangler/elasticsearch/_utils.py | 41 ++++ awswrangler/elasticsearch/_write.py | 285 ++++++++++++++++++++++++++ 4 files changed, 335 insertions(+) create mode 100644 awswrangler/elasticsearch/__init__.py create mode 100644 awswrangler/elasticsearch/_read.py create mode 100644 awswrangler/elasticsearch/_utils.py create mode 100644 awswrangler/elasticsearch/_write.py diff --git a/awswrangler/elasticsearch/__init__.py b/awswrangler/elasticsearch/__init__.py new file mode 100644 index 000000000..bca7b8a7f --- /dev/null +++ b/awswrangler/elasticsearch/__init__.py @@ -0,0 +1,6 @@ +"""Utilities Module for Amazon Elasticsearch.""" + +from awswrangler.elasticsearch._utils import connect +from awswrangler.elasticsearch._write import create_index, index_csv, index_documents, index_df, index_json + +__all__ = ["connect", "create_index", "index_csv", "index_documents", "index_df", "index_json"] diff --git a/awswrangler/elasticsearch/_read.py b/awswrangler/elasticsearch/_read.py new file mode 100644 index 000000000..bf5102ab6 --- /dev/null +++ b/awswrangler/elasticsearch/_read.py @@ -0,0 +1,3 @@ +"""Amazon Elasticsearch Read Module (PRIVATE).""" + +# TODO: create module diff --git a/awswrangler/elasticsearch/_utils.py b/awswrangler/elasticsearch/_utils.py new file mode 100644 index 000000000..51b1b4223 --- /dev/null +++ b/awswrangler/elasticsearch/_utils.py @@ -0,0 +1,41 @@ +"""Amazon Elasticsearch Utils Module (PRIVATE).""" + +from typing import Optional + +import boto3 + +from awswrangler import _utils, exceptions +from elasticsearch import Elasticsearch + + +def connect( + host: str, + boto3_session: Optional[boto3.Session] = None +) -> Elasticsearch: + """Establishes a secure connection to the specified Amazon ES domain. + + Note + ---- + We use [elasticsearch-py](https://elasticsearch-py.readthedocs.io/en/v7.13.4/), an Elasticsearch client for Python, + version 7.13.4, which is the recommended version for best compatibility Amazon ES, + since later versions may reject connections to Amazon ES clusters. + In the future will move to a new open source client under the [OpenSearch project](https://www.opensearch.org/) + You can read more here: + https://aws.amazon.com/blogs/opensource/keeping-clients-of-opensearch-and-elasticsearch-compatible-with-open-source/ + https://opensearch.org/docs/clients/index/ + + Parameters + ---------- + host : str + Amazon Elasticsearch domain, for example: my-test-domain.us-east-1.es.amazonaws.com. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 Session will be used if boto3_session receive None. + + Returns + ------- + elasticsearch.Elasticsearch + Elasticsearch low-level client. 
+ https://elasticsearch-py.readthedocs.io/en/v7.13.4/api.html#elasticsearch + """ + + pass # connect to Amazon ES diff --git a/awswrangler/elasticsearch/_write.py b/awswrangler/elasticsearch/_write.py new file mode 100644 index 000000000..779aa1b6f --- /dev/null +++ b/awswrangler/elasticsearch/_write.py @@ -0,0 +1,285 @@ +"""Amazon Elasticsearch Write Module (PRIVATE).""" + +import json +import logging +from pathlib import Path +from typing import Any, Dict, List, Mapping, Optional, Union, Tuple, Iterable + +import boto3 +import pandas as pd + +from elasticsearch import Elasticsearch + +_logger: logging.Logger = logging.getLogger(__name__) + + +def create_index( + index: str, + doc_type: Optional[str] = None, + settings: Optional[Dict[str, Any]] = None, + mappings: Optional[Dict[str, Any]] = None, + boto3_session: Optional[boto3.Session] = None, + con: Optional[Elasticsearch] = None +) -> Dict[str, Any]: + """Creates an index. + + Parameters + ---------- + index : str + Name of the index. + doc_type : str + Name of the document type (for Elasticsearch versions 5.x and earlier). + settings : Dict[str, Any], optional + Index settings + https://opensearch.org/docs/opensearch/rest-api/create-index/#index-settings + mappings : Dict[str, Any], optional + Index mappings + https://opensearch.org/docs/opensearch/rest-api/create-index/#mappings + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 Session will be used if boto3_session receive None. + con : elasticsearch.Elasticsearch, optional + Elasticsearch client. A new connection will be established if con receive None. + + Returns + ------- + Dict[str, Any] + Elasticsearch rest api response + https://opensearch.org/docs/opensearch/rest-api/create-index/#response. + + Examples + -------- + Creating an index. + + >>> import awswrangler as wr + >>> response = wr.elasticsearch.create_index( + ... index="sample-index1", + ... mappings={ + ... "properties": { + ... "age": { "type" : "integer" } + ... } + ... }, + ... settings={ + ... "index": { + ... "number_of_shards": 2, + ... "number_of_replicas": 1 + ... } + ... } + ... ) + + """ + + +def index_json( + path: Union[str, Path], + index: str, + doc_type: Optional[str] = None, + bulk_params: Optional[Union[List[Any], Tuple[Any], Dict[Any, Any]]] = None, + boto3_session: Optional[boto3.Session] = None, + **kwargs +) -> Dict[str, Any]: + """Index all documents from JSON file to Elasticsearch index. + + The JSON file should be in a JSON-Lines text format (newline-delimited JSON) - https://jsonlines.org/. + + Parameters + ---------- + path : Union[str, Path] + Path as str or Path object to the JSON file which contains the documents. + index : str + Name of the index. + doc_type : str + Name of the document type (only for Elasticsearch versions 5.x and earlier). + bulk_params : Union[List, Tuple, Dict], optional + List of parameters to pass to bulk operation. + References: + elasticsearch >= 7.10.2: https://opensearch.org/docs/opensearch/rest-api/document-apis/bulk/#url-parameters + elasticsearch < 7.10.2: https://opendistro.github.io/for-elasticsearch-docs/docs/elasticsearch/rest-api-reference/#url-parameters + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 Session will be used if boto3_session receive None. 
+ **kwargs : + KEYWORD arguments forwarded to :func:`~awswrangler.elasticsearch.index_documents` + which is used to execute the operation + + Returns + ------- + Dict[str, Any] + Response payload + https://opensearch.org/docs/opensearch/rest-api/document-apis/bulk/#response. + + Examples + -------- + Writing contents of JSON file + + >>> import awswrangler as wr + >>> wr.elasticsearch.index_json( + ... path='docs.json', + ... index='sample-index1' + ... ) + """ + # Loading data from file + + pass # TODO: load data from json file + + +def index_csv( + path: Union[str, Path], + index: str, + doc_type: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, + pandas_params: Optional[Dict[str, Any]] = None +) -> Dict[str, Any]: + """Index all documents from a CSV file to Elasticsearch index. + + Parameters + ---------- + path : Union[str, Path] + Path as str or Path object to the CSV file which contains the documents. + index : str + Name of the index. + doc_type : str + Name of the document type (only for Elasticsearch versions 5.x and older). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 Session will be used if boto3_session receive None. + pandas_params : + Dictionary of arguments forwarded to pandas.read_csv(). + e.g. pandas_kwargs={'sep': '|', 'na_values': ['null', 'none'], 'skip_blank_lines': True} + https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html + + Returns + ------- + Dict[str, Any] + Response payload + https://opensearch.org/docs/opensearch/rest-api/document-apis/bulk/#response. + + Examples + -------- + Writing contents of CSV file + + >>> import awswrangler as wr + >>> wr.elasticsearch.index_csv( + ... path='docs.csv', + ... index='sample-index1' + ... ) + + Writing contents of CSV file using pandas_kwargs + + >>> import awswrangler as wr + >>> wr.elasticsearch.index_csv( + ... path='docs.csv', + ... index='sample-index1', + ... pandas_params={'sep': '|', 'na_values': ['null', 'none'], 'skip_blank_lines': True} + ... ) + """ + pass # TODO: load data from csv file + + +def index_df( + df: pd.DataFrame, + index: str, + doc_type: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> Dict[str, Any]: + """Index all documents from a DataFrame to Elasticsearch index. + + Parameters + ---------- + df : pd.DataFrame + Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html + index : str + Name of the index. + doc_type : str + Name of the document type (only for Elasticsearch versions 5.x and older). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 Session will be used if boto3_session receive None. + + Returns + ------- + Dict[str, Any] + Response payload + https://opensearch.org/docs/opensearch/rest-api/document-apis/bulk/#response. + + Examples + -------- + Writing rows of DataFrame + + >>> import awswrangler as wr + >>> import pandas as pd + >>> wr.elasticsearch.index_df( + ... df=pd.DataFrame([{'_id': '1'}, {'_id': '2'}, {'_id': '3'}]), + ... index='sample-index1' + ... 
) + """ + pass # TODO: load data from dataframe + + +def index_documents( + documents: Union[Iterable[Dict[str, Any]], Iterable[Mapping[str, Any]]], + index: str, + doc_type: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, + con: Optional[Elasticsearch] = None, + ignore_status: Optional[Union[List[Any], Tuple[Any]]] = None, + chunk_size: Optional[int] = 500, + max_chunk_bytes: Optional[int] = 100 * 1024 * 1024, + max_retries: Optional[int] = 0, + initial_backoff: Optional[int] = 2, + max_backoff: Optional[int] = 600, + **kwargs + +) -> Dict[str, Any]: + """Index all documents to Elasticsearch index. + + Note + ---- + Some of the args are referenced from elasticsearch-py client library (bulk helpers) + https://elasticsearch-py.readthedocs.io/en/v7.13.4/helpers.html#elasticsearch.helpers.bulk + https://elasticsearch-py.readthedocs.io/en/v7.13.4/helpers.html#elasticsearch.helpers.streaming_bulk + + Parameters + ---------- + documents : Union[Iterable[Dict[str, Any]], Iterable[Mapping[str, Any]]] + List which contains the documents that will be inserted. + index : str + Name of the index. + doc_type : str + Name of the document type (only for Elasticsearch versions 5.x and older). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 Session will be used if boto3_session receive None. + con : elasticsearch.Elasticsearch, optional + Elasticsearch client. A new connection will be established if con receive None. + ignore_status: Union[List[Any], Tuple[Any]], optional + list of HTTP status codes that you want to ignore (not raising an exception) + chunk_size : int, optional + number of docs in one chunk sent to es (default: 500) + max_chunk_bytes: int, optional + the maximum size of the request in bytes (default: 100MB) + max_retries : int, optional + maximum number of times a document will be retried when + ``429`` is received, set to 0 (default) for no retries on ``429`` (default: 0) + initial_backoff : int, optional + number of seconds we should wait before the first retry. + Any subsequent retries will be powers of ``initial_backoff*2**retry_number`` (default: 2) + max_backoff: int, optional + maximum number of seconds a retry will wait (default: 600) + **kwargs : + KEYWORD arguments forwarded to bulk operation + elasticsearch >= 7.10.2: https://opensearch.org/docs/opensearch/rest-api/document-apis/bulk/#url-parameters + elasticsearch < 7.10.2: https://opendistro.github.io/for-elasticsearch-docs/docs/elasticsearch/rest-api-reference/#url-parameters + + Returns + ------- + Dict[str, Any] + Response payload + https://opensearch.org/docs/opensearch/rest-api/document-apis/bulk/#response. + + Examples + -------- + Writing documents + + >>> import awswrangler as wr + >>> wr.elasticsearch.index_documents( + ... documents=[{'_id': '1', 'value': 'foo'}, {'_id': '2', 'value': 'bar'}], + ... index='sample-index1' + ... 
) + """ + pass # TODO: load documents From 947119f003464cffdf117c4068060a0af657407c Mon Sep 17 00:00:00 2001 From: Assaf Mentzer Date: Fri, 10 Sep 2021 11:44:05 -0400 Subject: [PATCH 02/41] [skip ci] rename elasticsearch->opensearch --- awswrangler/elasticsearch/_read.py | 3 -- .../{elasticsearch => opensearch}/__init__.py | 2 +- awswrangler/opensearch/_read.py | 3 ++ .../{elasticsearch => opensearch}/_utils.py | 12 ++++---- .../{elasticsearch => opensearch}/_write.py | 30 +++++++++---------- 5 files changed, 25 insertions(+), 25 deletions(-) delete mode 100644 awswrangler/elasticsearch/_read.py rename awswrangler/{elasticsearch => opensearch}/__init__.py (84%) create mode 100644 awswrangler/opensearch/_read.py rename awswrangler/{elasticsearch => opensearch}/_utils.py (75%) rename awswrangler/{elasticsearch => opensearch}/_write.py (90%) diff --git a/awswrangler/elasticsearch/_read.py b/awswrangler/elasticsearch/_read.py deleted file mode 100644 index bf5102ab6..000000000 --- a/awswrangler/elasticsearch/_read.py +++ /dev/null @@ -1,3 +0,0 @@ -"""Amazon Elasticsearch Read Module (PRIVATE).""" - -# TODO: create module diff --git a/awswrangler/elasticsearch/__init__.py b/awswrangler/opensearch/__init__.py similarity index 84% rename from awswrangler/elasticsearch/__init__.py rename to awswrangler/opensearch/__init__.py index bca7b8a7f..99cf0e6a4 100644 --- a/awswrangler/elasticsearch/__init__.py +++ b/awswrangler/opensearch/__init__.py @@ -1,4 +1,4 @@ -"""Utilities Module for Amazon Elasticsearch.""" +"""Utilities Module for Amazon OpenSearch.""" from awswrangler.elasticsearch._utils import connect from awswrangler.elasticsearch._write import create_index, index_csv, index_documents, index_df, index_json diff --git a/awswrangler/opensearch/_read.py b/awswrangler/opensearch/_read.py new file mode 100644 index 000000000..be813c2bb --- /dev/null +++ b/awswrangler/opensearch/_read.py @@ -0,0 +1,3 @@ +"""Amazon OpenSearch Read Module (PRIVATE).""" + +# TODO: create module diff --git a/awswrangler/elasticsearch/_utils.py b/awswrangler/opensearch/_utils.py similarity index 75% rename from awswrangler/elasticsearch/_utils.py rename to awswrangler/opensearch/_utils.py index 51b1b4223..f2464bc80 100644 --- a/awswrangler/elasticsearch/_utils.py +++ b/awswrangler/opensearch/_utils.py @@ -1,4 +1,4 @@ -"""Amazon Elasticsearch Utils Module (PRIVATE).""" +"""Amazon OpenSearch Utils Module (PRIVATE).""" from typing import Optional @@ -12,13 +12,13 @@ def connect( host: str, boto3_session: Optional[boto3.Session] = None ) -> Elasticsearch: - """Establishes a secure connection to the specified Amazon ES domain. + """Establishes a secure connection to the specified Amazon OpenSearch domain. Note ---- We use [elasticsearch-py](https://elasticsearch-py.readthedocs.io/en/v7.13.4/), an Elasticsearch client for Python, - version 7.13.4, which is the recommended version for best compatibility Amazon ES, - since later versions may reject connections to Amazon ES clusters. + version 7.13.4, which is the recommended version for best compatibility Amazon OpenSearch, + since later versions may reject connections to Amazon OpenSearch clusters. 
In the future will move to a new open source client under the [OpenSearch project](https://www.opensearch.org/) You can read more here: https://aws.amazon.com/blogs/opensource/keeping-clients-of-opensearch-and-elasticsearch-compatible-with-open-source/ @@ -27,7 +27,7 @@ def connect( Parameters ---------- host : str - Amazon Elasticsearch domain, for example: my-test-domain.us-east-1.es.amazonaws.com. + Amazon OpenSearch domain, for example: my-test-domain.us-east-1.es.amazonaws.com. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 Session will be used if boto3_session receive None. @@ -38,4 +38,4 @@ def connect( https://elasticsearch-py.readthedocs.io/en/v7.13.4/api.html#elasticsearch """ - pass # connect to Amazon ES + pass # connect to Amazon OpenSearch diff --git a/awswrangler/elasticsearch/_write.py b/awswrangler/opensearch/_write.py similarity index 90% rename from awswrangler/elasticsearch/_write.py rename to awswrangler/opensearch/_write.py index 779aa1b6f..0ea0ac537 100644 --- a/awswrangler/elasticsearch/_write.py +++ b/awswrangler/opensearch/_write.py @@ -1,4 +1,4 @@ -"""Amazon Elasticsearch Write Module (PRIVATE).""" +"""Amazon OpenSearch Write Module (PRIVATE).""" import json import logging @@ -43,7 +43,7 @@ def create_index( Returns ------- Dict[str, Any] - Elasticsearch rest api response + OpenSearch rest api response https://opensearch.org/docs/opensearch/rest-api/create-index/#response. Examples @@ -51,7 +51,7 @@ def create_index( Creating an index. >>> import awswrangler as wr - >>> response = wr.elasticsearch.create_index( + >>> response = wr.opensearch.create_index( ... index="sample-index1", ... mappings={ ... "properties": { @@ -77,7 +77,7 @@ def index_json( boto3_session: Optional[boto3.Session] = None, **kwargs ) -> Dict[str, Any]: - """Index all documents from JSON file to Elasticsearch index. + """Index all documents from JSON file to OpenSearch index. The JSON file should be in a JSON-Lines text format (newline-delimited JSON) - https://jsonlines.org/. @@ -92,12 +92,12 @@ def index_json( bulk_params : Union[List, Tuple, Dict], optional List of parameters to pass to bulk operation. References: - elasticsearch >= 7.10.2: https://opensearch.org/docs/opensearch/rest-api/document-apis/bulk/#url-parameters + elasticsearch >= 7.10.2 / opensearch: https://opensearch.org/docs/opensearch/rest-api/document-apis/bulk/#url-parameters elasticsearch < 7.10.2: https://opendistro.github.io/for-elasticsearch-docs/docs/elasticsearch/rest-api-reference/#url-parameters boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 Session will be used if boto3_session receive None. **kwargs : - KEYWORD arguments forwarded to :func:`~awswrangler.elasticsearch.index_documents` + KEYWORD arguments forwarded to :func:`~awswrangler.opensearch.index_documents` which is used to execute the operation Returns @@ -111,7 +111,7 @@ def index_json( Writing contents of JSON file >>> import awswrangler as wr - >>> wr.elasticsearch.index_json( + >>> wr.opensearch.index_json( ... path='docs.json', ... index='sample-index1' ... ) @@ -128,7 +128,7 @@ def index_csv( boto3_session: Optional[boto3.Session] = None, pandas_params: Optional[Dict[str, Any]] = None ) -> Dict[str, Any]: - """Index all documents from a CSV file to Elasticsearch index. + """Index all documents from a CSV file to OpenSearch index. 
Parameters ---------- @@ -156,7 +156,7 @@ def index_csv( Writing contents of CSV file >>> import awswrangler as wr - >>> wr.elasticsearch.index_csv( + >>> wr.opensearch.index_csv( ... path='docs.csv', ... index='sample-index1' ... ) @@ -164,7 +164,7 @@ def index_csv( Writing contents of CSV file using pandas_kwargs >>> import awswrangler as wr - >>> wr.elasticsearch.index_csv( + >>> wr.opensearch.index_csv( ... path='docs.csv', ... index='sample-index1', ... pandas_params={'sep': '|', 'na_values': ['null', 'none'], 'skip_blank_lines': True} @@ -179,7 +179,7 @@ def index_df( doc_type: Optional[str] = None, boto3_session: Optional[boto3.Session] = None, ) -> Dict[str, Any]: - """Index all documents from a DataFrame to Elasticsearch index. + """Index all documents from a DataFrame to OpenSearch index. Parameters ---------- @@ -204,7 +204,7 @@ def index_df( >>> import awswrangler as wr >>> import pandas as pd - >>> wr.elasticsearch.index_df( + >>> wr.opensearch.index_df( ... df=pd.DataFrame([{'_id': '1'}, {'_id': '2'}, {'_id': '3'}]), ... index='sample-index1' ... ) @@ -227,7 +227,7 @@ def index_documents( **kwargs ) -> Dict[str, Any]: - """Index all documents to Elasticsearch index. + """Index all documents to OpenSearch index. Note ---- @@ -263,7 +263,7 @@ def index_documents( maximum number of seconds a retry will wait (default: 600) **kwargs : KEYWORD arguments forwarded to bulk operation - elasticsearch >= 7.10.2: https://opensearch.org/docs/opensearch/rest-api/document-apis/bulk/#url-parameters + elasticsearch >= 7.10.2 / opensearch: https://opensearch.org/docs/opensearch/rest-api/document-apis/bulk/#url-parameters elasticsearch < 7.10.2: https://opendistro.github.io/for-elasticsearch-docs/docs/elasticsearch/rest-api-reference/#url-parameters Returns @@ -277,7 +277,7 @@ def index_documents( Writing documents >>> import awswrangler as wr - >>> wr.elasticsearch.index_documents( + >>> wr.opensearch.index_documents( ... documents=[{'_id': '1', 'value': 'foo'}, {'_id': '2', 'value': 'bar'}], ... index='sample-index1' ... 
) From 4534d7a5f22934cadd59476498d369df232be6e6 Mon Sep 17 00:00:00 2001 From: Assaf Mentzer Date: Mon, 13 Sep 2021 23:08:22 -0400 Subject: [PATCH 03/41] [skip ci] merge Assaf and Murali forks --- awswrangler/__init__.py | 2 + awswrangler/opensearch/__init__.py | 15 ++- awswrangler/opensearch/_read.py | 117 ++++++++++++++++++++++- awswrangler/opensearch/_utils.py | 74 ++++++++++++++- awswrangler/opensearch/_write.py | 147 ++++++++++++++++++++++------- tests/test_opensearch.py | 71 ++++++++++++++ 6 files changed, 386 insertions(+), 40 deletions(-) create mode 100644 tests/test_opensearch.py diff --git a/awswrangler/__init__.py b/awswrangler/__init__.py index ee068e4f6..c87d36823 100644 --- a/awswrangler/__init__.py +++ b/awswrangler/__init__.py @@ -17,6 +17,7 @@ emr, exceptions, mysql, + opensearch, postgresql, quicksight, redshift, @@ -38,6 +39,7 @@ "data_api", "dynamodb", "exceptions", + "opensearch", "quicksight", "s3", "sts", diff --git a/awswrangler/opensearch/__init__.py b/awswrangler/opensearch/__init__.py index 99cf0e6a4..cd7184e79 100644 --- a/awswrangler/opensearch/__init__.py +++ b/awswrangler/opensearch/__init__.py @@ -1,6 +1,15 @@ """Utilities Module for Amazon OpenSearch.""" -from awswrangler.elasticsearch._utils import connect -from awswrangler.elasticsearch._write import create_index, index_csv, index_documents, index_df, index_json +from awswrangler.opensearch._utils import connect +from awswrangler.opensearch._write import create_index, index_csv, index_documents, index_df, index_json +from awswrangler.opensearch._read import search, search_by_sql -__all__ = ["connect", "create_index", "index_csv", "index_documents", "index_df", "index_json"] +__all__ = ["connect", + "create_index", + "index_csv", + "index_documents", + "index_df", + "index_json", + "search", + "search_by_sql" + ] diff --git a/awswrangler/opensearch/_read.py b/awswrangler/opensearch/_read.py index be813c2bb..e1e735828 100644 --- a/awswrangler/opensearch/_read.py +++ b/awswrangler/opensearch/_read.py @@ -1,3 +1,118 @@ """Amazon OpenSearch Read Module (PRIVATE).""" -# TODO: create module +from pandasticsearch import Select, DataFrame +from typing import Any, Dict, Optional +from elasticsearch import Elasticsearch +from elasticsearch.helpers import scan + + +def _scan( + client: Elasticsearch, + index: Optional[str] = '_all', + search_body: Optional[Dict[str, Any]] = None, + doc_type: Optional[str] = None, + scroll: Optional[str] = '10m', + **kwargs +): + # TODO: write logic based on https://elasticsearch-py.readthedocs.io/en/master/helpers.html#scan + pass + + +def search( + client: Elasticsearch, + index: Optional[str] = '_all', + search_body: Optional[Dict[str, Any]] = None, + doc_type: Optional[str] = None, + is_scroll: Optional[bool] = False, + **kwargs +) -> DataFrame: + """Returns results matching query DSL as pandas dataframe. + + Parameters + ---------- + client : Elasticsearch + instance of elasticsearch.Elasticsearch to use. + index : str, optional + A comma-separated list of index names to search. + use `_all` or empty string to perform the operation on all indices. + search_body : Dict[str, Any], optional + The search definition using the [Query DSL](https://opensearch.org/docs/opensearch/query-dsl/full-text/). + doc_type : str, optional + Name of the document type (for Elasticsearch versions 5.x and earlier). 
+ is_scroll : bool, optional + Allows to retrieve a large numbers of results from a single search request using [scroll](https://opensearch.org/docs/opensearch/rest-api/scroll/) + for example, for machine learning jobs. + Because scroll search contexts consume a lot of memory, we suggest you don’t use the scroll operation for frequent user queries. + **kwargs : + KEYWORD arguments forwarded to [elasticsearch.Elasticsearch.search](https://elasticsearch-py.readthedocs.io/en/v7.13.4/api.html#elasticsearch.Elasticsearch.search). + If ``is_scroll=True`` arguments will be forwarded to [elasticsearch.helpers.scan](https://elasticsearch-py.readthedocs.io/en/master/helpers.html#scan) + + Returns + ------- + Union[pandas.DataFrame, Iterator[pandas.DataFrame]] + Results as Pandas DataFrame + + Examples + -------- + Searching an index using query DSL + + >>> import awswrangler as wr + >>> client = wr.opensearch.connect(host='DOMAIN-ENDPOINT') + >>> df = wr.opensearch.search( + ... client=client, + ... index='movies', + ... search_body={ + ... "query": { + ... "match": { + ... "title": "wind" + ... } + ... } + ... } + ... ) + + + """ + if doc_type: + kwargs['doc_type'] = doc_type + if is_scroll: + # TODO: write logic + # documents = _scan(client, index, search_body, doc_type, **kwargs) + pass + else: + documents = client.search(index=index, body=search_body, **kwargs) + df = Select.from_dict(documents).to_pandas() + return df + + +def search_by_sql( + client: Elasticsearch, + sql_query: str +) -> DataFrame: + """Returns results matching [SQL query](https://opensearch.org/docs/search-plugins/sql/index/) as pandas dataframe + + Parameters + ---------- + client : Elasticsearch + instance of elasticsearch.Elasticsearch to use. + sql_query : str + SQL query + + Returns + ------- + Union[pandas.DataFrame, Iterator[pandas.DataFrame]] + Results as Pandas DataFrame + + Examples + -------- + Searching an index using SQL query + + >>> import awswrangler as wr + >>> client = wr.opensearch.connect(host='DOMAIN-ENDPOINT') + >>> df = wr.opensearch.search_by_sql( + >>> client=client, + >>> sql_query='SELECT * FROM my-index LIMIT 50' + >>> ) + + + """ + # TODO: write logic diff --git a/awswrangler/opensearch/_utils.py b/awswrangler/opensearch/_utils.py index f2464bc80..1bbfeedf7 100644 --- a/awswrangler/opensearch/_utils.py +++ b/awswrangler/opensearch/_utils.py @@ -3,16 +3,41 @@ from typing import Optional import boto3 +import logging from awswrangler import _utils, exceptions -from elasticsearch import Elasticsearch +from elasticsearch import Elasticsearch, RequestsHttpConnection +from requests_aws4auth import AWS4Auth + + +_logger: logging.Logger = logging.getLogger(__name__) + + +def _get_distribution(client: Elasticsearch): + return client.info().get('version', {}).get('distribution', 'elasticsearch') + + +def _get_version(client: Elasticsearch): + return client.info().get('version', {}).get('number') + + +def _get_version_major(client: Elasticsearch): + version = _get_version(client) + if version: + return int(version.split('.')[0]) + return None def connect( host: str, - boto3_session: Optional[boto3.Session] = None + port: Optional[int] = 443, + boto3_session: Optional[boto3.Session] = boto3.Session(), + region: Optional[str] = None, + fgac_user: Optional[str] = None, + fgac_password: Optional[str] = None + ) -> Elasticsearch: - """Establishes a secure connection to the specified Amazon OpenSearch domain. + """Creates a secure connection to the specified Amazon OpenSearch domain. 
     Note
     ----
     We use [elasticsearch-py](https://elasticsearch-py.readthedocs.io/en/v7.13.4/), an Elasticsearch client for Python,
     version 7.13.4, which is the recommended version for best compatibility Amazon OpenSearch,
     since later versions may reject connections to Amazon OpenSearch clusters.
     In the future will move to a new open source client under the [OpenSearch project](https://www.opensearch.org/)
     You can read more here:
     https://aws.amazon.com/blogs/opensource/keeping-clients-of-opensearch-and-elasticsearch-compatible-with-open-source/
     https://opensearch.org/docs/clients/index/
 
+    The username and password are mandatory if the OpenSearch cluster uses [Fine Grained Access Control](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/fgac.html).
+    If fine-grained access control is disabled, the session access key and secret key are used instead.
+
     Parameters
     ----------
     host : str
         Amazon OpenSearch domain, for example: my-test-domain.us-east-1.es.amazonaws.com.
+    port : int, optional
+        OpenSearch Service only accepts connections over port 80 (HTTP) or 443 (HTTPS).
     boto3_session : boto3.Session(), optional
         Boto3 Session. The default boto3 Session will be used if boto3_session receive None.
+    region : str, optional
+        AWS region of the Amazon OpenSearch domain. If not provided, it is extracted from boto3_session.
+    fgac_user : str, optional
+        Fine-grained access control user. Mandatory if the cluster uses Fine Grained Access Control.
+    fgac_password : str, optional
+        Fine-grained access control password. Mandatory if the cluster uses Fine Grained Access Control.
 
     Returns
     -------
     elasticsearch.Elasticsearch
         Elasticsearch low-level client.
         https://elasticsearch-py.readthedocs.io/en/v7.13.4/api.html#elasticsearch
     """
-    pass  # connect to Amazon OpenSearch
+    valid_ports = {80, 443}
+
+    if port not in valid_ports:
+        raise ValueError("port must be one of %r." % valid_ports)
+
+    if fgac_user and fgac_password:
+        http_auth = (fgac_user, fgac_password)
+    else:
+        if region is None:
+            region = boto3_session.region_name
+        creds = boto3_session.get_credentials()
+        http_auth = AWS4Auth(
+            creds.access_key,
+            creds.secret_key,
+            region,
+            'es',
+            creds.token
+        )
+    try:
+        es = Elasticsearch(
+            host=host,
+            port=port,
+            http_auth=http_auth,
+            use_ssl=True,
+            verify_certs=True,
+            connection_class=RequestsHttpConnection
+        )
+    except Exception as e:
+        _logger.error("Error connecting to the OpenSearch cluster. 
Please verify authentication details") + raise e + return es diff --git a/awswrangler/opensearch/_write.py b/awswrangler/opensearch/_write.py index 0ea0ac537..b3d18e237 100644 --- a/awswrangler/opensearch/_write.py +++ b/awswrangler/opensearch/_write.py @@ -1,33 +1,65 @@ """Amazon OpenSearch Write Module (PRIVATE).""" -import json import logging +import uuid from pathlib import Path from typing import Any, Dict, List, Mapping, Optional, Union, Tuple, Iterable - -import boto3 +from ._utils import _get_distribution, _get_version_major import pandas as pd from elasticsearch import Elasticsearch +from elasticsearch.helpers import bulk _logger: logging.Logger = logging.getLogger(__name__) +def _selected_keys(document: Dict, keys_to_write: Optional[List[str]]): + if keys_to_write is None: + keys_to_write = document.keys() + keys_to_write = filter(lambda x: x != '_id', keys_to_write) + return {key: document[key] for key in keys_to_write } + + +def _actions_generator(documents: Union[Iterable[Dict[str, Any]], Iterable[Mapping[str, Any]]], + index: str, + doc_type: Optional[str], + keys_to_write: Optional[List[str]], + id_keys: Optional[List[str]]): + for document in documents: + if id_keys: + _id = '-'.join(list(map(lambda x: str(document[x]), id_keys))) + else: + _id = document.get('_id', uuid.uuid4()) + yield { + "_index": index, + "_type": doc_type, + "_id" : _id, + "_source": _selected_keys(document, keys_to_write), + } + + +def _df_doc_generator(df: pd.DataFrame): + df_iter = df.iterrows() + for i, document in df_iter: + yield document + + def create_index( + client: Elasticsearch, index: str, doc_type: Optional[str] = None, settings: Optional[Dict[str, Any]] = None, - mappings: Optional[Dict[str, Any]] = None, - boto3_session: Optional[boto3.Session] = None, - con: Optional[Elasticsearch] = None + mappings: Optional[Dict[str, Any]] = None ) -> Dict[str, Any]: """Creates an index. Parameters ---------- + client : Elasticsearch + instance of elasticsearch.Elasticsearch to use. index : str Name of the index. - doc_type : str + doc_type : str, optional Name of the document type (for Elasticsearch versions 5.x and earlier). settings : Dict[str, Any], optional Index settings @@ -35,10 +67,6 @@ def create_index( mappings : Dict[str, Any], optional Index mappings https://opensearch.org/docs/opensearch/rest-api/create-index/#mappings - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 Session will be used if boto3_session receive None. - con : elasticsearch.Elasticsearch, optional - Elasticsearch client. A new connection will be established if con receive None. Returns ------- @@ -51,7 +79,9 @@ def create_index( Creating an index. >>> import awswrangler as wr + >>> client = wr.opensearch.connect(host='DOMAIN-ENDPOINT') >>> response = wr.opensearch.create_index( + ... client=client, ... index="sample-index1", ... mappings={ ... 
"properties": { @@ -68,13 +98,28 @@ def create_index( """ + body = {} + if mappings: + if _get_distribution(client) == 'opensearch' or _get_version_major(client) >= 7: + body['mappings'] = mappings # doc type deprecated + else: + if doc_type: + body['mappings'] = {doc_type: mappings} + else: + body['mappings'] = {index: mappings} + if settings: + body['settings'] = settings + if body == {}: + body = None + return client.indices.create(index, body, ignore=[400, 404]) + def index_json( + client: Elasticsearch, path: Union[str, Path], index: str, doc_type: Optional[str] = None, bulk_params: Optional[Union[List[Any], Tuple[Any], Dict[Any, Any]]] = None, - boto3_session: Optional[boto3.Session] = None, **kwargs ) -> Dict[str, Any]: """Index all documents from JSON file to OpenSearch index. @@ -83,19 +128,19 @@ def index_json( Parameters ---------- + client : Elasticsearch + instance of elasticsearch.Elasticsearch to use. path : Union[str, Path] Path as str or Path object to the JSON file which contains the documents. index : str Name of the index. - doc_type : str + doc_type : str, optional Name of the document type (only for Elasticsearch versions 5.x and earlier). bulk_params : Union[List, Tuple, Dict], optional List of parameters to pass to bulk operation. References: elasticsearch >= 7.10.2 / opensearch: https://opensearch.org/docs/opensearch/rest-api/document-apis/bulk/#url-parameters elasticsearch < 7.10.2: https://opendistro.github.io/for-elasticsearch-docs/docs/elasticsearch/rest-api-reference/#url-parameters - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 Session will be used if boto3_session receive None. **kwargs : KEYWORD arguments forwarded to :func:`~awswrangler.opensearch.index_documents` which is used to execute the operation @@ -111,7 +156,9 @@ def index_json( Writing contents of JSON file >>> import awswrangler as wr + >>> client = wr.opensearch.connect(host='DOMAIN-ENDPOINT') >>> wr.opensearch.index_json( + ... client=client, ... path='docs.json', ... index='sample-index1' ... ) @@ -122,24 +169,24 @@ def index_json( def index_csv( + client: Elasticsearch, path: Union[str, Path], index: str, doc_type: Optional[str] = None, - boto3_session: Optional[boto3.Session] = None, pandas_params: Optional[Dict[str, Any]] = None ) -> Dict[str, Any]: """Index all documents from a CSV file to OpenSearch index. Parameters ---------- + client : Elasticsearch + instance of elasticsearch.Elasticsearch to use. path : Union[str, Path] Path as str or Path object to the CSV file which contains the documents. index : str Name of the index. - doc_type : str + doc_type : str, optional Name of the document type (only for Elasticsearch versions 5.x and older). - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 Session will be used if boto3_session receive None. pandas_params : Dictionary of arguments forwarded to pandas.read_csv(). e.g. pandas_kwargs={'sep': '|', 'na_values': ['null', 'none'], 'skip_blank_lines': True} @@ -156,7 +203,9 @@ def index_csv( Writing contents of CSV file >>> import awswrangler as wr + >>> client = wr.opensearch.connect(host='DOMAIN-ENDPOINT') >>> wr.opensearch.index_csv( + ... client=client, ... path='docs.csv', ... index='sample-index1' ... ) @@ -164,7 +213,9 @@ def index_csv( Writing contents of CSV file using pandas_kwargs >>> import awswrangler as wr + >>> client = wr.opensearch.connect(host='DOMAIN-ENDPOINT') >>> wr.opensearch.index_csv( + ... client=client, ... path='docs.csv', ... index='sample-index1', ... 
pandas_params={'sep': '|', 'na_values': ['null', 'none'], 'skip_blank_lines': True} @@ -174,23 +225,27 @@ def index_csv( def index_df( + client: Elasticsearch, df: pd.DataFrame, index: str, doc_type: Optional[str] = None, - boto3_session: Optional[boto3.Session] = None, + **kwargs ) -> Dict[str, Any]: """Index all documents from a DataFrame to OpenSearch index. Parameters ---------- + client : Elasticsearch + instance of elasticsearch.Elasticsearch to use. df : pd.DataFrame Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html index : str Name of the index. - doc_type : str + doc_type : str, optional Name of the document type (only for Elasticsearch versions 5.x and older). - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 Session will be used if boto3_session receive None. + **kwargs : + KEYWORD arguments forwarded to :func:`~awswrangler.opensearch.index_documents` + which is used to execute the operation Returns ------- @@ -204,20 +259,30 @@ def index_df( >>> import awswrangler as wr >>> import pandas as pd + >>> client = wr.opensearch.connect(host='DOMAIN-ENDPOINT') >>> wr.opensearch.index_df( + ... client=client, ... df=pd.DataFrame([{'_id': '1'}, {'_id': '2'}, {'_id': '3'}]), ... index='sample-index1' ... ) """ - pass # TODO: load data from dataframe + + return index_documents( + client=client, + documents=_df_doc_generator(df), + index=index, + doc_type=doc_type, + **kwargs + ) def index_documents( + client: Elasticsearch, documents: Union[Iterable[Dict[str, Any]], Iterable[Mapping[str, Any]]], index: str, doc_type: Optional[str] = None, - boto3_session: Optional[boto3.Session] = None, - con: Optional[Elasticsearch] = None, + keys_to_write: Optional[List[str]] = None, + id_keys: Optional[List[str]] = None, ignore_status: Optional[Union[List[Any], Tuple[Any]]] = None, chunk_size: Optional[int] = 500, max_chunk_bytes: Optional[int] = 100 * 1024 * 1024, @@ -237,16 +302,19 @@ def index_documents( Parameters ---------- + client : Elasticsearch + instance of elasticsearch.Elasticsearch to use. documents : Union[Iterable[Dict[str, Any]], Iterable[Mapping[str, Any]]] List which contains the documents that will be inserted. index : str Name of the index. - doc_type : str + doc_type : str, optional Name of the document type (only for Elasticsearch versions 5.x and older). - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 Session will be used if boto3_session receive None. - con : elasticsearch.Elasticsearch, optional - Elasticsearch client. A new connection will be established if con receive None. + keys_to_write : List[str], optional + list of keys to index. If not provided all keys will be indexed + id_keys : List[str], optional + list of keys that compound document unique id. If not provided will use `_id` key if exists, + otherwise will generate unique identifier for each document. ignore_status: Union[List[Any], Tuple[Any]], optional list of HTTP status codes that you want to ignore (not raising an exception) chunk_size : int, optional @@ -277,9 +345,24 @@ def index_documents( Writing documents >>> import awswrangler as wr + >>> client = wr.opensearch.connect(host='DOMAIN-ENDPOINT') >>> wr.opensearch.index_documents( ... documents=[{'_id': '1', 'value': 'foo'}, {'_id': '2', 'value': 'bar'}], ... index='sample-index1' ... 
) """ - pass # TODO: load documents + success, errors = bulk( + client=client, + actions=_actions_generator(documents, index, doc_type, keys_to_write=keys_to_write, id_keys=id_keys), + ignore_status=ignore_status, + chunk_size=chunk_size, + max_chunk_bytes=max_chunk_bytes, + max_retries=max_retries, + initial_backoff=initial_backoff, + max_backoff=max_backoff, + **kwargs + ) + return { + 'success': success, + 'errors': errors + } diff --git a/tests/test_opensearch.py b/tests/test_opensearch.py new file mode 100644 index 000000000..01f30084d --- /dev/null +++ b/tests/test_opensearch.py @@ -0,0 +1,71 @@ +import logging + +import boto3 +import pandas as pd + + +import awswrangler as wr + + +logging.getLogger("awswrangler").setLevel(logging.DEBUG) + +# TODO: create test_infra for opensearch +OPENSEARCH_DOMAIN = 'search-es71-public-z63iyqxccc4ungar5vx45xwgfi.us-east-1.es.amazonaws.com' # change to your domain +OPENSEARCH_DOMAIN_FGAC = 'search-os1-public-urixc6vui2il7oawwiox2e57n4.us-east-1.es.amazonaws.com' + + +def test_connection(): + client = wr.opensearch.connect(host=OPENSEARCH_DOMAIN) + print(client.info()) + + +# def test_fgac_connection(): +# client = wr.opensearch.connect(host=OPENSEARCH_DOMAIN_FGAC, +# fgac_user='admin', +# fgac_password='SECRET') +# print(client.info()) + + +def test_create_index(): + client = wr.opensearch.connect(host=OPENSEARCH_DOMAIN) + response = wr.opensearch.create_index( + client, + index='test-index1', + mappings={ + 'properties': { + 'name': {'type': 'text'}, + 'age': {'type': 'integer'} + } + }, + settings={ + 'index': { + 'number_of_shards': 1, + 'number_of_replicas': 1 + } + } + ) + print(response) + + +def test_index_df(): + client = wr.opensearch.connect(host=OPENSEARCH_DOMAIN) + response = wr.opensearch.index_df(client, + df=pd.DataFrame([{'_id': '1', 'name': 'John'}, + {'_id': '2', 'name': 'George'}, + {'_id': '3', 'name': 'Julia'} + ]), + index='test_index_df1' + ) + print(response) + + +def test_index_documents(): + client = wr.opensearch.connect(host=OPENSEARCH_DOMAIN) + response = wr.opensearch.index_documents(client, + documents=[{'_id': '1', 'name': 'John'}, + {'_id': '2', 'name': 'George'}, + {'_id': '3', 'name': 'Julia'} + ], + index='test_index_documents1' + ) + print(response) \ No newline at end of file From 4e8f4e3c725af0a37a6b0b4c749cbd92f882f51a Mon Sep 17 00:00:00 2001 From: Assaf Mentzer Date: Tue, 14 Sep 2021 11:41:15 -0400 Subject: [PATCH 04/41] [skip ci] fixed filter_path pandasticsearch issue --- awswrangler/opensearch/_read.py | 6 ++- tests/test_opensearch.py | 81 ++++++++++++++++++++++++++++++++- 2 files changed, 85 insertions(+), 2 deletions(-) diff --git a/awswrangler/opensearch/_read.py b/awswrangler/opensearch/_read.py index e1e735828..597005b92 100644 --- a/awswrangler/opensearch/_read.py +++ b/awswrangler/opensearch/_read.py @@ -45,7 +45,6 @@ def search( Because scroll search contexts consume a lot of memory, we suggest you don’t use the scroll operation for frequent user queries. **kwargs : KEYWORD arguments forwarded to [elasticsearch.Elasticsearch.search](https://elasticsearch-py.readthedocs.io/en/v7.13.4/api.html#elasticsearch.Elasticsearch.search). 
- If ``is_scroll=True`` arguments will be forwarded to [elasticsearch.helpers.scan](https://elasticsearch-py.readthedocs.io/en/master/helpers.html#scan) Returns ------- @@ -74,6 +73,11 @@ def search( """ if doc_type: kwargs['doc_type'] = doc_type + + # pandasticsearch.Select.from_dict requires `took` field + if 'filter_path' in kwargs: + if 'took' not in kwargs['filter_path']: + kwargs['filter_path'].append('took') if is_scroll: # TODO: write logic # documents = _scan(client, index, search_body, doc_type, **kwargs) diff --git a/tests/test_opensearch.py b/tests/test_opensearch.py index 01f30084d..08a1a947c 100644 --- a/tests/test_opensearch.py +++ b/tests/test_opensearch.py @@ -14,6 +14,15 @@ OPENSEARCH_DOMAIN_FGAC = 'search-os1-public-urixc6vui2il7oawwiox2e57n4.us-east-1.es.amazonaws.com' +inspections_documents = [ +{"business_address":"315 California St","business_city":"San Francisco","business_id":"24936","business_latitude":"37.793199","business_location":{"lon": -122.400152,"lat": 37.793199},"business_longitude":"-122.400152","business_name":"San Francisco Soup Company","business_postal_code":"94104","business_state":"CA","inspection_date":"2016-06-09T00:00:00.000","inspection_id":"24936_20160609","inspection_score":77,"inspection_type":"Routine - Unscheduled","risk_category":"Low Risk","violation_description":"Improper food labeling or menu misrepresentation","violation_id":"24936_20160609_103141"}, +{"business_address":"10 Mason St","business_city":"San Francisco","business_id":"60354","business_latitude":"37.783527","business_location":{"lon": -122.409061,"lat": 37.783527},"business_longitude":"-122.409061","business_name":"Soup Unlimited","business_postal_code":"94102","business_state":"CA","inspection_date":"2016-11-23T00:00:00.000","inspection_id":"60354_20161123","inspection_type":"Routine", "inspection_score": 95}, +{"business_address":"2872 24th St","business_city":"San Francisco","business_id":"1797","business_latitude":"37.752807","business_location":{"lon": -122.409752,"lat": 37.752807},"business_longitude":"-122.409752","business_name":"TIO CHILOS GRILL","business_postal_code":"94110","business_state":"CA","inspection_date":"2016-07-05T00:00:00.000","inspection_id":"1797_20160705","inspection_score":90,"inspection_type":"Routine - Unscheduled","risk_category":"Low Risk","violation_description":"Unclean nonfood contact surfaces","violation_id":"1797_20160705_103142"}, +{"business_address":"1661 Tennessee St Suite 3B","business_city":"San Francisco Whard Restaurant","business_id":"66198","business_latitude":"37.75072","business_location":{"lon": -122.388478,"lat": 37.75072},"business_longitude":"-122.388478","business_name":"San Francisco Restaurant","business_postal_code":"94107","business_state":"CA","inspection_date":"2016-05-27T00:00:00.000","inspection_id":"66198_20160527","inspection_type":"Routine","inspection_score":56 }, +{"business_address":"2162 24th Ave","business_city":"San Francisco","business_id":"5794","business_latitude":"37.747228","business_location":{"lon": -122.481299,"lat": 37.747228},"business_longitude":"-122.481299","business_name":"Soup House","business_phone_number":"+14155752700","business_postal_code":"94116","business_state":"CA","inspection_date":"2016-09-07T00:00:00.000","inspection_id":"5794_20160907","inspection_score":96,"inspection_type":"Routine - Unscheduled","risk_category":"Low Risk","violation_description":"Unapproved or unmaintained equipment or utensils","violation_id":"5794_20160907_103144"}, +{"business_address":"2162 24th 
Ave","business_city":"San Francisco","business_id":"5794","business_latitude":"37.747228","business_location":{"lon": -122.481299,"lat": 37.747228},"business_longitude":"-122.481299","business_name":"Soup-or-Salad","business_phone_number":"+14155752700","business_postal_code":"94116","business_state":"CA","inspection_date":"2016-09-07T00:00:00.000","inspection_id":"5794_20160907","inspection_score":96,"inspection_type":"Routine - Unscheduled","risk_category":"Low Risk","violation_description":"Unapproved or unmaintained equipment or utensils","violation_id":"5794_20160907_103144"} +] + def test_connection(): client = wr.opensearch.connect(host=OPENSEARCH_DOMAIN) print(client.info()) @@ -68,4 +77,74 @@ def test_index_documents(): ], index='test_index_documents1' ) - print(response) \ No newline at end of file + print(response) + + +def test_index_documents_id_keys(): + client = wr.opensearch.connect(host=OPENSEARCH_DOMAIN) + response = wr.opensearch.index_documents(client, + documents=inspections_documents, + index='test_index_documents_id_keys', + id_keys=['inspection_id'] + ) + print(response) + + +def test_index_documents_no_id_keys(): + client = wr.opensearch.connect(host=OPENSEARCH_DOMAIN) + response = wr.opensearch.index_documents(client, + documents=inspections_documents, + index='test_index_documents_no_id_keys' + ) + print(response) + + +def test_search(): + index = 'test_search' + client = wr.opensearch.connect(host=OPENSEARCH_DOMAIN) + response = wr.opensearch.index_documents(client, + documents=inspections_documents, + index=index, + id_keys=['inspection_id'] + ) + df = wr.opensearch.search( + client, + index=index, + search_body={ + "query": { + "match": { + "business_name": "soup" + } + } + }, + _source=['inspection_id', 'business_name', 'business_location'] + ) + + print('') + print(df.to_string()) + + +def test_search_filter_path(): + index = 'test_search' + client = wr.opensearch.connect(host=OPENSEARCH_DOMAIN) + response = wr.opensearch.index_documents(client, + documents=inspections_documents, + index=index, + id_keys=['inspection_id'] + ) + df = wr.opensearch.search( + client, + index=index, + search_body={ + "query": { + "match": { + "business_name": "soup" + } + } + }, + _source=['inspection_id', 'business_name', 'business_location'], + filter_path=['hits.hits._source'] + ) + + print('') + print(df.to_string()) \ No newline at end of file From 7a010cd746e3f5401e5fa9f062731a9e5c71c9d6 Mon Sep 17 00:00:00 2001 From: Assaf Mentzer Date: Tue, 14 Sep 2021 15:53:55 -0400 Subject: [PATCH 05/41] [skip ci] disable scan for now --- awswrangler/opensearch/_read.py | 16 ++-------------- tests/test_opensearch.py | 2 +- 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/awswrangler/opensearch/_read.py b/awswrangler/opensearch/_read.py index 597005b92..c20871713 100644 --- a/awswrangler/opensearch/_read.py +++ b/awswrangler/opensearch/_read.py @@ -6,18 +6,6 @@ from elasticsearch.helpers import scan -def _scan( - client: Elasticsearch, - index: Optional[str] = '_all', - search_body: Optional[Dict[str, Any]] = None, - doc_type: Optional[str] = None, - scroll: Optional[str] = '10m', - **kwargs -): - # TODO: write logic based on https://elasticsearch-py.readthedocs.io/en/master/helpers.html#scan - pass - - def search( client: Elasticsearch, index: Optional[str] = '_all', @@ -79,8 +67,7 @@ def search( if 'took' not in kwargs['filter_path']: kwargs['filter_path'].append('took') if is_scroll: - # TODO: write logic - # documents = _scan(client, index, search_body, doc_type, 
**kwargs) + # TODO: write logic based on https://elasticsearch-py.readthedocs.io/en/master/helpers.html#scan pass else: documents = client.search(index=index, body=search_body, **kwargs) @@ -120,3 +107,4 @@ def search_by_sql( """ # TODO: write logic + pass diff --git a/tests/test_opensearch.py b/tests/test_opensearch.py index 08a1a947c..b3f29a88e 100644 --- a/tests/test_opensearch.py +++ b/tests/test_opensearch.py @@ -147,4 +147,4 @@ def test_search_filter_path(): ) print('') - print(df.to_string()) \ No newline at end of file + print(df.to_string()) From 79e0a9a8199d401ed4808a799c7a8062dd386d4b Mon Sep 17 00:00:00 2001 From: Assaf Mentzer Date: Wed, 15 Sep 2021 10:49:31 -0400 Subject: [PATCH 06/41] [skip ci] path documentation --- awswrangler/opensearch/_write.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/awswrangler/opensearch/_write.py b/awswrangler/opensearch/_write.py index b3d18e237..05fa75f0c 100644 --- a/awswrangler/opensearch/_write.py +++ b/awswrangler/opensearch/_write.py @@ -131,7 +131,7 @@ def index_json( client : Elasticsearch instance of elasticsearch.Elasticsearch to use. path : Union[str, Path] - Path as str or Path object to the JSON file which contains the documents. + s3 or local path to the JSON file which contains the documents. index : str Name of the index. doc_type : str, optional @@ -173,7 +173,7 @@ def index_csv( path: Union[str, Path], index: str, doc_type: Optional[str] = None, - pandas_params: Optional[Dict[str, Any]] = None + pandas_kwargs: Optional[Dict[str, Any]] = None ) -> Dict[str, Any]: """Index all documents from a CSV file to OpenSearch index. @@ -182,12 +182,12 @@ def index_csv( client : Elasticsearch instance of elasticsearch.Elasticsearch to use. path : Union[str, Path] - Path as str or Path object to the CSV file which contains the documents. + s3 or local path to the CSV file which contains the documents. index : str Name of the index. doc_type : str, optional Name of the document type (only for Elasticsearch versions 5.x and older). - pandas_params : + pandas_kwargs : Dictionary of arguments forwarded to pandas.read_csv(). e.g. pandas_kwargs={'sep': '|', 'na_values': ['null', 'none'], 'skip_blank_lines': True} https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html @@ -218,7 +218,7 @@ def index_csv( ... client=client, ... path='docs.csv', ... index='sample-index1', - ... pandas_params={'sep': '|', 'na_values': ['null', 'none'], 'skip_blank_lines': True} + ... pandas_kwargs={'sep': '|', 'na_values': ['null', 'none'], 'skip_blank_lines': True} ... 
)
    """
    pass  # TODO: load data from csv file

From f07e698590d6b75f2143ea6d1ecdd61e48a05785 Mon Sep 17 00:00:00 2001
From: Assaf Mentzer
Date: Wed, 15 Sep 2021 11:06:03 -0400
Subject: [PATCH 07/41] [skip ci] add delete_index

---
 awswrangler/opensearch/__init__.py |  3 +-
 awswrangler/opensearch/_write.py   | 44 +++++++++++++++++++++++++++++-
 tests/test_opensearch.py           | 14 ++++++++++
 3 files changed, 59 insertions(+), 2 deletions(-)

diff --git a/awswrangler/opensearch/__init__.py b/awswrangler/opensearch/__init__.py
index cd7184e79..222519747 100644
--- a/awswrangler/opensearch/__init__.py
+++ b/awswrangler/opensearch/__init__.py
@@ -1,11 +1,12 @@
 """Utilities Module for Amazon OpenSearch."""

 from awswrangler.opensearch._utils import connect
-from awswrangler.opensearch._write import create_index, index_csv, index_documents, index_df, index_json
+from awswrangler.opensearch._write import create_index, delete_index, index_csv, index_documents, index_df, index_json
 from awswrangler.opensearch._read import search, search_by_sql

 __all__ = ["connect",
            "create_index",
+           "delete_index",
            "index_csv",
            "index_documents",
            "index_df",

diff --git a/awswrangler/opensearch/_write.py b/awswrangler/opensearch/_write.py
index 05fa75f0c..8f629fcbf 100644
--- a/awswrangler/opensearch/_write.py
+++ b/awswrangler/opensearch/_write.py
@@ -111,7 +111,49 @@ def create_index(
         body['settings'] = settings
     if body == {}:
         body = None
-    return client.indices.create(index, body, ignore=[400, 404])
+    response = client.indices.create(index, body, ignore=[400, 404])
+    if 'error' in response:
+        _logger.warning(response)
+        if str(response['error']).startswith(u'MapperParsingException'):
+            raise ValueError(response['error'])
+    return response
+
+
+def delete_index(
+    client: Elasticsearch,
+    index: str
+) -> Dict[str, Any]:
+    """Delete an index.
+
+    Parameters
+    ----------
+    client : Elasticsearch
+        instance of elasticsearch.Elasticsearch to use.
+    index : str
+        Name of the index.
+
+    Returns
+    -------
+    Dict[str, Any]
+        OpenSearch rest api response
+        https://opensearch.org/docs/opensearch/rest-api/create-index/#response.
+
+    Examples
+    --------
+    Deleting an index.
+
+    >>> import awswrangler as wr
+    >>> client = wr.opensearch.connect(host='DOMAIN-ENDPOINT')
+    >>> response = wr.opensearch.delete_index(
+    ...     client=client,
+    ...     index="sample-index1"
+    ...
) + + """ + response = client.indices.delete(index, ignore=[400, 404]) + if 'error' in response: + _logger.warning(response) + return response def index_json( diff --git a/tests/test_opensearch.py b/tests/test_opensearch.py index b3f29a88e..2bcc95fa5 100644 --- a/tests/test_opensearch.py +++ b/tests/test_opensearch.py @@ -56,6 +56,20 @@ def test_create_index(): print(response) +def test_delete_index(): + index = 'test_delete_index' + client = wr.opensearch.connect(host=OPENSEARCH_DOMAIN) + wr.opensearch.create_index( + client, + index=index + ) + response = wr.opensearch.delete_index( + client, + index=index + ) + print(response) + + def test_index_df(): client = wr.opensearch.connect(host=OPENSEARCH_DOMAIN) response = wr.opensearch.index_df(client, From 7d7318b8d6a04227bc4f16205e6aa614b44487ae Mon Sep 17 00:00:00 2001 From: Assaf Mentzer Date: Wed, 15 Sep 2021 11:08:01 -0400 Subject: [PATCH 08/41] [skip ci] add delete_index --- awswrangler/opensearch/_write.py | 1 - 1 file changed, 1 deletion(-) diff --git a/awswrangler/opensearch/_write.py b/awswrangler/opensearch/_write.py index 8f629fcbf..7fda11578 100644 --- a/awswrangler/opensearch/_write.py +++ b/awswrangler/opensearch/_write.py @@ -136,7 +136,6 @@ def delete_index( ------- Dict[str, Any] OpenSearch rest api response - https://opensearch.org/docs/opensearch/rest-api/create-index/#response. Examples -------- From 6b90c936cc7c5d4ad6ef604844f61da5eef85948 Mon Sep 17 00:00:00 2001 From: Assaf Mentzer Date: Wed, 15 Sep 2021 13:59:33 -0400 Subject: [PATCH 09/41] [skip ci] add index_json --- awswrangler/opensearch/_write.py | 41 +++++++++++++++++++++++++------- tests/test_opensearch.py | 35 +++++++++++++++++++++++++-- 2 files changed, 66 insertions(+), 10 deletions(-) diff --git a/awswrangler/opensearch/_write.py b/awswrangler/opensearch/_write.py index 7fda11578..5a606354f 100644 --- a/awswrangler/opensearch/_write.py +++ b/awswrangler/opensearch/_write.py @@ -2,9 +2,12 @@ import logging import uuid +import boto3 +import json from pathlib import Path from typing import Any, Dict, List, Mapping, Optional, Union, Tuple, Iterable -from ._utils import _get_distribution, _get_version_major +from awswrangler.opensearch._utils import _get_distribution, _get_version_major +from awswrangler._utils import parse_path import pandas as pd from elasticsearch import Elasticsearch @@ -44,6 +47,15 @@ def _df_doc_generator(df: pd.DataFrame): yield document +def _file_line_generator(path: str, is_json: bool = False): + with open(path) as fp: + for line in fp: + if is_json: + yield json.loads(line) + else: + yield line.strip() + + def create_index( client: Elasticsearch, index: str, @@ -160,7 +172,7 @@ def index_json( path: Union[str, Path], index: str, doc_type: Optional[str] = None, - bulk_params: Optional[Union[List[Any], Tuple[Any], Dict[Any, Any]]] = None, + boto3_session: Optional[boto3.Session] = boto3.Session(), **kwargs ) -> Dict[str, Any]: """Index all documents from JSON file to OpenSearch index. @@ -177,11 +189,9 @@ def index_json( Name of the index. doc_type : str, optional Name of the document type (only for Elasticsearch versions 5.x and earlier). - bulk_params : Union[List, Tuple, Dict], optional - List of parameters to pass to bulk operation. 
From 6b90c936cc7c5d4ad6ef604844f61da5eef85948 Mon Sep 17 00:00:00 2001
From: Assaf Mentzer
Date: Wed, 15 Sep 2021 13:59:33 -0400
Subject: [PATCH 09/41] [skip ci] add index_json

---
 awswrangler/opensearch/_write.py | 41 +++++++++++++++++++++++++-------
 tests/test_opensearch.py         | 35 +++++++++++++++++++++++++--
 2 files changed, 66 insertions(+), 10 deletions(-)

diff --git a/awswrangler/opensearch/_write.py b/awswrangler/opensearch/_write.py
index 7fda11578..5a606354f 100644
--- a/awswrangler/opensearch/_write.py
+++ b/awswrangler/opensearch/_write.py
@@ -2,9 +2,12 @@

 import logging
 import uuid
+import boto3
+import json
 from pathlib import Path
 from typing import Any, Dict, List, Mapping, Optional, Union, Tuple, Iterable
-from ._utils import _get_distribution, _get_version_major
+from awswrangler.opensearch._utils import _get_distribution, _get_version_major
+from awswrangler._utils import parse_path
 import pandas as pd

 from elasticsearch import Elasticsearch
 from elasticsearch.helpers import bulk
@@ -44,6 +47,15 @@ def _df_doc_generator(df: pd.DataFrame):
         yield document


+def _file_line_generator(path: str, is_json: bool = False):
+    with open(path) as fp:
+        for line in fp:
+            if is_json:
+                yield json.loads(line)
+            else:
+                yield line.strip()
+
+
 def create_index(
     client: Elasticsearch,
     index: str,
@@ -160,7 +172,7 @@ def index_json(
     path: Union[str, Path],
     index: str,
     doc_type: Optional[str] = None,
-    bulk_params: Optional[Union[List[Any], Tuple[Any], Dict[Any, Any]]] = None,
+    boto3_session: Optional[boto3.Session] = None,
     **kwargs
 ) -> Dict[str, Any]:
     """Index all documents from JSON file to OpenSearch index.
...
         Name of the index.
     doc_type : str, optional
         Name of the document type (only for Elasticsearch versions 5.x and earlier).
-    bulk_params : Union[List, Tuple, Dict], optional
-        List of parameters to pass to bulk operation.
-        References:
-        elasticsearch >= 7.10.2 / opensearch: https://opensearch.org/docs/opensearch/rest-api/document-apis/bulk/#url-parameters
-        elasticsearch < 7.10.2: https://opendistro.github.io/for-elasticsearch-docs/docs/elasticsearch/rest-api-reference/#url-parameters
+    boto3_session : boto3.Session(), optional
+        Boto3 Session to be used to access s3 if s3 path is provided.
+        The default boto3 Session will be used if boto3_session receives None.
     **kwargs :
         KEYWORD arguments forwarded to :func:`~awswrangler.opensearch.index_documents`
         which is used to execute the operation
...
     """
     # Loading data from file
-    pass  # TODO: load data from json file
+    if path.startswith("s3://"):
+        bucket, key = parse_path(path)
+        s3 = (boto3_session or boto3.Session()).client('s3')
+        obj = s3.get_object(Bucket=bucket, Key=key)
+        body = obj['Body'].read()
+        lines = body.splitlines()
+        documents = map(lambda x: json.loads(x), lines)
+    else:  # local path
+        documents = _file_line_generator(path, is_json=True)
+    return index_documents(
+        client=client,
+        documents=documents,
+        index=index,
+        doc_type=doc_type,
+        **kwargs
+    )


 def index_csv(
diff --git a/tests/test_opensearch.py b/tests/test_opensearch.py
index 2bcc95fa5..a4398a68b 100644
--- a/tests/test_opensearch.py
+++ b/tests/test_opensearch.py
@@ -2,7 +2,7 @@

 import boto3
 import pandas as pd
-
+import json
 import awswrangler as wr
@@ -12,7 +12,7 @@

 # TODO: create test_infra for opensearch
 OPENSEARCH_DOMAIN = 'search-es71-public-z63iyqxccc4ungar5vx45xwgfi.us-east-1.es.amazonaws.com'  # change to your domain
 OPENSEARCH_DOMAIN_FGAC = 'search-os1-public-urixc6vui2il7oawwiox2e57n4.us-east-1.es.amazonaws.com'
-
+BUCKET = 'mentzera'

 inspections_documents = [
 {"business_address":"315 California St","business_city":"San Francisco","business_id":"24936","business_latitude":"37.793199","business_location":{"lon": -122.400152,"lat": 37.793199},"business_longitude":"-122.400152","business_name":"San Francisco Soup Company","business_postal_code":"94104","business_state":"CA","inspection_date":"2016-06-09T00:00:00.000","inspection_id":"24936_20160609","inspection_score":77,"inspection_type":"Routine - Unscheduled","risk_category":"Low Risk","violation_description":"Improper food labeling or menu misrepresentation","violation_id":"24936_20160609_103141"},
@@ -162,3 +162,34 @@ def test_search_filter_path():

     print('')
     print(df.to_string())
+
+
+def test_index_json_local():
+    file_path = '/tmp/inspections.json'
+    client = wr.opensearch.connect(host=OPENSEARCH_DOMAIN)
+    with open(file_path, 'w') as filehandle:
+        for doc in inspections_documents:
+            filehandle.write('%s\n' % json.dumps(doc))
+    response = wr.opensearch.index_json(
+        client,
+        index='test_index_json_local',
+        path=file_path
+    )
+    print(response)
+
+
+def test_index_json_s3():
+    file_path = '/tmp/inspections.json'
+    s3_key = 'tmp/inspections.json'
+    client = wr.opensearch.connect(host=OPENSEARCH_DOMAIN)
+    with open(file_path, 'w') as filehandle:
+        for doc in inspections_documents:
+            filehandle.write('%s\n' % json.dumps(doc))
+    s3 = boto3.client('s3')
+    s3.upload_file(file_path, BUCKET, s3_key)
+    response = wr.opensearch.index_json(
+        client,
+        index='test_index_json_s3',
+        path=f's3://{BUCKET}/{s3_key}'
+    )
+    print(response)
\ No newline at end of file
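Editor's note: a short sketch of the JSON-Lines contract that index_json relies on (toy documents, hypothetical endpoint). Both branches assume one standalone JSON object per line - _file_line_generator parses line by line locally, and the S3 branch splits the object body with splitlines():

import json
import awswrangler as wr

docs = [{'id': 1, 'name': 'a'}, {'id': 2, 'name': 'b'}]
with open('/tmp/docs.json', 'w') as f:
    for doc in docs:
        f.write(json.dumps(doc) + '\n')  # one JSON object per line, no enclosing array

client = wr.opensearch.connect(host='my-test-domain.us-east-1.es.amazonaws.com')  # hypothetical endpoint
response = wr.opensearch.index_json(client, path='/tmp/docs.json', index='docs')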
From 73db6f51d9cac8c2aab675deabb9b146197cf8ff Mon Sep 17 00:00:00 2001
From: Assaf Mentzer
Date: Wed, 15 Sep 2021 17:13:18 -0400
Subject: [PATCH 10/41] [skip ci] add index_csv local path

---
 awswrangler/opensearch/_write.py | 43 +++++++++++++++++++++++++++++---
 tests/test_opensearch.py         | 15 +++++++++++
 2 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/awswrangler/opensearch/_write.py b/awswrangler/opensearch/_write.py
index 5a606354f..ad845c2b1 100644
--- a/awswrangler/opensearch/_write.py
+++ b/awswrangler/opensearch/_write.py
@@ -4,11 +4,13 @@

 import uuid
 import boto3
 import json
+import ast
 from pathlib import Path
 from typing import Any, Dict, List, Mapping, Optional, Union, Tuple, Iterable
 from awswrangler.opensearch._utils import _get_distribution, _get_version_major
 from awswrangler._utils import parse_path
 import pandas as pd
+from pandas import notna

 from elasticsearch import Elasticsearch
 from elasticsearch.helpers import bulk
@@ -42,9 +44,28 @@ def _actions_generator(documents: Union[Iterable[Dict[str, Any]], Iterable[Mappi

 def _df_doc_generator(df: pd.DataFrame):
+    def _deserialize(v):
+        if isinstance(v, str):
+            v = v.strip()
+            if (v.startswith('{') and v.endswith('}')
+                    or v.startswith('[') and v.endswith(']')):
+                try:
+                    v = json.loads(v)
+                except Exception as e:
+                    try:
+                        v = ast.literal_eval(v)  # if properties are enclosed with single quotes
+                    except Exception:
+                        _logger.warning(f'could not convert string to json: {v}')
+                        _logger.warning(e)
+        return v
+
     df_iter = df.iterrows()
     for i, document in df_iter:
-        yield document
+        yield {k: _deserialize(v) for k, v in document.items() if notna(v)}


 def _file_line_generator(path: str, is_json: bool = False):
@@ -239,7 +260,8 @@ def index_csv(
     path: Union[str, Path],
     index: str,
     doc_type: Optional[str] = None,
-    pandas_kwargs: Optional[Dict[str, Any]] = None
+    pandas_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs
 ) -> Dict[str, Any]:
     """Index all documents from a CSV file to OpenSearch index.
...
     pandas_kwargs : Dict[str, Any], optional
         Dictionary of arguments forwarded to pandas.read_csv().
         e.g. pandas_kwargs={'sep': '|', 'na_values': ['null', 'none'], 'skip_blank_lines': True}
         https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
+    **kwargs :
+        KEYWORD arguments forwarded to :func:`~awswrangler.opensearch.index_documents`
+        which is used to execute the operation
...
    >>> wr.opensearch.index_csv(
    ...     client,
    ...     path='docs.csv',
    ...     index='sample-index1',
    ...     pandas_kwargs={'sep': '|', 'na_values': ['null', 'none'], 'skip_blank_lines': True}
    ... )
    """
-    pass  # TODO: load data from csv file
+    custom_pandas_params = {
+        'skip_blank_lines': True,
+        'na_filter': True  # will generate NaN value for empty cells. We remove NaN keys in _df_doc_generator
+    }
+    pandas_kwargs = {**(pandas_kwargs or {}), **custom_pandas_params}
+    df = pd.read_csv(path, **pandas_kwargs)
+    return index_df(
+        client,
+        df=df,
+        index=index,
+        doc_type=doc_type,
+        **kwargs
+    )


 def index_df(
diff --git a/tests/test_opensearch.py b/tests/test_opensearch.py
index a4398a68b..40f83eae6 100644
--- a/tests/test_opensearch.py
+++ b/tests/test_opensearch.py
@@ -192,4 +192,19 @@ def test_index_json_s3():
         index='test_index_json_s3',
         path=f's3://{BUCKET}/{s3_key}'
     )
+    print(response)
+
+
+def test_index_csv_local():
+    file_path = '/tmp/inspections.csv'
+    index = 'test_index_csv_local'
+    df = pd.DataFrame(inspections_documents)
+    df.to_csv(file_path, index=False)
+    client = wr.opensearch.connect(OPENSEARCH_DOMAIN)
+    wr.opensearch.delete_index(client, index)
+    response = wr.opensearch.index_csv(
+        client,
+        path=file_path,
+        index=index
+    )
+    print(response)
\ No newline at end of file
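Editor's note: why the _deserialize step in patch 10 matters. pandas stringifies nested objects on the way to CSV, and the resulting dict repr uses single quotes, which is exactly why the code falls back from json.loads to ast.literal_eval. A self-contained illustration with made-up data:

import ast
import pandas as pd

df = pd.DataFrame([{'name': 'x', 'location': {'lat': 37.79, 'lon': -122.4}}])
df.to_csv('/tmp/x.csv', index=False)
cell = pd.read_csv('/tmp/x.csv')['location'][0]
print(cell)                    # "{'lat': 37.79, 'lon': -122.4}" - a string now, not a dict
# json.loads(cell) raises ValueError because of the single quotes,
# but ast.literal_eval recovers the original dict, mirroring _deserialize:
print(ast.literal_eval(cell))  # {'lat': 37.79, 'lon': -122.4}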
From 15d8aca40bc9e66df8fcb96d0b9d06c33d42db7d Mon Sep 17 00:00:00 2001
From: Assaf Mentzer
Date: Fri, 17 Sep 2021 12:38:03 -0400
Subject: [PATCH 11/41] [skip ci] add is_scroll to search (scan)

---
 awswrangler/opensearch/_read.py | 17 +++++++++++++----
 tests/test_opensearch.py        | 26 ++++++++++++++++++++++++--
 2 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/awswrangler/opensearch/_read.py b/awswrangler/opensearch/_read.py
index c20871713..0f82f5964 100644
--- a/awswrangler/opensearch/_read.py
+++ b/awswrangler/opensearch/_read.py
@@ -4,6 +4,7 @@

 from typing import Any, Dict, Optional
 from elasticsearch import Elasticsearch
 from elasticsearch.helpers import scan
+import pandas as pd


 def search(
@@ -32,7 +33,8 @@ def search(
        for example, for machine learning jobs.
        Because scroll search contexts consume a lot of memory, we suggest you don’t use the scroll operation
        for frequent user queries.
     **kwargs :
-        KEYWORD arguments forwarded to [elasticsearch.Elasticsearch.search](https://elasticsearch-py.readthedocs.io/en/v7.13.4/api.html#elasticsearch.Elasticsearch.search).
+        KEYWORD arguments forwarded to [elasticsearch.Elasticsearch.search](https://elasticsearch-py.readthedocs.io/en/v7.13.4/api.html#elasticsearch.Elasticsearch.search)
+        and also to [elasticsearch.helpers.scan](https://elasticsearch-py.readthedocs.io/en/master/helpers.html#scan) if `is_scroll=True`

     Returns
     -------
@@ -67,11 +69,18 @@ def search(
     if 'took' not in kwargs['filter_path']:
         kwargs['filter_path'].append('took')
     if is_scroll:
-        # TODO: write logic based on https://elasticsearch-py.readthedocs.io/en/master/helpers.html#scan
-        pass
+        documents_generator = scan(
+            client,
+            index=index,
+            query=search_body,
+            **kwargs
+        )
+        s = Select()
+        documents = map(lambda x: s.hit_to_row(x), documents_generator)
+        df = pd.DataFrame(documents)
     else:
         documents = client.search(index=index, body=search_body, **kwargs)
-    df = Select.from_dict(documents).to_pandas()
+        df = Select.from_dict(documents).to_pandas()
     return df
diff --git a/tests/test_opensearch.py b/tests/test_opensearch.py
index 40f83eae6..6152da7d8 100644
--- a/tests/test_opensearch.py
+++ b/tests/test_opensearch.py
@@ -119,7 +119,8 @@ def test_search():
     response = wr.opensearch.index_documents(client,
                                              documents=inspections_documents,
                                              index=index,
-                                             id_keys=['inspection_id']
+                                             id_keys=['inspection_id'],
+                                             refresh='wait_for'
                                              )
     df = wr.opensearch.search(
         client,
@@ -144,7 +145,8 @@ def test_search_filter_path():
     response = wr.opensearch.index_documents(client,
                                              documents=inspections_documents,
                                              index=index,
-                                             id_keys=['inspection_id']
+                                             id_keys=['inspection_id'],
+                                             refresh='wait_for'
                                              )
     df = wr.opensearch.search(
         client,
@@ -164,6 +166,26 @@ def test_search_filter_path():

     print('')
     print(df.to_string())
+
+
+def test_search_scroll():
+    index = 'test_search_scroll'
+    client = wr.opensearch.connect(host=OPENSEARCH_DOMAIN)
+    response = wr.opensearch.index_documents(client,
+                                             documents=inspections_documents,
+                                             index=index,
+                                             id_keys=['inspection_id'],
+                                             refresh='wait_for'
+                                             )
+    df = wr.opensearch.search(
+        client,
+        index=index,
+        is_scroll=True,
+        _source=['inspection_id', 'business_name', 'business_location']
+    )
+
+    print('')
+    print(df.to_string())
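Editor's note: the is_scroll branch above delegates to elasticsearch.helpers.scan, which keeps a server-side cursor and pages through hits instead of returning a single capped response. A hedged usage sketch (hypothetical endpoint and index; search_body as used in the function body):

import awswrangler as wr

client = wr.opensearch.connect(host='my-test-domain.us-east-1.es.amazonaws.com')  # hypothetical endpoint
df = wr.opensearch.search(
    client,
    index='my-index',
    search_body={'query': {'match_all': {}}},
    is_scroll=True,  # paginate via scan() rather than a single search() call
)
print(len(df))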
From e01b1a0f593a764b684bf88732ac9f1a7bf5d826 Mon Sep 17 00:00:00 2001
From: Assaf Mentzer
Date: Fri, 17 Sep 2021 13:50:51 -0400
Subject: [PATCH 12/41] [skip ci] add search_by_sql

---
 awswrangler/opensearch/_read.py | 36 ++++++++++++++++++++++++++++++---
 tests/test_opensearch.py        | 18 +++++++++++++++++
 2 files changed, 51 insertions(+), 3 deletions(-)

diff --git a/awswrangler/opensearch/_read.py b/awswrangler/opensearch/_read.py
index 0f82f5964..80ec509d5 100644
--- a/awswrangler/opensearch/_read.py
+++ b/awswrangler/opensearch/_read.py
@@ -4,6 +4,7 @@

 from typing import Any, Dict, Optional
 from elasticsearch import Elasticsearch
 from elasticsearch.helpers import scan
+from awswrangler.opensearch._utils import _get_distribution
 import pandas as pd
@@ -86,7 +87,8 @@

 def search_by_sql(
     client: Elasticsearch,
-    sql_query: str
+    sql_query: str,
+    **kwargs
 ) -> DataFrame:
     """Returns results matching [SQL query](https://opensearch.org/docs/search-plugins/sql/index/) as pandas dataframe

     Parameters
     ----------
     client : Elasticsearch
         instance of elasticsearch.Elasticsearch to use.
     sql_query : str
         SQL query
+    **kwargs :
+        KEYWORD arguments forwarded to request url (e.g.: filter_path, etc.)

     Returns
     -------
...
     """
-
-    # TODO: write logic
-    pass
+
+    # can be used if not passing format
+    def _sql_response_to_docs(response: Dict[str, Any]):
+        header = list(map(lambda x: x['name'], response.get('schema', [])))
+        for datarow in response.get('datarows', []):
+            yield dict(zip(header, datarow))
+
+    if _get_distribution(client) == 'opensearch':
+        url = '/_plugins/_sql'
+    else:
+        url = '/_opendistro/_sql'
+
+    kwargs['format'] = 'json'
+    body = {'query': sql_query}
+    for size_att in ['size', 'fetch_size']:
+        if size_att in kwargs:
+            body['fetch_size'] = kwargs[size_att]
+            del kwargs[size_att]  # unrecognized parameter
+    response = client.transport.perform_request(
+        "POST",
+        url,
+        headers={'Content-Type': 'application/json'},
+        body=body,
+        params=kwargs
+    )
+
+    df = Select.from_dict(response).to_pandas()
+    return df
diff --git a/tests/test_opensearch.py b/tests/test_opensearch.py
index 6152da7d8..28fdf5c5c 100644
--- a/tests/test_opensearch.py
+++ b/tests/test_opensearch.py
@@ -186,6 +186,24 @@ def test_search_scroll():
     print(df.to_string())


+def test_search_sql():
+    index = 'test_search_sql'
+    client = wr.opensearch.connect(host=OPENSEARCH_DOMAIN)
+    response = wr.opensearch.index_documents(client,
+                                             documents=inspections_documents,
+                                             index=index,
+                                             id_keys=['inspection_id'],
+                                             refresh='wait_for'
+                                             )
+    df = wr.opensearch.search_by_sql(
+        client,
+        sql_query=f'select * from {index}'
+    )
+
+    print('')
+    print(df.to_string())
+
+
 def test_index_json_local():

From 1e1fe3799a8472172746c9ee061f0af33133f0c4 Mon Sep 17 00:00:00 2001
From: Assaf Mentzer
Date: Mon, 27 Sep 2021 23:41:50 -0400
Subject: [PATCH 13/41] [skip ci] opensearch test infra

---
 test_infra/app.py                       |   9 +
 test_infra/poetry.lock                  | 670 +++++++++++++++---------
 test_infra/pyproject.toml               |   1 +
 test_infra/scripts/delete-opensearch.sh |   6 +
 test_infra/scripts/deploy-opensearch.sh |   7 +
 test_infra/stacks/opensearch_stack.py   | 113 ++++
 tests/_utils.py                         |   2 +-
 tests/test_opensearch.py                | 148 ++++--
 8 files changed, 657 insertions(+), 299 deletions(-)
 create mode 100755 test_infra/scripts/delete-opensearch.sh
 create mode 100755 test_infra/scripts/deploy-opensearch.sh
 create mode 100644 test_infra/stacks/opensearch_stack.py

diff --git a/test_infra/app.py b/test_infra/app.py
index 4e27aa261..b14c1fc81 100644
--- a/test_infra/app.py
+++ b/test_infra/app.py
@@ -2,6 +2,7 @@

 from aws_cdk import core as cdk
 from stacks.base_stack import BaseStack
 from stacks.databases_stack import DatabasesStack
+from stacks.opensearch_stack import OpenSearchStack

 app = cdk.App()
@@ -14,4 +15,12 @@

     base.get_key,
 )

+OpenSearchStack(
+    app,
+    "aws-data-wrangler-opensearch",
+    base.get_vpc,
+    base.get_bucket,
+    base.get_key,
+)
+
 app.synth()
diff --git a/test_infra/poetry.lock b/test_infra/poetry.lock
index f68d38031..c4e5df69b 100644
--- a/test_infra/poetry.lock
+++ b/test_infra/poetry.lock
@@ -1,496 +1,638 @@

 [[package]]
 name = "attrs"
-version = "20.3.0"
+version = "21.2.0"
 description = "Classes Without Boilerplate"
 category = "main"
 optional = false
-python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"

 [package.extras]
-dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface", "furo", "sphinx", "pre-commit"]
-docs = ["furo", "sphinx", "zope.interface"]
-tests = ["coverage[toml] (>=5.0.2)", "hypothesis",
"pympler", "pytest (>=4.3.0)", "six", "zope.interface"] -tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six"] +dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "furo", "sphinx", "sphinx-notfound-page", "pre-commit"] +docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"] +tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface"] +tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins"] [[package]] name = "aws-cdk.assets" -version = "1.115.0" +version = "1.124.0" description = "This module is deprecated. All types are now available under the core module" category = "main" optional = false python-versions = ">=3.6" [package.dependencies] -"aws-cdk.core" = "1.115.0" -"aws-cdk.cx-api" = "1.115.0" +"aws-cdk.core" = "1.124.0" +"aws-cdk.cx-api" = "1.124.0" constructs = ">=3.3.69,<4.0.0" -jsii = ">=1.31.0,<2.0.0" +jsii = ">=1.34.0,<2.0.0" publication = ">=0.0.3" [[package]] name = "aws-cdk.aws-applicationautoscaling" -version = "1.115.0" +version = "1.124.0" description = "The CDK Construct Library for AWS::ApplicationAutoScaling" category = "main" optional = false python-versions = ">=3.6" [package.dependencies] -"aws-cdk.aws-autoscaling-common" = "1.115.0" -"aws-cdk.aws-cloudwatch" = "1.115.0" -"aws-cdk.aws-iam" = "1.115.0" -"aws-cdk.core" = "1.115.0" +"aws-cdk.aws-autoscaling-common" = "1.124.0" +"aws-cdk.aws-cloudwatch" = "1.124.0" +"aws-cdk.aws-iam" = "1.124.0" +"aws-cdk.core" = "1.124.0" constructs = ">=3.3.69,<4.0.0" -jsii = ">=1.31.0,<2.0.0" +jsii = ">=1.34.0,<2.0.0" publication = ">=0.0.3" [[package]] name = "aws-cdk.aws-autoscaling-common" -version = "1.115.0" +version = "1.124.0" description = "Common implementation package for @aws-cdk/aws-autoscaling and @aws-cdk/aws-applicationautoscaling" category = "main" optional = false python-versions = ">=3.6" [package.dependencies] -"aws-cdk.aws-iam" = "1.115.0" -"aws-cdk.core" = "1.115.0" +"aws-cdk.aws-iam" = "1.124.0" +"aws-cdk.core" = "1.124.0" constructs = ">=3.3.69,<4.0.0" -jsii = ">=1.31.0,<2.0.0" +jsii = ">=1.34.0,<2.0.0" +publication = ">=0.0.3" + +[[package]] +name = "aws-cdk.aws-certificatemanager" +version = "1.124.0" +description = "The CDK Construct Library for AWS::CertificateManager" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +"aws-cdk.aws-cloudwatch" = "1.124.0" +"aws-cdk.aws-iam" = "1.124.0" +"aws-cdk.aws-lambda" = "1.124.0" +"aws-cdk.aws-route53" = "1.124.0" +"aws-cdk.core" = "1.124.0" +constructs = ">=3.3.69,<4.0.0" +jsii = ">=1.34.0,<2.0.0" +publication = ">=0.0.3" + +[[package]] +name = "aws-cdk.aws-cloudformation" +version = "1.124.0" +description = "The CDK Construct Library for AWS::CloudFormation" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +"aws-cdk.aws-iam" = "1.124.0" +"aws-cdk.aws-lambda" = "1.124.0" +"aws-cdk.aws-s3" = "1.124.0" +"aws-cdk.aws-sns" = "1.124.0" +"aws-cdk.core" = "1.124.0" +"aws-cdk.cx-api" = "1.124.0" +constructs = ">=3.3.69,<4.0.0" +jsii = ">=1.34.0,<2.0.0" publication = ">=0.0.3" [[package]] name = "aws-cdk.aws-cloudwatch" -version = "1.115.0" +version = "1.124.0" description = "The CDK Construct Library for AWS::CloudWatch" category = "main" optional = false python-versions = ">=3.6" [package.dependencies] 
-"aws-cdk.aws-iam" = "1.115.0" -"aws-cdk.core" = "1.115.0" +"aws-cdk.aws-iam" = "1.124.0" +"aws-cdk.core" = "1.124.0" constructs = ">=3.3.69,<4.0.0" -jsii = ">=1.31.0,<2.0.0" +jsii = ">=1.34.0,<2.0.0" publication = ">=0.0.3" [[package]] name = "aws-cdk.aws-codeguruprofiler" -version = "1.115.0" +version = "1.124.0" description = "The CDK Construct Library for AWS::CodeGuruProfiler" category = "main" optional = false python-versions = ">=3.6" [package.dependencies] -"aws-cdk.aws-iam" = "1.115.0" -"aws-cdk.core" = "1.115.0" +"aws-cdk.aws-iam" = "1.124.0" +"aws-cdk.core" = "1.124.0" constructs = ">=3.3.69,<4.0.0" -jsii = ">=1.31.0,<2.0.0" +jsii = ">=1.34.0,<2.0.0" +publication = ">=0.0.3" + +[[package]] +name = "aws-cdk.aws-codestarnotifications" +version = "1.124.0" +description = "The CDK Construct Library for AWS::CodeStarNotifications" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +"aws-cdk.core" = "1.124.0" +constructs = ">=3.3.69,<4.0.0" +jsii = ">=1.34.0,<2.0.0" publication = ">=0.0.3" [[package]] name = "aws-cdk.aws-ec2" -version = "1.115.0" +version = "1.124.0" description = "The CDK Construct Library for AWS::EC2" category = "main" optional = false python-versions = ">=3.6" [package.dependencies] -"aws-cdk.aws-cloudwatch" = "1.115.0" -"aws-cdk.aws-iam" = "1.115.0" -"aws-cdk.aws-kms" = "1.115.0" -"aws-cdk.aws-logs" = "1.115.0" -"aws-cdk.aws-s3" = "1.115.0" -"aws-cdk.aws-s3-assets" = "1.115.0" -"aws-cdk.aws-ssm" = "1.115.0" -"aws-cdk.cloud-assembly-schema" = "1.115.0" -"aws-cdk.core" = "1.115.0" -"aws-cdk.cx-api" = "1.115.0" -"aws-cdk.region-info" = "1.115.0" +"aws-cdk.aws-cloudwatch" = "1.124.0" +"aws-cdk.aws-iam" = "1.124.0" +"aws-cdk.aws-kms" = "1.124.0" +"aws-cdk.aws-logs" = "1.124.0" +"aws-cdk.aws-s3" = "1.124.0" +"aws-cdk.aws-s3-assets" = "1.124.0" +"aws-cdk.aws-ssm" = "1.124.0" +"aws-cdk.cloud-assembly-schema" = "1.124.0" +"aws-cdk.core" = "1.124.0" +"aws-cdk.cx-api" = "1.124.0" +"aws-cdk.region-info" = "1.124.0" constructs = ">=3.3.69,<4.0.0" -jsii = ">=1.31.0,<2.0.0" +jsii = ">=1.34.0,<2.0.0" publication = ">=0.0.3" [[package]] name = "aws-cdk.aws-ecr" -version = "1.115.0" +version = "1.124.0" description = "The CDK Construct Library for AWS::ECR" category = "main" optional = false python-versions = ">=3.6" [package.dependencies] -"aws-cdk.aws-events" = "1.115.0" -"aws-cdk.aws-iam" = "1.115.0" -"aws-cdk.core" = "1.115.0" +"aws-cdk.aws-events" = "1.124.0" +"aws-cdk.aws-iam" = "1.124.0" +"aws-cdk.core" = "1.124.0" constructs = ">=3.3.69,<4.0.0" -jsii = ">=1.31.0,<2.0.0" +jsii = ">=1.34.0,<2.0.0" publication = ">=0.0.3" [[package]] name = "aws-cdk.aws-ecr-assets" -version = "1.115.0" +version = "1.124.0" description = "Docker image assets deployed to ECR" category = "main" optional = false python-versions = ">=3.6" [package.dependencies] -"aws-cdk.assets" = "1.115.0" -"aws-cdk.aws-ecr" = "1.115.0" -"aws-cdk.aws-iam" = "1.115.0" -"aws-cdk.aws-s3" = "1.115.0" -"aws-cdk.core" = "1.115.0" -"aws-cdk.cx-api" = "1.115.0" +"aws-cdk.assets" = "1.124.0" +"aws-cdk.aws-ecr" = "1.124.0" +"aws-cdk.aws-iam" = "1.124.0" +"aws-cdk.aws-s3" = "1.124.0" +"aws-cdk.core" = "1.124.0" +"aws-cdk.cx-api" = "1.124.0" constructs = ">=3.3.69,<4.0.0" -jsii = ">=1.31.0,<2.0.0" +jsii = ">=1.34.0,<2.0.0" publication = ">=0.0.3" [[package]] name = "aws-cdk.aws-efs" -version = "1.115.0" +version = "1.124.0" description = "The CDK Construct Library for AWS::EFS" category = "main" optional = false python-versions = ">=3.6" [package.dependencies] -"aws-cdk.aws-ec2" = 
"1.115.0" -"aws-cdk.aws-iam" = "1.115.0" -"aws-cdk.aws-kms" = "1.115.0" -"aws-cdk.cloud-assembly-schema" = "1.115.0" -"aws-cdk.core" = "1.115.0" -"aws-cdk.cx-api" = "1.115.0" +"aws-cdk.aws-ec2" = "1.124.0" +"aws-cdk.aws-iam" = "1.124.0" +"aws-cdk.aws-kms" = "1.124.0" +"aws-cdk.cloud-assembly-schema" = "1.124.0" +"aws-cdk.core" = "1.124.0" +"aws-cdk.cx-api" = "1.124.0" constructs = ">=3.3.69,<4.0.0" -jsii = ">=1.31.0,<2.0.0" +jsii = ">=1.34.0,<2.0.0" publication = ">=0.0.3" [[package]] name = "aws-cdk.aws-events" -version = "1.115.0" +version = "1.124.0" description = "Amazon EventBridge Construct Library" category = "main" optional = false python-versions = ">=3.6" [package.dependencies] -"aws-cdk.aws-iam" = "1.115.0" -"aws-cdk.core" = "1.115.0" +"aws-cdk.aws-iam" = "1.124.0" +"aws-cdk.core" = "1.124.0" constructs = ">=3.3.69,<4.0.0" -jsii = ">=1.31.0,<2.0.0" +jsii = ">=1.34.0,<2.0.0" publication = ">=0.0.3" [[package]] name = "aws-cdk.aws-glue" -version = "1.115.0" +version = "1.124.0" description = "The CDK Construct Library for AWS::Glue" category = "main" optional = false python-versions = ">=3.6" [package.dependencies] -"aws-cdk.aws-ec2" = "1.115.0" -"aws-cdk.aws-iam" = "1.115.0" -"aws-cdk.aws-kms" = "1.115.0" -"aws-cdk.aws-s3" = "1.115.0" -"aws-cdk.core" = "1.115.0" +"aws-cdk.assets" = "1.124.0" +"aws-cdk.aws-cloudwatch" = "1.124.0" +"aws-cdk.aws-ec2" = "1.124.0" +"aws-cdk.aws-events" = "1.124.0" +"aws-cdk.aws-iam" = "1.124.0" +"aws-cdk.aws-kms" = "1.124.0" +"aws-cdk.aws-logs" = "1.124.0" +"aws-cdk.aws-s3" = "1.124.0" +"aws-cdk.aws-s3-assets" = "1.124.0" +"aws-cdk.core" = "1.124.0" constructs = ">=3.3.69,<4.0.0" -jsii = ">=1.31.0,<2.0.0" +jsii = ">=1.34.0,<2.0.0" publication = ">=0.0.3" [[package]] name = "aws-cdk.aws-iam" -version = "1.115.0" +version = "1.124.0" description = "CDK routines for easily assigning correct and minimal IAM permissions" category = "main" optional = false python-versions = ">=3.6" [package.dependencies] -"aws-cdk.core" = "1.115.0" -"aws-cdk.region-info" = "1.115.0" +"aws-cdk.core" = "1.124.0" +"aws-cdk.region-info" = "1.124.0" constructs = ">=3.3.69,<4.0.0" -jsii = ">=1.31.0,<2.0.0" +jsii = ">=1.34.0,<2.0.0" publication = ">=0.0.3" [[package]] name = "aws-cdk.aws-kms" -version = "1.115.0" +version = "1.124.0" description = "The CDK Construct Library for AWS::KMS" category = "main" optional = false python-versions = ">=3.6" [package.dependencies] -"aws-cdk.aws-iam" = "1.115.0" -"aws-cdk.core" = "1.115.0" -"aws-cdk.cx-api" = "1.115.0" +"aws-cdk.aws-iam" = "1.124.0" +"aws-cdk.cloud-assembly-schema" = "1.124.0" +"aws-cdk.core" = "1.124.0" +"aws-cdk.cx-api" = "1.124.0" constructs = ">=3.3.69,<4.0.0" -jsii = ">=1.31.0,<2.0.0" +jsii = ">=1.34.0,<2.0.0" publication = ">=0.0.3" [[package]] name = "aws-cdk.aws-lambda" -version = "1.115.0" +version = "1.124.0" description = "The CDK Construct Library for AWS::Lambda" category = "main" optional = false python-versions = ">=3.6" [package.dependencies] -"aws-cdk.aws-applicationautoscaling" = "1.115.0" -"aws-cdk.aws-cloudwatch" = "1.115.0" -"aws-cdk.aws-codeguruprofiler" = "1.115.0" -"aws-cdk.aws-ec2" = "1.115.0" -"aws-cdk.aws-ecr" = "1.115.0" -"aws-cdk.aws-ecr-assets" = "1.115.0" -"aws-cdk.aws-efs" = "1.115.0" -"aws-cdk.aws-events" = "1.115.0" -"aws-cdk.aws-iam" = "1.115.0" -"aws-cdk.aws-kms" = "1.115.0" -"aws-cdk.aws-logs" = "1.115.0" -"aws-cdk.aws-s3" = "1.115.0" -"aws-cdk.aws-s3-assets" = "1.115.0" -"aws-cdk.aws-signer" = "1.115.0" -"aws-cdk.aws-sqs" = "1.115.0" -"aws-cdk.core" = "1.115.0" -"aws-cdk.cx-api" = 
"1.115.0" +"aws-cdk.aws-applicationautoscaling" = "1.124.0" +"aws-cdk.aws-cloudwatch" = "1.124.0" +"aws-cdk.aws-codeguruprofiler" = "1.124.0" +"aws-cdk.aws-ec2" = "1.124.0" +"aws-cdk.aws-ecr" = "1.124.0" +"aws-cdk.aws-ecr-assets" = "1.124.0" +"aws-cdk.aws-efs" = "1.124.0" +"aws-cdk.aws-events" = "1.124.0" +"aws-cdk.aws-iam" = "1.124.0" +"aws-cdk.aws-kms" = "1.124.0" +"aws-cdk.aws-logs" = "1.124.0" +"aws-cdk.aws-s3" = "1.124.0" +"aws-cdk.aws-s3-assets" = "1.124.0" +"aws-cdk.aws-signer" = "1.124.0" +"aws-cdk.aws-sqs" = "1.124.0" +"aws-cdk.core" = "1.124.0" +"aws-cdk.cx-api" = "1.124.0" +"aws-cdk.region-info" = "1.124.0" constructs = ">=3.3.69,<4.0.0" -jsii = ">=1.31.0,<2.0.0" +jsii = ">=1.34.0,<2.0.0" publication = ">=0.0.3" [[package]] name = "aws-cdk.aws-logs" -version = "1.115.0" +version = "1.124.0" description = "The CDK Construct Library for AWS::Logs" category = "main" optional = false python-versions = ">=3.6" [package.dependencies] -"aws-cdk.aws-cloudwatch" = "1.115.0" -"aws-cdk.aws-iam" = "1.115.0" -"aws-cdk.aws-kms" = "1.115.0" -"aws-cdk.aws-s3-assets" = "1.115.0" -"aws-cdk.core" = "1.115.0" +"aws-cdk.aws-cloudwatch" = "1.124.0" +"aws-cdk.aws-iam" = "1.124.0" +"aws-cdk.aws-kms" = "1.124.0" +"aws-cdk.aws-s3-assets" = "1.124.0" +"aws-cdk.core" = "1.124.0" +constructs = ">=3.3.69,<4.0.0" +jsii = ">=1.34.0,<2.0.0" +publication = ">=0.0.3" + +[[package]] +name = "aws-cdk.aws-opensearchservice" +version = "1.124.0" +description = "The CDK Construct Library for AWS::OpenSearchService" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +"aws-cdk.aws-certificatemanager" = "1.124.0" +"aws-cdk.aws-cloudwatch" = "1.124.0" +"aws-cdk.aws-ec2" = "1.124.0" +"aws-cdk.aws-iam" = "1.124.0" +"aws-cdk.aws-kms" = "1.124.0" +"aws-cdk.aws-logs" = "1.124.0" +"aws-cdk.aws-route53" = "1.124.0" +"aws-cdk.aws-secretsmanager" = "1.124.0" +"aws-cdk.core" = "1.124.0" +"aws-cdk.custom-resources" = "1.124.0" constructs = ">=3.3.69,<4.0.0" -jsii = ">=1.31.0,<2.0.0" +jsii = ">=1.34.0,<2.0.0" publication = ">=0.0.3" [[package]] name = "aws-cdk.aws-rds" -version = "1.115.0" +version = "1.124.0" description = "The CDK Construct Library for AWS::RDS" category = "main" optional = false python-versions = ">=3.6" [package.dependencies] -"aws-cdk.aws-cloudwatch" = "1.115.0" -"aws-cdk.aws-ec2" = "1.115.0" -"aws-cdk.aws-events" = "1.115.0" -"aws-cdk.aws-iam" = "1.115.0" -"aws-cdk.aws-kms" = "1.115.0" -"aws-cdk.aws-logs" = "1.115.0" -"aws-cdk.aws-s3" = "1.115.0" -"aws-cdk.aws-secretsmanager" = "1.115.0" -"aws-cdk.core" = "1.115.0" -"aws-cdk.cx-api" = "1.115.0" +"aws-cdk.aws-cloudwatch" = "1.124.0" +"aws-cdk.aws-ec2" = "1.124.0" +"aws-cdk.aws-events" = "1.124.0" +"aws-cdk.aws-iam" = "1.124.0" +"aws-cdk.aws-kms" = "1.124.0" +"aws-cdk.aws-logs" = "1.124.0" +"aws-cdk.aws-s3" = "1.124.0" +"aws-cdk.aws-secretsmanager" = "1.124.0" +"aws-cdk.core" = "1.124.0" +"aws-cdk.cx-api" = "1.124.0" constructs = ">=3.3.69,<4.0.0" -jsii = ">=1.31.0,<2.0.0" +jsii = ">=1.34.0,<2.0.0" publication = ">=0.0.3" [[package]] name = "aws-cdk.aws-redshift" -version = "1.115.0" +version = "1.124.0" description = "The CDK Construct Library for AWS::Redshift" category = "main" optional = false python-versions = ">=3.6" [package.dependencies] -"aws-cdk.aws-ec2" = "1.115.0" -"aws-cdk.aws-iam" = "1.115.0" -"aws-cdk.aws-kms" = "1.115.0" -"aws-cdk.aws-s3" = "1.115.0" -"aws-cdk.aws-secretsmanager" = "1.115.0" -"aws-cdk.core" = "1.115.0" +"aws-cdk.aws-ec2" = "1.124.0" +"aws-cdk.aws-iam" = "1.124.0" +"aws-cdk.aws-kms" = 
"1.124.0" +"aws-cdk.aws-lambda" = "1.124.0" +"aws-cdk.aws-s3" = "1.124.0" +"aws-cdk.aws-secretsmanager" = "1.124.0" +"aws-cdk.core" = "1.124.0" +"aws-cdk.custom-resources" = "1.124.0" +constructs = ">=3.3.69,<4.0.0" +jsii = ">=1.34.0,<2.0.0" +publication = ">=0.0.3" + +[[package]] +name = "aws-cdk.aws-route53" +version = "1.124.0" +description = "The CDK Construct Library for AWS::Route53" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +"aws-cdk.aws-ec2" = "1.124.0" +"aws-cdk.aws-iam" = "1.124.0" +"aws-cdk.aws-logs" = "1.124.0" +"aws-cdk.cloud-assembly-schema" = "1.124.0" +"aws-cdk.core" = "1.124.0" +"aws-cdk.custom-resources" = "1.124.0" constructs = ">=3.3.69,<4.0.0" -jsii = ">=1.31.0,<2.0.0" +jsii = ">=1.34.0,<2.0.0" publication = ">=0.0.3" [[package]] name = "aws-cdk.aws-s3" -version = "1.115.0" +version = "1.124.0" description = "The CDK Construct Library for AWS::S3" category = "main" optional = false python-versions = ">=3.6" [package.dependencies] -"aws-cdk.aws-events" = "1.115.0" -"aws-cdk.aws-iam" = "1.115.0" -"aws-cdk.aws-kms" = "1.115.0" -"aws-cdk.core" = "1.115.0" -"aws-cdk.cx-api" = "1.115.0" +"aws-cdk.aws-events" = "1.124.0" +"aws-cdk.aws-iam" = "1.124.0" +"aws-cdk.aws-kms" = "1.124.0" +"aws-cdk.core" = "1.124.0" +"aws-cdk.cx-api" = "1.124.0" constructs = ">=3.3.69,<4.0.0" -jsii = ">=1.31.0,<2.0.0" +jsii = ">=1.34.0,<2.0.0" publication = ">=0.0.3" [[package]] name = "aws-cdk.aws-s3-assets" -version = "1.115.0" +version = "1.124.0" description = "Deploy local files and directories to S3" category = "main" optional = false python-versions = ">=3.6" [package.dependencies] -"aws-cdk.assets" = "1.115.0" -"aws-cdk.aws-iam" = "1.115.0" -"aws-cdk.aws-kms" = "1.115.0" -"aws-cdk.aws-s3" = "1.115.0" -"aws-cdk.core" = "1.115.0" -"aws-cdk.cx-api" = "1.115.0" +"aws-cdk.assets" = "1.124.0" +"aws-cdk.aws-iam" = "1.124.0" +"aws-cdk.aws-kms" = "1.124.0" +"aws-cdk.aws-s3" = "1.124.0" +"aws-cdk.core" = "1.124.0" +"aws-cdk.cx-api" = "1.124.0" constructs = ">=3.3.69,<4.0.0" -jsii = ">=1.31.0,<2.0.0" +jsii = ">=1.34.0,<2.0.0" publication = ">=0.0.3" [[package]] name = "aws-cdk.aws-sam" -version = "1.115.0" +version = "1.124.0" description = "The CDK Construct Library for the AWS Serverless Application Model (SAM) resources" category = "main" optional = false python-versions = ">=3.6" [package.dependencies] -"aws-cdk.core" = "1.115.0" +"aws-cdk.core" = "1.124.0" constructs = ">=3.3.69,<4.0.0" -jsii = ">=1.31.0,<2.0.0" +jsii = ">=1.34.0,<2.0.0" publication = ">=0.0.3" [[package]] name = "aws-cdk.aws-secretsmanager" -version = "1.115.0" +version = "1.124.0" description = "The CDK Construct Library for AWS::SecretsManager" category = "main" optional = false python-versions = ">=3.6" [package.dependencies] -"aws-cdk.aws-ec2" = "1.115.0" -"aws-cdk.aws-iam" = "1.115.0" -"aws-cdk.aws-kms" = "1.115.0" -"aws-cdk.aws-lambda" = "1.115.0" -"aws-cdk.aws-sam" = "1.115.0" -"aws-cdk.core" = "1.115.0" -"aws-cdk.cx-api" = "1.115.0" +"aws-cdk.aws-ec2" = "1.124.0" +"aws-cdk.aws-iam" = "1.124.0" +"aws-cdk.aws-kms" = "1.124.0" +"aws-cdk.aws-lambda" = "1.124.0" +"aws-cdk.aws-sam" = "1.124.0" +"aws-cdk.core" = "1.124.0" +"aws-cdk.cx-api" = "1.124.0" constructs = ">=3.3.69,<4.0.0" -jsii = ">=1.31.0,<2.0.0" +jsii = ">=1.34.0,<2.0.0" publication = ">=0.0.3" [[package]] name = "aws-cdk.aws-signer" -version = "1.115.0" +version = "1.124.0" description = "The CDK Construct Library for AWS::Signer" category = "main" optional = false python-versions = ">=3.6" [package.dependencies] 
-"aws-cdk.core" = "1.115.0" +"aws-cdk.core" = "1.124.0" constructs = ">=3.3.69,<4.0.0" -jsii = ">=1.31.0,<2.0.0" +jsii = ">=1.34.0,<2.0.0" +publication = ">=0.0.3" + +[[package]] +name = "aws-cdk.aws-sns" +version = "1.124.0" +description = "The CDK Construct Library for AWS::SNS" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +"aws-cdk.aws-cloudwatch" = "1.124.0" +"aws-cdk.aws-codestarnotifications" = "1.124.0" +"aws-cdk.aws-events" = "1.124.0" +"aws-cdk.aws-iam" = "1.124.0" +"aws-cdk.aws-kms" = "1.124.0" +"aws-cdk.aws-sqs" = "1.124.0" +"aws-cdk.core" = "1.124.0" +constructs = ">=3.3.69,<4.0.0" +jsii = ">=1.34.0,<2.0.0" publication = ">=0.0.3" [[package]] name = "aws-cdk.aws-sqs" -version = "1.115.0" +version = "1.124.0" description = "The CDK Construct Library for AWS::SQS" category = "main" optional = false python-versions = ">=3.6" [package.dependencies] -"aws-cdk.aws-cloudwatch" = "1.115.0" -"aws-cdk.aws-iam" = "1.115.0" -"aws-cdk.aws-kms" = "1.115.0" -"aws-cdk.core" = "1.115.0" +"aws-cdk.aws-cloudwatch" = "1.124.0" +"aws-cdk.aws-iam" = "1.124.0" +"aws-cdk.aws-kms" = "1.124.0" +"aws-cdk.core" = "1.124.0" constructs = ">=3.3.69,<4.0.0" -jsii = ">=1.31.0,<2.0.0" +jsii = ">=1.34.0,<2.0.0" publication = ">=0.0.3" [[package]] name = "aws-cdk.aws-ssm" -version = "1.115.0" +version = "1.124.0" description = "The CDK Construct Library for AWS::SSM" category = "main" optional = false python-versions = ">=3.6" [package.dependencies] -"aws-cdk.aws-iam" = "1.115.0" -"aws-cdk.aws-kms" = "1.115.0" -"aws-cdk.cloud-assembly-schema" = "1.115.0" -"aws-cdk.core" = "1.115.0" +"aws-cdk.aws-iam" = "1.124.0" +"aws-cdk.aws-kms" = "1.124.0" +"aws-cdk.cloud-assembly-schema" = "1.124.0" +"aws-cdk.core" = "1.124.0" constructs = ">=3.3.69,<4.0.0" -jsii = ">=1.31.0,<2.0.0" +jsii = ">=1.34.0,<2.0.0" publication = ">=0.0.3" [[package]] name = "aws-cdk.cloud-assembly-schema" -version = "1.115.0" +version = "1.124.0" description = "Cloud Assembly Schema" category = "main" optional = false python-versions = ">=3.6" [package.dependencies] -jsii = ">=1.31.0,<2.0.0" +jsii = ">=1.34.0,<2.0.0" publication = ">=0.0.3" [[package]] name = "aws-cdk.core" -version = "1.115.0" +version = "1.124.0" description = "AWS Cloud Development Kit Core Library" category = "main" optional = false python-versions = ">=3.6" [package.dependencies] -"aws-cdk.cloud-assembly-schema" = "1.115.0" -"aws-cdk.cx-api" = "1.115.0" -"aws-cdk.region-info" = "1.115.0" +"aws-cdk.cloud-assembly-schema" = "1.124.0" +"aws-cdk.cx-api" = "1.124.0" +"aws-cdk.region-info" = "1.124.0" constructs = ">=3.3.69,<4.0.0" -jsii = ">=1.31.0,<2.0.0" +jsii = ">=1.34.0,<2.0.0" +publication = ">=0.0.3" + +[[package]] +name = "aws-cdk.custom-resources" +version = "1.124.0" +description = "Constructs for implementing CDK custom resources" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +"aws-cdk.aws-cloudformation" = "1.124.0" +"aws-cdk.aws-ec2" = "1.124.0" +"aws-cdk.aws-iam" = "1.124.0" +"aws-cdk.aws-lambda" = "1.124.0" +"aws-cdk.aws-logs" = "1.124.0" +"aws-cdk.aws-sns" = "1.124.0" +"aws-cdk.core" = "1.124.0" +constructs = ">=3.3.69,<4.0.0" +jsii = ">=1.34.0,<2.0.0" publication = ">=0.0.3" [[package]] name = "aws-cdk.cx-api" -version = "1.115.0" +version = "1.124.0" description = "Cloud executable protocol" category = "main" optional = false python-versions = ">=3.6" [package.dependencies] -"aws-cdk.cloud-assembly-schema" = "1.115.0" -jsii = ">=1.31.0,<2.0.0" +"aws-cdk.cloud-assembly-schema" = 
"1.124.0" +jsii = ">=1.34.0,<2.0.0" publication = ">=0.0.3" [[package]] name = "aws-cdk.region-info" -version = "1.115.0" +version = "1.124.0" description = "AWS region information, such as service principal names" category = "main" optional = false python-versions = ">=3.6" [package.dependencies] -jsii = ">=1.31.0,<2.0.0" +jsii = ">=1.34.0,<2.0.0" publication = ">=0.0.3" [[package]] @@ -509,14 +651,14 @@ dev = ["bumpversion", "wheel", "watchdog", "flake8", "tox", "coverage", "sphinx" [[package]] name = "cattrs" -version = "1.6.0" +version = "1.8.0" description = "Composable complex class support for attrs and dataclasses." category = "main" optional = false python-versions = ">=3.7,<4.0" [package.dependencies] -attrs = "*" +attrs = ">=20" [[package]] name = "constructs" @@ -547,17 +689,17 @@ testing = ["pytest (>=4.6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytes [[package]] name = "jsii" -version = "1.32.0" +version = "1.34.0" description = "Python client for jsii runtime" category = "main" optional = false python-versions = "~=3.6" [package.dependencies] -attrs = ">=20.1,<21.0" +attrs = ">=21.2,<22.0" cattrs = [ {version = ">=1.0.0,<1.1.0", markers = "python_version < \"3.7\""}, - {version = ">=1.6.0,<1.7.0", markers = "python_version >= \"3.7\""}, + {version = ">=1.8.0,<1.9.0", markers = "python_version >= \"3.7\""}, ] importlib-resources = {version = "*", markers = "python_version < \"3.7\""} python-dateutil = "*" @@ -613,130 +755,158 @@ testing = ["pytest (>=4.6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytes [metadata] lock-version = "1.1" python-versions = ">=3.6.2, <3.10" -content-hash = "6f8430d31b5e3d08bb0393b4c93ca223cc9d49b55bb3045f95326770d74347ca" +content-hash = "7fe703d54794d69aab0dd6ad5b4017c43defbff76ed9a3fe10e243c422adfea6" [metadata.files] attrs = [ - {file = "attrs-20.3.0-py2.py3-none-any.whl", hash = "sha256:31b2eced602aa8423c2aea9c76a724617ed67cf9513173fd3a4f03e3a929c7e6"}, - {file = "attrs-20.3.0.tar.gz", hash = "sha256:832aa3cde19744e49938b91fea06d69ecb9e649c93ba974535d08ad92164f700"}, + {file = "attrs-21.2.0-py2.py3-none-any.whl", hash = "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1"}, + {file = "attrs-21.2.0.tar.gz", hash = "sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"}, ] "aws-cdk.assets" = [ - {file = "aws-cdk.assets-1.115.0.tar.gz", hash = "sha256:e3a569f900451f2f8429a2ad7cd059712f2903d24cbcaa023911f46362496d2d"}, - {file = "aws_cdk.assets-1.115.0-py3-none-any.whl", hash = "sha256:d7f62fdaf500980cbcb0cab82cd08cb7334683428cfb3c67c68f72371e29109f"}, + {file = "aws-cdk.assets-1.124.0.tar.gz", hash = "sha256:8097177806b29824a69bbdb5df9ec74f7b360708b51ed860613d38e30414054a"}, + {file = "aws_cdk.assets-1.124.0-py3-none-any.whl", hash = "sha256:c94b63e36c094111c6a9abb2a9d6c694f3e123034cf5dc23e5293fdc32c44fb3"}, ] "aws-cdk.aws-applicationautoscaling" = [ - {file = "aws-cdk.aws-applicationautoscaling-1.115.0.tar.gz", hash = "sha256:e174b3247252bfec419389b896267516d2f874ec56456880116f79204ae9e3e5"}, - {file = "aws_cdk.aws_applicationautoscaling-1.115.0-py3-none-any.whl", hash = "sha256:45eff7fb107924b6ade243e88edae49f14a599ff3afcaf40a73969c45de733b5"}, + {file = "aws-cdk.aws-applicationautoscaling-1.124.0.tar.gz", hash = "sha256:c3bc89c2754b7ce029c667be9ab1633884bf574d33773a1dc07a3cff1b698670"}, + {file = "aws_cdk.aws_applicationautoscaling-1.124.0-py3-none-any.whl", hash = "sha256:d0dcc91b3de13ad46b874813877af3746adec3ad9f7380b2408a14cdd848b65c"}, ] "aws-cdk.aws-autoscaling-common" = [ - {file = 
"aws-cdk.aws-autoscaling-common-1.115.0.tar.gz", hash = "sha256:b87c84d3e558b20e3bea515d89cb59d633d71e2c8a6e4e859a691f3c06d45c10"}, - {file = "aws_cdk.aws_autoscaling_common-1.115.0-py3-none-any.whl", hash = "sha256:bc0e56fe4fedd6e5a0d094845c4e1b2681bf60dfb72f2062392ef7edd5b157bd"}, + {file = "aws-cdk.aws-autoscaling-common-1.124.0.tar.gz", hash = "sha256:03f57fcd34d9e370c0929de63c674bdbf2a8fbe2efed40942e0e2bff1ed1d436"}, + {file = "aws_cdk.aws_autoscaling_common-1.124.0-py3-none-any.whl", hash = "sha256:1969320c12bf4107346233b3310464c1e752b65a6577c865abb809711cec2c1f"}, +] +"aws-cdk.aws-certificatemanager" = [ + {file = "aws-cdk.aws-certificatemanager-1.124.0.tar.gz", hash = "sha256:291e7c29aa406619276dc141a3827b0af15c9a997b6e7dc1a8c59bbfb3aa7df7"}, + {file = "aws_cdk.aws_certificatemanager-1.124.0-py3-none-any.whl", hash = "sha256:23071000fe931dd817638b059991872fe93a91a1c1d33750f080c536e9aaf302"}, +] +"aws-cdk.aws-cloudformation" = [ + {file = "aws-cdk.aws-cloudformation-1.124.0.tar.gz", hash = "sha256:c38efe614113c3bdcb964f6c20742994154392bc78e82c34a299d0f1b26a7c65"}, + {file = "aws_cdk.aws_cloudformation-1.124.0-py3-none-any.whl", hash = "sha256:9b530359f567555b83dfbb99f7112fdb2ad893176032ff542ce09f7454ce5107"}, ] "aws-cdk.aws-cloudwatch" = [ - {file = "aws-cdk.aws-cloudwatch-1.115.0.tar.gz", hash = "sha256:adb27916047303bf5748d503dc608041d30ea002b47c4e2c370d2084c1bec8c4"}, - {file = "aws_cdk.aws_cloudwatch-1.115.0-py3-none-any.whl", hash = "sha256:2b6b5e954f0b2a629d977cb6db93ec38e2c3c6dde43d88369dbc7a64c92d1ce1"}, + {file = "aws-cdk.aws-cloudwatch-1.124.0.tar.gz", hash = "sha256:221734f8b6f940068714fe00fd68a8a32d767c713b2adb874365482836248f7f"}, + {file = "aws_cdk.aws_cloudwatch-1.124.0-py3-none-any.whl", hash = "sha256:a9a4abf58e31cb53872601296b41cf8e8d5106807a5775d19a6ac05fbe34bef0"}, ] "aws-cdk.aws-codeguruprofiler" = [ - {file = "aws-cdk.aws-codeguruprofiler-1.115.0.tar.gz", hash = "sha256:bd8954511616b1ae8e6bd88122de5cb94c7d16b79f051452b490af9ec729124d"}, - {file = "aws_cdk.aws_codeguruprofiler-1.115.0-py3-none-any.whl", hash = "sha256:48d6a7ea1a372e3e1dbdb0307c7665ba486ef58b80d1d2ebb56cabb03b40af80"}, + {file = "aws-cdk.aws-codeguruprofiler-1.124.0.tar.gz", hash = "sha256:e37cd801e5b7fa93a0dba84effc36cd94f090b83988c4f165815ba585f7ca866"}, + {file = "aws_cdk.aws_codeguruprofiler-1.124.0-py3-none-any.whl", hash = "sha256:4d4bd49ea2415d9daf7c3c57403060802e5f523bd476a276f1e00a3e3d73c15d"}, +] +"aws-cdk.aws-codestarnotifications" = [ + {file = "aws-cdk.aws-codestarnotifications-1.124.0.tar.gz", hash = "sha256:478486be7e24e455c1fd8a54489de491005997b6ebdc06212a6231e89471414a"}, + {file = "aws_cdk.aws_codestarnotifications-1.124.0-py3-none-any.whl", hash = "sha256:de73fbcceba282ddf3caf5e74b188e4685108cec845f573986ea3fec1c98beba"}, ] "aws-cdk.aws-ec2" = [ - {file = "aws-cdk.aws-ec2-1.115.0.tar.gz", hash = "sha256:e819f98e07d3ee24182f23d435bf164ca7bdfdd42e72305d975b2c75a5a57138"}, - {file = "aws_cdk.aws_ec2-1.115.0-py3-none-any.whl", hash = "sha256:0475af1a07e514136004870c590dd5b187dd4588eb291da4662ed2d7cf5956c7"}, + {file = "aws-cdk.aws-ec2-1.124.0.tar.gz", hash = "sha256:f7515734cac0ef8eeaa003bef85364c878fad4a90876de313d156cc863199811"}, + {file = "aws_cdk.aws_ec2-1.124.0-py3-none-any.whl", hash = "sha256:d000d22d87d887dfbc61b82be897234fc58f421b2fbbbc29f002b683b4fdac4f"}, ] "aws-cdk.aws-ecr" = [ - {file = "aws-cdk.aws-ecr-1.115.0.tar.gz", hash = "sha256:3083470a95283a95275e1f2ad30868f3591d0a5bf432cf4bab360dabe4cb2e29"}, - {file = "aws_cdk.aws_ecr-1.115.0-py3-none-any.whl", hash = 
"sha256:695842b3b892b404c3219d8b44b9ad7a8bf1fd1957abb97c618dba47e050108b"}, + {file = "aws-cdk.aws-ecr-1.124.0.tar.gz", hash = "sha256:cbf940fbb76eb189143df45f67115673faf10a4b8e7f571660822604c9016aad"}, + {file = "aws_cdk.aws_ecr-1.124.0-py3-none-any.whl", hash = "sha256:1661c6f8fd618ac75da7cdefd36adda747218e4fe27faa44b5df62ecabd0b3f3"}, ] "aws-cdk.aws-ecr-assets" = [ - {file = "aws-cdk.aws-ecr-assets-1.115.0.tar.gz", hash = "sha256:5450bbcebb89eff84327246c6049a90adefe73ed194bd62778ffeee6facf9042"}, - {file = "aws_cdk.aws_ecr_assets-1.115.0-py3-none-any.whl", hash = "sha256:8e7e5b2351370b795b12abd0812a3ace241cc46df8d67aecb92410de2bfd7318"}, + {file = "aws-cdk.aws-ecr-assets-1.124.0.tar.gz", hash = "sha256:b2401b111474413436e664c1652d02d6e053ca946cbbe224a4f9c3c6220005df"}, + {file = "aws_cdk.aws_ecr_assets-1.124.0-py3-none-any.whl", hash = "sha256:7dc6b6f262baffa37df3ed898d8ae74ef2384793be822a91b91159cb512183ff"}, ] "aws-cdk.aws-efs" = [ - {file = "aws-cdk.aws-efs-1.115.0.tar.gz", hash = "sha256:eb96d01635283dbee1101fe57e0a19310974c8de02f75d9042adbab44139fe65"}, - {file = "aws_cdk.aws_efs-1.115.0-py3-none-any.whl", hash = "sha256:8e9e3f0f837e1ff3cfe96da5d700095f24d132c11cc7544f7a9f20024fa27372"}, + {file = "aws-cdk.aws-efs-1.124.0.tar.gz", hash = "sha256:90aaccea5ff55ae4a3045540f78e007c048709e142d77947aa15ad655ed4c011"}, + {file = "aws_cdk.aws_efs-1.124.0-py3-none-any.whl", hash = "sha256:282db0bd269535fb19f0101d4fa6b9cb7cf7dcddf2eaf5d04d7f03fef156c9d0"}, ] "aws-cdk.aws-events" = [ - {file = "aws-cdk.aws-events-1.115.0.tar.gz", hash = "sha256:4ce7f0e894c61849e8157a0170cb74ec5223d18dc613075912f2ef560974856b"}, - {file = "aws_cdk.aws_events-1.115.0-py3-none-any.whl", hash = "sha256:a817f0f46c027163a30eb5bab254540e00f5e5285bb1e8678dfd724f8f1187c0"}, + {file = "aws-cdk.aws-events-1.124.0.tar.gz", hash = "sha256:0b6b5ffca233c0b5d7abaf011072ca896463ce391242ffdf7bf4def28dec8213"}, + {file = "aws_cdk.aws_events-1.124.0-py3-none-any.whl", hash = "sha256:92ba680941365de0f90ad7881b8c2e787c50b85a69bc32e82b4578a3276f810f"}, ] "aws-cdk.aws-glue" = [ - {file = "aws-cdk.aws-glue-1.115.0.tar.gz", hash = "sha256:a85d344e61cfb3e0953665bcd85fd4b7ac282417fe7099e2c54cc393f62bfa99"}, - {file = "aws_cdk.aws_glue-1.115.0-py3-none-any.whl", hash = "sha256:ca2780bf366ab2ba74adb98b6a49c95ee6e5dbde2bc5758657cb5d4197c996ce"}, + {file = "aws-cdk.aws-glue-1.124.0.tar.gz", hash = "sha256:b43f747a2b8480ca848f7ab27b1dd0c7e352c9602fdb039cfc78f5013dbef450"}, + {file = "aws_cdk.aws_glue-1.124.0-py3-none-any.whl", hash = "sha256:d90bc85ae0d6b03536879d6fa72cdc49cfe1d58451b9e0065786b682dc2f9422"}, ] "aws-cdk.aws-iam" = [ - {file = "aws-cdk.aws-iam-1.115.0.tar.gz", hash = "sha256:fe4e3138d6544755cbeb2400fd770b583b01906443648a4588085de2e781707f"}, - {file = "aws_cdk.aws_iam-1.115.0-py3-none-any.whl", hash = "sha256:7ba923894c6ecce33147527dccbf90fdaecc7a5561b2ca9398623f1f063f898c"}, + {file = "aws-cdk.aws-iam-1.124.0.tar.gz", hash = "sha256:9d779439048832c6f4d5722196a9490d80bb649e56bb4dadc554ea3ae940f797"}, + {file = "aws_cdk.aws_iam-1.124.0-py3-none-any.whl", hash = "sha256:249fc537532f73c3cd3f59dc635be58535d9e9f9418062214eb664e14b59a6be"}, ] "aws-cdk.aws-kms" = [ - {file = "aws-cdk.aws-kms-1.115.0.tar.gz", hash = "sha256:1d1feca56bc4c2de722f59a07ee8dc36b6d7a31d70ffe32de5f76c099b2b6322"}, - {file = "aws_cdk.aws_kms-1.115.0-py3-none-any.whl", hash = "sha256:c692b0cebe2b0106ddc0ec3946a895941176b35411d46b27ae9bfb06cdaa9d6d"}, + {file = "aws-cdk.aws-kms-1.124.0.tar.gz", hash = 
"sha256:205e79bc8f8e009bd1b5df236f0336e977eb141c70575a42080e36829358215f"}, + {file = "aws_cdk.aws_kms-1.124.0-py3-none-any.whl", hash = "sha256:91294f10f02000743eef712da5ba7ea2749b43e4a0ad7d4715c9c95b6a472c10"}, ] "aws-cdk.aws-lambda" = [ - {file = "aws-cdk.aws-lambda-1.115.0.tar.gz", hash = "sha256:11eec3652671f37d261f991eaf963726fed281c5aafe77e9f83afab899398892"}, - {file = "aws_cdk.aws_lambda-1.115.0-py3-none-any.whl", hash = "sha256:65000012469a64096d25614c23e22da74a3d15234925cf44b29fd3d63d21b993"}, + {file = "aws-cdk.aws-lambda-1.124.0.tar.gz", hash = "sha256:801552637c408a693a7b13967da4ec4e8a623f22b90fb0fdfb845c23765e4e29"}, + {file = "aws_cdk.aws_lambda-1.124.0-py3-none-any.whl", hash = "sha256:50d774d026a8a0ca5089df5c8b2c7cc2ef74db2a4b20c5d049210b154d3af03d"}, ] "aws-cdk.aws-logs" = [ - {file = "aws-cdk.aws-logs-1.115.0.tar.gz", hash = "sha256:de30016914a17ca59d55f36029aa10fdc800f8fa69f4a5de822898aebbb29a78"}, - {file = "aws_cdk.aws_logs-1.115.0-py3-none-any.whl", hash = "sha256:8c6adcf54e066a71a6a7031a8592f52f09a01ca0d6a6d1f51080f9996ad7ac52"}, + {file = "aws-cdk.aws-logs-1.124.0.tar.gz", hash = "sha256:2fba565fc4f12b397bd9df1cd9964c1b35ce1ca65cd618407b6b1777bc43d292"}, + {file = "aws_cdk.aws_logs-1.124.0-py3-none-any.whl", hash = "sha256:1f4b1ff436f2d0663e6c76264d7d6ee9dd0d90f3d9c09e5e93f1b0f31abbc379"}, +] +"aws-cdk.aws-opensearchservice" = [ + {file = "aws-cdk.aws-opensearchservice-1.124.0.tar.gz", hash = "sha256:d1bd4ca9ac9cf38b7c04a5e1e63eefe30e6e5e40adc0134e61d468694c71c4b1"}, + {file = "aws_cdk.aws_opensearchservice-1.124.0-py3-none-any.whl", hash = "sha256:170417a55884ac8f26b0ae4cc59c085c8c2a0607b18ca906c1ee4d366b737d85"}, ] "aws-cdk.aws-rds" = [ - {file = "aws-cdk.aws-rds-1.115.0.tar.gz", hash = "sha256:c562843534494ef283474ebd7bba4e44e0b7cb063c0121e20f08ba49749a2a60"}, - {file = "aws_cdk.aws_rds-1.115.0-py3-none-any.whl", hash = "sha256:7c00e329b6455b4279ad9880c2e033509b27be63b31626413f28558ae8d24a7f"}, + {file = "aws-cdk.aws-rds-1.124.0.tar.gz", hash = "sha256:20057fc95cda55fc504987dc0395062836dacc72efce2c86051677a1bb6d8d43"}, + {file = "aws_cdk.aws_rds-1.124.0-py3-none-any.whl", hash = "sha256:bd66c0f76548cee6fb1f100f0e36ab9d5933ef70121b072ae05b3dd26e408ff3"}, ] "aws-cdk.aws-redshift" = [ - {file = "aws-cdk.aws-redshift-1.115.0.tar.gz", hash = "sha256:758e6e940e7a432d46d144ebf8002af51fbe98d452221725510f01488847f9a3"}, - {file = "aws_cdk.aws_redshift-1.115.0-py3-none-any.whl", hash = "sha256:311dcb36814434214917ad707689a210016ce1d6286c69d44ec01f5df27a3c7d"}, + {file = "aws-cdk.aws-redshift-1.124.0.tar.gz", hash = "sha256:70cb4700cdfecad592524cd017a4a859b3d4ae407b3d2fcf329022c1d2faf863"}, + {file = "aws_cdk.aws_redshift-1.124.0-py3-none-any.whl", hash = "sha256:4df5c19f74194fb9bd7a56e5b89b9312c35b681a322b0c1b0e248874f628ddc4"}, +] +"aws-cdk.aws-route53" = [ + {file = "aws-cdk.aws-route53-1.124.0.tar.gz", hash = "sha256:c5137b3c5211632b931d7b79234aec6006f72701c68477086e70c213320639ef"}, + {file = "aws_cdk.aws_route53-1.124.0-py3-none-any.whl", hash = "sha256:97fe84e53c26c1a713a3b57341c2ecf488db56cc0b6127975656c53206ccd471"}, ] "aws-cdk.aws-s3" = [ - {file = "aws-cdk.aws-s3-1.115.0.tar.gz", hash = "sha256:73d72900194b944435056faf42c0df21ca7f6a0f941e0bc8d5cdf3de4c0261e9"}, - {file = "aws_cdk.aws_s3-1.115.0-py3-none-any.whl", hash = "sha256:81f85f3c107f05012a351260640a1bb1911106addbd26f2dd2c22d8c44122053"}, + {file = "aws-cdk.aws-s3-1.124.0.tar.gz", hash = "sha256:3047305a4e013cb796532027c14908003ffe7af95fe8e214e3470a32a11c09e6"}, + {file = 
"aws_cdk.aws_s3-1.124.0-py3-none-any.whl", hash = "sha256:0b08821e3b79c26110068f54aabdb938da55b562dcf2a28a7171d930334ce71a"}, ] "aws-cdk.aws-s3-assets" = [ - {file = "aws-cdk.aws-s3-assets-1.115.0.tar.gz", hash = "sha256:4aa793512b08d73f0bacb71f72f607a510672d077216cdd1ac307c65bd0751ae"}, - {file = "aws_cdk.aws_s3_assets-1.115.0-py3-none-any.whl", hash = "sha256:0bb1eea914908a5fc69a505b118e89f7d3097bce309126167b738a0aefd98ec6"}, + {file = "aws-cdk.aws-s3-assets-1.124.0.tar.gz", hash = "sha256:568d4c598319e3bf1869536be0586b1004d3c43c2133ba94bf9cda4ad4ae5d5d"}, + {file = "aws_cdk.aws_s3_assets-1.124.0-py3-none-any.whl", hash = "sha256:125c5e3786f2c233512374080553b2a7592efa6a53203764979a1bb987c47338"}, ] "aws-cdk.aws-sam" = [ - {file = "aws-cdk.aws-sam-1.115.0.tar.gz", hash = "sha256:babca8a6fbf68a32ebf6f1fd54f6a7bc506d60dae007fd6e4b06f1637edd42fd"}, - {file = "aws_cdk.aws_sam-1.115.0-py3-none-any.whl", hash = "sha256:ece50ab527eb1e5f84f6de2ad503e7cd61a2351dfcb6446274f8099ffabfcfc5"}, + {file = "aws-cdk.aws-sam-1.124.0.tar.gz", hash = "sha256:39db01a4d88fd05c57dbc4f0c76c2471eab3e75753febc30f2847c546fa8292b"}, + {file = "aws_cdk.aws_sam-1.124.0-py3-none-any.whl", hash = "sha256:b1ca75d2fb13898ed66cd4ee364cfa0b4f0924ab4583994ec4a7200d10c8c71b"}, ] "aws-cdk.aws-secretsmanager" = [ - {file = "aws-cdk.aws-secretsmanager-1.115.0.tar.gz", hash = "sha256:6de8204e4bbcbe8df8852646933c1d8d8cb1332374baee9fe780bd2b413e2423"}, - {file = "aws_cdk.aws_secretsmanager-1.115.0-py3-none-any.whl", hash = "sha256:0acf55659f67ac43c69be9a17e40e382d6122abc8055f092332723e07db15fd9"}, + {file = "aws-cdk.aws-secretsmanager-1.124.0.tar.gz", hash = "sha256:76d3ded9f20d29520d4e54e15c335718cac4f938aacb4827a2a9f98af417576f"}, + {file = "aws_cdk.aws_secretsmanager-1.124.0-py3-none-any.whl", hash = "sha256:0b6ae44966600943eb66fc48a93a0ae2bac60c8d6a5ff9c687ad9675b9f2bc5f"}, ] "aws-cdk.aws-signer" = [ - {file = "aws-cdk.aws-signer-1.115.0.tar.gz", hash = "sha256:9050e46e059edcde6b8e1d80b0d792eb2b4ad36cc00ce0b284d04a15b019b216"}, - {file = "aws_cdk.aws_signer-1.115.0-py3-none-any.whl", hash = "sha256:3b4b920dd5c8873bb0b60c0d2ae340fad434e7f011296f465d482afc094b25da"}, + {file = "aws-cdk.aws-signer-1.124.0.tar.gz", hash = "sha256:96dd4ae63b43c7c12fde59f7ebbbea1895964a5f08c6e2ca4a2a1062abcc2399"}, + {file = "aws_cdk.aws_signer-1.124.0-py3-none-any.whl", hash = "sha256:2fe614e6ce1ea6259d60f3adced41eaefdeace0cf77d961b5fcef815e1f82428"}, +] +"aws-cdk.aws-sns" = [ + {file = "aws-cdk.aws-sns-1.124.0.tar.gz", hash = "sha256:21e838c52cdd9bdcd98fc0fbe16ffad2bf10ba6bf31c5bfcdd9f49a8b3479d0c"}, + {file = "aws_cdk.aws_sns-1.124.0-py3-none-any.whl", hash = "sha256:cb3820fd79643d1c5fb0b69f2b4755900dd16756af0f4c36706d68220a845d8b"}, ] "aws-cdk.aws-sqs" = [ - {file = "aws-cdk.aws-sqs-1.115.0.tar.gz", hash = "sha256:b24e03f0027fd99c6cdfe604e3a2b3d0d203d616dffafc74f74f6715083e2b08"}, - {file = "aws_cdk.aws_sqs-1.115.0-py3-none-any.whl", hash = "sha256:cda589452cb4a6db584050e50f14fbe11757fb0b3aff63f50ae663fad5b7bf27"}, + {file = "aws-cdk.aws-sqs-1.124.0.tar.gz", hash = "sha256:ffed4754784de29473f554e450c6ec1b96c7508a2706406fe8d6442f2a31c58c"}, + {file = "aws_cdk.aws_sqs-1.124.0-py3-none-any.whl", hash = "sha256:382721ca5d82dce9ec2625e5bae26132151748ee60e1269a0aa91cfd03227ee7"}, ] "aws-cdk.aws-ssm" = [ - {file = "aws-cdk.aws-ssm-1.115.0.tar.gz", hash = "sha256:960330865ee74485cab510ba1cac5d8d4578e777f1a421b14e8a20895bbe5ac5"}, - {file = "aws_cdk.aws_ssm-1.115.0-py3-none-any.whl", hash = 
"sha256:4431c43667b57fe2883a9ef022b277cbd3b62f6ab13cb0b1221513f7f76f2aac"}, + {file = "aws-cdk.aws-ssm-1.124.0.tar.gz", hash = "sha256:bcfc99a5cdf23849503c72d93b9e5734d11976453004f13ebca2a66aeb3df10c"}, + {file = "aws_cdk.aws_ssm-1.124.0-py3-none-any.whl", hash = "sha256:4d7335c2ce0200c1ed347422139c9d9b07c71297253ba911470114277996cc76"}, ] "aws-cdk.cloud-assembly-schema" = [ - {file = "aws-cdk.cloud-assembly-schema-1.115.0.tar.gz", hash = "sha256:d565a8418e0cc05d3471dd48424477528d72bdd7d17adc9a049068559666a3ae"}, - {file = "aws_cdk.cloud_assembly_schema-1.115.0-py3-none-any.whl", hash = "sha256:0686e6f7e5da48dbd2ff724953d51eb0495b6772bdb17400024bb42e6fe05baf"}, + {file = "aws-cdk.cloud-assembly-schema-1.124.0.tar.gz", hash = "sha256:d2989a6742ad988fa0f7085ab67fb7ced14f4c3b1a98cc0bf4a0ea1a9358667c"}, + {file = "aws_cdk.cloud_assembly_schema-1.124.0-py3-none-any.whl", hash = "sha256:77d3f63629b7213c639ffd4c46eb63ce9dd048e9a91a045afa72dcce9576ee6b"}, ] "aws-cdk.core" = [ - {file = "aws-cdk.core-1.115.0.tar.gz", hash = "sha256:42a691cc183219ce76eb58e17507edf768a0f5eca0ea98661b4b1f16f178b90d"}, - {file = "aws_cdk.core-1.115.0-py3-none-any.whl", hash = "sha256:93a8e3d87f79af75866bf3f1cfc702dd5664526ec0f70a1c5f7ade82cb1536b1"}, + {file = "aws-cdk.core-1.124.0.tar.gz", hash = "sha256:bbdc1cf5affc34d0caa549771dc6b41ce467744f8ca727b215f0d89b853f4f0c"}, + {file = "aws_cdk.core-1.124.0-py3-none-any.whl", hash = "sha256:56c4549161029c707aa527882e4741fca1ef4c46f63a6417e56e968710cfba7c"}, +] +"aws-cdk.custom-resources" = [ + {file = "aws-cdk.custom-resources-1.124.0.tar.gz", hash = "sha256:d2be1a1636b65e275521970b9c9accd02718f678ebb074a580b15b695e4b60d5"}, + {file = "aws_cdk.custom_resources-1.124.0-py3-none-any.whl", hash = "sha256:6c9abcc046a92dc6845c8a81e33ac727da95e0c0d95b3fba0d433de7dae10a61"}, ] "aws-cdk.cx-api" = [ - {file = "aws-cdk.cx-api-1.115.0.tar.gz", hash = "sha256:10251ef8deaf7acfb7f7356e07c53cd86bbd8725631795e1ce8f8891bcaffad0"}, - {file = "aws_cdk.cx_api-1.115.0-py3-none-any.whl", hash = "sha256:6c03bc14f8d645e63329cb152b2f1fe339a556c297f1c3ecfa75ca9a981f9dca"}, + {file = "aws-cdk.cx-api-1.124.0.tar.gz", hash = "sha256:b8ad4e1a2a5545dd256b50d36efb6d59b9b89b4b1034e7b7f9edfdaa476b181b"}, + {file = "aws_cdk.cx_api-1.124.0-py3-none-any.whl", hash = "sha256:64b6f3ba0313cdea9963f9d210932cf770366a9d860520e1f15e64a26e97c5d6"}, ] "aws-cdk.region-info" = [ - {file = "aws-cdk.region-info-1.115.0.tar.gz", hash = "sha256:4f6b282fa495c244c1f96deea4aed77e702312373204e34b3bba53da27851974"}, - {file = "aws_cdk.region_info-1.115.0-py3-none-any.whl", hash = "sha256:b346bdab4bf54a5956fab020bc085b6c2c304f485dd2d09c8fb586728dfe7c11"}, + {file = "aws-cdk.region-info-1.124.0.tar.gz", hash = "sha256:c28d31226f9000db1375044ea22ba496cc75e8c3db6aa1493a687ff0f89ccdae"}, + {file = "aws_cdk.region_info-1.124.0-py3-none-any.whl", hash = "sha256:594b5f275766b22864e6111f194cfe7a12713ffc61963d063ce06812fa484728"}, ] cattrs = [ {file = "cattrs-1.0.0-py2.py3-none-any.whl", hash = "sha256:616972ae3dfa6e623a40ad3cb845420e64942989152774ab055e5c2b2f89f997"}, {file = "cattrs-1.0.0.tar.gz", hash = "sha256:b7ab5cf8ad127c42eefd01410c1c6e28569a45a255ea80ed968511873c433c7a"}, - {file = "cattrs-1.6.0-py3-none-any.whl", hash = "sha256:c8de53900e3acad94ca83750eb12bb38aa85ce9114be47177c943e2f0eca63b0"}, - {file = "cattrs-1.6.0.tar.gz", hash = "sha256:3e2cd5dc8a1006d5da53ddcbf4f0b1dd3a21e294323b257678d0a96721f8253a"}, + {file = "cattrs-1.8.0-py3-none-any.whl", hash = 
"sha256:901fb2040529ae8fc9d93f48a2cdf7de3e983312ffb2a164ffa4e9847f253af1"}, + {file = "cattrs-1.8.0.tar.gz", hash = "sha256:5c121ab06a7cac494813c228721a7feb5a6423b17316eeaebf13f5a03e5b0d53"}, ] constructs = [ {file = "constructs-3.3.101-py3-none-any.whl", hash = "sha256:0605ea091dda433f0915ba5b3c74bf967d90fb0cf975a5c3b34a7150a3cf48d1"}, @@ -747,8 +917,8 @@ importlib-resources = [ {file = "importlib_resources-5.2.0.tar.gz", hash = "sha256:22a2c42d8c6a1d30aa8a0e1f57293725bfd5c013d562585e46aff469e0ff78b3"}, ] jsii = [ - {file = "jsii-1.32.0-py3-none-any.whl", hash = "sha256:c71321c4b74ed2c29edc9943c22a36c60a8626df6e0a7173b9ae41366b1a9cb9"}, - {file = "jsii-1.32.0.tar.gz", hash = "sha256:b95e7747812e16cafbfde80b714d9b684c7a4ee57a00cbaf8f138d5868bdb2ae"}, + {file = "jsii-1.34.0-py3-none-any.whl", hash = "sha256:d0a703d0d44bf78bb90529699599d2a58a68ca764f996808e97eafc68e2467de"}, + {file = "jsii-1.34.0.tar.gz", hash = "sha256:e72ba5fafabdd5b6a3a65bd2cf42302eb87f2fe7c6339bddb808226a91623654"}, ] publication = [ {file = "publication-0.0.3-py2.py3-none-any.whl", hash = "sha256:0248885351febc11d8a1098d5c8e3ab2dabcf3e8c0c96db1e17ecd12b53afbe6"}, diff --git a/test_infra/pyproject.toml b/test_infra/pyproject.toml index e6dda67cb..761c315d7 100644 --- a/test_infra/pyproject.toml +++ b/test_infra/pyproject.toml @@ -18,3 +18,4 @@ python = ">=3.6.2, <3.10" "aws-cdk.aws-rds" = "^1.115.0" "aws-cdk.aws-secretsmanager" = "^1.115.0" "aws-cdk.aws-ssm" = "^1.115.0" +"aws-cdk.aws-opensearchservice" = "^1.124.0" diff --git a/test_infra/scripts/delete-opensearch.sh b/test_infra/scripts/delete-opensearch.sh new file mode 100755 index 000000000..1c1c01ba2 --- /dev/null +++ b/test_infra/scripts/delete-opensearch.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash +set -e + +pushd .. +cdk destroy aws-data-wrangler-opensearch +popd diff --git a/test_infra/scripts/deploy-opensearch.sh b/test_infra/scripts/deploy-opensearch.sh new file mode 100755 index 000000000..e94818af4 --- /dev/null +++ b/test_infra/scripts/deploy-opensearch.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -e + +pushd .. +cdk bootstrap +cdk deploy aws-data-wrangler-opensearch +popd diff --git a/test_infra/stacks/opensearch_stack.py b/test_infra/stacks/opensearch_stack.py new file mode 100644 index 000000000..e32ecbbaa --- /dev/null +++ b/test_infra/stacks/opensearch_stack.py @@ -0,0 +1,113 @@ +from aws_cdk import aws_ec2 as ec2 +from aws_cdk import aws_iam as iam +from aws_cdk import aws_kms as kms +from aws_cdk import aws_s3 as s3 +from aws_cdk import aws_secretsmanager as secrets +from aws_cdk import core as cdk +from aws_cdk import aws_opensearchservice as opensearch + + +def validate_domain_name(name: str): + if not 3 <= len(name) <= 28: + raise ValueError(f'invalid domain name ({name}) - bad length ({len(name)})') + for c in name: + if not ('a' <= c <= 'z' + or c.isdigit() + or c in ['-']): + raise ValueError(f'invalid domain name ({name}) - bad character ("{c}")') + + +class OpenSearchStack(cdk.Stack): # type: ignore + def __init__( + self, + scope: cdk.Construct, + construct_id: str, + vpc: ec2.IVpc, + bucket: s3.IBucket, + key: kms.Key, + **kwargs: str, + ) -> None: + """ + AWS Data Wrangler Development OpenSearch Infrastructure. + Includes OpenSearch, Elasticsearch, ... 
+ """ + super().__init__(scope, construct_id, **kwargs) + + self.vpc = vpc + self.key = key + self.bucket = bucket + + self._set_opensearch_infra() + self._setup_opensearch_1_0() + self._setup_elasticsearch_7_10_fgac() + + def _set_opensearch_infra(self) -> None: + self.username = "test" + # fmt: off + self.password_secret = secrets.Secret( + self, + "opensearch-password-secret", + secret_name="aws-data-wrangler/opensearch_password", + generate_secret_string=secrets.SecretStringGenerator(exclude_characters="/@\"\' \\"), + ).secret_value + # fmt: on + self.password = self.password_secret.to_string() + + def _setup_opensearch_1_0(self) -> None: + domain_name = 'wrangler-os-1-0' + validate_domain_name(domain_name) + domain_arn = f'arn:aws:es:{self.region}:{self.account}:domain/{domain_name}' + domain = opensearch.Domain(self, domain_name, + domain_name=domain_name, + version=opensearch.EngineVersion.OPENSEARCH_1_0, + capacity=opensearch.CapacityConfig( + data_node_instance_type='t3.small.search', + data_nodes=1 + ), + access_policies=[ + iam.PolicyStatement( + effect=iam.Effect.ALLOW, + actions=["es:*"], + principals=[iam.AccountRootPrincipal()], + resources=[f"{domain_arn}/*"] + ) + ], + removal_policy=cdk.RemovalPolicy.DESTROY + ) + + cdk.CfnOutput(self, f"DomainEndpoint-{domain_name}", value=domain.domain_endpoint) + + def _setup_elasticsearch_7_10_fgac(self) -> None: + domain_name = 'wrangler-es-7-10-fgac' + validate_domain_name(domain_name) + domain_arn = f'arn:aws:es:{self.region}:{self.account}:domain/{domain_name}' + domain = opensearch.Domain(self, domain_name, + domain_name=domain_name, + version=opensearch.EngineVersion.ELASTICSEARCH_7_10, + capacity=opensearch.CapacityConfig( + data_node_instance_type='t3.small.search', + data_nodes=1 + ), + access_policies=[ + iam.PolicyStatement( + effect=iam.Effect.ALLOW, + actions=["es:*"], + principals=[iam.AnyPrincipal()], # FGAC + resources=[f"{domain_arn}/*"] + ) + ], + fine_grained_access_control=opensearch.AdvancedSecurityOptions( + master_user_name=self.username, + # master_user_password=self.password_secret.plain_text("aws-data-wrangler/opensearch_password") + master_user_password=self.password_secret + ), + # Node-to-node encryption is required when fine-grained access control is enabled + node_to_node_encryption=True, + # Encryption-at-rest is required when fine-grained access control is enabled + encryption_at_rest=opensearch.EncryptionAtRestOptions(enabled=True, kms_key=self.key), + # Enforce HTTPS is required when fine-grained access control is enabled + enforce_https=True, + removal_policy=cdk.RemovalPolicy.DESTROY + ) + + cdk.CfnOutput(self, f"DomainEndpoint-{domain_name}", value=domain.domain_endpoint) diff --git a/tests/_utils.py b/tests/_utils.py index 85df69484..9e4b595e3 100644 --- a/tests/_utils.py +++ b/tests/_utils.py @@ -528,7 +528,7 @@ def extract_cloudformation_outputs(): client = boto3.client("cloudformation") response = try_it(client.describe_stacks, botocore.exceptions.ClientError, max_num_tries=5) for stack in response.get("Stacks"): - if (stack["StackName"] in ["aws-data-wrangler-base", "aws-data-wrangler-databases"]) and ( + if (stack["StackName"] in ["aws-data-wrangler-base", "aws-data-wrangler-databases", "aws-data-wrangler-opensearch"]) and ( stack["StackStatus"] in CFN_VALID_STATUS ): for output in stack.get("Outputs"): diff --git a/tests/test_opensearch.py b/tests/test_opensearch.py index 28fdf5c5c..84e669757 100644 --- a/tests/test_opensearch.py +++ b/tests/test_opensearch.py @@ -1,18 +1,17 @@ import 
logging import boto3 +import pytest # type: ignore import pandas as pd import json +import tempfile import awswrangler as wr +from ._utils import extract_cloudformation_outputs logging.getLogger("awswrangler").setLevel(logging.DEBUG) -# TODO: create test_infra for opensearch -OPENSEARCH_DOMAIN = 'search-es71-public-z63iyqxccc4ungar5vx45xwgfi.us-east-1.es.amazonaws.com' # change to your domain -OPENSEARCH_DOMAIN_FGAC = 'search-os1-public-urixc6vui2il7oawwiox2e57n4.us-east-1.es.amazonaws.com' -BUCKET = 'mentzera' inspections_documents = [ {"business_address":"315 California St","business_city":"San Francisco","business_id":"24936","business_latitude":"37.793199","business_location":{"lon": -122.400152,"lat": 37.793199},"business_longitude":"-122.400152","business_name":"San Francisco Soup Company","business_postal_code":"94104","business_state":"CA","inspection_date":"2016-06-09T00:00:00.000","inspection_id":"24936_20160609","inspection_score":77,"inspection_type":"Routine - Unscheduled","risk_category":"Low Risk","violation_description":"Improper food labeling or menu misrepresentation","violation_id":"24936_20160609_103141"}, @@ -23,23 +22,70 @@ {"business_address":"2162 24th Ave","business_city":"San Francisco","business_id":"5794","business_latitude":"37.747228","business_location":{"lon": -122.481299,"lat": 37.747228},"business_longitude":"-122.481299","business_name":"Soup-or-Salad","business_phone_number":"+14155752700","business_postal_code":"94116","business_state":"CA","inspection_date":"2016-09-07T00:00:00.000","inspection_id":"5794_20160907","inspection_score":96,"inspection_type":"Routine - Unscheduled","risk_category":"Low Risk","violation_description":"Unapproved or unmaintained equipment or utensils","violation_id":"5794_20160907_103144"} ] -def test_connection(): - client = wr.opensearch.connect(host=OPENSEARCH_DOMAIN) + +@pytest.fixture(scope="session") +def cloudformation_outputs(): + return extract_cloudformation_outputs() + + +@pytest.fixture(scope="session") +def opensearch_password(): + return boto3.client("secretsmanager").get_secret_value(SecretId="aws-data-wrangler/opensearch_password")["SecretString"] + + +@pytest.fixture(scope="session") +def domain_endpoint_opensearch_1_0(cloudformation_outputs): + return cloudformation_outputs["DomainEndpointwrangleros10"] + + +@pytest.fixture(scope="session") +def domain_endpoint_elasticsearch_7_10_fgac(cloudformation_outputs): + return cloudformation_outputs["DomainEndpointwrangleres710fgac"] + + +def test_connection_opensearch_1_0(domain_endpoint_opensearch_1_0): + client = wr.opensearch.connect(host=domain_endpoint_opensearch_1_0) print(client.info()) + assert len(client.info()) > 0 -# def test_fgac_connection(): -# client = wr.opensearch.connect(host=OPENSEARCH_DOMAIN_FGAC, -# fgac_user='admin', -# fgac_password='SECRET') -# print(client.info()) +def test_connection_elasticsearch_7_10_fgac(domain_endpoint_elasticsearch_7_10_fgac, opensearch_password): + client = wr.opensearch.connect( + host=domain_endpoint_elasticsearch_7_10_fgac, + fgac_user='test', + fgac_password=opensearch_password + ) + print(client.info()) + assert len(client.info()) > 0 + + +@pytest.fixture(scope="session") +def opensearch_1_0_client(domain_endpoint_opensearch_1_0): + client = wr.opensearch.connect(host=domain_endpoint_opensearch_1_0) + return client + + +@pytest.fixture(scope="session") +def elasticsearch_7_10_fgac_client(domain_endpoint_elasticsearch_7_10_fgac, opensearch_password): + client = wr.opensearch.connect( + 
host=domain_endpoint_elasticsearch_7_10_fgac, + fgac_user='test', + fgac_password=opensearch_password + ) + return client + +# testing multiple versions +@pytest.fixture(params=['opensearch_1_0_client', 'elasticsearch_7_10_fgac_client']) +def client(request): + return request.getfixturevalue(request.param) -def test_create_index(): - client = wr.opensearch.connect(host=OPENSEARCH_DOMAIN) +def test_create_index(client): + index = 'test_create_index' + wr.opensearch.delete_index(client, index) response = wr.opensearch.create_index( - client, - index='test-index1', + client=client, + index=index, mappings={ 'properties': { 'name': {'type': 'text'}, @@ -53,12 +99,11 @@ def test_create_index(): } } ) - print(response) + assert response.get('acknowledged', False) is True -def test_delete_index(): +def test_delete_index(client): index = 'test_delete_index' - client = wr.opensearch.connect(host=OPENSEARCH_DOMAIN) wr.opensearch.create_index( client, index=index @@ -68,10 +113,10 @@ def test_delete_index(): index=index ) print(response) + assert response.get('acknowledged', False) is True -def test_index_df(): - client = wr.opensearch.connect(host=OPENSEARCH_DOMAIN) +def test_index_df(client): response = wr.opensearch.index_df(client, df=pd.DataFrame([{'_id': '1', 'name': 'John'}, {'_id': '2', 'name': 'George'}, @@ -82,8 +127,7 @@ def test_index_df(): print(response) -def test_index_documents(): - client = wr.opensearch.connect(host=OPENSEARCH_DOMAIN) +def test_index_documents(client): response = wr.opensearch.index_documents(client, documents=[{'_id': '1', 'name': 'John'}, {'_id': '2', 'name': 'George'}, @@ -94,8 +138,7 @@ def test_index_documents(): print(response) -def test_index_documents_id_keys(): - client = wr.opensearch.connect(host=OPENSEARCH_DOMAIN) +def test_index_documents_id_keys(client): response = wr.opensearch.index_documents(client, documents=inspections_documents, index='test_index_documents_id_keys', @@ -104,8 +147,7 @@ def test_index_documents_id_keys(): print(response) -def test_index_documents_no_id_keys(): - client = wr.opensearch.connect(host=OPENSEARCH_DOMAIN) +def test_index_documents_no_id_keys(client): response = wr.opensearch.index_documents(client, documents=inspections_documents, index='test_index_documents_no_id_keys' @@ -113,9 +155,8 @@ def test_index_documents_no_id_keys(): print(response) -def test_search(): +def test_search(client): index = 'test_search' - client = wr.opensearch.connect(host=OPENSEARCH_DOMAIN) response = wr.opensearch.index_documents(client, documents=inspections_documents, index=index, @@ -139,9 +180,8 @@ def test_search(): print(df.to_string()) -def test_search_filter_path(): +def test_search_filter_path(client): index = 'test_search' - client = wr.opensearch.connect(host=OPENSEARCH_DOMAIN) response = wr.opensearch.index_documents(client, documents=inspections_documents, index=index, @@ -166,9 +206,8 @@ def test_search_filter_path(): print(df.to_string()) -def test_search_scroll(): +def test_search_scroll(client): index = 'test_search_scroll' - client = wr.opensearch.connect(host=OPENSEARCH_DOMAIN) response = wr.opensearch.index_documents(client, documents=inspections_documents, index=index, @@ -186,9 +225,8 @@ def test_search_scroll(): print(df.to_string()) -def test_search_sql(): +def test_search_sql(client): index = 'test_search_sql' - client = wr.opensearch.connect(host=OPENSEARCH_DOMAIN) response = wr.opensearch.index_documents(client, documents=inspections_documents, index=index, @@ -204,9 +242,8 @@ def test_search_sql(): 
print(df.to_string()) -def test_index_json_local(): - file_path = '/tmp/inspections.json' - client = wr.opensearch.connect(host=OPENSEARCH_DOMAIN) +def test_index_json_local(client): + file_path = f'{tempfile.gettempdir()}/inspections.json' with open(file_path, 'w') as filehandle: for doc in inspections_documents: filehandle.write('%s\n' % json.dumps(doc)) @@ -218,33 +255,48 @@ print(response) -def test_index_json_s3(): - file_path = '/tmp/inspections.json' - s3_key = 'tmp/inspections.json' - client = wr.opensearch.connect(host=OPENSEARCH_DOMAIN) +def test_index_json_s3(client, path): + file_path = f'{tempfile.gettempdir()}/inspections.json' with open(file_path, 'w') as filehandle: for doc in inspections_documents: filehandle.write('%s\n' % json.dumps(doc)) s3 = boto3.client('s3') - s3.upload_file(file_path, BUCKET, s3_key) + path = f"{path}opensearch/inspections.json" + bucket, key = wr._utils.parse_path(path) + s3.upload_file(file_path, bucket, key) response = wr.opensearch.index_json( client, index='test_index_json_s3', - path=f's3://{BUCKET}/{s3_key}' + path=path ) print(response) -def test_index_csv_local(): - file_path = '/tmp/inspections.csv' +def test_index_csv_local(client): + file_path = f'{tempfile.gettempdir()}/inspections.csv' index = 'test_index_csv_local' df=pd.DataFrame(inspections_documents) df.to_csv(file_path, index=False) - client = wr.opensearch.connect(OPENSEARCH_DOMAIN) - wr.opensearch.delete_index(client, index) response = wr.opensearch.index_csv( client, path=file_path, index=index ) - print(response) \ No newline at end of file + print(response) + + +def test_index_csv_s3(client, path): + file_path = f'{tempfile.gettempdir()}/inspections.csv' + index = 'test_index_csv_s3' + df=pd.DataFrame(inspections_documents) + df.to_csv(file_path, index=False) + s3 = boto3.client('s3') + path = f"{path}opensearch/inspections.csv" + bucket, key = wr._utils.parse_path(path) + s3.upload_file(file_path, bucket, key) + response = wr.opensearch.index_csv( + client, + path=path, + index=index + ) + print(response) From d57434190b54257f6922bbebc38df21349931580 Mon Sep 17 00:00:00 2001 From: Assaf Mentzer Date: Mon, 27 Sep 2021 23:57:20 -0400 Subject: [PATCH 14/41] [skip ci] index create/delete ignore exceptions --- awswrangler/opensearch/_write.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/awswrangler/opensearch/_write.py b/awswrangler/opensearch/_write.py index ad845c2b1..e2c76e115 100644 --- a/awswrangler/opensearch/_write.py +++ b/awswrangler/opensearch/_write.py @@ -144,7 +144,9 @@ def create_index( body['settings'] = settings if body == {}: body = None - response = client.indices.create(index, body, ignore=[400, 404]) + + # ignore 400 caused by IndexAlreadyExistsException when creating an index + response = client.indices.create(index, body, ignore=400) if 'error' in response: _logger.warning(response) if str(response['error']).startswith(u'MapperParsingException'): raise ValueError(response['error']) @@ -182,6 +184,8 @@
) """ + + # ignore 400/404 IndexNotFoundError exception response = client.indices.delete(index, ignore=[400, 404]) if 'error' in response: _logger.warning(response) @@ -381,7 +385,7 @@ def index_df( def index_documents( client: Elasticsearch, - documents: Union[Iterable[Dict[str, Any]], Iterable[Mapping[str, Any]]], + documents: Iterable[Mapping[str, Any]], index: str, doc_type: Optional[str] = None, keys_to_write: Optional[List[str]] = None, From 7bb6779c84a52b3db473fc4491cd4389ddb417d2 Mon Sep 17 00:00:00 2001 From: Assaf Mentzer Date: Mon, 27 Sep 2021 23:59:31 -0400 Subject: [PATCH 15/41] [skip ci] index_documents documents type --- awswrangler/opensearch/_write.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/awswrangler/opensearch/_write.py b/awswrangler/opensearch/_write.py index e2c76e115..6126d0dfb 100644 --- a/awswrangler/opensearch/_write.py +++ b/awswrangler/opensearch/_write.py @@ -411,7 +411,7 @@ def index_documents( ---------- client : Elasticsearch instance of elasticsearch.Elasticsearch to use. - documents : Union[Iterable[Dict[str, Any]], Iterable[Mapping[str, Any]]] + documents : Iterable[Mapping[str, Any]] List which contains the documents that will be inserted. index : str Name of the index. From 75a2701617373ada50b64b024a29e65ef34fb66f Mon Sep 17 00:00:00 2001 From: Assaf Mentzer Date: Tue, 28 Sep 2021 00:42:39 -0400 Subject: [PATCH 16/41] [skip ci] removed pandasticsearch dependency --- awswrangler/opensearch/_read.py | 50 ++++++++++++++++++++++++--------- tests/test_opensearch.py | 8 ++++++ 2 files changed, 45 insertions(+), 13 deletions(-) diff --git a/awswrangler/opensearch/_read.py b/awswrangler/opensearch/_read.py index 80ec509d5..ef64bd275 100644 --- a/awswrangler/opensearch/_read.py +++ b/awswrangler/opensearch/_read.py @@ -1,6 +1,5 @@ """Amazon OpenSearch Read Module (PRIVATE).""" -from pandasticsearch import Select, DataFrame from typing import Any, Dict, Optional from elasticsearch import Elasticsearch from elasticsearch.helpers import scan @@ -8,6 +7,37 @@ import pandas as pd +def _resolve_fields(row): + fields = {} + for field in row: + if isinstance(row[field], dict): + nested_fields = _resolve_fields(row[field]) + for n_field, val in nested_fields.items(): + fields["{}.{}".format(field, n_field)] = val + else: + fields[field] = row[field] + return fields + + +def _hit_to_row(hit): + row = {} + for k in hit.keys(): + if k == '_source': + solved_fields = _resolve_fields(hit['_source']) + row.update(solved_fields) + elif k.startswith('_'): + row[k] = hit[k] + return row + + +def _search_response_to_documents(response: dict): + return [_hit_to_row(hit) for hit in response['hits']['hits']] + + +def _search_response_to_df(response: dict): + return pd.DataFrame(_search_response_to_documents(response)) + + def search( client: Elasticsearch, index: Optional[str] = '_all', @@ -15,7 +45,7 @@ def search( doc_type: Optional[str] = None, is_scroll: Optional[bool] = False, **kwargs -) -> DataFrame: +) -> pd.DataFrame: """Returns results matching query DSL as pandas dataframe. 
Parameters @@ -65,10 +95,6 @@ def search( if doc_type: kwargs['doc_type'] = doc_type - # pandasticsearch.Select.from_dict requires `took` field - if 'filter_path' in kwargs: - if 'took' not in kwargs['filter_path']: - kwargs['filter_path'].append('took') if is_scroll: documents_generator = scan( client, @@ -76,12 +102,11 @@ def search( query=search_body, **kwargs ) - s = Select() - documents = map(lambda x: s.hit_to_row(x), documents_generator) + documents = map(lambda x: _hit_to_row(x), documents_generator) df = pd.DataFrame(documents) else: - documents = client.search(index=index, body=search_body, **kwargs) - df = Select.from_dict(documents).to_pandas() + response = client.search(index=index, body=search_body, **kwargs) + df = _search_response_to_df(response) return df @@ -89,7 +114,7 @@ def search_by_sql( client: Elasticsearch, sql_query: str, **kwargs -) -> DataFrame: +) -> pd.DataFrame: """Returns results matching [SQL query](https://opensearch.org/docs/search-plugins/sql/index/) as pandas dataframe Parameters @@ -144,6 +169,5 @@ def _sql_response_to_docs(response: Dict[str, Any]): body=body, params=kwargs ) - - df = Select.from_dict(response).to_pandas() + df = _search_response_to_df(response) return df diff --git a/tests/test_opensearch.py b/tests/test_opensearch.py index 84e669757..c409e368b 100644 --- a/tests/test_opensearch.py +++ b/tests/test_opensearch.py @@ -178,6 +178,7 @@ def test_search(client): print('') print(df.to_string()) + assert df.shape[0] == 3 def test_search_filter_path(client): @@ -204,6 +205,7 @@ def test_search_filter_path(client): print('') print(df.to_string()) + assert df.shape[0] == 3 def test_search_scroll(client): @@ -223,6 +225,7 @@ def test_search_scroll(client): print('') print(df.to_string()) + assert df.shape[0] == 5 def test_search_sql(client): @@ -240,6 +243,7 @@ def test_search_sql(client): print('') print(df.to_string()) + assert df.shape[0] == 5 def test_index_json_local(client): @@ -253,6 +257,7 @@ def test_index_json_local(client): path=file_path ) print(response) + assert response.get('success', 0) == 6 def test_index_json_s3(client, path): @@ -270,6 +275,7 @@ def test_index_json_s3(client, path): path=path ) print(response) + assert response.get('success', 0) == 6 def test_index_csv_local(client): @@ -283,6 +289,7 @@ def test_index_csv_local(client): index=index ) print(response) + assert response.get('success', 0) == 6 def test_index_csv_s3(client, path): @@ -300,3 +307,4 @@ def test_index_csv_s3(client, path): index=index ) print(response) + assert response.get('success', 0) == 6 From cea9abbc3c6b0b07ad4652dc92277110d484eee2 Mon Sep 17 00:00:00 2001 From: Assaf Mentzer Date: Tue, 28 Sep 2021 00:44:52 -0400 Subject: [PATCH 17/41] [skip ci] port typo --- awswrangler/opensearch/_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/awswrangler/opensearch/_utils.py b/awswrangler/opensearch/_utils.py index 1bbfeedf7..143fdb366 100644 --- a/awswrangler/opensearch/_utils.py +++ b/awswrangler/opensearch/_utils.py @@ -5,7 +5,6 @@ import boto3 import logging -from awswrangler import _utils, exceptions from elasticsearch import Elasticsearch, RequestsHttpConnection from requests_aws4auth import AWS4Auth @@ -77,7 +76,7 @@ def connect( valid_ports = {80, 443} if port not in valid_ports: - raise ValueError("results: status must be one of %r." % valid_ports) + raise ValueError("results: port must be one of %r." 
% valid_ports) if fgac_user and fgac_password: http_auth = (fgac_user, fgac_password) From f6c7dd4c6533540bb53edf000af1d97e0d518061 Mon Sep 17 00:00:00 2001 From: Assaf Mentzer Date: Tue, 28 Sep 2021 00:57:50 -0400 Subject: [PATCH 18/41] [skip ci] enforced_pandas_params --- awswrangler/opensearch/_write.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/awswrangler/opensearch/_write.py b/awswrangler/opensearch/_write.py index 6126d0dfb..0df6cee64 100644 --- a/awswrangler/opensearch/_write.py +++ b/awswrangler/opensearch/_write.py @@ -63,8 +63,6 @@ def _deserialize(v): df_iter = df.iterrows() for i, document in df_iter: - # print(document) - # yield document yield {k: _deserialize(v) for k, v in document.items() if notna(v)} @@ -281,8 +279,9 @@ Name of the document type (only for Elasticsearch versions 5.x and older). pandas_kwargs : Dictionary of arguments forwarded to pandas.read_csv(). - e.g. pandas_kwargs={'sep': '|', 'na_values': ['null', 'none'], 'skip_blank_lines': True} + e.g. pandas_kwargs={'sep': '|', 'na_values': ['null', 'none']} https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html + Note: these parameter values are enforced: `skip_blank_lines=True` **kwargs : KEYWORD arguments forwarded to :func:`~awswrangler.opensearch.index_documents` which is used to execute the operation @@ -313,14 +312,15 @@ ... client=client, ... path='docs.csv', ... index='sample-index1', - ... pandas_kwargs={'sep': '|', 'na_values': ['null', 'none'], 'skip_blank_lines': True} + ... pandas_kwargs={'sep': '|', 'na_values': ['null', 'none']} ... ) """ - custom_pandas_params = { + enforced_pandas_params = { 'skip_blank_lines': True, - 'na_filter': True # will generate Nan value for empty cells. We remove Nan keys in _df_doc_generator + # 'na_filter': True # will generate Nan value for empty cells. 
We remove Nan keys in _df_doc_generator + # Note: if the user will pass na_filter=False null fields will be indexed as well ({"k1": null, "k2": null}) } - pandas_kwargs.update(custom_pandas_params) + pandas_kwargs.update(enforced_pandas_params) df = pd.read_csv(path, **pandas_kwargs) return index_df( client, From 030e21c0aa00e7c82d4578ea2bf21b680850310e Mon Sep 17 00:00:00 2001 From: Assaf Mentzer Date: Tue, 28 Sep 2021 02:06:24 -0400 Subject: [PATCH 19/41] [skip ci] isort & black --- awswrangler/opensearch/__init__.py | 25 +- awswrangler/opensearch/_read.py | 55 ++-- awswrangler/opensearch/_utils.py | 25 +- awswrangler/opensearch/_write.py | 131 ++++------ test_infra/app.py | 2 +- test_infra/stacks/opensearch_stack.py | 58 ++--- tests/_utils.py | 7 +- tests/test_opensearch.py | 344 ++++++++++++++------------ 8 files changed, 315 insertions(+), 332 deletions(-) diff --git a/awswrangler/opensearch/__init__.py b/awswrangler/opensearch/__init__.py index 222519747..205e70b59 100644 --- a/awswrangler/opensearch/__init__.py +++ b/awswrangler/opensearch/__init__.py @@ -1,16 +1,17 @@ """Utilities Module for Amazon OpenSearch.""" -from awswrangler.opensearch._utils import connect -from awswrangler.opensearch._write import create_index, delete_index, index_csv, index_documents, index_df, index_json from awswrangler.opensearch._read import search, search_by_sql +from awswrangler.opensearch._utils import connect +from awswrangler.opensearch._write import create_index, delete_index, index_csv, index_df, index_documents, index_json -__all__ = ["connect", - "create_index", - "delete_index", - "index_csv", - "index_documents", - "index_df", - "index_json", - "search", - "search_by_sql" - ] +__all__ = [ + "connect", + "create_index", + "delete_index", + "index_csv", + "index_documents", + "index_df", + "index_json", + "search", + "search_by_sql", +] diff --git a/awswrangler/opensearch/_read.py b/awswrangler/opensearch/_read.py index ef64bd275..33a2a9b2c 100644 --- a/awswrangler/opensearch/_read.py +++ b/awswrangler/opensearch/_read.py @@ -1,10 +1,12 @@ """Amazon OpenSearch Read Module (PRIVATE).""" from typing import Any, Dict, Optional + +import pandas as pd from elasticsearch import Elasticsearch from elasticsearch.helpers import scan + from awswrangler.opensearch._utils import _get_distribution -import pandas as pd def _resolve_fields(row): @@ -22,16 +24,16 @@ def _resolve_fields(row): def _hit_to_row(hit): row = {} for k in hit.keys(): - if k == '_source': - solved_fields = _resolve_fields(hit['_source']) + if k == "_source": + solved_fields = _resolve_fields(hit["_source"]) row.update(solved_fields) - elif k.startswith('_'): + elif k.startswith("_"): row[k] = hit[k] return row def _search_response_to_documents(response: dict): - return [_hit_to_row(hit) for hit in response['hits']['hits']] + return [_hit_to_row(hit) for hit in response["hits"]["hits"]] def _search_response_to_df(response: dict): @@ -40,11 +42,11 @@ def _search_response_to_df(response: dict): def search( client: Elasticsearch, - index: Optional[str] = '_all', + index: Optional[str] = "_all", search_body: Optional[Dict[str, Any]] = None, doc_type: Optional[str] = None, is_scroll: Optional[bool] = False, - **kwargs + **kwargs, ) -> pd.DataFrame: """Returns results matching query DSL as pandas dataframe. 
@@ -93,15 +95,10 @@ def search( """ if doc_type: - kwargs['doc_type'] = doc_type + kwargs["doc_type"] = doc_type if is_scroll: - documents_generator = scan( - client, - index=index, - query=search_body, - **kwargs - ) + documents_generator = scan(client, index=index, query=search_body, **kwargs) documents = map(lambda x: _hit_to_row(x), documents_generator) df = pd.DataFrame(documents) else: @@ -110,11 +107,7 @@ def search( return df -def search_by_sql( - client: Elasticsearch, - sql_query: str, - **kwargs -) -> pd.DataFrame: +def search_by_sql(client: Elasticsearch, sql_query: str, **kwargs) -> pd.DataFrame: """Returns results matching [SQL query](https://opensearch.org/docs/search-plugins/sql/index/) as pandas dataframe Parameters @@ -147,27 +140,23 @@ def search_by_sql( # can be used if not passing format def _sql_response_to_docs(response: Dict[str, Any]): - header = list(map(lambda x: x['name'], response.get('schema', []))) - for datarow in response.get('datarows', []): + header = list(map(lambda x: x["name"], response.get("schema", []))) + for datarow in response.get("datarows", []): yield dict(zip(header, datarow)) - if _get_distribution(client) == 'opensearch': - url = '/_plugins/_sql' + if _get_distribution(client) == "opensearch": + url = "/_plugins/_sql" else: - url = '/_opendistro/_sql' + url = "/_opendistro/_sql" - kwargs['format'] = 'json' - body = {'query': sql_query} - for size_att in ['size', 'fetch_size']: + kwargs["format"] = "json" + body = {"query": sql_query} + for size_att in ["size", "fetch_size"]: if size_att in kwargs: - body['fetch_size'] = kwargs[size_att] + body["fetch_size"] = kwargs[size_att] del kwargs[size_att] # unrecognized parameter response = client.transport.perform_request( - "POST", - url, - headers={'Content-Type': 'application/json'}, - body=body, - params=kwargs + "POST", url, headers={"Content-Type": "application/json"}, body=body, params=kwargs ) df = _search_response_to_df(response) return df diff --git a/awswrangler/opensearch/_utils.py b/awswrangler/opensearch/_utils.py index 143fdb366..f15f6105c 100644 --- a/awswrangler/opensearch/_utils.py +++ b/awswrangler/opensearch/_utils.py @@ -1,29 +1,27 @@ """Amazon OpenSearch Utils Module (PRIVATE).""" +import logging from typing import Optional import boto3 -import logging - from elasticsearch import Elasticsearch, RequestsHttpConnection from requests_aws4auth import AWS4Auth - _logger: logging.Logger = logging.getLogger(__name__) -def _get_distribution(client: Elasticsearch): - return client.info().get('version', {}).get('distribution', 'elasticsearch') +def _get_distribution(client: Elasticsearch) -> str: + return client.info().get("version", {}).get("distribution", "elasticsearch") def _get_version(client: Elasticsearch): - return client.info().get('version', {}).get('number') + return client.info().get("version", {}).get("number") def _get_version_major(client: Elasticsearch): version = _get_version(client) if version: - return int(version.split('.')[0]) + return int(version.split(".")[0]) return None @@ -33,8 +31,7 @@ def connect( boto3_session: Optional[boto3.Session] = boto3.Session(), region: Optional[str] = None, fgac_user: Optional[str] = None, - fgac_password: Optional[str] = None - + fgac_password: Optional[str] = None, ) -> Elasticsearch: """Creates a secure connection to the specified Amazon OpenSearch domain. 
@@ -84,13 +81,7 @@ def connect( if region is None: region = boto3_session.region_name creds = boto3_session.get_credentials() - http_auth = AWS4Auth( - creds.access_key, - creds.secret_key, - region, - 'es', - creds.token - ) + http_auth = AWS4Auth(creds.access_key, creds.secret_key, region, "es", creds.token) try: es = Elasticsearch( host=host, @@ -98,7 +89,7 @@ def connect( http_auth=http_auth, use_ssl=True, verify_certs=True, - connection_class=RequestsHttpConnection + connection_class=RequestsHttpConnection, ) except Exception as e: _logger.error("Error connecting to Opensearch cluster. Please verify authentication details") diff --git a/awswrangler/opensearch/_write.py b/awswrangler/opensearch/_write.py index 0df6cee64..ace119b02 100644 --- a/awswrangler/opensearch/_write.py +++ b/awswrangler/opensearch/_write.py @@ -1,19 +1,20 @@ """Amazon OpenSearch Write Module (PRIVATE).""" +import ast +import json import logging import uuid -import boto3 -import json -import ast from pathlib import Path -from typing import Any, Dict, List, Mapping, Optional, Union, Tuple, Iterable -from awswrangler.opensearch._utils import _get_distribution, _get_version_major -from awswrangler._utils import parse_path -import pandas as pd -from pandas import notna +from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union +import boto3 +import pandas as pd from elasticsearch import Elasticsearch from elasticsearch.helpers import bulk +from pandas import notna + +from awswrangler._utils import parse_path +from awswrangler.opensearch._utils import _get_distribution, _get_version_major _logger: logging.Logger = logging.getLogger(__name__) @@ -21,43 +22,42 @@ def _selected_keys(document: Dict, keys_to_write: Optional[List[str]]): if keys_to_write is None: keys_to_write = document.keys() - keys_to_write = filter(lambda x: x != '_id', keys_to_write) - return {key: document[key] for key in keys_to_write } + keys_to_write = filter(lambda x: x != "_id", keys_to_write) + return {key: document[key] for key in keys_to_write} -def _actions_generator(documents: Union[Iterable[Dict[str, Any]], Iterable[Mapping[str, Any]]], - index: str, - doc_type: Optional[str], - keys_to_write: Optional[List[str]], - id_keys: Optional[List[str]]): +def _actions_generator( + documents: Union[Iterable[Dict[str, Any]], Iterable[Mapping[str, Any]]], + index: str, + doc_type: Optional[str], + keys_to_write: Optional[List[str]], + id_keys: Optional[List[str]], +): for document in documents: if id_keys: - _id = '-'.join(list(map(lambda x: str(document[x]), id_keys))) + _id = "-".join(list(map(lambda x: str(document[x]), id_keys))) else: - _id = document.get('_id', uuid.uuid4()) + _id = document.get("_id", uuid.uuid4()) yield { - "_index": index, - "_type": doc_type, - "_id" : _id, - "_source": _selected_keys(document, keys_to_write), - } + "_index": index, + "_type": doc_type, + "_id": _id, + "_source": _selected_keys(document, keys_to_write), + } def _df_doc_generator(df: pd.DataFrame): def _deserialize(v): if isinstance(v, str): v = v.strip() - if (v.startswith('{') and v.endswith('}') - or - v.startswith('[') and v.endswith(']') - ): + if v.startswith("{") and v.endswith("}") or v.startswith("[") and v.endswith("]"): try: v = json.loads(v) except Exception as e: try: v = ast.literal_eval(v) # if properties are enclosed with single quotes except: - _logger.warning(f'could not convert string to json: {v}') + _logger.warning(f"could not convert string to json: {v}") _logger.warning(e) return v @@ -80,7 +80,7 @@ def create_index( 
index: str, doc_type: Optional[str] = None, settings: Optional[Dict[str, Any]] = None, - mappings: Optional[Dict[str, Any]] = None + mappings: Optional[Dict[str, Any]] = None, ) -> Dict[str, Any]: """Creates an index. @@ -131,31 +131,28 @@ body = {} if mappings: - if _get_distribution(client) == 'opensearch' or _get_version_major(client) >= 7: - body['mappings'] = mappings # doc type deprecated + if _get_distribution(client) == "opensearch" or _get_version_major(client) >= 7: + body["mappings"] = mappings # doc type deprecated else: if doc_type: - body['mappings'] = {doc_type: mappings} + body["mappings"] = {doc_type: mappings} else: - body['mappings'] = {index: mappings} + body["mappings"] = {index: mappings} if settings: - body['settings'] = settings + body["settings"] = settings if body == {}: body = None # ignore 400 caused by IndexAlreadyExistsException when creating an index response = client.indices.create(index, body, ignore=400) - if 'error' in response: + if "error" in response: _logger.warning(response) - if str(response['error']).startswith(u'MapperParsingException'): - raise ValueError(response['error']) + if str(response["error"]).startswith("MapperParsingException"): + raise ValueError(response["error"]) return response -def delete_index( - client: Elasticsearch, - index: str -) -> Dict[str, Any]: +def delete_index(client: Elasticsearch, index: str) -> Dict[str, Any]: """Deletes an index. Parameters @@ -185,7 +182,7 @@ # ignore 400/404 IndexNotFoundError exception response = client.indices.delete(index, ignore=[400, 404]) - if 'error' in response: + if "error" in response: _logger.warning(response) return response @@ -196,7 +193,7 @@ index: str, doc_type: Optional[str] = None, boto3_session: Optional[boto3.Session] = boto3.Session(), - **kwargs + **kwargs, ) -> Dict[str, Any]: """Index all documents from JSON file to OpenSearch index. @@ -241,20 +238,14 @@ if path.startswith("s3://"): bucket, key = parse_path(path) - s3 = boto3_session.client('s3') + s3 = boto3_session.client("s3") obj = s3.get_object(Bucket=bucket, Key=key) - body = obj['Body'].read() + body = obj["Body"].read() lines = body.splitlines() documents = map(lambda x: json.loads(x), lines) - else: # local path + else: # local path documents = _file_line_generator(path, is_json=True) - return index_documents( - client=client, - documents=documents, - index=index, - doc_type=doc_type, - **kwargs - ) + return index_documents(client=client, documents=documents, index=index, doc_type=doc_type, **kwargs) def index_csv( @@ -263,7 +254,7 @@ index: str, doc_type: Optional[str] = None, pandas_kwargs: Optional[Dict[str, Any]] = {}, - **kwargs + **kwargs, ) -> Dict[str, Any]: """Index all documents from a CSV file to OpenSearch index. @@ -316,27 +307,17 @@ ... ) """ enforced_pandas_params = { - 'skip_blank_lines': True, + "skip_blank_lines": True, # 'na_filter': True # will generate Nan value for empty cells. 
We remove Nan keys in _df_doc_generator # Note: if the user will pass na_filter=False null fields will be indexed as well ({"k1": null, "k2": null}) } pandas_kwargs.update(enforced_pandas_params) df = pd.read_csv(path, **pandas_kwargs) - return index_df( - client, - df=df, - index=index, - doc_type=doc_type, - **kwargs - ) + return index_df(client, df=df, index=index, doc_type=doc_type, **kwargs) def index_df( - client: Elasticsearch, - df: pd.DataFrame, - index: str, - doc_type: Optional[str] = None, - **kwargs + client: Elasticsearch, df: pd.DataFrame, index: str, doc_type: Optional[str] = None, **kwargs ) -> Dict[str, Any]: """Index all documents from a DataFrame to OpenSearch index. @@ -374,13 +355,7 @@ def index_df( ... ) """ - return index_documents( - client=client, - documents=_df_doc_generator(df), - index=index, - doc_type=doc_type, - **kwargs - ) + return index_documents(client=client, documents=_df_doc_generator(df), index=index, doc_type=doc_type, **kwargs) def index_documents( @@ -396,8 +371,7 @@ def index_documents( max_retries: Optional[int] = 0, initial_backoff: Optional[int] = 2, max_backoff: Optional[int] = 600, - **kwargs - + **kwargs, ) -> Dict[str, Any]: """Index all documents to OpenSearch index. @@ -467,9 +441,6 @@ def index_documents( max_retries=max_retries, initial_backoff=initial_backoff, max_backoff=max_backoff, - **kwargs + **kwargs, ) - return { - 'success': success, - 'errors': errors - } + return {"success": success, "errors": errors} diff --git a/test_infra/app.py b/test_infra/app.py index b14c1fc81..8c3395e22 100644 --- a/test_infra/app.py +++ b/test_infra/app.py @@ -16,7 +16,7 @@ ) OpenSearchStack( -app, + app, "aws-data-wrangler-opensearch", base.get_vpc, base.get_bucket, diff --git a/test_infra/stacks/opensearch_stack.py b/test_infra/stacks/opensearch_stack.py index e32ecbbaa..d5f6d1c67 100644 --- a/test_infra/stacks/opensearch_stack.py +++ b/test_infra/stacks/opensearch_stack.py @@ -1,19 +1,17 @@ from aws_cdk import aws_ec2 as ec2 from aws_cdk import aws_iam as iam from aws_cdk import aws_kms as kms +from aws_cdk import aws_opensearchservice as opensearch from aws_cdk import aws_s3 as s3 from aws_cdk import aws_secretsmanager as secrets from aws_cdk import core as cdk -from aws_cdk import aws_opensearchservice as opensearch def validate_domain_name(name: str): if not 3 <= len(name) <= 28: - raise ValueError(f'invalid domain name ({name}) - bad length ({len(name)})') + raise ValueError(f"invalid domain name ({name}) - bad length ({len(name)})") for c in name: - if not ('a' <= c <= 'z' - or c.isdigit() - or c in ['-']): + if not ("a" <= c <= "z" or c.isdigit() or c in ["-"]): raise ValueError(f'invalid domain name ({name}) - bad character ("{c}")') @@ -54,60 +52,64 @@ def _set_opensearch_infra(self) -> None: self.password = self.password_secret.to_string() def _setup_opensearch_1_0(self) -> None: - domain_name = 'wrangler-os-1-0' + domain_name = "wrangler-os-1-0" validate_domain_name(domain_name) - domain_arn = f'arn:aws:es:{self.region}:{self.account}:domain/{domain_name}' - domain = opensearch.Domain(self, domain_name, + domain_arn = f"arn:aws:es:{self.region}:{self.account}:domain/{domain_name}" + domain = opensearch.Domain( + self, + domain_name, domain_name=domain_name, version=opensearch.EngineVersion.OPENSEARCH_1_0, capacity=opensearch.CapacityConfig( - data_node_instance_type='t3.small.search', - data_nodes=1 + data_node_instance_type="t3.small.search", data_nodes=1 ), access_policies=[ iam.PolicyStatement( effect=iam.Effect.ALLOW, 
actions=["es:*"], principals=[iam.AccountRootPrincipal()], - resources=[f"{domain_arn}/*"] + resources=[f"{domain_arn}/*"], ) ], - removal_policy=cdk.RemovalPolicy.DESTROY + removal_policy=cdk.RemovalPolicy.DESTROY, ) - cdk.CfnOutput(self, f"DomainEndpoint-{domain_name}", value=domain.domain_endpoint) + cdk.CfnOutput( + self, f"DomainEndpoint-{domain_name}", value=domain.domain_endpoint + ) def _setup_elasticsearch_7_10_fgac(self) -> None: - domain_name = 'wrangler-es-7-10-fgac' + domain_name = "wrangler-es-7-10-fgac" validate_domain_name(domain_name) - domain_arn = f'arn:aws:es:{self.region}:{self.account}:domain/{domain_name}' - domain = opensearch.Domain(self, domain_name, + domain_arn = f"arn:aws:es:{self.region}:{self.account}:domain/{domain_name}" + domain = opensearch.Domain( + self, + domain_name, domain_name=domain_name, version=opensearch.EngineVersion.ELASTICSEARCH_7_10, capacity=opensearch.CapacityConfig( - data_node_instance_type='t3.small.search', - data_nodes=1 + data_node_instance_type="t3.small.search", data_nodes=1 ), access_policies=[ iam.PolicyStatement( effect=iam.Effect.ALLOW, actions=["es:*"], - principals=[iam.AnyPrincipal()], # FGAC - resources=[f"{domain_arn}/*"] + principals=[iam.AnyPrincipal()], # FGACs + resources=[f"{domain_arn}/*"], ) ], fine_grained_access_control=opensearch.AdvancedSecurityOptions( master_user_name=self.username, - # master_user_password=self.password_secret.plain_text("aws-data-wrangler/opensearch_password") - master_user_password=self.password_secret + master_user_password=self.password_secret, ), - # Node-to-node encryption is required when fine-grained access control is enabled node_to_node_encryption=True, - # Encryption-at-rest is required when fine-grained access control is enabled - encryption_at_rest=opensearch.EncryptionAtRestOptions(enabled=True, kms_key=self.key), - # Enforce HTTPS is required when fine-grained access control is enabled + encryption_at_rest=opensearch.EncryptionAtRestOptions( + enabled=True, kms_key=self.key + ), enforce_https=True, - removal_policy=cdk.RemovalPolicy.DESTROY + removal_policy=cdk.RemovalPolicy.DESTROY, ) - cdk.CfnOutput(self, f"DomainEndpoint-{domain_name}", value=domain.domain_endpoint) + cdk.CfnOutput( + self, f"DomainEndpoint-{domain_name}", value=domain.domain_endpoint + ) diff --git a/tests/_utils.py b/tests/_utils.py index 9e4b595e3..5f74c4e83 100644 --- a/tests/_utils.py +++ b/tests/_utils.py @@ -528,9 +528,10 @@ def extract_cloudformation_outputs(): client = boto3.client("cloudformation") response = try_it(client.describe_stacks, botocore.exceptions.ClientError, max_num_tries=5) for stack in response.get("Stacks"): - if (stack["StackName"] in ["aws-data-wrangler-base", "aws-data-wrangler-databases", "aws-data-wrangler-opensearch"]) and ( - stack["StackStatus"] in CFN_VALID_STATUS - ): + if ( + stack["StackName"] + in ["aws-data-wrangler-base", "aws-data-wrangler-databases", "aws-data-wrangler-opensearch"] + ) and (stack["StackStatus"] in CFN_VALID_STATUS): for output in stack.get("Outputs"): outputs[output.get("OutputKey")] = output.get("OutputValue") return outputs diff --git a/tests/test_opensearch.py b/tests/test_opensearch.py index c409e368b..68a5f4f8b 100644 --- a/tests/test_opensearch.py +++ b/tests/test_opensearch.py @@ -1,10 +1,10 @@ +import json import logging +import tempfile import boto3 -import pytest # type: ignore import pandas as pd -import json -import tempfile +import pytest # type: ignore import awswrangler as wr @@ -14,12 +14,110 @@ inspections_documents = [ 
-{"business_address":"315 California St","business_city":"San Francisco","business_id":"24936","business_latitude":"37.793199","business_location":{"lon": -122.400152,"lat": 37.793199},"business_longitude":"-122.400152","business_name":"San Francisco Soup Company","business_postal_code":"94104","business_state":"CA","inspection_date":"2016-06-09T00:00:00.000","inspection_id":"24936_20160609","inspection_score":77,"inspection_type":"Routine - Unscheduled","risk_category":"Low Risk","violation_description":"Improper food labeling or menu misrepresentation","violation_id":"24936_20160609_103141"}, -{"business_address":"10 Mason St","business_city":"San Francisco","business_id":"60354","business_latitude":"37.783527","business_location":{"lon": -122.409061,"lat": 37.783527},"business_longitude":"-122.409061","business_name":"Soup Unlimited","business_postal_code":"94102","business_state":"CA","inspection_date":"2016-11-23T00:00:00.000","inspection_id":"60354_20161123","inspection_type":"Routine", "inspection_score": 95}, -{"business_address":"2872 24th St","business_city":"San Francisco","business_id":"1797","business_latitude":"37.752807","business_location":{"lon": -122.409752,"lat": 37.752807},"business_longitude":"-122.409752","business_name":"TIO CHILOS GRILL","business_postal_code":"94110","business_state":"CA","inspection_date":"2016-07-05T00:00:00.000","inspection_id":"1797_20160705","inspection_score":90,"inspection_type":"Routine - Unscheduled","risk_category":"Low Risk","violation_description":"Unclean nonfood contact surfaces","violation_id":"1797_20160705_103142"}, -{"business_address":"1661 Tennessee St Suite 3B","business_city":"San Francisco Whard Restaurant","business_id":"66198","business_latitude":"37.75072","business_location":{"lon": -122.388478,"lat": 37.75072},"business_longitude":"-122.388478","business_name":"San Francisco Restaurant","business_postal_code":"94107","business_state":"CA","inspection_date":"2016-05-27T00:00:00.000","inspection_id":"66198_20160527","inspection_type":"Routine","inspection_score":56 }, -{"business_address":"2162 24th Ave","business_city":"San Francisco","business_id":"5794","business_latitude":"37.747228","business_location":{"lon": -122.481299,"lat": 37.747228},"business_longitude":"-122.481299","business_name":"Soup House","business_phone_number":"+14155752700","business_postal_code":"94116","business_state":"CA","inspection_date":"2016-09-07T00:00:00.000","inspection_id":"5794_20160907","inspection_score":96,"inspection_type":"Routine - Unscheduled","risk_category":"Low Risk","violation_description":"Unapproved or unmaintained equipment or utensils","violation_id":"5794_20160907_103144"}, -{"business_address":"2162 24th Ave","business_city":"San Francisco","business_id":"5794","business_latitude":"37.747228","business_location":{"lon": -122.481299,"lat": 37.747228},"business_longitude":"-122.481299","business_name":"Soup-or-Salad","business_phone_number":"+14155752700","business_postal_code":"94116","business_state":"CA","inspection_date":"2016-09-07T00:00:00.000","inspection_id":"5794_20160907","inspection_score":96,"inspection_type":"Routine - Unscheduled","risk_category":"Low Risk","violation_description":"Unapproved or unmaintained equipment or utensils","violation_id":"5794_20160907_103144"} + { + "business_address": "315 California St", + "business_city": "San Francisco", + "business_id": "24936", + "business_latitude": "37.793199", + "business_location": {"lon": -122.400152, "lat": 37.793199}, + "business_longitude": 
"-122.400152", + "business_name": "San Francisco Soup Company", + "business_postal_code": "94104", + "business_state": "CA", + "inspection_date": "2016-06-09T00:00:00.000", + "inspection_id": "24936_20160609", + "inspection_score": 77, + "inspection_type": "Routine - Unscheduled", + "risk_category": "Low Risk", + "violation_description": "Improper food labeling or menu misrepresentation", + "violation_id": "24936_20160609_103141", + }, + { + "business_address": "10 Mason St", + "business_city": "San Francisco", + "business_id": "60354", + "business_latitude": "37.783527", + "business_location": {"lon": -122.409061, "lat": 37.783527}, + "business_longitude": "-122.409061", + "business_name": "Soup Unlimited", + "business_postal_code": "94102", + "business_state": "CA", + "inspection_date": "2016-11-23T00:00:00.000", + "inspection_id": "60354_20161123", + "inspection_type": "Routine", + "inspection_score": 95, + }, + { + "business_address": "2872 24th St", + "business_city": "San Francisco", + "business_id": "1797", + "business_latitude": "37.752807", + "business_location": {"lon": -122.409752, "lat": 37.752807}, + "business_longitude": "-122.409752", + "business_name": "TIO CHILOS GRILL", + "business_postal_code": "94110", + "business_state": "CA", + "inspection_date": "2016-07-05T00:00:00.000", + "inspection_id": "1797_20160705", + "inspection_score": 90, + "inspection_type": "Routine - Unscheduled", + "risk_category": "Low Risk", + "violation_description": "Unclean nonfood contact surfaces", + "violation_id": "1797_20160705_103142", + }, + { + "business_address": "1661 Tennessee St Suite 3B", + "business_city": "San Francisco Whard Restaurant", + "business_id": "66198", + "business_latitude": "37.75072", + "business_location": {"lon": -122.388478, "lat": 37.75072}, + "business_longitude": "-122.388478", + "business_name": "San Francisco Restaurant", + "business_postal_code": "94107", + "business_state": "CA", + "inspection_date": "2016-05-27T00:00:00.000", + "inspection_id": "66198_20160527", + "inspection_type": "Routine", + "inspection_score": 56, + }, + { + "business_address": "2162 24th Ave", + "business_city": "San Francisco", + "business_id": "5794", + "business_latitude": "37.747228", + "business_location": {"lon": -122.481299, "lat": 37.747228}, + "business_longitude": "-122.481299", + "business_name": "Soup House", + "business_phone_number": "+14155752700", + "business_postal_code": "94116", + "business_state": "CA", + "inspection_date": "2016-09-07T00:00:00.000", + "inspection_id": "5794_20160907", + "inspection_score": 96, + "inspection_type": "Routine - Unscheduled", + "risk_category": "Low Risk", + "violation_description": "Unapproved or unmaintained equipment or utensils", + "violation_id": "5794_20160907_103144", + }, + { + "business_address": "2162 24th Ave", + "business_city": "San Francisco", + "business_id": "5794", + "business_latitude": "37.747228", + "business_location": {"lon": -122.481299, "lat": 37.747228}, + "business_longitude": "-122.481299", + "business_name": "Soup-or-Salad", + "business_phone_number": "+14155752700", + "business_postal_code": "94116", + "business_state": "CA", + "inspection_date": "2016-09-07T00:00:00.000", + "inspection_id": "5794_20160907", + "inspection_score": 96, + "inspection_type": "Routine - Unscheduled", + "risk_category": "Low Risk", + "violation_description": "Unapproved or unmaintained equipment or utensils", + "violation_id": "5794_20160907_103144", + }, ] @@ -30,7 +128,9 @@ def cloudformation_outputs(): 
@pytest.fixture(scope="session") def opensearch_password(): - return boto3.client("secretsmanager").get_secret_value(SecretId="aws-data-wrangler/opensearch_password")["SecretString"] + return boto3.client("secretsmanager").get_secret_value(SecretId="aws-data-wrangler/opensearch_password")[ + "SecretString" + ] @pytest.fixture(scope="session") @@ -51,9 +151,7 @@ def test_connection_opensearch_1_0(domain_endpoint_opensearch_1_0): def test_connection_elasticsearch_7_10_fgac(domain_endpoint_elasticsearch_7_10_fgac, opensearch_password): client = wr.opensearch.connect( - host=domain_endpoint_elasticsearch_7_10_fgac, - fgac_user='test', - fgac_password=opensearch_password + host=domain_endpoint_elasticsearch_7_10_fgac, fgac_user="test", fgac_password=opensearch_password ) print(client.info()) assert len(client.info()) > 0 @@ -68,243 +166,173 @@ def opensearch_1_0_client(domain_endpoint_opensearch_1_0): @pytest.fixture(scope="session") def elasticsearch_7_10_fgac_client(domain_endpoint_elasticsearch_7_10_fgac, opensearch_password): client = wr.opensearch.connect( - host=domain_endpoint_elasticsearch_7_10_fgac, - fgac_user='test', - fgac_password=opensearch_password + host=domain_endpoint_elasticsearch_7_10_fgac, fgac_user="test", fgac_password=opensearch_password ) return client + # testing multiple versions -@pytest.fixture(params=['opensearch_1_0_client', 'elasticsearch_7_10_fgac_client']) +@pytest.fixture(params=["opensearch_1_0_client", "elasticsearch_7_10_fgac_client"]) def client(request): return request.getfixturevalue(request.param) def test_create_index(client): - index = 'test_create_index' + index = "test_create_index" wr.opensearch.delete_index(client, index) response = wr.opensearch.create_index( client=client, index=index, - mappings={ - 'properties': { - 'name': {'type': 'text'}, - 'age': {'type': 'integer'} - } - }, - settings={ - 'index': { - 'number_of_shards': 1, - 'number_of_replicas': 1 - } - } + mappings={"properties": {"name": {"type": "text"}, "age": {"type": "integer"}}}, + settings={"index": {"number_of_shards": 1, "number_of_replicas": 1}}, ) - assert response.get('acknowledged', False) is True + assert response.get("acknowledged", False) is True def test_delete_index(client): - index = 'test_delete_index' - wr.opensearch.create_index( - client, - index=index - ) - response = wr.opensearch.delete_index( - client, - index=index - ) + index = "test_delete_index" + wr.opensearch.create_index(client, index=index) + response = wr.opensearch.delete_index(client, index=index) print(response) - assert response.get('acknowledged', False) is True + assert response.get("acknowledged", False) is True def test_index_df(client): - response = wr.opensearch.index_df(client, - df=pd.DataFrame([{'_id': '1', 'name': 'John'}, - {'_id': '2', 'name': 'George'}, - {'_id': '3', 'name': 'Julia'} - ]), - index='test_index_df1' - ) + response = wr.opensearch.index_df( + client, + df=pd.DataFrame([{"_id": "1", "name": "John"}, {"_id": "2", "name": "George"}, {"_id": "3", "name": "Julia"}]), + index="test_index_df1", + ) print(response) def test_index_documents(client): - response = wr.opensearch.index_documents(client, - documents=[{'_id': '1', 'name': 'John'}, - {'_id': '2', 'name': 'George'}, - {'_id': '3', 'name': 'Julia'} - ], - index='test_index_documents1' - ) + response = wr.opensearch.index_documents( + client, + documents=[{"_id": "1", "name": "John"}, {"_id": "2", "name": "George"}, {"_id": "3", "name": "Julia"}], + index="test_index_documents1", + ) print(response) def 
test_index_documents_id_keys(client): - response = wr.opensearch.index_documents(client, - documents=inspections_documents, - index='test_index_documents_id_keys', - id_keys=['inspection_id'] - ) + response = wr.opensearch.index_documents( + client, documents=inspections_documents, index="test_index_documents_id_keys", id_keys=["inspection_id"] + ) print(response) def test_index_documents_no_id_keys(client): - response = wr.opensearch.index_documents(client, - documents=inspections_documents, - index='test_index_documents_no_id_keys' - ) + response = wr.opensearch.index_documents( + client, documents=inspections_documents, index="test_index_documents_no_id_keys" + ) print(response) def test_search(client): - index = 'test_search' - response = wr.opensearch.index_documents(client, - documents=inspections_documents, - index=index, - id_keys=['inspection_id'], - refresh='wait_for' - ) + index = "test_search" + response = wr.opensearch.index_documents( + client, documents=inspections_documents, index=index, id_keys=["inspection_id"], refresh="wait_for" + ) df = wr.opensearch.search( client, index=index, - search_body={ - "query": { - "match": { - "business_name": "soup" - } - } - }, - _source=['inspection_id', 'business_name', 'business_location'] + search_body={"query": {"match": {"business_name": "soup"}}}, + _source=["inspection_id", "business_name", "business_location"], ) - print('') + print("") print(df.to_string()) assert df.shape[0] == 3 def test_search_filter_path(client): - index = 'test_search' - response = wr.opensearch.index_documents(client, - documents=inspections_documents, - index=index, - id_keys=['inspection_id'], - refresh='wait_for' - ) + index = "test_search" + response = wr.opensearch.index_documents( + client, documents=inspections_documents, index=index, id_keys=["inspection_id"], refresh="wait_for" + ) df = wr.opensearch.search( client, index=index, - search_body={ - "query": { - "match": { - "business_name": "soup" - } - } - }, - _source=['inspection_id', 'business_name', 'business_location'], - filter_path=['hits.hits._source'] + search_body={"query": {"match": {"business_name": "soup"}}}, + _source=["inspection_id", "business_name", "business_location"], + filter_path=["hits.hits._source"], ) - print('') + print("") print(df.to_string()) assert df.shape[0] == 3 def test_search_scroll(client): - index = 'test_search_scroll' - response = wr.opensearch.index_documents(client, - documents=inspections_documents, - index=index, - id_keys=['inspection_id'], - refresh='wait_for' - ) + index = "test_search_scroll" + response = wr.opensearch.index_documents( + client, documents=inspections_documents, index=index, id_keys=["inspection_id"], refresh="wait_for" + ) df = wr.opensearch.search( - client, - index=index, - is_scroll=True, - _source=['inspection_id', 'business_name', 'business_location'] + client, index=index, is_scroll=True, _source=["inspection_id", "business_name", "business_location"] ) - print('') + print("") print(df.to_string()) assert df.shape[0] == 5 def test_search_sql(client): - index = 'test_search_sql' - response = wr.opensearch.index_documents(client, - documents=inspections_documents, - index=index, - id_keys=['inspection_id'], - refresh='wait_for' - ) - df = wr.opensearch.search_by_sql( - client, - sql_query=f'select * from {index}' + index = "test_search_sql" + response = wr.opensearch.index_documents( + client, documents=inspections_documents, index=index, id_keys=["inspection_id"], refresh="wait_for" ) + df = wr.opensearch.search_by_sql(client, 
sql_query=f"select * from {index}") - print('') + print("") print(df.to_string()) assert df.shape[0] == 5 def test_index_json_local(client): - file_path = f'{tempfile.gettempdir()}/inspections.json' - with open(file_path, 'w') as filehandle: + file_path = f"{tempfile.gettempdir()}/inspections.json" + with open(file_path, "w") as filehandle: for doc in inspections_documents: - filehandle.write('%s\n' % json.dumps(doc)) - response = wr.opensearch.index_json( - client, - index='test_index_json_local', - path=file_path - ) + filehandle.write("%s\n" % json.dumps(doc)) + response = wr.opensearch.index_json(client, index="test_index_json_local", path=file_path) print(response) - assert response.get('success', 0) == 6 + assert response.get("success", 0) == 6 def test_index_json_s3(client, path): - file_path = f'{tempfile.gettempdir()}/inspections.json' - with open(file_path, 'w') as filehandle: + file_path = f"{tempfile.gettempdir()}/inspections.json" + with open(file_path, "w") as filehandle: for doc in inspections_documents: - filehandle.write('%s\n' % json.dumps(doc)) - s3 = boto3.client('s3') + filehandle.write("%s\n" % json.dumps(doc)) + s3 = boto3.client("s3") path = f"{path}opensearch/inspections.json" bucket, key = wr._utils.parse_path(path) s3.upload_file(file_path, bucket, key) - response = wr.opensearch.index_json( - client, - index='test_index_json_s3', - path=path - ) + response = wr.opensearch.index_json(client, index="test_index_json_s3", path=path) print(response) - assert response.get('success', 0) == 6 + assert response.get("success", 0) == 6 def test_index_csv_local(client): - file_path = f'{tempfile.gettempdir()}/inspections.csv' - index = 'test_index_csv_local' - df=pd.DataFrame(inspections_documents) + file_path = f"{tempfile.gettempdir()}/inspections.csv" + index = "test_index_csv_local" + df = pd.DataFrame(inspections_documents) df.to_csv(file_path, index=False) - response = wr.opensearch.index_csv( - client, - path=file_path, - index=index - ) + response = wr.opensearch.index_csv(client, path=file_path, index=index) print(response) - assert response.get('success', 0) == 6 + assert response.get("success", 0) == 6 def test_index_csv_s3(client, path): - file_path = f'{tempfile.gettempdir()}/inspections.csv' - index = 'test_index_csv_s3' - df=pd.DataFrame(inspections_documents) + file_path = f"{tempfile.gettempdir()}/inspections.csv" + index = "test_index_csv_s3" + df = pd.DataFrame(inspections_documents) df.to_csv(file_path, index=False) - s3 = boto3.client('s3') + s3 = boto3.client("s3") path = f"{path}opensearch/inspections.csv" bucket, key = wr._utils.parse_path(path) s3.upload_file(file_path, bucket, key) - response = wr.opensearch.index_csv( - client, - path=path, - index=index - ) + response = wr.opensearch.index_csv(client, path=path, index=index) print(response) - assert response.get('success', 0) == 6 + assert response.get("success", 0) == 6 From 950231d4bf88a9550e97549a9bd2bc5b04f47f4a Mon Sep 17 00:00:00 2001 From: Muralidhar Reddy Date: Tue, 28 Sep 2021 19:10:01 +0530 Subject: [PATCH 20/41] Added OpenSearch tutorial --- README.md | 1 + tutorials/031 - OpenSearch.ipynb | 235 +++++++++++++++++++++++++++++++ 2 files changed, 236 insertions(+) create mode 100644 tutorials/031 - OpenSearch.ipynb diff --git a/README.md b/README.md index 16ab96390..bed91146f 100644 --- a/README.md +++ b/README.md @@ -136,6 +136,7 @@ FROM "sampleDB"."sampleTable" ORDER BY time DESC LIMIT 3 - [026 - Amazon 
Timestream](https://github.com/awslabs/aws-data-wrangler/blob/main/tutorials/026%20-%20Amazon%20Timestream.ipynb) - [027 - Amazon Timestream 2](https://github.com/awslabs/aws-data-wrangler/blob/main/tutorials/027%20-%20Amazon%20Timestream%202.ipynb) - [028 - Amazon DynamoDB](https://github.com/awslabs/aws-data-wrangler/blob/main/tutorials/028%20-%20DynamoDB.ipynb) + - [031 - OpenSearch](https://github.com/awslabs/aws-data-wrangler/blob/main/tutorials/031%20-%20OpenSearch.ipynb) - [**API Reference**](https://aws-data-wrangler.readthedocs.io/en/2.11.0/api.html) - [Amazon S3](https://aws-data-wrangler.readthedocs.io/en/2.11.0/api.html#amazon-s3) - [AWS Glue Catalog](https://aws-data-wrangler.readthedocs.io/en/2.11.0/api.html#aws-glue-catalog) diff --git a/tutorials/031 - OpenSearch.ipynb b/tutorials/031 - OpenSearch.ipynb new file mode 100644 index 000000000..cb5cbbbf4 --- /dev/null +++ b/tutorials/031 - OpenSearch.ipynb @@ -0,0 +1,235 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![AWS Data Wrangler](_static/logo.png \"AWS Data Wrangler\")](https://github.com/awslabs/aws-data-wrangler)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 31 - OpenSearch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Table of Contents\n", + "* [1. Create Indices](#1.-Create-Indices)\n", + "* [2. Write Indices](#2.-Write-Indices)\n", + "\t* [2.1 Write from JSON files](#2.1-Write-from-JSON-files)\n", + "\t* [2.2 Write from CSV files](#2.2-Write-from-CSV-files)\n", + "* [3. Search Indices](#3.-Search-Indices)\n", + "\t* [3.1 Search by DSL](#3.1-Search-by-DSL)\n", + "\t* [3.2 Search by SQL](#3.2-Search-by-SQL)\n", + "* [4. Delete Index](#4.-Delete-Index)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import awswrangler as wr\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Enter your domain endpoint:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "client = wr.opensearch.connect(host='DOMAIN-ENDPOINT')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Create Indices" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "response = wr.opensearch.create_index(\n", + " client=client,\n", + " index=\"tutorials\",\n", + " mappings={\n", + " \"properties\": {\n", + " \"id\": { \"type\" : \"integer\" },\n", + " \"id\": { \"type\" : \"string\" },\n", + " }\n", + " },\n", + " settings={\n", + " \"index\": {\n", + " \"number_of_shards\": 2\n", + " \"number_of_replicas\": 1\n", + " }\n", + " }\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. 
Write Indices" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.1 Write from JSON files" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "wr.opensearch.index_json(\n", + " client=client,\n", + " path='s3://awswrangler-opensearch/dataload/doc1.json',\n", + " index='tutorials'\n", + " )\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.2 Write from CSV files" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "wr.opensearch.index_csv(\n", + " client=client,\n", + " path='s3://awswrangler-opensearch/dataload/doc1.csv',\n", + " index='tutorials'\n", + " )\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Search Indices" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3.1 Search by DSL" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "df = wr.opensearch.search(\n", + " client=client,\n", + " index='tutorials',\n", + " search_body={\n", + " \"query\": {\n", + " \"match_all\": {\n", + " }\n", + " }\n", + " }\n", + " )\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3.2 Search by SQL" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "df = wr.opensearch.search_by_sql(\n", + " client=client,\n", + " sql_query='SELECT * FROM tutorials LIMIT 50'\n", + " )\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Delete Index" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "response = wr.opensearch.delete_index(\n", + " client=client,\n", + " index=\"tutorials\",\n", + " )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 982975505167c7a8efe42ffa3e79515d7cb3d94e Mon Sep 17 00:00:00 2001 From: Assaf Mentzer Date: Tue, 28 Sep 2021 14:49:00 -0400 Subject: [PATCH 21/41] typing fixes --- awswrangler/opensearch/_read.py | 22 +++++------- awswrangler/opensearch/_utils.py | 19 ++++++----- awswrangler/opensearch/_write.py | 49 ++++++++++++++------------- test_infra/stacks/opensearch_stack.py | 20 +++-------- 4 files changed, 50 insertions(+), 60 deletions(-) diff --git a/awswrangler/opensearch/_read.py b/awswrangler/opensearch/_read.py index 33a2a9b2c..cb918a74d 100644 --- a/awswrangler/opensearch/_read.py +++ b/awswrangler/opensearch/_read.py @@ -1,6 +1,6 @@ """Amazon OpenSearch Read Module (PRIVATE).""" -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, Mapping, List, Union import pandas as pd from elasticsearch import Elasticsearch @@ -9,7 +9,7 @@ from awswrangler.opensearch._utils import _get_distribution -def _resolve_fields(row): +def _resolve_fields(row: Mapping[str, Any]) -> Mapping[str, Any]: fields = {} for field in row: if isinstance(row[field], dict): @@ -21,8 +21,8 @@ def 
_resolve_fields(row): return fields -def _hit_to_row(hit): - row = {} +def _hit_to_row(hit: Mapping[str, Any]) -> Mapping[str, Any]: + row: Dict[str, Any] = {} for k in hit.keys(): if k == "_source": solved_fields = _resolve_fields(hit["_source"]) @@ -32,11 +32,11 @@ def _hit_to_row(hit): return row -def _search_response_to_documents(response: dict): +def _search_response_to_documents(response: Mapping[str, Any]) -> List[Mapping[str, Any]]: return [_hit_to_row(hit) for hit in response["hits"]["hits"]] -def _search_response_to_df(response: dict): +def _search_response_to_df(response: Union[Mapping[str, Any], Any]) -> pd.DataFrame: return pd.DataFrame(_search_response_to_documents(response)) @@ -46,7 +46,7 @@ def search( search_body: Optional[Dict[str, Any]] = None, doc_type: Optional[str] = None, is_scroll: Optional[bool] = False, - **kwargs, + **kwargs: Any, ) -> pd.DataFrame: """Returns results matching query DSL as pandas dataframe. @@ -107,7 +107,7 @@ def search( return df -def search_by_sql(client: Elasticsearch, sql_query: str, **kwargs) -> pd.DataFrame: +def search_by_sql(client: Elasticsearch, sql_query: str, **kwargs: Any) -> pd.DataFrame: """Returns results matching [SQL query](https://opensearch.org/docs/search-plugins/sql/index/) as pandas dataframe Parameters @@ -138,12 +138,6 @@ def search_by_sql(client: Elasticsearch, sql_query: str, **kwargs) -> pd.DataFra """ - # can be used if not passing format - def _sql_response_to_docs(response: Dict[str, Any]): - header = list(map(lambda x: x["name"], response.get("schema", []))) - for datarow in response.get("datarows", []): - yield dict(zip(header, datarow)) - if _get_distribution(client) == "opensearch": url = "/_plugins/_sql" else: diff --git a/awswrangler/opensearch/_utils.py b/awswrangler/opensearch/_utils.py index f15f6105c..04b40b422 100644 --- a/awswrangler/opensearch/_utils.py +++ b/awswrangler/opensearch/_utils.py @@ -1,7 +1,7 @@ """Amazon OpenSearch Utils Module (PRIVATE).""" import logging -from typing import Optional +from typing import Optional, Any import boto3 from elasticsearch import Elasticsearch, RequestsHttpConnection @@ -10,15 +10,15 @@ _logger: logging.Logger = logging.getLogger(__name__) -def _get_distribution(client: Elasticsearch) -> str: +def _get_distribution(client: Elasticsearch) -> Any: return client.info().get("version", {}).get("distribution", "elasticsearch") -def _get_version(client: Elasticsearch): +def _get_version(client: Elasticsearch) -> Any: return client.info().get("version", {}).get("number") -def _get_version_major(client: Elasticsearch): +def _get_version_major(client: Elasticsearch) -> Any: version = _get_version(client) if version: return int(version.split(".")[0]) @@ -78,10 +78,13 @@ def connect( if fgac_user and fgac_password: http_auth = (fgac_user, fgac_password) else: - if region is None: - region = boto3_session.region_name - creds = boto3_session.get_credentials() - http_auth = AWS4Auth(creds.access_key, creds.secret_key, region, "es", creds.token) + if boto3_session is None: + raise ValueError('Please provide either boto3_session or fgac_user+fgac_password') + else: + if region is None: + region = boto3_session.region_name + creds = boto3_session.get_credentials() + http_auth = AWS4Auth(creds.access_key, creds.secret_key, region, "es", creds.token) try: es = Elasticsearch( host=host, diff --git a/awswrangler/opensearch/_write.py b/awswrangler/opensearch/_write.py index ace119b02..1781473d3 100644 --- a/awswrangler/opensearch/_write.py +++ b/awswrangler/opensearch/_write.py @@ 
-5,7 +5,7 @@ import logging import uuid from pathlib import Path -from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union, Generator import boto3 import pandas as pd @@ -19,10 +19,10 @@ _logger: logging.Logger = logging.getLogger(__name__) -def _selected_keys(document: Dict, keys_to_write: Optional[List[str]]): +def _selected_keys(document: Mapping[str, Any], keys_to_write: Optional[List[str]]) -> Mapping[str, Any]: if keys_to_write is None: - keys_to_write = document.keys() - keys_to_write = filter(lambda x: x != "_id", keys_to_write) + keys_to_write = list(document.keys()) + keys_to_write = list(filter(lambda x: x != "_id", keys_to_write)) return {key: document[key] for key in keys_to_write} @@ -32,7 +32,7 @@ def _actions_generator( doc_type: Optional[str], keys_to_write: Optional[List[str]], id_keys: Optional[List[str]], -): +) -> Generator[Dict[str, Any], None, None]: for document in documents: if id_keys: _id = "-".join(list(map(lambda x: str(document[x]), id_keys))) @@ -46,8 +46,8 @@ def _actions_generator( } -def _df_doc_generator(df: pd.DataFrame): - def _deserialize(v): +def _df_doc_generator(df: pd.DataFrame) -> Generator[Dict[str, Any], None, None]: + def _deserialize(v: Any) -> Any: if isinstance(v, str): v = v.strip() if v.startswith("{") and v.endswith("}") or v.startswith("[") and v.endswith("]"): @@ -66,7 +66,7 @@ def _deserialize(v): yield {k: _deserialize(v) for k, v in document.items() if notna(v)} -def _file_line_generator(path: str, is_json: bool = False): +def _file_line_generator(path: str, is_json: bool = False) -> Generator[Any, None, None]: with open(path) as fp: for line in fp: if is_json: @@ -141,10 +141,10 @@ def create_index( if settings: body["settings"] = settings if body == {}: - body = None + body = None # type: ignore # ignore 400 cause by IndexAlreadyExistsException when creating an index - response = client.indices.create(index, body, ignore=400) + response: Dict[str, Any] = client.indices.create(index, body=body, ignore=400) if "error" in response: _logger.warning(response) if str(response["error"]).startswith("MapperParsingException"): @@ -181,7 +181,7 @@ def delete_index(client: Elasticsearch, index: str) -> Dict[str, Any]: """ # ignore 400/404 IndexNotFoundError exception - response = client.indices.delete(index, ignore=[400, 404]) + response: Dict[str, Any] = client.indices.delete(index, ignore=[400, 404]) if "error" in response: _logger.warning(response) return response @@ -189,11 +189,11 @@ def delete_index(client: Elasticsearch, index: str) -> Dict[str, Any]: def index_json( client: Elasticsearch, - path: Union[str, Path], + path: str, index: str, doc_type: Optional[str] = None, boto3_session: Optional[boto3.Session] = boto3.Session(), - **kwargs, + **kwargs: Any, ) -> Dict[str, Any]: """Index all documents from JSON file to OpenSearch index. @@ -203,7 +203,7 @@ def index_json( ---------- client : Elasticsearch instance of elasticsearch.Elasticsearch to use. - path : Union[str, Path] + path : str s3 or local path to the JSON file which contains the documents. index : str Name of the index. 
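
Aside: the JSON-Lines contract described in the `index_json` docstring above can be exercised with a short sketch like the one below. This is a minimal illustration, not part of the patch; the endpoint, file path, and index name are placeholders.

```python
import awswrangler as wr

# Placeholder endpoint; substitute a real Amazon OpenSearch domain.
client = wr.opensearch.connect(host="my-domain.us-east-1.es.amazonaws.com")

# inspections.jsonl holds one JSON document per line (JSON-Lines), e.g.:
# {"inspection_id": "1", "business_name": "Soup House"}
# {"inspection_id": "2", "business_name": "Salad Bar"}
response = wr.opensearch.index_json(
    client,
    path="/tmp/inspections.jsonl",  # a local path or an s3:// URI
    index="inspections",
)
print(response)  # a dict of the form {"success": <count>, "errors": [...]}
```
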
@@ -236,25 +236,28 @@ def index_json( """ # Loading data from file + if boto3_session is None: + raise ValueError('boto3_session cannot be None') + if path.startswith("s3://"): bucket, key = parse_path(path) s3 = boto3_session.client("s3") obj = s3.get_object(Bucket=bucket, Key=key) body = obj["Body"].read() lines = body.splitlines() - documents = map(lambda x: json.loads(x), lines) + documents = list(map(lambda x: json.loads(x), lines)) # type: ignore else: # local path - documents = _file_line_generator(path, is_json=True) + documents = list(_file_line_generator(path, is_json=True)) return index_documents(client=client, documents=documents, index=index, doc_type=doc_type, **kwargs) def index_csv( client: Elasticsearch, - path: Union[str, Path], + path: str, index: str, doc_type: Optional[str] = None, - pandas_kwargs: Optional[Dict[str, Any]] = {}, - **kwargs, + pandas_kwargs: Dict[str, Any] = {}, + **kwargs: Any, ) -> Dict[str, Any]: """Index all documents from a CSV file to OpenSearch index. @@ -262,13 +265,13 @@ def index_csv( ---------- client : Elasticsearch instance of elasticsearch.Elasticsearch to use. - path : Union[str, Path] + path : str s3 or local path to the CSV file which contains the documents. index : str Name of the index. doc_type : str, optional Name of the document type (only for Elasticsearch versions 5.x and older). - pandas_kwargs : + pandas_kwargs : Dict[str, Any], optional Dictionary of arguments forwarded to pandas.read_csv(). e.g. pandas_kwargs={'sep': '|', 'na_values': ['null', 'none']} https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html @@ -317,7 +320,7 @@ def index_csv( def index_df( - client: Elasticsearch, df: pd.DataFrame, index: str, doc_type: Optional[str] = None, **kwargs + client: Elasticsearch, df: pd.DataFrame, index: str, doc_type: Optional[str] = None, **kwargs: Any ) -> Dict[str, Any]: """Index all documents from a DataFrame to OpenSearch index. @@ -371,7 +374,7 @@ def index_documents( max_retries: Optional[int] = 0, initial_backoff: Optional[int] = 2, max_backoff: Optional[int] = 600, - **kwargs, + **kwargs: Any, ) -> Dict[str, Any]: """Index all documents to OpenSearch index. 
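
For context, a minimal sketch of the DataFrame entry point whose signatures are annotated above; the endpoint and data are illustrative placeholders, assuming a reachable domain.

```python
import pandas as pd

import awswrangler as wr

# Placeholder endpoint.
client = wr.opensearch.connect(host="my-domain.us-east-1.es.amazonaws.com")

df = pd.DataFrame(
    [
        {"_id": "1", "name": "John", "age": 32},
        {"_id": "2", "name": "George", "age": 27},
    ]
)

# index_df feeds each row through _df_doc_generator (NaN values are dropped,
# JSON-like strings are deserialized) and delegates to index_documents, which
# wraps elasticsearch.helpers.bulk; an explicit `_id` column becomes the
# document id.
response = wr.opensearch.index_df(client, df=df, index="sample-index1")
print(response.get("success", 0))  # expected: 2
```
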
diff --git a/test_infra/stacks/opensearch_stack.py b/test_infra/stacks/opensearch_stack.py index d5f6d1c67..f3bc6a1f8 100644 --- a/test_infra/stacks/opensearch_stack.py +++ b/test_infra/stacks/opensearch_stack.py @@ -60,9 +60,7 @@ def _setup_opensearch_1_0(self) -> None: domain_name, domain_name=domain_name, version=opensearch.EngineVersion.OPENSEARCH_1_0, - capacity=opensearch.CapacityConfig( - data_node_instance_type="t3.small.search", data_nodes=1 - ), + capacity=opensearch.CapacityConfig(data_node_instance_type="t3.small.search", data_nodes=1), access_policies=[ iam.PolicyStatement( effect=iam.Effect.ALLOW, @@ -74,9 +72,7 @@ def _setup_opensearch_1_0(self) -> None: removal_policy=cdk.RemovalPolicy.DESTROY, ) - cdk.CfnOutput( - self, f"DomainEndpoint-{domain_name}", value=domain.domain_endpoint - ) + cdk.CfnOutput(self, f"DomainEndpoint-{domain_name}", value=domain.domain_endpoint) def _setup_elasticsearch_7_10_fgac(self) -> None: domain_name = "wrangler-es-7-10-fgac" @@ -87,9 +83,7 @@ def _setup_elasticsearch_7_10_fgac(self) -> None: domain_name, domain_name=domain_name, version=opensearch.EngineVersion.ELASTICSEARCH_7_10, - capacity=opensearch.CapacityConfig( - data_node_instance_type="t3.small.search", data_nodes=1 - ), + capacity=opensearch.CapacityConfig(data_node_instance_type="t3.small.search", data_nodes=1), access_policies=[ iam.PolicyStatement( effect=iam.Effect.ALLOW, @@ -103,13 +97,9 @@ def _setup_elasticsearch_7_10_fgac(self) -> None: master_user_password=self.password_secret, ), node_to_node_encryption=True, - encryption_at_rest=opensearch.EncryptionAtRestOptions( - enabled=True, kms_key=self.key - ), + encryption_at_rest=opensearch.EncryptionAtRestOptions(enabled=True, kms_key=self.key), enforce_https=True, removal_policy=cdk.RemovalPolicy.DESTROY, ) - cdk.CfnOutput( - self, f"DomainEndpoint-{domain_name}", value=domain.domain_endpoint - ) + cdk.CfnOutput(self, f"DomainEndpoint-{domain_name}", value=domain.domain_endpoint) From 0120e31865641810bf09db41a90b42ecdc5e5fc9 Mon Sep 17 00:00:00 2001 From: Assaf Mentzer Date: Tue, 28 Sep 2021 14:51:55 -0400 Subject: [PATCH 22/41] [skip ci] isort --- awswrangler/opensearch/_read.py | 2 +- awswrangler/opensearch/_utils.py | 2 +- awswrangler/opensearch/_write.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/awswrangler/opensearch/_read.py b/awswrangler/opensearch/_read.py index cb918a74d..49046830b 100644 --- a/awswrangler/opensearch/_read.py +++ b/awswrangler/opensearch/_read.py @@ -1,6 +1,6 @@ """Amazon OpenSearch Read Module (PRIVATE).""" -from typing import Any, Dict, Optional, Mapping, List, Union +from typing import Any, Dict, List, Mapping, Optional, Union import pandas as pd from elasticsearch import Elasticsearch diff --git a/awswrangler/opensearch/_utils.py b/awswrangler/opensearch/_utils.py index 04b40b422..283531601 100644 --- a/awswrangler/opensearch/_utils.py +++ b/awswrangler/opensearch/_utils.py @@ -1,7 +1,7 @@ """Amazon OpenSearch Utils Module (PRIVATE).""" import logging -from typing import Optional, Any +from typing import Any, Optional import boto3 from elasticsearch import Elasticsearch, RequestsHttpConnection diff --git a/awswrangler/opensearch/_write.py b/awswrangler/opensearch/_write.py index 1781473d3..8ece6f6a9 100644 --- a/awswrangler/opensearch/_write.py +++ b/awswrangler/opensearch/_write.py @@ -5,7 +5,7 @@ import logging import uuid from pathlib import Path -from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union, Generator +from typing import Any, Dict, 
Generator, Iterable, List, Mapping, Optional, Tuple, Union import boto3 import pandas as pd From b4700f6a4cab2d73be9ac8a3ba5d6ede36ec7f89 Mon Sep 17 00:00:00 2001 From: Assaf Mentzer Date: Tue, 28 Sep 2021 14:52:36 -0400 Subject: [PATCH 23/41] [skip ci] black opensearch --- awswrangler/opensearch/_utils.py | 2 +- awswrangler/opensearch/_write.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/awswrangler/opensearch/_utils.py b/awswrangler/opensearch/_utils.py index 283531601..a22f68339 100644 --- a/awswrangler/opensearch/_utils.py +++ b/awswrangler/opensearch/_utils.py @@ -79,7 +79,7 @@ def connect( http_auth = (fgac_user, fgac_password) else: if boto3_session is None: - raise ValueError('Please provide either boto3_session or fgac_user+fgac_password') + raise ValueError("Please provide either boto3_session or fgac_user+fgac_password") else: if region is None: region = boto3_session.region_name diff --git a/awswrangler/opensearch/_write.py b/awswrangler/opensearch/_write.py index 8ece6f6a9..671809058 100644 --- a/awswrangler/opensearch/_write.py +++ b/awswrangler/opensearch/_write.py @@ -237,7 +237,7 @@ def index_json( # Loading data from file if boto3_session is None: - raise ValueError('boto3_session cannot be None') + raise ValueError("boto3_session cannot be None") if path.startswith("s3://"): bucket, key = parse_path(path) From 51b81103c3b7293fd6e65774de3f9ac435d2a40b Mon Sep 17 00:00:00 2001 From: Assaf Mentzer Date: Tue, 28 Sep 2021 17:08:34 -0400 Subject: [PATCH 24/41] [skip ci] opensearch validation --- awswrangler/opensearch/_read.py | 19 +++++++++++-------- awswrangler/opensearch/_utils.py | 16 ++++++++-------- awswrangler/opensearch/_write.py | 32 +++++++++++++++++--------------- tests/test_opensearch.py | 8 ++++---- 4 files changed, 40 insertions(+), 35 deletions(-) diff --git a/awswrangler/opensearch/_read.py b/awswrangler/opensearch/_read.py index 49046830b..5afe44d80 100644 --- a/awswrangler/opensearch/_read.py +++ b/awswrangler/opensearch/_read.py @@ -48,7 +48,7 @@ def search( is_scroll: Optional[bool] = False, **kwargs: Any, ) -> pd.DataFrame: - """Returns results matching query DSL as pandas dataframe. + """Return results matching query DSL as pandas dataframe. Parameters ---------- @@ -62,12 +62,16 @@ def search( doc_type : str, optional Name of the document type (for Elasticsearch versions 5.x and earlier). is_scroll : bool, optional - Allows to retrieve a large numbers of results from a single search request using [scroll](https://opensearch.org/docs/opensearch/rest-api/scroll/) + Allows retrieving a large number of results from a single search request using + [scroll](https://opensearch.org/docs/opensearch/rest-api/scroll/), + for example, for machine learning jobs. - Because scroll search contexts consume a lot of memory, we suggest you don’t use the scroll operation for frequent user queries. + Because scroll search contexts consume a lot of memory, we suggest you don’t use the scroll operation + for frequent user queries. 
**kwargs : - KEYWORD arguments forwarded to [elasticsearch.Elasticsearch.search](https://elasticsearch-py.readthedocs.io/en/v7.13.4/api.html#elasticsearch.Elasticsearch.search) - and also to [elasticsearch.helpers.scan](https://elasticsearch-py.readthedocs.io/en/master/helpers.html#scan) if `is_scroll=True` + KEYWORD arguments forwarded to [elasticsearch.Elasticsearch.search]\ +(https://elasticsearch-py.readthedocs.io/en/v7.13.4/api.html#elasticsearch.Elasticsearch.search) + and also to [elasticsearch.helpers.scan](https://elasticsearch-py.readthedocs.io/en/master/helpers.html#scan) + if `is_scroll=True` Returns ------- @@ -99,7 +103,7 @@ def search( if is_scroll: documents_generator = scan(client, index=index, query=search_body, **kwargs) - documents = map(lambda x: _hit_to_row(x), documents_generator) + documents = [_hit_to_row(doc) for doc in documents_generator] df = pd.DataFrame(documents) else: response = client.search(index=index, body=search_body, **kwargs) @@ -108,7 +112,7 @@ def search( def search_by_sql(client: Elasticsearch, sql_query: str, **kwargs: Any) -> pd.DataFrame: - """Returns results matching [SQL query](https://opensearch.org/docs/search-plugins/sql/index/) as pandas dataframe + """Return results matching [SQL query](https://opensearch.org/docs/search-plugins/sql/index/) as pandas dataframe. Parameters ---------- @@ -137,7 +141,6 @@ def search_by_sql(client: Elasticsearch, sql_query: str, **kwargs: Any) -> pd.Da """ - if _get_distribution(client) == "opensearch": url = "/_plugins/_sql" else: diff --git a/awswrangler/opensearch/_utils.py b/awswrangler/opensearch/_utils.py index a22f68339..4d05c41f6 100644 --- a/awswrangler/opensearch/_utils.py +++ b/awswrangler/opensearch/_utils.py @@ -33,7 +33,7 @@ def connect( fgac_user: Optional[str] = None, fgac_password: Optional[str] = None, ) -> Elasticsearch: - """Creates a secure connection to the specified Amazon OpenSearch domain. + """Create a secure connection to the specified Amazon OpenSearch domain. Note ---- @@ -45,7 +45,8 @@ def connect( https://aws.amazon.com/blogs/opensource/keeping-clients-of-opensearch-and-elasticsearch-compatible-with-open-source/ https://opensearch.org/docs/clients/index/ - The username and password are mandatory if the OS Cluster uses [Fine Grained Access Control](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/fgac.html). + The username and password are mandatory if the OS Cluster uses [Fine Grained Access Control]\ +(https://docs.aws.amazon.com/opensearch-service/latest/developerguide/fgac.html). If fine grained access control is disabled, session access key and secret keys are used. Parameters @@ -69,7 +70,6 @@ def connect( Elasticsearch low-level client. 
https://elasticsearch-py.readthedocs.io/en/v7.13.4/api.html#elasticsearch """ - valid_ports = {80, 443} if port not in valid_ports: @@ -80,11 +80,11 @@ def connect( else: if boto3_session is None: raise ValueError("Please provide either boto3_session or fgac_user+fgac_password") - else: - if region is None: - region = boto3_session.region_name - creds = boto3_session.get_credentials() - http_auth = AWS4Auth(creds.access_key, creds.secret_key, region, "es", creds.token) + # else: + if region is None: + region = boto3_session.region_name + creds = boto3_session.get_credentials() + http_auth = AWS4Auth(creds.access_key, creds.secret_key, region, "es", creds.token) try: es = Elasticsearch( host=host, diff --git a/awswrangler/opensearch/_write.py b/awswrangler/opensearch/_write.py index 671809058..3f65974ce 100644 --- a/awswrangler/opensearch/_write.py +++ b/awswrangler/opensearch/_write.py @@ -4,7 +4,6 @@ import json import logging import uuid -from pathlib import Path from typing import Any, Dict, Generator, Iterable, List, Mapping, Optional, Tuple, Union import boto3 import pandas as pd @@ -35,7 +34,7 @@ def _actions_generator( ) -> Generator[Dict[str, Any], None, None]: for document in documents: if id_keys: - _id = "-".join(list(map(lambda x: str(document[x]), id_keys))) + _id = "-".join([document[id_key] for id_key in id_keys]) else: _id = document.get("_id", uuid.uuid4()) yield { @@ -53,16 +52,18 @@ def _deserialize(v: Any) -> Any: if v.startswith("{") and v.endswith("}") or v.startswith("[") and v.endswith("]"): try: v = json.loads(v) - except Exception as e: + except json.decoder.JSONDecodeError: try: v = ast.literal_eval(v) # if properties are enclosed with single quotes - except: - _logger.warning(f"could not convert string to json: {v}") + if not isinstance(v, dict): + _logger.warning("could not convert string to json: %s", v) + except SyntaxError as e: + _logger.warning("could not convert string to json: %s", v) + _logger.warning(e) return v df_iter = df.iterrows() - for i, document in df_iter: + for _, document in df_iter: yield {k: _deserialize(v) for k, v in document.items() if notna(v)} @@ -82,7 +83,7 @@ def create_index( settings: Optional[Dict[str, Any]] = None, mappings: Optional[Dict[str, Any]] = None, ) -> Dict[str, Any]: - """Creates an index. + """Create an index. Parameters ---------- @@ -128,7 +129,6 @@ def create_index( ... ) """ - body = {} if mappings: if _get_distribution(client) == "opensearch" or _get_version_major(client) >= 7: @@ -153,7 +153,7 @@ def create_index( def delete_index(client: Elasticsearch, index: str) -> Dict[str, Any]: - """Creates an index. + """Delete an index. Parameters ---------- @@ -179,7 +179,6 @@ def delete_index(client: Elasticsearch, index: str) -> Dict[str, Any]: ... 
) """ - # ignore 400/404 IndexNotFoundError exception response: Dict[str, Any] = client.indices.delete(index, ignore=[400, 404]) if "error" in response: @@ -245,7 +244,7 @@ def index_json( obj = s3.get_object(Bucket=bucket, Key=key) body = obj["Body"].read() lines = body.splitlines() - documents = list(map(lambda x: json.loads(x), lines)) # type: ignore + documents = [json.loads(line) for line in lines] else: # local path documents = list(_file_line_generator(path, is_json=True)) return index_documents(client=client, documents=documents, index=index, doc_type=doc_type, **kwargs) @@ -256,7 +255,7 @@ def index_csv( path: str, index: str, doc_type: Optional[str] = None, - pandas_kwargs: Dict[str, Any] = {}, + pandas_kwargs: Optional[Dict[str, Any]] = None, **kwargs: Any, ) -> Dict[str, Any]: """Index all documents from a CSV file to OpenSearch index. @@ -309,6 +308,8 @@ def index_csv( ... pandas_kwargs={'sep': '|', 'na_values': ['null', 'none']} ... ) """ + if pandas_kwargs is None: + pandas_kwargs = {} enforced_pandas_params = { "skip_blank_lines": True, # 'na_filter': True # will generate Nan value for empty cells. We remove Nan keys in _df_doc_generator @@ -357,7 +358,6 @@ def index_df( ... index='sample-index1' ... ) """ - return index_documents(client=client, documents=_df_doc_generator(df), index=index, doc_type=doc_type, **kwargs) @@ -415,8 +415,10 @@ def index_documents( maximum number of seconds a retry will wait (default: 600) **kwargs : KEYWORD arguments forwarded to bulk operation - elasticsearch >= 7.10.2 / opensearch: https://opensearch.org/docs/opensearch/rest-api/document-apis/bulk/#url-parameters - elasticsearch < 7.10.2: https://opendistro.github.io/for-elasticsearch-docs/docs/elasticsearch/rest-api-reference/#url-parameters + elasticsearch >= 7.10.2 / opensearch: \ +https://opensearch.org/docs/opensearch/rest-api/document-apis/bulk/#url-parameters + elasticsearch < 7.10.2: \ +https://opendistro.github.io/for-elasticsearch-docs/docs/elasticsearch/rest-api-reference/#url-parameters Returns ------- diff --git a/tests/test_opensearch.py b/tests/test_opensearch.py index 68a5f4f8b..df40e2042 100644 --- a/tests/test_opensearch.py +++ b/tests/test_opensearch.py @@ -231,7 +231,7 @@ def test_index_documents_no_id_keys(client): def test_search(client): index = "test_search" - response = wr.opensearch.index_documents( + wr.opensearch.index_documents( client, documents=inspections_documents, index=index, id_keys=["inspection_id"], refresh="wait_for" ) df = wr.opensearch.search( @@ -248,7 +248,7 @@ def test_search(client): def test_search_filter_path(client): index = "test_search" - response = wr.opensearch.index_documents( + wr.opensearch.index_documents( client, documents=inspections_documents, index=index, id_keys=["inspection_id"], refresh="wait_for" ) df = wr.opensearch.search( @@ -266,7 +266,7 @@ def test_search_filter_path(client): def test_search_scroll(client): index = "test_search_scroll" - response = wr.opensearch.index_documents( + wr.opensearch.index_documents( client, documents=inspections_documents, index=index, id_keys=["inspection_id"], refresh="wait_for" ) df = wr.opensearch.search( @@ -280,7 +280,7 @@ def test_search_scroll(client): def test_search_sql(client): index = "test_search_sql" - response = wr.opensearch.index_documents( + wr.opensearch.index_documents( client, documents=inspections_documents, index=index, id_keys=["inspection_id"], refresh="wait_for" ) df = wr.opensearch.search_by_sql(client, sql_query=f"select * from {index}") From 
39457fc1fc09d3ab09ace66ad36cb33a4bd6fcbf Mon Sep 17 00:00:00 2001 From: Assaf Mentzer Date: Tue, 28 Sep 2021 20:50:03 -0400 Subject: [PATCH 25/41] [skip ci] opensearch: poetry add requests-aws4auth and elasticsearch --- poetry.lock | 40 +++++++++++++++++++++++++++++++++++++++- pyproject.toml | 2 ++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index fab569724..c20cccf14 100644 --- a/poetry.lock +++ b/poetry.lock @@ -433,6 +433,24 @@ category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +[[package]] +name = "elasticsearch" +version = "7.13.4" +description = "Python client for Elasticsearch" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4" + +[package.dependencies] +certifi = "*" +urllib3 = ">=1.21.1,<2" + +[package.extras] +async = ["aiohttp (>=3,<4)"] +develop = ["requests (>=2.0.0,<3.0.0)", "coverage", "mock", "pyyaml", "pytest", "pytest-cov", "sphinx (<1.7)", "sphinx-rtd-theme", "black", "jinja2"] +docs = ["sphinx (<1.7)", "sphinx-rtd-theme"] +requests = ["requests (>=2.4.0,<3.0.0)"] + [[package]] name = "entrypoints" version = "0.3" @@ -1617,6 +1635,18 @@ urllib3 = ">=1.21.1,<1.27" security = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)"] socks = ["PySocks (>=1.5.6,!=1.5.7)", "win-inet-pton"] +[[package]] +name = "requests-aws4auth" +version = "1.1.1" +description = "AWS4 authentication for Requests" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +requests = "*" +six = "*" + [[package]] name = "requests-unixsocket" version = "0.2.0" @@ -2113,7 +2143,7 @@ sqlserver = ["pyodbc"] [metadata] lock-version = "1.1" python-versions = ">=3.6.2, <3.10" -content-hash = "d944bf99e7c7b4406442e1fcbc7125036eafb9081a7ca7638e9b9d9c377bb943" +content-hash = "b273034ed005e309039619d35491a4c4f615b49334487db5aa982eada794901a" [metadata.files] aiobotocore = [ @@ -2423,6 +2453,10 @@ docutils = [ {file = "docutils-0.17.1-py2.py3-none-any.whl", hash = "sha256:cf316c8370a737a022b72b56874f6602acf974a37a9fba42ec2876387549fc61"}, {file = "docutils-0.17.1.tar.gz", hash = "sha256:686577d2e4c32380bb50cbb22f575ed742d58168cee37e99117a854bcd88f125"}, ] +elasticsearch = [ + {file = "elasticsearch-7.13.4-py2.py3-none-any.whl", hash = "sha256:5920df0ab2630778680376d86bea349dc99860977eec9b6d2bd0860f337313f2"}, + {file = "elasticsearch-7.13.4.tar.gz", hash = "sha256:52dda85f76eeb85ec873bf9ffe0ba6849e544e591f66d4048a5e48016de268e0"}, +] entrypoints = [ {file = "entrypoints-0.3-py2.py3-none-any.whl", hash = "sha256:589f874b313739ad35be6e0cd7efde2a4e9b6fea91edcc34e58ecbb8dbe56d19"}, {file = "entrypoints-0.3.tar.gz", hash = "sha256:c70dd71abe5a8c85e55e12c19bd91ccfeec11a6e99044204511f9ed547d48451"}, @@ -3184,6 +3218,10 @@ requests = [ {file = "requests-2.25.1-py2.py3-none-any.whl", hash = "sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e"}, {file = "requests-2.25.1.tar.gz", hash = "sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804"}, ] +requests-aws4auth = [ + {file = "requests-aws4auth-1.1.1.tar.gz", hash = "sha256:c0883346ce30b5018903a67da88df72f73ff06e1a320845bba9cd85e811ba0ba"}, + {file = "requests_aws4auth-1.1.1-py2.py3-none-any.whl", hash = "sha256:dfd9f930ffde48a756b72b55698a8522875ea6358dcffbcc44a66700ace31783"}, +] requests-unixsocket = [ {file = "requests-unixsocket-0.2.0.tar.gz", hash = "sha256:9e5c1a20afc3cf786197ae59c79bcdb0e7565f218f27df5f891307ee8817c1ea"}, {file = 
"requests_unixsocket-0.2.0-py2.py3-none-any.whl", hash = "sha256:014d07bfb66dc805a011a8b4b306cf4ec96d2eddb589f6b2b5765e626f0dc0cc"}, diff --git a/pyproject.toml b/pyproject.toml index ff715d6d9..ae3d1f2ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,8 @@ pyodbc = { version = "~4.0.30", optional = true } sphinx-bootstrap-theme = "^0.8.0" Sphinx = "^4.2.0" tox = "^3.24.4" +elasticsearch = "7.13.4" +requests-aws4auth = "^1.1.1" [tool.poetry.extras] From 7be50627da9dbe5bb7a52c849ae7398817c51408 Mon Sep 17 00:00:00 2001 From: Assaf Mentzer Date: Tue, 28 Sep 2021 22:12:42 -0400 Subject: [PATCH 26/41] [skip ci] opensearch: add support for host with schema http/https --- awswrangler/opensearch/_utils.py | 8 +++++++- tests/test_opensearch.py | 6 ++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/awswrangler/opensearch/_utils.py b/awswrangler/opensearch/_utils.py index 4d05c41f6..d84c319cc 100644 --- a/awswrangler/opensearch/_utils.py +++ b/awswrangler/opensearch/_utils.py @@ -1,6 +1,7 @@ """Amazon OpenSearch Utils Module (PRIVATE).""" import logging +import re from typing import Any, Optional import boto3 @@ -25,6 +26,11 @@ def _get_version_major(client: Elasticsearch) -> Any: return None +def _strip_endpoint(endpoint: str) -> str: + uri_schema = re.compile(r"https?://") + return uri_schema.sub("", endpoint).strip().strip("/") + + def connect( host: str, port: Optional[int] = 443, @@ -87,7 +93,7 @@ def connect( http_auth = AWS4Auth(creds.access_key, creds.secret_key, region, "es", creds.token) try: es = Elasticsearch( - host=host, + host=_strip_endpoint(host), port=port, http_auth=http_auth, use_ssl=True, diff --git a/tests/test_opensearch.py b/tests/test_opensearch.py index df40e2042..0048ba937 100644 --- a/tests/test_opensearch.py +++ b/tests/test_opensearch.py @@ -149,6 +149,12 @@ def test_connection_opensearch_1_0(domain_endpoint_opensearch_1_0): assert len(client.info()) > 0 +def test_connection_opensearch_1_0_https(domain_endpoint_opensearch_1_0): + client = wr.opensearch.connect(host=f"https://{domain_endpoint_opensearch_1_0}") + print(client.info()) + assert len(client.info()) > 0 + + def test_connection_elasticsearch_7_10_fgac(domain_endpoint_elasticsearch_7_10_fgac, opensearch_password): client = wr.opensearch.connect( host=domain_endpoint_elasticsearch_7_10_fgac, fgac_user="test", fgac_password=opensearch_password From cb8656c316c4eb06ab8e82067e96654410d0d40e Mon Sep 17 00:00:00 2001 From: Muralidhar Reddy Date: Wed, 29 Sep 2021 11:42:02 +0530 Subject: [PATCH 27/41] Update 031 - OpenSearch.ipynb Fixed typo's --- tutorials/031 - OpenSearch.ipynb | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/tutorials/031 - OpenSearch.ipynb b/tutorials/031 - OpenSearch.ipynb index cb5cbbbf4..f1b2b5ccb 100644 --- a/tutorials/031 - OpenSearch.ipynb +++ b/tutorials/031 - OpenSearch.ipynb @@ -67,22 +67,22 @@ "metadata": {}, "outputs": [], "source": [ - "response = wr.opensearch.create_index(\n", - " client=client,\n", - " index=\"tutorials\",\n", - " mappings={\n", - " \"properties\": {\n", - " \"id\": { \"type\" : \"integer\" },\n", - " \"id\": { \"type\" : \"string\" },\n", - " }\n", - " },\n", - " settings={\n", - " \"index\": {\n", - " \"number_of_shards\": 2\n", - " \"number_of_replicas\": 1\n", - " }\n", - " }\n", - ")\n" + "response = wr.opensearch.create_index(\n", + " client=client,\n", + " index=\"tutorials\",\n", + " mappings={\n", + " \"properties\": {\n", + " \"id\": { \"type\" : \"integer\" },\n", + " 
\"name\": { \"type\" : \"string\" }\n", + " }\n", + " },\n", + " settings={\n", + " \"index\": {\n", + " \"number_of_shards\": 2,\n", + " \"number_of_replicas\": 1\n", + " }\n", + " }\n", + " )\n" ] }, { From 22b5e9b993fcd2c4bd1d0691f45b2b8db567337e Mon Sep 17 00:00:00 2001 From: Assaf Mentzer Date: Thu, 30 Sep 2021 01:45:14 -0400 Subject: [PATCH 28/41] [skip ci] opensearch: index_documents 429 error --- awswrangler/opensearch/_write.py | 147 ++++++++++++++++++++++++++----- tests/test_opensearch.py | 12 +++ 2 files changed, 137 insertions(+), 22 deletions(-) diff --git a/awswrangler/opensearch/_write.py b/awswrangler/opensearch/_write.py index 3f65974ce..7386941bf 100644 --- a/awswrangler/opensearch/_write.py +++ b/awswrangler/opensearch/_write.py @@ -9,13 +9,17 @@ import boto3 import pandas as pd from elasticsearch import Elasticsearch +from elasticsearch.exceptions import NotFoundError from elasticsearch.helpers import bulk +from jsonpath_ng import parse +from jsonpath_ng.exceptions import JsonPathParserError from pandas import notna from awswrangler._utils import parse_path from awswrangler.opensearch._utils import _get_distribution, _get_version_major _logger: logging.Logger = logging.getLogger(__name__) +_logger.setLevel(logging.DEBUG) def _selected_keys(document: Mapping[str, Any], keys_to_write: Optional[List[str]]) -> Mapping[str, Any]: @@ -31,18 +35,27 @@ def _actions_generator( doc_type: Optional[str], keys_to_write: Optional[List[str]], id_keys: Optional[List[str]], -) -> Generator[Dict[str, Any], None, None]: - for document in documents: + bulk_size: int = 10000, +) -> Generator[List[Dict[str, Any]], None, None]: + bulk_chunk_documents = [] + for i, document in enumerate(documents): if id_keys: _id = "-".join([document[id_key] for id_key in id_keys]) else: _id = document.get("_id", uuid.uuid4()) - yield { - "_index": index, - "_type": doc_type, - "_id": _id, - "_source": _selected_keys(document, keys_to_write), - } + bulk_chunk_documents.append( + { + "_index": index, + "_type": doc_type, + "_id": _id, + "_source": _selected_keys(document, keys_to_write), + } + ) + if (i + 1) % bulk_size == 0: + yield bulk_chunk_documents + bulk_chunk_documents = [] + if len(bulk_chunk_documents) > 0: + yield bulk_chunk_documents def _df_doc_generator(df: pd.DataFrame) -> Generator[Dict[str, Any], None, None]: @@ -76,6 +89,51 @@ def _file_line_generator(path: str, is_json: bool = False) -> Generator[Any, Non yield line.strip() +def _get_documents_w_json_path(documents: List[Mapping[str, Any]], json_path: str) -> List[Any]: + try: + jsonpath_expression = parse(json_path) + except JsonPathParserError as e: + _logger.error("invalid json_path: %s", json_path) + raise e + output_documents = [] + for doc in documents: + for match in jsonpath_expression.find(doc): + match_value = match.value + if isinstance(match_value, list): + output_documents += match_value + elif isinstance(match_value, dict): + output_documents.append(match_value) + else: + msg = f"expected json_path value to be a list/dict. 
received type {type(match_value)} ({match_value})" + raise ValueError(msg) + return output_documents + + +def _get_refresh_interval(client: Elasticsearch, index: str) -> Any: + url = f"/{index}/_settings" + try: + response = client.transport.perform_request("GET", url) + refresh_interval = response.get(index, {}).get("index", {}).get("refresh_interval", "1s") # type: ignore + return refresh_interval + except NotFoundError: + return None + + +def _set_refresh_interval(client: Elasticsearch, index: str, refresh_interval: str) -> Any: + url = f"/{index}/_settings" + body = {"index": {"refresh_interval": refresh_interval}} + response = client.transport.perform_request("PUT", url, headers={"Content-Type": "application/json"}, body=body) + + return response + + +def _disable_refresh_interval( + client: Elasticsearch, + index: str, +) -> Any: + return _set_refresh_interval(client=client, index=index, refresh_interval="-1") + + def create_index( client: Elasticsearch, index: str, @@ -192,11 +250,13 @@ def index_json( index: str, doc_type: Optional[str] = None, boto3_session: Optional[boto3.Session] = boto3.Session(), + json_path: Optional[str] = None, **kwargs: Any, ) -> Dict[str, Any]: """Index all documents from JSON file to OpenSearch index. - The JSON file should be in a JSON-Lines text format (newline-delimited JSON) - https://jsonlines.org/. + The JSON file should be in a JSON-Lines text format (newline-delimited JSON) - https://jsonlines.org/ + OR, if it is a single large JSON document, please provide `json_path`. Parameters ---------- @@ -208,7 +268,10 @@ def index_json( Name of the index. doc_type : str, optional Name of the document type (only for Elasticsearch versions 5.x and earlier). + json_path : str, optional + JsonPath expression to specify explicit path to a single name element + in a JSON hierarchical data structure. + Read more about [JsonPath](https://jsonpath.com) boto3_session : boto3.Session(), optional Boto3 Session to be used to access s3 if s3 path is provided. The default boto3 Session will be used if boto3_session receive None. @@ -233,7 +297,7 @@ def index_json( ... index='sample-index1' ... ) """ - # Loading data from file + _logger.debug("indexing %s from %s", index, path) if boto3_session is None: raise ValueError("boto3_session cannot be None") @@ -245,8 +309,12 @@ def index_json( body = obj["Body"].read() lines = body.splitlines() documents = [json.loads(line) for line in lines] + if json_path: + documents = _get_documents_w_json_path(documents, json_path) else: # local path documents = list(_file_line_generator(path, is_json=True)) + if json_path: + documents = _get_documents_w_json_path(documents, json_path) return index_documents(client=client, documents=documents, index=index, doc_type=doc_type, **kwargs) @@ -308,6 +376,7 @@ def index_csv( ... pandas_kwargs={'sep': '|', 'na_values': ['null', 'none']} ... 
) """ + _logger.debug("indexing %s from %s", index, path) if pandas_kwargs is None: pandas_kwargs = {} enforced_pandas_params = { "skip_blank_lines": True, # 'na_filter': True # will generate Nan value for empty cells. We remove Nan keys in _df_doc_generator @@ -369,9 +438,10 @@ def index_documents( keys_to_write: Optional[List[str]] = None, id_keys: Optional[List[str]] = None, ignore_status: Optional[Union[List[Any], Tuple[Any]]] = None, + bulk_size: int = 1000, chunk_size: Optional[int] = 500, max_chunk_bytes: Optional[int] = 100 * 1024 * 1024, - max_retries: Optional[int] = 0, + max_retries: Optional[int] = 2, initial_backoff: Optional[int] = 2, max_backoff: Optional[int] = 600, **kwargs: Any, @@ -384,6 +454,10 @@ def index_documents( https://elasticsearch-py.readthedocs.io/en/v7.13.4/helpers.html#elasticsearch.helpers.bulk https://elasticsearch-py.readthedocs.io/en/v7.13.4/helpers.html#elasticsearch.helpers.streaming_bulk + If you receive `Error 429 (Too Many Requests) /_bulk`, please try to decrease the `bulk_size` value. + Please also consider modifying the cluster size and instance type - + Read more here: https://aws.amazon.com/premiumsupport/knowledge-center/resolve-429-error-es/ + Parameters ---------- client : Elasticsearch instance of elasticsearch.Elasticsearch to use. @@ -401,13 +475,15 @@ def index_documents( otherwise will generate unique identifier for each document. ignore_status: Union[List[Any], Tuple[Any]], optional list of HTTP status codes that you want to ignore (not raising an exception) + bulk_size: int, + number of docs in each _bulk request (default: 1000) chunk_size : int, optional number of docs in one chunk sent to es (default: 500) max_chunk_bytes: int, optional the maximum size of the request in bytes (default: 100MB) max_retries : int, optional maximum number of times a document will be retried when - ``429`` is received, set to 0 (default) for no retries on ``429`` (default: 0) + ``429`` is received, set to 0 for no retries on ``429`` (default: 2) initial_backoff : int, optional number of seconds we should wait before the first retry. Any subsequent retries will be powers of ``initial_backoff*2**retry_number`` (default: 2)
) """ - success, errors = bulk( - client=client, - actions=_actions_generator(documents, index, doc_type, keys_to_write=keys_to_write, id_keys=id_keys), - ignore_status=ignore_status, - chunk_size=chunk_size, - max_chunk_bytes=max_chunk_bytes, - max_retries=max_retries, - initial_backoff=initial_backoff, - max_backoff=max_backoff, - **kwargs, + if not isinstance(documents, list): + documents = list(documents) + total_documents = len(documents) + _logger.debug("indexing %s documents into %s", total_documents, index) + + actions = _actions_generator( + documents, index, doc_type, keys_to_write=keys_to_write, id_keys=id_keys, bulk_size=bulk_size ) + + success = 0 + errors: List[Any] = [] + refresh_interval = None + try: + if total_documents > bulk_size: + refresh_interval = _get_refresh_interval(client, index) + if refresh_interval: + _disable_refresh_interval(client, index) + for bulk_chunk_documents in actions: + _logger.debug("running bulk index of %s documents", len(bulk_chunk_documents)) + _success, _errors = bulk( + client=client, + actions=bulk_chunk_documents, + ignore_status=ignore_status, + chunk_size=chunk_size, + max_chunk_bytes=max_chunk_bytes, + max_retries=max_retries, + initial_backoff=initial_backoff, + max_backoff=max_backoff, + request_timeout=30, + **kwargs, + ) + success += _success + errors += _errors # type: ignore + _logger.debug("indexed %s documents (%s/%s)", _success, success, total_documents) + finally: + if refresh_interval: + _set_refresh_interval(client, index, refresh_interval) + return {"success": success, "errors": errors} diff --git a/tests/test_opensearch.py b/tests/test_opensearch.py index 0048ba937..9c97c6d0f 100644 --- a/tests/test_opensearch.py +++ b/tests/test_opensearch.py @@ -210,6 +210,7 @@ def test_index_df(client): index="test_index_df1", ) print(response) + assert response.get("success", 0) == 3 def test_index_documents(client): @@ -219,6 +220,7 @@ def test_index_documents(client): index="test_index_documents1", ) print(response) + assert response.get("success", 0) == 3 def test_index_documents_id_keys(client): @@ -342,3 +344,13 @@ def test_index_csv_s3(client, path): response = wr.opensearch.index_csv(client, path=path, index=index) print(response) assert response.get("success", 0) == 6 + + +@pytest.mark.skip(reason="takes a long time (~5 mins) since testing against small clusters") +def test_index_json_s3_large_file(client): + path = "s3://irs-form-990/index_2011.json" + response = wr.opensearch.index_json( + client, index="test_index_json_s3_large_file", path=path, json_path="Filings2011", id_keys=["EIN"], bulk_size=20 + ) + print(response) + assert response.get("success", 0) > 0 From c5092a2b8202a2f8abdb830ec9e9582109bf2639 Mon Sep 17 00:00:00 2001 From: Assaf Mentzer Date: Thu, 30 Sep 2021 01:46:12 -0400 Subject: [PATCH 29/41] [skip ci] opensearch: add jsonpath_ng library --- poetry.lock | 34 ++++++++++++++++++++++++++++++++-- pyproject.toml | 1 + 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/poetry.lock b/poetry.lock index c20cccf14..864bdd64f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -391,7 +391,7 @@ python-versions = ">=3.6, <3.7" name = "decorator" version = "5.0.9" description = "Decorators for Humans" -category = "dev" +category = "main" optional = false python-versions = ">=3.5" @@ -725,6 +725,19 @@ python-versions = "*" [package.extras] dev = ["hypothesis"] +[[package]] +name = "jsonpath-ng" +version = "1.5.3" +description = "A final implementation of JSONPath for Python that aims to be standard compliant, 
including arithmetic and binary comparison operators and providing clear AST for metaprogramming." +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +decorator = "*" +ply = "*" +six = "*" + [[package]] name = "jsonschema" version = "3.2.0" @@ -1287,6 +1300,14 @@ importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} [package.extras] dev = ["pre-commit", "tox"] +[[package]] +name = "ply" +version = "3.11" +description = "Python Lex & Yacc" +category = "main" +optional = false +python-versions = "*" + [[package]] name = "prometheus-client" version = "0.11.0" @@ -2143,7 +2164,7 @@ sqlserver = ["pyodbc"] [metadata] lock-version = "1.1" python-versions = ">=3.6.2, <3.10" -content-hash = "b273034ed005e309039619d35491a4c4f615b49334487db5aa982eada794901a" +content-hash = "e01439ee1c27186731e13aa463473f281cf349c44dd7c9dc3b6112a6c49c533c" [metadata.files] aiobotocore = [ @@ -2565,6 +2586,11 @@ json5 = [ {file = "json5-0.9.6-py2.py3-none-any.whl", hash = "sha256:823e510eb355949bed817e1f3e2d682455dc6af9daf6066d5698d6a2ca4481c2"}, {file = "json5-0.9.6.tar.gz", hash = "sha256:9175ad1bc248e22bb8d95a8e8d765958bf0008fef2fe8abab5bc04e0f1ac8302"}, ] +jsonpath-ng = [ + {file = "jsonpath-ng-1.5.3.tar.gz", hash = "sha256:a273b182a82c1256daab86a313b937059261b5c5f8c4fa3fc38b882b344dd567"}, + {file = "jsonpath_ng-1.5.3-py2-none-any.whl", hash = "sha256:f75b95dbecb8a0f3b86fd2ead21c2b022c3f5770957492b9b6196ecccfeb10aa"}, + {file = "jsonpath_ng-1.5.3-py3-none-any.whl", hash = "sha256:292a93569d74029ba75ac2dc3d3630fc0e17b2df26119a165fa1d498ca47bf65"}, +] jsonschema = [ {file = "jsonschema-3.2.0-py2.py3-none-any.whl", hash = "sha256:4e5b3cf8216f577bee9ce139cbe72eca3ea4f292ec60928ff24758ce626cd163"}, {file = "jsonschema-3.2.0.tar.gz", hash = "sha256:c8a85b28d377cc7737e46e2d9f2b4f44ee3c0e1deac6bf46ddefc7187d30797a"}, @@ -2963,6 +2989,10 @@ pluggy = [ {file = "pluggy-0.13.1-py2.py3-none-any.whl", hash = "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"}, {file = "pluggy-0.13.1.tar.gz", hash = "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0"}, ] +ply = [ + {file = "ply-3.11-py2.py3-none-any.whl", hash = "sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce"}, + {file = "ply-3.11.tar.gz", hash = "sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3"}, +] prometheus-client = [ {file = "prometheus_client-0.11.0-py2.py3-none-any.whl", hash = "sha256:b014bc76815eb1399da8ce5fc84b7717a3e63652b0c0f8804092c9363acab1b2"}, {file = "prometheus_client-0.11.0.tar.gz", hash = "sha256:3a8baade6cb80bcfe43297e33e7623f3118d660d41387593758e2fb1ea173a86"}, diff --git a/pyproject.toml b/pyproject.toml index ae3d1f2ee..c488464e2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,6 +50,7 @@ Sphinx = "^4.2.0" tox = "^3.24.4" elasticsearch = "7.13.4" requests-aws4auth = "^1.1.1" +jsonpath-ng = "^1.5.3" [tool.poetry.extras] From 97a35bde0da80ff9722222fe95ed504e4acf55b3 Mon Sep 17 00:00:00 2001 From: Assaf Mentzer Date: Thu, 30 Sep 2021 02:22:29 -0400 Subject: [PATCH 30/41] [skip ci] opensearch: renamed fgac user/password --- awswrangler/opensearch/_utils.py | 14 +++++++------- tests/test_opensearch.py | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/awswrangler/opensearch/_utils.py b/awswrangler/opensearch/_utils.py index d84c319cc..f38b71181 100644 --- a/awswrangler/opensearch/_utils.py +++ b/awswrangler/opensearch/_utils.py @@ -36,8 +36,8 @@ def connect( port: Optional[int] = 
443, boto3_session: Optional[boto3.Session] = boto3.Session(), region: Optional[str] = None, - fgac_user: Optional[str] = None, - fgac_password: Optional[str] = None, + user: Optional[str] = None, + password: Optional[str] = None, ) -> Elasticsearch: """Create a secure connection to the specified Amazon OpenSearch domain. @@ -65,9 +65,9 @@ def connect( Boto3 Session. The default boto3 Session will be used if boto3_session receive None. region : AWS region of the Amazon OS domain. If not provided will be extracted from boto3_session. - fgac_user : + user : Fine-grained access control user. Mandatory if OS Cluster uses Fine Grained Access Control. - fgac_password : + password : Fine-grained access control password. Mandatory if OS Cluster uses Fine Grained Access Control. Returns @@ -81,11 +81,11 @@ def connect( if port not in valid_ports: raise ValueError("results: port must be one of %r." % valid_ports) - if fgac_user and fgac_password: - http_auth = (fgac_user, fgac_password) + if user and password: + http_auth = (user, password) else: if boto3_session is None: - raise ValueError("Please provide either boto3_session or fgac_user+fgac_password") + raise ValueError("Please provide either boto3_session or FGAC user+password") # else: if region is None: region = boto3_session.region_name diff --git a/tests/test_opensearch.py b/tests/test_opensearch.py index 9c97c6d0f..a62d5ed2e 100644 --- a/tests/test_opensearch.py +++ b/tests/test_opensearch.py @@ -157,7 +157,7 @@ def test_connection_opensearch_1_0_https(domain_endpoint_opensearch_1_0): def test_connection_elasticsearch_7_10_fgac(domain_endpoint_elasticsearch_7_10_fgac, opensearch_password): client = wr.opensearch.connect( - host=domain_endpoint_elasticsearch_7_10_fgac, fgac_user="test", fgac_password=opensearch_password + host=domain_endpoint_elasticsearch_7_10_fgac, user="test", password=opensearch_password ) print(client.info()) assert len(client.info()) > 0 @@ -172,7 +172,7 @@ def opensearch_1_0_client(domain_endpoint_opensearch_1_0): @pytest.fixture(scope="session") def elasticsearch_7_10_fgac_client(domain_endpoint_elasticsearch_7_10_fgac, opensearch_password): client = wr.opensearch.connect( - host=domain_endpoint_elasticsearch_7_10_fgac, fgac_user="test", fgac_password=opensearch_password + host=domain_endpoint_elasticsearch_7_10_fgac, user="test", password=opensearch_password ) return client From a73d8759fdf225724a599fc5c6fc1fcd34871dfa Mon Sep 17 00:00:00 2001 From: Assaf Mentzer Date: Thu, 30 Sep 2021 13:20:10 -0400 Subject: [PATCH 31/41] [skip ci] opensearch: add connection timeout --- awswrangler/opensearch/_utils.py | 14 +++++++++----- tests/test_opensearch.py | 4 ++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/awswrangler/opensearch/_utils.py b/awswrangler/opensearch/_utils.py index f38b71181..e3afee730 100644 --- a/awswrangler/opensearch/_utils.py +++ b/awswrangler/opensearch/_utils.py @@ -36,7 +36,7 @@ def connect( port: Optional[int] = 443, boto3_session: Optional[boto3.Session] = boto3.Session(), region: Optional[str] = None, - user: Optional[str] = None, + username: Optional[str] = None, password: Optional[str] = None, ) -> Elasticsearch: """Create a secure connection to the specified Amazon OpenSearch domain. 
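
As a quick orientation before the remaining hunks, here is a sketch of the two authentication paths `connect` supports at this point in the series; the endpoint and credentials below are placeholders, assuming a reachable domain.

```python
import awswrangler as wr

# 1) Domain with fine-grained access control: HTTP basic auth.
client = wr.opensearch.connect(
    host="my-domain.us-east-1.es.amazonaws.com",
    username="test",      # placeholder FGAC master user
    password="********",  # placeholder
)

# 2) IAM-based access: omit username/password and the boto3 session's
#    credentials are used to SigV4-sign requests via AWS4Auth (service "es").
client = wr.opensearch.connect(host="my-domain.us-east-1.es.amazonaws.com")
```
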
@@ -46,7 +46,8 @@ def connect( We use [elasticsearch-py](https://elasticsearch-py.readthedocs.io/en/v7.13.4/), an Elasticsearch client for Python, version 7.13.4, which is the recommended version for best compatibility Amazon OpenSearch, since later versions may reject connections to Amazon OpenSearch clusters. - In the future will move to a new open source client under the [OpenSearch project](https://www.opensearch.org/) + In the future we will use [opensearch-py](https://github.com/opensearch-project/opensearch-py) \ +(currently in the works). You can read more here: https://aws.amazon.com/blogs/opensource/keeping-clients-of-opensearch-and-elasticsearch-compatible-with-open-source/ https://opensearch.org/docs/clients/index/ @@ -65,7 +66,7 @@ def connect( Boto3 Session. The default boto3 Session will be used if boto3_session receive None. region : AWS region of the Amazon OS domain. If not provided will be extracted from boto3_session. - user : + username : Fine-grained access control user. Mandatory if OS Cluster uses Fine Grained Access Control. password : Fine-grained access control password. Mandatory if OS Cluster uses Fine Grained Access Control. @@ -81,8 +82,8 @@ def connect( if port not in valid_ports: raise ValueError("results: port must be one of %r." % valid_ports) - if user and password: - http_auth = (user, password) + if username and password: + http_auth = (username, password) else: if boto3_session is None: raise ValueError("Please provide either boto3_session or FGAC user+password") @@ -99,6 +100,9 @@ def connect( use_ssl=True, verify_certs=True, connection_class=RequestsHttpConnection, + timeout=30, + max_retries=10, + retry_on_timeout=True, ) except Exception as e: _logger.error("Error connecting to Opensearch cluster. Please verify authentication details") diff --git a/tests/test_opensearch.py b/tests/test_opensearch.py index a62d5ed2e..bf7c8fa39 100644 --- a/tests/test_opensearch.py +++ b/tests/test_opensearch.py @@ -157,7 +157,7 @@ def test_connection_opensearch_1_0_https(domain_endpoint_opensearch_1_0): def test_connection_elasticsearch_7_10_fgac(domain_endpoint_elasticsearch_7_10_fgac, opensearch_password): client = wr.opensearch.connect( - host=domain_endpoint_elasticsearch_7_10_fgac, user="test", password=opensearch_password + host=domain_endpoint_elasticsearch_7_10_fgac, username="test", password=opensearch_password ) print(client.info()) assert len(client.info()) > 0 @@ -172,7 +172,7 @@ def opensearch_1_0_client(domain_endpoint_opensearch_1_0): @pytest.fixture(scope="session") def elasticsearch_7_10_fgac_client(domain_endpoint_elasticsearch_7_10_fgac, opensearch_password): client = wr.opensearch.connect( - host=domain_endpoint_elasticsearch_7_10_fgac, user="test", password=opensearch_password + host=domain_endpoint_elasticsearch_7_10_fgac, username="test", password=opensearch_password ) return client From ed7a57c638639e7d60ff550b4ba06cebc5d27e1e Mon Sep 17 00:00:00 2001 From: Assaf Mentzer Date: Thu, 30 Sep 2021 20:34:22 -0400 Subject: [PATCH 32/41] opensearch: get_credentials_from_session --- awswrangler/opensearch/_utils.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/awswrangler/opensearch/_utils.py b/awswrangler/opensearch/_utils.py index e3afee730..92efc10ea 100644 --- a/awswrangler/opensearch/_utils.py +++ b/awswrangler/opensearch/_utils.py @@ -8,6 +8,8 @@ from elasticsearch import Elasticsearch, RequestsHttpConnection from requests_aws4auth import AWS4Auth +from awswrangler import _utils, exceptions + _logger: 
logging.Logger = logging.getLogger(__name__) @@ -67,7 +69,7 @@ def connect( region : AWS region of the Amazon OS domain. If not provided will be extracted from boto3_session. username : - Fine-grained access control user. Mandatory if OS Cluster uses Fine Grained Access Control. + Fine-grained access control username. Mandatory if OS Cluster uses Fine Grained Access Control. password : Fine-grained access control password. Mandatory if OS Cluster uses Fine Grained Access Control. @@ -85,13 +87,16 @@ def connect( if username and password: http_auth = (username, password) else: - if boto3_session is None: - raise ValueError("Please provide either boto3_session or FGAC user+password") - # else: if region is None: - region = boto3_session.region_name - creds = boto3_session.get_credentials() - http_auth = AWS4Auth(creds.access_key, creds.secret_key, region, "es", creds.token) + region = _utils.get_region_from_session(boto3_session=boto3_session) + creds = _utils.get_credentials_from_session(boto3_session=boto3_session) + if creds.access_key is None or creds.secret_key is None: + raise exceptions.InvalidArgument( + "One of IAM Role or AWS ACCESS_KEY_ID and SECRET_ACCESS_KEY must be " + "given. Unable to find ACCESS_KEY_ID and SECRET_ACCESS_KEY in boto3 " + "session." + ) + http_auth = AWS4Auth(creds.access_key, creds.secret_key, region, "es", session_token=creds.token) try: es = Elasticsearch( host=_strip_endpoint(host), From 545e16319e04ade43a7e264bcc50c105041abd14 Mon Sep 17 00:00:00 2001 From: Assaf Mentzer Date: Fri, 1 Oct 2021 00:58:39 -0400 Subject: [PATCH 33/41] [skip ci] opensearch: indexing progressbar --- awswrangler/opensearch/_write.py | 39 ++++++++++++++++++++++++-------- poetry.lock | 37 +++++++++++++++++++++++++++++- pyproject.toml | 1 + tests/test_opensearch.py | 2 ++ 4 files changed, 68 insertions(+), 11 deletions(-) diff --git a/awswrangler/opensearch/_write.py b/awswrangler/opensearch/_write.py index 7386941bf..1478983f9 100644 --- a/awswrangler/opensearch/_write.py +++ b/awswrangler/opensearch/_write.py @@ -8,7 +8,8 @@ import boto3 import pandas as pd -from elasticsearch import Elasticsearch +import progressbar +from elasticsearch import Elasticsearch, TransportError from elasticsearch.exceptions import NotFoundError from elasticsearch.helpers import bulk from jsonpath_ng import parse @@ -21,6 +22,8 @@ _logger: logging.Logger = logging.getLogger(__name__) _logger.setLevel(logging.DEBUG) +_DEFAULT_REFRESH_INTERVAL = "1s" + def _selected_keys(document: Mapping[str, Any], keys_to_write: Optional[List[str]]) -> Mapping[str, Any]: if keys_to_write is None: @@ -40,7 +43,7 @@ def _actions_generator( bulk_chunk_documents = [] for i, document in enumerate(documents): if id_keys: - _id = "-".join([document[id_key] for id_key in id_keys]) + _id = "-".join([str(document[id_key]) for id_key in id_keys]) else: _id = document.get("_id", uuid.uuid4()) bulk_chunk_documents.append( @@ -113,13 +116,14 @@ def _get_refresh_interval(client: Elasticsearch, index: str) -> Any: url = f"/{index}/_settings" try: response = client.transport.perform_request("GET", url) - refresh_interval = response.get(index, {}).get("index", {}).get("refresh_interval", "1s") # type: ignore + index_settings = response.get(index, {}).get("index", {}) # type: ignore + refresh_interval = index_settings.get("refresh_interval", _DEFAULT_REFRESH_INTERVAL) return refresh_interval except NotFoundError: return None -def _set_refresh_interval(client: Elasticsearch, index: str, refresh_interval: str) -> Any: +def 
_set_refresh_interval(client: Elasticsearch, index: str, refresh_interval: Optional[Any]) -> Any:
     url = f"/{index}/_settings"
     body = {"index": {"refresh_interval": refresh_interval}}
     response = client.transport.perform_request("PUT", url, headers={"Content-Type": "application/json"}, body=body)
@@ -526,11 +530,17 @@ def index_documents(
     errors: List[Any] = []
     refresh_interval = None
     try:
-        if total_documents > bulk_size:
-            refresh_interval = _get_refresh_interval(client, index)
-            if refresh_interval:
+        widgets = [
+            progressbar.Percentage(),
+            progressbar.SimpleProgress(format=" (%(value_s)s/%(max_value_s)s)"),
+            progressbar.Bar(),
+            progressbar.Timer(),
+        ]
+        progress_bar = progressbar.ProgressBar(widgets=widgets, max_value=total_documents, prefix="Indexing: ").start()
+        for i, bulk_chunk_documents in enumerate(actions):
+            if i == 1:  # second bulk iteration, in case the index didn't exist before
+                refresh_interval = _get_refresh_interval(client, index)
                 _disable_refresh_interval(client, index)
-        for bulk_chunk_documents in actions:
             _logger.debug("running bulk index of %s documents", len(bulk_chunk_documents))
             _success, _errors = bulk(
                 client=client,
@@ -547,8 +557,17 @@
             success += _success
             errors += _errors  # type: ignore
             _logger.debug("indexed %s documents (%s/%s)", _success, success, total_documents)
+            progress_bar.update(success, force=True)
+    except TransportError as e:
+        if str(e.status_code) == "429":  # Too Many Requests
+            _logger.error(
+                "Error 429 (Too Many Requests): "
+                "Try to tune the bulk_size parameter. "
+                "Read more here: https://aws.amazon.com/premiumsupport/knowledge-center/resolve-429-error-es"
+            )
+            raise e
+
     finally:
-        if refresh_interval:
-            _set_refresh_interval(client, index, refresh_interval)
+        _set_refresh_interval(client, index, refresh_interval)
 
     return {"success": success, "errors": errors}
 
diff --git a/poetry.lock b/poetry.lock
index 864bdd64f..58c0ff6d8 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1308,6 +1308,22 @@ category = "main"
 optional = false
 python-versions = "*"
 
+[[package]]
+name = "progressbar2"
+version = "3.53.3"
+description = "A Python Progressbar library to provide visual (yet text based) progress to long running operations."
+category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +python-utils = ">=2.3.0" +six = "*" + +[package.extras] +docs = ["sphinx (>=1.7.4)"] +tests = ["flake8 (>=3.7.7)", "pytest (>=4.6.9)", "pytest-cov (>=2.6.1)", "freezegun (>=0.3.11)", "sphinx (>=1.8.5)"] + [[package]] name = "prometheus-client" version = "0.11.0" @@ -1573,6 +1589,17 @@ category = "dev" optional = false python-versions = "*" +[[package]] +name = "python-utils" +version = "2.5.6" +description = "Python Utils is a module with some convenient utilities not included with the standard Python install" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +six = "*" + [[package]] name = "pytz" version = "2021.1" @@ -2164,7 +2191,7 @@ sqlserver = ["pyodbc"] [metadata] lock-version = "1.1" python-versions = ">=3.6.2, <3.10" -content-hash = "e01439ee1c27186731e13aa463473f281cf349c44dd7c9dc3b6112a6c49c533c" +content-hash = "dc43ca4a72073bdd82a0c36ec5b1b60eb68ae95055e885b190dc35b36a89137f" [metadata.files] aiobotocore = [ @@ -2993,6 +3020,10 @@ ply = [ {file = "ply-3.11-py2.py3-none-any.whl", hash = "sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce"}, {file = "ply-3.11.tar.gz", hash = "sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3"}, ] +progressbar2 = [ + {file = "progressbar2-3.53.3-py2.py3-none-any.whl", hash = "sha256:6610fe393a4591967ecf9062d42c0663c8862092245c490e5971ec5f348755ca"}, + {file = "progressbar2-3.53.3.tar.gz", hash = "sha256:f4e1c2d48e608850c59f793d6e74ccdebbcbaac7ffe917d45e9646ec0d664d6d"}, +] prometheus-client = [ {file = "prometheus_client-0.11.0-py2.py3-none-any.whl", hash = "sha256:b014bc76815eb1399da8ce5fc84b7717a3e63652b0c0f8804092c9363acab1b2"}, {file = "prometheus_client-0.11.0.tar.gz", hash = "sha256:3a8baade6cb80bcfe43297e33e7623f3118d660d41387593758e2fb1ea173a86"}, @@ -3144,6 +3175,10 @@ python-dateutil = [ python-levenshtein = [ {file = "python-Levenshtein-0.12.2.tar.gz", hash = "sha256:dc2395fbd148a1ab31090dd113c366695934b9e85fe5a4b2a032745efd0346f6"}, ] +python-utils = [ + {file = "python-utils-2.5.6.tar.gz", hash = "sha256:352d5b1febeebf9b3cdb9f3c87a3b26ef22d3c9e274a8ec1e7048ecd2fac4349"}, + {file = "python_utils-2.5.6-py2.py3-none-any.whl", hash = "sha256:18fbc1a1df9a9061e3059a48ebe5c8a66b654d688b0e3ecca8b339a7f168f208"}, +] pytz = [ {file = "pytz-2021.1-py2.py3-none-any.whl", hash = "sha256:eb10ce3e7736052ed3623d49975ce333bcd712c7bb19a58b9e2089d4057d0798"}, {file = "pytz-2021.1.tar.gz", hash = "sha256:83a4a90894bf38e243cf052c8b58f381bfe9a7a483f6a9cab140bc7f702ac4da"}, diff --git a/pyproject.toml b/pyproject.toml index c488464e2..0071a8533 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,6 +51,7 @@ tox = "^3.24.4" elasticsearch = "7.13.4" requests-aws4auth = "^1.1.1" jsonpath-ng = "^1.5.3" +progressbar2 = "^3.53.3" [tool.poetry.extras] diff --git a/tests/test_opensearch.py b/tests/test_opensearch.py index bf7c8fa39..345d248e3 100644 --- a/tests/test_opensearch.py +++ b/tests/test_opensearch.py @@ -1,6 +1,7 @@ import json import logging import tempfile +import time import boto3 import pandas as pd @@ -186,6 +187,7 @@ def client(request): def test_create_index(client): index = "test_create_index" wr.opensearch.delete_index(client, index) + time.sleep(0.5) # let the cluster clean up response = wr.opensearch.create_index( client=client, index=index, From 6042ae4b4ce3df4ec9d05b4e93ccc3cedee7c65a Mon Sep 17 00:00:00 2001 From: Assaf Mentzer Date: Sun, 3 Oct 2021 02:22:20 -0400 Subject: 
[PATCH 34/41] [skip ci] opensearch.index_documents.max_retries default 5 --- awswrangler/opensearch/_write.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/awswrangler/opensearch/_write.py b/awswrangler/opensearch/_write.py index 1478983f9..b54c7d7e6 100644 --- a/awswrangler/opensearch/_write.py +++ b/awswrangler/opensearch/_write.py @@ -445,7 +445,7 @@ def index_documents( bulk_size: int = 1000, chunk_size: Optional[int] = 500, max_chunk_bytes: Optional[int] = 100 * 1024 * 1024, - max_retries: Optional[int] = 2, + max_retries: Optional[int] = 5, initial_backoff: Optional[int] = 2, max_backoff: Optional[int] = 600, **kwargs: Any, From c53cd6fe095ab31c568b79ef9ef43c3b1a335e55 Mon Sep 17 00:00:00 2001 From: Assaf Mentzer Date: Mon, 4 Oct 2021 16:02:05 -0400 Subject: [PATCH 35/41] opensearch: replace elasticsearch-py with opensearch-py low-level client --- awswrangler/opensearch/_read.py | 22 ++++++------ awswrangler/opensearch/_utils.py | 27 ++++++-------- awswrangler/opensearch/_write.py | 62 ++++++++++++++++---------------- poetry.lock | 45 ++++++++++++----------- pyproject.toml | 2 +- 5 files changed, 75 insertions(+), 83 deletions(-) diff --git a/awswrangler/opensearch/_read.py b/awswrangler/opensearch/_read.py index 5afe44d80..015e47afa 100644 --- a/awswrangler/opensearch/_read.py +++ b/awswrangler/opensearch/_read.py @@ -3,8 +3,8 @@ from typing import Any, Dict, List, Mapping, Optional, Union import pandas as pd -from elasticsearch import Elasticsearch -from elasticsearch.helpers import scan +from opensearchpy import OpenSearch +from opensearchpy.helpers import scan from awswrangler.opensearch._utils import _get_distribution @@ -41,7 +41,7 @@ def _search_response_to_df(response: Union[Mapping[str, Any], Any]) -> pd.DataFr def search( - client: Elasticsearch, + client: OpenSearch, index: Optional[str] = "_all", search_body: Optional[Dict[str, Any]] = None, doc_type: Optional[str] = None, @@ -52,8 +52,8 @@ def search( Parameters ---------- - client : Elasticsearch - instance of elasticsearch.Elasticsearch to use. + client : OpenSearch + instance of opensearchpy.OpenSearch to use. index : str, optional A comma-separated list of index names to search. use `_all` or empty string to perform the operation on all indices. @@ -68,9 +68,9 @@ def search( Because scroll search contexts consume a lot of memory, we suggest you don’t use the scroll operation for frequent user queries. **kwargs : - KEYWORD arguments forwarded to [elasticsearch.Elasticsearch.search]\ -(https://elasticsearch-py.readthedocs.io/en/v7.13.4/api.html#elasticsearch.Elasticsearch.search) - and also to [elasticsearch.helpers.scan](https://elasticsearch-py.readthedocs.io/en/master/helpers.html#scan) + KEYWORD arguments forwarded to [opensearchpy.OpenSearch.search]\ +(https://opensearch-py.readthedocs.io/en/latest/api.html#opensearchpy.OpenSearch.search) + and also to [opensearchpy.helpers.scan](https://opensearch-py.readthedocs.io/en/master/helpers.html#scan) if `is_scroll=True` Returns @@ -111,13 +111,13 @@ def search( return df -def search_by_sql(client: Elasticsearch, sql_query: str, **kwargs: Any) -> pd.DataFrame: +def search_by_sql(client: OpenSearch, sql_query: str, **kwargs: Any) -> pd.DataFrame: """Return results matching [SQL query](https://opensearch.org/docs/search-plugins/sql/index/) as pandas dataframe. Parameters ---------- - client : Elasticsearch - instance of elasticsearch.Elasticsearch to use. + client : OpenSearch + instance of opensearchpy.OpenSearch to use. 
sql_query : str SQL query **kwargs : diff --git a/awswrangler/opensearch/_utils.py b/awswrangler/opensearch/_utils.py index 92efc10ea..b2a139cbf 100644 --- a/awswrangler/opensearch/_utils.py +++ b/awswrangler/opensearch/_utils.py @@ -5,7 +5,7 @@ from typing import Any, Optional import boto3 -from elasticsearch import Elasticsearch, RequestsHttpConnection +from opensearchpy import OpenSearch, RequestsHttpConnection from requests_aws4auth import AWS4Auth from awswrangler import _utils, exceptions @@ -13,15 +13,15 @@ _logger: logging.Logger = logging.getLogger(__name__) -def _get_distribution(client: Elasticsearch) -> Any: +def _get_distribution(client: OpenSearch) -> Any: return client.info().get("version", {}).get("distribution", "elasticsearch") -def _get_version(client: Elasticsearch) -> Any: +def _get_version(client: OpenSearch) -> Any: return client.info().get("version", {}).get("number") -def _get_version_major(client: Elasticsearch) -> Any: +def _get_version_major(client: OpenSearch) -> Any: version = _get_version(client) if version: return int(version.split(".")[0]) @@ -40,19 +40,12 @@ def connect( region: Optional[str] = None, username: Optional[str] = None, password: Optional[str] = None, -) -> Elasticsearch: +) -> OpenSearch: """Create a secure connection to the specified Amazon OpenSearch domain. Note ---- - We use [elasticsearch-py](https://elasticsearch-py.readthedocs.io/en/v7.13.4/), an Elasticsearch client for Python, - version 7.13.4, which is the recommended version for best compatibility Amazon OpenSearch, - since later versions may reject connections to Amazon OpenSearch clusters. - In the future we will use [opensearch-py](https://github.com/opensearch-project/opensearch-py) \ -(currently in the works). - You can read more here: - https://aws.amazon.com/blogs/opensource/keeping-clients-of-opensearch-and-elasticsearch-compatible-with-open-source/ - https://opensearch.org/docs/clients/index/ + We use [opensearch-py](https://github.com/opensearch-project/opensearch-py), an OpenSearch low-level python client. The username and password are mandatory if the OS Cluster uses [Fine Grained Access Control]\ (https://docs.aws.amazon.com/opensearch-service/latest/developerguide/fgac.html). @@ -75,9 +68,9 @@ def connect( Returns ------- - elasticsearch.Elasticsearch - Elasticsearch low-level client. - https://elasticsearch-py.readthedocs.io/en/v7.13.4/api.html#elasticsearch + opensearchpy.OpenSearch + OpenSearch low-level client. 
+        https://github.com/opensearch-project/opensearch-py/blob/main/opensearchpy/client/__init__.py
     """
     valid_ports = {80, 443}
 
@@ -98,7 +91,7 @@ def connect(
             )
     http_auth = AWS4Auth(creds.access_key, creds.secret_key, region, "es", session_token=creds.token)
     try:
-        es = Elasticsearch(
+        es = OpenSearch(
             host=_strip_endpoint(host),
             port=port,
             http_auth=http_auth,
diff --git a/awswrangler/opensearch/_write.py b/awswrangler/opensearch/_write.py
index b54c7d7e6..5bb081909 100644
--- a/awswrangler/opensearch/_write.py
+++ b/awswrangler/opensearch/_write.py
@@ -9,11 +9,11 @@
 import boto3
 import pandas as pd
 import progressbar
-from elasticsearch import Elasticsearch, TransportError
-from elasticsearch.exceptions import NotFoundError
-from elasticsearch.helpers import bulk
 from jsonpath_ng import parse
 from jsonpath_ng.exceptions import JsonPathParserError
+from opensearchpy import OpenSearch, TransportError
+from opensearchpy.exceptions import NotFoundError
+from opensearchpy.helpers import bulk
 from pandas import notna
 
 from awswrangler._utils import parse_path
@@ -112,7 +112,7 @@ def _get_documents_w_json_path(documents: List[Mapping[str, Any]], json_path: str) -> List[Mapping[str, Any]]:
     return output_documents
 
 
-def _get_refresh_interval(client: Elasticsearch, index: str) -> Any:
+def _get_refresh_interval(client: OpenSearch, index: str) -> Any:
     url = f"/{index}/_settings"
     try:
         response = client.transport.perform_request("GET", url)
@@ -123,7 +123,7 @@
 
 
-def _set_refresh_interval(client: Elasticsearch, index: str, refresh_interval: Optional[Any]) -> Any:
+def _set_refresh_interval(client: OpenSearch, index: str, refresh_interval: Optional[Any]) -> Any:
     url = f"/{index}/_settings"
     body = {"index": {"refresh_interval": refresh_interval}}
     response = client.transport.perform_request("PUT", url, headers={"Content-Type": "application/json"}, body=body)
@@ -132,14 +132,14 @@
 
 
 def _disable_refresh_interval(
-    client: Elasticsearch,
+    client: OpenSearch,
     index: str,
 ) -> Any:
     return _set_refresh_interval(client=client, index=index, refresh_interval="-1")
 
 
 def create_index(
-    client: Elasticsearch,
+    client: OpenSearch,
     index: str,
     doc_type: Optional[str] = None,
     settings: Optional[Dict[str, Any]] = None,
@@ -149,8 +149,8 @@
 
     Parameters
     ----------
-    client : Elasticsearch
-        instance of elasticsearch.Elasticsearch to use.
+    client : OpenSearch
+        instance of opensearchpy.OpenSearch to use.
     index : str
         Name of the index.
     doc_type : str, optional
@@ -214,13 +214,13 @@
     return response
 
 
-def delete_index(client: Elasticsearch, index: str) -> Dict[str, Any]:
+def delete_index(client: OpenSearch, index: str) -> Dict[str, Any]:
     """Delete an index.
 
     Parameters
     ----------
-    client : Elasticsearch
-        instance of elasticsearch.Elasticsearch to use.
+    client : OpenSearch
+        instance of opensearchpy.OpenSearch to use.
     index : str
         Name of the index.
 
@@ -249,7 +249,7 @@
 
 
 def index_json(
-    client: Elasticsearch,
+    client: OpenSearch,
     path: str,
     index: str,
     doc_type: Optional[str] = None,
@@ -264,14 +264,14 @@
 
     Parameters
     ----------
-    client : Elasticsearch
-        instance of elasticsearch.Elasticsearch to use.
+    client : OpenSearch
+        instance of opensearchpy.OpenSearch to use.
     path : str
         s3 or local path to the JSON file which contains the documents.
     index : str
         Name of the index.
doc_type : str, optional
-        Name of the document type (only for Elasticsearch versions 5.x and earlier).
+        Name of the document type (for Elasticsearch versions 5.x and earlier).
     json_path : str, optional
         JsonPath expression to specify explicit path to a single name element
         in a JSON hierarchical data structure.
@@ -323,7 +323,7 @@
 
 
 def index_csv(
-    client: Elasticsearch,
+    client: OpenSearch,
     path: str,
     index: str,
     doc_type: Optional[str] = None,
@@ -334,14 +334,14 @@
 
     Parameters
     ----------
-    client : Elasticsearch
-        instance of elasticsearch.Elasticsearch to use.
+    client : OpenSearch
+        instance of opensearchpy.OpenSearch to use.
     path : str
         s3 or local path to the CSV file which contains the documents.
     index : str
         Name of the index.
     doc_type : str, optional
-        Name of the document type (only for Elasticsearch versions 5.x and older).
+        Name of the document type (for Elasticsearch versions 5.x and earlier).
     pandas_kwargs : Dict[str, Any], optional
         Dictionary of arguments forwarded to pandas.read_csv().
         e.g. pandas_kwargs={'sep': '|', 'na_values': ['null', 'none']}
@@ -394,20 +394,20 @@
 
 
 def index_df(
-    client: Elasticsearch, df: pd.DataFrame, index: str, doc_type: Optional[str] = None, **kwargs: Any
+    client: OpenSearch, df: pd.DataFrame, index: str, doc_type: Optional[str] = None, **kwargs: Any
 ) -> Dict[str, Any]:
     """Index all documents from a DataFrame to OpenSearch index.
 
     Parameters
     ----------
-    client : Elasticsearch
-        instance of elasticsearch.Elasticsearch to use.
+    client : OpenSearch
+        instance of opensearchpy.OpenSearch to use.
     df : pd.DataFrame
         Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
     index : str
         Name of the index.
     doc_type : str, optional
-        Name of the document type (only for Elasticsearch versions 5.x and older).
+        Name of the document type (for Elasticsearch versions 5.x and earlier).
     **kwargs :
         KEYWORD arguments forwarded to :func:`~awswrangler.opensearch.index_documents`
         which is used to execute the operation
@@ -435,7 +435,7 @@
 
 
 def index_documents(
-    client: Elasticsearch,
+    client: OpenSearch,
     documents: Iterable[Mapping[str, Any]],
     index: str,
     doc_type: Optional[str] = None,
@@ -454,9 +454,9 @@
 
     Note
     ----
-    Some of the args are referenced from elasticsearch-py client library (bulk helpers)
-    https://elasticsearch-py.readthedocs.io/en/v7.13.4/helpers.html#elasticsearch.helpers.bulk
-    https://elasticsearch-py.readthedocs.io/en/v7.13.4/helpers.html#elasticsearch.helpers.streaming_bulk
+    Some of the args are referenced from opensearch-py client library (bulk helpers)
+    https://opensearch-py.readthedocs.io/en/latest/helpers.html#opensearchpy.helpers.bulk
+    https://opensearch-py.readthedocs.io/en/latest/helpers.html#opensearchpy.helpers.streaming_bulk
 
     If you receive `Error 429 (Too Many Requests) /_bulk` please try to decrease the `bulk_size` value.
     Please also consider modifying the cluster size and instance type -
@@ -464,14 +464,14 @@
 
     Parameters
     ----------
-    client : Elasticsearch
-        instance of elasticsearch.Elasticsearch to use.
+    client : OpenSearch
+        instance of opensearchpy.OpenSearch to use.
     documents : Iterable[Mapping[str, Any]]
        List which contains the documents that will be inserted.
     index : str
        Name of the index.
     doc_type : str, optional
-        Name of the document type (only for Elasticsearch versions 5.x and older).
+        Name of the document type (for Elasticsearch versions 5.x and earlier).
keys_to_write : List[str], optional list of keys to index. If not provided all keys will be indexed id_keys : List[str], optional diff --git a/poetry.lock b/poetry.lock index 58c0ff6d8..790afb055 100644 --- a/poetry.lock +++ b/poetry.lock @@ -433,24 +433,6 @@ category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" -[[package]] -name = "elasticsearch" -version = "7.13.4" -description = "Python client for Elasticsearch" -category = "main" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4" - -[package.dependencies] -certifi = "*" -urllib3 = ">=1.21.1,<2" - -[package.extras] -async = ["aiohttp (>=3,<4)"] -develop = ["requests (>=2.0.0,<3.0.0)", "coverage", "mock", "pyyaml", "pytest", "pytest-cov", "sphinx (<1.7)", "sphinx-rtd-theme", "black", "jinja2"] -docs = ["sphinx (<1.7)", "sphinx-rtd-theme"] -requests = ["requests (>=2.4.0,<3.0.0)"] - [[package]] name = "entrypoints" version = "0.3" @@ -1165,6 +1147,23 @@ python-versions = ">=3.6," [package.dependencies] et-xmlfile = "*" +[[package]] +name = "opensearch-py" +version = "1.0.0" +description = "Python low-level client for OpenSearch" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4" + +[package.dependencies] +certifi = "*" +urllib3 = ">=1.21.1,<2" + +[package.extras] +async = ["aiohttp (>=3,<4)"] +develop = ["requests (>=2.0.0,<3.0.0)", "coverage", "mock", "pyyaml", "pytest", "pytest-cov", "black", "jinja2"] +requests = ["requests (>=2.4.0,<3.0.0)"] + [[package]] name = "packaging" version = "21.0" @@ -2191,7 +2190,7 @@ sqlserver = ["pyodbc"] [metadata] lock-version = "1.1" python-versions = ">=3.6.2, <3.10" -content-hash = "dc43ca4a72073bdd82a0c36ec5b1b60eb68ae95055e885b190dc35b36a89137f" +content-hash = "5ae102e5d974439be05598ed53be1817216b1d958b89dfdd5ef1622c9708847a" [metadata.files] aiobotocore = [ @@ -2501,10 +2500,6 @@ docutils = [ {file = "docutils-0.17.1-py2.py3-none-any.whl", hash = "sha256:cf316c8370a737a022b72b56874f6602acf974a37a9fba42ec2876387549fc61"}, {file = "docutils-0.17.1.tar.gz", hash = "sha256:686577d2e4c32380bb50cbb22f575ed742d58168cee37e99117a854bcd88f125"}, ] -elasticsearch = [ - {file = "elasticsearch-7.13.4-py2.py3-none-any.whl", hash = "sha256:5920df0ab2630778680376d86bea349dc99860977eec9b6d2bd0860f337313f2"}, - {file = "elasticsearch-7.13.4.tar.gz", hash = "sha256:52dda85f76eeb85ec873bf9ffe0ba6849e544e591f66d4048a5e48016de268e0"}, -] entrypoints = [ {file = "entrypoints-0.3-py2.py3-none-any.whl", hash = "sha256:589f874b313739ad35be6e0cd7efde2a4e9b6fea91edcc34e58ecbb8dbe56d19"}, {file = "entrypoints-0.3.tar.gz", hash = "sha256:c70dd71abe5a8c85e55e12c19bd91ccfeec11a6e99044204511f9ed547d48451"}, @@ -2932,6 +2927,10 @@ openpyxl = [ {file = "openpyxl-3.0.7-py2.py3-none-any.whl", hash = "sha256:46af4eaf201a89b610fcca177eed957635f88770a5462fb6aae4a2a52b0ff516"}, {file = "openpyxl-3.0.7.tar.gz", hash = "sha256:6456a3b472e1ef0facb1129f3c6ef00713cebf62e736cd7a75bcc3247432f251"}, ] +opensearch-py = [ + {file = "opensearch-py-1.0.0.tar.gz", hash = "sha256:fa952836cabfa1b2fb05f852edc1a373342494345e89fd52b7124daf4d296bb4"}, + {file = "opensearch_py-1.0.0-py2.py3-none-any.whl", hash = "sha256:17afebc25dc890b96c4e9ec8692dcfdb6842c028ce8c2d252e8f55c587960177"}, +] packaging = [ {file = "packaging-21.0-py3-none-any.whl", hash = "sha256:c86254f9220d55e31cc94d69bade760f0847da8000def4dfe1c6b872fd14ff14"}, {file = "packaging-21.0.tar.gz", hash = 
"sha256:7dc96269f53a4ccec5c0670940a4281106dd0bb343f47b7471f779df49c2fbe7"}, diff --git a/pyproject.toml b/pyproject.toml index 0071a8533..0ff7aa28f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,10 +48,10 @@ pyodbc = { version = "~4.0.30", optional = true } sphinx-bootstrap-theme = "^0.8.0" Sphinx = "^4.2.0" tox = "^3.24.4" -elasticsearch = "7.13.4" requests-aws4auth = "^1.1.1" jsonpath-ng = "^1.5.3" progressbar2 = "^3.53.3" +opensearch-py = "^1.0.0" [tool.poetry.extras] From 5c5d71780bcb11f59fa84016877bfad638329c1a Mon Sep 17 00:00:00 2001 From: Assaf Mentzer Date: Mon, 4 Oct 2021 23:57:30 -0400 Subject: [PATCH 36/41] [skip ci] opensearch filter_path default value --- awswrangler/opensearch/_read.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/awswrangler/opensearch/_read.py b/awswrangler/opensearch/_read.py index 015e47afa..ba345ab43 100644 --- a/awswrangler/opensearch/_read.py +++ b/awswrangler/opensearch/_read.py @@ -1,6 +1,6 @@ """Amazon OpenSearch Read Module (PRIVATE).""" -from typing import Any, Dict, List, Mapping, Optional, Union +from typing import Any, Collection, Dict, List, Mapping, Optional, Union import pandas as pd from opensearchpy import OpenSearch @@ -46,6 +46,7 @@ def search( search_body: Optional[Dict[str, Any]] = None, doc_type: Optional[str] = None, is_scroll: Optional[bool] = False, + filter_path: Optional[Union[str, Collection[str]]] = None, **kwargs: Any, ) -> pd.DataFrame: """Return results matching query DSL as pandas dataframe. @@ -67,6 +68,9 @@ def search( for example, for machine learning jobs. Because scroll search contexts consume a lot of memory, we suggest you don’t use the scroll operation for frequent user queries. + filter_path : Union[str, Collection[str]], optional + Use the filter_path parameter to reduce the size of the OpenSearch Service response \ +(default: ['hits.hits._id','hits.hits._source']) **kwargs : KEYWORD arguments forwarded to [opensearchpy.OpenSearch.search]\ (https://opensearch-py.readthedocs.io/en/latest/api.html#opensearchpy.OpenSearch.search) @@ -101,12 +105,18 @@ def search( if doc_type: kwargs["doc_type"] = doc_type + if filter_path is None: + filter_path = ["hits.hits._id", "hits.hits._source"] + if is_scroll: - documents_generator = scan(client, index=index, query=search_body, **kwargs) + if isinstance(filter_path, str): + filter_path = [filter_path] + filter_path = ["_scroll_id", "_shards"] + list(filter_path) # required for scroll + documents_generator = scan(client, index=index, query=search_body, filter_path=filter_path, **kwargs) documents = [_hit_to_row(doc) for doc in documents_generator] df = pd.DataFrame(documents) else: - response = client.search(index=index, body=search_body, **kwargs) + response = client.search(index=index, body=search_body, filter_path=filter_path, **kwargs) df = _search_response_to_df(response) return df From 152c4076ee906b398a7822be62e0778adb17844a Mon Sep 17 00:00:00 2001 From: Assaf Mentzer Date: Mon, 4 Oct 2021 23:58:22 -0400 Subject: [PATCH 37/41] [skip ci] opensearch tutorial --- tutorials/031 - OpenSearch.ipynb | 1575 ++++++++++++++++++++++++++++-- 1 file changed, 1504 insertions(+), 71 deletions(-) diff --git a/tutorials/031 - OpenSearch.ipynb b/tutorials/031 - OpenSearch.ipynb index f1b2b5ccb..afe254669 100644 --- a/tutorials/031 - OpenSearch.ipynb +++ b/tutorials/031 - OpenSearch.ipynb @@ -19,14 +19,30 @@ "metadata": {}, "source": [ "## Table of Contents\n", - "* [1. Create Indices](#2.-Create-Indices)\n", - "* [2. 
Write Indices](#3.-Write-Indices)\n", - "\t* [2.1 Writing from JSON file](#2.1-Writing-from-JSON-file)\n", - "\t* [2.2 Writing from CSV file](#2.2-Writing-from-CSV-file)\n", - "* [3. Search Indices](#1.-Search-Indices)\n", - "\t* [3.1 Search by DSL](#1.1-Search-by-DSL)\n", - "\t* [3.2 Search by SQL](#1.2-Search-by-SQL)\n", - "* [4. Delete Indices](#7.-Delete-Indices)\n" + "* [1. Initialize](#initialize)\n", + " * [Connect to your Amazon OpenSearch domain](#connect)\n", + " * [Enter your bucket name](#bucket)\n", + " * [Initialize sample data](#sample-data)\n", + "* [2. Indexing (load)](#indexing)\n", + "\t* [Index documents (no Pandas)](#index-documents)\n", + "\t* [Index json file](#index-json)\n", + " * [Index CSV](#index-csv)\n", + "* [3. Search](#search)\n", + "\t* [3.1 Search by DSL](#search-dsl)\n", + "\t* [3.2 Search by SQL](#search-sql)\n", + "* [4. Delete Indices](#delete-index)\n", + "* [5. Bonus - Prepare data and index from DataFrame](#bonus)\n", + "\t* [Prepare the data for indexing](#prepare-data)\n", + " * [Create index with mapping](#create-index-w-mapping)\n", + " * [Index dataframe](#index-df)\n", + " * [Execute geo query](#search-geo)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Initialize" ] }, { @@ -35,14 +51,14 @@ "metadata": {}, "outputs": [], "source": [ - "import awswrangler as wr\n" + "import awswrangler as wr" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Enter your domain endpoint:" + "### Connect to your Amazon OpenSearch domain" ] }, { @@ -51,14 +67,19 @@ "metadata": {}, "outputs": [], "source": [ - "client = wr.opensearch.connect(host='DOMAIN-ENDPOINT')" + "client = wr.opensearch.connect(\n", + " host='OPENSEARCH-ENDPOINT',\n", + "# username='FGAC-USERNAME(OPTIONAL)',\n", + "# password='FGAC-PASSWORD(OPTIONAL)'\n", + ")\n", + "client.info()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 1. 
Create Indices" + "### Enter your bucket name" ] }, { @@ -67,134 +88,944 @@ "metadata": {}, "outputs": [], "source": [ - "response = wr.opensearch.create_index(\n", - " client=client,\n", - " index=\"tutorials\",\n", - " mappings={\n", - " \"properties\": {\n", - " \"id\": { \"type\" : \"integer\" },\n", - " \"name\": { \"type\" : \"string\" }\n", - " }\n", - " },\n", - " settings={\n", - " \"index\": {\n", - " \"number_of_shards\": 2,\n", - " \"number_of_replicas\": 1\n", - " }\n", - " }\n", - " )\n" + "bucket = 'BUCKET'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Initialize sample data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "sf_restaurants_inspections = [\n", + " {\n", + " \"inspection_id\": \"24936_20160609\",\n", + " \"business_address\": \"315 California St\",\n", + " \"business_city\": \"San Francisco\",\n", + " \"business_id\": \"24936\",\n", + " \"business_location\": {\"lon\": -122.400152, \"lat\": 37.793199},\n", + " \"business_name\": \"San Francisco Soup Company\",\n", + " \"business_postal_code\": \"94104\",\n", + " \"business_state\": \"CA\",\n", + " \"inspection_date\": \"2016-06-09T00:00:00.000\",\n", + " \"inspection_score\": 77,\n", + " \"inspection_type\": \"Routine - Unscheduled\",\n", + " \"risk_category\": \"Low Risk\",\n", + " \"violation_description\": \"Improper food labeling or menu misrepresentation\",\n", + " \"violation_id\": \"24936_20160609_103141\",\n", + " },\n", + " {\n", + " \"inspection_id\": \"60354_20161123\",\n", + " \"business_address\": \"10 Mason St\",\n", + " \"business_city\": \"San Francisco\",\n", + " \"business_id\": \"60354\",\n", + " \"business_location\": {\"lon\": -122.409061, \"lat\": 37.783527},\n", + " \"business_name\": \"Soup Unlimited\",\n", + " \"business_postal_code\": \"94102\",\n", + " \"business_state\": \"CA\",\n", + " \"inspection_date\": \"2016-11-23T00:00:00.000\",\n", + " \"inspection_type\": \"Routine\",\n", + " \"inspection_score\": 95,\n", + " },\n", + " {\n", + " \"inspection_id\": \"1797_20160705\",\n", + " \"business_address\": \"2872 24th St\",\n", + " \"business_city\": \"San Francisco\",\n", + " \"business_id\": \"1797\",\n", + " \"business_location\": {\"lon\": -122.409752, \"lat\": 37.752807},\n", + " \"business_name\": \"TIO CHILOS GRILL\",\n", + " \"business_postal_code\": \"94110\",\n", + " \"business_state\": \"CA\",\n", + " \"inspection_date\": \"2016-07-05T00:00:00.000\",\n", + " \"inspection_score\": 90,\n", + " \"inspection_type\": \"Routine - Unscheduled\",\n", + " \"risk_category\": \"Low Risk\",\n", + " \"violation_description\": \"Unclean nonfood contact surfaces\",\n", + " \"violation_id\": \"1797_20160705_103142\",\n", + " },\n", + " {\n", + " \"inspection_id\": \"66198_20160527\",\n", + " \"business_address\": \"1661 Tennessee St Suite 3B\",\n", + " \"business_city\": \"San Francisco Whard Restaurant\",\n", + " \"business_id\": \"66198\",\n", + " \"business_location\": {\"lon\": -122.388478, \"lat\": 37.75072},\n", + " \"business_name\": \"San Francisco Restaurant\",\n", + " \"business_postal_code\": \"94107\",\n", + " \"business_state\": \"CA\",\n", + " \"inspection_date\": \"2016-05-27T00:00:00.000\",\n", + " \"inspection_type\": \"Routine\",\n", + " \"inspection_score\": 56,\n", + " },\n", + " {\n", + " \"inspection_id\": \"5794_20160907\",\n", + " \"business_address\": \"2162 24th Ave\",\n", + " \"business_city\": \"San Francisco\",\n", + " \"business_id\": \"5794\",\n", + " 
\"business_location\": {\"lon\": -122.481299, \"lat\": 37.747228},\n", + " \"business_name\": \"Soup House\",\n", + " \"business_phone_number\": \"+14155752700\",\n", + " \"business_postal_code\": \"94116\",\n", + " \"business_state\": \"CA\",\n", + " \"inspection_date\": \"2016-09-07T00:00:00.000\",\n", + " \"inspection_score\": 96,\n", + " \"inspection_type\": \"Routine - Unscheduled\",\n", + " \"risk_category\": \"Low Risk\",\n", + " \"violation_description\": \"Unapproved or unmaintained equipment or utensils\",\n", + " \"violation_id\": \"5794_20160907_103144\",\n", + " },\n", + " \n", + " # duplicate record\n", + " {\n", + " \"inspection_id\": \"5794_20160907\",\n", + " \"business_address\": \"2162 24th Ave\",\n", + " \"business_city\": \"San Francisco\",\n", + " \"business_id\": \"5794\",\n", + " \"business_location\": {\"lon\": -122.481299, \"lat\": 37.747228},\n", + " \"business_name\": \"Soup-or-Salad\",\n", + " \"business_phone_number\": \"+14155752700\",\n", + " \"business_postal_code\": \"94116\",\n", + " \"business_state\": \"CA\",\n", + " \"inspection_date\": \"2016-09-07T00:00:00.000\",\n", + " \"inspection_score\": 96,\n", + " \"inspection_type\": \"Routine - Unscheduled\",\n", + " \"risk_category\": \"Low Risk\",\n", + " \"violation_description\": \"Unapproved or unmaintained equipment or utensils\",\n", + " \"violation_id\": \"5794_20160907_103144\",\n", + " },\n", + "]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 2. Write Indices" + "## 2. Indexing (load)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### 2.1 Write from JSON files" + "### Index documents (no Pandas)" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Indexing: 100% (6/6)|####################################|Elapsed Time: 0:00:01" + ] + }, + { + "data": { + "text/plain": [ + "{'success': 6, 'errors': []}" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# index documents w/o providing keys (_id is auto-generated)\n", + "wr.opensearch.index_documents(\n", + " client,\n", + " documents=sf_restaurants_inspections,\n", + " index=\"sf_restaurants_inspections\" \n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
_idbusiness_nameinspection_idbusiness_location.lonbusiness_location.lat
0663dd72d-0da4-495b-b0ae-ed000105ae73TIO CHILOS GRILL1797_20160705-122.40975237.752807
1ff2f50f6-5415-4706-9bcb-af7c5eb0afa3Soup House5794_20160907-122.48129937.747228
2b9e8f6a2-8fd1-4660-b041-2997a1a80984San Francisco Soup Company24936_20160609-122.40015237.793199
356b352e6-102b-4eff-8296-7e1fb2459babSoup Unlimited60354_20161123-122.40906137.783527
46fec5411-f79a-48e4-be7b-e0e44d5ebbabSan Francisco Restaurant66198_20160527-122.38847837.750720
57ba4fb17-f9a9-49da-b90e-8b3553d6d97cSoup-or-Salad5794_20160907-122.48129937.747228
\n", + "
" + ], + "text/plain": [ + " _id business_name \\\n", + "0 663dd72d-0da4-495b-b0ae-ed000105ae73 TIO CHILOS GRILL \n", + "1 ff2f50f6-5415-4706-9bcb-af7c5eb0afa3 Soup House \n", + "2 b9e8f6a2-8fd1-4660-b041-2997a1a80984 San Francisco Soup Company \n", + "3 56b352e6-102b-4eff-8296-7e1fb2459bab Soup Unlimited \n", + "4 6fec5411-f79a-48e4-be7b-e0e44d5ebbab San Francisco Restaurant \n", + "5 7ba4fb17-f9a9-49da-b90e-8b3553d6d97c Soup-or-Salad \n", + "\n", + " inspection_id business_location.lon business_location.lat \n", + "0 1797_20160705 -122.409752 37.752807 \n", + "1 5794_20160907 -122.481299 37.747228 \n", + "2 24936_20160609 -122.400152 37.793199 \n", + "3 60354_20161123 -122.409061 37.783527 \n", + "4 66198_20160527 -122.388478 37.750720 \n", + "5 5794_20160907 -122.481299 37.747228 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# read all documents. There are total 6 documents\n", + "wr.opensearch.search(\n", + " client,\n", + " index=\"sf_restaurants_inspections\",\n", + " _source=[\"inspection_id\", \"business_name\", \"business_location\"]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Index json file" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, "outputs": [], "source": [ + "import pandas as pd\n", + "df = pd.DataFrame(sf_restaurants_inspections)\n", + "path = f\"s3://{bucket}/json/sf_restaurants_inspections.json\"\n", + "wr.s3.to_json(df, path,orient='records',lines=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Indexing: 100% (6/6)|####################################|Elapsed Time: 0:00:00" + ] + }, + { + "data": { + "text/plain": [ + "{'success': 6, 'errors': []}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# index json w/ providing keys\n", "wr.opensearch.index_json(\n", - " client=client,\n", - " path='s3://awswrangler-opensearch/dataload/doc1.json',\n", - " index='tutorials'\n", - " )\n" + " client,\n", + " path=path, # path can be s3 or local\n", + " index=\"sf_restaurants_inspections_dedup\",\n", + " id_keys=[\"inspection_id\"] # can be multiple fields. arg applicable to all index_* functions\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
_idbusiness_nameinspection_idbusiness_location.lonbusiness_location.lat
024936_20160609San Francisco Soup Company24936_20160609-122.40015237.793199
166198_20160527San Francisco Restaurant66198_20160527-122.38847837.750720
25794_20160907Soup-or-Salad5794_20160907-122.48129937.747228
360354_20161123Soup Unlimited60354_20161123-122.40906137.783527
41797_20160705TIO CHILOS GRILL1797_20160705-122.40975237.752807
\n", + "
" + ], + "text/plain": [ + " _id business_name inspection_id \\\n", + "0 24936_20160609 San Francisco Soup Company 24936_20160609 \n", + "1 66198_20160527 San Francisco Restaurant 66198_20160527 \n", + "2 5794_20160907 Soup-or-Salad 5794_20160907 \n", + "3 60354_20161123 Soup Unlimited 60354_20161123 \n", + "4 1797_20160705 TIO CHILOS GRILL 1797_20160705 \n", + "\n", + " business_location.lon business_location.lat \n", + "0 -122.400152 37.793199 \n", + "1 -122.388478 37.750720 \n", + "2 -122.481299 37.747228 \n", + "3 -122.409061 37.783527 \n", + "4 -122.409752 37.752807 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# now there are no duplicates. There are total 5 documents\n", + "wr.opensearch.search(\n", + " client,\n", + " index=\"sf_restaurants_inspections_dedup\",\n", + " _source=[\"inspection_id\", \"business_name\", \"business_location\"]\n", + " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### 2.2 Write from CSV files" + "### Index CSV" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 11, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Indexing: 100% (1000/1000)|##############################|Elapsed Time: 0:00:00" + ] + }, + { + "data": { + "text/plain": [ + "{'success': 1000, 'errors': []}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "wr.opensearch.index_csv(\n", - " client=client,\n", - " path='s3://awswrangler-opensearch/dataload/doc1.csv',\n", - " index='tutorials'\n", - " )\n" + " client, \n", + " index=\"nyc_restaurants_inspections_sample\", \n", + " path='https://data.cityofnewyork.us/api/views/43nn-pn8j/rows.csv?accessType=DOWNLOAD', # index_csv supports local, s3 and url path\n", + " id_keys=[\"CAMIS\"],\n", + " pandas_kwargs={'na_filter': True, 'nrows': 1000}, # pandas.read_csv() args - https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html\n", + " bulk_size=500 # modify based on your cluster size\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
_idCAMISDBABOROBUILDINGSTREETZIPCODEPHONECUISINE DESCRIPTIONINSPECTION DATE...RECORD DATEINSPECTION TYPELatitudeLongitudeCommunity BoardCouncil DistrictCensus TractBINBBLNTA
04161042641610426GLOW THAI RESTAURANTBrooklyn71073 AVENUE11209.07187481920Thai02/26/2020...10/04/2021Cycle Inspection / Re-inspection40.633865-74.026798310.043.06800.03146519.03.058910e+09BK31
14081116240811162CARMINE'SManhattan2450BROADWAY10024.02123622200Italian05/28/2019...10/04/2021Cycle Inspection / Initial Inspection40.791168-73.974308107.06.017900.01033560.01.012380e+09MN12
25001211350012113TANGQueens196-50NORTHERN BOULEVARD11358.07182797080Korean08/16/2018...10/04/2021Cycle Inspection / Initial Inspection40.757850-73.784593411.019.0145101.04124565.04.055200e+09QN48
35001461850014618TOTTO RAMENManhattan248EAST 52 STREET10022.02124210052Japanese08/20/2018...10/04/2021Cycle Inspection / Re-inspection40.756596-73.968749106.04.09800.01038490.01.013250e+09MN19
45004578250045782OLLIE'S CHINESE RESTAURANTManhattan2705BROADWAY10025.02129323300Chinese10/21/2019...10/04/2021Cycle Inspection / Re-inspection40.799318-73.968440107.06.019100.01056562.01.018750e+09MN12
\n", + "

5 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " _id CAMIS DBA BORO BUILDING \\\n", + "0 41610426 41610426 GLOW THAI RESTAURANT Brooklyn 7107 \n", + "1 40811162 40811162 CARMINE'S Manhattan 2450 \n", + "2 50012113 50012113 TANG Queens 196-50 \n", + "3 50014618 50014618 TOTTO RAMEN Manhattan 248 \n", + "4 50045782 50045782 OLLIE'S CHINESE RESTAURANT Manhattan 2705 \n", + "\n", + " STREET ZIPCODE PHONE CUISINE DESCRIPTION \\\n", + "0 3 AVENUE 11209.0 7187481920 Thai \n", + "1 BROADWAY 10024.0 2123622200 Italian \n", + "2 NORTHERN BOULEVARD 11358.0 7182797080 Korean \n", + "3 EAST 52 STREET 10022.0 2124210052 Japanese \n", + "4 BROADWAY 10025.0 2129323300 Chinese \n", + "\n", + " INSPECTION DATE ... RECORD DATE INSPECTION TYPE \\\n", + "0 02/26/2020 ... 10/04/2021 Cycle Inspection / Re-inspection \n", + "1 05/28/2019 ... 10/04/2021 Cycle Inspection / Initial Inspection \n", + "2 08/16/2018 ... 10/04/2021 Cycle Inspection / Initial Inspection \n", + "3 08/20/2018 ... 10/04/2021 Cycle Inspection / Re-inspection \n", + "4 10/21/2019 ... 10/04/2021 Cycle Inspection / Re-inspection \n", + "\n", + " Latitude Longitude Community Board Council District Census Tract \\\n", + "0 40.633865 -74.026798 310.0 43.0 6800.0 \n", + "1 40.791168 -73.974308 107.0 6.0 17900.0 \n", + "2 40.757850 -73.784593 411.0 19.0 145101.0 \n", + "3 40.756596 -73.968749 106.0 4.0 9800.0 \n", + "4 40.799318 -73.968440 107.0 6.0 19100.0 \n", + "\n", + " BIN BBL NTA \n", + "0 3146519.0 3.058910e+09 BK31 \n", + "1 1033560.0 1.012380e+09 MN12 \n", + "2 4124565.0 4.055200e+09 QN48 \n", + "3 1038490.0 1.013250e+09 MN19 \n", + "4 1056562.0 1.018750e+09 MN12 \n", + "\n", + "[5 rows x 27 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wr.opensearch.search(\n", + " client,\n", + " index=\"nyc_restaurants_inspections_sample\",\n", + " size=5\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 3. Search Indices" + "## 3. Search\n", + "#### Search results are returned as Pandas DataFrame" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### 3.1 Search by DSL" + "### 3.1 Search by DSL" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
_idbusiness_nameinspection_idbusiness_location.lonbusiness_location.lat
0ff2f50f6-5415-4706-9bcb-af7c5eb0afa3Soup House5794_20160907-122.48129937.747228
17ba4fb17-f9a9-49da-b90e-8b3553d6d97cSoup-or-Salad5794_20160907-122.48129937.747228
2b9e8f6a2-8fd1-4660-b041-2997a1a80984San Francisco Soup Company24936_20160609-122.40015237.793199
356b352e6-102b-4eff-8296-7e1fb2459babSoup Unlimited60354_20161123-122.40906137.783527
\n", + "
" + ], + "text/plain": [ + " _id business_name \\\n", + "0 ff2f50f6-5415-4706-9bcb-af7c5eb0afa3 Soup House \n", + "1 7ba4fb17-f9a9-49da-b90e-8b3553d6d97c Soup-or-Salad \n", + "2 b9e8f6a2-8fd1-4660-b041-2997a1a80984 San Francisco Soup Company \n", + "3 56b352e6-102b-4eff-8296-7e1fb2459bab Soup Unlimited \n", + "\n", + " inspection_id business_location.lon business_location.lat \n", + "0 5794_20160907 -122.481299 37.747228 \n", + "1 5794_20160907 -122.481299 37.747228 \n", + "2 24936_20160609 -122.400152 37.793199 \n", + "3 60354_20161123 -122.409061 37.783527 " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "df = wr.opensearch.search(\n", - " client=client,\n", - " index='tutorials',\n", - " search_body={\n", - " \"query\": {\n", - " \"match_all\": {\n", - " }\n", - " }\n", - " }\n", - " )\n" + "# add a search query. search all soup businesses \n", + "wr.opensearch.search(\n", + " client,\n", + " index=\"sf_restaurants_inspections\",\n", + " _source=[\"inspection_id\", \"business_name\", \"business_location\"],\n", + " filter_path=[\"hits.hits._id\",\"hits.hits._source\"],\n", + " search_body={\n", + " \"query\": {\n", + " \"match\": {\n", + " \"business_name\": \"soup\"\n", + " }\n", + " }\n", + " }\n", + " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### 3.1 Search by SQL" + "### 3.1 Search by SQL" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
_index_type_id_scorebusiness_nameinspection_score
0sf_restaurants_inspections_dedup_doc5794_20160907NoneSoup-or-Salad96
1sf_restaurants_inspections_dedup_doc60354_20161123NoneSoup Unlimited95
2sf_restaurants_inspections_dedup_doc24936_20160609NoneSan Francisco Soup Company77
\n", + "
" + ], + "text/plain": [ + " _index _type _id _score \\\n", + "0 sf_restaurants_inspections_dedup _doc 5794_20160907 None \n", + "1 sf_restaurants_inspections_dedup _doc 60354_20161123 None \n", + "2 sf_restaurants_inspections_dedup _doc 24936_20160609 None \n", + "\n", + " business_name inspection_score \n", + "0 Soup-or-Salad 96 \n", + "1 Soup Unlimited 95 \n", + "2 San Francisco Soup Company 77 " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "df = wr.opensearch.search_by_sql(\n", - " client=client,\n", - " sql_query='SELECT * FROM tutorials LIMIT 50'\n", - " )\n" + "wr.opensearch.search_by_sql(\n", + " client,\n", + " sql_query=\"\"\"SELECT business_name, inspection_score \n", + " FROM sf_restaurants_inspections_dedup\n", + " WHERE business_name LIKE '%soup%'\n", + " ORDER BY inspection_score DESC LIMIT 5\"\"\"\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 4. Delete Index" + "## 4. Delete Indices" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 15, "metadata": { - "collapsed": false, "jupyter": { "outputs_hidden": false }, @@ -202,12 +1033,614 @@ "name": "#%%\n" } }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'acknowledged': True}" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "response = wr.opensearch.delete_index(\n", + "wr.opensearch.delete_index(\n", " client=client,\n", - " index=\"tutorials\",\n", - " )" + " index=\"sf_restaurants_inspections\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Bonus - Prepare data and index from DataFrame" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For this exercise we'll use [DOHMH New York City Restaurant Inspection Results dataset](https://data.cityofnewyork.us/Health/DOHMH-New-York-City-Restaurant-Inspection-Results/43nn-pn8j)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv('https://data.cityofnewyork.us/api/views/43nn-pn8j/rows.csv?accessType=DOWNLOAD')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare the data for indexing" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# fields names underscore casing \n", + "df.columns = [col.lower().replace(' ', '_') for col in df.columns]\n", + "\n", + "# convert lon/lat to OpenSearch geo_point\n", + "df['business_location'] = \"POINT (\" + df.longitude.fillna('0').astype(str) + \" \" + df.latitude.fillna('0').astype(str) + \")\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create index with mapping" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'acknowledged': True,\n", + " 'shards_acknowledged': True,\n", + " 'index': 'nyc_restaurants_inspections'}" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# delete index if exists\n", + "wr.opensearch.delete_index(\n", + " client=client,\n", + " index=\"nyc_restaurants\"\n", + " \n", + ")\n", + "\n", + "# use dynamic_template to map date fields\n", + "# define business_location as 
geo_point\n", + "wr.opensearch.create_index(\n", + " client=client,\n", + " index=\"nyc_restaurants_inspections\",\n", + " mappings={\n", + " \"dynamic_templates\" : [\n", + " {\n", + " \"dates\" : {\n", + " \"match\" : \"*date\",\n", + " \"mapping\" : {\n", + " \"type\" : \"date\",\n", + " \"format\" : 'MM/dd/yyyy'\n", + " }\n", + " }\n", + " }\n", + " ],\n", + " \"properties\": {\n", + " \"business_location\": {\n", + " \"type\": \"geo_point\"\n", + " }\n", + " }\n", + " } \n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "### Index dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Indexing: 100% (382655/382655)|##########################|Elapsed Time: 0:04:15" + ] + }, + { + "data": { + "text/plain": [ + "{'success': 382655, 'errors': []}" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wr.opensearch.index_df(\n", + " client,\n", + " df=df,\n", + " index=\"nyc_restaurants_inspections\",\n", + " id_keys=[\"camis\"],\n", + " bulk_size=1000\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Execute geo query\n", + "#### Sort restaurants by distance from Times-Square" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
camisdbaborobuildingstreetzipcodephonecuisine_descriptioninspection_dateaction...inspection_typelatitudelongitudecommunity_boardcouncil_districtcensus_tractbinbblntabusiness_location
041551304THE COUNTERManhattan7TIMES SQUARE10036.02129976801American12/22/2016Violations were cited in the following area(s)....Cycle Inspection / Initial Inspection40.755908-73.986681105.03.011300.01086069.01.009940e+09MN17POINT (-73.986680953809 40.755907817312)
150055665ANN INC CAFEManhattan7TIMES SQUARE10036.02125413287American12/11/2019Violations were cited in the following area(s)....Cycle Inspection / Initial Inspection40.755908-73.986681105.03.011300.01086069.01.009940e+09MN17POINT (-73.986680953809 40.755907817312)
250049552ERNST AND YOUNGManhattan5TIMES SQ10036.02127739994Coffee/Tea11/30/2018Violations were cited in the following area(s)....Cycle Inspection / Initial Inspection40.755702-73.987208105.03.011300.01024656.01.010130e+09MN17POINT (-73.987207980138 40.755702020307)
350014078RED LOBSTERManhattan5TIMES SQ10036.02127306706Seafood10/03/2017Violations were cited in the following area(s)....Cycle Inspection / Initial Inspection40.755702-73.987208105.03.011300.01024656.01.010130e+09MN17POINT (-73.987207980138 40.755702020307)
450015171NEW AMSTERDAM THEATERManhattan214WEST 42 STREET10036.02125825472American06/26/2018Violations were cited in the following area(s)....Cycle Inspection / Re-inspection40.756317-73.987652105.03.011300.01024660.01.010130e+09MN17POINT (-73.987651832547 40.756316895053)
..................................................................
9541552060PROSKAUER ROSEManhattan11TIMES SQUARE10036.02129695493American08/11/2017Violations were cited in the following area(s)....Administrative Miscellaneous / Initial Inspection40.756891-73.990023105.03.011300.01087978.01.010138e+09MN17POINT (-73.990023200823 40.756890780426)
9641242148GABBY O'HARA'SManhattan123WEST 39 STREET10018.02122788984Irish07/30/2019Violations were cited in the following area(s)....Cycle Inspection / Re-inspection40.753405-73.986602105.04.011300.01080611.01.008150e+09MN17POINT (-73.986602050292 40.753404587174)
9750095860THE TIMES EATERYManhattan6808 AVENUE10036.06463867787American02/28/2020Violations were cited in the following area(s)....Pre-permit (Operational) / Initial Inspection40.757991-73.989218105.03.011900.01024703.01.010150e+09MN17POINT (-73.989218092096 40.757991356019)
9850072861ITSUManhattan5307 AVENUE10018.09176393645Asian/Asian Fusion09/10/2018Violations were cited in the following area(s)....Pre-permit (Operational) / Initial Inspection40.753844-73.988551105.03.011300.01014485.01.007880e+09MN17POINT (-73.988551029682 40.753843959794)
9950068109LUKE'S LOBSTERManhattan1407BROADWAY10018.09174759192Seafood09/06/2017Violations were cited in the following area(s)....Pre-permit (Operational) / Initial Inspection40.753432-73.987151105.03.011300.01015265.01.008140e+09MN17POINT (-73.98715066791 40.753432097521)
\n", + "

100 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " camis dba boro building street \\\n", + "0 41551304 THE COUNTER Manhattan 7 TIMES SQUARE \n", + "1 50055665 ANN INC CAFE Manhattan 7 TIMES SQUARE \n", + "2 50049552 ERNST AND YOUNG Manhattan 5 TIMES SQ \n", + "3 50014078 RED LOBSTER Manhattan 5 TIMES SQ \n", + "4 50015171 NEW AMSTERDAM THEATER Manhattan 214 WEST 42 STREET \n", + ".. ... ... ... ... ... \n", + "95 41552060 PROSKAUER ROSE Manhattan 11 TIMES SQUARE \n", + "96 41242148 GABBY O'HARA'S Manhattan 123 WEST 39 STREET \n", + "97 50095860 THE TIMES EATERY Manhattan 680 8 AVENUE \n", + "98 50072861 ITSU Manhattan 530 7 AVENUE \n", + "99 50068109 LUKE'S LOBSTER Manhattan 1407 BROADWAY \n", + "\n", + " zipcode phone cuisine_description inspection_date \\\n", + "0 10036.0 2129976801 American 12/22/2016 \n", + "1 10036.0 2125413287 American 12/11/2019 \n", + "2 10036.0 2127739994 Coffee/Tea 11/30/2018 \n", + "3 10036.0 2127306706 Seafood 10/03/2017 \n", + "4 10036.0 2125825472 American 06/26/2018 \n", + ".. ... ... ... ... \n", + "95 10036.0 2129695493 American 08/11/2017 \n", + "96 10018.0 2122788984 Irish 07/30/2019 \n", + "97 10036.0 6463867787 American 02/28/2020 \n", + "98 10018.0 9176393645 Asian/Asian Fusion 09/10/2018 \n", + "99 10018.0 9174759192 Seafood 09/06/2017 \n", + "\n", + " action ... \\\n", + "0 Violations were cited in the following area(s). ... \n", + "1 Violations were cited in the following area(s). ... \n", + "2 Violations were cited in the following area(s). ... \n", + "3 Violations were cited in the following area(s). ... \n", + "4 Violations were cited in the following area(s). ... \n", + ".. ... ... \n", + "95 Violations were cited in the following area(s). ... \n", + "96 Violations were cited in the following area(s). ... \n", + "97 Violations were cited in the following area(s). ... \n", + "98 Violations were cited in the following area(s). ... \n", + "99 Violations were cited in the following area(s). ... \n", + "\n", + " inspection_type latitude longitude \\\n", + "0 Cycle Inspection / Initial Inspection 40.755908 -73.986681 \n", + "1 Cycle Inspection / Initial Inspection 40.755908 -73.986681 \n", + "2 Cycle Inspection / Initial Inspection 40.755702 -73.987208 \n", + "3 Cycle Inspection / Initial Inspection 40.755702 -73.987208 \n", + "4 Cycle Inspection / Re-inspection 40.756317 -73.987652 \n", + ".. ... ... ... \n", + "95 Administrative Miscellaneous / Initial Inspection 40.756891 -73.990023 \n", + "96 Cycle Inspection / Re-inspection 40.753405 -73.986602 \n", + "97 Pre-permit (Operational) / Initial Inspection 40.757991 -73.989218 \n", + "98 Pre-permit (Operational) / Initial Inspection 40.753844 -73.988551 \n", + "99 Pre-permit (Operational) / Initial Inspection 40.753432 -73.987151 \n", + "\n", + " community_board council_district census_tract bin bbl \\\n", + "0 105.0 3.0 11300.0 1086069.0 1.009940e+09 \n", + "1 105.0 3.0 11300.0 1086069.0 1.009940e+09 \n", + "2 105.0 3.0 11300.0 1024656.0 1.010130e+09 \n", + "3 105.0 3.0 11300.0 1024656.0 1.010130e+09 \n", + "4 105.0 3.0 11300.0 1024660.0 1.010130e+09 \n", + ".. ... ... ... ... ... 
\n", + "95 105.0 3.0 11300.0 1087978.0 1.010138e+09 \n", + "96 105.0 4.0 11300.0 1080611.0 1.008150e+09 \n", + "97 105.0 3.0 11900.0 1024703.0 1.010150e+09 \n", + "98 105.0 3.0 11300.0 1014485.0 1.007880e+09 \n", + "99 105.0 3.0 11300.0 1015265.0 1.008140e+09 \n", + "\n", + " nta business_location \n", + "0 MN17 POINT (-73.986680953809 40.755907817312) \n", + "1 MN17 POINT (-73.986680953809 40.755907817312) \n", + "2 MN17 POINT (-73.987207980138 40.755702020307) \n", + "3 MN17 POINT (-73.987207980138 40.755702020307) \n", + "4 MN17 POINT (-73.987651832547 40.756316895053) \n", + ".. ... ... \n", + "95 MN17 POINT (-73.990023200823 40.756890780426) \n", + "96 MN17 POINT (-73.986602050292 40.753404587174) \n", + "97 MN17 POINT (-73.989218092096 40.757991356019) \n", + "98 MN17 POINT (-73.988551029682 40.753843959794) \n", + "99 MN17 POINT (-73.98715066791 40.753432097521) \n", + "\n", + "[100 rows x 27 columns]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wr.opensearch.search(\n", + " client,\n", + " index=\"nyc_restaurants_inspections\",\n", + " filter_path=[\"hits.hits._source\"],\n", + " size=100,\n", + " search_body={\n", + " \"query\": {\n", + " \"match_all\": {}\n", + " },\n", + " \"sort\": [\n", + " {\n", + " \"_geo_distance\": {\n", + " \"business_location\": { # Times-Square - https://geojson.io/#map=16/40.7563/-73.9862\n", + " \"lat\": 40.75613228383523,\n", + " \"lon\": -73.9865791797638\n", + " },\n", + " \"order\": \"asc\"\n", + " }\n", + " }\n", + " ]\n", + " }\n", + ")" ] } ], @@ -227,7 +1660,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.7.7" } }, "nbformat": 4, From 53dff4b9198077bfeec6d73e45cec2c9eb6cd75c Mon Sep 17 00:00:00 2001 From: Abdel Jaidi Date: Tue, 5 Oct 2021 17:15:22 +0100 Subject: [PATCH 38/41] Minor - Pylint --- awswrangler/opensearch/_read.py | 2 +- awswrangler/opensearch/_utils.py | 2 +- awswrangler/opensearch/_write.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/awswrangler/opensearch/_read.py b/awswrangler/opensearch/_read.py index ba345ab43..8f2ef95c1 100644 --- a/awswrangler/opensearch/_read.py +++ b/awswrangler/opensearch/_read.py @@ -15,7 +15,7 @@ def _resolve_fields(row: Mapping[str, Any]) -> Mapping[str, Any]: if isinstance(row[field], dict): nested_fields = _resolve_fields(row[field]) for n_field, val in nested_fields.items(): - fields["{}.{}".format(field, n_field)] = val + fields[f"{field}.{n_field}"] = val else: fields[field] = row[field] return fields diff --git a/awswrangler/opensearch/_utils.py b/awswrangler/opensearch/_utils.py index b2a139cbf..a48b0eadc 100644 --- a/awswrangler/opensearch/_utils.py +++ b/awswrangler/opensearch/_utils.py @@ -75,7 +75,7 @@ def connect( valid_ports = {80, 443} if port not in valid_ports: - raise ValueError("results: port must be one of %r." 
% valid_ports) + raise ValueError(f"results: port must be one of {valid_ports}") if username and password: http_auth = (username, password) diff --git a/awswrangler/opensearch/_write.py b/awswrangler/opensearch/_write.py index 5bb081909..62e9d146e 100644 --- a/awswrangler/opensearch/_write.py +++ b/awswrangler/opensearch/_write.py @@ -84,7 +84,7 @@ def _deserialize(v: Any) -> Any: def _file_line_generator(path: str, is_json: bool = False) -> Generator[Any, None, None]: - with open(path) as fp: + with open(path) as fp: # pylint: disable=W1514 for line in fp: if is_json: yield json.loads(line) From c6e6d8051ab026550a8c98ec9e134a41fb1334a8 Mon Sep 17 00:00:00 2001 From: Assaf Mentzer Date: Tue, 5 Oct 2021 14:47:39 -0400 Subject: [PATCH 39/41] [skip ci] opensearch: pylint f-string and file open encoding --- awswrangler/opensearch/_read.py | 2 +- awswrangler/opensearch/_utils.py | 2 +- awswrangler/opensearch/_write.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/awswrangler/opensearch/_read.py b/awswrangler/opensearch/_read.py index ba345ab43..8f2ef95c1 100644 --- a/awswrangler/opensearch/_read.py +++ b/awswrangler/opensearch/_read.py @@ -15,7 +15,7 @@ def _resolve_fields(row: Mapping[str, Any]) -> Mapping[str, Any]: if isinstance(row[field], dict): nested_fields = _resolve_fields(row[field]) for n_field, val in nested_fields.items(): - fields["{}.{}".format(field, n_field)] = val + fields[f"{field}.{n_field}"] = val else: fields[field] = row[field] return fields diff --git a/awswrangler/opensearch/_utils.py b/awswrangler/opensearch/_utils.py index b2a139cbf..6c5ef99c7 100644 --- a/awswrangler/opensearch/_utils.py +++ b/awswrangler/opensearch/_utils.py @@ -75,7 +75,7 @@ def connect( valid_ports = {80, 443} if port not in valid_ports: - raise ValueError("results: port must be one of %r." % valid_ports) + raise ValueError(f"results: port must be one of {valid_ports}.") if username and password: http_auth = (username, password) diff --git a/awswrangler/opensearch/_write.py b/awswrangler/opensearch/_write.py index 5bb081909..3ea466693 100644 --- a/awswrangler/opensearch/_write.py +++ b/awswrangler/opensearch/_write.py @@ -84,7 +84,7 @@ def _deserialize(v: Any) -> Any: def _file_line_generator(path: str, is_json: bool = False) -> Generator[Any, None, None]: - with open(path) as fp: + with open(path, encoding="utf-8") as fp: for line in fp: if is_json: yield json.loads(line) From 29f892c3c9ed2764b5a3045b8cbd8cfcc6ea87c2 Mon Sep 17 00:00:00 2001 From: Assaf Mentzer Date: Tue, 5 Oct 2021 20:38:39 -0400 Subject: [PATCH 40/41] opensearch: add to CONTRIBUTING.md --- CONTRIBUTING.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d3420ade6..e898ec21e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -215,6 +215,10 @@ or ``./deploy-base.sh`` ``./deploy-databases.sh`` +* [OPTIONAL] Deploy the Cloudformation template `opensearch.yaml` (if you need to test Amazon OpenSearch Service). This step could take about 15 minutes to deploy. + +``./deploy-opensearch.sh`` + * Go to the `EC2 -> SecurityGroups` console, open the `aws-data-wrangler-*` security group and configure to accept your IP from any TCP port. 
- Alternatively run: @@ -244,7 +248,7 @@ or ``pytest -n 8 tests/test_db.py`` -* To run all data lake test functions for all python versions (Only if Amazon QuickSight is activated): +* To run all data lake test functions for all python versions (Only if Amazon QuickSight is activated and Amazon OpenSearch template is deployed): ``./test.sh`` From 827c3bf4c79a71352a14b8260f4057e2b363c051 Mon Sep 17 00:00:00 2001 From: Assaf Mentzer Date: Wed, 6 Oct 2021 11:39:22 -0400 Subject: [PATCH 41/41] opensearch: update aws-cdk packages to have the same minimum version --- test_infra/poetry.lock | 2 +- test_infra/pyproject.toml | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/test_infra/poetry.lock b/test_infra/poetry.lock index c4e5df69b..aa17ff35f 100644 --- a/test_infra/poetry.lock +++ b/test_infra/poetry.lock @@ -755,7 +755,7 @@ testing = ["pytest (>=4.6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytes [metadata] lock-version = "1.1" python-versions = ">=3.6.2, <3.10" -content-hash = "7fe703d54794d69aab0dd6ad5b4017c43defbff76ed9a3fe10e243c422adfea6" +content-hash = "6d95fccb052c85375178aa3ade72de9e4ee87c009d7e067dd7d4120c23ded9f5" [metadata.files] attrs = [ diff --git a/test_infra/pyproject.toml b/test_infra/pyproject.toml index 761c315d7..02e0241d8 100644 --- a/test_infra/pyproject.toml +++ b/test_infra/pyproject.toml @@ -7,15 +7,15 @@ license = "Apache License 2.0" [tool.poetry.dependencies] python = ">=3.6.2, <3.10" -"aws-cdk.core" = "^1.115.0" -"aws-cdk.aws-ec2" = "^1.115.0" -"aws-cdk.aws-glue" = "^1.115.0" -"aws-cdk.aws-iam" = "^1.115.0" -"aws-cdk.aws-kms" = "^1.115.0" -"aws-cdk.aws-logs" = "^1.115.0" -"aws-cdk.aws-s3" = "^1.115.0" -"aws-cdk.aws-redshift" = "^1.115.0" -"aws-cdk.aws-rds" = "^1.115.0" -"aws-cdk.aws-secretsmanager" = "^1.115.0" -"aws-cdk.aws-ssm" = "^1.115.0" +"aws-cdk.core" = "^1.124.0" +"aws-cdk.aws-ec2" = "^1.124.0" +"aws-cdk.aws-glue" = "^1.124.0" +"aws-cdk.aws-iam" = "^1.124.0" +"aws-cdk.aws-kms" = "^1.124.0" +"aws-cdk.aws-logs" = "^1.124.0" +"aws-cdk.aws-s3" = "^1.124.0" +"aws-cdk.aws-redshift" = "^1.124.0" +"aws-cdk.aws-rds" = "^1.124.0" +"aws-cdk.aws-secretsmanager" = "^1.124.0" +"aws-cdk.aws-ssm" = "^1.124.0" "aws-cdk.aws-opensearchservice" = "^1.124.0"
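
---

Taken together, the patch series above leaves `awswrangler.opensearch` with a small end-to-end surface: connect to a domain, create an index, bulk-index a DataFrame, query it, and clean up. The sketch below simply strings together the calls exercised in the tutorial notebook; the domain endpoint, index name, and sample data are placeholders, not values from the patches, and the argument names mirror the notebook cells above.

```python
import awswrangler as wr
import pandas as pd

# Placeholder Amazon OpenSearch Service domain endpoint (assumption).
client = wr.opensearch.connect(host="my-test-domain.us-east-1.es.amazonaws.com")

# Create an index with an explicit mapping (same shape as the tutorial's create_index cell).
wr.opensearch.create_index(
    client=client,
    index="sample_index",
    mappings={"properties": {"age": {"type": "integer"}}},
)

# Bulk-index a DataFrame; id_keys controls which column(s) form the document _id.
df = pd.DataFrame({"user_id": [1, 2], "name": ["foo", "bar"], "age": [25, 32]})
wr.opensearch.index_df(client, df=df, index="sample_index", id_keys=["user_id"])

# Query with SQL, then drop the index.
results = wr.opensearch.search_by_sql(
    client,
    sql_query="SELECT name, age FROM sample_index LIMIT 10",
)
wr.opensearch.delete_index(client=client, index="sample_index")
```

Note the connect-once, pass-the-client-everywhere design: every read and write call takes the low-level client returned by `wr.opensearch.connect`, so host resolution, port validation, and credentials live in a single place.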