From 1823dd8b7807d4a03b5bfe889b7079648343f82e Mon Sep 17 00:00:00 2001 From: Patrick Muller Date: Sat, 6 Jun 2020 22:28:35 +0200 Subject: [PATCH 01/28] adding a feature to create and delete dabases in glue datacatalog --- awswrangler/catalog.py | 75 ++++++++++++++++++++++ testing/test_awswrangler/test_data_lake.py | 34 ++++++++++ 2 files changed, 109 insertions(+) diff --git a/awswrangler/catalog.py b/awswrangler/catalog.py index 057f9e4a5..ef391b81f 100644 --- a/awswrangler/catalog.py +++ b/awswrangler/catalog.py @@ -17,6 +17,81 @@ _logger: logging.Logger = logging.getLogger(__name__) +def create_glue_database( + name: str, description: str = "", catalog_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> None: + """Create a database un AWS Glue Catalog. + + Parameters + ---------- + name : str + Database name. + description : str + A Descrption for the Database. + catalog_id : str, optional + The ID of the Data Catalog from which to retrieve Databases. + If none is provided, the AWS account ID is used by default. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + None + None. + + Examples + -------- + >>> import awswrangler as wr + >>> wr.catalog.create_glue_database( + ... name='awswrangler_test' + ... ) + """ + args = {} + client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session) + args["Name"] = name + args["Description"] = description + + if catalog_id is not None: + client_glue.create_database(CatalogId=catalog_id, DatabaseInput=args) + else: + client_glue.create_database(DatabaseInput=args) + + +def delete_glue_database( + name: str, catalog_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> None: + """Create a database un AWS Glue Catalog. + + Parameters + ---------- + name : str + Database name. + catalog_id : str, optional + The ID of the Data Catalog from which to retrieve Databases. + If none is provided, the AWS account ID is used by default. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + None + None. + + Examples + -------- + >>> import awswrangler as wr + >>> wr.catalog.delete_glue_database( + ... name='awswrangler_test' + ... ) + """ + client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session) + + if catalog_id is not None: + client_glue.delete_database(CatalogId=catalog_id, Name=name) + else: + client_glue.delete_database(Name=name) + + def delete_table_if_exists(database: str, table: str, boto3_session: Optional[boto3.Session] = None) -> bool: """Delete Glue table if exists. 
diff --git a/testing/test_awswrangler/test_data_lake.py b/testing/test_awswrangler/test_data_lake.py index 19fb1ca19..b18c48538 100644 --- a/testing/test_awswrangler/test_data_lake.py +++ b/testing/test_awswrangler/test_data_lake.py @@ -198,6 +198,40 @@ def path3(bucket): yield from path_generator(bucket) +def test_glue_database(): + + # Round 1 - Create Database + database_name = f"database_{get_time_str_with_random_suffix()}" + print(f"Database Name: {database_name}") + wr.catalog.create_glue_database(name=database_name, description="Database Description") + databases = wr.catalog.get_databases() + test_database_name = "" + test_database_description = "" + + for database in databases: + if database["Name"] == database_name: + test_database_name = database["Name"] + test_database_description = database["Description"] + + assert test_database_name == database_name + assert test_database_description == "Database Description" + + # Round 2 - Delete Database + print(f"Database Name: {database_name}") + wr.catalog.delete_glue_database(name=database_name) + databases = wr.catalog.get_databases() + test_database_name = "" + test_database_description = "" + + for database in databases: + if database["Name"] == database_name: + test_database_name = database["Name"] + test_database_description = database["Description"] + + assert test_database_name == "" + assert test_database_description == "" + + def test_to_parquet_modes(database, table, path, external_schema): # Round 1 - Warm up From ae3078552a567ab698607626b15227d63614fc6b Mon Sep 17 00:00:00 2001 From: igorborgest Date: Sat, 6 Jun 2020 19:48:26 -0300 Subject: [PATCH 02/28] Small updates in the catalog database functions. --- awswrangler/catalog.py | 30 +++++++++--------- testing/test_awswrangler/test_data_lake.py | 34 --------------------- testing/test_awswrangler/test_data_lake2.py | 34 +++++++++++++++++++++ 3 files changed, 50 insertions(+), 48 deletions(-) diff --git a/awswrangler/catalog.py b/awswrangler/catalog.py index ef391b81f..52a46e3a7 100644 --- a/awswrangler/catalog.py +++ b/awswrangler/catalog.py @@ -17,16 +17,19 @@ _logger: logging.Logger = logging.getLogger(__name__) -def create_glue_database( - name: str, description: str = "", catalog_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +def create_database( + name: str, + description: Optional[str] = None, + catalog_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, ) -> None: - """Create a database un AWS Glue Catalog. + """Create a database in AWS Glue Catalog. Parameters ---------- name : str Database name. - description : str + description : str, optional A Descrption for the Database. catalog_id : str, optional The ID of the Data Catalog from which to retrieve Databases. @@ -42,25 +45,24 @@ def create_glue_database( Examples -------- >>> import awswrangler as wr - >>> wr.catalog.create_glue_database( + >>> wr.catalog.create_database( ... name='awswrangler_test' ... 
) """ - args = {} + args: Dict[str, str] = {} client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session) args["Name"] = name - args["Description"] = description + if description is not None: + args["Description"] = description if catalog_id is not None: - client_glue.create_database(CatalogId=catalog_id, DatabaseInput=args) + client_glue.create_database(CatalogId=catalog_id, DatabaseInput=args) # pragma: no cover else: client_glue.create_database(DatabaseInput=args) -def delete_glue_database( - name: str, catalog_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None -) -> None: - """Create a database un AWS Glue Catalog. +def delete_database(name: str, catalog_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> None: + """Create a database in AWS Glue Catalog. Parameters ---------- @@ -80,14 +82,14 @@ def delete_glue_database( Examples -------- >>> import awswrangler as wr - >>> wr.catalog.delete_glue_database( + >>> wr.catalog.delete_database( ... name='awswrangler_test' ... ) """ client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session) if catalog_id is not None: - client_glue.delete_database(CatalogId=catalog_id, Name=name) + client_glue.delete_database(CatalogId=catalog_id, Name=name) # pragma: no cover else: client_glue.delete_database(Name=name) diff --git a/testing/test_awswrangler/test_data_lake.py b/testing/test_awswrangler/test_data_lake.py index b18c48538..19fb1ca19 100644 --- a/testing/test_awswrangler/test_data_lake.py +++ b/testing/test_awswrangler/test_data_lake.py @@ -198,40 +198,6 @@ def path3(bucket): yield from path_generator(bucket) -def test_glue_database(): - - # Round 1 - Create Database - database_name = f"database_{get_time_str_with_random_suffix()}" - print(f"Database Name: {database_name}") - wr.catalog.create_glue_database(name=database_name, description="Database Description") - databases = wr.catalog.get_databases() - test_database_name = "" - test_database_description = "" - - for database in databases: - if database["Name"] == database_name: - test_database_name = database["Name"] - test_database_description = database["Description"] - - assert test_database_name == database_name - assert test_database_description == "Database Description" - - # Round 2 - Delete Database - print(f"Database Name: {database_name}") - wr.catalog.delete_glue_database(name=database_name) - databases = wr.catalog.get_databases() - test_database_name = "" - test_database_description = "" - - for database in databases: - if database["Name"] == database_name: - test_database_name = database["Name"] - test_database_description = database["Description"] - - assert test_database_name == "" - assert test_database_description == "" - - def test_to_parquet_modes(database, table, path, external_schema): # Round 1 - Warm up diff --git a/testing/test_awswrangler/test_data_lake2.py b/testing/test_awswrangler/test_data_lake2.py index cbb78fb41..320c2152a 100644 --- a/testing/test_awswrangler/test_data_lake2.py +++ b/testing/test_awswrangler/test_data_lake2.py @@ -423,3 +423,37 @@ def test_read_partitioned_fwf(path, use_threads, chunksize): else: for d in df2: assert d.shape == (1, 4) + + +def test_glue_database(): + + # Round 1 - Create Database + database_name = f"database_{get_time_str_with_random_suffix()}" + print(f"Database Name: {database_name}") + wr.catalog.create_database(name=database_name, description="Database Description") + databases = wr.catalog.get_databases() + test_database_name = 
"" + test_database_description = "" + + for database in databases: + if database["Name"] == database_name: + test_database_name = database["Name"] + test_database_description = database["Description"] + + assert test_database_name == database_name + assert test_database_description == "Database Description" + + # Round 2 - Delete Database + print(f"Database Name: {database_name}") + wr.catalog.delete_database(name=database_name) + databases = wr.catalog.get_databases() + test_database_name = "" + test_database_description = "" + + for database in databases: + if database["Name"] == database_name: + test_database_name = database["Name"] + test_database_description = database["Description"] + + assert test_database_name == "" + assert test_database_description == "" From a8eff42c78063febe0494e3258ee6e43c5e7b8dd Mon Sep 17 00:00:00 2001 From: igorborgest Date: Wed, 3 Jun 2020 19:25:26 -0300 Subject: [PATCH 03/28] Fix typo in tutorial 8. --- tutorials/008 - Redshift - Copy & Unload.ipynb | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/tutorials/008 - Redshift - Copy & Unload.ipynb b/tutorials/008 - Redshift - Copy & Unload.ipynb index 7bb23da14..14133f101 100644 --- a/tutorials/008 - Redshift - Copy & Unload.ipynb +++ b/tutorials/008 - Redshift - Copy & Unload.ipynb @@ -14,7 +14,7 @@ "\n", "2 - [UNLOAD](https://docs.aws.amazon.com/redshift/latest/dg/r_UNLOAD.html)\n", "\n", - "Let's take a look and how Wrangler could can use it." + "Let's take a look and how Wrangler can use it." ] }, { @@ -781,17 +781,8 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" - }, - "pycharm": { - "stem_cell": { - "cell_type": "raw", - "metadata": { - "collapsed": false - }, - "source": [] - } } }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file From 762655a8dd99ffc4427717090c198f907e2fbbb7 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Wed, 3 Jun 2020 19:52:33 -0300 Subject: [PATCH 04/28] Bumping dependencies versions. --- requirements-dev.txt | 8 ++++---- requirements.txt | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index e0abc8e4a..09b2476ec 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -2,17 +2,17 @@ awscli>=1.18.0,<2.0.0 black~=19.3b0 pylint~=2.5.2 flake8~=3.8.2 -mypy~=0.770 +mypy~=0.780 isort~=4.3.21 pydocstyle~=5.0.2 -doc8~=0.8.0 +doc8~=0.8.1 tox~=3.15.1 -pytest~=5.4.2 +pytest~=5.4.3 pytest-cov~=2.9.0 pytest-xdist~=1.32.0 pytest-timeout~=1.3.4 scikit-learn~=0.23.1 -cfn-lint~=0.32.1 +cfn-lint~=0.33.0 cfn-flip~=1.2.3 twine~=3.1.1 wheel~=0.34.2 diff --git a/requirements.txt b/requirements.txt index c8eedd045..3c94fd309 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,4 @@ s3fs~=0.4.2 psycopg2-binary~=2.8.0 pymysql~=0.9.0 sqlalchemy-redshift~=0.7.0 -SQLAlchemy>=1.3.10,<1.3.16 \ No newline at end of file +SQLAlchemy~=1.3.10 \ No newline at end of file From f029a3c522e80ab3bde3dd796eec899e4212af82 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Thu, 4 Jun 2020 12:08:52 -0300 Subject: [PATCH 05/28] Passing AWS environment variables to tox environment. 
--- requirements-dev.txt | 2 +- requirements.txt | 2 +- tox.ini | 4 +++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 09b2476ec..49be548c4 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -19,4 +19,4 @@ wheel~=0.34.2 sphinx~=3.0.4 sphinx_bootstrap_theme~=0.7.1 moto~=1.3.14 -jupyterlab~=2.1.4 \ No newline at end of file +jupyterlab~=2.1.4 diff --git a/requirements.txt b/requirements.txt index 3c94fd309..6ee02a1fb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,4 @@ s3fs~=0.4.2 psycopg2-binary~=2.8.0 pymysql~=0.9.0 sqlalchemy-redshift~=0.7.0 -SQLAlchemy~=1.3.10 \ No newline at end of file +SQLAlchemy~=1.3.10 diff --git a/tox.ini b/tox.ini index 288b563dc..620eeab7e 100644 --- a/tox.ini +++ b/tox.ini @@ -2,15 +2,17 @@ envlist = py{37,38,36} [testenv] +passenv = AWS_PROFILE AWS_DEFAULT_REGION AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY deps = pytest pytest-xdist pytest-timeout moto commands = - pytest --timeout=900 -n 8 testing/test_awswrangler + pytest --timeout=600 -n 8 testing/test_awswrangler [testenv:py36] +passenv = AWS_PROFILE AWS_DEFAULT_REGION AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY deps = {[testenv]deps} pytest-cov From 87d6396d63a820c6fd707ced7ebaabbb5cd7d237 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Sat, 6 Jun 2020 19:25:19 -0300 Subject: [PATCH 06/28] Add boto3 session serializer/deserializer on _utils.py. --- awswrangler/_utils.py | 34 +++++++++++++++++++++++++++++++--- awswrangler/s3.py | 33 +++++++++++++++++++++------------ 2 files changed, 52 insertions(+), 15 deletions(-) diff --git a/awswrangler/_utils.py b/awswrangler/_utils.py index c399701f8..c20117891 100644 --- a/awswrangler/_utils.py +++ b/awswrangler/_utils.py @@ -1,10 +1,11 @@ """Internal (private) Utilities Module.""" +import copy import logging import math import os import random -from typing import Any, Dict, Generator, List, Optional, Tuple +from typing import Any, Dict, Generator, List, Optional, Tuple, Union import boto3 # type: ignore import botocore.config # type: ignore @@ -17,8 +18,10 @@ _logger: logging.Logger = logging.getLogger(__name__) -def ensure_session(session: Optional[boto3.Session] = None) -> boto3.Session: +def ensure_session(session: Optional[Union[boto3.Session, Dict[str, Optional[str]]]] = None) -> boto3.Session: """Ensure that a valid boto3.Session will be returned.""" + if isinstance(session, dict): # Primitives received + return boto3_from_primitives(primitives=session) if session is not None: return session # Ensure the boto3's default session is used so that its parameters can be @@ -28,6 +31,30 @@ def ensure_session(session: Optional[boto3.Session] = None) -> boto3.Session: return boto3.Session() # pragma: no cover +def boto3_to_primitives(boto3_session: Optional[boto3.Session] = None) -> Dict[str, Optional[str]]: + """Convert Boto3 Session to Python primitives.""" + _boto3_session: boto3.Session = ensure_session(session=boto3_session) + credentials = _boto3_session.get_credentials() + return { + "aws_access_key_id": getattr(credentials, "access_key", None), + "aws_secret_access_key": getattr(credentials, "secret_key", None), + "aws_session_token": getattr(credentials, "token", None), + "region_name": _boto3_session.region_name, + "profile_name": _boto3_session.profile_name, + } + + +def boto3_from_primitives(primitives: Dict[str, Optional[str]] = None) -> boto3.Session: + """Convert Python primitives to Boto3 Session.""" + if primitives is None: + return boto3.DEFAULT_SESSION # pragma: 
no cover + _primitives: Dict[str, Optional[str]] = copy.deepcopy(primitives) + profile_name: Optional[str] = _primitives.get("profile_name", None) + _primitives["profile_name"] = None if profile_name in (None, "default") else profile_name + args: Dict[str, str] = {k: v for k, v in _primitives.items() if v is not None} + return boto3.Session(**args) + + def client(service_name: str, session: Optional[boto3.Session] = None) -> boto3.client: """Create a valid boto3.client.""" return ensure_session(session=session).client( @@ -139,7 +166,8 @@ def chunkify(lst: List[Any], num_chunks: int = 1, max_length: Optional[int] = No def get_fs( - session: Optional[boto3.Session] = None, s3_additional_kwargs: Optional[Dict[str, str]] = None + session: Optional[Union[boto3.Session, Dict[str, Optional[str]]]] = None, + s3_additional_kwargs: Optional[Dict[str, str]] = None, ) -> s3fs.S3FileSystem: """Build a S3FileSystem from a given boto3 session.""" fs: s3fs.S3FileSystem = s3fs.S3FileSystem( diff --git a/awswrangler/s3.py b/awswrangler/s3.py index d82df8567..0e98e674c 100644 --- a/awswrangler/s3.py +++ b/awswrangler/s3.py @@ -1614,7 +1614,7 @@ def _read_text( path_root=path_root, ) return dfs - if (use_threads is False) or (boto3_session is not None): + if use_threads is False: df: pd.DataFrame = pd.concat( objs=[ _read_text_full( @@ -1640,7 +1640,7 @@ def _read_text( repeat(parser_func), repeat(path_root), paths, - repeat(None), # Boto3.Session + repeat(_utils.boto3_to_primitives(boto3_session=session)), # Boto3.Session repeat(pandas_kwargs), repeat(s3_additional_kwargs), repeat(dataset), @@ -1683,7 +1683,7 @@ def _read_text_full( parser_func: Callable, path_root: str, path: str, - boto3_session: boto3.Session, + boto3_session: Union[boto3.Session, Dict[str, Optional[str]]], pandas_args: Dict[str, Any], s3_additional_kwargs: Optional[Dict[str, str]] = None, dataset: bool = False, @@ -2354,29 +2354,38 @@ def _wait_objects( delay = 5 if delay is None else delay max_attempts = 20 if max_attempts is None else max_attempts _delay: int = int(delay) if isinstance(delay, float) else delay - if len(paths) < 1: return None client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) - waiter = client_s3.get_waiter(waiter_name) _paths: List[Tuple[str, str]] = [_utils.parse_path(path=p) for p in paths] if use_threads is False: + waiter = client_s3.get_waiter(waiter_name) for bucket, key in _paths: waiter.wait(Bucket=bucket, Key=key, WaiterConfig={"Delay": _delay, "MaxAttempts": max_attempts}) else: cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor: - futures: List[concurrent.futures.Future] = [] - for bucket, key in _paths: - future: concurrent.futures.Future = executor.submit( - fn=waiter.wait, Bucket=bucket, Key=key, WaiterConfig={"Delay": _delay, "MaxAttempts": max_attempts} + list( + executor.map( + _wait_objects_concurrent, + _paths, + repeat(waiter_name), + repeat(client_s3), + repeat(_delay), + repeat(max_attempts), ) - futures.append(future) - for future in futures: - future.result() + ) return None +def _wait_objects_concurrent( + path: Tuple[str, str], waiter_name: str, client_s3: boto3.client, delay: int, max_attempts: int +) -> None: + waiter = client_s3.get_waiter(waiter_name) + bucket, key = path + waiter.wait(Bucket=bucket, Key=key, WaiterConfig={"Delay": delay, "MaxAttempts": max_attempts}) + + def read_parquet_table( table: str, database: str, From 01cc4ee6119cb204d8290a951c33dd6cceb45eb5 Mon Sep 
17 00:00:00 2001 From: Patrick Muller Date: Sun, 7 Jun 2020 15:40:45 +0200 Subject: [PATCH 07/28] Adding the create and delete database in the notebook --- tutorials/005 - Glue Catalog.ipynb | 174 ++++------ tutorials/006 - Amazon Athena.ipynb | 521 +++++++++++++++------------- 2 files changed, 363 insertions(+), 332 deletions(-) diff --git a/tutorials/005 - Glue Catalog.ipynb b/tutorials/005 - Glue Catalog.ipynb index 1e48a20c9..4c3b02540 100644 --- a/tutorials/005 - Glue Catalog.ipynb +++ b/tutorials/005 - Glue Catalog.ipynb @@ -36,10 +36,10 @@ "metadata": {}, "outputs": [ { - "name": "stdin", + "name": "stdout", "output_type": "stream", "text": [ - " ··········································\n" + "········\n" ] } ], @@ -192,77 +192,62 @@ "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
DatabaseDescription
0awswrangler_testAWS Data Wrangler Test Arena - Glue Database
1defaultDefault Hive database
2sampledbSample database
\n", - "
" - ], - "text/plain": [ - " Database Description\n", - "0 awswrangler_test AWS Data Wrangler Test Arena - Glue Database\n", - "1 default Default Hive database\n", - "2 sampledb Sample database" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + " Database Description\n", + "0 aws_data_wrangler AWS Data Wrangler Test Arena - Glue Database\n", + "1 default Default Hive database\n" + ] } ], "source": [ - "wr.catalog.databases()" + "databases = wr.catalog.databases()\n", + "print(databases)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Checking the empty database" + "### Create the database awswrangler_test if not exists" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Database Description\n", + "0 aws_data_wrangler AWS Data Wrangler Test Arena - Glue Database\n", + "1 awswrangler_test \n", + "2 default Default Hive database\n" + ] + } + ], + "source": [ + "if \"awswrangler_test\" not in databases.values:\n", + " wr.catalog.create_database(\"awswrangler_test\")\n", + " print(wr.catalog.databases())\n", + "else:\n", + " print(\"Database awswrangler_test already exists\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Checking the empty database" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, "outputs": [ { "data": { @@ -293,37 +278,17 @@ " \n", " \n", " \n", - " \n", - " 0\n", - " awswrangler_test\n", - " lambda\n", - " \n", - " col1, col2\n", - " \n", - " \n", - " \n", - " 1\n", - " awswrangler_test\n", - " noaa\n", - " \n", - " id, dt, element, value, m_flag, q_flag, s_flag...\n", - " \n", - " \n", " \n", "\n", "" ], "text/plain": [ - " Database Table Description \\\n", - "0 awswrangler_test lambda \n", - "1 awswrangler_test noaa \n", - "\n", - " Columns Partitions \n", - "0 col1, col2 \n", - "1 id, dt, element, value, m_flag, q_flag, s_flag... " + "Empty DataFrame\n", + "Columns: [Database, Table, Description, Columns, Partitions]\n", + "Index: []" ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -341,7 +306,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -408,7 +373,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -463,7 +428,7 @@ "0 crim, zn, indus, chas, nox, rm, age, dis, rad,... " ] }, - "execution_count": 7, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -474,7 +439,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -529,7 +494,7 @@ "0 crim, zn, indus, chas, nox, rm, age, dis, rad,... " ] }, - "execution_count": 8, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -540,7 +505,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -595,7 +560,7 @@ "0 crim, zn, indus, chas, nox, rm, age, dis, rad,... " ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -606,7 +571,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -661,7 +626,7 @@ "0 crim, zn, indus, chas, nox, rm, age, dis, rad,... 
" ] }, - "execution_count": 10, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -679,7 +644,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -846,7 +811,7 @@ "13 " ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -864,7 +829,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -872,19 +837,28 @@ " wr.catalog.delete_table_if_exists(database=\"awswrangler_test\", table=table[\"Name\"])" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Delete Database" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "wr.catalog.delete_database('awswrangler_test')" + ] } ], "metadata": { "kernelspec": { - "display_name": "conda_python3", + "display_name": "Python 3", "language": "python", - "name": "conda_python3" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -896,9 +870,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.7.7" } }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/tutorials/006 - Amazon Athena.ipynb b/tutorials/006 - Amazon Athena.ipynb index c4216346a..a3fcb221d 100644 --- a/tutorials/006 - Amazon Athena.ipynb +++ b/tutorials/006 - Amazon Athena.ipynb @@ -49,10 +49,10 @@ "metadata": {}, "outputs": [ { - "name": "stdin", + "name": "stdout", "output_type": "stream", "text": [ - " ··········································\n" + "········\n" ] } ], @@ -62,6 +62,57 @@ "path = f\"s3://{bucket}/data/\"" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Checking Glue Catalog Databases" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Database Description\n", + "0 aws_data_wrangler AWS Data Wrangler Test Arena - Glue Database\n", + "1 default Default Hive database\n" + ] + } + ], + "source": [ + "databases = wr.catalog.databases()\n", + "print(databases)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Database Description\n", + "0 aws_data_wrangler AWS Data Wrangler Test Arena - Glue Database\n", + "1 awswrangler_test \n", + "2 default Default Hive database\n" + ] + } + ], + "source": [ + "if \"awswrangler_test\" not in databases.values:\n", + " wr.catalog.create_database(\"awswrangler_test\")\n", + " print(wr.catalog.databases())\n", + "else:\n", + " print(\"Database awswrangler_test already exists\")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -73,7 +124,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -175,20 +226,20 @@ " ...\n", " \n", " \n", - " 29240012\n", - " USC00181790\n", - " 1899-12-31\n", - " PRCP\n", + " 1276241\n", + " CA001167635\n", + " 1890-12-31\n", + " SNOW\n", " 0\n", - " P\n", " NaN\n", - " 6\n", - " 1830\n", + " NaN\n", + " C\n", + " NaN\n", " \n", " \n", - " 29240013\n", - " ASN00061000\n", - " 1899-12-31\n", + " 1276242\n", + " ASN00019053\n", + " 1890-12-31\n", " PRCP\n", " 0\n", " NaN\n", @@ -197,9 +248,9 @@ " NaN\n", " \n", " \n", - " 29240014\n", - " ASN00040284\n", - " 1899-12-31\n", + " 
1276243\n", + " ASN00024501\n", + " 1890-12-31\n", " PRCP\n", " 0\n", " NaN\n", @@ -208,22 +259,22 @@ " NaN\n", " \n", " \n", - " 29240015\n", - " ASN00048117\n", - " 1899-12-31\n", + " 1276244\n", + " SF001035700\n", + " 1890-12-31\n", " PRCP\n", " 0\n", " NaN\n", " NaN\n", - " a\n", + " I\n", " NaN\n", " \n", " \n", - " 29240016\n", + " 1276245\n", " ASN00054092\n", - " 1899-12-31\n", + " 1890-12-31\n", " PRCP\n", - " 0\n", + " 15\n", " NaN\n", " NaN\n", " a\n", @@ -231,27 +282,27 @@ " \n", " \n", "\n", - "

29240017 rows × 8 columns

\n", + "

1276246 rows × 8 columns

\n", "" ], "text/plain": [ - " id dt element value m_flag q_flag s_flag obs_time\n", - "0 ASN00070200 1890-01-01 PRCP 0 NaN NaN a NaN\n", - "1 SF000782720 1890-01-01 PRCP 0 NaN NaN I NaN\n", - "2 CA005022790 1890-01-01 TMAX -222 NaN NaN C NaN\n", - "3 CA005022790 1890-01-01 TMIN -261 NaN NaN C NaN\n", - "4 CA005022790 1890-01-01 PRCP 0 NaN NaN C NaN\n", - "... ... ... ... ... ... ... ... ...\n", - "29240012 USC00181790 1899-12-31 PRCP 0 P NaN 6 1830\n", - "29240013 ASN00061000 1899-12-31 PRCP 0 NaN NaN a NaN\n", - "29240014 ASN00040284 1899-12-31 PRCP 0 NaN NaN a NaN\n", - "29240015 ASN00048117 1899-12-31 PRCP 0 NaN NaN a NaN\n", - "29240016 ASN00054092 1899-12-31 PRCP 0 NaN NaN a NaN\n", + " id dt element value m_flag q_flag s_flag obs_time\n", + "0 ASN00070200 1890-01-01 PRCP 0 NaN NaN a NaN\n", + "1 SF000782720 1890-01-01 PRCP 0 NaN NaN I NaN\n", + "2 CA005022790 1890-01-01 TMAX -222 NaN NaN C NaN\n", + "3 CA005022790 1890-01-01 TMIN -261 NaN NaN C NaN\n", + "4 CA005022790 1890-01-01 PRCP 0 NaN NaN C NaN\n", + "... ... ... ... ... ... ... ... ...\n", + "1276241 CA001167635 1890-12-31 SNOW 0 NaN NaN C NaN\n", + "1276242 ASN00019053 1890-12-31 PRCP 0 NaN NaN a NaN\n", + "1276243 ASN00024501 1890-12-31 PRCP 0 NaN NaN a NaN\n", + "1276244 SF001035700 1890-12-31 PRCP 0 NaN NaN I NaN\n", + "1276245 ASN00054092 1890-12-31 PRCP 15 NaN NaN a NaN\n", "\n", - "[29240017 rows x 8 columns]" + "[1276246 rows x 8 columns]" ] }, - "execution_count": 3, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -269,7 +320,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -285,7 +336,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -388,7 +439,7 @@ "7 obs_time string False " ] }, - "execution_count": 5, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -406,15 +457,15 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 19.7 s, sys: 5.2 s, total: 24.9 s\n", - "Wall time: 45 s\n" + "CPU times: user 1.57 s, sys: 454 ms, total: 2.02 s\n", + "Wall time: 46.6 s\n" ] }, { @@ -451,8 +502,8 @@ " \n", " \n", " 0\n", - " ASN00047014\n", - " 1892-01-16\n", + " ASN00061069\n", + " 1890-01-01\n", " PRCP\n", " 0\n", " <NA>\n", @@ -462,46 +513,46 @@ " \n", " \n", " 1\n", - " ASN00056032\n", - " 1892-01-16\n", + " USC00212904\n", + " 1890-01-01\n", " PRCP\n", " 0\n", " <NA>\n", " <NA>\n", - " a\n", + " 6\n", " <NA>\n", " \n", " \n", " 2\n", - " KG000036948\n", - " 1892-01-16\n", - " PRCP\n", - " 0\n", + " USC00212904\n", + " 1890-01-01\n", + " SNWD\n", + " 305\n", " <NA>\n", " <NA>\n", - " I\n", + " 6\n", " <NA>\n", " \n", " \n", " 3\n", - " CA005010868\n", - " 1892-01-16\n", + " ASN00019052\n", + " 1890-01-01\n", " PRCP\n", " 0\n", " <NA>\n", " <NA>\n", - " C\n", + " a\n", " <NA>\n", " \n", " \n", " 4\n", - " CA005010868\n", - " 1892-01-16\n", - " SNOW\n", + " RSM00022112\n", + " 1890-01-01\n", + " PRCP\n", " 0\n", " <NA>\n", " <NA>\n", - " C\n", + " I\n", " <NA>\n", " \n", " \n", @@ -516,83 +567,83 @@ " ...\n", " \n", " \n", - " 29240012\n", - " USC00303773\n", - " 1899-12-31\n", - " SNOW\n", + " 1276241\n", + " ASN00075035\n", + " 1890-10-28\n", + " PRCP\n", " 0\n", " <NA>\n", " <NA>\n", - " 6\n", + " a\n", " <NA>\n", " \n", " \n", - " 29240013\n", - " USC00165090\n", - " 1899-12-31\n", - " TMAX\n", - " 100\n", 
+ " 1276242\n", + " SF001988360\n", + " 1890-10-28\n", + " PRCP\n", + " 51\n", " <NA>\n", " <NA>\n", - " 6\n", + " I\n", " <NA>\n", " \n", " \n", - " 29240014\n", - " USC00165090\n", - " 1899-12-31\n", - " TMIN\n", - " -33\n", + " 1276243\n", + " ASN00048021\n", + " 1890-10-28\n", + " PRCP\n", + " 0\n", " <NA>\n", " <NA>\n", - " 6\n", + " a\n", " <NA>\n", " \n", " \n", - " 29240015\n", - " USC00165090\n", - " 1899-12-31\n", + " 1276244\n", + " USC00412758\n", + " 1890-10-28\n", " PRCP\n", - " 51\n", + " 0\n", " <NA>\n", " <NA>\n", " 6\n", " <NA>\n", " \n", " \n", - " 29240016\n", - " USC00165090\n", - " 1899-12-31\n", - " SNOW\n", - " 51\n", + " 1276245\n", + " SF000440500\n", + " 1890-10-28\n", + " PRCP\n", + " 0\n", " <NA>\n", " <NA>\n", - " 6\n", + " I\n", " <NA>\n", " \n", " \n", "\n", - "

29240017 rows × 8 columns

\n", + "

1276246 rows × 8 columns

\n", "" ], "text/plain": [ - " id dt element value m_flag q_flag s_flag obs_time\n", - "0 ASN00047014 1892-01-16 PRCP 0 a \n", - "1 ASN00056032 1892-01-16 PRCP 0 a \n", - "2 KG000036948 1892-01-16 PRCP 0 I \n", - "3 CA005010868 1892-01-16 PRCP 0 C \n", - "4 CA005010868 1892-01-16 SNOW 0 C \n", - "... ... ... ... ... ... ... ... ...\n", - "29240012 USC00303773 1899-12-31 SNOW 0 6 \n", - "29240013 USC00165090 1899-12-31 TMAX 100 6 \n", - "29240014 USC00165090 1899-12-31 TMIN -33 6 \n", - "29240015 USC00165090 1899-12-31 PRCP 51 6 \n", - "29240016 USC00165090 1899-12-31 SNOW 51 6 \n", + " id dt element value m_flag q_flag s_flag obs_time\n", + "0 ASN00061069 1890-01-01 PRCP 0 a \n", + "1 USC00212904 1890-01-01 PRCP 0 6 \n", + "2 USC00212904 1890-01-01 SNWD 305 6 \n", + "3 ASN00019052 1890-01-01 PRCP 0 a \n", + "4 RSM00022112 1890-01-01 PRCP 0 I \n", + "... ... ... ... ... ... ... ... ...\n", + "1276241 ASN00075035 1890-10-28 PRCP 0 a \n", + "1276242 SF001988360 1890-10-28 PRCP 51 I \n", + "1276243 ASN00048021 1890-10-28 PRCP 0 a \n", + "1276244 USC00412758 1890-10-28 PRCP 0 6 \n", + "1276245 SF000440500 1890-10-28 PRCP 0 I \n", "\n", - "[29240017 rows x 8 columns]" + "[1276246 rows x 8 columns]" ] }, - "execution_count": 6, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -612,15 +663,15 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 8min 33s, sys: 6.11 s, total: 8min 39s\n", - "Wall time: 12min 28s\n" + "CPU times: user 21.2 s, sys: 1.8 s, total: 23 s\n", + "Wall time: 6min 22s\n" ] }, { @@ -722,83 +773,83 @@ " ...\n", " \n", " \n", - " 29240012\n", - " USC00181790\n", - " 1899-12-31\n", - " PRCP\n", + " 1276241\n", + " CA006131910\n", + " 1890-12-31\n", + " SNOW\n", " 0\n", - " P\n", " <NA>\n", - " 6\n", - " 1830\n", + " <NA>\n", + " C\n", + " <NA>\n", " \n", " \n", - " 29240013\n", - " ASN00061000\n", - " 1899-12-31\n", - " PRCP\n", - " 0\n", + " 1276242\n", + " USC00174230\n", + " 1890-12-31\n", + " TMAX\n", + " -106\n", " <NA>\n", " <NA>\n", - " a\n", + " 6\n", " <NA>\n", " \n", " \n", - " 29240014\n", - " ASN00040284\n", - " 1899-12-31\n", - " PRCP\n", - " 0\n", + " 1276243\n", + " USC00174230\n", + " 1890-12-31\n", + " TMIN\n", + " -244\n", " <NA>\n", " <NA>\n", - " a\n", + " 6\n", " <NA>\n", " \n", " \n", - " 29240015\n", - " ASN00048117\n", - " 1899-12-31\n", + " 1276244\n", + " USC00174230\n", + " 1890-12-31\n", " PRCP\n", " 0\n", + " P\n", " <NA>\n", - " <NA>\n", - " a\n", + " 6\n", " <NA>\n", " \n", " \n", - " 29240016\n", - " ASN00054092\n", - " 1899-12-31\n", - " PRCP\n", + " 1276245\n", + " USC00174230\n", + " 1890-12-31\n", + " SNOW\n", " 0\n", " <NA>\n", " <NA>\n", - " a\n", + " 6\n", " <NA>\n", " \n", " \n", "\n", - "

29240017 rows × 8 columns

\n", + "

1276246 rows × 8 columns

\n", "" ], "text/plain": [ - " id dt element value m_flag q_flag s_flag obs_time\n", - "0 ASN00070200 1890-01-01 PRCP 0 a \n", - "1 SF000782720 1890-01-01 PRCP 0 I \n", - "2 CA005022790 1890-01-01 TMAX -222 C \n", - "3 CA005022790 1890-01-01 TMIN -261 C \n", - "4 CA005022790 1890-01-01 PRCP 0 C \n", - "... ... ... ... ... ... ... ... ...\n", - "29240012 USC00181790 1899-12-31 PRCP 0 P 6 1830\n", - "29240013 ASN00061000 1899-12-31 PRCP 0 a \n", - "29240014 ASN00040284 1899-12-31 PRCP 0 a \n", - "29240015 ASN00048117 1899-12-31 PRCP 0 a \n", - "29240016 ASN00054092 1899-12-31 PRCP 0 a \n", + " id dt element value m_flag q_flag s_flag obs_time\n", + "0 ASN00070200 1890-01-01 PRCP 0 a \n", + "1 SF000782720 1890-01-01 PRCP 0 I \n", + "2 CA005022790 1890-01-01 TMAX -222 C \n", + "3 CA005022790 1890-01-01 TMIN -261 C \n", + "4 CA005022790 1890-01-01 PRCP 0 C \n", + "... ... ... ... ... ... ... ... ...\n", + "1276241 CA006131910 1890-12-31 SNOW 0 C \n", + "1276242 USC00174230 1890-12-31 TMAX -106 6 \n", + "1276243 USC00174230 1890-12-31 TMIN -244 6 \n", + "1276244 USC00174230 1890-12-31 PRCP 0 P 6 \n", + "1276245 USC00174230 1890-12-31 SNOW 0 6 \n", "\n", - "[29240017 rows x 8 columns]" + "[1276246 rows x 8 columns]" ] }, - "execution_count": 7, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -818,15 +869,15 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 4.12 s, sys: 1.75 s, total: 5.87 s\n", - "Wall time: 31 s\n" + "CPU times: user 748 ms, sys: 279 ms, total: 1.03 s\n", + "Wall time: 48.8 s\n" ] }, { @@ -863,58 +914,58 @@ " \n", " \n", " 0\n", - " CA008101170\n", + " ASN00061069\n", " 1890-01-01\n", - " TMIN\n", - " -217\n", + " PRCP\n", + " 0\n", " NaN\n", " NaN\n", - " C\n", + " a\n", " NaN\n", " \n", " \n", " 1\n", - " CA008101170\n", + " USC00212904\n", " 1890-01-01\n", " PRCP\n", " 0\n", " NaN\n", " NaN\n", - " C\n", + " 6\n", " NaN\n", " \n", " \n", " 2\n", - " CA008101170\n", + " USC00212904\n", " 1890-01-01\n", - " SNOW\n", - " 0\n", + " SNWD\n", + " 305\n", " NaN\n", " NaN\n", - " C\n", + " 6\n", " NaN\n", " \n", " \n", " 3\n", - " USC00435733\n", + " ASN00019052\n", " 1890-01-01\n", - " TMAX\n", - " 33\n", + " PRCP\n", + " 0\n", " NaN\n", " NaN\n", - " 6\n", - " 1700\n", + " a\n", + " NaN\n", " \n", " \n", " 4\n", - " USC00435733\n", + " RSM00022112\n", " 1890-01-01\n", - " TMIN\n", - " -122\n", + " PRCP\n", + " 0\n", " NaN\n", " NaN\n", - " 6\n", - " 1700\n", + " I\n", + " NaN\n", " \n", " \n", " ...\n", @@ -928,83 +979,83 @@ " ...\n", " \n", " \n", - " 29240012\n", - " USC00395481\n", - " 1899-12-31\n", - " SNOW\n", + " 1276241\n", + " SF004323870\n", + " 1890-01-03\n", + " PRCP\n", " 0\n", " NaN\n", " NaN\n", - " 6\n", + " I\n", " NaN\n", " \n", " \n", - " 29240013\n", - " ASN00063055\n", - " 1899-12-31\n", + " 1276242\n", + " SF001018040\n", + " 1890-01-03\n", " PRCP\n", " 0\n", " NaN\n", " NaN\n", - " a\n", + " I\n", " NaN\n", " \n", " \n", - " 29240014\n", - " USC00357814\n", - " 1899-12-31\n", - " TMAX\n", - " 78\n", + " 1276243\n", + " LG000026314\n", + " 1890-01-03\n", + " PRCP\n", + " 0\n", " NaN\n", " NaN\n", - " 6\n", + " I\n", " NaN\n", " \n", " \n", - " 29240015\n", - " USC00357814\n", - " 1899-12-31\n", - " TMIN\n", - " 0\n", + " 1276244\n", + " CA004016320\n", + " 1890-01-03\n", + " TMAX\n", + " -278\n", " NaN\n", " NaN\n", - " 6\n", + " C\n", " NaN\n", " \n", " \n", - " 29240016\n", - " USC00357814\n", - " 
1899-12-31\n", - " PRCP\n", - " 102\n", + " 1276245\n", + " CA004016320\n", + " 1890-01-03\n", + " TMIN\n", + " -383\n", " NaN\n", " NaN\n", - " 6\n", + " C\n", " NaN\n", " \n", " \n", "\n", - "

29240017 rows × 8 columns

\n", + "

1276246 rows × 8 columns

\n", "" ], "text/plain": [ - " id dt element value m_flag q_flag s_flag obs_time\n", - "0 CA008101170 1890-01-01 TMIN -217 NaN NaN C NaN\n", - "1 CA008101170 1890-01-01 PRCP 0 NaN NaN C NaN\n", - "2 CA008101170 1890-01-01 SNOW 0 NaN NaN C NaN\n", - "3 USC00435733 1890-01-01 TMAX 33 NaN NaN 6 1700\n", - "4 USC00435733 1890-01-01 TMIN -122 NaN NaN 6 1700\n", - "... ... ... ... ... ... ... ... ...\n", - "29240012 USC00395481 1899-12-31 SNOW 0 NaN NaN 6 NaN\n", - "29240013 ASN00063055 1899-12-31 PRCP 0 NaN NaN a NaN\n", - "29240014 USC00357814 1899-12-31 TMAX 78 NaN NaN 6 NaN\n", - "29240015 USC00357814 1899-12-31 TMIN 0 NaN NaN 6 NaN\n", - "29240016 USC00357814 1899-12-31 PRCP 102 NaN NaN 6 NaN\n", + " id dt element value m_flag q_flag s_flag obs_time\n", + "0 ASN00061069 1890-01-01 PRCP 0 NaN NaN a NaN\n", + "1 USC00212904 1890-01-01 PRCP 0 NaN NaN 6 NaN\n", + "2 USC00212904 1890-01-01 SNWD 305 NaN NaN 6 NaN\n", + "3 ASN00019052 1890-01-01 PRCP 0 NaN NaN a NaN\n", + "4 RSM00022112 1890-01-01 PRCP 0 NaN NaN I NaN\n", + "... ... ... ... ... ... ... ... ...\n", + "1276241 SF004323870 1890-01-03 PRCP 0 NaN NaN I NaN\n", + "1276242 SF001018040 1890-01-03 PRCP 0 NaN NaN I NaN\n", + "1276243 LG000026314 1890-01-03 PRCP 0 NaN NaN I NaN\n", + "1276244 CA004016320 1890-01-03 TMAX -278 NaN NaN C NaN\n", + "1276245 CA004016320 1890-01-03 TMIN -383 NaN NaN C NaN\n", "\n", - "[29240017 rows x 8 columns]" + "[1276246 rows x 8 columns]" ] }, - "execution_count": 8, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -1024,19 +1075,18 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ + "110592\n", + "150870\n", + "1024\n", "1024\n", - "50176\n", - "2278400\n", - "9641681\n", - "9716736\n", - "7552000\n" + "1012736\n" ] } ], @@ -1054,16 +1104,14 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "10000000\n", - "10000000\n", - "9240017\n" + "1276246\n" ] } ], @@ -1088,7 +1136,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -1104,7 +1152,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -1112,19 +1160,28 @@ " wr.catalog.delete_table_if_exists(database=\"awswrangler_test\", table=table[\"Name\"])" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Delete Database" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "wr.catalog.delete_database('awswrangler_test')" + ] } ], "metadata": { "kernelspec": { - "display_name": "conda_python3", + "display_name": "Python 3", "language": "python", - "name": "conda_python3" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -1136,7 +1193,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.7.7" }, "pycharm": { "stem_cell": { From 7450b88ee98dec81dacea986cdfb611f8d3fd531 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Sun, 7 Jun 2020 21:18:55 -0300 Subject: [PATCH 08/28] Bumping tox version. 
--- requirements-dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 49be548c4..02e0f3875 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -6,7 +6,7 @@ mypy~=0.780 isort~=4.3.21 pydocstyle~=5.0.2 doc8~=0.8.1 -tox~=3.15.1 +tox~=3.15.2 pytest~=5.4.3 pytest-cov~=2.9.0 pytest-xdist~=1.32.0 From c474900b8abfec81f4bf577826eba1a7e2c494ba Mon Sep 17 00:00:00 2001 From: igorborgest Date: Thu, 11 Jun 2020 11:27:35 -0300 Subject: [PATCH 09/28] Add S3 path check. --- awswrangler/_utils.py | 2 ++ awswrangler/s3.py | 4 ++-- testing/test_awswrangler/test_data_lake2.py | 6 ++++++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/awswrangler/_utils.py b/awswrangler/_utils.py index c20117891..860b5ac24 100644 --- a/awswrangler/_utils.py +++ b/awswrangler/_utils.py @@ -90,6 +90,8 @@ def parse_path(path: str) -> Tuple[str, str]: >>> bucket, key = parse_path('s3://bucket/key') """ + if path.startswith("s3://") is False: + raise exceptions.InvalidArgumentValue(f"'{path}' is not a valid path. It MUST start with 's3://'") parts = path.replace("s3://", "").split("/", 1) bucket: str = parts[0] key: str = "" diff --git a/awswrangler/s3.py b/awswrangler/s3.py index 0e98e674c..4ff29097f 100644 --- a/awswrangler/s3.py +++ b/awswrangler/s3.py @@ -188,11 +188,11 @@ def _list_objects( suffix: Optional[str] = None, boto3_session: Optional[boto3.Session] = None, ) -> List[str]: - client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) - paginator = client_s3.get_paginator("list_objects_v2") bucket: str prefix: str bucket, prefix = _utils.parse_path(path=path) + client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) + paginator = client_s3.get_paginator("list_objects_v2") args: Dict[str, Any] = {"Bucket": bucket, "Prefix": prefix, "PaginationConfig": {"PageSize": 1000}} if delimiter is not None: args["Delimiter"] = delimiter diff --git a/testing/test_awswrangler/test_data_lake2.py b/testing/test_awswrangler/test_data_lake2.py index 320c2152a..6d12df444 100644 --- a/testing/test_awswrangler/test_data_lake2.py +++ b/testing/test_awswrangler/test_data_lake2.py @@ -457,3 +457,9 @@ def test_glue_database(): assert test_database_name == "" assert test_database_description == "" + + +def test_list_wrong_path(path): + wrong_path = path.replace("s3://", "") + with pytest.raises(wr.exceptions.InvalidArgumentValue): + wr.s3.list_objects(wrong_path) From 91a96c551c893e8439d275f081b4dd8f7cc64135 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Thu, 11 Jun 2020 11:45:54 -0300 Subject: [PATCH 10/28] Force index=False for wr.db.to_sql() with redshift. --- awswrangler/db.py | 6 ++++++ testing/test_awswrangler/test_db.py | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/awswrangler/db.py b/awswrangler/db.py index 61a495c12..6d22f13e8 100644 --- a/awswrangler/db.py +++ b/awswrangler/db.py @@ -34,6 +34,10 @@ def to_sql(df: pd.DataFrame, con: sqlalchemy.engine.Engine, **pandas_kwargs) -> ---- Redshift: For large DataFrames (1MM+ rows) consider the function **wr.db.copy_to_redshift()**. + Note + ---- + Redshift: `index=False` will be forced. 
+ Parameters ---------- df : pandas.DataFrame @@ -92,6 +96,8 @@ def to_sql(df: pd.DataFrame, con: sqlalchemy.engine.Engine, **pandas_kwargs) -> ) pandas_kwargs["dtype"] = dtypes pandas_kwargs["con"] = con + if pandas_kwargs["con"].name.lower() == "redshift": # Redshift does not accept index + pandas_kwargs["index"] = False max_attempts: int = 3 for attempt in range(max_attempts): try: diff --git a/testing/test_awswrangler/test_db.py b/testing/test_awswrangler/test_db.py index 4ff1e68ed..7e198f45b 100644 --- a/testing/test_awswrangler/test_db.py +++ b/testing/test_awswrangler/test_db.py @@ -89,13 +89,14 @@ def test_sql(parameters, db_type): if db_type == "redshift": df.drop(["binary"], axis=1, inplace=True) engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}") + index = True if engine.name == "redshift" else False wr.db.to_sql( df=df, con=engine, name="test_sql", schema=parameters[db_type]["schema"], if_exists="replace", - index=False, + index=index, index_label=None, chunksize=None, method=None, From bcef16f694b72115e04f51932a0ffdf8ed3641a0 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Thu, 11 Jun 2020 13:41:34 -0300 Subject: [PATCH 11/28] Improve redshift tests. --- testing/test_awswrangler/test_db.py | 74 +++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/testing/test_awswrangler/test_db.py b/testing/test_awswrangler/test_db.py index 7e198f45b..6fa74b784 100644 --- a/testing/test_awswrangler/test_db.py +++ b/testing/test_awswrangler/test_db.py @@ -1,5 +1,6 @@ import logging import random +import string import boto3 import pandas as pd @@ -15,6 +16,8 @@ extract_cloudformation_outputs, get_df, get_df_category, + get_time_str_with_random_suffix, + path_generator ) logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s][%(name)s][%(funcName)s] %(message)s") @@ -27,6 +30,11 @@ def cloudformation_outputs(): yield extract_cloudformation_outputs() +@pytest.fixture(scope="function") +def path(bucket): + yield from path_generator(bucket) + + @pytest.fixture(scope="module") def bucket(cloudformation_outputs): if "BucketName" in cloudformation_outputs: @@ -63,6 +71,15 @@ def glue_database(cloudformation_outputs): yield cloudformation_outputs["GlueDatabaseName"] +@pytest.fixture(scope="function") +def glue_table(glue_database): + name = f"tbl_{get_time_str_with_random_suffix()}" + print(f"Table name: {name}") + wr.catalog.delete_table_if_exists(database=glue_database, table=name) + yield name + wr.catalog.delete_table_if_exists(database=glue_database, table=name) + + @pytest.fixture(scope="module") def external_schema(cloudformation_outputs, parameters, glue_database): region = cloudformation_outputs.get("Region") @@ -529,3 +546,60 @@ def test_null(parameters, db_type): df2 = wr.db.read_sql_table(table=table, schema=schema, con=engine) df["id"] = df["id"].astype("Int64") assert pd.concat(objs=[df, df], ignore_index=True).equals(df2) + + +def test_redshift_spectrum_long_string(path, glue_table, glue_database, external_schema): + df = pd.DataFrame({ + "id": [1, 2], + "col_str": [ + ''.join(random.choice(string.ascii_letters) for _ in range(300)), + ''.join(random.choice(string.ascii_letters) for _ in range(300)) + ] + }) + paths = wr.s3.to_parquet( + df=df, + path=path, + database=glue_database, + table=glue_table, + mode="overwrite", + index=False, + dataset=True, + )["paths"] + wr.s3.wait_objects_exist(paths=paths, use_threads=False) + engine = wr.catalog.get_engine(connection="aws-data-wrangler-redshift") + with engine.connect() as 
con: + cursor = con.execute(f"SELECT * FROM {external_schema}.{glue_table}") + rows = cursor.fetchall() + assert len(rows) == len(df.index) + for row in rows: + assert len(row) == len(df.columns) + + +def test_redshift_copy_unload_long_string(path, parameters): + df = pd.DataFrame({ + "id": [1, 2], + "col_str": [ + ''.join(random.choice(string.ascii_letters) for _ in range(300)), + ''.join(random.choice(string.ascii_letters) for _ in range(300)) + ] + }) + engine = wr.catalog.get_engine(connection="aws-data-wrangler-redshift") + wr.db.copy_to_redshift( + df=df, + path=path, + con=engine, + schema="public", + table="test_redshift_copy_unload_long_string", + mode="overwrite", + varchar_lengths={"col_str": 300}, + iam_role=parameters["redshift"]["role"], + ) + df2 = wr.db.unload_redshift( + sql="SELECT * FROM public.test_redshift_copy_unload_long_string", + con=engine, + iam_role=parameters["redshift"]["role"], + path=path, + keep_files=False, + ) + assert len(df2.index) == 2 + assert len(df2.columns) == 2 From d0c8614b314759561248adce1562c828e32152f3 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Thu, 11 Jun 2020 14:43:51 -0300 Subject: [PATCH 12/28] Improve redshift tests. --- testing/test_awswrangler/test_db.py | 42 ++++++++++++++--------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/testing/test_awswrangler/test_db.py b/testing/test_awswrangler/test_db.py index 6fa74b784..2775c7187 100644 --- a/testing/test_awswrangler/test_db.py +++ b/testing/test_awswrangler/test_db.py @@ -17,7 +17,7 @@ get_df, get_df_category, get_time_str_with_random_suffix, - path_generator + path_generator, ) logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s][%(name)s][%(funcName)s] %(message)s") @@ -549,21 +549,17 @@ def test_null(parameters, db_type): def test_redshift_spectrum_long_string(path, glue_table, glue_database, external_schema): - df = pd.DataFrame({ - "id": [1, 2], - "col_str": [ - ''.join(random.choice(string.ascii_letters) for _ in range(300)), - ''.join(random.choice(string.ascii_letters) for _ in range(300)) - ] - }) + df = pd.DataFrame( + { + "id": [1, 2], + "col_str": [ + "".join(random.choice(string.ascii_letters) for _ in range(300)), + "".join(random.choice(string.ascii_letters) for _ in range(300)), + ], + } + ) paths = wr.s3.to_parquet( - df=df, - path=path, - database=glue_database, - table=glue_table, - mode="overwrite", - index=False, - dataset=True, + df=df, path=path, database=glue_database, table=glue_table, mode="overwrite", index=False, dataset=True )["paths"] wr.s3.wait_objects_exist(paths=paths, use_threads=False) engine = wr.catalog.get_engine(connection="aws-data-wrangler-redshift") @@ -576,13 +572,15 @@ def test_redshift_spectrum_long_string(path, glue_table, glue_database, external def test_redshift_copy_unload_long_string(path, parameters): - df = pd.DataFrame({ - "id": [1, 2], - "col_str": [ - ''.join(random.choice(string.ascii_letters) for _ in range(300)), - ''.join(random.choice(string.ascii_letters) for _ in range(300)) - ] - }) + df = pd.DataFrame( + { + "id": [1, 2], + "col_str": [ + "".join(random.choice(string.ascii_letters) for _ in range(300)), + "".join(random.choice(string.ascii_letters) for _ in range(300)), + ], + } + ) engine = wr.catalog.get_engine(connection="aws-data-wrangler-redshift") wr.db.copy_to_redshift( df=df, From 2164b6863146cc4b71c276cf8883060a0e4394f6 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Thu, 11 Jun 2020 14:44:33 -0300 Subject: [PATCH 13/28] Add sanitize_columns arg in to_parquet and 
to_csv. #278 #279 --- awswrangler/s3.py | 46 ++++++++++++++------- testing/test_awswrangler/test_data_lake2.py | 23 +++++++++++ 2 files changed, 53 insertions(+), 16 deletions(-) diff --git a/awswrangler/s3.py b/awswrangler/s3.py index 4ff29097f..5c17b3fbb 100644 --- a/awswrangler/s3.py +++ b/awswrangler/s3.py @@ -429,7 +429,7 @@ def size_objects( return size_dict -def to_csv( # pylint: disable=too-many-arguments +def to_csv( # pylint: disable=too-many-arguments,too-many-locals df: pd.DataFrame, path: str, sep: str = ",", @@ -438,6 +438,7 @@ def to_csv( # pylint: disable=too-many-arguments use_threads: bool = True, boto3_session: Optional[boto3.Session] = None, s3_additional_kwargs: Optional[Dict[str, str]] = None, + sanitize_columns: bool = False, dataset: bool = False, partition_cols: Optional[List[str]] = None, mode: Optional[str] = None, @@ -464,8 +465,9 @@ def to_csv( # pylint: disable=too-many-arguments Note ---- - The table name and all column names will be automatically sanitize using + If `dataset=True` The table name and all column names will be automatically sanitized using `wr.catalog.sanitize_table_name` and `wr.catalog.sanitize_column_name`. + Please, pass `sanitize_columns=True` to force the same behaviour for `dataset=False`. Note ---- @@ -495,13 +497,16 @@ def to_csv( # pylint: disable=too-many-arguments s3_additional_kwargs: Forward to s3fs, useful for server side encryption https://s3fs.readthedocs.io/en/latest/#serverside-encryption - dataset: bool + sanitize_columns : bool + True to sanitize columns names or False to keep it as is. + True value is forced if `dataset=True`. + dataset : bool If True store a parquet dataset instead of a single file. If True, enable all follow arguments: partition_cols, mode, database, table, description, parameters, columns_comments, . partition_cols: List[str], optional List of column names that will be used to create partitions. Only takes effect if dataset=True. - mode: str, optional + mode : str, optional ``append`` (Default), ``overwrite``, ``overwrite_partitions``. Only takes effect if dataset=True. catalog_versioning : bool If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. 
@@ -662,13 +667,16 @@ def to_csv( # pylint: disable=too-many-arguments if df.empty is True: raise exceptions.EmptyDataFrame() - # Sanitize table to respect Athena's standards partition_cols = partition_cols if partition_cols else [] dtype = dtype if dtype else {} partitions_values: Dict[str, List[str]] = {} - df = catalog.sanitize_dataframe_columns_names(df=df) - partition_cols = [catalog.sanitize_column_name(p) for p in partition_cols] - dtype = {catalog.sanitize_column_name(k): v.lower() for k, v in dtype.items()} + + # Sanitize table to respect Athena's standards + if (sanitize_columns is True) or (dataset is True): + df = catalog.sanitize_dataframe_columns_names(df=df) + partition_cols = [catalog.sanitize_column_name(p) for p in partition_cols] + dtype = {catalog.sanitize_column_name(k): v.lower() for k, v in dtype.items()} + df = catalog.drop_duplicated_columns(df=df) session: boto3.Session = _utils.ensure_session(session=boto3_session) fs: s3fs.S3FileSystem = _utils.get_fs(session=session, s3_additional_kwargs=s3_additional_kwargs) @@ -703,7 +711,6 @@ def to_csv( # pylint: disable=too-many-arguments if catalog_types is not None: for k, v in catalog_types.items(): dtype[k] = v - df = catalog.drop_duplicated_columns(df=df) paths, partitions_values = _to_csv_dataset( df=df, path=path, @@ -906,6 +913,7 @@ def to_parquet( # pylint: disable=too-many-arguments,too-many-locals use_threads: bool = True, boto3_session: Optional[boto3.Session] = None, s3_additional_kwargs: Optional[Dict[str, str]] = None, + sanitize_columns: bool = False, dataset: bool = False, partition_cols: Optional[List[str]] = None, mode: Optional[str] = None, @@ -931,8 +939,9 @@ def to_parquet( # pylint: disable=too-many-arguments,too-many-locals Note ---- - The table name and all column names will be automatically sanitize using + If `dataset=True` The table name and all column names will be automatically sanitized using `wr.catalog.sanitize_table_name` and `wr.catalog.sanitize_column_name`. + Please, pass `sanitize_columns=True` to force the same behaviour for `dataset=False`. Note ---- @@ -960,7 +969,10 @@ def to_parquet( # pylint: disable=too-many-arguments,too-many-locals s3_additional_kwargs: Forward to s3fs, useful for server side encryption https://s3fs.readthedocs.io/en/latest/#serverside-encryption - dataset: bool + sanitize_columns : bool + True to sanitize columns names or False to keep it as is. + True value is forced if `dataset=True`. + dataset : bool If True store a parquet dataset instead of a single file. If True, enable all follow arguments: partition_cols, mode, database, table, description, parameters, columns_comments, . 
@@ -1127,14 +1139,16 @@ def to_parquet( # pylint: disable=too-many-arguments,too-many-locals if df.empty is True: raise exceptions.EmptyDataFrame() - # Sanitize table to respect Athena's standards partition_cols = partition_cols if partition_cols else [] dtype = dtype if dtype else {} partitions_values: Dict[str, List[str]] = {} - df = catalog.sanitize_dataframe_columns_names(df=df) - partition_cols = [catalog.sanitize_column_name(p) for p in partition_cols] - dtype = {catalog.sanitize_column_name(k): v.lower() for k, v in dtype.items()} - df = catalog.drop_duplicated_columns(df=df) + + # Sanitize table to respect Athena's standards + if (sanitize_columns is True) or (dataset is True): + df = catalog.sanitize_dataframe_columns_names(df=df) + partition_cols = [catalog.sanitize_column_name(p) for p in partition_cols] + dtype = {catalog.sanitize_column_name(k): v.lower() for k, v in dtype.items()} + df = catalog.drop_duplicated_columns(df=df) session: boto3.Session = _utils.ensure_session(session=boto3_session) cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) diff --git a/testing/test_awswrangler/test_data_lake2.py b/testing/test_awswrangler/test_data_lake2.py index 6d12df444..7993d2905 100644 --- a/testing/test_awswrangler/test_data_lake2.py +++ b/testing/test_awswrangler/test_data_lake2.py @@ -463,3 +463,26 @@ def test_list_wrong_path(path): wrong_path = path.replace("s3://", "") with pytest.raises(wr.exceptions.InvalidArgumentValue): wr.s3.list_objects(wrong_path) + + +@pytest.mark.parametrize("sanitize_columns,col", [(True, "foo_boo"), (False, "FooBoo")]) +def test_sanitize_columns(path, sanitize_columns, col): + df = pd.DataFrame({"FooBoo": [1, 2, 3]}) + + # Parquet + file_path = f"{path}0.parquet" + wr.s3.to_parquet(df, path=file_path, sanitize_columns=sanitize_columns) + wr.s3.wait_objects_exist([file_path]) + df = wr.s3.read_parquet(file_path) + assert len(df.index) == 3 + assert len(df.columns) == 1 + assert df.columns == [col] + + # CSV + file_path = f"{path}0.csv" + wr.s3.to_csv(df, path=file_path, sanitize_columns=sanitize_columns, index=False) + wr.s3.wait_objects_exist([file_path]) + df = wr.s3.read_csv(file_path) + assert len(df.index) == 3 + assert len(df.columns) == 1 + assert df.columns == [col] From 10749e24e6048ea540ba130bfaa5b5d31bdf39b0 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Thu, 11 Jun 2020 15:35:31 -0300 Subject: [PATCH 14/28] Bumping dev dependencies versions. --- requirements-dev.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 02e0f3875..3cb805c67 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,7 +1,7 @@ awscli>=1.18.0,<2.0.0 black~=19.3b0 -pylint~=2.5.2 -flake8~=3.8.2 +pylint~=2.5.3 +flake8~=3.8.3 mypy~=0.780 isort~=4.3.21 pydocstyle~=5.0.2 From bd9ab9447240dc1253bdad9f6e9056c6124d50c1 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Thu, 11 Jun 2020 16:19:57 -0300 Subject: [PATCH 15/28] Breaking up s3 module in multiple files. 
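For callers, the split below is intended to be transparent, assuming the new `awswrangler/s3/__init__.py` re-exports the existing public functions (the diff only shows the file being created): user code keeps going through the `wr.s3` namespace, while shared helpers move into private submodules. A hedged sketch of that expectation (prefix hypothetical):

    import awswrangler as wr

    # Public entry points keep working through the wr.s3 namespace after the split.
    paths = wr.s3.list_objects("s3://bucket/prefix/")
    sizes = wr.s3.size_objects(paths)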
--- awswrangler/db.py | 3 +- awswrangler/s3.py | 2679 ----------------------------------- awswrangler/s3/__init__.py | 16 + awswrangler/s3/_copy.py | 182 +++ awswrangler/s3/_delete.py | 85 ++ awswrangler/s3/_describe.py | 182 +++ awswrangler/s3/_list.py | 176 +++ awswrangler/s3/_read.py | 885 ++++++++++++ awswrangler/s3/_wait.py | 159 +++ awswrangler/s3/_write.py | 1093 ++++++++++++++ 10 files changed, 2780 insertions(+), 2680 deletions(-) delete mode 100644 awswrangler/s3.py create mode 100644 awswrangler/s3/__init__.py create mode 100644 awswrangler/s3/_copy.py create mode 100644 awswrangler/s3/_delete.py create mode 100644 awswrangler/s3/_describe.py create mode 100644 awswrangler/s3/_list.py create mode 100644 awswrangler/s3/_read.py create mode 100644 awswrangler/s3/_wait.py create mode 100644 awswrangler/s3/_write.py diff --git a/awswrangler/db.py b/awswrangler/db.py index 6d22f13e8..4f696ab2e 100644 --- a/awswrangler/db.py +++ b/awswrangler/db.py @@ -13,6 +13,7 @@ from sqlalchemy.sql.visitors import VisitableType # type: ignore from awswrangler import _data_types, _utils, exceptions, s3 +from awswrangler.s3._list import path2list # noqa _logger: logging.Logger = logging.getLogger(__name__) @@ -655,7 +656,7 @@ def copy_files_to_redshift( # pylint: disable=too-many-locals,too-many-argument """ _varchar_lengths: Dict[str, int] = {} if varchar_lengths is None else varchar_lengths session: boto3.Session = _utils.ensure_session(session=boto3_session) - paths: List[str] = s3._path2list(path=path, boto3_session=session) # pylint: disable=protected-access + paths: List[str] = path2list(path=path, boto3_session=session) # pylint: disable=protected-access manifest_directory = manifest_directory if manifest_directory.endswith("/") else f"{manifest_directory}/" manifest_path: str = f"{manifest_directory}manifest.json" write_redshift_copy_manifest( diff --git a/awswrangler/s3.py b/awswrangler/s3.py deleted file mode 100644 index 5c17b3fbb..000000000 --- a/awswrangler/s3.py +++ /dev/null @@ -1,2679 +0,0 @@ -"""Amazon S3 Module.""" - -import concurrent.futures -import csv -import logging -import time -import uuid -from itertools import repeat -from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union - -import boto3 # type: ignore -import botocore.exceptions # type: ignore -import pandas as pd # type: ignore -import pandas.io.parsers # type: ignore -import pyarrow as pa # type: ignore -import pyarrow.lib # type: ignore -import pyarrow.parquet # type: ignore -import s3fs # type: ignore -from boto3.s3.transfer import TransferConfig # type: ignore -from pandas.io.common import infer_compression # type: ignore - -from awswrangler import _data_types, _utils, catalog, exceptions - -_COMPRESSION_2_EXT: Dict[Optional[str], str] = {None: "", "gzip": ".gz", "snappy": ".snappy"} - -_logger: logging.Logger = logging.getLogger(__name__) - - -def get_bucket_region(bucket: str, boto3_session: Optional[boto3.Session] = None) -> str: - """Get bucket region name. - - Parameters - ---------- - bucket : str - Bucket name. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - str - Region code (e.g. 'us-east-1'). 
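The `path2list` helper imported by `db.py` above resolves whatever the caller passes into a concrete key list. Judging from the `_path2list` implementation removed further down, a string is treated as a prefix to expand, while a list is passed through with an optional suffix filter; the sketch below assumes the renamed helper keeps that signature (bucket and keys hypothetical):

    import boto3
    from awswrangler.s3._list import path2list

    session = boto3.Session()

    # A prefix string is expanded into the matching object keys...
    keys = path2list(path="s3://bucket/prefix/", boto3_session=session)

    # ...while an explicit list is passed through, optionally filtered by suffix.
    keys = path2list(
        path=["s3://bucket/a.parquet", "s3://bucket/b.txt"],
        boto3_session=session,
        suffix=".parquet",
    )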
- - Examples - -------- - Using the default boto3 session - - >>> import awswrangler as wr - >>> region = wr.s3.get_bucket_region('bucket-name') - - Using a custom boto3 session - - >>> import boto3 - >>> import awswrangler as wr - >>> region = wr.s3.get_bucket_region('bucket-name', boto3_session=boto3.Session()) - - """ - client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) - _logger.debug("bucket: %s", bucket) - region: str = client_s3.get_bucket_location(Bucket=bucket)["LocationConstraint"] - region = "us-east-1" if region is None else region - _logger.debug("region: %s", region) - return region - - -def does_object_exist(path: str, boto3_session: Optional[boto3.Session] = None) -> bool: - """Check if object exists on S3. - - Parameters - ---------- - path: str - S3 path (e.g. s3://bucket/key). - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - bool - True if exists, False otherwise. - - Examples - -------- - Using the default boto3 session - - >>> import awswrangler as wr - >>> wr.s3.does_object_exist('s3://bucket/key_real') - True - >>> wr.s3.does_object_exist('s3://bucket/key_unreal') - False - - Using a custom boto3 session - - >>> import boto3 - >>> import awswrangler as wr - >>> wr.s3.does_object_exist('s3://bucket/key_real', boto3_session=boto3.Session()) - True - >>> wr.s3.does_object_exist('s3://bucket/key_unreal', boto3_session=boto3.Session()) - False - - """ - client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) - bucket: str - key: str - bucket, key = path.replace("s3://", "").split("/", 1) - try: - client_s3.head_object(Bucket=bucket, Key=key) - return True - except botocore.exceptions.ClientError as ex: - if ex.response["ResponseMetadata"]["HTTPStatusCode"] == 404: - return False - raise ex # pragma: no cover - - -def list_directories(path: str, boto3_session: Optional[boto3.Session] = None) -> List[str]: - """List Amazon S3 objects from a prefix. - - Parameters - ---------- - path : str - S3 path (e.g. s3://bucket/prefix). - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[str] - List of objects paths. - - Examples - -------- - Using the default boto3 session - - >>> import awswrangler as wr - >>> wr.s3.list_objects('s3://bucket/prefix/') - ['s3://bucket/prefix/dir0', 's3://bucket/prefix/dir1', 's3://bucket/prefix/dir2'] - - Using a custom boto3 session - - >>> import boto3 - >>> import awswrangler as wr - >>> wr.s3.list_objects('s3://bucket/prefix/', boto3_session=boto3.Session()) - ['s3://bucket/prefix/dir0', 's3://bucket/prefix/dir1', 's3://bucket/prefix/dir2'] - - """ - return _list_objects(path=path, delimiter="/", boto3_session=boto3_session) - - -def list_objects(path: str, suffix: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> List[str]: - """List Amazon S3 objects from a prefix. - - Parameters - ---------- - path : str - S3 path (e.g. s3://bucket/prefix). - suffix: str, optional - Suffix for filtering S3 keys. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[str] - List of objects paths. 
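The existence check above hinges on `head_object` treating a 404 as "does not exist" and re-raising anything else. A standalone sketch of that pattern with plain boto3, mirroring the implementation being moved (bucket and key hypothetical):

    import boto3
    import botocore.exceptions

    client_s3 = boto3.client("s3")
    try:
        client_s3.head_object(Bucket="bucket", Key="key_real")
        exists = True
    except botocore.exceptions.ClientError as ex:
        if ex.response["ResponseMetadata"]["HTTPStatusCode"] == 404:
            exists = False  # missing object, not an error
        else:
            raise  # permission or transport problems still bubble up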
- - Examples - -------- - Using the default boto3 session - - >>> import awswrangler as wr - >>> wr.s3.list_objects('s3://bucket/prefix') - ['s3://bucket/prefix0', 's3://bucket/prefix1', 's3://bucket/prefix2'] - - Using a custom boto3 session - - >>> import boto3 - >>> import awswrangler as wr - >>> wr.s3.list_objects('s3://bucket/prefix', boto3_session=boto3.Session()) - ['s3://bucket/prefix0', 's3://bucket/prefix1', 's3://bucket/prefix2'] - - """ - paths: List[str] = _list_objects(path=path, delimiter=None, suffix=suffix, boto3_session=boto3_session) - return [p for p in paths if not p.endswith("/")] - - -def _list_objects( - path: str, - delimiter: Optional[str] = None, - suffix: Optional[str] = None, - boto3_session: Optional[boto3.Session] = None, -) -> List[str]: - bucket: str - prefix: str - bucket, prefix = _utils.parse_path(path=path) - client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) - paginator = client_s3.get_paginator("list_objects_v2") - args: Dict[str, Any] = {"Bucket": bucket, "Prefix": prefix, "PaginationConfig": {"PageSize": 1000}} - if delimiter is not None: - args["Delimiter"] = delimiter - response_iterator = paginator.paginate(**args) - paths: List[str] = [] - for page in response_iterator: # pylint: disable=too-many-nested-blocks - if delimiter is None: - contents: Optional[List] = page.get("Contents") - if contents is not None: - for content in contents: - if (content is not None) and ("Key" in content): - key: str = content["Key"] - if (suffix is None) or key.endswith(suffix): - paths.append(f"s3://{bucket}/{key}") - else: - prefixes: Optional[List[Optional[Dict[str, str]]]] = page.get("CommonPrefixes") - if prefixes is not None: - for pfx in prefixes: - if (pfx is not None) and ("Prefix" in pfx): - key = pfx["Prefix"] - paths.append(f"s3://{bucket}/{key}") - return paths - - -def _path2list(path: object, boto3_session: boto3.Session, suffix: str = None) -> List[str]: - if isinstance(path, str): # prefix - paths: List[str] = list_objects(path=path, suffix=suffix, boto3_session=boto3_session) - elif isinstance(path, list): - paths = path if suffix is None else [x for x in path if x.endswith(suffix)] - else: - raise exceptions.InvalidArgumentType(f"{type(path)} is not a valid path type. Please, use str or List[str].") - return paths - - -def delete_objects( - path: Union[str, List[str]], use_threads: bool = True, boto3_session: Optional[boto3.Session] = None -) -> None: - """Delete Amazon S3 objects from a received S3 prefix or list of S3 objects paths. - - Note - ---- - In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). - - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). - use_threads : bool - True to enable concurrent requests, False to disable multiple threads. - If enabled os.cpu_count() will be used as the max number of threads. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - None - None. 
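Both listing functions above drive the same `list_objects_v2` paginator: `list_directories` passes `Delimiter='/'` and collects the common prefixes, while `list_objects` walks the `Contents` pages and optionally filters by suffix. A trimmed sketch of the plain-key branch (bucket, prefix and suffix hypothetical):

    import boto3

    client_s3 = boto3.client("s3")
    paginator = client_s3.get_paginator("list_objects_v2")
    paths = []
    for page in paginator.paginate(Bucket="bucket", Prefix="prefix/", PaginationConfig={"PageSize": 1000}):
        for content in page.get("Contents", []):
            key = content["Key"]
            if key.endswith(".csv"):  # the optional suffix filter
                paths.append(f"s3://bucket/{key}")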
- - Examples - -------- - >>> import awswrangler as wr - >>> wr.s3.delete_objects(['s3://bucket/key0', 's3://bucket/key1']) # Delete both objects - >>> wr.s3.delete_objects('s3://bucket/prefix') # Delete all objects under the received prefix - - """ - paths: List[str] = _path2list(path=path, boto3_session=boto3_session) - if len(paths) < 1: - return - client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) - buckets: Dict[str, List[str]] = _split_paths_by_bucket(paths=paths) - for bucket, keys in buckets.items(): - chunks: List[List[str]] = _utils.chunkify(lst=keys, max_length=1_000) - if use_threads is False: - for chunk in chunks: - _delete_objects(bucket=bucket, keys=chunk, client_s3=client_s3) - else: - cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) - with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor: - list(executor.map(_delete_objects, repeat(bucket), chunks, repeat(client_s3))) - - -def _split_paths_by_bucket(paths: List[str]) -> Dict[str, List[str]]: - buckets: Dict[str, List[str]] = {} - bucket: str - key: str - for path in paths: - bucket, key = _utils.parse_path(path=path) - if bucket not in buckets: - buckets[bucket] = [] - buckets[bucket].append(key) - return buckets - - -def _delete_objects(bucket: str, keys: List[str], client_s3: boto3.client) -> None: - _logger.debug("len(keys): %s", len(keys)) - batch: List[Dict[str, str]] = [{"Key": key} for key in keys] - res = client_s3.delete_objects(Bucket=bucket, Delete={"Objects": batch}) - deleted = res.get("Deleted") - if deleted is not None: - for i in deleted: - _logger.debug("s3://%s/%s has been deleted.", bucket, i.get("Key")) - errors = res.get("Errors") - if errors is not None: # pragma: no cover - raise exceptions.ServiceApiError(errors) - - -def describe_objects( - path: Union[str, List[str]], - wait_time: Optional[Union[int, float]] = None, - use_threads: bool = True, - boto3_session: Optional[boto3.Session] = None, -) -> Dict[str, Dict[str, Any]]: - """Describe Amazon S3 objects from a received S3 prefix or list of S3 objects paths. - - Fetch attributes like ContentLength, DeleteMarker, LastModified, ContentType, etc - The full list of attributes can be explored under the boto3 head_object documentation: - https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.head_object - - Note - ---- - In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). - - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). - wait_time : Union[int,float], optional - How much time (seconds) should Wrangler try to reach this objects. - Very useful to overcome eventual consistence issues. - `None` means only a single try will be done. - use_threads : bool - True to enable concurrent requests, False to disable multiple threads. - If enabled os.cpu_count() will be used as the max number of threads. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - Dict[str, Dict[str, Any]] - Return a dictionary of objects returned from head_objects where the key is the object path. 
- The response object can be explored here: - https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.head_object - - Examples - -------- - >>> import awswrangler as wr - >>> descs0 = wr.s3.describe_objects(['s3://bucket/key0', 's3://bucket/key1']) # Describe both objects - >>> descs1 = wr.s3.describe_objects('s3://bucket/prefix') # Describe all objects under the prefix - >>> descs2 = wr.s3.describe_objects('s3://bucket/prefix', wait_time=30) # Overcoming eventual consistence issues - - """ - paths: List[str] = _path2list(path=path, boto3_session=boto3_session) - if len(paths) < 1: - return {} - client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) - resp_list: List[Tuple[str, Dict[str, Any]]] - if use_threads is False: - resp_list = [_describe_object(path=p, wait_time=wait_time, client_s3=client_s3) for p in paths] - else: - cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) - with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor: - resp_list = list(executor.map(_describe_object, paths, repeat(wait_time), repeat(client_s3))) - desc_dict: Dict[str, Dict[str, Any]] = dict(resp_list) - return desc_dict - - -def _describe_object( - path: str, wait_time: Optional[Union[int, float]], client_s3: boto3.client -) -> Tuple[str, Dict[str, Any]]: - wait_time = int(wait_time) if isinstance(wait_time, float) else wait_time - tries: int = wait_time if (wait_time is not None) and (wait_time > 0) else 1 - bucket: str - key: str - bucket, key = _utils.parse_path(path=path) - desc: Dict[str, Any] = {} - for i in range(tries, 0, -1): - try: - desc = client_s3.head_object(Bucket=bucket, Key=key) - break - except botocore.exceptions.ClientError as e: # pragma: no cover - if e.response["ResponseMetadata"]["HTTPStatusCode"] == 404: # Not Found - _logger.debug("Object not found. %s seconds remaining to wait.", i) - if i == 1: # Last try, there is no more need to sleep - break - time.sleep(1) - else: - raise e - return path, desc - - -def size_objects( - path: Union[str, List[str]], - wait_time: Optional[Union[int, float]] = None, - use_threads: bool = True, - boto3_session: Optional[boto3.Session] = None, -) -> Dict[str, Optional[int]]: - """Get the size (ContentLength) in bytes of Amazon S3 objects from a received S3 prefix or list of S3 objects paths. - - Note - ---- - In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). - - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). - wait_time : Union[int,float], optional - How much time (seconds) should Wrangler try to reach this objects. - Very useful to overcome eventual consistence issues. - `None` means only a single try will be done. - use_threads : bool - True to enable concurrent requests, False to disable multiple threads. - If enabled os.cpu_count() will be used as the max number of threads. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - Dict[str, Optional[int]] - Dictionary where the key is the object path and the value is the object size. 
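Since `size_objects` is a thin wrapper over `describe_objects` that keeps only `ContentLength`, totalling the bytes under a prefix is a one-liner. A small usage sketch (prefix hypothetical), using `wait_time` to ride out eventual consistency as the docstring suggests:

    import awswrangler as wr

    sizes = wr.s3.size_objects("s3://bucket/prefix", wait_time=30)
    total_bytes = sum(size for size in sizes.values() if size is not None)
    print(f"{len(sizes)} objects, {total_bytes} bytes")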
- - Examples - -------- - >>> import awswrangler as wr - >>> sizes0 = wr.s3.size_objects(['s3://bucket/key0', 's3://bucket/key1']) # Get the sizes of both objects - >>> sizes1 = wr.s3.size_objects('s3://bucket/prefix') # Get the sizes of all objects under the received prefix - >>> sizes2 = wr.s3.size_objects('s3://bucket/prefix', wait_time=30) # Overcoming eventual consistence issues - - """ - desc_list: Dict[str, Dict[str, Any]] = describe_objects( - path=path, wait_time=wait_time, use_threads=use_threads, boto3_session=boto3_session - ) - size_dict: Dict[str, Optional[int]] = {k: d.get("ContentLength", None) for k, d in desc_list.items()} - return size_dict - - -def to_csv( # pylint: disable=too-many-arguments,too-many-locals - df: pd.DataFrame, - path: str, - sep: str = ",", - index: bool = True, - columns: Optional[List[str]] = None, - use_threads: bool = True, - boto3_session: Optional[boto3.Session] = None, - s3_additional_kwargs: Optional[Dict[str, str]] = None, - sanitize_columns: bool = False, - dataset: bool = False, - partition_cols: Optional[List[str]] = None, - mode: Optional[str] = None, - catalog_versioning: bool = False, - database: Optional[str] = None, - table: Optional[str] = None, - dtype: Optional[Dict[str, str]] = None, - description: Optional[str] = None, - parameters: Optional[Dict[str, str]] = None, - columns_comments: Optional[Dict[str, str]] = None, - regular_partitions: bool = True, - projection_enabled: bool = False, - projection_types: Optional[Dict[str, str]] = None, - projection_ranges: Optional[Dict[str, str]] = None, - projection_values: Optional[Dict[str, str]] = None, - projection_intervals: Optional[Dict[str, str]] = None, - projection_digits: Optional[Dict[str, str]] = None, - **pandas_kwargs, -) -> Dict[str, Union[List[str], Dict[str, List[str]]]]: - """Write CSV file or dataset on Amazon S3. - - The concept of Dataset goes beyond the simple idea of files and enable more - complex features like partitioning, casting and catalog integration (Amazon Athena/AWS Glue Catalog). - - Note - ---- - If `dataset=True` The table name and all column names will be automatically sanitized using - `wr.catalog.sanitize_table_name` and `wr.catalog.sanitize_column_name`. - Please, pass `sanitize_columns=True` to force the same behaviour for `dataset=False`. - - Note - ---- - On `append` mode, the `parameters` will be upsert on an existing table. - - Note - ---- - In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). - - Parameters - ---------- - df: pandas.DataFrame - Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html - path : str - Amazon S3 path (e.g. s3://bucket/filename.csv). - sep : str - String of length 1. Field delimiter for the output file. - index : bool - Write row names (index). - columns : List[str], optional - Columns to write. - use_threads : bool - True to enable concurrent requests, False to disable multiple threads. - If enabled os.cpu_count() will be used as the max number of threads. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 Session will be used if boto3_session receive None. - s3_additional_kwargs: - Forward to s3fs, useful for server side encryption - https://s3fs.readthedocs.io/en/latest/#serverside-encryption - sanitize_columns : bool - True to sanitize columns names or False to keep it as is. - True value is forced if `dataset=True`. - dataset : bool - If True store a parquet dataset instead of a single file. 
- If True, enable all follow arguments: - partition_cols, mode, database, table, description, parameters, columns_comments, . - partition_cols: List[str], optional - List of column names that will be used to create partitions. Only takes effect if dataset=True. - mode : str, optional - ``append`` (Default), ``overwrite``, ``overwrite_partitions``. Only takes effect if dataset=True. - catalog_versioning : bool - If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. - database : str, optional - Glue/Athena catalog: Database name. - table : str, optional - Glue/Athena catalog: Table name. - dtype : Dict[str, str], optional - Dictionary of columns names and Athena/Glue types to be casted. - Useful when you have columns with undetermined or mixed data types. - (e.g. {'col name': 'bigint', 'col2 name': 'int'}) - description : str, optional - Glue/Athena catalog: Table description - parameters : Dict[str, str], optional - Glue/Athena catalog: Key/value pairs to tag the table. - columns_comments : Dict[str, str], optional - Glue/Athena catalog: - Columns names and the related comments (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}). - regular_partitions : bool - Create regular partitions (Non projected partitions) on Glue Catalog. - Disable when you will work only with Partition Projection. - Keep enabled even when working with projections is useful to keep - Redshift Spectrum working with the regular partitions. - projection_enabled : bool - Enable Partition Projection on Athena (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html) - projection_types : Optional[Dict[str, str]] - Dictionary of partitions names and Athena projections types. - Valid types: "enum", "integer", "date", "injected" - https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html - (e.g. {'col_name': 'enum', 'col2_name': 'integer'}) - projection_ranges: Optional[Dict[str, str]] - Dictionary of partitions names and Athena projections ranges. - https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html - (e.g. {'col_name': '0,10', 'col2_name': '-1,8675309'}) - projection_values: Optional[Dict[str, str]] - Dictionary of partitions names and Athena projections values. - https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html - (e.g. {'col_name': 'A,B,Unknown', 'col2_name': 'foo,boo,bar'}) - projection_intervals: Optional[Dict[str, str]] - Dictionary of partitions names and Athena projections intervals. - https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html - (e.g. {'col_name': '1', 'col2_name': '5'}) - projection_digits: Optional[Dict[str, str]] - Dictionary of partitions names and Athena projections digits. - https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html - (e.g. {'col_name': '1', 'col2_name': '2'}) - pandas_kwargs : - keyword arguments forwarded to pandas.DataFrame.to_csv() - https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html - - Returns - ------- - None - None. - - Examples - -------- - Writing single file - - >>> import awswrangler as wr - >>> import pandas as pd - >>> wr.s3.to_csv( - ... df=pd.DataFrame({'col': [1, 2, 3]}), - ... path='s3://bucket/prefix/my_file.csv', - ... 
) - { - 'paths': ['s3://bucket/prefix/my_file.csv'], - 'partitions_values': {} - } - - Writing single file encrypted with a KMS key - - >>> import awswrangler as wr - >>> import pandas as pd - >>> wr.s3.to_csv( - ... df=pd.DataFrame({'col': [1, 2, 3]}), - ... path='s3://bucket/prefix/my_file.csv', - ... s3_additional_kwargs={ - ... 'ServerSideEncryption': 'aws:kms', - ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' - ... } - ... ) - { - 'paths': ['s3://bucket/prefix/my_file.csv'], - 'partitions_values': {} - } - - Writing partitioned dataset - - >>> import awswrangler as wr - >>> import pandas as pd - >>> wr.s3.to_csv( - ... df=pd.DataFrame({ - ... 'col': [1, 2, 3], - ... 'col2': ['A', 'A', 'B'] - ... }), - ... path='s3://bucket/prefix', - ... dataset=True, - ... partition_cols=['col2'] - ... ) - { - 'paths': ['s3://.../col2=A/x.csv', 's3://.../col2=B/y.csv'], - 'partitions_values: { - 's3://.../col2=A/': ['A'], - 's3://.../col2=B/': ['B'] - } - } - - Writing dataset to S3 with metadata on Athena/Glue Catalog. - - >>> import awswrangler as wr - >>> import pandas as pd - >>> wr.s3.to_csv( - ... df=pd.DataFrame({ - ... 'col': [1, 2, 3], - ... 'col2': ['A', 'A', 'B'] - ... }), - ... path='s3://bucket/prefix', - ... dataset=True, - ... partition_cols=['col2'], - ... database='default', # Athena/Glue database - ... table='my_table' # Athena/Glue table - ... ) - { - 'paths': ['s3://.../col2=A/x.csv', 's3://.../col2=B/y.csv'], - 'partitions_values: { - 's3://.../col2=A/': ['A'], - 's3://.../col2=B/': ['B'] - } - } - - Writing dataset casting empty column data type - - >>> import awswrangler as wr - >>> import pandas as pd - >>> wr.s3.to_csv( - ... df=pd.DataFrame({ - ... 'col': [1, 2, 3], - ... 'col2': ['A', 'A', 'B'], - ... 'col3': [None, None, None] - ... }), - ... path='s3://bucket/prefix', - ... dataset=True, - ... database='default', # Athena/Glue database - ... table='my_table' # Athena/Glue table - ... dtype={'col3': 'date'} - ... ) - { - 'paths': ['s3://.../x.csv'], - 'partitions_values: {} - } - - """ - if (database is None) ^ (table is None): - raise exceptions.InvalidArgumentCombination( - "Please pass database and table arguments to be able to store the metadata into the Athena/Glue Catalog." 
- ) - if df.empty is True: - raise exceptions.EmptyDataFrame() - - partition_cols = partition_cols if partition_cols else [] - dtype = dtype if dtype else {} - partitions_values: Dict[str, List[str]] = {} - - # Sanitize table to respect Athena's standards - if (sanitize_columns is True) or (dataset is True): - df = catalog.sanitize_dataframe_columns_names(df=df) - partition_cols = [catalog.sanitize_column_name(p) for p in partition_cols] - dtype = {catalog.sanitize_column_name(k): v.lower() for k, v in dtype.items()} - df = catalog.drop_duplicated_columns(df=df) - - session: boto3.Session = _utils.ensure_session(session=boto3_session) - fs: s3fs.S3FileSystem = _utils.get_fs(session=session, s3_additional_kwargs=s3_additional_kwargs) - if dataset is False: - if partition_cols: - raise exceptions.InvalidArgumentCombination("Please, pass dataset=True to be able to use partition_cols.") - if mode is not None: - raise exceptions.InvalidArgumentCombination("Please pass dataset=True to be able to use mode.") - if columns_comments: - raise exceptions.InvalidArgumentCombination("Please pass dataset=True to be able to use columns_comments.") - if any(arg is not None for arg in (database, table, description, parameters)): - raise exceptions.InvalidArgumentCombination( - "Please pass dataset=True to be able to use any one of these " - "arguments: database, table, description, parameters, " - "columns_comments." - ) - pandas_kwargs["sep"] = sep - pandas_kwargs["index"] = index - pandas_kwargs["columns"] = columns - _to_text(file_format="csv", df=df, path=path, fs=fs, **pandas_kwargs) - paths = [path] - else: - mode = "append" if mode is None else mode - if columns: - df = df[columns] - if ( - (mode in ("append", "overwrite_partitions")) and (database is not None) and (table is not None) - ): # Fetching Catalog Types - catalog_types: Optional[Dict[str, str]] = catalog.get_table_types( - database=database, table=table, boto3_session=session - ) - if catalog_types is not None: - for k, v in catalog_types.items(): - dtype[k] = v - paths, partitions_values = _to_csv_dataset( - df=df, - path=path, - index=index, - sep=sep, - fs=fs, - use_threads=use_threads, - partition_cols=partition_cols, - dtype=dtype, - mode=mode, - boto3_session=session, - ) - if (database is not None) and (table is not None): - columns_types, partitions_types = _data_types.athena_types_from_pandas_partitioned( - df=df, index=index, partition_cols=partition_cols, dtype=dtype, index_left=True - ) - catalog.create_csv_table( - database=database, - table=table, - path=path, - columns_types=columns_types, - partitions_types=partitions_types, - description=description, - parameters=parameters, - columns_comments=columns_comments, - boto3_session=session, - mode=mode, - catalog_versioning=catalog_versioning, - sep=sep, - projection_enabled=projection_enabled, - projection_types=projection_types, - projection_ranges=projection_ranges, - projection_values=projection_values, - projection_intervals=projection_intervals, - projection_digits=projection_digits, - ) - if partitions_values and (regular_partitions is True): - _logger.debug("partitions_values:\n%s", partitions_values) - catalog.add_csv_partitions( - database=database, table=table, partitions_values=partitions_values, boto3_session=session, sep=sep - ) - return {"paths": paths, "partitions_values": partitions_values} - - -def _to_csv_dataset( - df: pd.DataFrame, - path: str, - index: bool, - sep: str, - fs: s3fs.S3FileSystem, - use_threads: bool, - mode: str, - dtype: Dict[str, str], - 
partition_cols: Optional[List[str]] = None, - boto3_session: Optional[boto3.Session] = None, -) -> Tuple[List[str], Dict[str, List[str]]]: - paths: List[str] = [] - partitions_values: Dict[str, List[str]] = {} - path = path if path[-1] == "/" else f"{path}/" - if mode not in ["append", "overwrite", "overwrite_partitions"]: - raise exceptions.InvalidArgumentValue( - f"{mode} is a invalid mode, please use append, overwrite or overwrite_partitions." - ) - if (mode == "overwrite") or ((mode == "overwrite_partitions") and (not partition_cols)): - delete_objects(path=path, use_threads=use_threads, boto3_session=boto3_session) - df = _data_types.cast_pandas_with_athena_types(df=df, dtype=dtype) - _logger.debug("dtypes: %s", df.dtypes) - if not partition_cols: - file_path: str = f"{path}{uuid.uuid4().hex}.csv" - _to_text( - file_format="csv", - df=df, - path=file_path, - fs=fs, - quoting=csv.QUOTE_NONE, - escapechar="\\", - header=False, - date_format="%Y-%m-%d %H:%M:%S.%f", - index=index, - sep=sep, - ) - paths.append(file_path) - else: - for keys, subgroup in df.groupby(by=partition_cols, observed=True): - subgroup = subgroup.drop(partition_cols, axis="columns") - keys = (keys,) if not isinstance(keys, tuple) else keys - subdir = "/".join([f"{name}={val}" for name, val in zip(partition_cols, keys)]) - prefix: str = f"{path}{subdir}/" - if mode == "overwrite_partitions": - delete_objects(path=prefix, use_threads=use_threads, boto3_session=boto3_session) - file_path = f"{prefix}{uuid.uuid4().hex}.csv" - _to_text( - file_format="csv", - df=subgroup, - path=file_path, - fs=fs, - quoting=csv.QUOTE_NONE, - escapechar="\\", - header=False, - date_format="%Y-%m-%d %H:%M:%S.%f", - index=index, - sep=sep, - ) - paths.append(file_path) - partitions_values[prefix] = [str(k) for k in keys] - return paths, partitions_values - - -def to_json( - df: pd.DataFrame, - path: str, - boto3_session: Optional[boto3.Session] = None, - s3_additional_kwargs: Optional[Dict[str, str]] = None, - **pandas_kwargs, -) -> None: - """Write JSON file on Amazon S3. - - Parameters - ---------- - df: pandas.DataFrame - Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html - path : str - Amazon S3 path (e.g. s3://bucket/filename.csv). - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 Session will be used if boto3_session receive None. - s3_additional_kwargs: - Forward to s3fs, useful for server side encryption - https://s3fs.readthedocs.io/en/latest/#serverside-encryption - pandas_kwargs: - keyword arguments forwarded to pandas.DataFrame.to_csv() - https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html - - Returns - ------- - None - None. - - Examples - -------- - Writing JSON file - - >>> import awswrangler as wr - >>> import pandas as pd - >>> wr.s3.to_json( - ... df=pd.DataFrame({'col': [1, 2, 3]}), - ... path='s3://bucket/filename.json', - ... ) - - Writing CSV file encrypted with a KMS key - - >>> import awswrangler as wr - >>> import pandas as pd - >>> wr.s3.to_json( - ... df=pd.DataFrame({'col': [1, 2, 3]}), - ... path='s3://bucket/filename.json', - ... s3_additional_kwargs={ - ... 'ServerSideEncryption': 'aws:kms', - ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' - ... } - ... 
) - - """ - return _to_text( - file_format="json", - df=df, - path=path, - boto3_session=boto3_session, - s3_additional_kwargs=s3_additional_kwargs, - **pandas_kwargs, - ) - - -def _to_text( - file_format: str, - df: pd.DataFrame, - path: str, - fs: Optional[s3fs.S3FileSystem] = None, - boto3_session: Optional[boto3.Session] = None, - s3_additional_kwargs: Optional[Dict[str, str]] = None, - **pandas_kwargs, -) -> None: - if df.empty is True: # pragma: no cover - raise exceptions.EmptyDataFrame() - if fs is None: - fs = _utils.get_fs(session=boto3_session, s3_additional_kwargs=s3_additional_kwargs) - encoding: Optional[str] = pandas_kwargs.get("encoding", None) - newline: Optional[str] = pandas_kwargs.get("line_terminator", None) - with fs.open(path=path, mode="w", encoding=encoding, newline=newline) as f: - if file_format == "csv": - df.to_csv(f, **pandas_kwargs) - elif file_format == "json": - df.to_json(f, **pandas_kwargs) - - -def to_parquet( # pylint: disable=too-many-arguments,too-many-locals - df: pd.DataFrame, - path: str, - index: bool = False, - compression: Optional[str] = "snappy", - use_threads: bool = True, - boto3_session: Optional[boto3.Session] = None, - s3_additional_kwargs: Optional[Dict[str, str]] = None, - sanitize_columns: bool = False, - dataset: bool = False, - partition_cols: Optional[List[str]] = None, - mode: Optional[str] = None, - catalog_versioning: bool = False, - database: Optional[str] = None, - table: Optional[str] = None, - dtype: Optional[Dict[str, str]] = None, - description: Optional[str] = None, - parameters: Optional[Dict[str, str]] = None, - columns_comments: Optional[Dict[str, str]] = None, - regular_partitions: bool = True, - projection_enabled: bool = False, - projection_types: Optional[Dict[str, str]] = None, - projection_ranges: Optional[Dict[str, str]] = None, - projection_values: Optional[Dict[str, str]] = None, - projection_intervals: Optional[Dict[str, str]] = None, - projection_digits: Optional[Dict[str, str]] = None, -) -> Dict[str, Union[List[str], Dict[str, List[str]]]]: - """Write Parquet file or dataset on Amazon S3. - - The concept of Dataset goes beyond the simple idea of files and enable more - complex features like partitioning, casting and catalog integration (Amazon Athena/AWS Glue Catalog). - - Note - ---- - If `dataset=True` The table name and all column names will be automatically sanitized using - `wr.catalog.sanitize_table_name` and `wr.catalog.sanitize_column_name`. - Please, pass `sanitize_columns=True` to force the same behaviour for `dataset=False`. - - Note - ---- - On `append` mode, the `parameters` will be upsert on an existing table. - - Note - ---- - In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). - - Parameters - ---------- - df: pandas.DataFrame - Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html - path : str - S3 path (for file e.g. ``s3://bucket/prefix/filename.parquet``) (for dataset e.g. ``s3://bucket/prefix``). - index : bool - True to store the DataFrame index in file, otherwise False to ignore it. - compression: str, optional - Compression style (``None``, ``snappy``, ``gzip``). - use_threads : bool - True to enable concurrent requests, False to disable multiple threads. - If enabled os.cpu_count() will be used as the max number of threads. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
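`to_json` and the single-file branch of `to_csv` both funnel into the private `_to_text` helper above, which simply opens an s3fs file handle and lets pandas do the serialization. A rough standalone equivalent (path hypothetical; the JSON keyword arguments stand in for whatever `pandas_kwargs` a caller forwards):

    import pandas as pd
    import s3fs

    df = pd.DataFrame({"col": [1, 2, 3]})
    fs = s3fs.S3FileSystem()

    # Open the target key as a text file and delegate to the pandas writer.
    with fs.open("s3://bucket/prefix/file.json", "w") as f:
        df.to_json(f, orient="records", lines=True)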
- s3_additional_kwargs: - Forward to s3fs, useful for server side encryption - https://s3fs.readthedocs.io/en/latest/#serverside-encryption - sanitize_columns : bool - True to sanitize columns names or False to keep it as is. - True value is forced if `dataset=True`. - dataset : bool - If True store a parquet dataset instead of a single file. - If True, enable all follow arguments: - partition_cols, mode, database, table, description, parameters, columns_comments, . - partition_cols: List[str], optional - List of column names that will be used to create partitions. Only takes effect if dataset=True. - mode: str, optional - ``append`` (Default), ``overwrite``, ``overwrite_partitions``. Only takes effect if dataset=True. - catalog_versioning : bool - If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. - database : str, optional - Glue/Athena catalog: Database name. - table : str, optional - Glue/Athena catalog: Table name. - dtype : Dict[str, str], optional - Dictionary of columns names and Athena/Glue types to be casted. - Useful when you have columns with undetermined or mixed data types. - (e.g. {'col name': 'bigint', 'col2 name': 'int'}) - description : str, optional - Glue/Athena catalog: Table description - parameters : Dict[str, str], optional - Glue/Athena catalog: Key/value pairs to tag the table. - columns_comments : Dict[str, str], optional - Glue/Athena catalog: - Columns names and the related comments (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}). - regular_partitions : bool - Create regular partitions (Non projected partitions) on Glue Catalog. - Disable when you will work only with Partition Projection. - Keep enabled even when working with projections is useful to keep - Redshift Spectrum working with the regular partitions. - projection_enabled : bool - Enable Partition Projection on Athena (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html) - projection_types : Optional[Dict[str, str]] - Dictionary of partitions names and Athena projections types. - Valid types: "enum", "integer", "date", "injected" - https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html - (e.g. {'col_name': 'enum', 'col2_name': 'integer'}) - projection_ranges: Optional[Dict[str, str]] - Dictionary of partitions names and Athena projections ranges. - https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html - (e.g. {'col_name': '0,10', 'col2_name': '-1,8675309'}) - projection_values: Optional[Dict[str, str]] - Dictionary of partitions names and Athena projections values. - https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html - (e.g. {'col_name': 'A,B,Unknown', 'col2_name': 'foo,boo,bar'}) - projection_intervals: Optional[Dict[str, str]] - Dictionary of partitions names and Athena projections intervals. - https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html - (e.g. {'col_name': '1', 'col2_name': '5'}) - projection_digits: Optional[Dict[str, str]] - Dictionary of partitions names and Athena projections digits. - https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html - (e.g. {'col_name': '1', 'col2_name': '2'}) - - Returns - ------- - Dict[str, Union[List[str], Dict[str, List[str]]]] - Dictionary with: - 'paths': List of all stored files paths on S3. 
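Pulling those arguments together, a hedged example of the dataset branch of `to_parquet` (bucket and names hypothetical): partitioning, gzip compression and an explicit Athena type for the all-None column, the same combination the docstring examples further down walk through one piece at a time.

    import awswrangler as wr
    import pandas as pd

    wr.s3.to_parquet(
        df=pd.DataFrame({"col": [1, 2, 3], "col2": ["A", "A", "B"], "col3": [None, None, None]}),
        path="s3://bucket/prefix",
        dataset=True,
        partition_cols=["col2"],
        compression="gzip",
        dtype={"col3": "date"},  # cast the empty column explicitly
        mode="overwrite",
    )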
- 'partitions_values': Dictionary of partitions added with keys as S3 path locations - and values as a list of partitions values as str. - - Examples - -------- - Writing single file - - >>> import awswrangler as wr - >>> import pandas as pd - >>> wr.s3.to_parquet( - ... df=pd.DataFrame({'col': [1, 2, 3]}), - ... path='s3://bucket/prefix/my_file.parquet', - ... ) - { - 'paths': ['s3://bucket/prefix/my_file.parquet'], - 'partitions_values': {} - } - - Writing single file encrypted with a KMS key - - >>> import awswrangler as wr - >>> import pandas as pd - >>> wr.s3.to_parquet( - ... df=pd.DataFrame({'col': [1, 2, 3]}), - ... path='s3://bucket/prefix/my_file.parquet', - ... s3_additional_kwargs={ - ... 'ServerSideEncryption': 'aws:kms', - ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' - ... } - ... ) - { - 'paths': ['s3://bucket/prefix/my_file.parquet'], - 'partitions_values': {} - } - - Writing partitioned dataset - - >>> import awswrangler as wr - >>> import pandas as pd - >>> wr.s3.to_parquet( - ... df=pd.DataFrame({ - ... 'col': [1, 2, 3], - ... 'col2': ['A', 'A', 'B'] - ... }), - ... path='s3://bucket/prefix', - ... dataset=True, - ... partition_cols=['col2'] - ... ) - { - 'paths': ['s3://.../col2=A/x.parquet', 's3://.../col2=B/y.parquet'], - 'partitions_values: { - 's3://.../col2=A/': ['A'], - 's3://.../col2=B/': ['B'] - } - } - - Writing dataset to S3 with metadata on Athena/Glue Catalog. - - >>> import awswrangler as wr - >>> import pandas as pd - >>> wr.s3.to_parquet( - ... df=pd.DataFrame({ - ... 'col': [1, 2, 3], - ... 'col2': ['A', 'A', 'B'] - ... }), - ... path='s3://bucket/prefix', - ... dataset=True, - ... partition_cols=['col2'], - ... database='default', # Athena/Glue database - ... table='my_table' # Athena/Glue table - ... ) - { - 'paths': ['s3://.../col2=A/x.parquet', 's3://.../col2=B/y.parquet'], - 'partitions_values: { - 's3://.../col2=A/': ['A'], - 's3://.../col2=B/': ['B'] - } - } - - Writing dataset casting empty column data type - - >>> import awswrangler as wr - >>> import pandas as pd - >>> wr.s3.to_parquet( - ... df=pd.DataFrame({ - ... 'col': [1, 2, 3], - ... 'col2': ['A', 'A', 'B'], - ... 'col3': [None, None, None] - ... }), - ... path='s3://bucket/prefix', - ... dataset=True, - ... database='default', # Athena/Glue database - ... table='my_table' # Athena/Glue table - ... dtype={'col3': 'date'} - ... ) - { - 'paths': ['s3://.../x.parquet'], - 'partitions_values: {} - } - - """ - if (database is None) ^ (table is None): - raise exceptions.InvalidArgumentCombination( - "Please pass database and table arguments to be able to store the metadata into the Athena/Glue Catalog." 
- ) - if df.empty is True: - raise exceptions.EmptyDataFrame() - - partition_cols = partition_cols if partition_cols else [] - dtype = dtype if dtype else {} - partitions_values: Dict[str, List[str]] = {} - - # Sanitize table to respect Athena's standards - if (sanitize_columns is True) or (dataset is True): - df = catalog.sanitize_dataframe_columns_names(df=df) - partition_cols = [catalog.sanitize_column_name(p) for p in partition_cols] - dtype = {catalog.sanitize_column_name(k): v.lower() for k, v in dtype.items()} - df = catalog.drop_duplicated_columns(df=df) - - session: boto3.Session = _utils.ensure_session(session=boto3_session) - cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) - fs: s3fs.S3FileSystem = _utils.get_fs(session=session, s3_additional_kwargs=s3_additional_kwargs) - compression_ext: Optional[str] = _COMPRESSION_2_EXT.get(compression, None) - if compression_ext is None: - raise exceptions.InvalidCompression(f"{compression} is invalid, please use None, snappy or gzip.") - if dataset is False: - if path.endswith("/"): # pragma: no cover - raise exceptions.InvalidArgumentValue( - "If , the argument should be a object path, not a directory." - ) - if partition_cols: - raise exceptions.InvalidArgumentCombination("Please, pass dataset=True to be able to use partition_cols.") - if mode is not None: - raise exceptions.InvalidArgumentCombination("Please pass dataset=True to be able to use mode.") - if any(arg is not None for arg in (database, table, description, parameters)): - raise exceptions.InvalidArgumentCombination( - "Please pass dataset=True to be able to use any one of these " - "arguments: database, table, description, parameters, " - "columns_comments." - ) - df = _data_types.cast_pandas_with_athena_types(df=df, dtype=dtype) - schema: pa.Schema = _data_types.pyarrow_schema_from_pandas( - df=df, index=index, ignore_cols=partition_cols, dtype=dtype - ) - _logger.debug("schema: \n%s", schema) - paths = [ - _to_parquet_file( - df=df, path=path, schema=schema, index=index, compression=compression, cpus=cpus, fs=fs, dtype=dtype - ) - ] - else: - mode = "append" if mode is None else mode - if ( - (mode in ("append", "overwrite_partitions")) and (database is not None) and (table is not None) - ): # Fetching Catalog Types - catalog_types: Optional[Dict[str, str]] = catalog.get_table_types( - database=database, table=table, boto3_session=session - ) - if catalog_types is not None: - for k, v in catalog_types.items(): - dtype[k] = v - paths, partitions_values = _to_parquet_dataset( - df=df, - path=path, - index=index, - compression=compression, - compression_ext=compression_ext, - cpus=cpus, - fs=fs, - use_threads=use_threads, - partition_cols=partition_cols, - dtype=dtype, - mode=mode, - boto3_session=session, - ) - if (database is not None) and (table is not None): - columns_types, partitions_types = _data_types.athena_types_from_pandas_partitioned( - df=df, index=index, partition_cols=partition_cols, dtype=dtype - ) - catalog.create_parquet_table( - database=database, - table=table, - path=path, - columns_types=columns_types, - partitions_types=partitions_types, - compression=compression, - description=description, - parameters=parameters, - columns_comments=columns_comments, - boto3_session=session, - mode=mode, - catalog_versioning=catalog_versioning, - projection_enabled=projection_enabled, - projection_types=projection_types, - projection_ranges=projection_ranges, - projection_values=projection_values, - projection_intervals=projection_intervals, - 
projection_digits=projection_digits, - ) - if partitions_values and (regular_partitions is True): - _logger.debug("partitions_values:\n%s", partitions_values) - catalog.add_parquet_partitions( - database=database, - table=table, - partitions_values=partitions_values, - compression=compression, - boto3_session=session, - ) - return {"paths": paths, "partitions_values": partitions_values} - - -def _to_parquet_dataset( - df: pd.DataFrame, - path: str, - index: bool, - compression: Optional[str], - compression_ext: str, - cpus: int, - fs: s3fs.S3FileSystem, - use_threads: bool, - mode: str, - dtype: Dict[str, str], - partition_cols: Optional[List[str]] = None, - boto3_session: Optional[boto3.Session] = None, -) -> Tuple[List[str], Dict[str, List[str]]]: - paths: List[str] = [] - partitions_values: Dict[str, List[str]] = {} - path = path if path[-1] == "/" else f"{path}/" - if mode not in ["append", "overwrite", "overwrite_partitions"]: - raise exceptions.InvalidArgumentValue( - f"{mode} is a invalid mode, please use append, overwrite or overwrite_partitions." - ) - if (mode == "overwrite") or ((mode == "overwrite_partitions") and (not partition_cols)): - delete_objects(path=path, use_threads=use_threads, boto3_session=boto3_session) - df = _data_types.cast_pandas_with_athena_types(df=df, dtype=dtype) - schema: pa.Schema = _data_types.pyarrow_schema_from_pandas( - df=df, index=index, ignore_cols=partition_cols, dtype=dtype - ) - _logger.debug("schema: \n%s", schema) - if not partition_cols: - file_path: str = f"{path}{uuid.uuid4().hex}{compression_ext}.parquet" - _to_parquet_file( - df=df, schema=schema, path=file_path, index=index, compression=compression, cpus=cpus, fs=fs, dtype=dtype - ) - paths.append(file_path) - else: - for keys, subgroup in df.groupby(by=partition_cols, observed=True): - subgroup = subgroup.drop(partition_cols, axis="columns") - keys = (keys,) if not isinstance(keys, tuple) else keys - subdir = "/".join([f"{name}={val}" for name, val in zip(partition_cols, keys)]) - prefix: str = f"{path}{subdir}/" - if mode == "overwrite_partitions": - delete_objects(path=prefix, use_threads=use_threads, boto3_session=boto3_session) - file_path = f"{prefix}{uuid.uuid4().hex}{compression_ext}.parquet" - _to_parquet_file( - df=subgroup, - schema=schema, - path=file_path, - index=index, - compression=compression, - cpus=cpus, - fs=fs, - dtype=dtype, - ) - paths.append(file_path) - partitions_values[prefix] = [str(k) for k in keys] - return paths, partitions_values - - -def _to_parquet_file( - df: pd.DataFrame, - path: str, - schema: pa.Schema, - index: bool, - compression: Optional[str], - cpus: int, - fs: s3fs.S3FileSystem, - dtype: Dict[str, str], -) -> str: - table: pa.Table = pyarrow.Table.from_pandas(df=df, schema=schema, nthreads=cpus, preserve_index=index, safe=True) - for col_name, col_type in dtype.items(): - if col_name in table.column_names: - col_index = table.column_names.index(col_name) - pyarrow_dtype = _data_types.athena2pyarrow(col_type) - field = pa.field(name=col_name, type=pyarrow_dtype) - table = table.set_column(col_index, field, table.column(col_name).cast(pyarrow_dtype)) - _logger.debug("Casting column %s (%s) to %s (%s)", col_name, col_index, col_type, pyarrow_dtype) - pyarrow.parquet.write_table( - table=table, - where=path, - write_statistics=True, - use_dictionary=True, - filesystem=fs, - coerce_timestamps="ms", - compression=compression, - flavor="spark", - ) - return path - - -def read_csv( - path: Union[str, List[str]], - use_threads: bool = True, - 
boto3_session: Optional[boto3.Session] = None, - s3_additional_kwargs: Optional[Dict[str, str]] = None, - chunksize: Optional[int] = None, - dataset: bool = False, - **pandas_kwargs, -) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: - """Read CSV file(s) from from a received S3 prefix or list of S3 objects paths. - - Note - ---- - For partial and gradual reading use the argument ``chunksize`` instead of ``iterator``. - - Note - ---- - In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). - - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. ``[s3://bucket/key0, s3://bucket/key1]``). - use_threads : bool - True to enable concurrent requests, False to disable multiple threads. - If enabled os.cpu_count() will be used as the max number of threads. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - s3_additional_kwargs: - Forward to s3fs, useful for server side encryption - https://s3fs.readthedocs.io/en/latest/#serverside-encryption - chunksize: int, optional - If specified, return an generator where chunksize is the number of rows to include in each chunk. - dataset: bool - If `True` read a CSV dataset instead of simple file(s) loading all the related partitions as columns. - pandas_kwargs: - keyword arguments forwarded to pandas.read_csv(). - https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html - - Returns - ------- - Union[pandas.DataFrame, Generator[pandas.DataFrame, None, None]] - Pandas DataFrame or a Generator in case of `chunksize != None`. - - Examples - -------- - Reading all CSV files under a prefix - - >>> import awswrangler as wr - >>> df = wr.s3.read_csv(path='s3://bucket/prefix/') - - Reading all CSV files under a prefix encrypted with a KMS key - - >>> import awswrangler as wr - >>> df = wr.s3.read_csv( - ... path='s3://bucket/prefix/', - ... s3_additional_kwargs={ - ... 'ServerSideEncryption': 'aws:kms', - ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' - ... } - ... ) - - Reading all CSV files from a list - - >>> import awswrangler as wr - >>> df = wr.s3.read_csv(path=['s3://bucket/filename0.csv', 's3://bucket/filename1.csv']) - - Reading in chunks of 100 lines - - >>> import awswrangler as wr - >>> dfs = wr.s3.read_csv(path=['s3://bucket/filename0.csv', 's3://bucket/filename1.csv'], chunksize=100) - >>> for df in dfs: - >>> print(df) # 100 lines Pandas DataFrame - - """ - return _read_text( - parser_func=pd.read_csv, - path=path, - use_threads=use_threads, - boto3_session=boto3_session, - s3_additional_kwargs=s3_additional_kwargs, - chunksize=chunksize, - dataset=dataset, - **pandas_kwargs, - ) - - -def read_fwf( - path: Union[str, List[str]], - use_threads: bool = True, - boto3_session: Optional[boto3.Session] = None, - s3_additional_kwargs: Optional[Dict[str, str]] = None, - chunksize: Optional[int] = None, - dataset: bool = False, - **pandas_kwargs, -) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: - """Read fixed-width formatted file(s) from from a received S3 prefix or list of S3 objects paths. - - Note - ---- - For partial and gradual reading use the argument ``chunksize`` instead of ``iterator``. - - Note - ---- - In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). - - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. 
s3://bucket/prefix) or list of S3 objects paths (e.g. ``[s3://bucket/key0, s3://bucket/key1]``). - use_threads : bool - True to enable concurrent requests, False to disable multiple threads. - If enabled os.cpu_count() will be used as the max number of threads. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - s3_additional_kwargs: - Forward to s3fs, useful for server side encryption - https://s3fs.readthedocs.io/en/latest/#serverside-encryption - chunksize: int, optional - If specified, return an generator where chunksize is the number of rows to include in each chunk. - dataset: bool - If `True` read a FWF dataset instead of simple file(s) loading all the related partitions as columns. - pandas_kwargs: - keyword arguments forwarded to pandas.read_fwf(). - https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_fwf.html - - Returns - ------- - Union[pandas.DataFrame, Generator[pandas.DataFrame, None, None]] - Pandas DataFrame or a Generator in case of `chunksize != None`. - - Examples - -------- - Reading all fixed-width formatted (FWF) files under a prefix - - >>> import awswrangler as wr - >>> df = wr.s3.read_fwf(path='s3://bucket/prefix/') - - Reading all fixed-width formatted (FWF) files under a prefix encrypted with a KMS key - - >>> import awswrangler as wr - >>> df = wr.s3.read_fwf( - ... path='s3://bucket/prefix/', - ... s3_additional_kwargs={ - ... 'ServerSideEncryption': 'aws:kms', - ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' - ... } - ... ) - - Reading all fixed-width formatted (FWF) files from a list - - >>> import awswrangler as wr - >>> df = wr.s3.read_fwf(path=['s3://bucket/filename0.txt', 's3://bucket/filename1.txt']) - - Reading in chunks of 100 lines - - >>> import awswrangler as wr - >>> dfs = wr.s3.read_fwf(path=['s3://bucket/filename0.txt', 's3://bucket/filename1.txt'], chunksize=100) - >>> for df in dfs: - >>> print(df) # 100 lines Pandas DataFrame - - """ - return _read_text( - parser_func=pd.read_fwf, - path=path, - use_threads=use_threads, - boto3_session=boto3_session, - s3_additional_kwargs=s3_additional_kwargs, - chunksize=chunksize, - dataset=dataset, - **pandas_kwargs, - ) - - -def read_json( - path: Union[str, List[str]], - use_threads: bool = True, - boto3_session: Optional[boto3.Session] = None, - s3_additional_kwargs: Optional[Dict[str, str]] = None, - chunksize: Optional[int] = None, - dataset: bool = False, - **pandas_kwargs, -) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: - """Read JSON file(s) from from a received S3 prefix or list of S3 objects paths. - - Note - ---- - For partial and gradual reading use the argument ``chunksize`` instead of ``iterator``. - - Note - ---- - In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). - - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. ``[s3://bucket/key0, s3://bucket/key1]``). - use_threads : bool - True to enable concurrent requests, False to disable multiple threads. - If enabled os.cpu_count() will be used as the max number of threads. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
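On the read side the `dataset` flag mirrors the writers: partition values encoded in the key layout (e.g. `.../col2=A/`) are rebuilt as regular columns. A hedged sketch with `read_csv` (layout hypothetical, assuming files produced by the dataset writer, which emits them without a header row):

    import awswrangler as wr

    df = wr.s3.read_csv(
        path="s3://bucket/prefix/",
        dataset=True,
        header=None,
        names=["col"],  # dataset CSV files are written header-less
    )
    print(df["col2"].unique())  # partition values recovered from the key prefixes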
- s3_additional_kwargs: - Forward to s3fs, useful for server side encryption - https://s3fs.readthedocs.io/en/latest/#serverside-encryption - chunksize: int, optional - If specified, return an generator where chunksize is the number of rows to include in each chunk. - dataset: bool - If `True` read a JSON dataset instead of simple file(s) loading all the related partitions as columns. - If `True`, the `lines=True` will be assumed by default. - pandas_kwargs: - keyword arguments forwarded to pandas.read_json(). - https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_json.html - - Returns - ------- - Union[pandas.DataFrame, Generator[pandas.DataFrame, None, None]] - Pandas DataFrame or a Generator in case of `chunksize != None`. - - Examples - -------- - Reading all JSON files under a prefix - - >>> import awswrangler as wr - >>> df = wr.s3.read_json(path='s3://bucket/prefix/') - - Reading all JSON files under a prefix encrypted with a KMS key - - >>> import awswrangler as wr - >>> df = wr.s3.read_json( - ... path='s3://bucket/prefix/', - ... s3_additional_kwargs={ - ... 'ServerSideEncryption': 'aws:kms', - ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' - ... } - ... ) - - Reading all JSON files from a list - - >>> import awswrangler as wr - >>> df = wr.s3.read_json(path=['s3://bucket/filename0.json', 's3://bucket/filename1.json']) - - Reading in chunks of 100 lines - - >>> import awswrangler as wr - >>> dfs = wr.s3.read_json(path=['s3://bucket/filename0.json', 's3://bucket/filename1.json'], chunksize=100) - >>> for df in dfs: - >>> print(df) # 100 lines Pandas DataFrame - - """ - if (dataset is True) and ("lines" not in pandas_kwargs): - pandas_kwargs["lines"] = True - return _read_text( - parser_func=pd.read_json, - path=path, - use_threads=use_threads, - boto3_session=boto3_session, - s3_additional_kwargs=s3_additional_kwargs, - chunksize=chunksize, - dataset=dataset, - **pandas_kwargs, - ) - - -def _read_text( - parser_func: Callable, - path: Union[str, List[str]], - use_threads: bool = True, - boto3_session: Optional[boto3.Session] = None, - s3_additional_kwargs: Optional[Dict[str, str]] = None, - chunksize: Optional[int] = None, - dataset: bool = False, - **pandas_kwargs, -) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: - if "iterator" in pandas_kwargs: - raise exceptions.InvalidArgument("Please, use chunksize instead of iterator.") - session: boto3.Session = _utils.ensure_session(session=boto3_session) - if (dataset is True) and (not isinstance(path, str)): # pragma: no cover - raise exceptions.InvalidArgument("The path argument must be a string Amazon S3 prefix if dataset=True.") - if dataset is True: - path_root: str = str(path) - else: - path_root = "" - paths: List[str] = _path2list(path=path, boto3_session=session) - _logger.debug("paths:\n%s", paths) - if chunksize is not None: - dfs: Iterator[pd.DataFrame] = _read_text_chunksize( - parser_func=parser_func, - paths=paths, - boto3_session=session, - chunksize=chunksize, - pandas_args=pandas_kwargs, - s3_additional_kwargs=s3_additional_kwargs, - dataset=dataset, - path_root=path_root, - ) - return dfs - if use_threads is False: - df: pd.DataFrame = pd.concat( - objs=[ - _read_text_full( - parser_func=parser_func, - path=p, - boto3_session=session, - pandas_args=pandas_kwargs, - s3_additional_kwargs=s3_additional_kwargs, - dataset=dataset, - path_root=path_root, - ) - for p in paths - ], - ignore_index=True, - sort=False, - ) - else: - cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) - with 
concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor: - df = pd.concat( - objs=executor.map( - _read_text_full, - repeat(parser_func), - repeat(path_root), - paths, - repeat(_utils.boto3_to_primitives(boto3_session=session)), # Boto3.Session - repeat(pandas_kwargs), - repeat(s3_additional_kwargs), - repeat(dataset), - ), - ignore_index=True, - sort=False, - ) - return df - - -def _read_text_chunksize( - parser_func: Callable, - path_root: str, - paths: List[str], - boto3_session: boto3.Session, - chunksize: int, - pandas_args: Dict[str, Any], - s3_additional_kwargs: Optional[Dict[str, str]] = None, - dataset: bool = False, -) -> Iterator[pd.DataFrame]: - fs: s3fs.S3FileSystem = _utils.get_fs(session=boto3_session, s3_additional_kwargs=s3_additional_kwargs) - for path in paths: - _logger.debug("path: %s", path) - partitions: Dict[str, Any] = {} - if dataset is True: - partitions = _utils.extract_partitions_from_path(path_root=path_root, path=path) - if pandas_args.get("compression", "infer") == "infer": - pandas_args["compression"] = infer_compression(path, compression="infer") - mode: str = "r" if pandas_args.get("compression") is None else "rb" - with fs.open(path, mode) as f: - reader: pandas.io.parsers.TextFileReader = parser_func(f, chunksize=chunksize, **pandas_args) - for df in reader: - if dataset is True: - for column_name, value in partitions.items(): - df[column_name] = value - yield df - - -def _read_text_full( - parser_func: Callable, - path_root: str, - path: str, - boto3_session: Union[boto3.Session, Dict[str, Optional[str]]], - pandas_args: Dict[str, Any], - s3_additional_kwargs: Optional[Dict[str, str]] = None, - dataset: bool = False, -) -> pd.DataFrame: - fs: s3fs.S3FileSystem = _utils.get_fs(session=boto3_session, s3_additional_kwargs=s3_additional_kwargs) - if pandas_args.get("compression", "infer") == "infer": - pandas_args["compression"] = infer_compression(path, compression="infer") - mode: str = "r" if pandas_args.get("compression") is None else "rb" - encoding: Optional[str] = pandas_args.get("encoding", None) - newline: Optional[str] = pandas_args.get("lineterminator", None) - with fs.open(path=path, mode=mode, encoding=encoding, newline=newline) as f: - df: pd.DataFrame = parser_func(f, **pandas_args) - if dataset is True: - partitions: Dict[str, Any] = _utils.extract_partitions_from_path(path_root=path_root, path=path) - for column_name, value in partitions.items(): - df[column_name] = value - return df - - -def _read_parquet_init( - path: Union[str, List[str]], - filters: Optional[Union[List[Tuple], List[List[Tuple]]]] = None, - categories: List[str] = None, - validate_schema: bool = True, - dataset: bool = False, - use_threads: bool = True, - boto3_session: Optional[boto3.Session] = None, - s3_additional_kwargs: Optional[Dict[str, str]] = None, -) -> pyarrow.parquet.ParquetDataset: - """Encapsulate all initialization before the use of the pyarrow.parquet.ParquetDataset.""" - session: boto3.Session = _utils.ensure_session(session=boto3_session) - if dataset is False: - path_or_paths: Union[str, List[str]] = _path2list(path=path, boto3_session=session) - elif isinstance(path, str): - path_or_paths = path[:-1] if path.endswith("/") else path - else: - path_or_paths = path - _logger.debug("path_or_paths: %s", path_or_paths) - fs: s3fs.S3FileSystem = _utils.get_fs(session=session, s3_additional_kwargs=s3_additional_kwargs) - cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) - data: pyarrow.parquet.ParquetDataset = 
pyarrow.parquet.ParquetDataset( - path_or_paths=path_or_paths, - filesystem=fs, - metadata_nthreads=cpus, - filters=filters, - read_dictionary=categories, - validate_schema=validate_schema, - split_row_groups=False, - use_legacy_dataset=True, - ) - return data - - -def read_parquet( - path: Union[str, List[str]], - filters: Optional[Union[List[Tuple], List[List[Tuple]]]] = None, - columns: Optional[List[str]] = None, - validate_schema: bool = True, - chunked: Union[bool, int] = False, - dataset: bool = False, - categories: List[str] = None, - use_threads: bool = True, - boto3_session: Optional[boto3.Session] = None, - s3_additional_kwargs: Optional[Dict[str, str]] = None, -) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: - """Read Apache Parquet file(s) from from a received S3 prefix or list of S3 objects paths. - - The concept of Dataset goes beyond the simple idea of files and enable more - complex features like partitioning and catalog integration (AWS Glue Catalog). - - Note - ---- - ``Batching`` (`chunked` argument) (Memory Friendly): - - Will anable the function to return a Iterable of DataFrames instead of a regular DataFrame. - - There are two batching strategies on Wrangler: - - - If **chunked=True**, a new DataFrame will be returned for each file in your path/dataset. - - - If **chunked=INTEGER**, Wrangler will iterate on the data by number of rows igual the received INTEGER. - - `P.S.` `chunked=True` if faster and uses less memory while `chunked=INTEGER` is more precise - in number of rows for each Dataframe. - - - Note - ---- - In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). - - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). - filters: Union[List[Tuple], List[List[Tuple]]], optional - List of filters to apply on PARTITION columns (PUSH-DOWN filter), like ``[[('x', '=', 0), ...], ...]``. - Ignored if `dataset=False`. - columns : List[str], optional - Names of columns to read from the file(s). - validate_schema: - Check that individual file schemas are all the same / compatible. Schemas within a - folder prefix should all be the same. Disable if you have schemas that are different - and want to disable this check. - chunked : Union[int, bool] - If passed will split the data in a Iterable of DataFrames (Memory friendly). - If `True` wrangler will iterate on the data by files in the most efficient way without guarantee of chunksize. - If an `INTEGER` is passed Wrangler will iterate on the data by number of rows igual the received INTEGER. - dataset: bool - If `True` read a parquet dataset instead of simple file(s) loading all the related partitions as columns. - categories: List[str], optional - List of columns names that should be returned as pandas.Categorical. - Recommended for memory restricted environments. - use_threads : bool - True to enable concurrent requests, False to disable multiple threads. - If enabled os.cpu_count() will be used as the max number of threads. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - s3_additional_kwargs: - Forward to s3fs, useful for server side encryption - https://s3fs.readthedocs.io/en/latest/#serverside-encryption - - Returns - ------- - Union[pandas.DataFrame, Generator[pandas.DataFrame, None, None]] - Pandas DataFrame or a Generator in case of `chunked=True`. 
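To make the two batching strategies described above concrete: ``chunked=True`` yields roughly one DataFrame per Parquet file, while ``chunked=INTEGER`` re-slices the stream into fixed row counts. A hedged sketch with placeholder paths:

>>> import awswrangler as wr
>>> for df in wr.s3.read_parquet(path='s3://bucket/prefix/', dataset=True, chunked=True):
...     print(df.shape)  # one DataFrame per file, variable number of rows
>>> for df in wr.s3.read_parquet(path='s3://bucket/prefix/', dataset=True, chunked=100_000):
...     print(df.shape)  # 100_000 rows per DataFrame, except possibly the last one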
- - Examples - -------- - Reading all Parquet files under a prefix - - >>> import awswrangler as wr - >>> df = wr.s3.read_parquet(path='s3://bucket/prefix/') - - Reading all Parquet files under a prefix encrypted with a KMS key - - >>> import awswrangler as wr - >>> df = wr.s3.read_parquet( - ... path='s3://bucket/prefix/', - ... s3_additional_kwargs={ - ... 'ServerSideEncryption': 'aws:kms', - ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' - ... } - ... ) - - Reading all Parquet files from a list - - >>> import awswrangler as wr - >>> df = wr.s3.read_parquet(path=['s3://bucket/filename0.parquet', 's3://bucket/filename1.parquet']) - - Reading in chunks (Chunk by file) - - >>> import awswrangler as wr - >>> dfs = wr.s3.read_parquet(path=['s3://bucket/filename0.csv', 's3://bucket/filename1.csv'], chunked=True) - >>> for df in dfs: - >>> print(df) # Smaller Pandas DataFrame - - Reading in chunks (Chunk by 1MM rows) - - >>> import awswrangler as wr - >>> dfs = wr.s3.read_parquet(path=['s3://bucket/filename0.csv', 's3://bucket/filename1.csv'], chunked=1_000_000) - >>> for df in dfs: - >>> print(df) # 1MM Pandas DataFrame - - """ - data: pyarrow.parquet.ParquetDataset = _read_parquet_init( - path=path, - filters=filters, - dataset=dataset, - categories=categories, - validate_schema=validate_schema, - use_threads=use_threads, - boto3_session=boto3_session, - s3_additional_kwargs=s3_additional_kwargs, - ) - _logger.debug("pyarrow.parquet.ParquetDataset initialized.") - if chunked is False: - return _read_parquet( - data=data, columns=columns, categories=categories, use_threads=use_threads, validate_schema=validate_schema - ) - return _read_parquet_chunked( - data=data, columns=columns, categories=categories, chunked=chunked, use_threads=use_threads - ) - - -def _read_parquet( - data: pyarrow.parquet.ParquetDataset, - columns: Optional[List[str]] = None, - categories: List[str] = None, - use_threads: bool = True, - validate_schema: bool = True, -) -> pd.DataFrame: - tables: List[pa.Table] = [] - _logger.debug("Reading pieces...") - for piece in data.pieces: - table: pa.Table = piece.read( - columns=columns, use_threads=use_threads, partitions=data.partitions, use_pandas_metadata=False - ) - _logger.debug("Appending piece in the list...") - tables.append(table) - promote: bool = not validate_schema - _logger.debug("Concating pieces...") - table = pa.lib.concat_tables(tables, promote=promote) - _logger.debug("Converting PyArrow table to Pandas DataFrame...") - return table.to_pandas( - use_threads=use_threads, - split_blocks=True, - self_destruct=True, - integer_object_nulls=False, - date_as_object=True, - ignore_metadata=True, - categories=categories, - types_mapper=_data_types.pyarrow2pandas_extension, - ) - - -def _read_parquet_chunked( - data: pyarrow.parquet.ParquetDataset, - columns: Optional[List[str]] = None, - categories: List[str] = None, - chunked: Union[bool, int] = True, - use_threads: bool = True, -) -> Iterator[pd.DataFrame]: - next_slice: Optional[pd.DataFrame] = None - for piece in data.pieces: - df: pd.DataFrame = _table2df( - table=piece.read( - columns=columns, use_threads=use_threads, partitions=data.partitions, use_pandas_metadata=False - ), - categories=categories, - use_threads=use_threads, - ) - if chunked is True: - yield df - else: - if next_slice is not None: - df = pd.concat(objs=[next_slice, df], ignore_index=True, sort=False) - while len(df.index) >= chunked: - yield df.iloc[:chunked] - df = df.iloc[chunked:] - if df.empty: - next_slice = None - else: - next_slice = df - if 
next_slice is not None: - yield next_slice - - -def _table2df(table: pa.Table, categories: List[str] = None, use_threads: bool = True) -> pd.DataFrame: - return table.to_pandas( - use_threads=use_threads, - split_blocks=True, - self_destruct=True, - integer_object_nulls=False, - date_as_object=True, - ignore_metadata=True, - categories=categories, - types_mapper=_data_types.pyarrow2pandas_extension, - ) - - -def read_parquet_metadata( - path: Union[str, List[str]], - dtype: Optional[Dict[str, str]] = None, - sampling: float = 1.0, - dataset: bool = False, - use_threads: bool = True, - boto3_session: Optional[boto3.Session] = None, -) -> Tuple[Dict[str, str], Optional[Dict[str, str]]]: - """Read Apache Parquet file(s) metadata from from a received S3 prefix or list of S3 objects paths. - - The concept of Dataset goes beyond the simple idea of files and enable more - complex features like partitioning and catalog integration (AWS Glue Catalog). - - Note - ---- - In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). - - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). - dtype : Dict[str, str], optional - Dictionary of columns names and Athena/Glue types to be casted. - Useful when you have columns with undetermined data types as partitions columns. - (e.g. {'col name': 'bigint', 'col2 name': 'int'}) - sampling : float - Random sample ratio of files that will have the metadata inspected. - Must be `0.0 < sampling <= 1.0`. - The higher, the more accurate. - The lower, the faster. - dataset: bool - If True read a parquet dataset instead of simple file(s) loading all the related partitions as columns. - use_threads : bool - True to enable concurrent requests, False to disable multiple threads. - If enabled os.cpu_count() will be used as the max number of threads. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - Tuple[Dict[str, str], Optional[Dict[str, str]]] - columns_types: Dictionary with keys as column names and vales as - data types (e.g. {'col0': 'bigint', 'col1': 'double'}). / - partitions_types: Dictionary with keys as partition names - and values as data types (e.g. {'col2': 'date'}). - - Examples - -------- - Reading all Parquet files (with partitions) metadata under a prefix - - >>> import awswrangler as wr - >>> columns_types, partitions_types = wr.s3.read_parquet_metadata(path='s3://bucket/prefix/', dataset=True) - - Reading all Parquet files metadata from a list - - >>> import awswrangler as wr - >>> columns_types, partitions_types = wr.s3.read_parquet_metadata(path=[ - ... 's3://bucket/filename0.parquet', - ... 's3://bucket/filename1.parquet' - ... 
]) - - """ - return _read_parquet_metadata( - path=path, dtype=dtype, sampling=sampling, dataset=dataset, use_threads=use_threads, boto3_session=boto3_session - )[:2] - - -def _read_parquet_metadata( - path: Union[str, List[str]], - dtype: Optional[Dict[str, str]], - sampling: float, - dataset: bool, - use_threads: bool, - boto3_session: Optional[boto3.Session], -) -> Tuple[Dict[str, str], Optional[Dict[str, str]], Optional[Dict[str, List[str]]]]: - session: boto3.Session = _utils.ensure_session(session=boto3_session) - if dataset is True: - if isinstance(path, str): - _path: Optional[str] = path if path.endswith("/") else f"{path}/" - paths: List[str] = _path2list(path=_path, boto3_session=session) - else: # pragma: no cover - raise exceptions.InvalidArgumentType("Argument must be str if dataset=True.") - else: - if isinstance(path, str): - _path = None - paths = _path2list(path=path, boto3_session=session) - elif isinstance(path, list): - _path = None - paths = path - else: # pragma: no cover - raise exceptions.InvalidArgumentType(f"Argument path must be str or List[str] instead of {type(path)}.") - schemas: List[Dict[str, str]] = [ - _read_parquet_metadata_file(path=x, use_threads=use_threads, boto3_session=session) - for x in _utils.list_sampling(lst=paths, sampling=sampling) - ] - _logger.debug("schemas: %s", schemas) - columns_types: Dict[str, str] = {} - for schema in schemas: - for column, _dtype in schema.items(): - if (column in columns_types) and (columns_types[column] != _dtype): # pragma: no cover - raise exceptions.InvalidSchemaConvergence( - f"Was detect at least 2 different types in column {column} ({columns_types[column]} and {dtype})." - ) - columns_types[column] = _dtype - partitions_types: Optional[Dict[str, str]] = None - partitions_values: Optional[Dict[str, List[str]]] = None - if (dataset is True) and (_path is not None): - partitions_types, partitions_values = _utils.extract_partitions_metadata_from_paths(path=_path, paths=paths) - if dtype: - for k, v in dtype.items(): - if columns_types and k in columns_types: - columns_types[k] = v - if partitions_types and k in partitions_types: - partitions_types[k] = v - _logger.debug("columns_types: %s", columns_types) - return columns_types, partitions_types, partitions_values - - -def _read_parquet_metadata_file(path: str, use_threads: bool, boto3_session: boto3.Session) -> Dict[str, str]: - data: pyarrow.parquet.ParquetDataset = _read_parquet_init( - path=path, filters=None, dataset=False, use_threads=use_threads, boto3_session=boto3_session - ) - return _data_types.athena_types_from_pyarrow_schema(schema=data.schema.to_arrow_schema(), partitions=None)[0] - - -def store_parquet_metadata( # pylint: disable=too-many-arguments - path: str, - database: str, - table: str, - dtype: Optional[Dict[str, str]] = None, - sampling: float = 1.0, - dataset: bool = False, - use_threads: bool = True, - description: Optional[str] = None, - parameters: Optional[Dict[str, str]] = None, - columns_comments: Optional[Dict[str, str]] = None, - compression: Optional[str] = None, - mode: str = "overwrite", - catalog_versioning: bool = False, - regular_partitions: bool = True, - projection_enabled: bool = False, - projection_types: Optional[Dict[str, str]] = None, - projection_ranges: Optional[Dict[str, str]] = None, - projection_values: Optional[Dict[str, str]] = None, - projection_intervals: Optional[Dict[str, str]] = None, - projection_digits: Optional[Dict[str, str]] = None, - boto3_session: Optional[boto3.Session] = None, -) -> 
Tuple[Dict[str, str], Optional[Dict[str, str]], Optional[Dict[str, List[str]]]]: - """Infer and store parquet metadata on AWS Glue Catalog. - - Infer Apache Parquet file(s) metadata from from a received S3 prefix or list of S3 objects paths - And then stores it on AWS Glue Catalog including all inferred partitions - (No need of 'MCSK REPAIR TABLE') - - The concept of Dataset goes beyond the simple idea of files and enable more - complex features like partitioning and catalog integration (AWS Glue Catalog). - - Note - ---- - On `append` mode, the `parameters` will be upsert on an existing table. - - Note - ---- - In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). - - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). - database : str - Glue/Athena catalog: Database name. - table : str - Glue/Athena catalog: Table name. - dtype : Dict[str, str], optional - Dictionary of columns names and Athena/Glue types to be casted. - Useful when you have columns with undetermined data types as partitions columns. - (e.g. {'col name': 'bigint', 'col2 name': 'int'}) - sampling : float - Random sample ratio of files that will have the metadata inspected. - Must be `0.0 < sampling <= 1.0`. - The higher, the more accurate. - The lower, the faster. - dataset: bool - If True read a parquet dataset instead of simple file(s) loading all the related partitions as columns. - use_threads : bool - True to enable concurrent requests, False to disable multiple threads. - If enabled os.cpu_count() will be used as the max number of threads. - description: str, optional - Glue/Athena catalog: Table description - parameters: Dict[str, str], optional - Glue/Athena catalog: Key/value pairs to tag the table. - columns_comments: Dict[str, str], optional - Glue/Athena catalog: - Columns names and the related comments (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}). - compression: str, optional - Compression style (``None``, ``snappy``, ``gzip``, etc). - mode: str - 'overwrite' to recreate any possible existing table or 'append' to keep any possible existing table. - catalog_versioning : bool - If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. - regular_partitions : bool - Create regular partitions (Non projected partitions) on Glue Catalog. - Disable when you will work only with Partition Projection. - Keep enabled even when working with projections is useful to keep - Redshift Spectrum working with the regular partitions. - projection_enabled : bool - Enable Partition Projection on Athena (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html) - projection_types : Optional[Dict[str, str]] - Dictionary of partitions names and Athena projections types. - Valid types: "enum", "integer", "date", "injected" - https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html - (e.g. {'col_name': 'enum', 'col2_name': 'integer'}) - projection_ranges: Optional[Dict[str, str]] - Dictionary of partitions names and Athena projections ranges. - https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html - (e.g. {'col_name': '0,10', 'col2_name': '-1,8675309'}) - projection_values: Optional[Dict[str, str]] - Dictionary of partitions names and Athena projections values. 
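The ``projection_*`` arguments above map onto Athena partition-projection table properties, so they are normally combined with ``dataset=True`` and a partitioned prefix. A hedged sketch of a ``store_parquet_metadata`` call using them; the database, table and the ``year`` partition column are illustrative and not part of this patch:

>>> import awswrangler as wr
>>> columns_types, partitions_types, partitions_values = wr.s3.store_parquet_metadata(
...     path='s3://bucket/dataset/',
...     database='my_database',                  # assumed to already exist in the Glue Catalog
...     table='my_table',
...     dataset=True,
...     projection_enabled=True,
...     projection_types={'year': 'integer'},    # hypothetical partition column
...     projection_ranges={'year': '2015,2025'},
... )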
- https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html - (e.g. {'col_name': 'A,B,Unknown', 'col2_name': 'foo,boo,bar'}) - projection_intervals: Optional[Dict[str, str]] - Dictionary of partitions names and Athena projections intervals. - https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html - (e.g. {'col_name': '1', 'col2_name': '5'}) - projection_digits: Optional[Dict[str, str]] - Dictionary of partitions names and Athena projections digits. - https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html - (e.g. {'col_name': '1', 'col2_name': '2'}) - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - Tuple[Dict[str, str], Optional[Dict[str, str]], Optional[Dict[str, List[str]]]] - The metadata used to create the Glue Table. - columns_types: Dictionary with keys as column names and vales as - data types (e.g. {'col0': 'bigint', 'col1': 'double'}). / - partitions_types: Dictionary with keys as partition names - and values as data types (e.g. {'col2': 'date'}). / - partitions_values: Dictionary with keys as S3 path locations and values as a - list of partitions values as str (e.g. {'s3://bucket/prefix/y=2020/m=10/': ['2020', '10']}). - - Examples - -------- - Reading all Parquet files metadata under a prefix - - >>> import awswrangler as wr - >>> columns_types, partitions_types, partitions_values = wr.s3.store_parquet_metadata( - ... path='s3://bucket/prefix/', - ... database='...', - ... table='...', - ... dataset=True - ... ) - - """ - session: boto3.Session = _utils.ensure_session(session=boto3_session) - columns_types: Dict[str, str] - partitions_types: Optional[Dict[str, str]] - partitions_values: Optional[Dict[str, List[str]]] - columns_types, partitions_types, partitions_values = _read_parquet_metadata( - path=path, dtype=dtype, sampling=sampling, dataset=dataset, use_threads=use_threads, boto3_session=session - ) - _logger.debug("columns_types: %s", columns_types) - _logger.debug("partitions_types: %s", partitions_types) - _logger.debug("partitions_values: %s", partitions_values) - catalog.create_parquet_table( - database=database, - table=table, - path=path, - columns_types=columns_types, - partitions_types=partitions_types, - description=description, - parameters=parameters, - columns_comments=columns_comments, - mode=mode, - catalog_versioning=catalog_versioning, - projection_enabled=projection_enabled, - projection_types=projection_types, - projection_ranges=projection_ranges, - projection_values=projection_values, - projection_intervals=projection_intervals, - projection_digits=projection_digits, - boto3_session=session, - ) - if (partitions_types is not None) and (partitions_values is not None) and (regular_partitions is True): - catalog.add_parquet_partitions( - database=database, - table=table, - partitions_values=partitions_values, - compression=compression, - boto3_session=session, - ) - return columns_types, partitions_types, partitions_values - - -def wait_objects_exist( - paths: List[str], - delay: Optional[Union[int, float]] = None, - max_attempts: Optional[int] = None, - use_threads: bool = True, - boto3_session: Optional[boto3.Session] = None, -) -> None: - """Wait Amazon S3 objects exist. - - Polls S3.Client.head_object() every 5 seconds (default) until a successful - state is reached. An error is returned after 20 (default) failed checks. 
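A typical use of ``wait_objects_exist`` is to call it right after writing, so that follow-up reads do not race S3 listing consistency. A hedged sketch with placeholder object paths, spelling out the 5-second delay and 20 attempts that the docstring names as defaults:

>>> import awswrangler as wr
>>> written = ['s3://bucket/prefix/file0.parquet', 's3://bucket/prefix/file1.parquet']
>>> wr.s3.wait_objects_exist(paths=written, delay=5, max_attempts=20)  # blocks until both objects are visible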
- https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Waiter.ObjectExists - - Note - ---- - In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). - - Parameters - ---------- - paths : List[str] - List of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). - delay : Union[int,float], optional - The amount of time in seconds to wait between attempts. Default: 5 - max_attempts : int, optional - The maximum number of attempts to be made. Default: 20 - use_threads : bool - True to enable concurrent requests, False to disable multiple threads. - If enabled os.cpu_count() will be used as the max number of threads. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - None - None. - - Examples - -------- - >>> import awswrangler as wr - >>> wr.s3.wait_objects_exist(['s3://bucket/key0', 's3://bucket/key1']) # wait both objects - - """ - return _wait_objects( - waiter_name="object_exists", - paths=paths, - delay=delay, - max_attempts=max_attempts, - use_threads=use_threads, - boto3_session=boto3_session, - ) - - -def wait_objects_not_exist( - paths: List[str], - delay: Optional[Union[int, float]] = None, - max_attempts: Optional[int] = None, - use_threads: bool = True, - boto3_session: Optional[boto3.Session] = None, -) -> None: - """Wait Amazon S3 objects not exist. - - Polls S3.Client.head_object() every 5 seconds (default) until a successful - state is reached. An error is returned after 20 (default) failed checks. - https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Waiter.ObjectNotExists - - Note - ---- - In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). - - Parameters - ---------- - paths : List[str] - List of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). - delay : Union[int,float], optional - The amount of time in seconds to wait between attempts. Default: 5 - max_attempts : int, optional - The maximum number of attempts to be made. Default: 20 - use_threads : bool - True to enable concurrent requests, False to disable multiple threads. - If enabled os.cpu_count() will be used as the max number of threads. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - None - None. 
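``wait_objects_not_exist`` is the delete-side counterpart; it pairs naturally with ``delete_objects`` (moved later in this patch series to ``awswrangler/s3/_delete.py``). A hedged sketch with placeholder paths:

>>> import awswrangler as wr
>>> paths = ['s3://bucket/key0', 's3://bucket/key1']
>>> wr.s3.delete_objects(path=paths)
>>> wr.s3.wait_objects_not_exist(paths=paths)  # returns once neither key resolves any more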
- - Examples - -------- - >>> import awswrangler as wr - >>> wr.s3.wait_objects_not_exist(['s3://bucket/key0', 's3://bucket/key1']) # wait both objects not exist - - """ - return _wait_objects( - waiter_name="object_not_exists", - paths=paths, - delay=delay, - max_attempts=max_attempts, - use_threads=use_threads, - boto3_session=boto3_session, - ) - - -def _wait_objects( - waiter_name: str, - paths: List[str], - delay: Optional[Union[int, float]] = None, - max_attempts: Optional[int] = None, - use_threads: bool = True, - boto3_session: Optional[boto3.Session] = None, -) -> None: - delay = 5 if delay is None else delay - max_attempts = 20 if max_attempts is None else max_attempts - _delay: int = int(delay) if isinstance(delay, float) else delay - if len(paths) < 1: - return None - client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) - _paths: List[Tuple[str, str]] = [_utils.parse_path(path=p) for p in paths] - if use_threads is False: - waiter = client_s3.get_waiter(waiter_name) - for bucket, key in _paths: - waiter.wait(Bucket=bucket, Key=key, WaiterConfig={"Delay": _delay, "MaxAttempts": max_attempts}) - else: - cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) - with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor: - list( - executor.map( - _wait_objects_concurrent, - _paths, - repeat(waiter_name), - repeat(client_s3), - repeat(_delay), - repeat(max_attempts), - ) - ) - return None - - -def _wait_objects_concurrent( - path: Tuple[str, str], waiter_name: str, client_s3: boto3.client, delay: int, max_attempts: int -) -> None: - waiter = client_s3.get_waiter(waiter_name) - bucket, key = path - waiter.wait(Bucket=bucket, Key=key, WaiterConfig={"Delay": delay, "MaxAttempts": max_attempts}) - - -def read_parquet_table( - table: str, - database: str, - filters: Optional[Union[List[Tuple], List[List[Tuple]]]] = None, - columns: Optional[List[str]] = None, - categories: List[str] = None, - chunked: Union[bool, int] = False, - use_threads: bool = True, - boto3_session: Optional[boto3.Session] = None, - s3_additional_kwargs: Optional[Dict[str, str]] = None, -) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: - """Read Apache Parquet table registered on AWS Glue Catalog. - - Note - ---- - ``Batching`` (`chunked` argument) (Memory Friendly): - - Will anable the function to return a Iterable of DataFrames instead of a regular DataFrame. - - There are two batching strategies on Wrangler: - - - If **chunked=True**, a new DataFrame will be returned for each file in your path/dataset. - - - If **chunked=INTEGER**, Wrangler will paginate through files slicing and concatenating - to return DataFrames with the number of row igual the received INTEGER. - - `P.S.` `chunked=True` if faster and uses less memory while `chunked=INTEGER` is more precise - in number of rows for each Dataframe. - - - Note - ---- - In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). - - Parameters - ---------- - table : str - AWS Glue Catalog table name. - database : str - AWS Glue Catalog database name. - filters: Union[List[Tuple], List[List[Tuple]]], optional - List of filters to apply, like ``[[('x', '=', 0), ...], ...]``. - columns : List[str], optional - Names of columns to read from the file(s). - categories: List[str], optional - List of columns names that should be returned as pandas.Categorical. - Recommended for memory restricted environments. 
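``_wait_objects`` and ``_wait_objects_concurrent`` above are thin wrappers over boto3's built-in S3 waiters; the single-object equivalent, shown as a sketch with placeholder bucket and key values, is:

>>> import boto3
>>> client_s3 = boto3.client('s3')
>>> waiter = client_s3.get_waiter('object_exists')  # 'object_not_exists' for the inverse check
>>> waiter.wait(Bucket='bucket', Key='key0', WaiterConfig={'Delay': 5, 'MaxAttempts': 20})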
- chunked : bool - If True will break the data in smaller DataFrames (Non deterministic number of lines). - Otherwise return a single DataFrame with the whole data. - use_threads : bool - True to enable concurrent requests, False to disable multiple threads. - If enabled os.cpu_count() will be used as the max number of threads. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - s3_additional_kwargs: - Forward to s3fs, useful for server side encryption - https://s3fs.readthedocs.io/en/latest/#serverside-encryption - - Returns - ------- - Union[pandas.DataFrame, Generator[pandas.DataFrame, None, None]] - Pandas DataFrame or a Generator in case of `chunked=True`. - - Examples - -------- - Reading Parquet Table - - >>> import awswrangler as wr - >>> df = wr.s3.read_parquet_table(database='...', table='...') - - Reading Parquet Table encrypted - - >>> import awswrangler as wr - >>> df = wr.s3.read_parquet_table( - ... database='...', - ... table='...' - ... s3_additional_kwargs={ - ... 'ServerSideEncryption': 'aws:kms', - ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' - ... } - ... ) - - Reading Parquet Table in chunks (Chunk by file) - - >>> import awswrangler as wr - >>> dfs = wr.s3.read_parquet_table(database='...', table='...', chunked=True) - >>> for df in dfs: - >>> print(df) # Smaller Pandas DataFrame - - Reading in chunks (Chunk by 1MM rows) - - >>> import awswrangler as wr - >>> dfs = wr.s3.read_parquet(path=['s3://bucket/filename0.csv', 's3://bucket/filename1.csv'], chunked=1_000_000) - >>> for df in dfs: - >>> print(df) # 1MM Pandas DataFrame - - """ - path: str = catalog.get_table_location(database=database, table=table, boto3_session=boto3_session) - return read_parquet( - path=path, - filters=filters, - columns=columns, - categories=categories, - chunked=chunked, - dataset=True, - use_threads=use_threads, - boto3_session=boto3_session, - s3_additional_kwargs=s3_additional_kwargs, - ) - - -def merge_datasets( - source_path: str, - target_path: str, - mode: str = "append", - use_threads: bool = True, - boto3_session: Optional[boto3.Session] = None, -) -> List[str]: - """Merge a source dataset into a target dataset. - - Note - ---- - If you are merging tables (S3 datasets + Glue Catalog metadata), - remember that you will also need to update your partitions metadata in some cases. - (e.g. wr.athena.repair_table(table='...', database='...')) - - Note - ---- - In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). - - Parameters - ---------- - source_path : str, - S3 Path for the source directory. - target_path : str, - S3 Path for the target directory. - mode: str, optional - ``append`` (Default), ``overwrite``, ``overwrite_partitions``. - use_threads : bool - True to enable concurrent requests, False to disable multiple threads. - If enabled os.cpu_count() will be used as the max number of threads. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[str] - List of new objects paths. - - Examples - -------- - >>> import awswrangler as wr - >>> wr.s3.merge_datasets( - ... source_path="s3://bucket0/dir0/", - ... target_path="s3://bucket1/dir1/", - ... mode="append" - ... 
) - ["s3://bucket1/dir1/key0", "s3://bucket1/dir1/key1"] - - """ - source_path = source_path[:-1] if source_path[-1] == "/" else source_path - target_path = target_path[:-1] if target_path[-1] == "/" else target_path - session: boto3.Session = _utils.ensure_session(session=boto3_session) - - paths: List[str] = list_objects(path=f"{source_path}/", boto3_session=session) - _logger.debug("len(paths): %s", len(paths)) - if len(paths) < 1: - return [] - - if mode == "overwrite": - _logger.debug("Deleting to overwrite: %s/", target_path) - delete_objects(path=f"{target_path}/", use_threads=use_threads, boto3_session=session) - elif mode == "overwrite_partitions": - paths_wo_prefix: List[str] = [x.replace(f"{source_path}/", "") for x in paths] - paths_wo_filename: List[str] = [f"{x.rpartition('/')[0]}/" for x in paths_wo_prefix] - partitions_paths: List[str] = list(set(paths_wo_filename)) - target_partitions_paths = [f"{target_path}/{x}" for x in partitions_paths] - for path in target_partitions_paths: - _logger.debug("Deleting to overwrite_partitions: %s", path) - delete_objects(path=path, use_threads=use_threads, boto3_session=session) - elif mode != "append": - raise exceptions.InvalidArgumentValue(f"{mode} is a invalid mode option.") - - new_objects: List[str] = copy_objects( - paths=paths, source_path=source_path, target_path=target_path, use_threads=use_threads, boto3_session=session - ) - _logger.debug("len(new_objects): %s", len(new_objects)) - return new_objects - - -def copy_objects( - paths: List[str], - source_path: str, - target_path: str, - replace_filenames: Optional[Dict[str, str]] = None, - use_threads: bool = True, - boto3_session: Optional[boto3.Session] = None, -) -> List[str]: - """Copy a list of S3 objects to another S3 directory. - - Note - ---- - In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). - - Parameters - ---------- - paths : List[str] - List of S3 objects paths (e.g. [s3://bucket/dir0/key0, s3://bucket/dir0/key1]). - source_path : str, - S3 Path for the source directory. - target_path : str, - S3 Path for the target directory. - use_threads : bool - True to enable concurrent requests, False to disable multiple threads. - If enabled os.cpu_count() will be used as the max number of threads. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[str] - List of new objects paths. - - Examples - -------- - >>> import awswrangler as wr - >>> wr.s3.copy_objects( - ... paths=["s3://bucket0/dir0/key0", "s3://bucket0/dir0/key1"]) - ... source_path="s3://bucket0/dir0/", - ... target_path="s3://bucket1/dir1/", - ... 
) - ["s3://bucket1/dir1/key0", "s3://bucket1/dir1/key1"] - - """ - _logger.debug("len(paths): %s", len(paths)) - if len(paths) < 1: - return [] - source_path = source_path[:-1] if source_path[-1] == "/" else source_path - target_path = target_path[:-1] if target_path[-1] == "/" else target_path - session: boto3.Session = _utils.ensure_session(session=boto3_session) - batch: List[Tuple[str, str]] = [] - new_objects: List[str] = [] - for path in paths: - path_wo_prefix: str = path.replace(f"{source_path}/", "") - path_final: str = f"{target_path}/{path_wo_prefix}" - if replace_filenames is not None: - parts: List[str] = path_final.rsplit(sep="/", maxsplit=1) - if len(parts) == 2: - path_wo_filename: str = parts[0] - filename: str = parts[1] - if filename in replace_filenames: - new_filename: str = replace_filenames[filename] - _logger.debug("Replacing filename: %s -> %s", filename, new_filename) - path_final = f"{path_wo_filename}/{new_filename}" - new_objects.append(path_final) - batch.append((path, path_final)) - _logger.debug("len(new_objects): %s", len(new_objects)) - _copy_objects(batch=batch, use_threads=use_threads, boto3_session=session) - return new_objects - - -def _copy_objects(batch: List[Tuple[str, str]], use_threads: bool, boto3_session: boto3.Session) -> None: - _logger.debug("len(batch): %s", len(batch)) - client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) - resource_s3: boto3.resource = _utils.resource(service_name="s3", session=boto3_session) - for source, target in batch: - source_bucket, source_key = _utils.parse_path(path=source) - copy_source: Dict[str, str] = {"Bucket": source_bucket, "Key": source_key} - target_bucket, target_key = _utils.parse_path(path=target) - resource_s3.meta.client.copy( - CopySource=copy_source, - Bucket=target_bucket, - Key=target_key, - SourceClient=client_s3, - Config=TransferConfig(num_download_attempts=15, use_threads=use_threads), - ) diff --git a/awswrangler/s3/__init__.py b/awswrangler/s3/__init__.py new file mode 100644 index 000000000..234dbc718 --- /dev/null +++ b/awswrangler/s3/__init__.py @@ -0,0 +1,16 @@ +"""Amazon S3 Read Module.""" + +from awswrangler.s3._copy import copy_objects, merge_datasets # noqa +from awswrangler.s3._delete import delete_objects # noqa +from awswrangler.s3._describe import describe_objects, get_bucket_region, size_objects # noqa +from awswrangler.s3._list import does_object_exist, list_directories, list_objects # noqa +from awswrangler.s3._read import ( # noqa + read_csv, + read_fwf, + read_json, + read_parquet, + read_parquet_metadata, + read_parquet_table, +) +from awswrangler.s3._wait import wait_objects_exist, wait_objects_not_exist # noqa +from awswrangler.s3._write import store_parquet_metadata, to_csv, to_json, to_parquet # noqa diff --git a/awswrangler/s3/_copy.py b/awswrangler/s3/_copy.py new file mode 100644 index 000000000..f8fedd5ca --- /dev/null +++ b/awswrangler/s3/_copy.py @@ -0,0 +1,182 @@ +"""Amazon S3 Copy Module (PRIVATE).""" + +import logging +from typing import Dict, List, Optional, Tuple + +import boto3 # type: ignore +from boto3.s3.transfer import TransferConfig # type: ignore + +from awswrangler import _utils, exceptions +from awswrangler.s3._delete import delete_objects +from awswrangler.s3._list import list_objects + +_logger: logging.Logger = logging.getLogger(__name__) + + +def _copy_objects(batch: List[Tuple[str, str]], use_threads: bool, boto3_session: boto3.Session) -> None: + _logger.debug("len(batch): %s", len(batch)) + client_s3: 
boto3.client = _utils.client(service_name="s3", session=boto3_session) + resource_s3: boto3.resource = _utils.resource(service_name="s3", session=boto3_session) + for source, target in batch: + source_bucket, source_key = _utils.parse_path(path=source) + copy_source: Dict[str, str] = {"Bucket": source_bucket, "Key": source_key} + target_bucket, target_key = _utils.parse_path(path=target) + resource_s3.meta.client.copy( + CopySource=copy_source, + Bucket=target_bucket, + Key=target_key, + SourceClient=client_s3, + Config=TransferConfig(num_download_attempts=15, use_threads=use_threads), + ) + + +def merge_datasets( + source_path: str, + target_path: str, + mode: str = "append", + use_threads: bool = True, + boto3_session: Optional[boto3.Session] = None, +) -> List[str]: + """Merge a source dataset into a target dataset. + + Note + ---- + If you are merging tables (S3 datasets + Glue Catalog metadata), + remember that you will also need to update your partitions metadata in some cases. + (e.g. wr.athena.repair_table(table='...', database='...')) + + Note + ---- + In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). + + Parameters + ---------- + source_path : str, + S3 Path for the source directory. + target_path : str, + S3 Path for the target directory. + mode: str, optional + ``append`` (Default), ``overwrite``, ``overwrite_partitions``. + use_threads : bool + True to enable concurrent requests, False to disable multiple threads. + If enabled os.cpu_count() will be used as the max number of threads. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[str] + List of new objects paths. + + Examples + -------- + >>> import awswrangler as wr + >>> wr.s3.merge_datasets( + ... source_path="s3://bucket0/dir0/", + ... target_path="s3://bucket1/dir1/", + ... mode="append" + ... 
) + ["s3://bucket1/dir1/key0", "s3://bucket1/dir1/key1"] + + """ + source_path = source_path[:-1] if source_path[-1] == "/" else source_path + target_path = target_path[:-1] if target_path[-1] == "/" else target_path + session: boto3.Session = _utils.ensure_session(session=boto3_session) + + paths: List[str] = list_objects(path=f"{source_path}/", boto3_session=session) + _logger.debug("len(paths): %s", len(paths)) + if len(paths) < 1: + return [] + + if mode == "overwrite": + _logger.debug("Deleting to overwrite: %s/", target_path) + delete_objects(path=f"{target_path}/", use_threads=use_threads, boto3_session=session) + elif mode == "overwrite_partitions": + paths_wo_prefix: List[str] = [x.replace(f"{source_path}/", "") for x in paths] + paths_wo_filename: List[str] = [f"{x.rpartition('/')[0]}/" for x in paths_wo_prefix] + partitions_paths: List[str] = list(set(paths_wo_filename)) + target_partitions_paths = [f"{target_path}/{x}" for x in partitions_paths] + for path in target_partitions_paths: + _logger.debug("Deleting to overwrite_partitions: %s", path) + delete_objects(path=path, use_threads=use_threads, boto3_session=session) + elif mode != "append": + raise exceptions.InvalidArgumentValue(f"{mode} is a invalid mode option.") + + new_objects: List[str] = copy_objects( + paths=paths, source_path=source_path, target_path=target_path, use_threads=use_threads, boto3_session=session + ) + _logger.debug("len(new_objects): %s", len(new_objects)) + return new_objects + + +def copy_objects( + paths: List[str], + source_path: str, + target_path: str, + replace_filenames: Optional[Dict[str, str]] = None, + use_threads: bool = True, + boto3_session: Optional[boto3.Session] = None, +) -> List[str]: + """Copy a list of S3 objects to another S3 directory. + + Note + ---- + In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). + + Parameters + ---------- + paths : List[str] + List of S3 objects paths (e.g. [s3://bucket/dir0/key0, s3://bucket/dir0/key1]). + source_path : str, + S3 Path for the source directory. + target_path : str, + S3 Path for the target directory. + replace_filenames : Dict[str, str], optional + e.g. {"old_name.csv": "new_name.csv", "old_name2.csv": "new_name2.csv"} + use_threads : bool + True to enable concurrent requests, False to disable multiple threads. + If enabled os.cpu_count() will be used as the max number of threads. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[str] + List of new objects paths. + + Examples + -------- + >>> import awswrangler as wr + >>> wr.s3.copy_objects( + ... paths=["s3://bucket0/dir0/key0", "s3://bucket0/dir0/key1"]) + ... source_path="s3://bucket0/dir0/", + ... target_path="s3://bucket1/dir1/", + ... 
) + ["s3://bucket1/dir1/key0", "s3://bucket1/dir1/key1"] + + """ + _logger.debug("len(paths): %s", len(paths)) + if len(paths) < 1: + return [] + source_path = source_path[:-1] if source_path[-1] == "/" else source_path + target_path = target_path[:-1] if target_path[-1] == "/" else target_path + session: boto3.Session = _utils.ensure_session(session=boto3_session) + batch: List[Tuple[str, str]] = [] + new_objects: List[str] = [] + for path in paths: + path_wo_prefix: str = path.replace(f"{source_path}/", "") + path_final: str = f"{target_path}/{path_wo_prefix}" + if replace_filenames is not None: + parts: List[str] = path_final.rsplit(sep="/", maxsplit=1) + if len(parts) == 2: + path_wo_filename: str = parts[0] + filename: str = parts[1] + if filename in replace_filenames: + new_filename: str = replace_filenames[filename] + _logger.debug("Replacing filename: %s -> %s", filename, new_filename) + path_final = f"{path_wo_filename}/{new_filename}" + new_objects.append(path_final) + batch.append((path, path_final)) + _logger.debug("len(new_objects): %s", len(new_objects)) + _copy_objects(batch=batch, use_threads=use_threads, boto3_session=session) + return new_objects diff --git a/awswrangler/s3/_delete.py b/awswrangler/s3/_delete.py new file mode 100644 index 000000000..3c9cad484 --- /dev/null +++ b/awswrangler/s3/_delete.py @@ -0,0 +1,85 @@ +"""Amazon S3 CopDeletey Module (PRIVATE).""" + +import concurrent.futures +import logging +from itertools import repeat +from typing import Dict, List, Optional, Union + +import boto3 # type: ignore + +from awswrangler import _utils, exceptions +from awswrangler.s3._list import path2list + +_logger: logging.Logger = logging.getLogger(__name__) + + +def _split_paths_by_bucket(paths: List[str]) -> Dict[str, List[str]]: + buckets: Dict[str, List[str]] = {} + bucket: str + key: str + for path in paths: + bucket, key = _utils.parse_path(path=path) + if bucket not in buckets: + buckets[bucket] = [] + buckets[bucket].append(key) + return buckets + + +def _delete_objects(bucket: str, keys: List[str], client_s3: boto3.client) -> None: + _logger.debug("len(keys): %s", len(keys)) + batch: List[Dict[str, str]] = [{"Key": key} for key in keys] + res = client_s3.delete_objects(Bucket=bucket, Delete={"Objects": batch}) + deleted = res.get("Deleted") + if deleted is not None: + for i in deleted: + _logger.debug("s3://%s/%s has been deleted.", bucket, i.get("Key")) + errors = res.get("Errors") + if errors is not None: # pragma: no cover + raise exceptions.ServiceApiError(errors) + + +def delete_objects( + path: Union[str, List[str]], use_threads: bool = True, boto3_session: Optional[boto3.Session] = None +) -> None: + """Delete Amazon S3 objects from a received S3 prefix or list of S3 objects paths. + + Note + ---- + In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + use_threads : bool + True to enable concurrent requests, False to disable multiple threads. + If enabled os.cpu_count() will be used as the max number of threads. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + None + None. 
+ + Examples + -------- + >>> import awswrangler as wr + >>> wr.s3.delete_objects(['s3://bucket/key0', 's3://bucket/key1']) # Delete both objects + >>> wr.s3.delete_objects('s3://bucket/prefix') # Delete all objects under the received prefix + + """ + paths: List[str] = path2list(path=path, boto3_session=boto3_session) + if len(paths) < 1: + return + client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) + buckets: Dict[str, List[str]] = _split_paths_by_bucket(paths=paths) + for bucket, keys in buckets.items(): + chunks: List[List[str]] = _utils.chunkify(lst=keys, max_length=1_000) + if use_threads is False: + for chunk in chunks: + _delete_objects(bucket=bucket, keys=chunk, client_s3=client_s3) + else: + cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) + with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor: + list(executor.map(_delete_objects, repeat(bucket), chunks, repeat(client_s3))) diff --git a/awswrangler/s3/_describe.py b/awswrangler/s3/_describe.py new file mode 100644 index 000000000..0319dcb94 --- /dev/null +++ b/awswrangler/s3/_describe.py @@ -0,0 +1,182 @@ +"""Amazon S3 Describe Module (INTERNAL).""" + +import concurrent.futures +import logging +import time +from itertools import repeat +from typing import Any, Dict, List, Optional, Tuple, Union + +import boto3 # type: ignore +import botocore.exceptions # type: ignore + +from awswrangler import _utils +from awswrangler.s3._list import path2list + +_logger: logging.Logger = logging.getLogger(__name__) + + +def _describe_object( + path: str, wait_time: Optional[Union[int, float]], client_s3: boto3.client +) -> Tuple[str, Dict[str, Any]]: + wait_time = int(wait_time) if isinstance(wait_time, float) else wait_time + tries: int = wait_time if (wait_time is not None) and (wait_time > 0) else 1 + bucket: str + key: str + bucket, key = _utils.parse_path(path=path) + desc: Dict[str, Any] = {} + for i in range(tries, 0, -1): + try: + desc = client_s3.head_object(Bucket=bucket, Key=key) + break + except botocore.exceptions.ClientError as e: # pragma: no cover + if e.response["ResponseMetadata"]["HTTPStatusCode"] == 404: # Not Found + _logger.debug("Object not found. %s seconds remaining to wait.", i) + if i == 1: # Last try, there is no more need to sleep + break + time.sleep(1) + else: + raise e + return path, desc + + +def describe_objects( + path: Union[str, List[str]], + wait_time: Optional[Union[int, float]] = None, + use_threads: bool = True, + boto3_session: Optional[boto3.Session] = None, +) -> Dict[str, Dict[str, Any]]: + """Describe Amazon S3 objects from a received S3 prefix or list of S3 objects paths. + + Fetch attributes like ContentLength, DeleteMarker, LastModified, ContentType, etc + The full list of attributes can be explored under the boto3 head_object documentation: + https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.head_object + + Note + ---- + In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + wait_time : Union[int,float], optional + How much time (seconds) should Wrangler try to reach this objects. + Very useful to overcome eventual consistence issues. + `None` means only a single try will be done. + use_threads : bool + True to enable concurrent requests, False to disable multiple threads. 
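The ``max_length=1_000`` in ``delete_objects`` above matches the 1,000-key cap of the S3 ``DeleteObjects`` API, which is why the keys are chunked before being fanned out to the thread pool. ``chunkify`` itself is a wrangler-internal helper; the following standalone sketch shows the assumed, equivalent behavior:

>>> def chunkify(lst, max_length):
...     """Split lst into consecutive slices of at most max_length items."""
...     return [lst[i:i + max_length] for i in range(0, len(lst), max_length)]
>>> keys = [f'prefix/file_{i}.csv' for i in range(2_500)]  # placeholder key names
>>> [len(chunk) for chunk in chunkify(keys, max_length=1_000)]
[1000, 1000, 500]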
+ If enabled os.cpu_count() will be used as the max number of threads. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + Dict[str, Dict[str, Any]] + Return a dictionary of objects returned from head_objects where the key is the object path. + The response object can be explored here: + https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.head_object + + Examples + -------- + >>> import awswrangler as wr + >>> descs0 = wr.s3.describe_objects(['s3://bucket/key0', 's3://bucket/key1']) # Describe both objects + >>> descs1 = wr.s3.describe_objects('s3://bucket/prefix') # Describe all objects under the prefix + >>> descs2 = wr.s3.describe_objects('s3://bucket/prefix', wait_time=30) # Overcoming eventual consistence issues + + """ + paths: List[str] = path2list(path=path, boto3_session=boto3_session) + if len(paths) < 1: + return {} + client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) + resp_list: List[Tuple[str, Dict[str, Any]]] + if use_threads is False: + resp_list = [_describe_object(path=p, wait_time=wait_time, client_s3=client_s3) for p in paths] + else: + cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) + with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor: + resp_list = list(executor.map(_describe_object, paths, repeat(wait_time), repeat(client_s3))) + desc_dict: Dict[str, Dict[str, Any]] = dict(resp_list) + return desc_dict + + +def size_objects( + path: Union[str, List[str]], + wait_time: Optional[Union[int, float]] = None, + use_threads: bool = True, + boto3_session: Optional[boto3.Session] = None, +) -> Dict[str, Optional[int]]: + """Get the size (ContentLength) in bytes of Amazon S3 objects from a received S3 prefix or list of S3 objects paths. + + Note + ---- + In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + wait_time : Union[int,float], optional + How much time (seconds) should Wrangler try to reach this objects. + Very useful to overcome eventual consistence issues. + `None` means only a single try will be done. + use_threads : bool + True to enable concurrent requests, False to disable multiple threads. + If enabled os.cpu_count() will be used as the max number of threads. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + Dict[str, Optional[int]] + Dictionary where the key is the object path and the value is the object size. 
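Since ``size_objects`` is a thin projection of ``describe_objects`` onto ``ContentLength``, a typical use is summing storage under a prefix. A hedged sketch with a placeholder prefix; ``None`` sizes (objects whose description could not be fetched) are skipped:

>>> import awswrangler as wr
>>> sizes = wr.s3.size_objects(path='s3://bucket/prefix/')
>>> total_bytes = sum(size for size in sizes.values() if size is not None)
>>> print(f'{len(sizes)} objects, {total_bytes / 1024 ** 2:.1f} MiB')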
+ + Examples + -------- + >>> import awswrangler as wr + >>> sizes0 = wr.s3.size_objects(['s3://bucket/key0', 's3://bucket/key1']) # Get the sizes of both objects + >>> sizes1 = wr.s3.size_objects('s3://bucket/prefix') # Get the sizes of all objects under the received prefix + >>> sizes2 = wr.s3.size_objects('s3://bucket/prefix', wait_time=30) # Overcoming eventual consistence issues + + """ + desc_list: Dict[str, Dict[str, Any]] = describe_objects( + path=path, wait_time=wait_time, use_threads=use_threads, boto3_session=boto3_session + ) + size_dict: Dict[str, Optional[int]] = {k: d.get("ContentLength", None) for k, d in desc_list.items()} + return size_dict + + +def get_bucket_region(bucket: str, boto3_session: Optional[boto3.Session] = None) -> str: + """Get bucket region name. + + Parameters + ---------- + bucket : str + Bucket name. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + str + Region code (e.g. 'us-east-1'). + + Examples + -------- + Using the default boto3 session + + >>> import awswrangler as wr + >>> region = wr.s3.get_bucket_region('bucket-name') + + Using a custom boto3 session + + >>> import boto3 + >>> import awswrangler as wr + >>> region = wr.s3.get_bucket_region('bucket-name', boto3_session=boto3.Session()) + + """ + client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) + _logger.debug("bucket: %s", bucket) + region: str = client_s3.get_bucket_location(Bucket=bucket)["LocationConstraint"] + region = "us-east-1" if region is None else region + _logger.debug("region: %s", region) + return region diff --git a/awswrangler/s3/_list.py b/awswrangler/s3/_list.py new file mode 100644 index 000000000..2eebea1ca --- /dev/null +++ b/awswrangler/s3/_list.py @@ -0,0 +1,176 @@ +"""Amazon S3 List Module (PRIVATE).""" + +import logging +from typing import Any, Dict, List, Optional + +import boto3 # type: ignore +import botocore.exceptions # type: ignore + +from awswrangler import _utils, exceptions + +_logger: logging.Logger = logging.getLogger(__name__) + + +def path2list(path: object, boto3_session: boto3.Session, suffix: str = None) -> List[str]: + """Convert Amazon S3 path to list of objects.""" + if isinstance(path, str): # prefix + paths: List[str] = list_objects(path=path, suffix=suffix, boto3_session=boto3_session) + elif isinstance(path, list): + paths = path if suffix is None else [x for x in path if x.endswith(suffix)] + else: + raise exceptions.InvalidArgumentType(f"{type(path)} is not a valid path type. 
Please, use str or List[str].") + return paths + + +def _list_objects( + path: str, + delimiter: Optional[str] = None, + suffix: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> List[str]: + bucket: str + prefix: str + bucket, prefix = _utils.parse_path(path=path) + client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) + paginator = client_s3.get_paginator("list_objects_v2") + args: Dict[str, Any] = {"Bucket": bucket, "Prefix": prefix, "PaginationConfig": {"PageSize": 1000}} + if delimiter is not None: + args["Delimiter"] = delimiter + response_iterator = paginator.paginate(**args) + paths: List[str] = [] + for page in response_iterator: # pylint: disable=too-many-nested-blocks + if delimiter is None: + contents: Optional[List] = page.get("Contents") + if contents is not None: + for content in contents: + if (content is not None) and ("Key" in content): + key: str = content["Key"] + if (suffix is None) or key.endswith(suffix): + paths.append(f"s3://{bucket}/{key}") + else: + prefixes: Optional[List[Optional[Dict[str, str]]]] = page.get("CommonPrefixes") + if prefixes is not None: + for pfx in prefixes: + if (pfx is not None) and ("Prefix" in pfx): + key = pfx["Prefix"] + paths.append(f"s3://{bucket}/{key}") + return paths + + +def does_object_exist(path: str, boto3_session: Optional[boto3.Session] = None) -> bool: + """Check if object exists on S3. + + Parameters + ---------- + path: str + S3 path (e.g. s3://bucket/key). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + bool + True if exists, False otherwise. + + Examples + -------- + Using the default boto3 session + + >>> import awswrangler as wr + >>> wr.s3.does_object_exist('s3://bucket/key_real') + True + >>> wr.s3.does_object_exist('s3://bucket/key_unreal') + False + + Using a custom boto3 session + + >>> import boto3 + >>> import awswrangler as wr + >>> wr.s3.does_object_exist('s3://bucket/key_real', boto3_session=boto3.Session()) + True + >>> wr.s3.does_object_exist('s3://bucket/key_unreal', boto3_session=boto3.Session()) + False + + """ + client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) + bucket: str + key: str + bucket, key = path.replace("s3://", "").split("/", 1) + try: + client_s3.head_object(Bucket=bucket, Key=key) + return True + except botocore.exceptions.ClientError as ex: + if ex.response["ResponseMetadata"]["HTTPStatusCode"] == 404: + return False + raise ex # pragma: no cover + + +def list_directories(path: str, boto3_session: Optional[boto3.Session] = None) -> List[str]: + """List Amazon S3 objects from a prefix. + + Parameters + ---------- + path : str + S3 path (e.g. s3://bucket/prefix). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[str] + List of objects paths. 
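A note on path2list, defined above: it normalizes the two accepted path forms, expanding a string prefix through list_objects and applying only the optional suffix filter to an explicit list. A minimal sketch of this private helper, assuming placeholder paths:

import boto3

from awswrangler.s3._list import path2list  # private helper introduced by this patch

session = boto3.Session()
all_keys = path2list(path="s3://bucket/prefix/", boto3_session=session)  # prefix expanded via list_objects
only_csv = path2list(  # an explicit list is only filtered by the suffix
    path=["s3://bucket/a.csv", "s3://bucket/b.txt"], boto3_session=session, suffix=".csv"
)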
+ + Examples + -------- + Using the default boto3 session + + >>> import awswrangler as wr + >>> wr.s3.list_objects('s3://bucket/prefix/') + ['s3://bucket/prefix/dir0', 's3://bucket/prefix/dir1', 's3://bucket/prefix/dir2'] + + Using a custom boto3 session + + >>> import boto3 + >>> import awswrangler as wr + >>> wr.s3.list_objects('s3://bucket/prefix/', boto3_session=boto3.Session()) + ['s3://bucket/prefix/dir0', 's3://bucket/prefix/dir1', 's3://bucket/prefix/dir2'] + + """ + return _list_objects(path=path, delimiter="/", boto3_session=boto3_session) + + +def list_objects(path: str, suffix: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> List[str]: + """List Amazon S3 objects from a prefix. + + Parameters + ---------- + path : str + S3 path (e.g. s3://bucket/prefix). + suffix: str, optional + Suffix for filtering S3 keys. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[str] + List of objects paths. + + Examples + -------- + Using the default boto3 session + + >>> import awswrangler as wr + >>> wr.s3.list_objects('s3://bucket/prefix') + ['s3://bucket/prefix0', 's3://bucket/prefix1', 's3://bucket/prefix2'] + + Using a custom boto3 session + + >>> import boto3 + >>> import awswrangler as wr + >>> wr.s3.list_objects('s3://bucket/prefix', boto3_session=boto3.Session()) + ['s3://bucket/prefix0', 's3://bucket/prefix1', 's3://bucket/prefix2'] + + """ + paths: List[str] = _list_objects(path=path, delimiter=None, suffix=suffix, boto3_session=boto3_session) + return [p for p in paths if not p.endswith("/")] diff --git a/awswrangler/s3/_read.py b/awswrangler/s3/_read.py new file mode 100644 index 000000000..857f3fdcf --- /dev/null +++ b/awswrangler/s3/_read.py @@ -0,0 +1,885 @@ +"""Amazon S3 Read Module (PRIVATE).""" + +import concurrent.futures +import logging +from itertools import repeat +from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union + +import boto3 # type: ignore +import pandas as pd # type: ignore +import pandas.io.parsers # type: ignore +import pyarrow as pa # type: ignore +import pyarrow.lib # type: ignore +import pyarrow.parquet # type: ignore +import s3fs # type: ignore +from pandas.io.common import infer_compression # type: ignore + +from awswrangler import _data_types, _utils, catalog, exceptions +from awswrangler.s3._list import path2list + +_logger: logging.Logger = logging.getLogger(__name__) + + +def read_parquet_metadata_internal( + path: Union[str, List[str]], + dtype: Optional[Dict[str, str]], + sampling: float, + dataset: bool, + use_threads: bool, + boto3_session: Optional[boto3.Session], +) -> Tuple[Dict[str, str], Optional[Dict[str, str]], Optional[Dict[str, List[str]]]]: + """Handle wr.s3.read_parquet_metadata internally.""" + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if dataset is True: + if isinstance(path, str): + _path: Optional[str] = path if path.endswith("/") else f"{path}/" + paths: List[str] = path2list(path=_path, boto3_session=session) + else: # pragma: no cover + raise exceptions.InvalidArgumentType("Argument must be str if dataset=True.") + else: + if isinstance(path, str): + _path = None + paths = path2list(path=path, boto3_session=session) + elif isinstance(path, list): + _path = None + paths = path + else: # pragma: no cover + raise exceptions.InvalidArgumentType(f"Argument path must be str or List[str] instead of {type(path)}.") + schemas: List[Dict[str, str]] = [ + 
_read_parquet_metadata_file(path=x, use_threads=use_threads, boto3_session=session) + for x in _utils.list_sampling(lst=paths, sampling=sampling) + ] + _logger.debug("schemas: %s", schemas) + columns_types: Dict[str, str] = {} + for schema in schemas: + for column, _dtype in schema.items(): + if (column in columns_types) and (columns_types[column] != _dtype): # pragma: no cover + raise exceptions.InvalidSchemaConvergence( + f"Was detect at least 2 different types in column {column} ({columns_types[column]} and {dtype})." + ) + columns_types[column] = _dtype + partitions_types: Optional[Dict[str, str]] = None + partitions_values: Optional[Dict[str, List[str]]] = None + if (dataset is True) and (_path is not None): + partitions_types, partitions_values = _utils.extract_partitions_metadata_from_paths(path=_path, paths=paths) + if dtype: + for k, v in dtype.items(): + if columns_types and k in columns_types: + columns_types[k] = v + if partitions_types and k in partitions_types: + partitions_types[k] = v + _logger.debug("columns_types: %s", columns_types) + return columns_types, partitions_types, partitions_values + + +def _read_text( + parser_func: Callable, + path: Union[str, List[str]], + use_threads: bool = True, + boto3_session: Optional[boto3.Session] = None, + s3_additional_kwargs: Optional[Dict[str, str]] = None, + chunksize: Optional[int] = None, + dataset: bool = False, + **pandas_kwargs, +) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: + if "iterator" in pandas_kwargs: + raise exceptions.InvalidArgument("Please, use chunksize instead of iterator.") + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if (dataset is True) and (not isinstance(path, str)): # pragma: no cover + raise exceptions.InvalidArgument("The path argument must be a string Amazon S3 prefix if dataset=True.") + if dataset is True: + path_root: str = str(path) + else: + path_root = "" + paths: List[str] = path2list(path=path, boto3_session=session) + _logger.debug("paths:\n%s", paths) + if chunksize is not None: + dfs: Iterator[pd.DataFrame] = _read_text_chunksize( + parser_func=parser_func, + paths=paths, + boto3_session=session, + chunksize=chunksize, + pandas_args=pandas_kwargs, + s3_additional_kwargs=s3_additional_kwargs, + dataset=dataset, + path_root=path_root, + ) + return dfs + if use_threads is False: + df: pd.DataFrame = pd.concat( + objs=[ + _read_text_full( + parser_func=parser_func, + path=p, + boto3_session=session, + pandas_args=pandas_kwargs, + s3_additional_kwargs=s3_additional_kwargs, + dataset=dataset, + path_root=path_root, + ) + for p in paths + ], + ignore_index=True, + sort=False, + ) + else: + cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) + with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor: + df = pd.concat( + objs=executor.map( + _read_text_full, + repeat(parser_func), + repeat(path_root), + paths, + repeat(_utils.boto3_to_primitives(boto3_session=session)), # Boto3.Session + repeat(pandas_kwargs), + repeat(s3_additional_kwargs), + repeat(dataset), + ), + ignore_index=True, + sort=False, + ) + return df + + +def _read_text_chunksize( + parser_func: Callable, + path_root: str, + paths: List[str], + boto3_session: boto3.Session, + chunksize: int, + pandas_args: Dict[str, Any], + s3_additional_kwargs: Optional[Dict[str, str]] = None, + dataset: bool = False, +) -> Iterator[pd.DataFrame]: + fs: s3fs.S3FileSystem = _utils.get_fs(session=boto3_session, s3_additional_kwargs=s3_additional_kwargs) + for path in paths: + _logger.debug("path: 
%s", path) + partitions: Dict[str, Any] = {} + if dataset is True: + partitions = _utils.extract_partitions_from_path(path_root=path_root, path=path) + if pandas_args.get("compression", "infer") == "infer": + pandas_args["compression"] = infer_compression(path, compression="infer") + mode: str = "r" if pandas_args.get("compression") is None else "rb" + with fs.open(path, mode) as f: + reader: pandas.io.parsers.TextFileReader = parser_func(f, chunksize=chunksize, **pandas_args) + for df in reader: + if dataset is True: + for column_name, value in partitions.items(): + df[column_name] = value + yield df + + +def _read_text_full( + parser_func: Callable, + path_root: str, + path: str, + boto3_session: Union[boto3.Session, Dict[str, Optional[str]]], + pandas_args: Dict[str, Any], + s3_additional_kwargs: Optional[Dict[str, str]] = None, + dataset: bool = False, +) -> pd.DataFrame: + fs: s3fs.S3FileSystem = _utils.get_fs(session=boto3_session, s3_additional_kwargs=s3_additional_kwargs) + if pandas_args.get("compression", "infer") == "infer": + pandas_args["compression"] = infer_compression(path, compression="infer") + mode: str = "r" if pandas_args.get("compression") is None else "rb" + encoding: Optional[str] = pandas_args.get("encoding", None) + newline: Optional[str] = pandas_args.get("lineterminator", None) + with fs.open(path=path, mode=mode, encoding=encoding, newline=newline) as f: + df: pd.DataFrame = parser_func(f, **pandas_args) + if dataset is True: + partitions: Dict[str, Any] = _utils.extract_partitions_from_path(path_root=path_root, path=path) + for column_name, value in partitions.items(): + df[column_name] = value + return df + + +def _read_parquet_init( + path: Union[str, List[str]], + filters: Optional[Union[List[Tuple], List[List[Tuple]]]] = None, + categories: List[str] = None, + validate_schema: bool = True, + dataset: bool = False, + use_threads: bool = True, + boto3_session: Optional[boto3.Session] = None, + s3_additional_kwargs: Optional[Dict[str, str]] = None, +) -> pyarrow.parquet.ParquetDataset: + """Encapsulate all initialization before the use of the pyarrow.parquet.ParquetDataset.""" + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if dataset is False: + path_or_paths: Union[str, List[str]] = path2list(path=path, boto3_session=session) + elif isinstance(path, str): + path_or_paths = path[:-1] if path.endswith("/") else path + else: + path_or_paths = path + _logger.debug("path_or_paths: %s", path_or_paths) + fs: s3fs.S3FileSystem = _utils.get_fs(session=session, s3_additional_kwargs=s3_additional_kwargs) + cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) + data: pyarrow.parquet.ParquetDataset = pyarrow.parquet.ParquetDataset( + path_or_paths=path_or_paths, + filesystem=fs, + metadata_nthreads=cpus, + filters=filters, + read_dictionary=categories, + validate_schema=validate_schema, + split_row_groups=False, + use_legacy_dataset=True, + ) + return data + + +def _read_parquet( + data: pyarrow.parquet.ParquetDataset, + columns: Optional[List[str]] = None, + categories: List[str] = None, + use_threads: bool = True, + validate_schema: bool = True, +) -> pd.DataFrame: + tables: List[pa.Table] = [] + _logger.debug("Reading pieces...") + for piece in data.pieces: + table: pa.Table = piece.read( + columns=columns, use_threads=use_threads, partitions=data.partitions, use_pandas_metadata=False + ) + _logger.debug("Appending piece in the list...") + tables.append(table) + promote: bool = not validate_schema + _logger.debug("Concating 
pieces...") + table = pa.lib.concat_tables(tables, promote=promote) + _logger.debug("Converting PyArrow table to Pandas DataFrame...") + return table.to_pandas( + use_threads=use_threads, + split_blocks=True, + self_destruct=True, + integer_object_nulls=False, + date_as_object=True, + ignore_metadata=True, + categories=categories, + types_mapper=_data_types.pyarrow2pandas_extension, + ) + + +def _read_parquet_chunked( + data: pyarrow.parquet.ParquetDataset, + columns: Optional[List[str]] = None, + categories: List[str] = None, + chunked: Union[bool, int] = True, + use_threads: bool = True, +) -> Iterator[pd.DataFrame]: + next_slice: Optional[pd.DataFrame] = None + for piece in data.pieces: + df: pd.DataFrame = _table2df( + table=piece.read( + columns=columns, use_threads=use_threads, partitions=data.partitions, use_pandas_metadata=False + ), + categories=categories, + use_threads=use_threads, + ) + if chunked is True: + yield df + else: + if next_slice is not None: + df = pd.concat(objs=[next_slice, df], ignore_index=True, sort=False) + while len(df.index) >= chunked: + yield df.iloc[:chunked] + df = df.iloc[chunked:] + if df.empty: + next_slice = None + else: + next_slice = df + if next_slice is not None: + yield next_slice + + +def _table2df(table: pa.Table, categories: List[str] = None, use_threads: bool = True) -> pd.DataFrame: + return table.to_pandas( + use_threads=use_threads, + split_blocks=True, + self_destruct=True, + integer_object_nulls=False, + date_as_object=True, + ignore_metadata=True, + categories=categories, + types_mapper=_data_types.pyarrow2pandas_extension, + ) + + +def _read_parquet_metadata_file(path: str, use_threads: bool, boto3_session: boto3.Session) -> Dict[str, str]: + data: pyarrow.parquet.ParquetDataset = _read_parquet_init( + path=path, filters=None, dataset=False, use_threads=use_threads, boto3_session=boto3_session + ) + return _data_types.athena_types_from_pyarrow_schema(schema=data.schema.to_arrow_schema(), partitions=None)[0] + + +def read_csv( + path: Union[str, List[str]], + use_threads: bool = True, + boto3_session: Optional[boto3.Session] = None, + s3_additional_kwargs: Optional[Dict[str, str]] = None, + chunksize: Optional[int] = None, + dataset: bool = False, + **pandas_kwargs, +) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: + """Read CSV file(s) from from a received S3 prefix or list of S3 objects paths. + + Note + ---- + For partial and gradual reading use the argument ``chunksize`` instead of ``iterator``. + + Note + ---- + In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. ``[s3://bucket/key0, s3://bucket/key1]``). + use_threads : bool + True to enable concurrent requests, False to disable multiple threads. + If enabled os.cpu_count() will be used as the max number of threads. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + s3_additional_kwargs: + Forward to s3fs, useful for server side encryption + https://s3fs.readthedocs.io/en/latest/#serverside-encryption + chunksize: int, optional + If specified, return an generator where chunksize is the number of rows to include in each chunk. + dataset: bool + If `True` read a CSV dataset instead of simple file(s) loading all the related partitions as columns. + pandas_kwargs: + keyword arguments forwarded to pandas.read_csv(). 
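Because pandas_kwargs is forwarded verbatim, any regular pandas.read_csv option can be combined with the S3 path handling; for example (the bucket and the chosen options are illustrative only):

import awswrangler as wr

df = wr.s3.read_csv(
    path="s3://bucket/prefix/",
    sep=";",              # forwarded to pandas.read_csv
    skiprows=1,
    na_values=["null"],
)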
+ https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html + + Returns + ------- + Union[pandas.DataFrame, Generator[pandas.DataFrame, None, None]] + Pandas DataFrame or a Generator in case of `chunksize != None`. + + Examples + -------- + Reading all CSV files under a prefix + + >>> import awswrangler as wr + >>> df = wr.s3.read_csv(path='s3://bucket/prefix/') + + Reading all CSV files under a prefix encrypted with a KMS key + + >>> import awswrangler as wr + >>> df = wr.s3.read_csv( + ... path='s3://bucket/prefix/', + ... s3_additional_kwargs={ + ... 'ServerSideEncryption': 'aws:kms', + ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' + ... } + ... ) + + Reading all CSV files from a list + + >>> import awswrangler as wr + >>> df = wr.s3.read_csv(path=['s3://bucket/filename0.csv', 's3://bucket/filename1.csv']) + + Reading in chunks of 100 lines + + >>> import awswrangler as wr + >>> dfs = wr.s3.read_csv(path=['s3://bucket/filename0.csv', 's3://bucket/filename1.csv'], chunksize=100) + >>> for df in dfs: + >>> print(df) # 100 lines Pandas DataFrame + + """ + return _read_text( + parser_func=pd.read_csv, + path=path, + use_threads=use_threads, + boto3_session=boto3_session, + s3_additional_kwargs=s3_additional_kwargs, + chunksize=chunksize, + dataset=dataset, + **pandas_kwargs, + ) + + +def read_fwf( + path: Union[str, List[str]], + use_threads: bool = True, + boto3_session: Optional[boto3.Session] = None, + s3_additional_kwargs: Optional[Dict[str, str]] = None, + chunksize: Optional[int] = None, + dataset: bool = False, + **pandas_kwargs, +) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: + """Read fixed-width formatted file(s) from from a received S3 prefix or list of S3 objects paths. + + Note + ---- + For partial and gradual reading use the argument ``chunksize`` instead of ``iterator``. + + Note + ---- + In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. ``[s3://bucket/key0, s3://bucket/key1]``). + use_threads : bool + True to enable concurrent requests, False to disable multiple threads. + If enabled os.cpu_count() will be used as the max number of threads. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + s3_additional_kwargs: + Forward to s3fs, useful for server side encryption + https://s3fs.readthedocs.io/en/latest/#serverside-encryption + chunksize: int, optional + If specified, return an generator where chunksize is the number of rows to include in each chunk. + dataset: bool + If `True` read a FWF dataset instead of simple file(s) loading all the related partitions as columns. + pandas_kwargs: + keyword arguments forwarded to pandas.read_fwf(). + https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_fwf.html + + Returns + ------- + Union[pandas.DataFrame, Generator[pandas.DataFrame, None, None]] + Pandas DataFrame or a Generator in case of `chunksize != None`. + + Examples + -------- + Reading all fixed-width formatted (FWF) files under a prefix + + >>> import awswrangler as wr + >>> df = wr.s3.read_fwf(path='s3://bucket/prefix/') + + Reading all fixed-width formatted (FWF) files under a prefix encrypted with a KMS key + + >>> import awswrangler as wr + >>> df = wr.s3.read_fwf( + ... path='s3://bucket/prefix/', + ... s3_additional_kwargs={ + ... 'ServerSideEncryption': 'aws:kms', + ... 
'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' + ... } + ... ) + + Reading all fixed-width formatted (FWF) files from a list + + >>> import awswrangler as wr + >>> df = wr.s3.read_fwf(path=['s3://bucket/filename0.txt', 's3://bucket/filename1.txt']) + + Reading in chunks of 100 lines + + >>> import awswrangler as wr + >>> dfs = wr.s3.read_fwf(path=['s3://bucket/filename0.txt', 's3://bucket/filename1.txt'], chunksize=100) + >>> for df in dfs: + >>> print(df) # 100 lines Pandas DataFrame + + """ + return _read_text( + parser_func=pd.read_fwf, + path=path, + use_threads=use_threads, + boto3_session=boto3_session, + s3_additional_kwargs=s3_additional_kwargs, + chunksize=chunksize, + dataset=dataset, + **pandas_kwargs, + ) + + +def read_json( + path: Union[str, List[str]], + use_threads: bool = True, + boto3_session: Optional[boto3.Session] = None, + s3_additional_kwargs: Optional[Dict[str, str]] = None, + chunksize: Optional[int] = None, + dataset: bool = False, + **pandas_kwargs, +) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: + """Read JSON file(s) from from a received S3 prefix or list of S3 objects paths. + + Note + ---- + For partial and gradual reading use the argument ``chunksize`` instead of ``iterator``. + + Note + ---- + In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. ``[s3://bucket/key0, s3://bucket/key1]``). + use_threads : bool + True to enable concurrent requests, False to disable multiple threads. + If enabled os.cpu_count() will be used as the max number of threads. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + s3_additional_kwargs: + Forward to s3fs, useful for server side encryption + https://s3fs.readthedocs.io/en/latest/#serverside-encryption + chunksize: int, optional + If specified, return an generator where chunksize is the number of rows to include in each chunk. + dataset: bool + If `True` read a JSON dataset instead of simple file(s) loading all the related partitions as columns. + If `True`, the `lines=True` will be assumed by default. + pandas_kwargs: + keyword arguments forwarded to pandas.read_json(). + https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_json.html + + Returns + ------- + Union[pandas.DataFrame, Generator[pandas.DataFrame, None, None]] + Pandas DataFrame or a Generator in case of `chunksize != None`. + + Examples + -------- + Reading all JSON files under a prefix + + >>> import awswrangler as wr + >>> df = wr.s3.read_json(path='s3://bucket/prefix/') + + Reading all JSON files under a prefix encrypted with a KMS key + + >>> import awswrangler as wr + >>> df = wr.s3.read_json( + ... path='s3://bucket/prefix/', + ... s3_additional_kwargs={ + ... 'ServerSideEncryption': 'aws:kms', + ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' + ... } + ... 
) + + Reading all JSON files from a list + + >>> import awswrangler as wr + >>> df = wr.s3.read_json(path=['s3://bucket/filename0.json', 's3://bucket/filename1.json']) + + Reading in chunks of 100 lines + + >>> import awswrangler as wr + >>> dfs = wr.s3.read_json(path=['s3://bucket/filename0.json', 's3://bucket/filename1.json'], chunksize=100) + >>> for df in dfs: + >>> print(df) # 100 lines Pandas DataFrame + + """ + if (dataset is True) and ("lines" not in pandas_kwargs): + pandas_kwargs["lines"] = True + return _read_text( + parser_func=pd.read_json, + path=path, + use_threads=use_threads, + boto3_session=boto3_session, + s3_additional_kwargs=s3_additional_kwargs, + chunksize=chunksize, + dataset=dataset, + **pandas_kwargs, + ) + + +def read_parquet( + path: Union[str, List[str]], + filters: Optional[Union[List[Tuple], List[List[Tuple]]]] = None, + columns: Optional[List[str]] = None, + validate_schema: bool = True, + chunked: Union[bool, int] = False, + dataset: bool = False, + categories: List[str] = None, + use_threads: bool = True, + boto3_session: Optional[boto3.Session] = None, + s3_additional_kwargs: Optional[Dict[str, str]] = None, +) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: + """Read Apache Parquet file(s) from from a received S3 prefix or list of S3 objects paths. + + The concept of Dataset goes beyond the simple idea of files and enable more + complex features like partitioning and catalog integration (AWS Glue Catalog). + + Note + ---- + ``Batching`` (`chunked` argument) (Memory Friendly): + + Will anable the function to return a Iterable of DataFrames instead of a regular DataFrame. + + There are two batching strategies on Wrangler: + + - If **chunked=True**, a new DataFrame will be returned for each file in your path/dataset. + + - If **chunked=INTEGER**, Wrangler will iterate on the data by number of rows igual the received INTEGER. + + `P.S.` `chunked=True` if faster and uses less memory while `chunked=INTEGER` is more precise + in number of rows for each Dataframe. + + + Note + ---- + In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + filters: Union[List[Tuple], List[List[Tuple]]], optional + List of filters to apply on PARTITION columns (PUSH-DOWN filter), like ``[[('x', '=', 0), ...], ...]``. + Ignored if `dataset=False`. + columns : List[str], optional + Names of columns to read from the file(s). + validate_schema: + Check that individual file schemas are all the same / compatible. Schemas within a + folder prefix should all be the same. Disable if you have schemas that are different + and want to disable this check. + chunked : Union[int, bool] + If passed will split the data in a Iterable of DataFrames (Memory friendly). + If `True` wrangler will iterate on the data by files in the most efficient way without guarantee of chunksize. + If an `INTEGER` is passed Wrangler will iterate on the data by number of rows igual the received INTEGER. + dataset: bool + If `True` read a parquet dataset instead of simple file(s) loading all the related partitions as columns. + categories: List[str], optional + List of columns names that should be returned as pandas.Categorical. + Recommended for memory restricted environments. + use_threads : bool + True to enable concurrent requests, False to disable multiple threads. 
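Putting the dataset-related arguments together, a partition push-down read might look like the following sketch (the bucket, partition column and value are placeholders):

import awswrangler as wr

df = wr.s3.read_parquet(
    path="s3://bucket/dataset/",
    dataset=True,
    filters=[("year", "=", "2020")],  # applied to partition columns only
    columns=["col0", "col1"],
)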
+ If enabled os.cpu_count() will be used as the max number of threads. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + s3_additional_kwargs: + Forward to s3fs, useful for server side encryption + https://s3fs.readthedocs.io/en/latest/#serverside-encryption + + Returns + ------- + Union[pandas.DataFrame, Generator[pandas.DataFrame, None, None]] + Pandas DataFrame or a Generator in case of `chunked=True`. + + Examples + -------- + Reading all Parquet files under a prefix + + >>> import awswrangler as wr + >>> df = wr.s3.read_parquet(path='s3://bucket/prefix/') + + Reading all Parquet files under a prefix encrypted with a KMS key + + >>> import awswrangler as wr + >>> df = wr.s3.read_parquet( + ... path='s3://bucket/prefix/', + ... s3_additional_kwargs={ + ... 'ServerSideEncryption': 'aws:kms', + ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' + ... } + ... ) + + Reading all Parquet files from a list + + >>> import awswrangler as wr + >>> df = wr.s3.read_parquet(path=['s3://bucket/filename0.parquet', 's3://bucket/filename1.parquet']) + + Reading in chunks (Chunk by file) + + >>> import awswrangler as wr + >>> dfs = wr.s3.read_parquet(path=['s3://bucket/filename0.csv', 's3://bucket/filename1.csv'], chunked=True) + >>> for df in dfs: + >>> print(df) # Smaller Pandas DataFrame + + Reading in chunks (Chunk by 1MM rows) + + >>> import awswrangler as wr + >>> dfs = wr.s3.read_parquet(path=['s3://bucket/filename0.csv', 's3://bucket/filename1.csv'], chunked=1_000_000) + >>> for df in dfs: + >>> print(df) # 1MM Pandas DataFrame + + """ + data: pyarrow.parquet.ParquetDataset = _read_parquet_init( + path=path, + filters=filters, + dataset=dataset, + categories=categories, + validate_schema=validate_schema, + use_threads=use_threads, + boto3_session=boto3_session, + s3_additional_kwargs=s3_additional_kwargs, + ) + _logger.debug("pyarrow.parquet.ParquetDataset initialized.") + if chunked is False: + return _read_parquet( + data=data, columns=columns, categories=categories, use_threads=use_threads, validate_schema=validate_schema + ) + return _read_parquet_chunked( + data=data, columns=columns, categories=categories, chunked=chunked, use_threads=use_threads + ) + + +def read_parquet_metadata( + path: Union[str, List[str]], + dtype: Optional[Dict[str, str]] = None, + sampling: float = 1.0, + dataset: bool = False, + use_threads: bool = True, + boto3_session: Optional[boto3.Session] = None, +) -> Tuple[Dict[str, str], Optional[Dict[str, str]]]: + """Read Apache Parquet file(s) metadata from from a received S3 prefix or list of S3 objects paths. + + The concept of Dataset goes beyond the simple idea of files and enable more + complex features like partitioning and catalog integration (AWS Glue Catalog). + + Note + ---- + In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + dtype : Dict[str, str], optional + Dictionary of columns names and Athena/Glue types to be casted. + Useful when you have columns with undetermined data types as partitions columns. + (e.g. {'col name': 'bigint', 'col2 name': 'int'}) + sampling : float + Random sample ratio of files that will have the metadata inspected. + Must be `0.0 < sampling <= 1.0`. + The higher, the more accurate. + The lower, the faster. 
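For instance, inspecting only a quarter of the files and forcing an undetermined column to a concrete Athena type could look like this (the path and column name are placeholders):

import awswrangler as wr

columns_types, partitions_types = wr.s3.read_parquet_metadata(
    path="s3://bucket/dataset/",
    dataset=True,
    sampling=0.25,             # inspect the metadata of roughly 25% of the files
    dtype={"year": "bigint"},  # override the detected type for this column/partition
)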
+ dataset: bool + If True read a parquet dataset instead of simple file(s) loading all the related partitions as columns. + use_threads : bool + True to enable concurrent requests, False to disable multiple threads. + If enabled os.cpu_count() will be used as the max number of threads. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + Tuple[Dict[str, str], Optional[Dict[str, str]]] + columns_types: Dictionary with keys as column names and vales as + data types (e.g. {'col0': 'bigint', 'col1': 'double'}). / + partitions_types: Dictionary with keys as partition names + and values as data types (e.g. {'col2': 'date'}). + + Examples + -------- + Reading all Parquet files (with partitions) metadata under a prefix + + >>> import awswrangler as wr + >>> columns_types, partitions_types = wr.s3.read_parquet_metadata(path='s3://bucket/prefix/', dataset=True) + + Reading all Parquet files metadata from a list + + >>> import awswrangler as wr + >>> columns_types, partitions_types = wr.s3.read_parquet_metadata(path=[ + ... 's3://bucket/filename0.parquet', + ... 's3://bucket/filename1.parquet' + ... ]) + + """ + return read_parquet_metadata_internal( + path=path, dtype=dtype, sampling=sampling, dataset=dataset, use_threads=use_threads, boto3_session=boto3_session + )[:2] + + +def read_parquet_table( + table: str, + database: str, + filters: Optional[Union[List[Tuple], List[List[Tuple]]]] = None, + columns: Optional[List[str]] = None, + categories: List[str] = None, + chunked: Union[bool, int] = False, + use_threads: bool = True, + boto3_session: Optional[boto3.Session] = None, + s3_additional_kwargs: Optional[Dict[str, str]] = None, +) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: + """Read Apache Parquet table registered on AWS Glue Catalog. + + Note + ---- + ``Batching`` (`chunked` argument) (Memory Friendly): + + Will anable the function to return a Iterable of DataFrames instead of a regular DataFrame. + + There are two batching strategies on Wrangler: + + - If **chunked=True**, a new DataFrame will be returned for each file in your path/dataset. + + - If **chunked=INTEGER**, Wrangler will paginate through files slicing and concatenating + to return DataFrames with the number of row igual the received INTEGER. + + `P.S.` `chunked=True` if faster and uses less memory while `chunked=INTEGER` is more precise + in number of rows for each Dataframe. + + + Note + ---- + In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). + + Parameters + ---------- + table : str + AWS Glue Catalog table name. + database : str + AWS Glue Catalog database name. + filters: Union[List[Tuple], List[List[Tuple]]], optional + List of filters to apply, like ``[[('x', '=', 0), ...], ...]``. + columns : List[str], optional + Names of columns to read from the file(s). + categories: List[str], optional + List of columns names that should be returned as pandas.Categorical. + Recommended for memory restricted environments. + chunked : bool + If True will break the data in smaller DataFrames (Non deterministic number of lines). + Otherwise return a single DataFrame with the whole data. + use_threads : bool + True to enable concurrent requests, False to disable multiple threads. + If enabled os.cpu_count() will be used as the max number of threads. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
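As the implementation further below shows, read_parquet_table only resolves the table location in the Glue Catalog and then delegates to read_parquet with dataset=True, so the call is roughly equivalent to this sketch (database and table names are placeholders):

import awswrangler as wr

path = wr.catalog.get_table_location(database="default", table="my_table")
df = wr.s3.read_parquet(path=path, dataset=True)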
+ s3_additional_kwargs: + Forward to s3fs, useful for server side encryption + https://s3fs.readthedocs.io/en/latest/#serverside-encryption + + Returns + ------- + Union[pandas.DataFrame, Generator[pandas.DataFrame, None, None]] + Pandas DataFrame or a Generator in case of `chunked=True`. + + Examples + -------- + Reading Parquet Table + + >>> import awswrangler as wr + >>> df = wr.s3.read_parquet_table(database='...', table='...') + + Reading Parquet Table encrypted + + >>> import awswrangler as wr + >>> df = wr.s3.read_parquet_table( + ... database='...', + ... table='...' + ... s3_additional_kwargs={ + ... 'ServerSideEncryption': 'aws:kms', + ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' + ... } + ... ) + + Reading Parquet Table in chunks (Chunk by file) + + >>> import awswrangler as wr + >>> dfs = wr.s3.read_parquet_table(database='...', table='...', chunked=True) + >>> for df in dfs: + >>> print(df) # Smaller Pandas DataFrame + + Reading in chunks (Chunk by 1MM rows) + + >>> import awswrangler as wr + >>> dfs = wr.s3.read_parquet(path=['s3://bucket/filename0.csv', 's3://bucket/filename1.csv'], chunked=1_000_000) + >>> for df in dfs: + >>> print(df) # 1MM Pandas DataFrame + + """ + path: str = catalog.get_table_location(database=database, table=table, boto3_session=boto3_session) + return read_parquet( + path=path, + filters=filters, + columns=columns, + categories=categories, + chunked=chunked, + dataset=True, + use_threads=use_threads, + boto3_session=boto3_session, + s3_additional_kwargs=s3_additional_kwargs, + ) diff --git a/awswrangler/s3/_wait.py b/awswrangler/s3/_wait.py new file mode 100644 index 000000000..c2ebc7a74 --- /dev/null +++ b/awswrangler/s3/_wait.py @@ -0,0 +1,159 @@ +"""Amazon S3 Wait Module (PRIVATE).""" + +import concurrent.futures +import logging +from itertools import repeat +from typing import List, Optional, Tuple, Union + +import boto3 # type: ignore + +from awswrangler import _utils + +_logger: logging.Logger = logging.getLogger(__name__) + + +def _wait_objects( + waiter_name: str, + paths: List[str], + delay: Optional[Union[int, float]] = None, + max_attempts: Optional[int] = None, + use_threads: bool = True, + boto3_session: Optional[boto3.Session] = None, +) -> None: + delay = 5 if delay is None else delay + max_attempts = 20 if max_attempts is None else max_attempts + _delay: int = int(delay) if isinstance(delay, float) else delay + if len(paths) < 1: + return None + client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) + _paths: List[Tuple[str, str]] = [_utils.parse_path(path=p) for p in paths] + if use_threads is False: + waiter = client_s3.get_waiter(waiter_name) + for bucket, key in _paths: + waiter.wait(Bucket=bucket, Key=key, WaiterConfig={"Delay": _delay, "MaxAttempts": max_attempts}) + else: + cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) + with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor: + list( + executor.map( + _wait_objects_concurrent, + _paths, + repeat(waiter_name), + repeat(client_s3), + repeat(_delay), + repeat(max_attempts), + ) + ) + return None + + +def _wait_objects_concurrent( + path: Tuple[str, str], waiter_name: str, client_s3: boto3.client, delay: int, max_attempts: int +) -> None: + waiter = client_s3.get_waiter(waiter_name) + bucket, key = path + waiter.wait(Bucket=bucket, Key=key, WaiterConfig={"Delay": delay, "MaxAttempts": max_attempts}) + + +def wait_objects_exist( + paths: List[str], + delay: Optional[Union[int, float]] = None, + max_attempts: Optional[int] = None, 
+ use_threads: bool = True, + boto3_session: Optional[boto3.Session] = None, +) -> None: + """Wait Amazon S3 objects exist. + + Polls S3.Client.head_object() every 5 seconds (default) until a successful + state is reached. An error is returned after 20 (default) failed checks. + https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Waiter.ObjectExists + + Note + ---- + In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). + + Parameters + ---------- + paths : List[str] + List of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + delay : Union[int,float], optional + The amount of time in seconds to wait between attempts. Default: 5 + max_attempts : int, optional + The maximum number of attempts to be made. Default: 20 + use_threads : bool + True to enable concurrent requests, False to disable multiple threads. + If enabled os.cpu_count() will be used as the max number of threads. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + None + None. + + Examples + -------- + >>> import awswrangler as wr + >>> wr.s3.wait_objects_exist(['s3://bucket/key0', 's3://bucket/key1']) # wait both objects + + """ + return _wait_objects( + waiter_name="object_exists", + paths=paths, + delay=delay, + max_attempts=max_attempts, + use_threads=use_threads, + boto3_session=boto3_session, + ) + + +def wait_objects_not_exist( + paths: List[str], + delay: Optional[Union[int, float]] = None, + max_attempts: Optional[int] = None, + use_threads: bool = True, + boto3_session: Optional[boto3.Session] = None, +) -> None: + """Wait Amazon S3 objects not exist. + + Polls S3.Client.head_object() every 5 seconds (default) until a successful + state is reached. An error is returned after 20 (default) failed checks. + https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Waiter.ObjectNotExists + + Note + ---- + In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). + + Parameters + ---------- + paths : List[str] + List of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + delay : Union[int,float], optional + The amount of time in seconds to wait between attempts. Default: 5 + max_attempts : int, optional + The maximum number of attempts to be made. Default: 20 + use_threads : bool + True to enable concurrent requests, False to disable multiple threads. + If enabled os.cpu_count() will be used as the max number of threads. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + None + None. 
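A common pattern is pairing these waiters with write or delete calls to smooth over S3's eventual consistency; for example, waiting for deletions to become visible (the keys are placeholders):

import awswrangler as wr

paths = ["s3://bucket/key0", "s3://bucket/key1"]
wr.s3.delete_objects(paths)
wr.s3.wait_objects_not_exist(paths, delay=1, max_attempts=10)  # poll head_object until both keys are gone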
+ + Examples + -------- + >>> import awswrangler as wr + >>> wr.s3.wait_objects_not_exist(['s3://bucket/key0', 's3://bucket/key1']) # wait both objects not exist + + """ + return _wait_objects( + waiter_name="object_not_exists", + paths=paths, + delay=delay, + max_attempts=max_attempts, + use_threads=use_threads, + boto3_session=boto3_session, + ) diff --git a/awswrangler/s3/_write.py b/awswrangler/s3/_write.py new file mode 100644 index 000000000..3b9737e7b --- /dev/null +++ b/awswrangler/s3/_write.py @@ -0,0 +1,1093 @@ +"""Amazon S3 Write Module (PRIVATE).""" + +import csv +import logging +import uuid +from typing import Dict, List, Optional, Tuple, Union + +import boto3 # type: ignore +import pandas as pd # type: ignore +import pyarrow as pa # type: ignore +import pyarrow.lib # type: ignore +import pyarrow.parquet # type: ignore +import s3fs # type: ignore + +from awswrangler import _data_types, _utils, catalog, exceptions +from awswrangler.s3._delete import delete_objects +from awswrangler.s3._read import read_parquet_metadata_internal + +_COMPRESSION_2_EXT: Dict[Optional[str], str] = {None: "", "gzip": ".gz", "snappy": ".snappy"} + +_logger: logging.Logger = logging.getLogger(__name__) + + +def _to_csv_dataset( + df: pd.DataFrame, + path: str, + index: bool, + sep: str, + fs: s3fs.S3FileSystem, + use_threads: bool, + mode: str, + dtype: Dict[str, str], + partition_cols: Optional[List[str]] = None, + boto3_session: Optional[boto3.Session] = None, +) -> Tuple[List[str], Dict[str, List[str]]]: + paths: List[str] = [] + partitions_values: Dict[str, List[str]] = {} + path = path if path[-1] == "/" else f"{path}/" + if mode not in ["append", "overwrite", "overwrite_partitions"]: + raise exceptions.InvalidArgumentValue( + f"{mode} is a invalid mode, please use append, overwrite or overwrite_partitions." 
+ ) + if (mode == "overwrite") or ((mode == "overwrite_partitions") and (not partition_cols)): + delete_objects(path=path, use_threads=use_threads, boto3_session=boto3_session) + df = _data_types.cast_pandas_with_athena_types(df=df, dtype=dtype) + _logger.debug("dtypes: %s", df.dtypes) + if not partition_cols: + file_path: str = f"{path}{uuid.uuid4().hex}.csv" + _to_text( + file_format="csv", + df=df, + path=file_path, + fs=fs, + quoting=csv.QUOTE_NONE, + escapechar="\\", + header=False, + date_format="%Y-%m-%d %H:%M:%S.%f", + index=index, + sep=sep, + ) + paths.append(file_path) + else: + for keys, subgroup in df.groupby(by=partition_cols, observed=True): + subgroup = subgroup.drop(partition_cols, axis="columns") + keys = (keys,) if not isinstance(keys, tuple) else keys + subdir = "/".join([f"{name}={val}" for name, val in zip(partition_cols, keys)]) + prefix: str = f"{path}{subdir}/" + if mode == "overwrite_partitions": + delete_objects(path=prefix, use_threads=use_threads, boto3_session=boto3_session) + file_path = f"{prefix}{uuid.uuid4().hex}.csv" + _to_text( + file_format="csv", + df=subgroup, + path=file_path, + fs=fs, + quoting=csv.QUOTE_NONE, + escapechar="\\", + header=False, + date_format="%Y-%m-%d %H:%M:%S.%f", + index=index, + sep=sep, + ) + paths.append(file_path) + partitions_values[prefix] = [str(k) for k in keys] + return paths, partitions_values + + +def _to_text( + file_format: str, + df: pd.DataFrame, + path: str, + fs: Optional[s3fs.S3FileSystem] = None, + boto3_session: Optional[boto3.Session] = None, + s3_additional_kwargs: Optional[Dict[str, str]] = None, + **pandas_kwargs, +) -> None: + if df.empty is True: # pragma: no cover + raise exceptions.EmptyDataFrame() + if fs is None: + fs = _utils.get_fs(session=boto3_session, s3_additional_kwargs=s3_additional_kwargs) + encoding: Optional[str] = pandas_kwargs.get("encoding", None) + newline: Optional[str] = pandas_kwargs.get("line_terminator", None) + with fs.open(path=path, mode="w", encoding=encoding, newline=newline) as f: + if file_format == "csv": + df.to_csv(f, **pandas_kwargs) + elif file_format == "json": + df.to_json(f, **pandas_kwargs) + + +def _to_parquet_dataset( + df: pd.DataFrame, + path: str, + index: bool, + compression: Optional[str], + compression_ext: str, + cpus: int, + fs: s3fs.S3FileSystem, + use_threads: bool, + mode: str, + dtype: Dict[str, str], + partition_cols: Optional[List[str]] = None, + boto3_session: Optional[boto3.Session] = None, +) -> Tuple[List[str], Dict[str, List[str]]]: + paths: List[str] = [] + partitions_values: Dict[str, List[str]] = {} + path = path if path[-1] == "/" else f"{path}/" + if mode not in ["append", "overwrite", "overwrite_partitions"]: + raise exceptions.InvalidArgumentValue( + f"{mode} is a invalid mode, please use append, overwrite or overwrite_partitions." 
+ ) + if (mode == "overwrite") or ((mode == "overwrite_partitions") and (not partition_cols)): + delete_objects(path=path, use_threads=use_threads, boto3_session=boto3_session) + df = _data_types.cast_pandas_with_athena_types(df=df, dtype=dtype) + schema: pa.Schema = _data_types.pyarrow_schema_from_pandas( + df=df, index=index, ignore_cols=partition_cols, dtype=dtype + ) + _logger.debug("schema: \n%s", schema) + if not partition_cols: + file_path: str = f"{path}{uuid.uuid4().hex}{compression_ext}.parquet" + _to_parquet_file( + df=df, schema=schema, path=file_path, index=index, compression=compression, cpus=cpus, fs=fs, dtype=dtype + ) + paths.append(file_path) + else: + for keys, subgroup in df.groupby(by=partition_cols, observed=True): + subgroup = subgroup.drop(partition_cols, axis="columns") + keys = (keys,) if not isinstance(keys, tuple) else keys + subdir = "/".join([f"{name}={val}" for name, val in zip(partition_cols, keys)]) + prefix: str = f"{path}{subdir}/" + if mode == "overwrite_partitions": + delete_objects(path=prefix, use_threads=use_threads, boto3_session=boto3_session) + file_path = f"{prefix}{uuid.uuid4().hex}{compression_ext}.parquet" + _to_parquet_file( + df=subgroup, + schema=schema, + path=file_path, + index=index, + compression=compression, + cpus=cpus, + fs=fs, + dtype=dtype, + ) + paths.append(file_path) + partitions_values[prefix] = [str(k) for k in keys] + return paths, partitions_values + + +def _to_parquet_file( + df: pd.DataFrame, + path: str, + schema: pa.Schema, + index: bool, + compression: Optional[str], + cpus: int, + fs: s3fs.S3FileSystem, + dtype: Dict[str, str], +) -> str: + table: pa.Table = pyarrow.Table.from_pandas(df=df, schema=schema, nthreads=cpus, preserve_index=index, safe=True) + for col_name, col_type in dtype.items(): + if col_name in table.column_names: + col_index = table.column_names.index(col_name) + pyarrow_dtype = _data_types.athena2pyarrow(col_type) + field = pa.field(name=col_name, type=pyarrow_dtype) + table = table.set_column(col_index, field, table.column(col_name).cast(pyarrow_dtype)) + _logger.debug("Casting column %s (%s) to %s (%s)", col_name, col_index, col_type, pyarrow_dtype) + pyarrow.parquet.write_table( + table=table, + where=path, + write_statistics=True, + use_dictionary=True, + filesystem=fs, + coerce_timestamps="ms", + compression=compression, + flavor="spark", + ) + return path + + +def to_csv( # pylint: disable=too-many-arguments,too-many-locals + df: pd.DataFrame, + path: str, + sep: str = ",", + index: bool = True, + columns: Optional[List[str]] = None, + use_threads: bool = True, + boto3_session: Optional[boto3.Session] = None, + s3_additional_kwargs: Optional[Dict[str, str]] = None, + sanitize_columns: bool = False, + dataset: bool = False, + partition_cols: Optional[List[str]] = None, + mode: Optional[str] = None, + catalog_versioning: bool = False, + database: Optional[str] = None, + table: Optional[str] = None, + dtype: Optional[Dict[str, str]] = None, + description: Optional[str] = None, + parameters: Optional[Dict[str, str]] = None, + columns_comments: Optional[Dict[str, str]] = None, + regular_partitions: bool = True, + projection_enabled: bool = False, + projection_types: Optional[Dict[str, str]] = None, + projection_ranges: Optional[Dict[str, str]] = None, + projection_values: Optional[Dict[str, str]] = None, + projection_intervals: Optional[Dict[str, str]] = None, + projection_digits: Optional[Dict[str, str]] = None, + **pandas_kwargs, +) -> Dict[str, Union[List[str], Dict[str, List[str]]]]: + """Write 
CSV file or dataset on Amazon S3. + + The concept of Dataset goes beyond the simple idea of files and enable more + complex features like partitioning, casting and catalog integration (Amazon Athena/AWS Glue Catalog). + + Note + ---- + If `dataset=True` The table name and all column names will be automatically sanitized using + `wr.catalog.sanitize_table_name` and `wr.catalog.sanitize_column_name`. + Please, pass `sanitize_columns=True` to force the same behaviour for `dataset=False`. + + Note + ---- + On `append` mode, the `parameters` will be upsert on an existing table. + + Note + ---- + In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). + + Parameters + ---------- + df: pandas.DataFrame + Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html + path : str + Amazon S3 path (e.g. s3://bucket/filename.csv). + sep : str + String of length 1. Field delimiter for the output file. + index : bool + Write row names (index). + columns : List[str], optional + Columns to write. + use_threads : bool + True to enable concurrent requests, False to disable multiple threads. + If enabled os.cpu_count() will be used as the max number of threads. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 Session will be used if boto3_session receive None. + s3_additional_kwargs: + Forward to s3fs, useful for server side encryption + https://s3fs.readthedocs.io/en/latest/#serverside-encryption + sanitize_columns : bool + True to sanitize columns names or False to keep it as is. + True value is forced if `dataset=True`. + dataset : bool + If True store a parquet dataset instead of a single file. + If True, enable all follow arguments: + partition_cols, mode, database, table, description, parameters, columns_comments, . + partition_cols: List[str], optional + List of column names that will be used to create partitions. Only takes effect if dataset=True. + mode : str, optional + ``append`` (Default), ``overwrite``, ``overwrite_partitions``. Only takes effect if dataset=True. + catalog_versioning : bool + If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. + database : str, optional + Glue/Athena catalog: Database name. + table : str, optional + Glue/Athena catalog: Table name. + dtype : Dict[str, str], optional + Dictionary of columns names and Athena/Glue types to be casted. + Useful when you have columns with undetermined or mixed data types. + (e.g. {'col name': 'bigint', 'col2 name': 'int'}) + description : str, optional + Glue/Athena catalog: Table description + parameters : Dict[str, str], optional + Glue/Athena catalog: Key/value pairs to tag the table. + columns_comments : Dict[str, str], optional + Glue/Athena catalog: + Columns names and the related comments (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}). + regular_partitions : bool + Create regular partitions (Non projected partitions) on Glue Catalog. + Disable when you will work only with Partition Projection. + Keep enabled even when working with projections is useful to keep + Redshift Spectrum working with the regular partitions. + projection_enabled : bool + Enable Partition Projection on Athena (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html) + projection_types : Optional[Dict[str, str]] + Dictionary of partitions names and Athena projections types. 
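Taken together, the projection arguments let a partitioned CSV dataset rely on Athena Partition Projection instead of registered partitions; a sketch under assumed names (bucket, database, table and the partition column are placeholders):

import awswrangler as wr
import pandas as pd

wr.s3.to_csv(
    df=pd.DataFrame({"col": [1, 2, 3], "year": [2018, 2019, 2020]}),
    path="s3://bucket/prefix",
    dataset=True,
    partition_cols=["year"],
    database="default",
    table="my_table",
    regular_partitions=False,        # rely on projection only
    projection_enabled=True,
    projection_types={"year": "integer"},
    projection_ranges={"year": "2018,2020"},
)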
+ Valid types: "enum", "integer", "date", "injected" + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html + (e.g. {'col_name': 'enum', 'col2_name': 'integer'}) + projection_ranges: Optional[Dict[str, str]] + Dictionary of partitions names and Athena projections ranges. + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html + (e.g. {'col_name': '0,10', 'col2_name': '-1,8675309'}) + projection_values: Optional[Dict[str, str]] + Dictionary of partitions names and Athena projections values. + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html + (e.g. {'col_name': 'A,B,Unknown', 'col2_name': 'foo,boo,bar'}) + projection_intervals: Optional[Dict[str, str]] + Dictionary of partitions names and Athena projections intervals. + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html + (e.g. {'col_name': '1', 'col2_name': '5'}) + projection_digits: Optional[Dict[str, str]] + Dictionary of partitions names and Athena projections digits. + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html + (e.g. {'col_name': '1', 'col2_name': '2'}) + pandas_kwargs : + keyword arguments forwarded to pandas.DataFrame.to_csv() + https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html + + Returns + ------- + None + None. + + Examples + -------- + Writing single file + + >>> import awswrangler as wr + >>> import pandas as pd + >>> wr.s3.to_csv( + ... df=pd.DataFrame({'col': [1, 2, 3]}), + ... path='s3://bucket/prefix/my_file.csv', + ... ) + { + 'paths': ['s3://bucket/prefix/my_file.csv'], + 'partitions_values': {} + } + + Writing single file encrypted with a KMS key + + >>> import awswrangler as wr + >>> import pandas as pd + >>> wr.s3.to_csv( + ... df=pd.DataFrame({'col': [1, 2, 3]}), + ... path='s3://bucket/prefix/my_file.csv', + ... s3_additional_kwargs={ + ... 'ServerSideEncryption': 'aws:kms', + ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' + ... } + ... ) + { + 'paths': ['s3://bucket/prefix/my_file.csv'], + 'partitions_values': {} + } + + Writing partitioned dataset + + >>> import awswrangler as wr + >>> import pandas as pd + >>> wr.s3.to_csv( + ... df=pd.DataFrame({ + ... 'col': [1, 2, 3], + ... 'col2': ['A', 'A', 'B'] + ... }), + ... path='s3://bucket/prefix', + ... dataset=True, + ... partition_cols=['col2'] + ... ) + { + 'paths': ['s3://.../col2=A/x.csv', 's3://.../col2=B/y.csv'], + 'partitions_values: { + 's3://.../col2=A/': ['A'], + 's3://.../col2=B/': ['B'] + } + } + + Writing dataset to S3 with metadata on Athena/Glue Catalog. + + >>> import awswrangler as wr + >>> import pandas as pd + >>> wr.s3.to_csv( + ... df=pd.DataFrame({ + ... 'col': [1, 2, 3], + ... 'col2': ['A', 'A', 'B'] + ... }), + ... path='s3://bucket/prefix', + ... dataset=True, + ... partition_cols=['col2'], + ... database='default', # Athena/Glue database + ... table='my_table' # Athena/Glue table + ... ) + { + 'paths': ['s3://.../col2=A/x.csv', 's3://.../col2=B/y.csv'], + 'partitions_values: { + 's3://.../col2=A/': ['A'], + 's3://.../col2=B/': ['B'] + } + } + + Writing dataset casting empty column data type + + >>> import awswrangler as wr + >>> import pandas as pd + >>> wr.s3.to_csv( + ... df=pd.DataFrame({ + ... 'col': [1, 2, 3], + ... 'col2': ['A', 'A', 'B'], + ... 'col3': [None, None, None] + ... }), + ... path='s3://bucket/prefix', + ... dataset=True, + ... database='default', # Athena/Glue database + ... 
table='my_table' # Athena/Glue table + ... dtype={'col3': 'date'} + ... ) + { + 'paths': ['s3://.../x.csv'], + 'partitions_values: {} + } + + """ + if (database is None) ^ (table is None): + raise exceptions.InvalidArgumentCombination( + "Please pass database and table arguments to be able to store the metadata into the Athena/Glue Catalog." + ) + if df.empty is True: + raise exceptions.EmptyDataFrame() + + partition_cols = partition_cols if partition_cols else [] + dtype = dtype if dtype else {} + partitions_values: Dict[str, List[str]] = {} + + # Sanitize table to respect Athena's standards + if (sanitize_columns is True) or (dataset is True): + df = catalog.sanitize_dataframe_columns_names(df=df) + partition_cols = [catalog.sanitize_column_name(p) for p in partition_cols] + dtype = {catalog.sanitize_column_name(k): v.lower() for k, v in dtype.items()} + df = catalog.drop_duplicated_columns(df=df) + + session: boto3.Session = _utils.ensure_session(session=boto3_session) + fs: s3fs.S3FileSystem = _utils.get_fs(session=session, s3_additional_kwargs=s3_additional_kwargs) + if dataset is False: + if partition_cols: + raise exceptions.InvalidArgumentCombination("Please, pass dataset=True to be able to use partition_cols.") + if mode is not None: + raise exceptions.InvalidArgumentCombination("Please pass dataset=True to be able to use mode.") + if columns_comments: + raise exceptions.InvalidArgumentCombination("Please pass dataset=True to be able to use columns_comments.") + if any(arg is not None for arg in (database, table, description, parameters)): + raise exceptions.InvalidArgumentCombination( + "Please pass dataset=True to be able to use any one of these " + "arguments: database, table, description, parameters, " + "columns_comments." + ) + pandas_kwargs["sep"] = sep + pandas_kwargs["index"] = index + pandas_kwargs["columns"] = columns + _to_text(file_format="csv", df=df, path=path, fs=fs, **pandas_kwargs) + paths = [path] + else: + mode = "append" if mode is None else mode + if columns: + df = df[columns] + if ( + (mode in ("append", "overwrite_partitions")) and (database is not None) and (table is not None) + ): # Fetching Catalog Types + catalog_types: Optional[Dict[str, str]] = catalog.get_table_types( + database=database, table=table, boto3_session=session + ) + if catalog_types is not None: + for k, v in catalog_types.items(): + dtype[k] = v + paths, partitions_values = _to_csv_dataset( + df=df, + path=path, + index=index, + sep=sep, + fs=fs, + use_threads=use_threads, + partition_cols=partition_cols, + dtype=dtype, + mode=mode, + boto3_session=session, + ) + if (database is not None) and (table is not None): + columns_types, partitions_types = _data_types.athena_types_from_pandas_partitioned( + df=df, index=index, partition_cols=partition_cols, dtype=dtype, index_left=True + ) + catalog.create_csv_table( + database=database, + table=table, + path=path, + columns_types=columns_types, + partitions_types=partitions_types, + description=description, + parameters=parameters, + columns_comments=columns_comments, + boto3_session=session, + mode=mode, + catalog_versioning=catalog_versioning, + sep=sep, + projection_enabled=projection_enabled, + projection_types=projection_types, + projection_ranges=projection_ranges, + projection_values=projection_values, + projection_intervals=projection_intervals, + projection_digits=projection_digits, + ) + if partitions_values and (regular_partitions is True): + _logger.debug("partitions_values:\n%s", partitions_values) + catalog.add_csv_partitions( + 
database=database, table=table, partitions_values=partitions_values, boto3_session=session, sep=sep + ) + return {"paths": paths, "partitions_values": partitions_values} + + +def to_json( + df: pd.DataFrame, + path: str, + boto3_session: Optional[boto3.Session] = None, + s3_additional_kwargs: Optional[Dict[str, str]] = None, + **pandas_kwargs, +) -> None: + """Write JSON file on Amazon S3. + + Parameters + ---------- + df: pandas.DataFrame + Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html + path : str + Amazon S3 path (e.g. s3://bucket/filename.csv). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 Session will be used if boto3_session receive None. + s3_additional_kwargs: + Forward to s3fs, useful for server side encryption + https://s3fs.readthedocs.io/en/latest/#serverside-encryption + pandas_kwargs: + keyword arguments forwarded to pandas.DataFrame.to_csv() + https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html + + Returns + ------- + None + None. + + Examples + -------- + Writing JSON file + + >>> import awswrangler as wr + >>> import pandas as pd + >>> wr.s3.to_json( + ... df=pd.DataFrame({'col': [1, 2, 3]}), + ... path='s3://bucket/filename.json', + ... ) + + Writing CSV file encrypted with a KMS key + + >>> import awswrangler as wr + >>> import pandas as pd + >>> wr.s3.to_json( + ... df=pd.DataFrame({'col': [1, 2, 3]}), + ... path='s3://bucket/filename.json', + ... s3_additional_kwargs={ + ... 'ServerSideEncryption': 'aws:kms', + ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' + ... } + ... ) + + """ + return _to_text( + file_format="json", + df=df, + path=path, + boto3_session=boto3_session, + s3_additional_kwargs=s3_additional_kwargs, + **pandas_kwargs, + ) + + +def to_parquet( # pylint: disable=too-many-arguments,too-many-locals + df: pd.DataFrame, + path: str, + index: bool = False, + compression: Optional[str] = "snappy", + use_threads: bool = True, + boto3_session: Optional[boto3.Session] = None, + s3_additional_kwargs: Optional[Dict[str, str]] = None, + sanitize_columns: bool = False, + dataset: bool = False, + partition_cols: Optional[List[str]] = None, + mode: Optional[str] = None, + catalog_versioning: bool = False, + database: Optional[str] = None, + table: Optional[str] = None, + dtype: Optional[Dict[str, str]] = None, + description: Optional[str] = None, + parameters: Optional[Dict[str, str]] = None, + columns_comments: Optional[Dict[str, str]] = None, + regular_partitions: bool = True, + projection_enabled: bool = False, + projection_types: Optional[Dict[str, str]] = None, + projection_ranges: Optional[Dict[str, str]] = None, + projection_values: Optional[Dict[str, str]] = None, + projection_intervals: Optional[Dict[str, str]] = None, + projection_digits: Optional[Dict[str, str]] = None, +) -> Dict[str, Union[List[str], Dict[str, List[str]]]]: + """Write Parquet file or dataset on Amazon S3. + + The concept of Dataset goes beyond the simple idea of files and enable more + complex features like partitioning, casting and catalog integration (Amazon Athena/AWS Glue Catalog). + + Note + ---- + If `dataset=True` The table name and all column names will be automatically sanitized using + `wr.catalog.sanitize_table_name` and `wr.catalog.sanitize_column_name`. + Please, pass `sanitize_columns=True` to force the same behaviour for `dataset=False`. + + Note + ---- + On `append` mode, the `parameters` will be upsert on an existing table. 
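A minimal sketch of the `parameters` upsert behaviour on `append` mode, assuming an illustrative `default.my_table` dataset on S3:

import awswrangler as wr
import pandas as pd

df = pd.DataFrame({"col": [1, 2, 3]})

# First write: (re)creates the table and tags it with {"source": "raw"}.
wr.s3.to_parquet(
    df=df,
    path="s3://bucket/prefix",
    dataset=True,
    mode="overwrite",
    database="default",
    table="my_table",
    parameters={"source": "raw"},
)

# Append: the existing key/value pairs are kept and the new ones are upserted
# into the Glue table parameters, so the table ends up carrying both tags.
wr.s3.to_parquet(
    df=df,
    path="s3://bucket/prefix",
    dataset=True,
    mode="append",
    database="default",
    table="my_table",
    parameters={"owner": "analytics"},
)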
+ + Note + ---- + In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). + + Parameters + ---------- + df: pandas.DataFrame + Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html + path : str + S3 path (for file e.g. ``s3://bucket/prefix/filename.parquet``) (for dataset e.g. ``s3://bucket/prefix``). + index : bool + True to store the DataFrame index in file, otherwise False to ignore it. + compression: str, optional + Compression style (``None``, ``snappy``, ``gzip``). + use_threads : bool + True to enable concurrent requests, False to disable multiple threads. + If enabled os.cpu_count() will be used as the max number of threads. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + s3_additional_kwargs: + Forward to s3fs, useful for server side encryption + https://s3fs.readthedocs.io/en/latest/#serverside-encryption + sanitize_columns : bool + True to sanitize columns names or False to keep it as is. + True value is forced if `dataset=True`. + dataset : bool + If True store a parquet dataset instead of a single file. + If True, enable all follow arguments: + partition_cols, mode, database, table, description, parameters, columns_comments, . + partition_cols: List[str], optional + List of column names that will be used to create partitions. Only takes effect if dataset=True. + mode: str, optional + ``append`` (Default), ``overwrite``, ``overwrite_partitions``. Only takes effect if dataset=True. + catalog_versioning : bool + If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. + database : str, optional + Glue/Athena catalog: Database name. + table : str, optional + Glue/Athena catalog: Table name. + dtype : Dict[str, str], optional + Dictionary of columns names and Athena/Glue types to be casted. + Useful when you have columns with undetermined or mixed data types. + (e.g. {'col name': 'bigint', 'col2 name': 'int'}) + description : str, optional + Glue/Athena catalog: Table description + parameters : Dict[str, str], optional + Glue/Athena catalog: Key/value pairs to tag the table. + columns_comments : Dict[str, str], optional + Glue/Athena catalog: + Columns names and the related comments (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}). + regular_partitions : bool + Create regular partitions (Non projected partitions) on Glue Catalog. + Disable when you will work only with Partition Projection. + Keep enabled even when working with projections is useful to keep + Redshift Spectrum working with the regular partitions. + projection_enabled : bool + Enable Partition Projection on Athena (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html) + projection_types : Optional[Dict[str, str]] + Dictionary of partitions names and Athena projections types. + Valid types: "enum", "integer", "date", "injected" + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html + (e.g. {'col_name': 'enum', 'col2_name': 'integer'}) + projection_ranges: Optional[Dict[str, str]] + Dictionary of partitions names and Athena projections ranges. + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html + (e.g. {'col_name': '0,10', 'col2_name': '-1,8675309'}) + projection_values: Optional[Dict[str, str]] + Dictionary of partitions names and Athena projections values. 
+ https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html + (e.g. {'col_name': 'A,B,Unknown', 'col2_name': 'foo,boo,bar'}) + projection_intervals: Optional[Dict[str, str]] + Dictionary of partitions names and Athena projections intervals. + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html + (e.g. {'col_name': '1', 'col2_name': '5'}) + projection_digits: Optional[Dict[str, str]] + Dictionary of partitions names and Athena projections digits. + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html + (e.g. {'col_name': '1', 'col2_name': '2'}) + + Returns + ------- + Dict[str, Union[List[str], Dict[str, List[str]]]] + Dictionary with: + 'paths': List of all stored files paths on S3. + 'partitions_values': Dictionary of partitions added with keys as S3 path locations + and values as a list of partitions values as str. + + Examples + -------- + Writing single file + + >>> import awswrangler as wr + >>> import pandas as pd + >>> wr.s3.to_parquet( + ... df=pd.DataFrame({'col': [1, 2, 3]}), + ... path='s3://bucket/prefix/my_file.parquet', + ... ) + { + 'paths': ['s3://bucket/prefix/my_file.parquet'], + 'partitions_values': {} + } + + Writing single file encrypted with a KMS key + + >>> import awswrangler as wr + >>> import pandas as pd + >>> wr.s3.to_parquet( + ... df=pd.DataFrame({'col': [1, 2, 3]}), + ... path='s3://bucket/prefix/my_file.parquet', + ... s3_additional_kwargs={ + ... 'ServerSideEncryption': 'aws:kms', + ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' + ... } + ... ) + { + 'paths': ['s3://bucket/prefix/my_file.parquet'], + 'partitions_values': {} + } + + Writing partitioned dataset + + >>> import awswrangler as wr + >>> import pandas as pd + >>> wr.s3.to_parquet( + ... df=pd.DataFrame({ + ... 'col': [1, 2, 3], + ... 'col2': ['A', 'A', 'B'] + ... }), + ... path='s3://bucket/prefix', + ... dataset=True, + ... partition_cols=['col2'] + ... ) + { + 'paths': ['s3://.../col2=A/x.parquet', 's3://.../col2=B/y.parquet'], + 'partitions_values: { + 's3://.../col2=A/': ['A'], + 's3://.../col2=B/': ['B'] + } + } + + Writing dataset to S3 with metadata on Athena/Glue Catalog. + + >>> import awswrangler as wr + >>> import pandas as pd + >>> wr.s3.to_parquet( + ... df=pd.DataFrame({ + ... 'col': [1, 2, 3], + ... 'col2': ['A', 'A', 'B'] + ... }), + ... path='s3://bucket/prefix', + ... dataset=True, + ... partition_cols=['col2'], + ... database='default', # Athena/Glue database + ... table='my_table' # Athena/Glue table + ... ) + { + 'paths': ['s3://.../col2=A/x.parquet', 's3://.../col2=B/y.parquet'], + 'partitions_values: { + 's3://.../col2=A/': ['A'], + 's3://.../col2=B/': ['B'] + } + } + + Writing dataset casting empty column data type + + >>> import awswrangler as wr + >>> import pandas as pd + >>> wr.s3.to_parquet( + ... df=pd.DataFrame({ + ... 'col': [1, 2, 3], + ... 'col2': ['A', 'A', 'B'], + ... 'col3': [None, None, None] + ... }), + ... path='s3://bucket/prefix', + ... dataset=True, + ... database='default', # Athena/Glue database + ... table='my_table' # Athena/Glue table + ... dtype={'col3': 'date'} + ... ) + { + 'paths': ['s3://.../x.parquet'], + 'partitions_values: {} + } + + """ + if (database is None) ^ (table is None): + raise exceptions.InvalidArgumentCombination( + "Please pass database and table arguments to be able to store the metadata into the Athena/Glue Catalog." 
+ ) + if df.empty is True: + raise exceptions.EmptyDataFrame() + + partition_cols = partition_cols if partition_cols else [] + dtype = dtype if dtype else {} + partitions_values: Dict[str, List[str]] = {} + + # Sanitize table to respect Athena's standards + if (sanitize_columns is True) or (dataset is True): + df = catalog.sanitize_dataframe_columns_names(df=df) + partition_cols = [catalog.sanitize_column_name(p) for p in partition_cols] + dtype = {catalog.sanitize_column_name(k): v.lower() for k, v in dtype.items()} + df = catalog.drop_duplicated_columns(df=df) + + session: boto3.Session = _utils.ensure_session(session=boto3_session) + cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) + fs: s3fs.S3FileSystem = _utils.get_fs(session=session, s3_additional_kwargs=s3_additional_kwargs) + compression_ext: Optional[str] = _COMPRESSION_2_EXT.get(compression, None) + if compression_ext is None: + raise exceptions.InvalidCompression(f"{compression} is invalid, please use None, snappy or gzip.") + if dataset is False: + if path.endswith("/"): # pragma: no cover + raise exceptions.InvalidArgumentValue( + "If , the argument should be a object path, not a directory." + ) + if partition_cols: + raise exceptions.InvalidArgumentCombination("Please, pass dataset=True to be able to use partition_cols.") + if mode is not None: + raise exceptions.InvalidArgumentCombination("Please pass dataset=True to be able to use mode.") + if any(arg is not None for arg in (database, table, description, parameters)): + raise exceptions.InvalidArgumentCombination( + "Please pass dataset=True to be able to use any one of these " + "arguments: database, table, description, parameters, " + "columns_comments." + ) + df = _data_types.cast_pandas_with_athena_types(df=df, dtype=dtype) + schema: pa.Schema = _data_types.pyarrow_schema_from_pandas( + df=df, index=index, ignore_cols=partition_cols, dtype=dtype + ) + _logger.debug("schema: \n%s", schema) + paths = [ + _to_parquet_file( + df=df, path=path, schema=schema, index=index, compression=compression, cpus=cpus, fs=fs, dtype=dtype + ) + ] + else: + mode = "append" if mode is None else mode + if ( + (mode in ("append", "overwrite_partitions")) and (database is not None) and (table is not None) + ): # Fetching Catalog Types + catalog_types: Optional[Dict[str, str]] = catalog.get_table_types( + database=database, table=table, boto3_session=session + ) + if catalog_types is not None: + for k, v in catalog_types.items(): + dtype[k] = v + paths, partitions_values = _to_parquet_dataset( + df=df, + path=path, + index=index, + compression=compression, + compression_ext=compression_ext, + cpus=cpus, + fs=fs, + use_threads=use_threads, + partition_cols=partition_cols, + dtype=dtype, + mode=mode, + boto3_session=session, + ) + if (database is not None) and (table is not None): + columns_types, partitions_types = _data_types.athena_types_from_pandas_partitioned( + df=df, index=index, partition_cols=partition_cols, dtype=dtype + ) + catalog.create_parquet_table( + database=database, + table=table, + path=path, + columns_types=columns_types, + partitions_types=partitions_types, + compression=compression, + description=description, + parameters=parameters, + columns_comments=columns_comments, + boto3_session=session, + mode=mode, + catalog_versioning=catalog_versioning, + projection_enabled=projection_enabled, + projection_types=projection_types, + projection_ranges=projection_ranges, + projection_values=projection_values, + projection_intervals=projection_intervals, + 
projection_digits=projection_digits, + ) + if partitions_values and (regular_partitions is True): + _logger.debug("partitions_values:\n%s", partitions_values) + catalog.add_parquet_partitions( + database=database, + table=table, + partitions_values=partitions_values, + compression=compression, + boto3_session=session, + ) + return {"paths": paths, "partitions_values": partitions_values} + + +def store_parquet_metadata( # pylint: disable=too-many-arguments + path: str, + database: str, + table: str, + dtype: Optional[Dict[str, str]] = None, + sampling: float = 1.0, + dataset: bool = False, + use_threads: bool = True, + description: Optional[str] = None, + parameters: Optional[Dict[str, str]] = None, + columns_comments: Optional[Dict[str, str]] = None, + compression: Optional[str] = None, + mode: str = "overwrite", + catalog_versioning: bool = False, + regular_partitions: bool = True, + projection_enabled: bool = False, + projection_types: Optional[Dict[str, str]] = None, + projection_ranges: Optional[Dict[str, str]] = None, + projection_values: Optional[Dict[str, str]] = None, + projection_intervals: Optional[Dict[str, str]] = None, + projection_digits: Optional[Dict[str, str]] = None, + boto3_session: Optional[boto3.Session] = None, +) -> Tuple[Dict[str, str], Optional[Dict[str, str]], Optional[Dict[str, List[str]]]]: + """Infer and store parquet metadata on AWS Glue Catalog. + + Infer Apache Parquet file(s) metadata from from a received S3 prefix or list of S3 objects paths + And then stores it on AWS Glue Catalog including all inferred partitions + (No need of 'MCSK REPAIR TABLE') + + The concept of Dataset goes beyond the simple idea of files and enable more + complex features like partitioning and catalog integration (AWS Glue Catalog). + + Note + ---- + On `append` mode, the `parameters` will be upsert on an existing table. + + Note + ---- + In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + database : str + Glue/Athena catalog: Database name. + table : str + Glue/Athena catalog: Table name. + dtype : Dict[str, str], optional + Dictionary of columns names and Athena/Glue types to be casted. + Useful when you have columns with undetermined data types as partitions columns. + (e.g. {'col name': 'bigint', 'col2 name': 'int'}) + sampling : float + Random sample ratio of files that will have the metadata inspected. + Must be `0.0 < sampling <= 1.0`. + The higher, the more accurate. + The lower, the faster. + dataset: bool + If True read a parquet dataset instead of simple file(s) loading all the related partitions as columns. + use_threads : bool + True to enable concurrent requests, False to disable multiple threads. + If enabled os.cpu_count() will be used as the max number of threads. + description: str, optional + Glue/Athena catalog: Table description + parameters: Dict[str, str], optional + Glue/Athena catalog: Key/value pairs to tag the table. + columns_comments: Dict[str, str], optional + Glue/Athena catalog: + Columns names and the related comments (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}). + compression: str, optional + Compression style (``None``, ``snappy``, ``gzip``, etc). + mode: str + 'overwrite' to recreate any possible existing table or 'append' to keep any possible existing table. 
+ catalog_versioning : bool + If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. + regular_partitions : bool + Create regular partitions (Non projected partitions) on Glue Catalog. + Disable when you will work only with Partition Projection. + Keep enabled even when working with projections is useful to keep + Redshift Spectrum working with the regular partitions. + projection_enabled : bool + Enable Partition Projection on Athena (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html) + projection_types : Optional[Dict[str, str]] + Dictionary of partitions names and Athena projections types. + Valid types: "enum", "integer", "date", "injected" + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html + (e.g. {'col_name': 'enum', 'col2_name': 'integer'}) + projection_ranges: Optional[Dict[str, str]] + Dictionary of partitions names and Athena projections ranges. + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html + (e.g. {'col_name': '0,10', 'col2_name': '-1,8675309'}) + projection_values: Optional[Dict[str, str]] + Dictionary of partitions names and Athena projections values. + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html + (e.g. {'col_name': 'A,B,Unknown', 'col2_name': 'foo,boo,bar'}) + projection_intervals: Optional[Dict[str, str]] + Dictionary of partitions names and Athena projections intervals. + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html + (e.g. {'col_name': '1', 'col2_name': '5'}) + projection_digits: Optional[Dict[str, str]] + Dictionary of partitions names and Athena projections digits. + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html + (e.g. {'col_name': '1', 'col2_name': '2'}) + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + Tuple[Dict[str, str], Optional[Dict[str, str]], Optional[Dict[str, List[str]]]] + The metadata used to create the Glue Table. + columns_types: Dictionary with keys as column names and vales as + data types (e.g. {'col0': 'bigint', 'col1': 'double'}). / + partitions_types: Dictionary with keys as partition names + and values as data types (e.g. {'col2': 'date'}). / + partitions_values: Dictionary with keys as S3 path locations and values as a + list of partitions values as str (e.g. {'s3://bucket/prefix/y=2020/m=10/': ['2020', '10']}). + + Examples + -------- + Reading all Parquet files metadata under a prefix + + >>> import awswrangler as wr + >>> columns_types, partitions_types, partitions_values = wr.s3.store_parquet_metadata( + ... path='s3://bucket/prefix/', + ... database='...', + ... table='...', + ... dataset=True + ... 
) + + """ + session: boto3.Session = _utils.ensure_session(session=boto3_session) + columns_types: Dict[str, str] + partitions_types: Optional[Dict[str, str]] + partitions_values: Optional[Dict[str, List[str]]] + columns_types, partitions_types, partitions_values = read_parquet_metadata_internal( + path=path, dtype=dtype, sampling=sampling, dataset=dataset, use_threads=use_threads, boto3_session=session + ) + _logger.debug("columns_types: %s", columns_types) + _logger.debug("partitions_types: %s", partitions_types) + _logger.debug("partitions_values: %s", partitions_values) + catalog.create_parquet_table( + database=database, + table=table, + path=path, + columns_types=columns_types, + partitions_types=partitions_types, + description=description, + parameters=parameters, + columns_comments=columns_comments, + mode=mode, + catalog_versioning=catalog_versioning, + projection_enabled=projection_enabled, + projection_types=projection_types, + projection_ranges=projection_ranges, + projection_values=projection_values, + projection_intervals=projection_intervals, + projection_digits=projection_digits, + boto3_session=session, + ) + if (partitions_types is not None) and (partitions_values is not None) and (regular_partitions is True): + catalog.add_parquet_partitions( + database=database, + table=table, + partitions_values=partitions_values, + compression=compression, + boto3_session=session, + ) + return columns_types, partitions_types, partitions_values From b3837c6df107f2b7dfc33a847fa7c533625c2ab1 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Thu, 11 Jun 2020 22:36:17 -0300 Subject: [PATCH 16/28] Remove in memory copy of DataFrame for to_parquet and to_csv. --- awswrangler/catalog.py | 17 ++++++++++++----- awswrangler/s3/_write.py | 4 ++-- testing/test_awswrangler/test_data_lake2.py | 7 ++----- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/awswrangler/catalog.py b/awswrangler/catalog.py index 52a46e3a7..5ea578d20 100644 --- a/awswrangler/catalog.py +++ b/awswrangler/catalog.py @@ -886,6 +886,10 @@ def sanitize_table_name(table: str) -> str: def drop_duplicated_columns(df: pd.DataFrame) -> pd.DataFrame: """Drop all repeated columns (duplicated names). + Note + ---- + This transformation will run `inplace` and will make changes in the original DataFrame. + Note ---- It is different from Panda's drop_duplicates() function which considers the column values. 
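A minimal sketch of the in-place behaviour described in the Note above, assuming a DataFrame with a repeated column name:

import awswrangler as wr
import pandas as pd

df = pd.DataFrame([[1, 2]], columns=["c0", "c0"])  # duplicated column *names*, not values

# Keeps only the first occurrence of each repeated name; after this patch the
# original DataFrame is modified in place (it is still returned for convenience).
wr.catalog.drop_duplicated_columns(df=df)
print(df.columns.tolist())  # ['c0']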
@@ -912,11 +916,14 @@ def drop_duplicated_columns(df: pd.DataFrame) -> pd.DataFrame: 1 2 """ - duplicated_cols = df.columns.duplicated() - duplicated_cols_names: List[str] = list(df.columns[duplicated_cols]) - if len(duplicated_cols_names) > 0: - _logger.warning("Dropping repeated columns: %s", duplicated_cols_names) - return df.loc[:, ~duplicated_cols] + duplicated = df.columns.duplicated() + if duplicated.any(): + _logger.warning("Dropping duplicated columns...") + columns = df.columns.values + columns[duplicated] = "AWSDataWranglerDuplicatedMarker" + df.columns = columns + df.drop(columns="AWSDataWranglerDuplicatedMarker", inplace=True) + return df def get_connection( diff --git a/awswrangler/s3/_write.py b/awswrangler/s3/_write.py index 3b9737e7b..c51d4ea14 100644 --- a/awswrangler/s3/_write.py +++ b/awswrangler/s3/_write.py @@ -443,7 +443,7 @@ def to_csv( # pylint: disable=too-many-arguments,too-many-locals df = catalog.sanitize_dataframe_columns_names(df=df) partition_cols = [catalog.sanitize_column_name(p) for p in partition_cols] dtype = {catalog.sanitize_column_name(k): v.lower() for k, v in dtype.items()} - df = catalog.drop_duplicated_columns(df=df) + catalog.drop_duplicated_columns(df=df) session: boto3.Session = _utils.ensure_session(session=boto3_session) fs: s3fs.S3FileSystem = _utils.get_fs(session=session, s3_additional_kwargs=s3_additional_kwargs) @@ -829,7 +829,7 @@ def to_parquet( # pylint: disable=too-many-arguments,too-many-locals df = catalog.sanitize_dataframe_columns_names(df=df) partition_cols = [catalog.sanitize_column_name(p) for p in partition_cols] dtype = {catalog.sanitize_column_name(k): v.lower() for k, v in dtype.items()} - df = catalog.drop_duplicated_columns(df=df) + catalog.drop_duplicated_columns(df=df) session: boto3.Session = _utils.ensure_session(session=boto3_session) cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) diff --git a/testing/test_awswrangler/test_data_lake2.py b/testing/test_awswrangler/test_data_lake2.py index 7993d2905..05400a615 100644 --- a/testing/test_awswrangler/test_data_lake2.py +++ b/testing/test_awswrangler/test_data_lake2.py @@ -103,11 +103,10 @@ def test_json_chunksize(path): def test_parquet_cast_string(path): df = pd.DataFrame({"id": [1, 2, 3], "value": ["foo", "boo", "bar"]}) path_file = f"{path}0.parquet" - wr.s3.to_parquet(df, path_file, dtype={"id": "string"}) + wr.s3.to_parquet(df, path_file, dtype={"id": "string"}, sanitize_columns=False) wr.s3.wait_objects_exist([path_file]) df2 = wr.s3.read_parquet(path_file) assert str(df2.id.dtypes) == "string" - df2["id"] = df2["id"].astype(int) assert df.shape == df2.shape for col, row in tuple(itertools.product(df.columns, range(3))): assert df[col].iloc[row] == df2[col].iloc[row] @@ -123,8 +122,6 @@ def test_parquet_cast_string_dataset(path, partition_cols): df2 = wr.s3.read_parquet(path, dataset=True).sort_values("id", ignore_index=True) assert str(df2.id.dtypes) == "string" assert str(df2.c3.dtypes) == "string" - df2["id"] = df2["id"].astype(int) - df2["c3"] = df2["c3"].astype(float) assert df.shape == df2.shape for col, row in tuple(itertools.product(df.columns, range(3))): assert df[col].iloc[row] == df2[col].iloc[row] @@ -158,7 +155,7 @@ def test_athena_undefined_column(database): def test_to_parquet_file_sanitize(path): df = pd.DataFrame({"C0": [0, 1], "camelCase": [2, 3], "c**--2": [4, 5]}) path_file = f"{path}0.parquet" - wr.s3.to_parquet(df, path_file) + wr.s3.to_parquet(df, path_file, sanitize_columns=True) wr.s3.wait_objects_exist([path_file]) df2 = 
wr.s3.read_parquet(path_file)
     assert df.shape == df2.shape


From 7c4880dd63d1983fef1ae25399373e9cf6b89ed5 Mon Sep 17 00:00:00 2001
From: igorborgest
Date: Wed, 10 Jun 2020 20:02:05 -0300
Subject: [PATCH 17/28] First quicksight codes. :rocket:

---
 awswrangler/__init__.py             |   2 +-
 awswrangler/_data_types.py          |  28 ++
 awswrangler/quicksight/__init__.py  |   9 +
 awswrangler/quicksight/_cancel.py   |  58 +++++
 awswrangler/quicksight/_create.py   | 388 ++++++++++++++++++++++++++++
 awswrangler/quicksight/_delete.py   | 330 +++++++++++++++++++++++
 awswrangler/quicksight/_describe.py | 236 +++++++++++++++++
 awswrangler/quicksight/_get.py      | 385 +++++++++++++++++++++++++++
 awswrangler/quicksight/_list.py     | 371 ++++++++++++++++++++++++++
 awswrangler/quicksight/_utils.py    |  80 ++++++
 docs/source/api.rst                 |  38 +++
 11 files changed, 1924 insertions(+), 1 deletion(-)
 create mode 100644 awswrangler/quicksight/__init__.py
 create mode 100644 awswrangler/quicksight/_cancel.py
 create mode 100644 awswrangler/quicksight/_create.py
 create mode 100644 awswrangler/quicksight/_delete.py
 create mode 100644 awswrangler/quicksight/_describe.py
 create mode 100644 awswrangler/quicksight/_get.py
 create mode 100644 awswrangler/quicksight/_list.py
 create mode 100644 awswrangler/quicksight/_utils.py

diff --git a/awswrangler/__init__.py b/awswrangler/__init__.py
index 4413ab5f4..9aff3abcd 100644
--- a/awswrangler/__init__.py
+++ b/awswrangler/__init__.py
@@ -7,7 +7,7 @@

 import logging

-from awswrangler import athena, catalog, cloudwatch, db, emr, exceptions, s3  # noqa
+from awswrangler import athena, catalog, cloudwatch, db, emr, exceptions, quicksight, s3  # noqa
 from awswrangler.__metadata__ import __description__, __license__, __title__, __version__  # noqa
 from awswrangler._utils import get_account_id  # noqa

diff --git a/awswrangler/_data_types.py b/awswrangler/_data_types.py
index 50cc0e372..e0ffcf208 100644
--- a/awswrangler/_data_types.py
+++ b/awswrangler/_data_types.py
@@ -114,6 +114,34 @@ def athena2redshift(  # pylint: disable=too-many-branches,too-many-return-statem
     raise exceptions.UnsupportedType(f"Unsupported Athena type: {dtype}")  # pragma: no cover


+def athena2quicksight(dtype: str) -> str:  # pylint: disable=too-many-branches,too-many-return-statements
+    """Athena to QuickSight data types conversion."""
+    dtype = dtype.lower()
+    if dtype == "smallint":
+        return "INTEGER"
+    if dtype in ("int", "integer"):
+        return "INTEGER"
+    if dtype == "bigint":
+        return "INTEGER"
+    if dtype == "float":
+        return "DECIMAL"
+    if dtype == "double":
+        return "DECIMAL"
+    if dtype in ("boolean", "bool"):
+        return "BOOLEAN"
+    if dtype in ("string", "char", "varchar"):
+        return "STRING"
+    if dtype == "timestamp":
+        return "DATETIME"
+    if dtype == "date":
+        return "DATETIME"
+    if dtype.startswith("decimal"):
+        return "DECIMAL"
+    if dtype in ("binary", "varbinary"):
+        return "BIT"
+    raise exceptions.UnsupportedType(f"Unsupported Athena type: {dtype}")  # pragma: no cover
+
+
 def pyarrow2athena(dtype: pa.DataType) -> str:  # pylint: disable=too-many-branches,too-many-return-statements
     """Pyarrow to Athena data types conversion."""
     if pa.types.is_int8(dtype):
diff --git a/awswrangler/quicksight/__init__.py b/awswrangler/quicksight/__init__.py
new file mode 100644
index 000000000..dfeefeed5
--- /dev/null
+++ b/awswrangler/quicksight/__init__.py
@@ -0,0 +1,9 @@
+"""Amazon QuickSight Module."""
+
+from awswrangler.quicksight._cancel import *  # noqa
+from awswrangler.quicksight._create import *  # noqa
+from awswrangler.quicksight._delete import *  # noqa
+from 
awswrangler.quicksight._describe import * # noqa +from awswrangler.quicksight._get import * # noqa +from awswrangler.quicksight._list import * # noqa +from awswrangler.quicksight._utils import list_ingestions # noqa diff --git a/awswrangler/quicksight/_cancel.py b/awswrangler/quicksight/_cancel.py new file mode 100644 index 000000000..9c1dc9fda --- /dev/null +++ b/awswrangler/quicksight/_cancel.py @@ -0,0 +1,58 @@ +"""Amazon QuickSight Cancel Module.""" + +import logging +from typing import Optional + +import boto3 # type: ignore + +from awswrangler import _utils, exceptions +from awswrangler.quicksight import _get + +_logger: logging.Logger = logging.getLogger(__name__) + + +def cancel_ingestion( + ingestion_id: str = None, + dataset_name: Optional[str] = None, + dataset_id: Optional[str] = None, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> None: + """Cancel an ongoing ingestion of data into SPICE. + + Note + ---- + You must pass a not None value for ``dataset_name`` or ``dataset_id`` argument. + + Parameters + ---------- + ingestion_id : str + Ingestion ID. + dataset_name : str, optional + Dataset name. + dataset_id : str, optional + Dataset ID. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + None + None. + + Examples + -------- + >>> import awswrangler as wr + >>> wr.quicksight.cancel_ingestion(ingestion_id="...", dataset_name="...") + """ + if (dataset_name is None) and (dataset_id is None): + raise exceptions.InvalidArgument("You must pass a not None name or dataset_id argument.") + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if account_id is None: + account_id = _utils.get_account_id(boto3_session=session) + if (dataset_id is None) and (dataset_name is not None): + dataset_id = _get.get_dataset_id(name=dataset_name, account_id=account_id, boto3_session=session) + client: boto3.client = _utils.client(service_name="quicksight", session=session) + client.cancel_ingestion(IngestionId=ingestion_id, AwsAccountId=account_id, DataSetId=dataset_id) diff --git a/awswrangler/quicksight/_create.py b/awswrangler/quicksight/_create.py new file mode 100644 index 000000000..ffaed7edb --- /dev/null +++ b/awswrangler/quicksight/_create.py @@ -0,0 +1,388 @@ +"""Amazon QuickSight Create Module.""" + +import logging +import uuid +from typing import Any, Dict, List, Optional, Union + +import boto3 # type: ignore + +from awswrangler import _utils, exceptions +from awswrangler.quicksight import _get +from awswrangler.quicksight import _utils as _qs_utils + +_logger: logging.Logger = logging.getLogger(__name__) + +_ALLOWED_ACTIONS: Dict[str, Dict[str, List[str]]] = { + "data_source": { + "allowed_to_use": [ + "quicksight:DescribeDataSource", + "quicksight:DescribeDataSourcePermissions", + "quicksight:PassDataSource", + ], + "allowed_to_manage": [ + "quicksight:DescribeDataSource", + "quicksight:DescribeDataSourcePermissions", + "quicksight:PassDataSource", + "quicksight:UpdateDataSource", + "quicksight:DeleteDataSource", + "quicksight:UpdateDataSourcePermissions", + ], + }, + "dataset": { + "allowed_to_use": [ + "quicksight:DescribeDataSet", + "quicksight:DescribeDataSetPermissions", + "quicksight:PassDataSet", + "quicksight:DescribeIngestion", + "quicksight:ListIngestions", + ], + "allowed_to_manage": [ + "quicksight:DescribeDataSet", 
+ "quicksight:DescribeDataSetPermissions", + "quicksight:PassDataSet", + "quicksight:DescribeIngestion", + "quicksight:ListIngestions", + "quicksight:UpdateDataSet", + "quicksight:DeleteDataSet", + "quicksight:CreateIngestion", + "quicksight:CancelIngestion", + "quicksight:UpdateDataSetPermissions", + ], + }, +} + + +def _generate_principal(user_name: str, account_id: str, region: str) -> str: + user_name = user_name if "/" in user_name else f"default/{user_name}" + return f"arn:aws:quicksight:{region}:{account_id}:user/{user_name}" + + +def _generate_permissions( + resource: str, + account_id: str, + boto3_session: boto3.Session, + allowed_to_use: Optional[List[str]] = None, + allowed_to_manage: Optional[List[str]] = None, +) -> List[Dict[str, Union[str, List[str]]]]: + permissions: List[Dict[str, Union[str, List[str]]]] = [] + if (allowed_to_use is None) and (allowed_to_manage is None): + return permissions + + # Forcing same principal not be in both lists at the same time. + if (allowed_to_use is not None) and (allowed_to_manage is not None): + allowed_to_use = list(set(allowed_to_use) - set(allowed_to_manage)) + + region: str = _utils.get_region_from_session(boto3_session=boto3_session) + if allowed_to_use is not None: + permissions += [ + { + "Principal": _generate_principal(user_name=user_name, account_id=account_id, region=region), + "Actions": _ALLOWED_ACTIONS[resource]["allowed_to_use"], + } + for user_name in allowed_to_use + ] + if allowed_to_manage is not None: + permissions += [ + { + "Principal": _generate_principal(user_name=user_name, account_id=account_id, region=region), + "Actions": _ALLOWED_ACTIONS[resource]["allowed_to_manage"], + } + for user_name in allowed_to_manage + ] + return permissions + + +def _generate_transformations( + rename_columns: Optional[Dict[str, str]], cast_columns_types=Optional[Dict[str, str]] +) -> List[Dict[str, Dict[str, Any]]]: + trans: List[Dict[str, Dict[str, Any]]] = [] + if rename_columns is not None: + for k, v in rename_columns.items(): + trans.append({"RenameColumnOperation": {"ColumnName": k, "NewColumnName": v}}) + if cast_columns_types is not None: + for k, v in cast_columns_types.items(): + trans.append({"CastColumnTypeOperation": {"ColumnName": k, "NewColumnType": v.upper()}}) + return trans + + +def create_athena_data_source( + name: str, + workgroup: str = "primary", + allowed_to_use: Optional[List[str]] = None, + allowed_to_manage: Optional[List[str]] = None, + tags: Optional[Dict[str, str]] = None, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> None: + """Create a QuickSight data source pointing to an Athena/Workgroup. + + Note + ---- + You will not be able to see the the data source in the console + if you not pass your user to one of the ``allowed_*`` arguments. + + Parameters + ---------- + name : str + Data source name. + workgroup : str + Athena workgroup. + tags : Dict[str, str], optional + Key/Value collection to put on the Cluster. + e.g. {"foo": "boo", "bar": "xoo"}) + allowed_to_use : optional + List of principals that will be allowed to see and use the data source. + e.g. ["John"] + allowed_to_manage : optional + List of principals that will be allowed to see, use, update and delete the data source. + e.g. ["Mary"] + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
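A minimal sketch of how the overlap handling in `_generate_permissions` looks from the caller's side; the data source and user names are illustrative:

import awswrangler as wr

# "mary" appears in both lists, so she is dropped from allowed_to_use and
# receives only the broader allowed_to_manage action set.
wr.quicksight.create_athena_data_source(
    name="athena-primary",
    workgroup="primary",
    allowed_to_use=["john", "mary"],
    allowed_to_manage=["mary"],
)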
+ + Returns + ------- + None + None. + + Examples + -------- + >>> import awswrangler as wr + >>> wr.quicksight.create_athena_data_source( + ... name="...", + ... allowed_to_manage=["john"] + ... ) + """ + session: boto3.Session = _utils.ensure_session(session=boto3_session) + client: boto3.client = _utils.client(service_name="quicksight", session=session) + if account_id is None: + account_id = _utils.get_account_id(boto3_session=session) + args: Dict[str, Any] = { + "AwsAccountId": account_id, + "DataSourceId": name, + "Name": name, + "Type": "ATHENA", + "DataSourceParameters": {"AthenaParameters": {"WorkGroup": workgroup}}, + "SslProperties": {"DisableSsl": True}, + } + permissions: List[Dict[str, Union[str, List[str]]]] = _generate_permissions( + resource="data_source", + account_id=account_id, + boto3_session=session, + allowed_to_use=allowed_to_use, + allowed_to_manage=allowed_to_manage, + ) + if permissions: + args["Permissions"] = permissions + if tags is not None: + _tags: List[Dict[str, str]] = [{"Key": k, "Value": v} for k, v in tags.items()] + args["Tags"] = _tags + client.create_data_source(**args) + + +def create_athena_dataset( + name: str, + database: Optional[str] = None, + table: Optional[str] = None, + sql: Optional[str] = None, + sql_name: str = "CustomSQL", + data_source_name: Optional[str] = None, + data_source_arn: Optional[str] = None, + import_mode: str = "DIRECT_QUERY", + allowed_to_use: Optional[List[str]] = None, + allowed_to_manage: Optional[List[str]] = None, + logical_table_alias: str = "LogicalTable", + rename_columns: Optional[Dict[str, str]] = None, + cast_columns_types: Optional[Dict[str, str]] = None, + tags: Optional[Dict[str, str]] = None, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> None: + """Create a QuickSight dataset. + + Note + ---- + You will not be able to see the the dataset in the console + if you not pass your user to one of the ``allowed_*`` arguments. + + Note + ---- + You must pass ``database``/``table`` OR ``sql`` argument. + + Note + ---- + You must pass ``data_source_name`` OR ``data_source_arn`` argument. + + Parameters + ---------- + name : str + Dataset name. + database : str + Athena's database name. + table : str + Athena's table name. + sql : str + Use a SQL query to define your table. + sql_name : str + Query name. + data_source_name : str, optional + QuickSight data source name. + data_source_arn : str, optional + QuickSight data source ARN. + import_mode : str + Indicates whether you want to import the data into SPICE. + 'SPICE'|'DIRECT_QUERY' + tags : Dict[str, str], optional + Key/Value collection to put on the Cluster. + e.g. {"foo": "boo", "bar": "xoo"}) + allowed_to_use : optional + List of principals that will be allowed to see and use the data source. + e.g. ["john", "Mary"] + allowed_to_manage : optional + List of principals that will be allowed to see, use, update and delete the data source. + e.g. ["Mary"] + logical_table_alias : str + A display name for the logical table. + rename_columns : Dict[str, str], optional + Dictionary to map column renames. e.g. {"old_name": "new_name", "old_name2": "new_name2"} + cast_columns_types : Dict[str, str], optional + Dictionary to map column casts. e.g. {"col_name": "STRING", "col_name2": "DECIMAL"} + Valid types: 'STRING'|'INTEGER'|'DECIMAL'|'DATETIME' + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. 
The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + None + None. + + Examples + -------- + >>> import awswrangler as wr + >>> wr.quicksight.create_athena_dataset( + ... name="...", + ... database="..." + ... table="..." + ... data_source_name="..." + ... allowed_to_manage=["Mary"] + ... ) + """ + if (data_source_name is None) and (data_source_arn is None): + raise exceptions.InvalidArgument("You must pass a not None data_source_name or data_source_arn argument.") + if ((database is None) and (table is None)) and (sql is None): + raise exceptions.InvalidArgument("You must pass database/table OR sql argument.") + session: boto3.Session = _utils.ensure_session(session=boto3_session) + client: boto3.client = _utils.client(service_name="quicksight", session=session) + if account_id is None: + account_id = _utils.get_account_id(boto3_session=session) + if (data_source_arn is None) and (data_source_name is not None): + data_source_arn = _get.get_data_source_arn(name=data_source_name, account_id=account_id, boto3_session=session) + if sql is not None: + physical_table: Dict[str, Dict[str, Any]] = { + "CustomSql": { + "DataSourceArn": data_source_arn, + "Name": sql_name, + "SqlQuery": sql, + "Columns": _qs_utils.extract_athena_query_columns( + sql=sql, + data_source_arn=data_source_arn, # type: ignore + account_id=account_id, + boto3_session=session, + ), + } + } + else: + physical_table = { + "RelationalTable": { + "DataSourceArn": data_source_arn, + "Schema": database, + "Name": table, + "InputColumns": _qs_utils.extract_athena_table_columns( + database=database, # type: ignore + table=table, # type: ignore + boto3_session=session, + ), + } + } + table_uuid: str = uuid.uuid4().hex + args: Dict[str, Any] = { + "AwsAccountId": account_id, + "DataSetId": name, + "Name": name, + "ImportMode": import_mode, + "PhysicalTableMap": {table_uuid: physical_table}, + "LogicalTableMap": {table_uuid: {"Alias": logical_table_alias, "Source": {"PhysicalTableId": table_uuid}}}, + } + trans: List[Dict[str, Dict[str, Any]]] = _generate_transformations( + rename_columns=rename_columns, cast_columns_types=cast_columns_types + ) + if trans: + args["LogicalTableMap"][table_uuid]["DataTransforms"] = trans + permissions: List[Dict[str, Union[str, List[str]]]] = _generate_permissions( + resource="dataset", + account_id=account_id, + boto3_session=session, + allowed_to_use=allowed_to_use, + allowed_to_manage=allowed_to_manage, + ) + if permissions: + args["Permissions"] = permissions + if tags is not None: + _tags: List[Dict[str, str]] = [{"Key": k, "Value": v} for k, v in tags.items()] + args["Tags"] = _tags + client.create_data_set(**args) + + +def create_ingestion( + dataset_name: Optional[str] = None, + dataset_id: Optional[str] = None, + ingestion_id: Optional[str] = None, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> str: + """Create and starts a new SPICE ingestion on a dataset. + + Note + ---- + You must pass ``dataset_name`` OR ``dataset_id`` argument. + + Parameters + ---------- + dataset_name : str, optional + Dataset name. + dataset_id : str, optional + Dataset ID. + ingestion_id : str, optional + Ingestion ID. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
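A sketch of starting a SPICE ingestion and polling it until it finishes, assuming a SPICE dataset named "my_dataset" and that the ingestion description exposes an `IngestionStatus` key (as in the QuickSight DescribeIngestion API):

import time

import awswrangler as wr

ingestion_id = "my-ingestion"  # illustrative; a random UUID is generated when omitted
status = wr.quicksight.create_ingestion(dataset_name="my_dataset", ingestion_id=ingestion_id)
while status in ("INITIALIZED", "QUEUED", "RUNNING"):
    time.sleep(5)
    description = wr.quicksight.describe_ingestion(ingestion_id=ingestion_id, dataset_name="my_dataset")
    status = description["IngestionStatus"]  # assumed response key
print(status)  # 'COMPLETED', 'FAILED' or 'CANCELLED'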
+ + Returns + ------- + Current status + 'INITIALIZED'|'QUEUED'|'RUNNING'|'FAILED'|'COMPLETED'|'CANCELLED' + + Examples + -------- + >>> import awswrangler as wr + >>> status = wr.quicksight.create_ingestion("my_dataset") + """ + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if account_id is None: + account_id = _utils.get_account_id(boto3_session=session) + if (dataset_name is None) and (dataset_id is None): + raise exceptions.InvalidArgument("You must pass a not None dataset_name or dataset_id argument.") + if (dataset_id is None) and (dataset_name is not None): + dataset_id = _get.get_dataset_id(name=dataset_name, account_id=account_id, boto3_session=session) + if ingestion_id is None: + ingestion_id = uuid.uuid4().hex + client: boto3.client = _utils.client(service_name="quicksight", session=session) + response: Dict[str, Any] = client.create_ingestion( + DataSetId=dataset_id, IngestionId=ingestion_id, AwsAccountId=account_id + ) + return response["IngestionStatus"] diff --git a/awswrangler/quicksight/_delete.py b/awswrangler/quicksight/_delete.py new file mode 100644 index 000000000..6d3182706 --- /dev/null +++ b/awswrangler/quicksight/_delete.py @@ -0,0 +1,330 @@ +"""Amazon QuickSight Delete Module.""" + +import logging +from typing import Any, Callable, Dict, Optional + +import boto3 # type: ignore + +from awswrangler import _utils, exceptions +from awswrangler.quicksight import _get, _list + +_logger: logging.Logger = logging.getLogger(__name__) + + +def _delete( + func_name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None, **kwargs +) -> None: + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if account_id is None: + account_id = _utils.get_account_id(boto3_session=session) + client: boto3.client = _utils.client(service_name="quicksight", session=session) + func: Callable = getattr(client, func_name) + func(AwsAccountId=account_id, **kwargs) + + +def delete_dashboard( + name: Optional[str] = None, + dashboard_id: Optional[str] = None, + version_number: Optional[int] = None, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> None: + """Delete a dashboard. + + Note + ---- + You must pass a not None ``name`` or ``dashboard_id`` argument. + + Parameters + ---------- + name : str, optional + Dashboard name. + dashboard_id : str, optional + The ID for the dashboard. + version_number : int, optional + The version number of the dashboard. If the version number property is provided, + only the specified version of the dashboard is deleted. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + None + None. 
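A small sketch of the two lookup styles accepted above, with illustrative names; passing `version_number` removes only that version of the dashboard:

import awswrangler as wr

# By name (the dashboard ID is resolved internally) ...
wr.quicksight.delete_dashboard(name="sales-dashboard", version_number=3)

# ... or directly by ID, skipping the name lookup.
wr.quicksight.delete_dashboard(dashboard_id="sales-dashboard-id")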
+ + Examples + -------- + >>> import awswrangler as wr + >>> wr.quicksight.delete_dashboard(name="...") + """ + if (name is None) and (dashboard_id is None): + raise exceptions.InvalidArgument("You must pass a not None name or dashboard_id argument.") + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if (dashboard_id is None) and (name is not None): + dashboard_id = _get.get_dashboard_id(name=name, account_id=account_id, boto3_session=session) + args: Dict[str, Any] = { + "func_name": "delete_dashboard", + "account_id": account_id, + "boto3_session": session, + "DashboardId": dashboard_id, + } + if version_number is not None: + args["VersionNumber"] = version_number + _delete(**args) + + +def delete_dataset( + name: Optional[str] = None, + dataset_id: Optional[str] = None, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> None: + """Delete a dataset. + + Note + ---- + You must pass a not None ``name`` or ``dataset_id`` argument. + + Parameters + ---------- + name : str, optional + Dashboard name. + dataset_id : str, optional + The ID for the dataset. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + None + None. + + Examples + -------- + >>> import awswrangler as wr + >>> wr.quicksight.delete_dataset(name="...") + """ + if (name is None) and (dataset_id is None): + raise exceptions.InvalidArgument("You must pass a not None name or dataset_id argument.") + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if (dataset_id is None) and (name is not None): + dataset_id = _get.get_dataset_id(name=name, account_id=account_id, boto3_session=session) + args: Dict[str, Any] = { + "func_name": "delete_data_set", + "account_id": account_id, + "boto3_session": session, + "DataSetId": dataset_id, + } + _delete(**args) + + +def delete_data_source( + name: Optional[str] = None, + data_source_id: Optional[str] = None, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> None: + """Delete a data source. + + Note + ---- + You must pass a not None ``name`` or ``data_source_id`` argument. + + Parameters + ---------- + name : str, optional + Dashboard name. + data_source_id : str, optional + The ID for the data source. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + None + None. 
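All of the delete helpers in this module require either a name or an ID; a short sketch of the failure mode when neither is given:

import awswrangler as wr
from awswrangler import exceptions

try:
    wr.quicksight.delete_data_source()  # neither name nor data_source_id
except exceptions.InvalidArgument as ex:
    print(ex)  # You must pass a not None name or data_source_id argument.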
+ + Examples + -------- + >>> import awswrangler as wr + >>> wr.quicksight.delete_data_source(name="...") + """ + if (name is None) and (data_source_id is None): + raise exceptions.InvalidArgument("You must pass a not None name or data_source_id argument.") + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if (data_source_id is None) and (name is not None): + data_source_id = _get.get_data_source_id(name=name, account_id=account_id, boto3_session=session) + args: Dict[str, Any] = { + "func_name": "delete_data_source", + "account_id": account_id, + "boto3_session": session, + "DataSourceId": data_source_id, + } + _delete(**args) + + +def delete_template( + name: Optional[str] = None, + template_id: Optional[str] = None, + version_number: Optional[int] = None, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> None: + """Delete a tamplate. + + Note + ---- + You must pass a not None ``name`` or ``template_id`` argument. + + Parameters + ---------- + name : str, optional + Dashboard name. + template_id : str, optional + The ID for the dashboard. + version_number : int, optional + Specifies the version of the template that you want to delete. + If you don't provide a version number, it deletes all versions of the template. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + None + None. + + Examples + -------- + >>> import awswrangler as wr + >>> wr.quicksight.delete_template(name="...") + """ + if (name is None) and (template_id is None): + raise exceptions.InvalidArgument("You must pass a not None name or template_id argument.") + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if (template_id is None) and (name is not None): + template_id = _get.get_template_id(name=name, account_id=account_id, boto3_session=session) + args: Dict[str, Any] = { + "func_name": "delete_template", + "account_id": account_id, + "boto3_session": session, + "TemplateId": template_id, + } + if version_number is not None: + args["VersionNumber"] = version_number + _delete(**args) + + +def delete_all_dashboards(account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> None: + """Delete all dashboards. + + Parameters + ---------- + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + None + None. + + Examples + -------- + >>> import awswrangler as wr + >>> wr.quicksight.delete_all_dashboards() + """ + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if account_id is None: + account_id = _utils.get_account_id(boto3_session=session) + for dashboard in _list.list_dashboards(account_id=account_id, boto3_session=session): + delete_dashboard(dashboard_id=dashboard["DashboardId"], account_id=account_id, boto3_session=session) + + +def delete_all_datasets(account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> None: + """Delete all datasets. + + Parameters + ---------- + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. 
The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + None + None. + + Examples + -------- + >>> import awswrangler as wr + >>> wr.quicksight.delete_all_datasets() + """ + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if account_id is None: + account_id = _utils.get_account_id(boto3_session=session) + for dataset in _list.list_datasets(account_id=account_id, boto3_session=session): + delete_dataset(dataset_id=dataset["DataSetId"], account_id=account_id, boto3_session=session) + + +def delete_all_data_sources(account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> None: + """Delete all data sources. + + Parameters + ---------- + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + None + None. + + Examples + -------- + >>> import awswrangler as wr + >>> wr.quicksight.delete_all_data_sources() + """ + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if account_id is None: + account_id = _utils.get_account_id(boto3_session=session) + for data_source in _list.list_data_sources(account_id=account_id, boto3_session=session): + delete_data_source(data_source_id=data_source["DataSourceId"], account_id=account_id, boto3_session=session) + + +def delete_all_templates(account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> None: + """Delete all templates. + + Parameters + ---------- + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + None + None. + + Examples + -------- + >>> import awswrangler as wr + >>> wr.quicksight.delete_all_templates() + """ + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if account_id is None: + account_id = _utils.get_account_id(boto3_session=session) + for template in _list.list_templates(account_id=account_id, boto3_session=session): + delete_template(template_id=template["TemplateId"], account_id=account_id, boto3_session=session) diff --git a/awswrangler/quicksight/_describe.py b/awswrangler/quicksight/_describe.py new file mode 100644 index 000000000..4eb387b51 --- /dev/null +++ b/awswrangler/quicksight/_describe.py @@ -0,0 +1,236 @@ +"""Amazon QuickSight Describe Module.""" + +import logging +from typing import Any, Dict, Optional + +import boto3 # type: ignore + +from awswrangler import _utils, exceptions +from awswrangler.quicksight import _get + +_logger: logging.Logger = logging.getLogger(__name__) + + +def describe_dashboard( + name: Optional[str] = None, + dashboard_id: Optional[str] = None, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> Dict[str, Any]: + """Describe a QuickSight dashboard by name or ID. + + Note + ---- + You must pass a not None ``name`` or ``dashboard_id`` argument. + + Parameters + ---------- + name : str, optional + Dashboard name. + dashboard_id : str, optional + Dashboard ID. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
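Taken together, the `delete_all_*` helpers defined above reduce a full QuickSight clean-up to a few lines; a sketch, assuming the account is resolved from the default boto3 session:

import awswrangler as wr

# Removing dashboards before the datasets and data sources they are built on
# keeps the teardown free of dangling references.
wr.quicksight.delete_all_dashboards()
wr.quicksight.delete_all_datasets()
wr.quicksight.delete_all_data_sources()
wr.quicksight.delete_all_templates()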
+ + Returns + ------- + Dict[str, Any] + Dashboad Description. + + Examples + -------- + >>> import awswrangler as wr + >>> description = wr.quicksight.describe_dashboard(name="my-dashboard") + """ + if (name is None) and (dashboard_id is None): + raise exceptions.InvalidArgument("You must pass a not None name or dashboard_id argument.") + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if account_id is None: + account_id = _utils.get_account_id(boto3_session=session) + if (dashboard_id is None) and (name is not None): + dashboard_id = _get.get_dashboard_id(name=name, account_id=account_id, boto3_session=session) + client: boto3.client = _utils.client(service_name="quicksight", session=session) + return client.describe_dashboard(AwsAccountId=account_id, DashboardId=dashboard_id)["Dashboard"] + + +def describe_data_source( + name: Optional[str] = None, + data_source_id: Optional[str] = None, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> Dict[str, Any]: + """Describe a QuickSight data source by name or ID. + + Note + ---- + You must pass a not None ``name`` or ``data_source_id`` argument. + + Parameters + ---------- + name : str, optional + Data source name. + data_source_id : str, optional + Data source ID. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + Dict[str, Any] + Data source Description. + + Examples + -------- + >>> import awswrangler as wr + >>> description = wr.quicksight.describe_data_source("...") + """ + if (name is None) and (data_source_id is None): + raise exceptions.InvalidArgument("You must pass a not None name or data_source_id argument.") + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if account_id is None: + account_id = _utils.get_account_id(boto3_session=session) + if (data_source_id is None) and (name is not None): + data_source_id = _get.get_data_source_id(name=name, account_id=account_id, boto3_session=session) + client: boto3.client = _utils.client(service_name="quicksight", session=session) + return client.describe_data_source(AwsAccountId=account_id, DataSourceId=data_source_id)["DataSource"] + + +def describe_data_source_permissions( + name: Optional[str] = None, + data_source_id: Optional[str] = None, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> Dict[str, Any]: + """Describe a QuickSight data source permissions by name or ID. + + Note + ---- + You must pass a not None ``name`` or ``data_source_id`` argument. + + Parameters + ---------- + name : str, optional + Data source name. + data_source_id : str, optional + Data source ID. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + Dict[str, Any] + Data source Permissions Description. 
+ + Examples + -------- + >>> import awswrangler as wr + >>> description = wr.quicksight.describe_data_source_permissions("my-data-source") + """ + if (name is None) and (data_source_id is None): + raise exceptions.InvalidArgument("You must pass a not None name or data_source_id argument.") + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if account_id is None: + account_id = _utils.get_account_id(boto3_session=session) + if (data_source_id is None) and (name is not None): + data_source_id = _get.get_data_source_id(name=name, account_id=account_id, boto3_session=session) + client: boto3.client = _utils.client(service_name="quicksight", session=session) + return client.describe_data_source_permissions(AwsAccountId=account_id, DataSourceId=data_source_id)["Permissions"] + + +def describe_dataset( + name: Optional[str] = None, + dataset_id: Optional[str] = None, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> Dict[str, Any]: + """Describe a QuickSight dataset by name or ID. + + Note + ---- + You must pass a not None ``name`` or ``dataset_id`` argument. + + Parameters + ---------- + name : str, optional + Dataset name. + dataset_id : str, optional + Dataset ID. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + Dict[str, Any] + Dataset Description. + + Examples + -------- + >>> import awswrangler as wr + >>> description = wr.quicksight.describe_dataset("my-dataset") + """ + if (name is None) and (dataset_id is None): + raise exceptions.InvalidArgument("You must pass a not None name or dataset_id argument.") + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if account_id is None: + account_id = _utils.get_account_id(boto3_session=session) + if (dataset_id is None) and (name is not None): + dataset_id = _get.get_dataset_id(name=name, account_id=account_id, boto3_session=session) + client: boto3.client = _utils.client(service_name="quicksight", session=session) + return client.describe_data_set(AwsAccountId=account_id, DataSetId=dataset_id)["DataSet"] + + +def describe_ingestion( + ingestion_id: str = None, + dataset_name: Optional[str] = None, + dataset_id: Optional[str] = None, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> Dict[str, Any]: + """Describe a QuickSight ingestion by ID. + + Note + ---- + You must pass a not None value for ``dataset_name`` or ``dataset_id`` argument. + + Parameters + ---------- + ingestion_id : str + Ingestion ID. + dataset_name : str, optional + Dataset name. + dataset_id : str, optional + Dataset ID. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + Dict[str, Any] + Ingestion Description. 
+
+    Examples
+    --------
+    >>> import awswrangler as wr
+    >>> description = wr.quicksight.describe_ingestion(ingestion_id="...", dataset_name="...")
+    """
+    if (dataset_name is None) and (dataset_id is None):
+        raise exceptions.InvalidArgument("You must pass a not None dataset_name or dataset_id argument.")
+    session: boto3.Session = _utils.ensure_session(session=boto3_session)
+    if account_id is None:
+        account_id = _utils.get_account_id(boto3_session=session)
+    if (dataset_id is None) and (dataset_name is not None):
+        dataset_id = _get.get_dataset_id(name=dataset_name, account_id=account_id, boto3_session=session)
+    client: boto3.client = _utils.client(service_name="quicksight", session=session)
+    return client.describe_ingestion(IngestionId=ingestion_id, AwsAccountId=account_id, DataSetId=dataset_id)[
+        "Ingestion"
+    ]
diff --git a/awswrangler/quicksight/_get.py b/awswrangler/quicksight/_get.py
new file mode 100644
index 000000000..c5fc5e681
--- /dev/null
+++ b/awswrangler/quicksight/_get.py
@@ -0,0 +1,385 @@
+"""Amazon QuickSight Get Module."""
+
+import logging
+from typing import Callable, List, Optional
+
+import boto3  # type: ignore
+
+from awswrangler import exceptions
+from awswrangler.quicksight import _list
+
+_logger: logging.Logger = logging.getLogger(__name__)
+
+
+def _get_ids(
+    name: str,
+    func: Callable,
+    attr_name: str,
+    account_id: Optional[str] = None,
+    boto3_session: Optional[boto3.Session] = None,
+) -> List[str]:
+    ids: List[str] = []
+    for item in func(account_id=account_id, boto3_session=boto3_session):
+        if item["Name"] == name:
+            ids.append(item[attr_name])
+    return ids
+
+
+def _get_id(
+    name: str,
+    func: Callable,
+    attr_name: str,
+    account_id: Optional[str] = None,
+    boto3_session: Optional[boto3.Session] = None,
+) -> str:
+    ids: List[str] = _get_ids(
+        name=name, func=func, attr_name=attr_name, account_id=account_id, boto3_session=boto3_session
+    )
+    if len(ids) == 0:
+        raise exceptions.InvalidArgument(f"There is no {attr_name} related to name {name}")
+    if len(ids) > 1:
+        raise exceptions.InvalidArgument(
+            f"There are {len(ids)} {attr_name} with name {name}. "
+            f"Please pass the id argument to specify "
+            f"which one you would like to describe."
+        )
+    return ids[0]
+
+
+def get_dashboard_ids(
+    name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None
+) -> List[str]:
+    """Get QuickSight dashboard IDs given a name.
+
+    Note
+    ----
+    This function returns a list of IDs because Quicksight accepts duplicated dashboard names,
+    so you may have more than 1 ID for a given name.
+
+    Parameters
+    ----------
+    name : str
+        Dashboard name.
+    account_id : str, optional
+        If None, the account ID will be inferred from your boto3 session.
+    boto3_session : boto3.Session(), optional
+        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
+
+    Returns
+    -------
+    List[str]
+        Dashboard IDs.
+
+    Examples
+    --------
+    >>> import awswrangler as wr
+    >>> ids = wr.quicksight.get_dashboard_ids(name="...")
+    """
+    return _get_ids(
+        name=name,
+        func=_list.list_dashboards,
+        attr_name="DashboardId",
+        account_id=account_id,
+        boto3_session=boto3_session,
+    )
+
+
+def get_dashboard_id(name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> str:
+    """Get QuickSight dashboard ID given a name and fails if there is more than 1 ID associated with this name.
+
+    Parameters
+    ----------
+    name : str
+        Dashboard name.
+ account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + str + Dashboad ID. + + Examples + -------- + >>> import awswrangler as wr + >>> my_id = wr.quicksight.get_dashboard_id(name="...") + """ + return _get_id( + name=name, + func=_list.list_dashboards, + attr_name="DashboardId", + account_id=account_id, + boto3_session=boto3_session, + ) + + +def get_dataset_ids( + name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[str]: + """Get QuickSight dataset IDs given a name. + + Note + ---- + This function returns a list of ID because Quicksight accepts duplicated datasets names, + so you may have more than 1 ID for a given name. + + Parameters + ---------- + name : str + Dataset name. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[str] + Datasets IDs. + + Examples + -------- + >>> import awswrangler as wr + >>> ids = wr.quicksight.get_dataset_ids(name="...") + """ + return _get_ids( + name=name, func=_list.list_datasets, attr_name="DataSetId", account_id=account_id, boto3_session=boto3_session + ) + + +def get_dataset_id(name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> str: + """Get QuickSight Dataset ID given a name and fails if there is more than 1 ID associated with this name. + + Parameters + ---------- + name : str + Dataset name. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + str + Dataset ID. + + Examples + -------- + >>> import awswrangler as wr + >>> my_id = wr.quicksight.get_dataset_id(name="...") + """ + return _get_id( + name=name, func=_list.list_datasets, attr_name="DataSetId", account_id=account_id, boto3_session=boto3_session + ) + + +def get_data_source_ids( + name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[str]: + """Get QuickSight data source IDs given a name. + + Note + ---- + This function returns a list of ID because Quicksight accepts duplicated data source names, + so you may have more than 1 ID for a given name. + + Parameters + ---------- + name : str + Data source name. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[str] + Data source IDs. + + Examples + -------- + >>> import awswrangler as wr + >>> ids = wr.quicksight.get_data_source_ids(name="...") + """ + return _get_ids( + name=name, + func=_list.list_data_sources, + attr_name="DataSourceId", + account_id=account_id, + boto3_session=boto3_session, + ) + + +def get_data_source_id( + name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> str: + """Get QuickSight data source ID given a name and fails if there is more than 1 ID associated with this name. + + Parameters + ---------- + name : str + Data source name. 
+ account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + str + Dataset ID. + + Examples + -------- + >>> import awswrangler as wr + >>> my_id = wr.quicksight.get_data_source_id(name="...") + """ + return _get_id( + name=name, + func=_list.list_data_sources, + attr_name="DataSourceId", + account_id=account_id, + boto3_session=boto3_session, + ) + + +def get_template_ids( + name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[str]: + """Get QuickSight template IDs given a name. + + Note + ---- + This function returns a list of ID because Quicksight accepts duplicated templates names, + so you may have more than 1 ID for a given name. + + Parameters + ---------- + name : str + Template name. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[str] + Tamplate IDs. + + Examples + -------- + >>> import awswrangler as wr + >>> ids = wr.quicksight.get_template_ids(name="...") + """ + return _get_ids( + name=name, func=_list.list_templates, attr_name="TemplateId", account_id=account_id, boto3_session=boto3_session + ) + + +def get_template_id(name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> str: + """Get QuickSight template ID given a name and fails if there is more than 1 ID associated with this name. + + Parameters + ---------- + name : str + Template name. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + str + Template ID. + + Examples + -------- + >>> import awswrangler as wr + >>> my_id = wr.quicksight.get_template_id(name="...") + """ + return _get_id( + name=name, func=_list.list_templates, attr_name="TemplateId", account_id=account_id, boto3_session=boto3_session + ) + + +def get_data_source_arns( + name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[str]: + """Get QuickSight Data source ARNs given a name. + + Note + ---- + This function returns a list of ARNs because Quicksight accepts duplicated data source names, + so you may have more than 1 ARN for a given name. + + Parameters + ---------- + name : str + Data source name. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[str] + Data source ARNs. + + Examples + -------- + >>> import awswrangler as wr + >>> arns = wr.quicksight.get_data_source_arns(name="...") + """ + arns: List[str] = [] + for source in _list.list_data_sources(account_id=account_id, boto3_session=boto3_session): + if source["Name"] == name: + arns.append(source["Arn"]) + return arns + + +def get_data_source_arn( + name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> str: + """Get QuickSight data source ARN given a name and fails if there is more than 1 ARN associated with this name. 
+ + Note + ---- + This function returns a list of ARNs because Quicksight accepts duplicated data source names, + so you may have more than 1 ARN for a given name. + + Parameters + ---------- + name : str + Data source name. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + str + Data source ARN. + + Examples + -------- + >>> import awswrangler as wr + >>> arn = wr.quicksight.get_data_source_arn("...") + """ + arns: List[str] = get_data_source_arns(name=name, account_id=account_id, boto3_session=boto3_session) + if len(arns) == 0: + raise exceptions.InvalidArgument(f"There is not data source with name {name}") + if len(arns) > 1: + raise exceptions.InvalidArgument( + f"There is more than 1 data source with name {name}. " + f"Please pass the data_source_arn argument to specify " + f"which one you would like to describe." + ) + return arns[0] diff --git a/awswrangler/quicksight/_list.py b/awswrangler/quicksight/_list.py new file mode 100644 index 000000000..88b0c2d7d --- /dev/null +++ b/awswrangler/quicksight/_list.py @@ -0,0 +1,371 @@ +"""Amazon QuickSight List Module.""" + +import logging +from typing import Any, Callable, Dict, List, Optional + +import boto3 # type: ignore + +from awswrangler import _utils + +_logger: logging.Logger = logging.getLogger(__name__) + + +def _list( + func_name: str, + attr_name: str, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, + **kwargs, +) -> List[Dict[str, Any]]: + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if account_id is None: + account_id = _utils.get_account_id(boto3_session=session) + client: boto3.client = _utils.client(service_name="quicksight", session=session) + func: Callable = getattr(client, func_name) + response = func(AwsAccountId=account_id, **kwargs) + next_token: str = response.get("NextToken", None) + result: List[Dict[str, Any]] = response[attr_name] + while next_token is not None: + response = func(AwsAccountId=account_id, NextToken=next_token, **kwargs) + next_token = response.get("NextToken", None) + result += response[attr_name] + return result + + +def list_dashboards( + account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[Dict[str, Any]]: + """List dashboards in an AWS account. + + Parameters + ---------- + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[Dict[str, Any]] + Dashboards. + + Examples + -------- + >>> import awswrangler as wr + >>> dashboards = wr.quicksight.list_dashboards() + """ + return _list( + func_name="list_dashboards", + attr_name="DashboardSummaryList", + account_id=account_id, + boto3_session=boto3_session, + ) + + +def list_datasets( + account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[Dict[str, Any]]: + """List all QuickSight datasets summaries. + + Parameters + ---------- + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
+ + Returns + ------- + List[Dict[str, Any]] + Datasets summaries. + + Examples + -------- + >>> import awswrangler as wr + >>> datasets = wr.quicksight.list_datasets() + """ + return _list( + func_name="list_data_sets", attr_name="DataSetSummaries", account_id=account_id, boto3_session=boto3_session + ) + + +def list_data_sources( + account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[Dict[str, Any]]: + """List all QuickSight Data sources summaries. + + Parameters + ---------- + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[Dict[str, Any]] + Data sources summaries. + + Examples + -------- + >>> import awswrangler as wr + >>> sources = wr.quicksight.list_data_sources() + """ + return _list( + func_name="list_data_sources", attr_name="DataSources", account_id=account_id, boto3_session=boto3_session + ) + + +def list_templates( + account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[Dict[str, Any]]: + """List all QuickSight templates. + + Parameters + ---------- + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[Dict[str, Any]] + Templates summaries. + + Examples + -------- + >>> import awswrangler as wr + >>> templates = wr.quicksight.list_templates() + """ + return _list( + func_name="list_templates", attr_name="TemplateSummaryList", account_id=account_id, boto3_session=boto3_session + ) + + +def list_group_memberships( + group_name: str, + namespace: str = "default", + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> List[Dict[str, Any]]: + """List all QuickSight Group memberships. + + Parameters + ---------- + group_name : str + The name of the group that you want to see a membership list of. + namespace : str + The namespace. Currently, you should set this to default . + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[Dict[str, Any]] + Group memberships. + + Examples + -------- + >>> import awswrangler as wr + >>> memberships = wr.quicksight.list_group_memberships() + """ + return _list( + func_name="list_group_memberships", + attr_name="GroupMemberList", + account_id=account_id, + boto3_session=boto3_session, + GroupName=group_name, + Namespace=namespace, + ) + + +def list_groups( + namespace: str = "default", account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[Dict[str, Any]]: + """List all QuickSight Groups. + + Parameters + ---------- + namespace : str + The namespace. Currently, you should set this to default . + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[Dict[str, Any]] + Groups. 
+ + Examples + -------- + >>> import awswrangler as wr + >>> groups = wr.quicksight.list_groups() + """ + return _list( + func_name="list_groups", + attr_name="GroupList", + account_id=account_id, + boto3_session=boto3_session, + Namespace=namespace, + ) + + +def list_iam_policy_assignments( + status: Optional[str] = None, + namespace: str = "default", + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> List[Dict[str, Any]]: + """List IAM policy assignments in the current Amazon QuickSight account. + + Parameters + ---------- + status : str, optional + The status of the assignments. + 'ENABLED'|'DRAFT'|'DISABLED' + namespace : str + The namespace. Currently, you should set this to default . + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[Dict[str, Any]] + IAM policy assignments. + + Examples + -------- + >>> import awswrangler as wr + >>> assigns = wr.quicksight.list_iam_policy_assignments() + """ + args: Dict[str, Any] = { + "func_name": "list_iam_policy_assignments", + "attr_name": "IAMPolicyAssignments", + "account_id": account_id, + "boto3_session": boto3_session, + "Namespace": namespace, + } + if status is not None: + args["AssignmentStatus"] = status + return _list(**args) + + +def list_iam_policy_assignments_for_user( + user_name: str, + namespace: str = "default", + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> List[Dict[str, Any]]: + """List all the IAM policy assignments. + + Including the Amazon Resource Names (ARNs) for the IAM policies assigned + to the specified user and group or groups that the user belongs to. + + Parameters + ---------- + user_name : str + The name of the user. + namespace : str + The namespace. Currently, you should set this to default . + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[Dict[str, Any]] + IAM policy assignments. + + Examples + -------- + >>> import awswrangler as wr + >>> assigns = wr.quicksight.list_iam_policy_assignments_for_user() + """ + return _list( + func_name="list_iam_policy_assignments_for_user", + attr_name="ActiveAssignments", + account_id=account_id, + boto3_session=boto3_session, + UserName=user_name, + Namespace=namespace, + ) + + +def list_user_groups( + user_name: str, + namespace: str = "default", + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> List[Dict[str, Any]]: + """List the Amazon QuickSight groups that an Amazon QuickSight user is a member of. + + Parameters + ---------- + user_name: str: + The Amazon QuickSight user name that you want to list group memberships for. + namespace : str + The namespace. Currently, you should set this to default . + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[Dict[str, Any]] + Groups. 
+ + Examples + -------- + >>> import awswrangler as wr + >>> groups = wr.quicksight.list_user_groups() + """ + return _list( + func_name="list_user_groups", + attr_name="GroupList", + account_id=account_id, + boto3_session=boto3_session, + UserName=user_name, + Namespace=namespace, + ) + + +def list_users( + namespace: str = "default", account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[Dict[str, Any]]: + """Return a list of all of the Amazon QuickSight users belonging to this account. + + Parameters + ---------- + namespace : str + The namespace. Currently, you should set this to default . + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[Dict[str, Any]] + Groups. + + Examples + -------- + >>> import awswrangler as wr + >>> users = wr.quicksight.list_users() + """ + return _list( + func_name="list_users", + attr_name="UserList", + account_id=account_id, + boto3_session=boto3_session, + Namespace=namespace, + ) diff --git a/awswrangler/quicksight/_utils.py b/awswrangler/quicksight/_utils.py new file mode 100644 index 000000000..9da3d43ab --- /dev/null +++ b/awswrangler/quicksight/_utils.py @@ -0,0 +1,80 @@ +"""Internal (private) Amazon QuickSight Utilities Module.""" + +import logging +from typing import Any, Dict, List, Optional + +import boto3 # type: ignore + +from awswrangler import _data_types, _utils, athena, catalog, exceptions +from awswrangler.quicksight import _get, _list + +_logger: logging.Logger = logging.getLogger(__name__) + + +def extract_athena_table_columns(database: str, table: str, boto3_session: boto3.Session) -> List[Dict[str, str]]: + """Extract athena columns data types from table and raising an exception if not exist.""" + dtypes: Optional[Dict[str, str]] = catalog.get_table_types( + database=database, table=table, boto3_session=boto3_session + ) + if dtypes is None: + raise exceptions.InvalidArgument(f"{database}.{table} does not exist on Athena.") + return [{"Name": name, "Type": _data_types.athena2quicksight(dtype=dtype)} for name, dtype in dtypes.items()] + + +def extract_athena_query_columns( + sql: str, data_source_arn: str, account_id: str, boto3_session: boto3.Session +) -> List[Dict[str, str]]: + """Extract athena columns data types from a SQL query.""" + data_sources: List[Dict[str, Any]] = _list.list_data_sources(account_id=account_id, boto3_session=boto3_session) + data_source: Dict[str, Any] = [x for x in data_sources if x["Arn"] == data_source_arn][0] + workgroup: str = data_source["DataSourceParameters"]["AthenaParameters"]["WorkGroup"] + sql_wrapped: str = f"/* QuickSight */\nSELECT ds.* FROM ( {sql} ) ds LIMIT 0" + query_id: str = athena.start_query_execution(sql=sql_wrapped, workgroup=workgroup, boto3_session=boto3_session) + athena.wait_query(query_execution_id=query_id, boto3_session=boto3_session) + dtypes: Dict[str, str] = athena.get_query_columns_types(query_execution_id=query_id, boto3_session=boto3_session) + return [{"Name": name, "Type": _data_types.athena2quicksight(dtype=dtype)} for name, dtype in dtypes.items()] + + +def list_ingestions( + dataset_name: Optional[str] = None, + dataset_id: Optional[str] = None, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> List[Dict[str, Any]]: + """List the history of SPICE ingestions for a dataset. 
+ + Parameters + ---------- + dataset_name : str, optional + Dataset name. + dataset_id : str, optional + The ID of the dataset used in the ingestion. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[Dict[str, Any]] + IAM policy assignments. + + Examples + -------- + >>> import awswrangler as wr + >>> ingestions = wr.quicksight.list_ingestions() + """ + if (dataset_name is None) and (dataset_id is None): + raise exceptions.InvalidArgument("You must pass a not None name or dataset_id argument.") + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if account_id is None: + account_id = _utils.get_account_id(boto3_session=session) + if (dataset_id is None) and (dataset_name is not None): + dataset_id = _get.get_dataset_id(name=dataset_name, account_id=account_id, boto3_session=session) + return _list._list( # pylint: disable=protected-access + func_name="list_ingestions", + attr_name="Ingestions", + account_id=account_id, + boto3_session=boto3_session, + DataSetId=dataset_id, + ) diff --git a/docs/source/api.rst b/docs/source/api.rst index 5cd8e9e3c..ef0f9f5dd 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -42,8 +42,10 @@ AWS Glue Catalog add_csv_partitions add_parquet_partitions create_csv_table + create_database create_parquet_table databases + delete_database delete_table_if_exists does_table_exist drop_duplicated_columns @@ -135,3 +137,39 @@ CloudWatch Logs run_query start_query wait_query + +Amazon QuickSight +----------------- + +.. currentmodule:: awswrangler.quicksight + +.. autosummary:: + :toctree: stubs + + cancel_ingestion + create_athena_data_source + create_athena_dataset + create_ingestion + delete_all_dashboards + delete_all_data_sources + delete_all_datasets + delete_all_templates + delete_dashboard + delete_data_source + delete_dataset + delete_template + describe_dashboard + describe_data_source + describe_data_source_permissions + describe_dataset + describe_ingestion + get_dashboard_id + get_dashboard_ids + get_data_source_arn + get_data_source_arns + get_data_source_id + get_data_source_ids + get_dataset_id + get_dataset_ids + get_template_id + get_template_ids From d74d79d252b649852f98263c530f8a7621ed8420 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Thu, 11 Jun 2020 10:52:32 -0300 Subject: [PATCH 18/28] Organizing imports in the quicksight module. 
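
The former _get.py and _list.py modules are merged into a single _get_list.py. The get_*
helpers resolve names by scanning list_* results, while list_ingestions (moved here from
_utils.py) needs get_dataset_id to turn a dataset name into an ID, so keeping the two
families in separate modules risks a circular import; the new module's docstring notes
they "MUST be together to avoid circular dependency". A minimal sketch of the dependency
this reorganization removes (module and function names are taken from this patch, the
dataset name "my-dataset" is hypothetical, and the exact import chain shown is an
illustration, not code from the diff):

    # Before: _get.py resolved names through the list helpers ...
    from awswrangler.quicksight import _list
    ids = [d["DataSetId"] for d in _list.list_datasets() if d["Name"] == "my-dataset"]

    # ... while a list-style helper such as list_ingestions needs a get helper,
    # which would pull _get back into _list and close the cycle:
    from awswrangler.quicksight import _get
    dataset_id = _get.get_dataset_id(name="my-dataset")

    # After this patch both families live in one module, so callers simply do:
    from awswrangler.quicksight._get_list import get_dataset_id, list_ingestions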
--- awswrangler/quicksight/__init__.py | 48 +- awswrangler/quicksight/_cancel.py | 4 +- awswrangler/quicksight/_create.py | 12 +- awswrangler/quicksight/_delete.py | 27 +- awswrangler/quicksight/_describe.py | 12 +- awswrangler/quicksight/_get.py | 385 -------------- awswrangler/quicksight/_get_list.py | 778 ++++++++++++++++++++++++++++ awswrangler/quicksight/_list.py | 371 ------------- awswrangler/quicksight/_utils.py | 51 +- docs/source/api.rst | 10 + docs/source/what.rst | 2 +- 11 files changed, 865 insertions(+), 835 deletions(-) delete mode 100644 awswrangler/quicksight/_get.py create mode 100644 awswrangler/quicksight/_get_list.py delete mode 100644 awswrangler/quicksight/_list.py diff --git a/awswrangler/quicksight/__init__.py b/awswrangler/quicksight/__init__.py index dfeefeed5..47b1f0b8a 100644 --- a/awswrangler/quicksight/__init__.py +++ b/awswrangler/quicksight/__init__.py @@ -1,9 +1,43 @@ """Amazon QuickSight Module.""" -from awswrangler.quicksight._cancel import * # noqa -from awswrangler.quicksight._create import * # noqa -from awswrangler.quicksight._delete import * # noqa -from awswrangler.quicksight._describe import * # noqa -from awswrangler.quicksight._get import * # noqa -from awswrangler.quicksight._list import * # noqa -from awswrangler.quicksight._utils import list_ingestions # noqa +from awswrangler.quicksight._cancel import cancel_ingestion # noqa +from awswrangler.quicksight._create import create_athena_data_source, create_athena_dataset, create_ingestion # noqa +from awswrangler.quicksight._delete import ( # noqa + delete_all_dashboards, + delete_all_data_sources, + delete_all_datasets, + delete_all_templates, + delete_dashboard, + delete_data_source, + delete_dataset, + delete_template, +) +from awswrangler.quicksight._describe import ( # noqa + describe_dashboard, + describe_data_source, + describe_data_source_permissions, + describe_dataset, + describe_ingestion, +) +from awswrangler.quicksight._get_list import ( # noqa + get_dashboard_id, + get_dashboard_ids, + get_data_source_arn, + get_data_source_arns, + get_data_source_id, + get_data_source_ids, + get_dataset_id, + get_dataset_ids, + get_template_id, + get_template_ids, + list_dashboards, + list_data_sources, + list_datasets, + list_group_memberships, + list_groups, + list_iam_policy_assignments, + list_iam_policy_assignments_for_user, + list_ingestions, + list_templates, + list_users, +) diff --git a/awswrangler/quicksight/_cancel.py b/awswrangler/quicksight/_cancel.py index 9c1dc9fda..cf27cdf45 100644 --- a/awswrangler/quicksight/_cancel.py +++ b/awswrangler/quicksight/_cancel.py @@ -6,7 +6,7 @@ import boto3 # type: ignore from awswrangler import _utils, exceptions -from awswrangler.quicksight import _get +from awswrangler.quicksight._get_list import get_dataset_id _logger: logging.Logger = logging.getLogger(__name__) @@ -53,6 +53,6 @@ def cancel_ingestion( if account_id is None: account_id = _utils.get_account_id(boto3_session=session) if (dataset_id is None) and (dataset_name is not None): - dataset_id = _get.get_dataset_id(name=dataset_name, account_id=account_id, boto3_session=session) + dataset_id = get_dataset_id(name=dataset_name, account_id=account_id, boto3_session=session) client: boto3.client = _utils.client(service_name="quicksight", session=session) client.cancel_ingestion(IngestionId=ingestion_id, AwsAccountId=account_id, DataSetId=dataset_id) diff --git a/awswrangler/quicksight/_create.py b/awswrangler/quicksight/_create.py index ffaed7edb..de41f0ecf 100644 --- 
a/awswrangler/quicksight/_create.py +++ b/awswrangler/quicksight/_create.py @@ -7,8 +7,8 @@ import boto3 # type: ignore from awswrangler import _utils, exceptions -from awswrangler.quicksight import _get -from awswrangler.quicksight import _utils as _qs_utils +from awswrangler.quicksight._get_list import get_data_source_arn, get_dataset_id +from awswrangler.quicksight._utils import extract_athena_query_columns, extract_athena_table_columns _logger: logging.Logger = logging.getLogger(__name__) @@ -279,14 +279,14 @@ def create_athena_dataset( if account_id is None: account_id = _utils.get_account_id(boto3_session=session) if (data_source_arn is None) and (data_source_name is not None): - data_source_arn = _get.get_data_source_arn(name=data_source_name, account_id=account_id, boto3_session=session) + data_source_arn = get_data_source_arn(name=data_source_name, account_id=account_id, boto3_session=session) if sql is not None: physical_table: Dict[str, Dict[str, Any]] = { "CustomSql": { "DataSourceArn": data_source_arn, "Name": sql_name, "SqlQuery": sql, - "Columns": _qs_utils.extract_athena_query_columns( + "Columns": extract_athena_query_columns( sql=sql, data_source_arn=data_source_arn, # type: ignore account_id=account_id, @@ -300,7 +300,7 @@ def create_athena_dataset( "DataSourceArn": data_source_arn, "Schema": database, "Name": table, - "InputColumns": _qs_utils.extract_athena_table_columns( + "InputColumns": extract_athena_table_columns( database=database, # type: ignore table=table, # type: ignore boto3_session=session, @@ -378,7 +378,7 @@ def create_ingestion( if (dataset_name is None) and (dataset_id is None): raise exceptions.InvalidArgument("You must pass a not None dataset_name or dataset_id argument.") if (dataset_id is None) and (dataset_name is not None): - dataset_id = _get.get_dataset_id(name=dataset_name, account_id=account_id, boto3_session=session) + dataset_id = get_dataset_id(name=dataset_name, account_id=account_id, boto3_session=session) if ingestion_id is None: ingestion_id = uuid.uuid4().hex client: boto3.client = _utils.client(service_name="quicksight", session=session) diff --git a/awswrangler/quicksight/_delete.py b/awswrangler/quicksight/_delete.py index 6d3182706..cc45e9108 100644 --- a/awswrangler/quicksight/_delete.py +++ b/awswrangler/quicksight/_delete.py @@ -6,7 +6,16 @@ import boto3 # type: ignore from awswrangler import _utils, exceptions -from awswrangler.quicksight import _get, _list +from awswrangler.quicksight._get_list import ( + get_dashboard_id, + get_data_source_id, + get_dataset_id, + get_template_id, + list_dashboards, + list_data_sources, + list_datasets, + list_templates, +) _logger: logging.Logger = logging.getLogger(__name__) @@ -63,7 +72,7 @@ def delete_dashboard( raise exceptions.InvalidArgument("You must pass a not None name or dashboard_id argument.") session: boto3.Session = _utils.ensure_session(session=boto3_session) if (dashboard_id is None) and (name is not None): - dashboard_id = _get.get_dashboard_id(name=name, account_id=account_id, boto3_session=session) + dashboard_id = get_dashboard_id(name=name, account_id=account_id, boto3_session=session) args: Dict[str, Any] = { "func_name": "delete_dashboard", "account_id": account_id, @@ -112,7 +121,7 @@ def delete_dataset( raise exceptions.InvalidArgument("You must pass a not None name or dataset_id argument.") session: boto3.Session = _utils.ensure_session(session=boto3_session) if (dataset_id is None) and (name is not None): - dataset_id = _get.get_dataset_id(name=name, 
account_id=account_id, boto3_session=session) + dataset_id = get_dataset_id(name=name, account_id=account_id, boto3_session=session) args: Dict[str, Any] = { "func_name": "delete_data_set", "account_id": account_id, @@ -159,7 +168,7 @@ def delete_data_source( raise exceptions.InvalidArgument("You must pass a not None name or data_source_id argument.") session: boto3.Session = _utils.ensure_session(session=boto3_session) if (data_source_id is None) and (name is not None): - data_source_id = _get.get_data_source_id(name=name, account_id=account_id, boto3_session=session) + data_source_id = get_data_source_id(name=name, account_id=account_id, boto3_session=session) args: Dict[str, Any] = { "func_name": "delete_data_source", "account_id": account_id, @@ -210,7 +219,7 @@ def delete_template( raise exceptions.InvalidArgument("You must pass a not None name or template_id argument.") session: boto3.Session = _utils.ensure_session(session=boto3_session) if (template_id is None) and (name is not None): - template_id = _get.get_template_id(name=name, account_id=account_id, boto3_session=session) + template_id = get_template_id(name=name, account_id=account_id, boto3_session=session) args: Dict[str, Any] = { "func_name": "delete_template", "account_id": account_id, @@ -245,7 +254,7 @@ def delete_all_dashboards(account_id: Optional[str] = None, boto3_session: Optio session: boto3.Session = _utils.ensure_session(session=boto3_session) if account_id is None: account_id = _utils.get_account_id(boto3_session=session) - for dashboard in _list.list_dashboards(account_id=account_id, boto3_session=session): + for dashboard in list_dashboards(account_id=account_id, boto3_session=session): delete_dashboard(dashboard_id=dashboard["DashboardId"], account_id=account_id, boto3_session=session) @@ -272,7 +281,7 @@ def delete_all_datasets(account_id: Optional[str] = None, boto3_session: Optiona session: boto3.Session = _utils.ensure_session(session=boto3_session) if account_id is None: account_id = _utils.get_account_id(boto3_session=session) - for dataset in _list.list_datasets(account_id=account_id, boto3_session=session): + for dataset in list_datasets(account_id=account_id, boto3_session=session): delete_dataset(dataset_id=dataset["DataSetId"], account_id=account_id, boto3_session=session) @@ -299,7 +308,7 @@ def delete_all_data_sources(account_id: Optional[str] = None, boto3_session: Opt session: boto3.Session = _utils.ensure_session(session=boto3_session) if account_id is None: account_id = _utils.get_account_id(boto3_session=session) - for data_source in _list.list_data_sources(account_id=account_id, boto3_session=session): + for data_source in list_data_sources(account_id=account_id, boto3_session=session): delete_data_source(data_source_id=data_source["DataSourceId"], account_id=account_id, boto3_session=session) @@ -326,5 +335,5 @@ def delete_all_templates(account_id: Optional[str] = None, boto3_session: Option session: boto3.Session = _utils.ensure_session(session=boto3_session) if account_id is None: account_id = _utils.get_account_id(boto3_session=session) - for template in _list.list_templates(account_id=account_id, boto3_session=session): + for template in list_templates(account_id=account_id, boto3_session=session): delete_template(template_id=template["TemplateId"], account_id=account_id, boto3_session=session) diff --git a/awswrangler/quicksight/_describe.py b/awswrangler/quicksight/_describe.py index 4eb387b51..d46b2bfb6 100644 --- a/awswrangler/quicksight/_describe.py +++ 
b/awswrangler/quicksight/_describe.py @@ -6,7 +6,7 @@ import boto3 # type: ignore from awswrangler import _utils, exceptions -from awswrangler.quicksight import _get +from awswrangler.quicksight._get_list import get_dashboard_id, get_data_source_id, get_dataset_id _logger: logging.Logger = logging.getLogger(__name__) @@ -50,7 +50,7 @@ def describe_dashboard( if account_id is None: account_id = _utils.get_account_id(boto3_session=session) if (dashboard_id is None) and (name is not None): - dashboard_id = _get.get_dashboard_id(name=name, account_id=account_id, boto3_session=session) + dashboard_id = get_dashboard_id(name=name, account_id=account_id, boto3_session=session) client: boto3.client = _utils.client(service_name="quicksight", session=session) return client.describe_dashboard(AwsAccountId=account_id, DashboardId=dashboard_id)["Dashboard"] @@ -94,7 +94,7 @@ def describe_data_source( if account_id is None: account_id = _utils.get_account_id(boto3_session=session) if (data_source_id is None) and (name is not None): - data_source_id = _get.get_data_source_id(name=name, account_id=account_id, boto3_session=session) + data_source_id = get_data_source_id(name=name, account_id=account_id, boto3_session=session) client: boto3.client = _utils.client(service_name="quicksight", session=session) return client.describe_data_source(AwsAccountId=account_id, DataSourceId=data_source_id)["DataSource"] @@ -138,7 +138,7 @@ def describe_data_source_permissions( if account_id is None: account_id = _utils.get_account_id(boto3_session=session) if (data_source_id is None) and (name is not None): - data_source_id = _get.get_data_source_id(name=name, account_id=account_id, boto3_session=session) + data_source_id = get_data_source_id(name=name, account_id=account_id, boto3_session=session) client: boto3.client = _utils.client(service_name="quicksight", session=session) return client.describe_data_source_permissions(AwsAccountId=account_id, DataSourceId=data_source_id)["Permissions"] @@ -182,7 +182,7 @@ def describe_dataset( if account_id is None: account_id = _utils.get_account_id(boto3_session=session) if (dataset_id is None) and (name is not None): - dataset_id = _get.get_dataset_id(name=name, account_id=account_id, boto3_session=session) + dataset_id = get_dataset_id(name=name, account_id=account_id, boto3_session=session) client: boto3.client = _utils.client(service_name="quicksight", session=session) return client.describe_data_set(AwsAccountId=account_id, DataSetId=dataset_id)["DataSet"] @@ -229,7 +229,7 @@ def describe_ingestion( if account_id is None: account_id = _utils.get_account_id(boto3_session=session) if (dataset_id is None) and (dataset_name is not None): - dataset_id = _get.get_dataset_id(name=dataset_name, account_id=account_id, boto3_session=session) + dataset_id = get_dataset_id(name=dataset_name, account_id=account_id, boto3_session=session) client: boto3.client = _utils.client(service_name="quicksight", session=session) return client.describe_ingestion(IngestionId=ingestion_id, AwsAccountId=account_id, DataSetId=dataset_id)[ "Ingestion" diff --git a/awswrangler/quicksight/_get.py b/awswrangler/quicksight/_get.py deleted file mode 100644 index c5fc5e681..000000000 --- a/awswrangler/quicksight/_get.py +++ /dev/null @@ -1,385 +0,0 @@ -"""Amazon QuickSight Get Module.""" - -import logging -from typing import Callable, List, Optional - -import boto3 # type: ignore - -from awswrangler import exceptions -from awswrangler.quicksight import _list - -_logger: logging.Logger = 
logging.getLogger(__name__) - - -def _get_ids( - name: str, - func: Callable, - attr_name: str, - account_id: Optional[str] = None, - boto3_session: Optional[boto3.Session] = None, -) -> List[str]: - ids: List[str] = [] - for item in func(account_id=account_id, boto3_session=boto3_session): - if item["Name"] == name: - ids.append(item[attr_name]) - return ids - - -def _get_id( - name: str, - func: Callable, - attr_name: str, - account_id: Optional[str] = None, - boto3_session: Optional[boto3.Session] = None, -) -> str: - ids: List[str] = _get_ids( - name=name, func=func, attr_name=attr_name, account_id=account_id, boto3_session=boto3_session - ) - if len(ids) == 0: - raise exceptions.InvalidArgument(f"There is no {attr_name} related with name {name}") - if len(ids) > 1: - raise exceptions.InvalidArgument( - f"There is {len(ids)} {attr_name} with name {name}. " - f"Please pass the id argument to specify " - f"which one you would like to describe." - ) - return ids[0] - - -def get_dashboard_ids( - name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None -) -> List[str]: - """Get QuickSight dashboard IDs given a name. - - Note - ---- - This function returns a list of ID because Quicksight accepts duplicated dashboard names, - so you may have more than 1 ID for a given name. - - Parameters - ---------- - name : str - Dashboard name. - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[str] - Dashboad IDs. - - Examples - -------- - >>> import awswrangler as wr - >>> ids = wr.quicksight.get_dashboard_ids(name="...") - """ - return _get_ids( - name=name, - func=_list.list_dashboards, - attr_name="DashboardId", - account_id=account_id, - boto3_session=boto3_session, - ) - - -def get_dashboard_id(name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> str: - """Get QuickSight dashboard ID given a name and fails if there is more than 1 ID associated with this name. - - Parameters - ---------- - name : str - Dashboard name. - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - str - Dashboad ID. - - Examples - -------- - >>> import awswrangler as wr - >>> my_id = wr.quicksight.get_dashboard_id(name="...") - """ - return _get_id( - name=name, - func=_list.list_dashboards, - attr_name="DashboardId", - account_id=account_id, - boto3_session=boto3_session, - ) - - -def get_dataset_ids( - name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None -) -> List[str]: - """Get QuickSight dataset IDs given a name. - - Note - ---- - This function returns a list of ID because Quicksight accepts duplicated datasets names, - so you may have more than 1 ID for a given name. - - Parameters - ---------- - name : str - Dataset name. - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[str] - Datasets IDs. 
- - Examples - -------- - >>> import awswrangler as wr - >>> ids = wr.quicksight.get_dataset_ids(name="...") - """ - return _get_ids( - name=name, func=_list.list_datasets, attr_name="DataSetId", account_id=account_id, boto3_session=boto3_session - ) - - -def get_dataset_id(name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> str: - """Get QuickSight Dataset ID given a name and fails if there is more than 1 ID associated with this name. - - Parameters - ---------- - name : str - Dataset name. - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - str - Dataset ID. - - Examples - -------- - >>> import awswrangler as wr - >>> my_id = wr.quicksight.get_dataset_id(name="...") - """ - return _get_id( - name=name, func=_list.list_datasets, attr_name="DataSetId", account_id=account_id, boto3_session=boto3_session - ) - - -def get_data_source_ids( - name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None -) -> List[str]: - """Get QuickSight data source IDs given a name. - - Note - ---- - This function returns a list of ID because Quicksight accepts duplicated data source names, - so you may have more than 1 ID for a given name. - - Parameters - ---------- - name : str - Data source name. - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[str] - Data source IDs. - - Examples - -------- - >>> import awswrangler as wr - >>> ids = wr.quicksight.get_data_source_ids(name="...") - """ - return _get_ids( - name=name, - func=_list.list_data_sources, - attr_name="DataSourceId", - account_id=account_id, - boto3_session=boto3_session, - ) - - -def get_data_source_id( - name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None -) -> str: - """Get QuickSight data source ID given a name and fails if there is more than 1 ID associated with this name. - - Parameters - ---------- - name : str - Data source name. - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - str - Dataset ID. - - Examples - -------- - >>> import awswrangler as wr - >>> my_id = wr.quicksight.get_data_source_id(name="...") - """ - return _get_id( - name=name, - func=_list.list_data_sources, - attr_name="DataSourceId", - account_id=account_id, - boto3_session=boto3_session, - ) - - -def get_template_ids( - name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None -) -> List[str]: - """Get QuickSight template IDs given a name. - - Note - ---- - This function returns a list of ID because Quicksight accepts duplicated templates names, - so you may have more than 1 ID for a given name. - - Parameters - ---------- - name : str - Template name. - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[str] - Tamplate IDs. 
- - Examples - -------- - >>> import awswrangler as wr - >>> ids = wr.quicksight.get_template_ids(name="...") - """ - return _get_ids( - name=name, func=_list.list_templates, attr_name="TemplateId", account_id=account_id, boto3_session=boto3_session - ) - - -def get_template_id(name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> str: - """Get QuickSight template ID given a name and fails if there is more than 1 ID associated with this name. - - Parameters - ---------- - name : str - Template name. - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - str - Template ID. - - Examples - -------- - >>> import awswrangler as wr - >>> my_id = wr.quicksight.get_template_id(name="...") - """ - return _get_id( - name=name, func=_list.list_templates, attr_name="TemplateId", account_id=account_id, boto3_session=boto3_session - ) - - -def get_data_source_arns( - name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None -) -> List[str]: - """Get QuickSight Data source ARNs given a name. - - Note - ---- - This function returns a list of ARNs because Quicksight accepts duplicated data source names, - so you may have more than 1 ARN for a given name. - - Parameters - ---------- - name : str - Data source name. - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[str] - Data source ARNs. - - Examples - -------- - >>> import awswrangler as wr - >>> arns = wr.quicksight.get_data_source_arns(name="...") - """ - arns: List[str] = [] - for source in _list.list_data_sources(account_id=account_id, boto3_session=boto3_session): - if source["Name"] == name: - arns.append(source["Arn"]) - return arns - - -def get_data_source_arn( - name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None -) -> str: - """Get QuickSight data source ARN given a name and fails if there is more than 1 ARN associated with this name. - - Note - ---- - This function returns a list of ARNs because Quicksight accepts duplicated data source names, - so you may have more than 1 ARN for a given name. - - Parameters - ---------- - name : str - Data source name. - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - str - Data source ARN. - - Examples - -------- - >>> import awswrangler as wr - >>> arn = wr.quicksight.get_data_source_arn("...") - """ - arns: List[str] = get_data_source_arns(name=name, account_id=account_id, boto3_session=boto3_session) - if len(arns) == 0: - raise exceptions.InvalidArgument(f"There is not data source with name {name}") - if len(arns) > 1: - raise exceptions.InvalidArgument( - f"There is more than 1 data source with name {name}. " - f"Please pass the data_source_arn argument to specify " - f"which one you would like to describe." 
- ) - return arns[0] diff --git a/awswrangler/quicksight/_get_list.py b/awswrangler/quicksight/_get_list.py new file mode 100644 index 000000000..98035e26e --- /dev/null +++ b/awswrangler/quicksight/_get_list.py @@ -0,0 +1,778 @@ +""" +Amazon QuickSight List and Get Module. + +List and Get MUST be together to avoid circular dependency. +""" + +import logging +from typing import Any, Callable, Dict, List, Optional + +import boto3 # type: ignore + +from awswrangler import _utils, exceptions + +_logger: logging.Logger = logging.getLogger(__name__) + + +def _list( + func_name: str, + attr_name: str, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, + **kwargs, +) -> List[Dict[str, Any]]: + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if account_id is None: + account_id = _utils.get_account_id(boto3_session=session) + client: boto3.client = _utils.client(service_name="quicksight", session=session) + func: Callable = getattr(client, func_name) + response = func(AwsAccountId=account_id, **kwargs) + next_token: str = response.get("NextToken", None) + result: List[Dict[str, Any]] = response[attr_name] + while next_token is not None: + response = func(AwsAccountId=account_id, NextToken=next_token, **kwargs) + next_token = response.get("NextToken", None) + result += response[attr_name] + return result + + +def list_dashboards( + account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[Dict[str, Any]]: + """List dashboards in an AWS account. + + Parameters + ---------- + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[Dict[str, Any]] + Dashboards. + + Examples + -------- + >>> import awswrangler as wr + >>> dashboards = wr.quicksight.list_dashboards() + """ + return _list( + func_name="list_dashboards", + attr_name="DashboardSummaryList", + account_id=account_id, + boto3_session=boto3_session, + ) + + +def list_datasets( + account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[Dict[str, Any]]: + """List all QuickSight datasets summaries. + + Parameters + ---------- + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[Dict[str, Any]] + Datasets summaries. + + Examples + -------- + >>> import awswrangler as wr + >>> datasets = wr.quicksight.list_datasets() + """ + return _list( + func_name="list_data_sets", attr_name="DataSetSummaries", account_id=account_id, boto3_session=boto3_session + ) + + +def list_data_sources( + account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[Dict[str, Any]]: + """List all QuickSight Data sources summaries. + + Parameters + ---------- + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[Dict[str, Any]] + Data sources summaries. 
+ + Examples + -------- + >>> import awswrangler as wr + >>> sources = wr.quicksight.list_data_sources() + """ + return _list( + func_name="list_data_sources", attr_name="DataSources", account_id=account_id, boto3_session=boto3_session + ) + + +def list_templates( + account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[Dict[str, Any]]: + """List all QuickSight templates. + + Parameters + ---------- + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[Dict[str, Any]] + Templates summaries. + + Examples + -------- + >>> import awswrangler as wr + >>> templates = wr.quicksight.list_templates() + """ + return _list( + func_name="list_templates", attr_name="TemplateSummaryList", account_id=account_id, boto3_session=boto3_session + ) + + +def list_group_memberships( + group_name: str, + namespace: str = "default", + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> List[Dict[str, Any]]: + """List all QuickSight Group memberships. + + Parameters + ---------- + group_name : str + The name of the group that you want to see a membership list of. + namespace : str + The namespace. Currently, you should set this to default . + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[Dict[str, Any]] + Group memberships. + + Examples + -------- + >>> import awswrangler as wr + >>> memberships = wr.quicksight.list_group_memberships() + """ + return _list( + func_name="list_group_memberships", + attr_name="GroupMemberList", + account_id=account_id, + boto3_session=boto3_session, + GroupName=group_name, + Namespace=namespace, + ) + + +def list_groups( + namespace: str = "default", account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[Dict[str, Any]]: + """List all QuickSight Groups. + + Parameters + ---------- + namespace : str + The namespace. Currently, you should set this to default . + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[Dict[str, Any]] + Groups. + + Examples + -------- + >>> import awswrangler as wr + >>> groups = wr.quicksight.list_groups() + """ + return _list( + func_name="list_groups", + attr_name="GroupList", + account_id=account_id, + boto3_session=boto3_session, + Namespace=namespace, + ) + + +def list_iam_policy_assignments( + status: Optional[str] = None, + namespace: str = "default", + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> List[Dict[str, Any]]: + """List IAM policy assignments in the current Amazon QuickSight account. + + Parameters + ---------- + status : str, optional + The status of the assignments. + 'ENABLED'|'DRAFT'|'DISABLED' + namespace : str + The namespace. Currently, you should set this to default . + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. 
The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[Dict[str, Any]] + IAM policy assignments. + + Examples + -------- + >>> import awswrangler as wr + >>> assigns = wr.quicksight.list_iam_policy_assignments() + """ + args: Dict[str, Any] = { + "func_name": "list_iam_policy_assignments", + "attr_name": "IAMPolicyAssignments", + "account_id": account_id, + "boto3_session": boto3_session, + "Namespace": namespace, + } + if status is not None: + args["AssignmentStatus"] = status + return _list(**args) + + +def list_iam_policy_assignments_for_user( + user_name: str, + namespace: str = "default", + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> List[Dict[str, Any]]: + """List all the IAM policy assignments. + + Including the Amazon Resource Names (ARNs) for the IAM policies assigned + to the specified user and group or groups that the user belongs to. + + Parameters + ---------- + user_name : str + The name of the user. + namespace : str + The namespace. Currently, you should set this to default . + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[Dict[str, Any]] + IAM policy assignments. + + Examples + -------- + >>> import awswrangler as wr + >>> assigns = wr.quicksight.list_iam_policy_assignments_for_user() + """ + return _list( + func_name="list_iam_policy_assignments_for_user", + attr_name="ActiveAssignments", + account_id=account_id, + boto3_session=boto3_session, + UserName=user_name, + Namespace=namespace, + ) + + +def list_user_groups( + user_name: str, + namespace: str = "default", + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> List[Dict[str, Any]]: + """List the Amazon QuickSight groups that an Amazon QuickSight user is a member of. + + Parameters + ---------- + user_name: str: + The Amazon QuickSight user name that you want to list group memberships for. + namespace : str + The namespace. Currently, you should set this to default . + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[Dict[str, Any]] + Groups. + + Examples + -------- + >>> import awswrangler as wr + >>> groups = wr.quicksight.list_user_groups() + """ + return _list( + func_name="list_user_groups", + attr_name="GroupList", + account_id=account_id, + boto3_session=boto3_session, + UserName=user_name, + Namespace=namespace, + ) + + +def list_users( + namespace: str = "default", account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[Dict[str, Any]]: + """Return a list of all of the Amazon QuickSight users belonging to this account. + + Parameters + ---------- + namespace : str + The namespace. Currently, you should set this to default . + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[Dict[str, Any]] + Groups. 
+ + Examples + -------- + >>> import awswrangler as wr + >>> users = wr.quicksight.list_users() + """ + return _list( + func_name="list_users", + attr_name="UserList", + account_id=account_id, + boto3_session=boto3_session, + Namespace=namespace, + ) + + +def list_ingestions( + dataset_name: Optional[str] = None, + dataset_id: Optional[str] = None, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> List[Dict[str, Any]]: + """List the history of SPICE ingestions for a dataset. + + Parameters + ---------- + dataset_name : str, optional + Dataset name. + dataset_id : str, optional + The ID of the dataset used in the ingestion. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[Dict[str, Any]] + IAM policy assignments. + + Examples + -------- + >>> import awswrangler as wr + >>> ingestions = wr.quicksight.list_ingestions() + """ + if (dataset_name is None) and (dataset_id is None): + raise exceptions.InvalidArgument("You must pass a not None name or dataset_id argument.") + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if account_id is None: + account_id = _utils.get_account_id(boto3_session=session) + if (dataset_id is None) and (dataset_name is not None): + dataset_id = get_dataset_id(name=dataset_name, account_id=account_id, boto3_session=session) + return _list( + func_name="list_ingestions", + attr_name="Ingestions", + account_id=account_id, + boto3_session=boto3_session, + DataSetId=dataset_id, + ) + + +def _get_ids( + name: str, + func: Callable, + attr_name: str, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> List[str]: + ids: List[str] = [] + for item in func(account_id=account_id, boto3_session=boto3_session): + if item["Name"] == name: + ids.append(item[attr_name]) + return ids + + +def _get_id( + name: str, + func: Callable, + attr_name: str, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> str: + ids: List[str] = _get_ids( + name=name, func=func, attr_name=attr_name, account_id=account_id, boto3_session=boto3_session + ) + if len(ids) == 0: + raise exceptions.InvalidArgument(f"There is no {attr_name} related with name {name}") + if len(ids) > 1: + raise exceptions.InvalidArgument( + f"There is {len(ids)} {attr_name} with name {name}. " + f"Please pass the id argument to specify " + f"which one you would like to describe." + ) + return ids[0] + + +def get_dashboard_ids( + name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[str]: + """Get QuickSight dashboard IDs given a name. + + Note + ---- + This function returns a list of ID because Quicksight accepts duplicated dashboard names, + so you may have more than 1 ID for a given name. + + Parameters + ---------- + name : str + Dashboard name. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[str] + Dashboad IDs. 
+ + Examples + -------- + >>> import awswrangler as wr + >>> ids = wr.quicksight.get_dashboard_ids(name="...") + """ + return _get_ids( + name=name, func=list_dashboards, attr_name="DashboardId", account_id=account_id, boto3_session=boto3_session + ) + + +def get_dashboard_id(name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> str: + """Get QuickSight dashboard ID given a name and fails if there is more than 1 ID associated with this name. + + Parameters + ---------- + name : str + Dashboard name. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + str + Dashboad ID. + + Examples + -------- + >>> import awswrangler as wr + >>> my_id = wr.quicksight.get_dashboard_id(name="...") + """ + return _get_id( + name=name, func=list_dashboards, attr_name="DashboardId", account_id=account_id, boto3_session=boto3_session + ) + + +def get_dataset_ids( + name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[str]: + """Get QuickSight dataset IDs given a name. + + Note + ---- + This function returns a list of ID because Quicksight accepts duplicated datasets names, + so you may have more than 1 ID for a given name. + + Parameters + ---------- + name : str + Dataset name. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[str] + Datasets IDs. + + Examples + -------- + >>> import awswrangler as wr + >>> ids = wr.quicksight.get_dataset_ids(name="...") + """ + return _get_ids( + name=name, func=list_datasets, attr_name="DataSetId", account_id=account_id, boto3_session=boto3_session + ) + + +def get_dataset_id(name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> str: + """Get QuickSight Dataset ID given a name and fails if there is more than 1 ID associated with this name. + + Parameters + ---------- + name : str + Dataset name. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + str + Dataset ID. + + Examples + -------- + >>> import awswrangler as wr + >>> my_id = wr.quicksight.get_dataset_id(name="...") + """ + return _get_id( + name=name, func=list_datasets, attr_name="DataSetId", account_id=account_id, boto3_session=boto3_session + ) + + +def get_data_source_ids( + name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[str]: + """Get QuickSight data source IDs given a name. + + Note + ---- + This function returns a list of ID because Quicksight accepts duplicated data source names, + so you may have more than 1 ID for a given name. + + Parameters + ---------- + name : str + Data source name. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[str] + Data source IDs. 
+ + Examples + -------- + >>> import awswrangler as wr + >>> ids = wr.quicksight.get_data_source_ids(name="...") + """ + return _get_ids( + name=name, func=list_data_sources, attr_name="DataSourceId", account_id=account_id, boto3_session=boto3_session + ) + + +def get_data_source_id( + name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> str: + """Get QuickSight data source ID given a name and fails if there is more than 1 ID associated with this name. + + Parameters + ---------- + name : str + Data source name. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + str + Dataset ID. + + Examples + -------- + >>> import awswrangler as wr + >>> my_id = wr.quicksight.get_data_source_id(name="...") + """ + return _get_id( + name=name, func=list_data_sources, attr_name="DataSourceId", account_id=account_id, boto3_session=boto3_session + ) + + +def get_template_ids( + name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[str]: + """Get QuickSight template IDs given a name. + + Note + ---- + This function returns a list of ID because Quicksight accepts duplicated templates names, + so you may have more than 1 ID for a given name. + + Parameters + ---------- + name : str + Template name. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[str] + Tamplate IDs. + + Examples + -------- + >>> import awswrangler as wr + >>> ids = wr.quicksight.get_template_ids(name="...") + """ + return _get_ids( + name=name, func=list_templates, attr_name="TemplateId", account_id=account_id, boto3_session=boto3_session + ) + + +def get_template_id(name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> str: + """Get QuickSight template ID given a name and fails if there is more than 1 ID associated with this name. + + Parameters + ---------- + name : str + Template name. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + str + Template ID. + + Examples + -------- + >>> import awswrangler as wr + >>> my_id = wr.quicksight.get_template_id(name="...") + """ + return _get_id( + name=name, func=list_templates, attr_name="TemplateId", account_id=account_id, boto3_session=boto3_session + ) + + +def get_data_source_arns( + name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[str]: + """Get QuickSight Data source ARNs given a name. + + Note + ---- + This function returns a list of ARNs because Quicksight accepts duplicated data source names, + so you may have more than 1 ARN for a given name. + + Parameters + ---------- + name : str + Data source name. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[str] + Data source ARNs. 
+ + Examples + -------- + >>> import awswrangler as wr + >>> arns = wr.quicksight.get_data_source_arns(name="...") + """ + arns: List[str] = [] + for source in list_data_sources(account_id=account_id, boto3_session=boto3_session): + if source["Name"] == name: + arns.append(source["Arn"]) + return arns + + +def get_data_source_arn( + name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> str: + """Get QuickSight data source ARN given a name and fails if there is more than 1 ARN associated with this name. + + Note + ---- + This function returns a list of ARNs because Quicksight accepts duplicated data source names, + so you may have more than 1 ARN for a given name. + + Parameters + ---------- + name : str + Data source name. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + str + Data source ARN. + + Examples + -------- + >>> import awswrangler as wr + >>> arn = wr.quicksight.get_data_source_arn("...") + """ + arns: List[str] = get_data_source_arns(name=name, account_id=account_id, boto3_session=boto3_session) + if len(arns) == 0: + raise exceptions.InvalidArgument(f"There is not data source with name {name}") + if len(arns) > 1: + raise exceptions.InvalidArgument( + f"There is more than 1 data source with name {name}. " + f"Please pass the data_source_arn argument to specify " + f"which one you would like to describe." + ) + return arns[0] diff --git a/awswrangler/quicksight/_list.py b/awswrangler/quicksight/_list.py deleted file mode 100644 index 88b0c2d7d..000000000 --- a/awswrangler/quicksight/_list.py +++ /dev/null @@ -1,371 +0,0 @@ -"""Amazon QuickSight List Module.""" - -import logging -from typing import Any, Callable, Dict, List, Optional - -import boto3 # type: ignore - -from awswrangler import _utils - -_logger: logging.Logger = logging.getLogger(__name__) - - -def _list( - func_name: str, - attr_name: str, - account_id: Optional[str] = None, - boto3_session: Optional[boto3.Session] = None, - **kwargs, -) -> List[Dict[str, Any]]: - session: boto3.Session = _utils.ensure_session(session=boto3_session) - if account_id is None: - account_id = _utils.get_account_id(boto3_session=session) - client: boto3.client = _utils.client(service_name="quicksight", session=session) - func: Callable = getattr(client, func_name) - response = func(AwsAccountId=account_id, **kwargs) - next_token: str = response.get("NextToken", None) - result: List[Dict[str, Any]] = response[attr_name] - while next_token is not None: - response = func(AwsAccountId=account_id, NextToken=next_token, **kwargs) - next_token = response.get("NextToken", None) - result += response[attr_name] - return result - - -def list_dashboards( - account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None -) -> List[Dict[str, Any]]: - """List dashboards in an AWS account. - - Parameters - ---------- - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[Dict[str, Any]] - Dashboards. 
- - Examples - -------- - >>> import awswrangler as wr - >>> dashboards = wr.quicksight.list_dashboards() - """ - return _list( - func_name="list_dashboards", - attr_name="DashboardSummaryList", - account_id=account_id, - boto3_session=boto3_session, - ) - - -def list_datasets( - account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None -) -> List[Dict[str, Any]]: - """List all QuickSight datasets summaries. - - Parameters - ---------- - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[Dict[str, Any]] - Datasets summaries. - - Examples - -------- - >>> import awswrangler as wr - >>> datasets = wr.quicksight.list_datasets() - """ - return _list( - func_name="list_data_sets", attr_name="DataSetSummaries", account_id=account_id, boto3_session=boto3_session - ) - - -def list_data_sources( - account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None -) -> List[Dict[str, Any]]: - """List all QuickSight Data sources summaries. - - Parameters - ---------- - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[Dict[str, Any]] - Data sources summaries. - - Examples - -------- - >>> import awswrangler as wr - >>> sources = wr.quicksight.list_data_sources() - """ - return _list( - func_name="list_data_sources", attr_name="DataSources", account_id=account_id, boto3_session=boto3_session - ) - - -def list_templates( - account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None -) -> List[Dict[str, Any]]: - """List all QuickSight templates. - - Parameters - ---------- - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[Dict[str, Any]] - Templates summaries. - - Examples - -------- - >>> import awswrangler as wr - >>> templates = wr.quicksight.list_templates() - """ - return _list( - func_name="list_templates", attr_name="TemplateSummaryList", account_id=account_id, boto3_session=boto3_session - ) - - -def list_group_memberships( - group_name: str, - namespace: str = "default", - account_id: Optional[str] = None, - boto3_session: Optional[boto3.Session] = None, -) -> List[Dict[str, Any]]: - """List all QuickSight Group memberships. - - Parameters - ---------- - group_name : str - The name of the group that you want to see a membership list of. - namespace : str - The namespace. Currently, you should set this to default . - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[Dict[str, Any]] - Group memberships. 
- - Examples - -------- - >>> import awswrangler as wr - >>> memberships = wr.quicksight.list_group_memberships() - """ - return _list( - func_name="list_group_memberships", - attr_name="GroupMemberList", - account_id=account_id, - boto3_session=boto3_session, - GroupName=group_name, - Namespace=namespace, - ) - - -def list_groups( - namespace: str = "default", account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None -) -> List[Dict[str, Any]]: - """List all QuickSight Groups. - - Parameters - ---------- - namespace : str - The namespace. Currently, you should set this to default . - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[Dict[str, Any]] - Groups. - - Examples - -------- - >>> import awswrangler as wr - >>> groups = wr.quicksight.list_groups() - """ - return _list( - func_name="list_groups", - attr_name="GroupList", - account_id=account_id, - boto3_session=boto3_session, - Namespace=namespace, - ) - - -def list_iam_policy_assignments( - status: Optional[str] = None, - namespace: str = "default", - account_id: Optional[str] = None, - boto3_session: Optional[boto3.Session] = None, -) -> List[Dict[str, Any]]: - """List IAM policy assignments in the current Amazon QuickSight account. - - Parameters - ---------- - status : str, optional - The status of the assignments. - 'ENABLED'|'DRAFT'|'DISABLED' - namespace : str - The namespace. Currently, you should set this to default . - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[Dict[str, Any]] - IAM policy assignments. - - Examples - -------- - >>> import awswrangler as wr - >>> assigns = wr.quicksight.list_iam_policy_assignments() - """ - args: Dict[str, Any] = { - "func_name": "list_iam_policy_assignments", - "attr_name": "IAMPolicyAssignments", - "account_id": account_id, - "boto3_session": boto3_session, - "Namespace": namespace, - } - if status is not None: - args["AssignmentStatus"] = status - return _list(**args) - - -def list_iam_policy_assignments_for_user( - user_name: str, - namespace: str = "default", - account_id: Optional[str] = None, - boto3_session: Optional[boto3.Session] = None, -) -> List[Dict[str, Any]]: - """List all the IAM policy assignments. - - Including the Amazon Resource Names (ARNs) for the IAM policies assigned - to the specified user and group or groups that the user belongs to. - - Parameters - ---------- - user_name : str - The name of the user. - namespace : str - The namespace. Currently, you should set this to default . - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[Dict[str, Any]] - IAM policy assignments. 
- - Examples - -------- - >>> import awswrangler as wr - >>> assigns = wr.quicksight.list_iam_policy_assignments_for_user() - """ - return _list( - func_name="list_iam_policy_assignments_for_user", - attr_name="ActiveAssignments", - account_id=account_id, - boto3_session=boto3_session, - UserName=user_name, - Namespace=namespace, - ) - - -def list_user_groups( - user_name: str, - namespace: str = "default", - account_id: Optional[str] = None, - boto3_session: Optional[boto3.Session] = None, -) -> List[Dict[str, Any]]: - """List the Amazon QuickSight groups that an Amazon QuickSight user is a member of. - - Parameters - ---------- - user_name: str: - The Amazon QuickSight user name that you want to list group memberships for. - namespace : str - The namespace. Currently, you should set this to default . - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[Dict[str, Any]] - Groups. - - Examples - -------- - >>> import awswrangler as wr - >>> groups = wr.quicksight.list_user_groups() - """ - return _list( - func_name="list_user_groups", - attr_name="GroupList", - account_id=account_id, - boto3_session=boto3_session, - UserName=user_name, - Namespace=namespace, - ) - - -def list_users( - namespace: str = "default", account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None -) -> List[Dict[str, Any]]: - """Return a list of all of the Amazon QuickSight users belonging to this account. - - Parameters - ---------- - namespace : str - The namespace. Currently, you should set this to default . - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[Dict[str, Any]] - Groups. 
- - Examples - -------- - >>> import awswrangler as wr - >>> users = wr.quicksight.list_users() - """ - return _list( - func_name="list_users", - attr_name="UserList", - account_id=account_id, - boto3_session=boto3_session, - Namespace=namespace, - ) diff --git a/awswrangler/quicksight/_utils.py b/awswrangler/quicksight/_utils.py index 9da3d43ab..957cf9f53 100644 --- a/awswrangler/quicksight/_utils.py +++ b/awswrangler/quicksight/_utils.py @@ -5,8 +5,8 @@ import boto3 # type: ignore -from awswrangler import _data_types, _utils, athena, catalog, exceptions -from awswrangler.quicksight import _get, _list +from awswrangler import _data_types, athena, catalog, exceptions +from awswrangler.quicksight._get_list import list_data_sources _logger: logging.Logger = logging.getLogger(__name__) @@ -25,7 +25,7 @@ def extract_athena_query_columns( sql: str, data_source_arn: str, account_id: str, boto3_session: boto3.Session ) -> List[Dict[str, str]]: """Extract athena columns data types from a SQL query.""" - data_sources: List[Dict[str, Any]] = _list.list_data_sources(account_id=account_id, boto3_session=boto3_session) + data_sources: List[Dict[str, Any]] = list_data_sources(account_id=account_id, boto3_session=boto3_session) data_source: Dict[str, Any] = [x for x in data_sources if x["Arn"] == data_source_arn][0] workgroup: str = data_source["DataSourceParameters"]["AthenaParameters"]["WorkGroup"] sql_wrapped: str = f"/* QuickSight */\nSELECT ds.* FROM ( {sql} ) ds LIMIT 0" @@ -33,48 +33,3 @@ def extract_athena_query_columns( athena.wait_query(query_execution_id=query_id, boto3_session=boto3_session) dtypes: Dict[str, str] = athena.get_query_columns_types(query_execution_id=query_id, boto3_session=boto3_session) return [{"Name": name, "Type": _data_types.athena2quicksight(dtype=dtype)} for name, dtype in dtypes.items()] - - -def list_ingestions( - dataset_name: Optional[str] = None, - dataset_id: Optional[str] = None, - account_id: Optional[str] = None, - boto3_session: Optional[boto3.Session] = None, -) -> List[Dict[str, Any]]: - """List the history of SPICE ingestions for a dataset. - - Parameters - ---------- - dataset_name : str, optional - Dataset name. - dataset_id : str, optional - The ID of the dataset used in the ingestion. - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[Dict[str, Any]] - IAM policy assignments. 
- - Examples - -------- - >>> import awswrangler as wr - >>> ingestions = wr.quicksight.list_ingestions() - """ - if (dataset_name is None) and (dataset_id is None): - raise exceptions.InvalidArgument("You must pass a not None name or dataset_id argument.") - session: boto3.Session = _utils.ensure_session(session=boto3_session) - if account_id is None: - account_id = _utils.get_account_id(boto3_session=session) - if (dataset_id is None) and (dataset_name is not None): - dataset_id = _get.get_dataset_id(name=dataset_name, account_id=account_id, boto3_session=session) - return _list._list( # pylint: disable=protected-access - func_name="list_ingestions", - attr_name="Ingestions", - account_id=account_id, - boto3_session=boto3_session, - DataSetId=dataset_id, - ) diff --git a/docs/source/api.rst b/docs/source/api.rst index ef0f9f5dd..a61526683 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -173,3 +173,13 @@ Amazon QuickSight get_dataset_ids get_template_id get_template_ids + list_dashboards + list_data_sources + list_datasets + list_groups + list_group_memberships + list_iam_policy_assignments + list_iam_policy_assignments_for_user + list_ingestions + list_templates + list_users diff --git a/docs/source/what.rst b/docs/source/what.rst index 0a169b74d..71c721782 100644 --- a/docs/source/what.rst +++ b/docs/source/what.rst @@ -1,7 +1,7 @@ What is AWS Data Wrangler? ========================== -An `open-source `_ Python package that extends the power of `Pandas `_ library to AWS connecting **DataFrames** and AWS data related services (**Amazon Redshift**, **AWS Glue**, **Amazon Athena**, **Amazon EMR**, etc). +An `open-source `_ Python package that extends the power of `Pandas `_ library to AWS connecting **DataFrames** and AWS data related services (**Amazon Redshift**, **AWS Glue**, **Amazon Athena**, **Amazon EMR**, **Amazon QuickSight**, etc). Built on top of other open-source projects like `Pandas `_, `Apache Arrow `_, `Boto3 `_, `s3fs `_, `SQLAlchemy `_, `Psycopg2 `_ and `PyMySQL `_, it offers abstracted functions to execute usual ETL tasks like load/unload data from **Data Lakes**, **Data Warehouses** and **Databases**. From b67cf9fb9c9227580c16f470af3eb56b3e8c90bb Mon Sep 17 00:00:00 2001 From: igorborgest Date: Thu, 11 Jun 2020 11:07:28 -0300 Subject: [PATCH 19/28] Remove duplicated paragraph from the sessions tutorial. --- tutorials/002 - Sessions.ipynb | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/tutorials/002 - Sessions.ipynb b/tutorials/002 - Sessions.ipynb index 2ff88ad1a..b305ed429 100644 --- a/tutorials/002 - Sessions.ipynb +++ b/tutorials/002 - Sessions.ipynb @@ -124,28 +124,6 @@ "\n", "wr.s3.does_object_exist(\"s3://noaa-ghcn-pds/fake\", boto3_session=my_session)" ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "False" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "my_session = boto3.Session(region_name=\"us-east-2\")\n", - "\n", - "wr.s3.does_object_exist(\"s3://noaa-ghcn-pds/fake\", boto3_session=my_session)" - ] } ], "metadata": { From 526a830cca6b0893b924adbe8f95259af48ddc32 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Thu, 11 Jun 2020 12:19:02 -0300 Subject: [PATCH 20/28] Fixing bug on tables catalog tables w/o PartitionKeys. 
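
Glue GetTable/GetTables responses can omit the "PartitionKeys" entry entirely for
unpartitioned tables, so the previous unconditional response["Table"]["PartitionKeys"]
lookups raised KeyError. This change guards every such access before iterating.
A minimal sketch of the pattern, mirroring the change to get_table_types below
(the sample response dict is hypothetical):

    response = {"Table": {"StorageDescriptor": {"Columns": [{"Name": "c0", "Type": "int"}]}}}
    dtypes = {col["Name"]: col["Type"] for col in response["Table"]["StorageDescriptor"]["Columns"]}
    if "PartitionKeys" in response["Table"]:  # key is absent for tables without partitions
        for par in response["Table"]["PartitionKeys"]:
            dtypes[par["Name"]] = par["Type"]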
--- awswrangler/catalog.py | 64 ++++++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 24 deletions(-) diff --git a/awswrangler/catalog.py b/awswrangler/catalog.py index 5ea578d20..aa2f34b85 100644 --- a/awswrangler/catalog.py +++ b/awswrangler/catalog.py @@ -439,8 +439,9 @@ def get_table_types( dtypes: Dict[str, str] = {} for col in response["Table"]["StorageDescriptor"]["Columns"]: dtypes[col["Name"]] = col["Type"] - for par in response["Table"]["PartitionKeys"]: - dtypes[par["Name"]] = par["Type"] + if "PartitionKeys" in response["Table"]: + for par in response["Table"]["PartitionKeys"]: + dtypes[par["Name"]] = par["Type"] return dtypes @@ -527,6 +528,11 @@ def get_tables( ) -> Iterator[Dict[str, Any]]: """Get an iterator of tables. + Note + ---- + Please, does not filter using name_contains and name_prefix/name_suffix at the same time. + Only name_prefix and name_suffix can be combined together. + Parameters ---------- catalog_id : str, optional @@ -560,15 +566,17 @@ def get_tables( if catalog_id is not None: args["CatalogId"] = catalog_id if (name_prefix is not None) and (name_suffix is not None) and (name_contains is not None): - args["Expression"] = f"{name_prefix}.*{name_contains}.*{name_suffix}" + raise exceptions.InvalidArgumentCombination("Please, does not filter using name_contains and " + "name_prefix/name_suffix at the same time. Only " + "name_prefix and name_suffix can be combined together.") elif (name_prefix is not None) and (name_suffix is not None): - args["Expression"] = f"{name_prefix}.*{name_suffix}" + args["Expression"] = f"{name_prefix}*{name_suffix}" elif name_contains is not None: - args["Expression"] = f".*{name_contains}.*" + args["Expression"] = f"*{name_contains}*" elif name_prefix is not None: - args["Expression"] = f"{name_prefix}.*" + args["Expression"] = f"{name_prefix}*" elif name_suffix is not None: - args["Expression"] = f".*{name_suffix}" + args["Expression"] = f"*{name_suffix}" if database is not None: dbs: List[str] = [database] else: @@ -647,15 +655,21 @@ def tables( tbls = tbls[:limit] df_dict: Dict[str, List] = {"Database": [], "Table": [], "Description": [], "Columns": [], "Partitions": []} - for table in tbls: - df_dict["Database"].append(table["DatabaseName"]) - df_dict["Table"].append(table["Name"]) - if "Description" in table: - df_dict["Description"].append(table["Description"]) + for tbl in tbls: + df_dict["Database"].append(tbl["DatabaseName"]) + df_dict["Table"].append(tbl["Name"]) + if "Description" in tbl: + df_dict["Description"].append(tbl["Description"]) else: df_dict["Description"].append("") - df_dict["Columns"].append(", ".join([x["Name"] for x in table["StorageDescriptor"]["Columns"]])) - df_dict["Partitions"].append(", ".join([x["Name"] for x in table["PartitionKeys"]])) + if "Columns" in tbl["StorageDescriptor"]: + df_dict["Columns"].append(", ".join([x["Name"] for x in tbl["StorageDescriptor"]["Columns"]])) + else: + df_dict["Columns"].append("") + if "PartitionKeys" in tbl: + df_dict["Partitions"].append(", ".join([x["Name"] for x in tbl["PartitionKeys"]])) + else: + df_dict["Partitions"].append("") return pd.DataFrame(data=df_dict) @@ -771,14 +785,15 @@ def table( df_dict["Comment"].append(col["Comment"]) else: df_dict["Comment"].append("") - for col in tbl["PartitionKeys"]: - df_dict["Column Name"].append(col["Name"]) - df_dict["Type"].append(col["Type"]) - df_dict["Partition"].append(True) - if "Comment" in col: - df_dict["Comment"].append(col["Comment"]) - else: - df_dict["Comment"].append("") + if 
"PartitionKeys" in tbl: + for col in tbl["PartitionKeys"]: + df_dict["Column Name"].append(col["Name"]) + df_dict["Type"].append(col["Type"]) + df_dict["Partition"].append(True) + if "Comment" in col: + df_dict["Comment"].append(col["Comment"]) + else: + df_dict["Comment"].append("") return pd.DataFrame(data=df_dict) @@ -1692,8 +1707,9 @@ def get_columns_comments( comments: Dict[str, str] = {} for c in response["Table"]["StorageDescriptor"]["Columns"]: comments[c["Name"]] = c["Comment"] - for p in response["Table"]["PartitionKeys"]: - comments[p["Name"]] = p["Comment"] + if "PartitionKeys" in response["Table"]: + for p in response["Table"]["PartitionKeys"]: + comments[p["Name"]] = p["Comment"] return comments From 8953bc2029664f984cd8a19e373ba854570a0d64 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Fri, 12 Jun 2020 08:45:26 -0300 Subject: [PATCH 21/28] Rollback SQLAlchemy version. #281 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 6ee02a1fb..273fff794 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,4 @@ s3fs~=0.4.2 psycopg2-binary~=2.8.0 pymysql~=0.9.0 sqlalchemy-redshift~=0.7.0 -SQLAlchemy~=1.3.10 +SQLAlchemy>=1.3.10,<1.3.14 From a1a0ca351d1b3a912ee7f438a389fec4e60561cc Mon Sep 17 00:00:00 2001 From: igorborgest Date: Wed, 10 Jun 2020 20:02:05 -0300 Subject: [PATCH 22/28] First quicksight codes. :rocket: --- awswrangler/quicksight/_get.py | 385 ++++++++++++++++++++++++++++++++ awswrangler/quicksight/_list.py | 371 ++++++++++++++++++++++++++++++ 2 files changed, 756 insertions(+) create mode 100644 awswrangler/quicksight/_get.py create mode 100644 awswrangler/quicksight/_list.py diff --git a/awswrangler/quicksight/_get.py b/awswrangler/quicksight/_get.py new file mode 100644 index 000000000..c5fc5e681 --- /dev/null +++ b/awswrangler/quicksight/_get.py @@ -0,0 +1,385 @@ +"""Amazon QuickSight Get Module.""" + +import logging +from typing import Callable, List, Optional + +import boto3 # type: ignore + +from awswrangler import exceptions +from awswrangler.quicksight import _list + +_logger: logging.Logger = logging.getLogger(__name__) + + +def _get_ids( + name: str, + func: Callable, + attr_name: str, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> List[str]: + ids: List[str] = [] + for item in func(account_id=account_id, boto3_session=boto3_session): + if item["Name"] == name: + ids.append(item[attr_name]) + return ids + + +def _get_id( + name: str, + func: Callable, + attr_name: str, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> str: + ids: List[str] = _get_ids( + name=name, func=func, attr_name=attr_name, account_id=account_id, boto3_session=boto3_session + ) + if len(ids) == 0: + raise exceptions.InvalidArgument(f"There is no {attr_name} related with name {name}") + if len(ids) > 1: + raise exceptions.InvalidArgument( + f"There is {len(ids)} {attr_name} with name {name}. " + f"Please pass the id argument to specify " + f"which one you would like to describe." + ) + return ids[0] + + +def get_dashboard_ids( + name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[str]: + """Get QuickSight dashboard IDs given a name. + + Note + ---- + This function returns a list of ID because Quicksight accepts duplicated dashboard names, + so you may have more than 1 ID for a given name. + + Parameters + ---------- + name : str + Dashboard name. 
+ account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[str] + Dashboad IDs. + + Examples + -------- + >>> import awswrangler as wr + >>> ids = wr.quicksight.get_dashboard_ids(name="...") + """ + return _get_ids( + name=name, + func=_list.list_dashboards, + attr_name="DashboardId", + account_id=account_id, + boto3_session=boto3_session, + ) + + +def get_dashboard_id(name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> str: + """Get QuickSight dashboard ID given a name and fails if there is more than 1 ID associated with this name. + + Parameters + ---------- + name : str + Dashboard name. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + str + Dashboad ID. + + Examples + -------- + >>> import awswrangler as wr + >>> my_id = wr.quicksight.get_dashboard_id(name="...") + """ + return _get_id( + name=name, + func=_list.list_dashboards, + attr_name="DashboardId", + account_id=account_id, + boto3_session=boto3_session, + ) + + +def get_dataset_ids( + name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[str]: + """Get QuickSight dataset IDs given a name. + + Note + ---- + This function returns a list of ID because Quicksight accepts duplicated datasets names, + so you may have more than 1 ID for a given name. + + Parameters + ---------- + name : str + Dataset name. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[str] + Datasets IDs. + + Examples + -------- + >>> import awswrangler as wr + >>> ids = wr.quicksight.get_dataset_ids(name="...") + """ + return _get_ids( + name=name, func=_list.list_datasets, attr_name="DataSetId", account_id=account_id, boto3_session=boto3_session + ) + + +def get_dataset_id(name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> str: + """Get QuickSight Dataset ID given a name and fails if there is more than 1 ID associated with this name. + + Parameters + ---------- + name : str + Dataset name. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + str + Dataset ID. + + Examples + -------- + >>> import awswrangler as wr + >>> my_id = wr.quicksight.get_dataset_id(name="...") + """ + return _get_id( + name=name, func=_list.list_datasets, attr_name="DataSetId", account_id=account_id, boto3_session=boto3_session + ) + + +def get_data_source_ids( + name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[str]: + """Get QuickSight data source IDs given a name. + + Note + ---- + This function returns a list of ID because Quicksight accepts duplicated data source names, + so you may have more than 1 ID for a given name. + + Parameters + ---------- + name : str + Data source name. 
+ account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[str] + Data source IDs. + + Examples + -------- + >>> import awswrangler as wr + >>> ids = wr.quicksight.get_data_source_ids(name="...") + """ + return _get_ids( + name=name, + func=_list.list_data_sources, + attr_name="DataSourceId", + account_id=account_id, + boto3_session=boto3_session, + ) + + +def get_data_source_id( + name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> str: + """Get QuickSight data source ID given a name and fails if there is more than 1 ID associated with this name. + + Parameters + ---------- + name : str + Data source name. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + str + Dataset ID. + + Examples + -------- + >>> import awswrangler as wr + >>> my_id = wr.quicksight.get_data_source_id(name="...") + """ + return _get_id( + name=name, + func=_list.list_data_sources, + attr_name="DataSourceId", + account_id=account_id, + boto3_session=boto3_session, + ) + + +def get_template_ids( + name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[str]: + """Get QuickSight template IDs given a name. + + Note + ---- + This function returns a list of ID because Quicksight accepts duplicated templates names, + so you may have more than 1 ID for a given name. + + Parameters + ---------- + name : str + Template name. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[str] + Tamplate IDs. + + Examples + -------- + >>> import awswrangler as wr + >>> ids = wr.quicksight.get_template_ids(name="...") + """ + return _get_ids( + name=name, func=_list.list_templates, attr_name="TemplateId", account_id=account_id, boto3_session=boto3_session + ) + + +def get_template_id(name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> str: + """Get QuickSight template ID given a name and fails if there is more than 1 ID associated with this name. + + Parameters + ---------- + name : str + Template name. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + str + Template ID. + + Examples + -------- + >>> import awswrangler as wr + >>> my_id = wr.quicksight.get_template_id(name="...") + """ + return _get_id( + name=name, func=_list.list_templates, attr_name="TemplateId", account_id=account_id, boto3_session=boto3_session + ) + + +def get_data_source_arns( + name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[str]: + """Get QuickSight Data source ARNs given a name. + + Note + ---- + This function returns a list of ARNs because Quicksight accepts duplicated data source names, + so you may have more than 1 ARN for a given name. 
+ + Parameters + ---------- + name : str + Data source name. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[str] + Data source ARNs. + + Examples + -------- + >>> import awswrangler as wr + >>> arns = wr.quicksight.get_data_source_arns(name="...") + """ + arns: List[str] = [] + for source in _list.list_data_sources(account_id=account_id, boto3_session=boto3_session): + if source["Name"] == name: + arns.append(source["Arn"]) + return arns + + +def get_data_source_arn( + name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> str: + """Get QuickSight data source ARN given a name and fails if there is more than 1 ARN associated with this name. + + Note + ---- + This function returns a list of ARNs because Quicksight accepts duplicated data source names, + so you may have more than 1 ARN for a given name. + + Parameters + ---------- + name : str + Data source name. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + str + Data source ARN. + + Examples + -------- + >>> import awswrangler as wr + >>> arn = wr.quicksight.get_data_source_arn("...") + """ + arns: List[str] = get_data_source_arns(name=name, account_id=account_id, boto3_session=boto3_session) + if len(arns) == 0: + raise exceptions.InvalidArgument(f"There is not data source with name {name}") + if len(arns) > 1: + raise exceptions.InvalidArgument( + f"There is more than 1 data source with name {name}. " + f"Please pass the data_source_arn argument to specify " + f"which one you would like to describe." + ) + return arns[0] diff --git a/awswrangler/quicksight/_list.py b/awswrangler/quicksight/_list.py new file mode 100644 index 000000000..88b0c2d7d --- /dev/null +++ b/awswrangler/quicksight/_list.py @@ -0,0 +1,371 @@ +"""Amazon QuickSight List Module.""" + +import logging +from typing import Any, Callable, Dict, List, Optional + +import boto3 # type: ignore + +from awswrangler import _utils + +_logger: logging.Logger = logging.getLogger(__name__) + + +def _list( + func_name: str, + attr_name: str, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, + **kwargs, +) -> List[Dict[str, Any]]: + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if account_id is None: + account_id = _utils.get_account_id(boto3_session=session) + client: boto3.client = _utils.client(service_name="quicksight", session=session) + func: Callable = getattr(client, func_name) + response = func(AwsAccountId=account_id, **kwargs) + next_token: str = response.get("NextToken", None) + result: List[Dict[str, Any]] = response[attr_name] + while next_token is not None: + response = func(AwsAccountId=account_id, NextToken=next_token, **kwargs) + next_token = response.get("NextToken", None) + result += response[attr_name] + return result + + +def list_dashboards( + account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[Dict[str, Any]]: + """List dashboards in an AWS account. + + Parameters + ---------- + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. 
+ boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[Dict[str, Any]] + Dashboards. + + Examples + -------- + >>> import awswrangler as wr + >>> dashboards = wr.quicksight.list_dashboards() + """ + return _list( + func_name="list_dashboards", + attr_name="DashboardSummaryList", + account_id=account_id, + boto3_session=boto3_session, + ) + + +def list_datasets( + account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[Dict[str, Any]]: + """List all QuickSight datasets summaries. + + Parameters + ---------- + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[Dict[str, Any]] + Datasets summaries. + + Examples + -------- + >>> import awswrangler as wr + >>> datasets = wr.quicksight.list_datasets() + """ + return _list( + func_name="list_data_sets", attr_name="DataSetSummaries", account_id=account_id, boto3_session=boto3_session + ) + + +def list_data_sources( + account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[Dict[str, Any]]: + """List all QuickSight Data sources summaries. + + Parameters + ---------- + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[Dict[str, Any]] + Data sources summaries. + + Examples + -------- + >>> import awswrangler as wr + >>> sources = wr.quicksight.list_data_sources() + """ + return _list( + func_name="list_data_sources", attr_name="DataSources", account_id=account_id, boto3_session=boto3_session + ) + + +def list_templates( + account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[Dict[str, Any]]: + """List all QuickSight templates. + + Parameters + ---------- + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[Dict[str, Any]] + Templates summaries. + + Examples + -------- + >>> import awswrangler as wr + >>> templates = wr.quicksight.list_templates() + """ + return _list( + func_name="list_templates", attr_name="TemplateSummaryList", account_id=account_id, boto3_session=boto3_session + ) + + +def list_group_memberships( + group_name: str, + namespace: str = "default", + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> List[Dict[str, Any]]: + """List all QuickSight Group memberships. + + Parameters + ---------- + group_name : str + The name of the group that you want to see a membership list of. + namespace : str + The namespace. Currently, you should set this to default . + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[Dict[str, Any]] + Group memberships. 
+ + Examples + -------- + >>> import awswrangler as wr + >>> memberships = wr.quicksight.list_group_memberships() + """ + return _list( + func_name="list_group_memberships", + attr_name="GroupMemberList", + account_id=account_id, + boto3_session=boto3_session, + GroupName=group_name, + Namespace=namespace, + ) + + +def list_groups( + namespace: str = "default", account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[Dict[str, Any]]: + """List all QuickSight Groups. + + Parameters + ---------- + namespace : str + The namespace. Currently, you should set this to default . + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[Dict[str, Any]] + Groups. + + Examples + -------- + >>> import awswrangler as wr + >>> groups = wr.quicksight.list_groups() + """ + return _list( + func_name="list_groups", + attr_name="GroupList", + account_id=account_id, + boto3_session=boto3_session, + Namespace=namespace, + ) + + +def list_iam_policy_assignments( + status: Optional[str] = None, + namespace: str = "default", + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> List[Dict[str, Any]]: + """List IAM policy assignments in the current Amazon QuickSight account. + + Parameters + ---------- + status : str, optional + The status of the assignments. + 'ENABLED'|'DRAFT'|'DISABLED' + namespace : str + The namespace. Currently, you should set this to default . + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[Dict[str, Any]] + IAM policy assignments. + + Examples + -------- + >>> import awswrangler as wr + >>> assigns = wr.quicksight.list_iam_policy_assignments() + """ + args: Dict[str, Any] = { + "func_name": "list_iam_policy_assignments", + "attr_name": "IAMPolicyAssignments", + "account_id": account_id, + "boto3_session": boto3_session, + "Namespace": namespace, + } + if status is not None: + args["AssignmentStatus"] = status + return _list(**args) + + +def list_iam_policy_assignments_for_user( + user_name: str, + namespace: str = "default", + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> List[Dict[str, Any]]: + """List all the IAM policy assignments. + + Including the Amazon Resource Names (ARNs) for the IAM policies assigned + to the specified user and group or groups that the user belongs to. + + Parameters + ---------- + user_name : str + The name of the user. + namespace : str + The namespace. Currently, you should set this to default . + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[Dict[str, Any]] + IAM policy assignments. 
+ + Examples + -------- + >>> import awswrangler as wr + >>> assigns = wr.quicksight.list_iam_policy_assignments_for_user() + """ + return _list( + func_name="list_iam_policy_assignments_for_user", + attr_name="ActiveAssignments", + account_id=account_id, + boto3_session=boto3_session, + UserName=user_name, + Namespace=namespace, + ) + + +def list_user_groups( + user_name: str, + namespace: str = "default", + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> List[Dict[str, Any]]: + """List the Amazon QuickSight groups that an Amazon QuickSight user is a member of. + + Parameters + ---------- + user_name: str: + The Amazon QuickSight user name that you want to list group memberships for. + namespace : str + The namespace. Currently, you should set this to default . + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[Dict[str, Any]] + Groups. + + Examples + -------- + >>> import awswrangler as wr + >>> groups = wr.quicksight.list_user_groups() + """ + return _list( + func_name="list_user_groups", + attr_name="GroupList", + account_id=account_id, + boto3_session=boto3_session, + UserName=user_name, + Namespace=namespace, + ) + + +def list_users( + namespace: str = "default", account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[Dict[str, Any]]: + """Return a list of all of the Amazon QuickSight users belonging to this account. + + Parameters + ---------- + namespace : str + The namespace. Currently, you should set this to default . + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[Dict[str, Any]] + Groups. + + Examples + -------- + >>> import awswrangler as wr + >>> users = wr.quicksight.list_users() + """ + return _list( + func_name="list_users", + attr_name="UserList", + account_id=account_id, + boto3_session=boto3_session, + Namespace=namespace, + ) From 431bf99cda203b6ce3b6e13ab200a3234f7eb791 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Thu, 11 Jun 2020 10:52:32 -0300 Subject: [PATCH 23/28] Organizing imports in the quicksight module. 
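
All of the `list_*` wrappers above funnel into the private `_list` helper, which pages through the QuickSight API by re-issuing the same call while the response still carries a `NextToken`. A minimal standalone sketch of that pagination pattern — calling boto3 directly, and assuming the account ID can be resolved through STS (function and variable names here are illustrative, not part of the patch) — could look like:

```python
import boto3


def list_quicksight_data_sources(account_id=None):
    """Sketch of the NextToken pagination loop used by the _list helper."""
    if account_id is None:
        # Assumption: the caller's account can be resolved via STS.
        account_id = boto3.client("sts").get_caller_identity()["Account"]
    client = boto3.client("quicksight")
    response = client.list_data_sources(AwsAccountId=account_id)
    summaries = list(response["DataSources"])
    # Keep calling the API while the service returns a NextToken marker.
    while "NextToken" in response:
        response = client.list_data_sources(AwsAccountId=account_id, NextToken=response["NextToken"])
        summaries += response["DataSources"]
    return summaries
```

The helper in the patch generalizes this loop by taking the client method name (`func_name`) and the response key (`attr_name`) as parameters, so every `list_*` wrapper reuses the same pagination code.
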
--- awswrangler/quicksight/_get.py | 385 -------------------------------- awswrangler/quicksight/_list.py | 371 ------------------------------ 2 files changed, 756 deletions(-) delete mode 100644 awswrangler/quicksight/_get.py delete mode 100644 awswrangler/quicksight/_list.py diff --git a/awswrangler/quicksight/_get.py b/awswrangler/quicksight/_get.py deleted file mode 100644 index c5fc5e681..000000000 --- a/awswrangler/quicksight/_get.py +++ /dev/null @@ -1,385 +0,0 @@ -"""Amazon QuickSight Get Module.""" - -import logging -from typing import Callable, List, Optional - -import boto3 # type: ignore - -from awswrangler import exceptions -from awswrangler.quicksight import _list - -_logger: logging.Logger = logging.getLogger(__name__) - - -def _get_ids( - name: str, - func: Callable, - attr_name: str, - account_id: Optional[str] = None, - boto3_session: Optional[boto3.Session] = None, -) -> List[str]: - ids: List[str] = [] - for item in func(account_id=account_id, boto3_session=boto3_session): - if item["Name"] == name: - ids.append(item[attr_name]) - return ids - - -def _get_id( - name: str, - func: Callable, - attr_name: str, - account_id: Optional[str] = None, - boto3_session: Optional[boto3.Session] = None, -) -> str: - ids: List[str] = _get_ids( - name=name, func=func, attr_name=attr_name, account_id=account_id, boto3_session=boto3_session - ) - if len(ids) == 0: - raise exceptions.InvalidArgument(f"There is no {attr_name} related with name {name}") - if len(ids) > 1: - raise exceptions.InvalidArgument( - f"There is {len(ids)} {attr_name} with name {name}. " - f"Please pass the id argument to specify " - f"which one you would like to describe." - ) - return ids[0] - - -def get_dashboard_ids( - name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None -) -> List[str]: - """Get QuickSight dashboard IDs given a name. - - Note - ---- - This function returns a list of ID because Quicksight accepts duplicated dashboard names, - so you may have more than 1 ID for a given name. - - Parameters - ---------- - name : str - Dashboard name. - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[str] - Dashboad IDs. - - Examples - -------- - >>> import awswrangler as wr - >>> ids = wr.quicksight.get_dashboard_ids(name="...") - """ - return _get_ids( - name=name, - func=_list.list_dashboards, - attr_name="DashboardId", - account_id=account_id, - boto3_session=boto3_session, - ) - - -def get_dashboard_id(name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> str: - """Get QuickSight dashboard ID given a name and fails if there is more than 1 ID associated with this name. - - Parameters - ---------- - name : str - Dashboard name. - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - str - Dashboad ID. 
- - Examples - -------- - >>> import awswrangler as wr - >>> my_id = wr.quicksight.get_dashboard_id(name="...") - """ - return _get_id( - name=name, - func=_list.list_dashboards, - attr_name="DashboardId", - account_id=account_id, - boto3_session=boto3_session, - ) - - -def get_dataset_ids( - name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None -) -> List[str]: - """Get QuickSight dataset IDs given a name. - - Note - ---- - This function returns a list of ID because Quicksight accepts duplicated datasets names, - so you may have more than 1 ID for a given name. - - Parameters - ---------- - name : str - Dataset name. - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[str] - Datasets IDs. - - Examples - -------- - >>> import awswrangler as wr - >>> ids = wr.quicksight.get_dataset_ids(name="...") - """ - return _get_ids( - name=name, func=_list.list_datasets, attr_name="DataSetId", account_id=account_id, boto3_session=boto3_session - ) - - -def get_dataset_id(name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> str: - """Get QuickSight Dataset ID given a name and fails if there is more than 1 ID associated with this name. - - Parameters - ---------- - name : str - Dataset name. - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - str - Dataset ID. - - Examples - -------- - >>> import awswrangler as wr - >>> my_id = wr.quicksight.get_dataset_id(name="...") - """ - return _get_id( - name=name, func=_list.list_datasets, attr_name="DataSetId", account_id=account_id, boto3_session=boto3_session - ) - - -def get_data_source_ids( - name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None -) -> List[str]: - """Get QuickSight data source IDs given a name. - - Note - ---- - This function returns a list of ID because Quicksight accepts duplicated data source names, - so you may have more than 1 ID for a given name. - - Parameters - ---------- - name : str - Data source name. - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[str] - Data source IDs. - - Examples - -------- - >>> import awswrangler as wr - >>> ids = wr.quicksight.get_data_source_ids(name="...") - """ - return _get_ids( - name=name, - func=_list.list_data_sources, - attr_name="DataSourceId", - account_id=account_id, - boto3_session=boto3_session, - ) - - -def get_data_source_id( - name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None -) -> str: - """Get QuickSight data source ID given a name and fails if there is more than 1 ID associated with this name. - - Parameters - ---------- - name : str - Data source name. - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - str - Dataset ID. 
- - Examples - -------- - >>> import awswrangler as wr - >>> my_id = wr.quicksight.get_data_source_id(name="...") - """ - return _get_id( - name=name, - func=_list.list_data_sources, - attr_name="DataSourceId", - account_id=account_id, - boto3_session=boto3_session, - ) - - -def get_template_ids( - name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None -) -> List[str]: - """Get QuickSight template IDs given a name. - - Note - ---- - This function returns a list of ID because Quicksight accepts duplicated templates names, - so you may have more than 1 ID for a given name. - - Parameters - ---------- - name : str - Template name. - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[str] - Tamplate IDs. - - Examples - -------- - >>> import awswrangler as wr - >>> ids = wr.quicksight.get_template_ids(name="...") - """ - return _get_ids( - name=name, func=_list.list_templates, attr_name="TemplateId", account_id=account_id, boto3_session=boto3_session - ) - - -def get_template_id(name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> str: - """Get QuickSight template ID given a name and fails if there is more than 1 ID associated with this name. - - Parameters - ---------- - name : str - Template name. - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - str - Template ID. - - Examples - -------- - >>> import awswrangler as wr - >>> my_id = wr.quicksight.get_template_id(name="...") - """ - return _get_id( - name=name, func=_list.list_templates, attr_name="TemplateId", account_id=account_id, boto3_session=boto3_session - ) - - -def get_data_source_arns( - name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None -) -> List[str]: - """Get QuickSight Data source ARNs given a name. - - Note - ---- - This function returns a list of ARNs because Quicksight accepts duplicated data source names, - so you may have more than 1 ARN for a given name. - - Parameters - ---------- - name : str - Data source name. - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[str] - Data source ARNs. - - Examples - -------- - >>> import awswrangler as wr - >>> arns = wr.quicksight.get_data_source_arns(name="...") - """ - arns: List[str] = [] - for source in _list.list_data_sources(account_id=account_id, boto3_session=boto3_session): - if source["Name"] == name: - arns.append(source["Arn"]) - return arns - - -def get_data_source_arn( - name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None -) -> str: - """Get QuickSight data source ARN given a name and fails if there is more than 1 ARN associated with this name. - - Note - ---- - This function returns a list of ARNs because Quicksight accepts duplicated data source names, - so you may have more than 1 ARN for a given name. - - Parameters - ---------- - name : str - Data source name. 
- account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - str - Data source ARN. - - Examples - -------- - >>> import awswrangler as wr - >>> arn = wr.quicksight.get_data_source_arn("...") - """ - arns: List[str] = get_data_source_arns(name=name, account_id=account_id, boto3_session=boto3_session) - if len(arns) == 0: - raise exceptions.InvalidArgument(f"There is not data source with name {name}") - if len(arns) > 1: - raise exceptions.InvalidArgument( - f"There is more than 1 data source with name {name}. " - f"Please pass the data_source_arn argument to specify " - f"which one you would like to describe." - ) - return arns[0] diff --git a/awswrangler/quicksight/_list.py b/awswrangler/quicksight/_list.py deleted file mode 100644 index 88b0c2d7d..000000000 --- a/awswrangler/quicksight/_list.py +++ /dev/null @@ -1,371 +0,0 @@ -"""Amazon QuickSight List Module.""" - -import logging -from typing import Any, Callable, Dict, List, Optional - -import boto3 # type: ignore - -from awswrangler import _utils - -_logger: logging.Logger = logging.getLogger(__name__) - - -def _list( - func_name: str, - attr_name: str, - account_id: Optional[str] = None, - boto3_session: Optional[boto3.Session] = None, - **kwargs, -) -> List[Dict[str, Any]]: - session: boto3.Session = _utils.ensure_session(session=boto3_session) - if account_id is None: - account_id = _utils.get_account_id(boto3_session=session) - client: boto3.client = _utils.client(service_name="quicksight", session=session) - func: Callable = getattr(client, func_name) - response = func(AwsAccountId=account_id, **kwargs) - next_token: str = response.get("NextToken", None) - result: List[Dict[str, Any]] = response[attr_name] - while next_token is not None: - response = func(AwsAccountId=account_id, NextToken=next_token, **kwargs) - next_token = response.get("NextToken", None) - result += response[attr_name] - return result - - -def list_dashboards( - account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None -) -> List[Dict[str, Any]]: - """List dashboards in an AWS account. - - Parameters - ---------- - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[Dict[str, Any]] - Dashboards. - - Examples - -------- - >>> import awswrangler as wr - >>> dashboards = wr.quicksight.list_dashboards() - """ - return _list( - func_name="list_dashboards", - attr_name="DashboardSummaryList", - account_id=account_id, - boto3_session=boto3_session, - ) - - -def list_datasets( - account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None -) -> List[Dict[str, Any]]: - """List all QuickSight datasets summaries. - - Parameters - ---------- - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[Dict[str, Any]] - Datasets summaries. 
- - Examples - -------- - >>> import awswrangler as wr - >>> datasets = wr.quicksight.list_datasets() - """ - return _list( - func_name="list_data_sets", attr_name="DataSetSummaries", account_id=account_id, boto3_session=boto3_session - ) - - -def list_data_sources( - account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None -) -> List[Dict[str, Any]]: - """List all QuickSight Data sources summaries. - - Parameters - ---------- - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[Dict[str, Any]] - Data sources summaries. - - Examples - -------- - >>> import awswrangler as wr - >>> sources = wr.quicksight.list_data_sources() - """ - return _list( - func_name="list_data_sources", attr_name="DataSources", account_id=account_id, boto3_session=boto3_session - ) - - -def list_templates( - account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None -) -> List[Dict[str, Any]]: - """List all QuickSight templates. - - Parameters - ---------- - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[Dict[str, Any]] - Templates summaries. - - Examples - -------- - >>> import awswrangler as wr - >>> templates = wr.quicksight.list_templates() - """ - return _list( - func_name="list_templates", attr_name="TemplateSummaryList", account_id=account_id, boto3_session=boto3_session - ) - - -def list_group_memberships( - group_name: str, - namespace: str = "default", - account_id: Optional[str] = None, - boto3_session: Optional[boto3.Session] = None, -) -> List[Dict[str, Any]]: - """List all QuickSight Group memberships. - - Parameters - ---------- - group_name : str - The name of the group that you want to see a membership list of. - namespace : str - The namespace. Currently, you should set this to default . - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[Dict[str, Any]] - Group memberships. - - Examples - -------- - >>> import awswrangler as wr - >>> memberships = wr.quicksight.list_group_memberships() - """ - return _list( - func_name="list_group_memberships", - attr_name="GroupMemberList", - account_id=account_id, - boto3_session=boto3_session, - GroupName=group_name, - Namespace=namespace, - ) - - -def list_groups( - namespace: str = "default", account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None -) -> List[Dict[str, Any]]: - """List all QuickSight Groups. - - Parameters - ---------- - namespace : str - The namespace. Currently, you should set this to default . - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[Dict[str, Any]] - Groups. 
- - Examples - -------- - >>> import awswrangler as wr - >>> groups = wr.quicksight.list_groups() - """ - return _list( - func_name="list_groups", - attr_name="GroupList", - account_id=account_id, - boto3_session=boto3_session, - Namespace=namespace, - ) - - -def list_iam_policy_assignments( - status: Optional[str] = None, - namespace: str = "default", - account_id: Optional[str] = None, - boto3_session: Optional[boto3.Session] = None, -) -> List[Dict[str, Any]]: - """List IAM policy assignments in the current Amazon QuickSight account. - - Parameters - ---------- - status : str, optional - The status of the assignments. - 'ENABLED'|'DRAFT'|'DISABLED' - namespace : str - The namespace. Currently, you should set this to default . - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[Dict[str, Any]] - IAM policy assignments. - - Examples - -------- - >>> import awswrangler as wr - >>> assigns = wr.quicksight.list_iam_policy_assignments() - """ - args: Dict[str, Any] = { - "func_name": "list_iam_policy_assignments", - "attr_name": "IAMPolicyAssignments", - "account_id": account_id, - "boto3_session": boto3_session, - "Namespace": namespace, - } - if status is not None: - args["AssignmentStatus"] = status - return _list(**args) - - -def list_iam_policy_assignments_for_user( - user_name: str, - namespace: str = "default", - account_id: Optional[str] = None, - boto3_session: Optional[boto3.Session] = None, -) -> List[Dict[str, Any]]: - """List all the IAM policy assignments. - - Including the Amazon Resource Names (ARNs) for the IAM policies assigned - to the specified user and group or groups that the user belongs to. - - Parameters - ---------- - user_name : str - The name of the user. - namespace : str - The namespace. Currently, you should set this to default . - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[Dict[str, Any]] - IAM policy assignments. - - Examples - -------- - >>> import awswrangler as wr - >>> assigns = wr.quicksight.list_iam_policy_assignments_for_user() - """ - return _list( - func_name="list_iam_policy_assignments_for_user", - attr_name="ActiveAssignments", - account_id=account_id, - boto3_session=boto3_session, - UserName=user_name, - Namespace=namespace, - ) - - -def list_user_groups( - user_name: str, - namespace: str = "default", - account_id: Optional[str] = None, - boto3_session: Optional[boto3.Session] = None, -) -> List[Dict[str, Any]]: - """List the Amazon QuickSight groups that an Amazon QuickSight user is a member of. - - Parameters - ---------- - user_name: str: - The Amazon QuickSight user name that you want to list group memberships for. - namespace : str - The namespace. Currently, you should set this to default . - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[Dict[str, Any]] - Groups. 
- - Examples - -------- - >>> import awswrangler as wr - >>> groups = wr.quicksight.list_user_groups() - """ - return _list( - func_name="list_user_groups", - attr_name="GroupList", - account_id=account_id, - boto3_session=boto3_session, - UserName=user_name, - Namespace=namespace, - ) - - -def list_users( - namespace: str = "default", account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None -) -> List[Dict[str, Any]]: - """Return a list of all of the Amazon QuickSight users belonging to this account. - - Parameters - ---------- - namespace : str - The namespace. Currently, you should set this to default . - account_id : str, optional - If None, the account ID will be inferred from your boto3 session. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[Dict[str, Any]] - Groups. - - Examples - -------- - >>> import awswrangler as wr - >>> users = wr.quicksight.list_users() - """ - return _list( - func_name="list_users", - attr_name="UserList", - account_id=account_id, - boto3_session=boto3_session, - Namespace=namespace, - ) From 9d970c87cf5e3c37c711b752954934431b0af411 Mon Sep 17 00:00:00 2001 From: Ying Wang Date: Thu, 11 Jun 2020 21:39:57 -0400 Subject: [PATCH 24/28] Fixed a bug of user name. --- awswrangler/quicksight/_create.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/awswrangler/quicksight/_create.py b/awswrangler/quicksight/_create.py index de41f0ecf..2dba7e118 100644 --- a/awswrangler/quicksight/_create.py +++ b/awswrangler/quicksight/_create.py @@ -53,8 +53,7 @@ def _generate_principal(user_name: str, account_id: str, region: str) -> str: - user_name = user_name if "/" in user_name else f"default/{user_name}" - return f"arn:aws:quicksight:{region}:{account_id}:user/{user_name}" + return f"arn:aws:quicksight:{region}:{account_id}:user/default/{user_name}" def _generate_permissions( From 7f84c9e3e55e8bfdf53326dc94aea1e645074e48 Mon Sep 17 00:00:00 2001 From: Ying Wang Date: Thu, 11 Jun 2020 21:40:27 -0400 Subject: [PATCH 25/28] QuickSight general clean up. 
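
The `_generate_principal` helper touched above is what turns a QuickSight user name into the principal ARN used when granting resource permissions; the ARN embeds a namespace segment ahead of the user name. A hedged sketch of the namespace-aware variant that appears in this series, with purely illustrative account and region values, is:

```python
def generate_quicksight_principal(user_name: str, account_id: str, region: str) -> str:
    """Sketch: build a QuickSight user principal ARN, defaulting the namespace to 'default'."""
    qualified = user_name if "/" in user_name else f"default/{user_name}"
    return f"arn:aws:quicksight:{region}:{account_id}:user/{qualified}"


# Illustrative values only (not taken from the patch):
print(generate_quicksight_principal("dev", "111111111111", "us-east-1"))
# arn:aws:quicksight:us-east-1:111111111111:user/default/dev
```

A name that is already namespace-qualified (e.g. `default/dev`) is left untouched, while a bare user name gets the `default` namespace prepended.
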
--- README.md | 26 +- awswrangler/quicksight/_create.py | 11 +- docs/source/index.rst | 23 +- tutorials/018 - QuickSight.ipynb | 1298 +++++++++++++++++++++++++++++ 4 files changed, 1350 insertions(+), 8 deletions(-) create mode 100644 tutorials/018 - QuickSight.ipynb diff --git a/README.md b/README.md index cabd20ea3..cb27ae253 100644 --- a/README.md +++ b/README.md @@ -44,10 +44,30 @@ df = wr.s3.read_parquet("s3://bucket/dataset/", dataset=True) df = wr.athena.read_sql_query("SELECT * FROM my_table", database="my_db") # Getting Redshift connection (SQLAlchemy) from Glue Catalog Connections -engine = wr.catalog.get_engine("my-redshift-connection") - # Retrieving the data from Amazon Redshift Spectrum +engine = wr.catalog.get_engine("my-redshift-connection") df = wr.db.read_sql_query("SELECT * FROM external_schema.my_table", con=engine) + +# Creating QuickSight Data Source and Dataset to reflect our new table +wr.quicksight.create_athena_data_source("athena-source", allowed_to_manage=["username"]) +wr.quicksight.create_athena_dataset( + name="my-dataset", + database="my_db", + table="my_table", + data_source_name="athena-source", + allowed_to_manage=["username"] +) + +# Getting MySQL connection (SQLAlchemy) from Glue Catalog Connections +# Load the data into MySQL +engine = wr.catalog.get_engine("my-mysql-connection") +wr.db.to_sql(df, engine, schema="test", name="my_table") + +# Getting PostgreSQL connection (SQLAlchemy) from Glue Catalog Connections +# Load the data into PostgreSQL +engine = wr.catalog.get_engine("my-postgresql-connection") +wr.db.to_sql(df, engine, schema="test", name="my_table") + ``` ## [Read The Docs](https://aws-data-wrangler.readthedocs.io/) @@ -80,6 +100,7 @@ df = wr.db.read_sql_query("SELECT * FROM external_schema.my_table", con=engine) - [015 - EMR](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/015%20-%20EMR.ipynb) - [016 - EMR & Docker](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/016%20-%20EMR%20%26%20Docker.ipynb) - [017 - Partition Projection](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/017%20-%20Partition%20Projection.ipynb) + - [018 - QuickSight](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/018%20-%20QuickSight.ipynb) - [**API Reference**](https://aws-data-wrangler.readthedocs.io/en/latest/api.html) - [Amazon S3](https://aws-data-wrangler.readthedocs.io/en/latest/api.html#amazon-s3) - [AWS Glue Catalog](https://aws-data-wrangler.readthedocs.io/en/latest/api.html#aws-glue-catalog) @@ -87,6 +108,7 @@ df = wr.db.read_sql_query("SELECT * FROM external_schema.my_table", con=engine) - [Databases (Redshift, PostgreSQL, MySQL)](https://aws-data-wrangler.readthedocs.io/en/latest/api.html#databases-redshift-postgresql-mysql) - [EMR Cluster](https://aws-data-wrangler.readthedocs.io/en/latest/api.html#emr-cluster) - [CloudWatch Logs](https://aws-data-wrangler.readthedocs.io/en/latest/api.html#cloudwatch-logs) + - [QuickSight](https://aws-data-wrangler.readthedocs.io/en/latest/api.html#quicksight) - [**License**](https://github.com/awslabs/aws-data-wrangler/blob/master/LICENSE) - [**Contributing**](https://github.com/awslabs/aws-data-wrangler/blob/master/CONTRIBUTING.md) - [**Legacy Docs** (pre-1.0.0)](https://aws-data-wrangler.readthedocs.io/en/legacy/) diff --git a/awswrangler/quicksight/_create.py b/awswrangler/quicksight/_create.py index 2dba7e118..d659e2ce1 100644 --- a/awswrangler/quicksight/_create.py +++ b/awswrangler/quicksight/_create.py @@ -53,7 +53,8 @@ def 
_generate_principal(user_name: str, account_id: str, region: str) -> str: - return f"arn:aws:quicksight:{region}:{account_id}:user/default/{user_name}" + user_name = user_name if "/" in user_name else f"default/{user_name}" + return f"arn:aws:quicksight:{region}:{account_id}:user/{user_name}" def _generate_permissions( @@ -273,6 +274,8 @@ def create_athena_dataset( raise exceptions.InvalidArgument("You must pass a not None data_source_name or data_source_arn argument.") if ((database is None) and (table is None)) and (sql is None): raise exceptions.InvalidArgument("You must pass database/table OR sql argument.") + if (database is not None) and (sql is not None): + raise exceptions.InvalidArgument("If you provide sql argument, please include the database name inside the sql statement. Do NOT pass in with database argument.") session: boto3.Session = _utils.ensure_session(session=boto3_session) client: boto3.client = _utils.client(service_name="quicksight", session=session) if account_id is None: @@ -363,8 +366,8 @@ def create_ingestion( Returns ------- - Current status - 'INITIALIZED'|'QUEUED'|'RUNNING'|'FAILED'|'COMPLETED'|'CANCELLED' + str + Ingestion ID Examples -------- @@ -384,4 +387,4 @@ def create_ingestion( response: Dict[str, Any] = client.create_ingestion( DataSetId=dataset_id, IngestionId=ingestion_id, AwsAccountId=account_id ) - return response["IngestionStatus"] + return response["IngestionId"] diff --git a/docs/source/index.rst b/docs/source/index.rst index 6c0380007..2335c0209 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -26,11 +26,30 @@ Quick Start df = wr.athena.read_sql_query("SELECT * FROM my_table", database="my_db") # Getting Redshift connection (SQLAlchemy) from Glue Catalog Connections - engine = wr.catalog.get_engine("my-redshift-connection") - # Retrieving the data from Amazon Redshift Spectrum + engine = wr.catalog.get_engine("my-redshift-connection") df = wr.db.read_sql_query("SELECT * FROM external_schema.my_table", con=engine) + # Creating QuickSight Data Source and Dataset to reflect our new table + wr.quicksight.create_athena_data_source("athena-source", allowed_to_manage=["username"]) + wr.quicksight.create_athena_dataset( + name="my-dataset", + database="my_db", + table="my_table", + data_source_name="athena-source", + allowed_to_manage=["username"] + ) + + # Getting MySQL connection (SQLAlchemy) from Glue Catalog Connections + # Load the data into MySQL + engine = wr.catalog.get_engine("my-mysql-connection") + wr.db.to_sql(df, engine, schema="test", name="my_table") + + # Getting PostgreSQL connection (SQLAlchemy) from Glue Catalog Connections + # Load the data into PostgreSQL + engine = wr.catalog.get_engine("my-postgresql-connection") + wr.db.to_sql(df, engine, schema="test", name="my_table") + Read The Docs ------------- diff --git a/tutorials/018 - QuickSight.ipynb b/tutorials/018 - QuickSight.ipynb new file mode 100644 index 000000000..a90fe6573 --- /dev/null +++ b/tutorials/018 - QuickSight.ipynb @@ -0,0 +1,1298 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![AWS Data Wrangler](_static/logo.png \"AWS Data Wrangler\")](https://github.com/awslabs/aws-data-wrangler)\n", + "\n", + "# 18 - QuickSight\n", + "\n", + "For this tutorial we will use the public AWS COVID-19 data lake.\n", + "\n", + "References:\n", + "\n", + "* [A public data lake for analysis of COVID-19 data](https://aws.amazon.com/blogs/big-data/a-public-data-lake-for-analysis-of-covid-19-data/)\n", + "* [Exploring the public AWS 
COVID-19 data lake](https://aws.amazon.com/blogs/big-data/exploring-the-public-aws-covid-19-data-lake/)\n", + "* [CloudFormation template](https://covid19-lake.s3.us-east-2.amazonaws.com/cfn/CovidLakeStack.template.json)\n", + "\n", + "*Please, install the Cloudformation template above to have access to the public data lake.*\n", + "\n", + "*P.S. To be able to access the public data lake, you must allow explicitly QuickSight to access the related external bucket.*" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "pycharm": { + "is_executing": false + } + }, + "outputs": [], + "source": [ + "import awswrangler as wr\n", + "from time import sleep" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "List users of QuickSight account" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'username': 'dev', 'role': 'ADMIN'}]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[{\"username\": user[\"UserName\"], \"role\": user[\"Role\"]} for user in wr.quicksight.list_users('default')]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DatabaseDescription
0aws_data_wranglerAWS Data Wrangler Test Arena - Glue Database
1awswrangler_test
2covid-19
3defaultDefault Hive database
\n", + "
" + ], + "text/plain": [ + " Database Description\n", + "0 aws_data_wrangler AWS Data Wrangler Test Arena - Glue Database\n", + "1 awswrangler_test \n", + "2 covid-19 \n", + "3 default Default Hive database" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wr.catalog.databases()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DatabaseTableDescriptionColumnsPartitions
0covid-19alleninstitute_comprehend_medicalComprehend Medical results run against Allen I...paper_id, date, dx_name, test_name, procedure_...
1covid-19alleninstitute_metadataMetadata on papers pulled from the Allen Insti...cord_uid, sha, source_x, title, doi, pmcid, pu...
2covid-19country_codesLookup table for country codescountry, alpha-2 code, alpha-3 code, numeric c...
3covid-19county_populationsLookup table for population for each county ba...id, id2, county, state, population estimate 2018
4covid-19covid_knowledge_graph_edgesAWS Knowledge Graph for COVID-19 dataid, label, from, to, score
5covid-19covid_knowledge_graph_nodes_authorAWS Knowledge Graph for COVID-19 dataid, label, first, last, full_name
6covid-19covid_knowledge_graph_nodes_conceptAWS Knowledge Graph for COVID-19 dataid, label, entity, concept
7covid-19covid_knowledge_graph_nodes_institutionAWS Knowledge Graph for COVID-19 dataid, label, institution, country, settlement
8covid-19covid_knowledge_graph_nodes_paperAWS Knowledge Graph for COVID-19 dataid, label, doi, sha_code, publish_time, source...
9covid-19covid_knowledge_graph_nodes_topicAWS Knowledge Graph for COVID-19 dataid, label, topic, topic_num
10covid-19covid_testing_states_dailyUSA total test daily trend by state. Sourced ...date, state, positive, negative, pending, hosp...
11covid-19covid_testing_us_dailyUSA total test daily trend. Sourced from covi...date, states, positive, negative, posneg, pend...
12covid-19covid_testing_us_totalUSA total tests. Sourced from covidtracking.c...positive, negative, posneg, hospitalized, deat...
13covid-19covidcast_dataCMU Delphi's COVID-19 Surveillance Datadata_source, signal, geo_type, time_value, geo...
14covid-19covidcast_metadataCMU Delphi's COVID-19 Surveillance Metadatadata_source, signal, time_type, geo_type, min_...
15covid-19enigma_jhuJohns Hopkins University Consolidated data on ...fips, admin2, province_state, country_region, ...
16covid-19enigma_jhu_timeseriesJohns Hopkins University data on COVID-19 case...uid, fips, iso2, iso3, code3, admin2, latitude...
17covid-19hospital_bedsData on hospital beds and their utilization in...objectid, hospital_name, hospital_type, hq_add...
18covid-19nytimes_countiesData on COVID-19 cases from NY Times at US cou...date, county, state, fips, cases, deaths
19covid-19nytimes_statesData on COVID-19 cases from NY Times at US sta...date, state, fips, cases, deaths
20covid-19prediction_models_county_predictionsCounty-level Predictions Data. Sourced from Yu...countyfips, countyname, statename, severity_co...
21covid-19prediction_models_severity_indexSeverity Index models. Sourced from Yu Group a...severity_1-day, severity_2-day, severity_3-day...
22covid-19tableau_covid_datahubCOVID-19 data that has been gathered and unifi...country_short_name, country_alpha_3_code, coun...
23covid-19tableau_jhuJohns Hopkins University data on COVID-19 case...case_type, cases, difference, date, country_re...
24covid-19us_state_abbreviationsLookup table for US state abbreviationsstate, abbreviation
25covid-19world_cases_deaths_testingData on confirmed cases, deaths, and testing. ...iso_code, location, date, total_cases, new_cas...
\n", + "
" + ], + "text/plain": [ + " Database Table \\\n", + "0 covid-19 alleninstitute_comprehend_medical \n", + "1 covid-19 alleninstitute_metadata \n", + "2 covid-19 country_codes \n", + "3 covid-19 county_populations \n", + "4 covid-19 covid_knowledge_graph_edges \n", + "5 covid-19 covid_knowledge_graph_nodes_author \n", + "6 covid-19 covid_knowledge_graph_nodes_concept \n", + "7 covid-19 covid_knowledge_graph_nodes_institution \n", + "8 covid-19 covid_knowledge_graph_nodes_paper \n", + "9 covid-19 covid_knowledge_graph_nodes_topic \n", + "10 covid-19 covid_testing_states_daily \n", + "11 covid-19 covid_testing_us_daily \n", + "12 covid-19 covid_testing_us_total \n", + "13 covid-19 covidcast_data \n", + "14 covid-19 covidcast_metadata \n", + "15 covid-19 enigma_jhu \n", + "16 covid-19 enigma_jhu_timeseries \n", + "17 covid-19 hospital_beds \n", + "18 covid-19 nytimes_counties \n", + "19 covid-19 nytimes_states \n", + "20 covid-19 prediction_models_county_predictions \n", + "21 covid-19 prediction_models_severity_index \n", + "22 covid-19 tableau_covid_datahub \n", + "23 covid-19 tableau_jhu \n", + "24 covid-19 us_state_abbreviations \n", + "25 covid-19 world_cases_deaths_testing \n", + "\n", + " Description \\\n", + "0 Comprehend Medical results run against Allen I... \n", + "1 Metadata on papers pulled from the Allen Insti... \n", + "2 Lookup table for country codes \n", + "3 Lookup table for population for each county ba... \n", + "4 AWS Knowledge Graph for COVID-19 data \n", + "5 AWS Knowledge Graph for COVID-19 data \n", + "6 AWS Knowledge Graph for COVID-19 data \n", + "7 AWS Knowledge Graph for COVID-19 data \n", + "8 AWS Knowledge Graph for COVID-19 data \n", + "9 AWS Knowledge Graph for COVID-19 data \n", + "10 USA total test daily trend by state. Sourced ... \n", + "11 USA total test daily trend. Sourced from covi... \n", + "12 USA total tests. Sourced from covidtracking.c... \n", + "13 CMU Delphi's COVID-19 Surveillance Data \n", + "14 CMU Delphi's COVID-19 Surveillance Metadata \n", + "15 Johns Hopkins University Consolidated data on ... \n", + "16 Johns Hopkins University data on COVID-19 case... \n", + "17 Data on hospital beds and their utilization in... \n", + "18 Data on COVID-19 cases from NY Times at US cou... \n", + "19 Data on COVID-19 cases from NY Times at US sta... \n", + "20 County-level Predictions Data. Sourced from Yu... \n", + "21 Severity Index models. Sourced from Yu Group a... \n", + "22 COVID-19 data that has been gathered and unifi... \n", + "23 Johns Hopkins University data on COVID-19 case... \n", + "24 Lookup table for US state abbreviations \n", + "25 Data on confirmed cases, deaths, and testing. ... \n", + "\n", + " Columns Partitions \n", + "0 paper_id, date, dx_name, test_name, procedure_... \n", + "1 cord_uid, sha, source_x, title, doi, pmcid, pu... \n", + "2 country, alpha-2 code, alpha-3 code, numeric c... \n", + "3 id, id2, county, state, population estimate 2018 \n", + "4 id, label, from, to, score \n", + "5 id, label, first, last, full_name \n", + "6 id, label, entity, concept \n", + "7 id, label, institution, country, settlement \n", + "8 id, label, doi, sha_code, publish_time, source... \n", + "9 id, label, topic, topic_num \n", + "10 date, state, positive, negative, pending, hosp... \n", + "11 date, states, positive, negative, posneg, pend... \n", + "12 positive, negative, posneg, hospitalized, deat... \n", + "13 data_source, signal, geo_type, time_value, geo... \n", + "14 data_source, signal, time_type, geo_type, min_... 
\n", + "15 fips, admin2, province_state, country_region, ... \n", + "16 uid, fips, iso2, iso3, code3, admin2, latitude... \n", + "17 objectid, hospital_name, hospital_type, hq_add... \n", + "18 date, county, state, fips, cases, deaths \n", + "19 date, state, fips, cases, deaths \n", + "20 countyfips, countyname, statename, severity_co... \n", + "21 severity_1-day, severity_2-day, severity_3-day... \n", + "22 country_short_name, country_alpha_3_code, coun... \n", + "23 case_type, cases, difference, date, country_re... \n", + "24 state, abbreviation \n", + "25 iso_code, location, date, total_cases, new_cas... " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wr.catalog.tables(database=\"covid-19\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create data source of QuickSight\n", + "Note: data source stores the connection information." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "wr.quicksight.create_athena_data_source(\n", + " name=\"covid-19\",\n", + " workgroup=\"primary\",\n", + " allowed_to_manage=[\"dev\"]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DatabaseTableDescriptionColumnsPartitions
0covid-19nytimes_countiesData on COVID-19 cases from NY Times at US cou...date, county, state, fips, cases, deaths
1covid-19nytimes_statesData on COVID-19 cases from NY Times at US sta...date, state, fips, cases, deaths
\n", + "
" + ], + "text/plain": [ + " Database Table \\\n", + "0 covid-19 nytimes_counties \n", + "1 covid-19 nytimes_states \n", + "\n", + " Description \\\n", + "0 Data on COVID-19 cases from NY Times at US cou... \n", + "1 Data on COVID-19 cases from NY Times at US sta... \n", + "\n", + " Columns Partitions \n", + "0 date, county, state, fips, cases, deaths \n", + "1 date, state, fips, cases, deaths " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wr.catalog.tables(database=\"covid-19\", name_contains=\"nyt\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datecountystatefipscasesdeaths
02020-01-21SnohomishWashington5306110
12020-01-22SnohomishWashington5306110
22020-01-23SnohomishWashington5306110
32020-01-24CookIllinois1703110
42020-01-24SnohomishWashington5306110
52020-01-25OrangeCalifornia0605910
62020-01-25CookIllinois1703110
72020-01-25SnohomishWashington5306110
82020-01-26MaricopaArizona0401310
92020-01-26Los AngelesCalifornia0603710
\n", + "
" + ], + "text/plain": [ + " date county state fips cases deaths\n", + "0 2020-01-21 Snohomish Washington 53061 1 0\n", + "1 2020-01-22 Snohomish Washington 53061 1 0\n", + "2 2020-01-23 Snohomish Washington 53061 1 0\n", + "3 2020-01-24 Cook Illinois 17031 1 0\n", + "4 2020-01-24 Snohomish Washington 53061 1 0\n", + "5 2020-01-25 Orange California 06059 1 0\n", + "6 2020-01-25 Cook Illinois 17031 1 0\n", + "7 2020-01-25 Snohomish Washington 53061 1 0\n", + "8 2020-01-26 Maricopa Arizona 04013 1 0\n", + "9 2020-01-26 Los Angeles California 06037 1 0" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wr.athena.read_sql_query(\"SELECT * FROM nytimes_counties limit 10\", database=\"covid-19\", ctas_approach=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datecountystatefipsconfirmeddeathspopulationcounty2Hospitalhospital_fipslicensed_bedsstaffed_bedsicu_bedsbed_utilizationpotential_increase_bed_capacity
02020-04-12ParkMontana300677016736Park030067252540.4325480
12020-04-12RavalliMontana300813043172Ravalli030081252550.5677810
22020-04-12Silver BowMontana3009311034993Silver Bow0300939871110.55145727
32020-04-12ClayNebraska31035206214Clay<NA><NA><NA><NA><NA>NaN<NA>
42020-04-12CumingNebraska31039208940Cuming031039252540.2044930
................................................
2276842020-06-11HockleyTexas4821928122980Hockley048219484880.1206050
2276852020-06-11HudspethTexas482291104795Hudspeth<NA><NA><NA><NA><NA>NaN<NA>
2276862020-06-11JonesTexas48253633019817Jones04825345710.71859138
2276872020-06-11La SalleTexas48283407531La Salle<NA><NA><NA><NA><NA>NaN<NA>
2276882020-06-11LimestoneTexas4829336123519Limestone048293786990.1639409
\n", + "

227689 rows × 15 columns

\n", + "
" + ], + "text/plain": [ + " date county state fips confirmed deaths population \\\n", + "0 2020-04-12 Park Montana 30067 7 0 16736 \n", + "1 2020-04-12 Ravalli Montana 30081 3 0 43172 \n", + "2 2020-04-12 Silver Bow Montana 30093 11 0 34993 \n", + "3 2020-04-12 Clay Nebraska 31035 2 0 6214 \n", + "4 2020-04-12 Cuming Nebraska 31039 2 0 8940 \n", + "... ... ... ... ... ... ... ... \n", + "227684 2020-06-11 Hockley Texas 48219 28 1 22980 \n", + "227685 2020-06-11 Hudspeth Texas 48229 11 0 4795 \n", + "227686 2020-06-11 Jones Texas 48253 633 0 19817 \n", + "227687 2020-06-11 La Salle Texas 48283 4 0 7531 \n", + "227688 2020-06-11 Limestone Texas 48293 36 1 23519 \n", + "\n", + " county2 Hospital hospital_fips licensed_beds staffed_beds \\\n", + "0 Park 0 30067 25 25 \n", + "1 Ravalli 0 30081 25 25 \n", + "2 Silver Bow 0 30093 98 71 \n", + "3 Clay \n", + "4 Cuming 0 31039 25 25 \n", + "... ... ... ... ... ... \n", + "227684 Hockley 0 48219 48 48 \n", + "227685 Hudspeth \n", + "227686 Jones 0 48253 45 7 \n", + "227687 La Salle \n", + "227688 Limestone 0 48293 78 69 \n", + "\n", + " icu_beds bed_utilization potential_increase_bed_capacity \n", + "0 4 0.432548 0 \n", + "1 5 0.567781 0 \n", + "2 11 0.551457 27 \n", + "3 NaN \n", + "4 4 0.204493 0 \n", + "... ... ... ... \n", + "227684 8 0.120605 0 \n", + "227685 NaN \n", + "227686 1 0.718591 38 \n", + "227687 NaN \n", + "227688 9 0.163940 9 \n", + "\n", + "[227689 rows x 15 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sql = \"\"\"\n", + "SELECT \n", + " j.*, \n", + " co.Population, \n", + " co.county AS county2, \n", + " hb.* \n", + "FROM \n", + " (\n", + " SELECT \n", + " date, \n", + " county, \n", + " state, \n", + " fips, \n", + " cases as confirmed, \n", + " deaths \n", + " FROM \"covid-19\".nytimes_counties\n", + " ) j \n", + " LEFT OUTER JOIN (\n", + " SELECT \n", + " DISTINCT county, \n", + " state, \n", + " \"population estimate 2018\" AS Population \n", + " FROM \n", + " \"covid-19\".county_populations \n", + " WHERE \n", + " state IN (\n", + " SELECT \n", + " DISTINCT state \n", + " FROM \n", + " \"covid-19\".nytimes_counties\n", + " ) \n", + " AND county IN (\n", + " SELECT \n", + " DISTINCT county as county \n", + " FROM \"covid-19\".nytimes_counties\n", + " )\n", + " ) co ON co.county = j.county \n", + " AND co.state = j.state \n", + " LEFT OUTER JOIN (\n", + " SELECT \n", + " count(objectid) as Hospital, \n", + " fips as hospital_fips, \n", + " sum(num_licensed_beds) as licensed_beds, \n", + " sum(num_staffed_beds) as staffed_beds, \n", + " sum(num_icu_beds) as icu_beds, \n", + " avg(bed_utilization) as bed_utilization, \n", + " sum(\n", + " potential_increase_in_bed_capac\n", + " ) as potential_increase_bed_capacity \n", + " FROM \"covid-19\".hospital_beds \n", + " WHERE \n", + " fips in (\n", + " SELECT \n", + " DISTINCT fips \n", + " FROM \n", + " \"covid-19\".nytimes_counties\n", + " ) \n", + " GROUP BY \n", + " 2\n", + " ) hb ON hb.hospital_fips = j.fips\n", + "\"\"\"\n", + "\n", + "wr.athena.read_sql_query(sql, database=\"covid-19\", ctas_approach=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create Dataset with custom SQL option" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "wr.quicksight.create_athena_dataset(\n", + " name=\"covid19-nytimes-usa\",\n", + " sql=sql,\n", + " sql_name='CustomSQL',\n", + " data_source_name=\"covid-19\",\n", + " 
import_mode='SPICE',\n", + " allowed_to_manage=[\"dev\"]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "ingestion_id = wr.quicksight.create_ingestion(\"covid19-nytimes-usa\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Wait ingestion" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "while wr.quicksight.describe_ingestion(ingestion_id=ingestion_id, dataset_name=\"covid19-nytimes-usa\")[\"IngestionStatus\"] not in [\"COMPLETED\", \"FAILED\"]:\n", + " sleep(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Describe last ingestion" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'RowsIngested': 227689, 'RowsDropped': 0}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wr.quicksight.describe_ingestion(ingestion_id=ingestion_id, dataset_name=\"covid19-nytimes-usa\")[\"RowInfo\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "List all ingestions" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'time': datetime.datetime(2020, 6, 12, 15, 13, 46, 996000, tzinfo=tzlocal()),\n", + " 'source': 'MANUAL'},\n", + " {'time': datetime.datetime(2020, 6, 12, 15, 13, 42, 344000, tzinfo=tzlocal()),\n", + " 'source': 'MANUAL'}]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[{\"time\": user[\"CreatedTime\"], \"source\": user[\"RequestSource\"]} for user in wr.quicksight.list_ingestions(\"covid19-nytimes-usa\")]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create new dataset from a table directly" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "wr.quicksight.create_athena_dataset(\n", + " name=\"covid-19-tableau_jhu\",\n", + " table=\"tableau_jhu\",\n", + " data_source_name=\"covid-19\",\n", + " database=\"covid-19\",\n", + " import_mode='DIRECT_QUERY',\n", + " rename_columns={\n", + " \"cases\": \"Count_of_Cases\", \n", + " \"combined_key\": \"County\"\n", + " },\n", + " cast_columns_types={\n", + " \"Count_of_Cases\": \"INTEGER\"\n", + " },\n", + " allowed_to_manage=[\"dev\"]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Cleaning up" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "wr.quicksight.delete_data_source(\"covid-19\")\n", + "wr.quicksight.delete_dataset(\"covid19-nytimes-usa\")\n", + "wr.quicksight.delete_dataset(\"covid-19-tableau_jhu\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "awswrangler", + "language": "python", + "name": "awswrangler" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.10" + }, + "pycharm": { + "stem_cell": { + "cell_type": "raw", + "metadata": { + "collapsed": false + }, + "source": [] + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 775781051b3f5453bb10fd8012915304c2c393da Mon Sep 17 00:00:00 2001 From: igorborgest Date: Sat, 13 
Jun 2020 21:49:51 -0300 Subject: [PATCH 26/28] Bumping version to 1.5.0 --- README.md | 4 ++-- awswrangler/__init__.py | 4 ++-- awswrangler/__metadata__.py | 2 +- awswrangler/catalog.py | 18 ++++++++++-------- awswrangler/cloudwatch.py | 14 +++++++------- awswrangler/db.py | 6 +++--- awswrangler/quicksight/__init__.py | 1 + awswrangler/quicksight/_create.py | 5 ++++- awswrangler/s3/_delete.py | 4 ++-- awswrangler/s3/_describe.py | 6 ++++-- awswrangler/s3/_read.py | 14 +++++++------- awswrangler/s3/_wait.py | 10 +++++----- docs/source/api.rst | 1 + testing/test_awswrangler/test_data_lake.py | 13 +++++-------- testing/test_awswrangler/test_metadata.py | 2 +- 15 files changed, 55 insertions(+), 49 deletions(-) diff --git a/README.md b/README.md index cb27ae253..73931fa01 100644 --- a/README.md +++ b/README.md @@ -3,13 +3,13 @@ ![AWS Data Wrangler](docs/source/_static/logo2.png?raw=true "AWS Data Wrangler") -[![Release](https://img.shields.io/badge/release-1.4.0-brightgreen.svg)](https://pypi.org/project/awswrangler/) +[![Release](https://img.shields.io/badge/release-1.5.0-brightgreen.svg)](https://pypi.org/project/awswrangler/) [![Python Version](https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8-brightgreen.svg)](https://anaconda.org/conda-forge/awswrangler) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) [![Checked with mypy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/) -[![Coverage](https://img.shields.io/badge/coverage-100%25-brightgreen.svg)](https://pypi.org/project/awswrangler/) +[![Coverage](https://img.shields.io/badge/coverage-90%25-brightgreen.svg)](https://pypi.org/project/awswrangler/) ![Static Checking](https://github.com/awslabs/aws-data-wrangler/workflows/Static%20Checking/badge.svg?branch=master) [![Documentation Status](https://readthedocs.org/projects/aws-data-wrangler/badge/?version=latest)](https://aws-data-wrangler.readthedocs.io/?badge=latest) diff --git a/awswrangler/__init__.py b/awswrangler/__init__.py index 9aff3abcd..94cc28ba7 100644 --- a/awswrangler/__init__.py +++ b/awswrangler/__init__.py @@ -5,10 +5,10 @@ """ -import logging +import logging as _logging from awswrangler import athena, catalog, cloudwatch, db, emr, exceptions, quicksight, s3 # noqa from awswrangler.__metadata__ import __description__, __license__, __title__, __version__ # noqa from awswrangler._utils import get_account_id # noqa -logging.getLogger("awswrangler").addHandler(logging.NullHandler()) +_logging.getLogger("awswrangler").addHandler(_logging.NullHandler()) diff --git a/awswrangler/__metadata__.py b/awswrangler/__metadata__.py index dc3dcb059..b2ebec6d8 100644 --- a/awswrangler/__metadata__.py +++ b/awswrangler/__metadata__.py @@ -7,5 +7,5 @@ __title__ = "awswrangler" __description__ = "Pandas on AWS." 
-__version__ = "1.4.0" +__version__ = "1.5.0" __license__ = "Apache License 2.0" diff --git a/awswrangler/catalog.py b/awswrangler/catalog.py index aa2f34b85..7cea51199 100644 --- a/awswrangler/catalog.py +++ b/awswrangler/catalog.py @@ -6,7 +6,7 @@ import re import unicodedata from typing import Any, Dict, Iterator, List, Optional, Tuple, Union -from urllib.parse import quote_plus +from urllib.parse import quote_plus as _quote_plus import boto3 # type: ignore import pandas as pd # type: ignore @@ -566,10 +566,12 @@ def get_tables( if catalog_id is not None: args["CatalogId"] = catalog_id if (name_prefix is not None) and (name_suffix is not None) and (name_contains is not None): - raise exceptions.InvalidArgumentCombination("Please, does not filter using name_contains and " - "name_prefix/name_suffix at the same time. Only " - "name_prefix and name_suffix can be combined together.") - elif (name_prefix is not None) and (name_suffix is not None): + raise exceptions.InvalidArgumentCombination( + "Please, does not filter using name_contains and " + "name_prefix/name_suffix at the same time. Only " + "name_prefix and name_suffix can be combined together." + ) + if (name_prefix is not None) and (name_suffix is not None): args["Expression"] = f"{name_prefix}*{name_suffix}" elif name_contains is not None: args["Expression"] = f"*{name_contains}*" @@ -665,7 +667,7 @@ def tables( if "Columns" in tbl["StorageDescriptor"]: df_dict["Columns"].append(", ".join([x["Name"] for x in tbl["StorageDescriptor"]["Columns"]])) else: - df_dict["Columns"].append("") + df_dict["Columns"].append("") # pragma: no cover if "PartitionKeys" in tbl: df_dict["Partitions"].append(", ".join([x["Name"] for x in tbl["PartitionKeys"]])) else: @@ -1008,8 +1010,8 @@ def get_engine( db_type: str = details["JDBC_CONNECTION_URL"].split(":")[1].lower() host: str = details["JDBC_CONNECTION_URL"].split(":")[2].replace("/", "") port, database = details["JDBC_CONNECTION_URL"].split(":")[3].split("/") - user: str = quote_plus(details["USERNAME"]) - password: str = quote_plus(details["PASSWORD"]) + user: str = _quote_plus(details["USERNAME"]) + password: str = _quote_plus(details["PASSWORD"]) if db_type == "postgresql": _utils.ensure_postgresql_casts() if db_type in ("redshift", "postgresql"): diff --git a/awswrangler/cloudwatch.py b/awswrangler/cloudwatch.py index c36fab70b..5ee5f722f 100644 --- a/awswrangler/cloudwatch.py +++ b/awswrangler/cloudwatch.py @@ -1,8 +1,8 @@ """CloudWatch Logs module.""" +import datetime import logging import time -from datetime import datetime from typing import Any, Dict, List, Optional import boto3 # type: ignore @@ -18,8 +18,8 @@ def start_query( query: str, log_group_names: List[str], - start_time: datetime = datetime(year=1970, month=1, day=1), - end_time: datetime = datetime.now(), + start_time: datetime.datetime = datetime.datetime(year=1970, month=1, day=1), + end_time: datetime.datetime = datetime.datetime.now(), limit: Optional[int] = None, boto3_session: Optional[boto3.Session] = None, ) -> str: @@ -120,8 +120,8 @@ def wait_query(query_id: str, boto3_session: Optional[boto3.Session] = None) -> def run_query( query: str, log_group_names: List[str], - start_time: datetime = datetime(year=1970, month=1, day=1), - end_time: datetime = datetime.now(), + start_time: datetime.datetime = datetime.datetime(year=1970, month=1, day=1), + end_time: datetime.datetime = datetime.datetime.now(), limit: Optional[int] = None, boto3_session: Optional[boto3.Session] = None, ) -> List[List[Dict[str, str]]]: @@ 
-174,8 +174,8 @@ def run_query( def read_logs( query: str, log_group_names: List[str], - start_time: datetime = datetime(year=1970, month=1, day=1), - end_time: datetime = datetime.now(), + start_time: datetime.datetime = datetime.datetime(year=1970, month=1, day=1), + end_time: datetime.datetime = datetime.datetime.now(), limit: Optional[int] = None, boto3_session: Optional[boto3.Session] = None, ) -> pd.DataFrame: diff --git a/awswrangler/db.py b/awswrangler/db.py index 4f696ab2e..1cebdfc4e 100644 --- a/awswrangler/db.py +++ b/awswrangler/db.py @@ -4,7 +4,7 @@ import logging import time from typing import Any, Dict, Iterator, List, Optional, Tuple, Union -from urllib.parse import quote_plus +from urllib.parse import quote_plus as _quote_plus import boto3 # type: ignore import pandas as pd # type: ignore @@ -350,8 +350,8 @@ def get_redshift_temp_engine( res: Dict[str, Any] = client_redshift.get_cluster_credentials( DbUser=user, ClusterIdentifier=cluster_identifier, DurationSeconds=duration, AutoCreate=False ) - _user: str = quote_plus(res["DbUser"]) - password: str = quote_plus(res["DbPassword"]) + _user: str = _quote_plus(res["DbUser"]) + password: str = _quote_plus(res["DbPassword"]) cluster: Dict[str, Any] = client_redshift.describe_clusters(ClusterIdentifier=cluster_identifier)["Clusters"][0] host: str = cluster["Endpoint"]["Address"] port: str = cluster["Endpoint"]["Port"] diff --git a/awswrangler/quicksight/__init__.py b/awswrangler/quicksight/__init__.py index 47b1f0b8a..4e81c2431 100644 --- a/awswrangler/quicksight/__init__.py +++ b/awswrangler/quicksight/__init__.py @@ -39,5 +39,6 @@ list_iam_policy_assignments_for_user, list_ingestions, list_templates, + list_user_groups, list_users, ) diff --git a/awswrangler/quicksight/_create.py b/awswrangler/quicksight/_create.py index d659e2ce1..34e2c3103 100644 --- a/awswrangler/quicksight/_create.py +++ b/awswrangler/quicksight/_create.py @@ -275,7 +275,10 @@ def create_athena_dataset( if ((database is None) and (table is None)) and (sql is None): raise exceptions.InvalidArgument("You must pass database/table OR sql argument.") if (database is not None) and (sql is not None): - raise exceptions.InvalidArgument("If you provide sql argument, please include the database name inside the sql statement. Do NOT pass in with database argument.") + raise exceptions.InvalidArgument( + "If you provide sql argument, please include the database name inside the sql statement." + "Do NOT pass in with database argument." 
+ ) session: boto3.Session = _utils.ensure_session(session=boto3_session) client: boto3.client = _utils.client(service_name="quicksight", session=session) if account_id is None: diff --git a/awswrangler/s3/_delete.py b/awswrangler/s3/_delete.py index 3c9cad484..b2d53759a 100644 --- a/awswrangler/s3/_delete.py +++ b/awswrangler/s3/_delete.py @@ -1,8 +1,8 @@ """Amazon S3 CopDeletey Module (PRIVATE).""" import concurrent.futures +import itertools import logging -from itertools import repeat from typing import Dict, List, Optional, Union import boto3 # type: ignore @@ -82,4 +82,4 @@ def delete_objects( else: cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor: - list(executor.map(_delete_objects, repeat(bucket), chunks, repeat(client_s3))) + list(executor.map(_delete_objects, itertools.repeat(bucket), chunks, itertools.repeat(client_s3))) diff --git a/awswrangler/s3/_describe.py b/awswrangler/s3/_describe.py index 0319dcb94..c2e09bad8 100644 --- a/awswrangler/s3/_describe.py +++ b/awswrangler/s3/_describe.py @@ -1,9 +1,9 @@ """Amazon S3 Describe Module (INTERNAL).""" import concurrent.futures +import itertools import logging import time -from itertools import repeat from typing import Any, Dict, List, Optional, Tuple, Union import boto3 # type: ignore @@ -94,7 +94,9 @@ def describe_objects( else: cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor: - resp_list = list(executor.map(_describe_object, paths, repeat(wait_time), repeat(client_s3))) + resp_list = list( + executor.map(_describe_object, paths, itertools.repeat(wait_time), itertools.repeat(client_s3)) + ) desc_dict: Dict[str, Dict[str, Any]] = dict(resp_list) return desc_dict diff --git a/awswrangler/s3/_read.py b/awswrangler/s3/_read.py index 857f3fdcf..e249aca82 100644 --- a/awswrangler/s3/_read.py +++ b/awswrangler/s3/_read.py @@ -1,8 +1,8 @@ """Amazon S3 Read Module (PRIVATE).""" import concurrent.futures +import itertools import logging -from itertools import repeat from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union import boto3 # type: ignore @@ -128,13 +128,13 @@ def _read_text( df = pd.concat( objs=executor.map( _read_text_full, - repeat(parser_func), - repeat(path_root), + itertools.repeat(parser_func), + itertools.repeat(path_root), paths, - repeat(_utils.boto3_to_primitives(boto3_session=session)), # Boto3.Session - repeat(pandas_kwargs), - repeat(s3_additional_kwargs), - repeat(dataset), + itertools.repeat(_utils.boto3_to_primitives(boto3_session=session)), # Boto3.Session + itertools.repeat(pandas_kwargs), + itertools.repeat(s3_additional_kwargs), + itertools.repeat(dataset), ), ignore_index=True, sort=False, diff --git a/awswrangler/s3/_wait.py b/awswrangler/s3/_wait.py index c2ebc7a74..45487db61 100644 --- a/awswrangler/s3/_wait.py +++ b/awswrangler/s3/_wait.py @@ -1,8 +1,8 @@ """Amazon S3 Wait Module (PRIVATE).""" import concurrent.futures +import itertools import logging -from itertools import repeat from typing import List, Optional, Tuple, Union import boto3 # type: ignore @@ -38,10 +38,10 @@ def _wait_objects( executor.map( _wait_objects_concurrent, _paths, - repeat(waiter_name), - repeat(client_s3), - repeat(_delay), - repeat(max_attempts), + itertools.repeat(waiter_name), + itertools.repeat(client_s3), + itertools.repeat(_delay), + itertools.repeat(max_attempts), ) ) return None diff --git a/docs/source/api.rst 
b/docs/source/api.rst index a61526683..16bb6ed0c 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -183,3 +183,4 @@ Amazon QuickSight list_ingestions list_templates list_users + list_user_groups diff --git a/testing/test_awswrangler/test_data_lake.py b/testing/test_awswrangler/test_data_lake.py index 19fb1ca19..e0c43de9d 100644 --- a/testing/test_awswrangler/test_data_lake.py +++ b/testing/test_awswrangler/test_data_lake.py @@ -1078,15 +1078,12 @@ def test_catalog(path, database, table): if tbl["Name"] == table: assert tbl["TableType"] == "EXTERNAL_TABLE" # prefix & suffix & name_contains - tables = list( - wr.catalog.get_tables( - name_prefix=table[0], name_contains=table[3], name_suffix=table[-1], catalog_id=account_id + with pytest.raises(wr.exceptions.InvalidArgumentCombination): + list( + wr.catalog.get_tables( + name_prefix=table[0], name_contains=table[3], name_suffix=table[-1], catalog_id=account_id + ) ) - ) - assert len(tables) > 0 - for tbl in tables: - if tbl["Name"] == table: - assert tbl["TableType"] == "EXTERNAL_TABLE" # prefix & suffix tables = list(wr.catalog.get_tables(name_prefix=table[0], name_suffix=table[-1], catalog_id=account_id)) assert len(tables) > 0 diff --git a/testing/test_awswrangler/test_metadata.py b/testing/test_awswrangler/test_metadata.py index c8f0bc067..d4084dff1 100644 --- a/testing/test_awswrangler/test_metadata.py +++ b/testing/test_awswrangler/test_metadata.py @@ -2,7 +2,7 @@ def test_metadata(): - assert wr.__version__ == "1.4.0" + assert wr.__version__ == "1.5.0" assert wr.__title__ == "awswrangler" assert wr.__description__ == "Pandas on AWS." assert wr.__license__ == "Apache License 2.0" From b9ac28fbb5e84e762e6b92fbb116104026044b8c Mon Sep 17 00:00:00 2001 From: igorborgest Date: Sat, 13 Jun 2020 21:59:38 -0300 Subject: [PATCH 27/28] Updating README.md --- README.md | 14 +++++--------- docs/source/index.rst | 9 +++------ 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 73931fa01..4eedd504d 100644 --- a/README.md +++ b/README.md @@ -43,8 +43,7 @@ df = wr.s3.read_parquet("s3://bucket/dataset/", dataset=True) # Retrieving the data from Amazon Athena df = wr.athena.read_sql_query("SELECT * FROM my_table", database="my_db") -# Getting Redshift connection (SQLAlchemy) from Glue Catalog Connections -# Retrieving the data from Amazon Redshift Spectrum +# Get Redshift connection (SQLAlchemy) from Glue Catalog and retrieving data from Redshift Spectrum engine = wr.catalog.get_engine("my-redshift-connection") df = wr.db.read_sql_query("SELECT * FROM external_schema.my_table", con=engine) @@ -58,16 +57,13 @@ wr.quicksight.create_athena_dataset( allowed_to_manage=["username"] ) -# Getting MySQL connection (SQLAlchemy) from Glue Catalog Connections -# Load the data into MySQL +# Get MySQL connection (SQLAlchemy) from Glue Catalog and LOAD the data into MySQL engine = wr.catalog.get_engine("my-mysql-connection") -wr.db.to_sql(df, engine, schema="test", name="my_table") +wr.db.to_sql(df, engine, schema="test", name="my_table") -# Getting PostgreSQL connection (SQLAlchemy) from Glue Catalog Connections -# Load the data into PostgreSQL +# Get PostgreSQL connection (SQLAlchemy) from Glue Catalog and LOAD the data into PostgreSQL engine = wr.catalog.get_engine("my-postgresql-connection") -wr.db.to_sql(df, engine, schema="test", name="my_table") - +wr.db.to_sql(df, engine, schema="test", name="my_table") ``` ## [Read The Docs](https://aws-data-wrangler.readthedocs.io/) diff --git a/docs/source/index.rst 
b/docs/source/index.rst index 2335c0209..e0db8d42d 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -25,8 +25,7 @@ Quick Start # Retrieving the data from Amazon Athena df = wr.athena.read_sql_query("SELECT * FROM my_table", database="my_db") - # Getting Redshift connection (SQLAlchemy) from Glue Catalog Connections - # Retrieving the data from Amazon Redshift Spectrum + # Get Redshift connection (SQLAlchemy) from Glue Catalog and retrieving data from Redshift Spectrum engine = wr.catalog.get_engine("my-redshift-connection") df = wr.db.read_sql_query("SELECT * FROM external_schema.my_table", con=engine) @@ -40,13 +39,11 @@ Quick Start allowed_to_manage=["username"] ) - # Getting MySQL connection (SQLAlchemy) from Glue Catalog Connections - # Load the data into MySQL + # Get MySQL connection (SQLAlchemy) from Glue Catalog and LOAD the data into MySQL engine = wr.catalog.get_engine("my-mysql-connection") wr.db.to_sql(df, engine, schema="test", name="my_table") - # Getting PostgreSQL connection (SQLAlchemy) from Glue Catalog Connections - # Load the data into PostgreSQL + # Get PostgreSQL connection (SQLAlchemy) from Glue Catalog and LOAD the data into PostgreSQL engine = wr.catalog.get_engine("my-postgresql-connection") wr.db.to_sql(df, engine, schema="test", name="my_table") From 35f1675cf0b96689ccef5fbcde2d092899269791 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Sat, 13 Jun 2020 22:05:56 -0300 Subject: [PATCH 28/28] Updating README.md --- README.md | 2 +- docs/source/index.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4eedd504d..41c903571 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ df = wr.s3.read_parquet("s3://bucket/dataset/", dataset=True) # Retrieving the data from Amazon Athena df = wr.athena.read_sql_query("SELECT * FROM my_table", database="my_db") -# Get Redshift connection (SQLAlchemy) from Glue Catalog and retrieving data from Redshift Spectrum +# Get Redshift connection (SQLAlchemy) from Glue and retrieving data from Redshift Spectrum engine = wr.catalog.get_engine("my-redshift-connection") df = wr.db.read_sql_query("SELECT * FROM external_schema.my_table", con=engine) diff --git a/docs/source/index.rst b/docs/source/index.rst index e0db8d42d..ccd45d34c 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -25,7 +25,7 @@ Quick Start # Retrieving the data from Amazon Athena df = wr.athena.read_sql_query("SELECT * FROM my_table", database="my_db") - # Get Redshift connection (SQLAlchemy) from Glue Catalog and retrieving data from Redshift Spectrum + # Get Redshift connection (SQLAlchemy) from Glue and retrieving data from Redshift Spectrum engine = wr.catalog.get_engine("my-redshift-connection") df = wr.db.read_sql_query("SELECT * FROM external_schema.my_table", con=engine)
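
For reference, a minimal end-to-end sketch of the QuickSight workflow exercised in the tutorial notebook above: create an Athena-backed SPICE dataset from a custom SQL statement, trigger an ingestion, poll it until it reaches a terminal state, then clean up. This is a sketch under assumptions, not part of the patches: the names `my_dataset` and `my_data_source` and the SQL text are hypothetical placeholders, and only the awswrangler calls demonstrated in the notebook are used (the Athena data source is assumed to already exist in QuickSight).

```python
from time import sleep

import awswrangler as wr

# Create an Athena-backed SPICE dataset from a custom SQL statement.
wr.quicksight.create_athena_dataset(
    name="my_dataset",                   # hypothetical dataset name
    sql="SELECT 1 AS dummy",             # hypothetical custom SQL
    sql_name="CustomSQL",
    data_source_name="my_data_source",   # hypothetical; must already exist in QuickSight
    import_mode="SPICE",
    allowed_to_manage=["dev"],
)

# Trigger a SPICE ingestion and poll until it completes or fails.
ingestion_id = wr.quicksight.create_ingestion("my_dataset")
while (
    wr.quicksight.describe_ingestion(ingestion_id=ingestion_id, dataset_name="my_dataset")["IngestionStatus"]
    not in ["COMPLETED", "FAILED"]
):
    sleep(1)

# Inspect how many rows were ingested, then drop the dataset.
print(wr.quicksight.describe_ingestion(ingestion_id=ingestion_id, dataset_name="my_dataset")["RowInfo"])
wr.quicksight.delete_dataset("my_dataset")
```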