diff --git a/README.md b/README.md index cabd20ea3..41c903571 100644 --- a/README.md +++ b/README.md @@ -3,13 +3,13 @@ ![AWS Data Wrangler](docs/source/_static/logo2.png?raw=true "AWS Data Wrangler") -[![Release](https://img.shields.io/badge/release-1.4.0-brightgreen.svg)](https://pypi.org/project/awswrangler/) +[![Release](https://img.shields.io/badge/release-1.5.0-brightgreen.svg)](https://pypi.org/project/awswrangler/) [![Python Version](https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8-brightgreen.svg)](https://anaconda.org/conda-forge/awswrangler) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) [![Checked with mypy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/) -[![Coverage](https://img.shields.io/badge/coverage-100%25-brightgreen.svg)](https://pypi.org/project/awswrangler/) +[![Coverage](https://img.shields.io/badge/coverage-90%25-brightgreen.svg)](https://pypi.org/project/awswrangler/) ![Static Checking](https://github.com/awslabs/aws-data-wrangler/workflows/Static%20Checking/badge.svg?branch=master) [![Documentation Status](https://readthedocs.org/projects/aws-data-wrangler/badge/?version=latest)](https://aws-data-wrangler.readthedocs.io/?badge=latest) @@ -43,11 +43,27 @@ df = wr.s3.read_parquet("s3://bucket/dataset/", dataset=True) # Retrieving the data from Amazon Athena df = wr.athena.read_sql_query("SELECT * FROM my_table", database="my_db") -# Getting Redshift connection (SQLAlchemy) from Glue Catalog Connections +# Get Redshift connection (SQLAlchemy) from Glue and retrieving data from Redshift Spectrum engine = wr.catalog.get_engine("my-redshift-connection") - -# Retrieving the data from Amazon Redshift Spectrum df = wr.db.read_sql_query("SELECT * FROM external_schema.my_table", con=engine) + +# Creating QuickSight Data Source and Dataset to reflect our new table +wr.quicksight.create_athena_data_source("athena-source", allowed_to_manage=["username"]) +wr.quicksight.create_athena_dataset( + name="my-dataset", + database="my_db", + table="my_table", + data_source_name="athena-source", + allowed_to_manage=["username"] +) + +# Get MySQL connection (SQLAlchemy) from Glue Catalog and LOAD the data into MySQL +engine = wr.catalog.get_engine("my-mysql-connection") +wr.db.to_sql(df, engine, schema="test", name="my_table") + +# Get PostgreSQL connection (SQLAlchemy) from Glue Catalog and LOAD the data into PostgreSQL +engine = wr.catalog.get_engine("my-postgresql-connection") +wr.db.to_sql(df, engine, schema="test", name="my_table") ``` ## [Read The Docs](https://aws-data-wrangler.readthedocs.io/) @@ -80,6 +96,7 @@ df = wr.db.read_sql_query("SELECT * FROM external_schema.my_table", con=engine) - [015 - EMR](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/015%20-%20EMR.ipynb) - [016 - EMR & Docker](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/016%20-%20EMR%20%26%20Docker.ipynb) - [017 - Partition Projection](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/017%20-%20Partition%20Projection.ipynb) + - [018 - QuickSight](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/018%20-%20QuickSight.ipynb) - [**API Reference**](https://aws-data-wrangler.readthedocs.io/en/latest/api.html) - [Amazon S3](https://aws-data-wrangler.readthedocs.io/en/latest/api.html#amazon-s3) - [AWS Glue 
Catalog](https://aws-data-wrangler.readthedocs.io/en/latest/api.html#aws-glue-catalog) @@ -87,6 +104,7 @@ df = wr.db.read_sql_query("SELECT * FROM external_schema.my_table", con=engine) - [Databases (Redshift, PostgreSQL, MySQL)](https://aws-data-wrangler.readthedocs.io/en/latest/api.html#databases-redshift-postgresql-mysql) - [EMR Cluster](https://aws-data-wrangler.readthedocs.io/en/latest/api.html#emr-cluster) - [CloudWatch Logs](https://aws-data-wrangler.readthedocs.io/en/latest/api.html#cloudwatch-logs) + - [QuickSight](https://aws-data-wrangler.readthedocs.io/en/latest/api.html#quicksight) - [**License**](https://github.com/awslabs/aws-data-wrangler/blob/master/LICENSE) - [**Contributing**](https://github.com/awslabs/aws-data-wrangler/blob/master/CONTRIBUTING.md) - [**Legacy Docs** (pre-1.0.0)](https://aws-data-wrangler.readthedocs.io/en/legacy/) diff --git a/awswrangler/__init__.py b/awswrangler/__init__.py index 4413ab5f4..94cc28ba7 100644 --- a/awswrangler/__init__.py +++ b/awswrangler/__init__.py @@ -5,10 +5,10 @@ """ -import logging +import logging as _logging -from awswrangler import athena, catalog, cloudwatch, db, emr, exceptions, s3 # noqa +from awswrangler import athena, catalog, cloudwatch, db, emr, exceptions, quicksight, s3 # noqa from awswrangler.__metadata__ import __description__, __license__, __title__, __version__ # noqa from awswrangler._utils import get_account_id # noqa -logging.getLogger("awswrangler").addHandler(logging.NullHandler()) +_logging.getLogger("awswrangler").addHandler(_logging.NullHandler()) diff --git a/awswrangler/__metadata__.py b/awswrangler/__metadata__.py index dc3dcb059..b2ebec6d8 100644 --- a/awswrangler/__metadata__.py +++ b/awswrangler/__metadata__.py @@ -7,5 +7,5 @@ __title__ = "awswrangler" __description__ = "Pandas on AWS." 
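As a side note to the README quick-start above: a minimal, hypothetical follow-up (not part of this diff) that pushes the new dataset into SPICE and then removes the demo resources. It reuses the placeholder names from the example and assumes the dataset was created with `import_mode="SPICE"`.

```python
import awswrangler as wr

# Kick off a SPICE ingestion for the dataset created in the quick-start.
ingestion_id = wr.quicksight.create_ingestion(dataset_name="my-dataset")

# Tear the demo resources down afterwards.
wr.quicksight.delete_dataset(name="my-dataset")
wr.quicksight.delete_data_source(name="athena-source")
```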
-__version__ = "1.4.0" +__version__ = "1.5.0" __license__ = "Apache License 2.0" diff --git a/awswrangler/_data_types.py b/awswrangler/_data_types.py index 50cc0e372..e0ffcf208 100644 --- a/awswrangler/_data_types.py +++ b/awswrangler/_data_types.py @@ -114,6 +114,34 @@ def athena2redshift( # pylint: disable=too-many-branches,too-many-return-statem raise exceptions.UnsupportedType(f"Unsupported Athena type: {dtype}") # pragma: no cover +def athena2quicksight(dtype: str) -> str: # pylint: disable=too-many-branches,too-many-return-statements + """Athena to Quicksight data types conversion.""" + dtype = dtype.lower() + if dtype == "smallint": + return "INTEGER" + if dtype in ("int", "integer"): + return "INTEGER" + if dtype == "bigint": + return "INTEGER" + if dtype == "float": + return "DECIMAL" + if dtype == "double": + return "DECIMAL" + if dtype in ("boolean", "bool"): + return "BOOLEAN" + if dtype in ("string", "char", "varchar"): + return "STRING" + if dtype == "timestamp": + return "DATETIME" + if dtype == "date": + return "DATETIME" + if dtype.startswith("decimal"): + return "DECIMAL" + if dtype in ("binary" or "varbinary"): + return "BIT" + raise exceptions.UnsupportedType(f"Unsupported Athena type: {dtype}") # pragma: no cover + + def pyarrow2athena(dtype: pa.DataType) -> str: # pylint: disable=too-many-branches,too-many-return-statements """Pyarrow to Athena data types conversion.""" if pa.types.is_int8(dtype): diff --git a/awswrangler/_utils.py b/awswrangler/_utils.py index c399701f8..860b5ac24 100644 --- a/awswrangler/_utils.py +++ b/awswrangler/_utils.py @@ -1,10 +1,11 @@ """Internal (private) Utilities Module.""" +import copy import logging import math import os import random -from typing import Any, Dict, Generator, List, Optional, Tuple +from typing import Any, Dict, Generator, List, Optional, Tuple, Union import boto3 # type: ignore import botocore.config # type: ignore @@ -17,8 +18,10 @@ _logger: logging.Logger = logging.getLogger(__name__) -def ensure_session(session: Optional[boto3.Session] = None) -> boto3.Session: +def ensure_session(session: Optional[Union[boto3.Session, Dict[str, Optional[str]]]] = None) -> boto3.Session: """Ensure that a valid boto3.Session will be returned.""" + if isinstance(session, dict): # Primitives received + return boto3_from_primitives(primitives=session) if session is not None: return session # Ensure the boto3's default session is used so that its parameters can be @@ -28,6 +31,30 @@ def ensure_session(session: Optional[boto3.Session] = None) -> boto3.Session: return boto3.Session() # pragma: no cover +def boto3_to_primitives(boto3_session: Optional[boto3.Session] = None) -> Dict[str, Optional[str]]: + """Convert Boto3 Session to Python primitives.""" + _boto3_session: boto3.Session = ensure_session(session=boto3_session) + credentials = _boto3_session.get_credentials() + return { + "aws_access_key_id": getattr(credentials, "access_key", None), + "aws_secret_access_key": getattr(credentials, "secret_key", None), + "aws_session_token": getattr(credentials, "token", None), + "region_name": _boto3_session.region_name, + "profile_name": _boto3_session.profile_name, + } + + +def boto3_from_primitives(primitives: Dict[str, Optional[str]] = None) -> boto3.Session: + """Convert Python primitives to Boto3 Session.""" + if primitives is None: + return boto3.DEFAULT_SESSION # pragma: no cover + _primitives: Dict[str, Optional[str]] = copy.deepcopy(primitives) + profile_name: Optional[str] = _primitives.get("profile_name", None) + 
_primitives["profile_name"] = None if profile_name in (None, "default") else profile_name + args: Dict[str, str] = {k: v for k, v in _primitives.items() if v is not None} + return boto3.Session(**args) + + def client(service_name: str, session: Optional[boto3.Session] = None) -> boto3.client: """Create a valid boto3.client.""" return ensure_session(session=session).client( @@ -63,6 +90,8 @@ def parse_path(path: str) -> Tuple[str, str]: >>> bucket, key = parse_path('s3://bucket/key') """ + if path.startswith("s3://") is False: + raise exceptions.InvalidArgumentValue(f"'{path}' is not a valid path. It MUST start with 's3://'") parts = path.replace("s3://", "").split("/", 1) bucket: str = parts[0] key: str = "" @@ -139,7 +168,8 @@ def chunkify(lst: List[Any], num_chunks: int = 1, max_length: Optional[int] = No def get_fs( - session: Optional[boto3.Session] = None, s3_additional_kwargs: Optional[Dict[str, str]] = None + session: Optional[Union[boto3.Session, Dict[str, Optional[str]]]] = None, + s3_additional_kwargs: Optional[Dict[str, str]] = None, ) -> s3fs.S3FileSystem: """Build a S3FileSystem from a given boto3 session.""" fs: s3fs.S3FileSystem = s3fs.S3FileSystem( diff --git a/awswrangler/catalog.py b/awswrangler/catalog.py index 057f9e4a5..7cea51199 100644 --- a/awswrangler/catalog.py +++ b/awswrangler/catalog.py @@ -6,7 +6,7 @@ import re import unicodedata from typing import Any, Dict, Iterator, List, Optional, Tuple, Union -from urllib.parse import quote_plus +from urllib.parse import quote_plus as _quote_plus import boto3 # type: ignore import pandas as pd # type: ignore @@ -17,6 +17,83 @@ _logger: logging.Logger = logging.getLogger(__name__) +def create_database( + name: str, + description: Optional[str] = None, + catalog_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> None: + """Create a database in AWS Glue Catalog. + + Parameters + ---------- + name : str + Database name. + description : str, optional + A Descrption for the Database. + catalog_id : str, optional + The ID of the Data Catalog from which to retrieve Databases. + If none is provided, the AWS account ID is used by default. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + None + None. + + Examples + -------- + >>> import awswrangler as wr + >>> wr.catalog.create_database( + ... name='awswrangler_test' + ... ) + """ + args: Dict[str, str] = {} + client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session) + args["Name"] = name + if description is not None: + args["Description"] = description + + if catalog_id is not None: + client_glue.create_database(CatalogId=catalog_id, DatabaseInput=args) # pragma: no cover + else: + client_glue.create_database(DatabaseInput=args) + + +def delete_database(name: str, catalog_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> None: + """Create a database in AWS Glue Catalog. + + Parameters + ---------- + name : str + Database name. + catalog_id : str, optional + The ID of the Data Catalog from which to retrieve Databases. + If none is provided, the AWS account ID is used by default. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + None + None. + + Examples + -------- + >>> import awswrangler as wr + >>> wr.catalog.delete_database( + ... name='awswrangler_test' + ... 
) + """ + client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session) + + if catalog_id is not None: + client_glue.delete_database(CatalogId=catalog_id, Name=name) # pragma: no cover + else: + client_glue.delete_database(Name=name) + + def delete_table_if_exists(database: str, table: str, boto3_session: Optional[boto3.Session] = None) -> bool: """Delete Glue table if exists. @@ -362,8 +439,9 @@ def get_table_types( dtypes: Dict[str, str] = {} for col in response["Table"]["StorageDescriptor"]["Columns"]: dtypes[col["Name"]] = col["Type"] - for par in response["Table"]["PartitionKeys"]: - dtypes[par["Name"]] = par["Type"] + if "PartitionKeys" in response["Table"]: + for par in response["Table"]["PartitionKeys"]: + dtypes[par["Name"]] = par["Type"] return dtypes @@ -450,6 +528,11 @@ def get_tables( ) -> Iterator[Dict[str, Any]]: """Get an iterator of tables. + Note + ---- + Please, does not filter using name_contains and name_prefix/name_suffix at the same time. + Only name_prefix and name_suffix can be combined together. + Parameters ---------- catalog_id : str, optional @@ -483,15 +566,19 @@ def get_tables( if catalog_id is not None: args["CatalogId"] = catalog_id if (name_prefix is not None) and (name_suffix is not None) and (name_contains is not None): - args["Expression"] = f"{name_prefix}.*{name_contains}.*{name_suffix}" - elif (name_prefix is not None) and (name_suffix is not None): - args["Expression"] = f"{name_prefix}.*{name_suffix}" + raise exceptions.InvalidArgumentCombination( + "Please, does not filter using name_contains and " + "name_prefix/name_suffix at the same time. Only " + "name_prefix and name_suffix can be combined together." + ) + if (name_prefix is not None) and (name_suffix is not None): + args["Expression"] = f"{name_prefix}*{name_suffix}" elif name_contains is not None: - args["Expression"] = f".*{name_contains}.*" + args["Expression"] = f"*{name_contains}*" elif name_prefix is not None: - args["Expression"] = f"{name_prefix}.*" + args["Expression"] = f"{name_prefix}*" elif name_suffix is not None: - args["Expression"] = f".*{name_suffix}" + args["Expression"] = f"*{name_suffix}" if database is not None: dbs: List[str] = [database] else: @@ -570,15 +657,21 @@ def tables( tbls = tbls[:limit] df_dict: Dict[str, List] = {"Database": [], "Table": [], "Description": [], "Columns": [], "Partitions": []} - for table in tbls: - df_dict["Database"].append(table["DatabaseName"]) - df_dict["Table"].append(table["Name"]) - if "Description" in table: - df_dict["Description"].append(table["Description"]) + for tbl in tbls: + df_dict["Database"].append(tbl["DatabaseName"]) + df_dict["Table"].append(tbl["Name"]) + if "Description" in tbl: + df_dict["Description"].append(tbl["Description"]) else: df_dict["Description"].append("") - df_dict["Columns"].append(", ".join([x["Name"] for x in table["StorageDescriptor"]["Columns"]])) - df_dict["Partitions"].append(", ".join([x["Name"] for x in table["PartitionKeys"]])) + if "Columns" in tbl["StorageDescriptor"]: + df_dict["Columns"].append(", ".join([x["Name"] for x in tbl["StorageDescriptor"]["Columns"]])) + else: + df_dict["Columns"].append("") # pragma: no cover + if "PartitionKeys" in tbl: + df_dict["Partitions"].append(", ".join([x["Name"] for x in tbl["PartitionKeys"]])) + else: + df_dict["Partitions"].append("") return pd.DataFrame(data=df_dict) @@ -694,14 +787,15 @@ def table( df_dict["Comment"].append(col["Comment"]) else: df_dict["Comment"].append("") - for col in tbl["PartitionKeys"]: - 
df_dict["Column Name"].append(col["Name"]) - df_dict["Type"].append(col["Type"]) - df_dict["Partition"].append(True) - if "Comment" in col: - df_dict["Comment"].append(col["Comment"]) - else: - df_dict["Comment"].append("") + if "PartitionKeys" in tbl: + for col in tbl["PartitionKeys"]: + df_dict["Column Name"].append(col["Name"]) + df_dict["Type"].append(col["Type"]) + df_dict["Partition"].append(True) + if "Comment" in col: + df_dict["Comment"].append(col["Comment"]) + else: + df_dict["Comment"].append("") return pd.DataFrame(data=df_dict) @@ -809,6 +903,10 @@ def sanitize_table_name(table: str) -> str: def drop_duplicated_columns(df: pd.DataFrame) -> pd.DataFrame: """Drop all repeated columns (duplicated names). + Note + ---- + This transformation will run `inplace` and will make changes in the original DataFrame. + Note ---- It is different from Panda's drop_duplicates() function which considers the column values. @@ -835,11 +933,14 @@ def drop_duplicated_columns(df: pd.DataFrame) -> pd.DataFrame: 1 2 """ - duplicated_cols = df.columns.duplicated() - duplicated_cols_names: List[str] = list(df.columns[duplicated_cols]) - if len(duplicated_cols_names) > 0: - _logger.warning("Dropping repeated columns: %s", duplicated_cols_names) - return df.loc[:, ~duplicated_cols] + duplicated = df.columns.duplicated() + if duplicated.any(): + _logger.warning("Dropping duplicated columns...") + columns = df.columns.values + columns[duplicated] = "AWSDataWranglerDuplicatedMarker" + df.columns = columns + df.drop(columns="AWSDataWranglerDuplicatedMarker", inplace=True) + return df def get_connection( @@ -909,8 +1010,8 @@ def get_engine( db_type: str = details["JDBC_CONNECTION_URL"].split(":")[1].lower() host: str = details["JDBC_CONNECTION_URL"].split(":")[2].replace("/", "") port, database = details["JDBC_CONNECTION_URL"].split(":")[3].split("/") - user: str = quote_plus(details["USERNAME"]) - password: str = quote_plus(details["PASSWORD"]) + user: str = _quote_plus(details["USERNAME"]) + password: str = _quote_plus(details["PASSWORD"]) if db_type == "postgresql": _utils.ensure_postgresql_casts() if db_type in ("redshift", "postgresql"): @@ -1608,8 +1709,9 @@ def get_columns_comments( comments: Dict[str, str] = {} for c in response["Table"]["StorageDescriptor"]["Columns"]: comments[c["Name"]] = c["Comment"] - for p in response["Table"]["PartitionKeys"]: - comments[p["Name"]] = p["Comment"] + if "PartitionKeys" in response["Table"]: + for p in response["Table"]["PartitionKeys"]: + comments[p["Name"]] = p["Comment"] return comments diff --git a/awswrangler/cloudwatch.py b/awswrangler/cloudwatch.py index c36fab70b..5ee5f722f 100644 --- a/awswrangler/cloudwatch.py +++ b/awswrangler/cloudwatch.py @@ -1,8 +1,8 @@ """CloudWatch Logs module.""" +import datetime import logging import time -from datetime import datetime from typing import Any, Dict, List, Optional import boto3 # type: ignore @@ -18,8 +18,8 @@ def start_query( query: str, log_group_names: List[str], - start_time: datetime = datetime(year=1970, month=1, day=1), - end_time: datetime = datetime.now(), + start_time: datetime.datetime = datetime.datetime(year=1970, month=1, day=1), + end_time: datetime.datetime = datetime.datetime.now(), limit: Optional[int] = None, boto3_session: Optional[boto3.Session] = None, ) -> str: @@ -120,8 +120,8 @@ def wait_query(query_id: str, boto3_session: Optional[boto3.Session] = None) -> def run_query( query: str, log_group_names: List[str], - start_time: datetime = datetime(year=1970, month=1, day=1), - end_time: 
datetime = datetime.now(), + start_time: datetime.datetime = datetime.datetime(year=1970, month=1, day=1), + end_time: datetime.datetime = datetime.datetime.now(), limit: Optional[int] = None, boto3_session: Optional[boto3.Session] = None, ) -> List[List[Dict[str, str]]]: @@ -174,8 +174,8 @@ def run_query( def read_logs( query: str, log_group_names: List[str], - start_time: datetime = datetime(year=1970, month=1, day=1), - end_time: datetime = datetime.now(), + start_time: datetime.datetime = datetime.datetime(year=1970, month=1, day=1), + end_time: datetime.datetime = datetime.datetime.now(), limit: Optional[int] = None, boto3_session: Optional[boto3.Session] = None, ) -> pd.DataFrame: diff --git a/awswrangler/db.py b/awswrangler/db.py index 61a495c12..1cebdfc4e 100644 --- a/awswrangler/db.py +++ b/awswrangler/db.py @@ -4,7 +4,7 @@ import logging import time from typing import Any, Dict, Iterator, List, Optional, Tuple, Union -from urllib.parse import quote_plus +from urllib.parse import quote_plus as _quote_plus import boto3 # type: ignore import pandas as pd # type: ignore @@ -13,6 +13,7 @@ from sqlalchemy.sql.visitors import VisitableType # type: ignore from awswrangler import _data_types, _utils, exceptions, s3 +from awswrangler.s3._list import path2list # noqa _logger: logging.Logger = logging.getLogger(__name__) @@ -34,6 +35,10 @@ def to_sql(df: pd.DataFrame, con: sqlalchemy.engine.Engine, **pandas_kwargs) -> ---- Redshift: For large DataFrames (1MM+ rows) consider the function **wr.db.copy_to_redshift()**. + Note + ---- + Redshift: `index=False` will be forced. + Parameters ---------- df : pandas.DataFrame @@ -92,6 +97,8 @@ def to_sql(df: pd.DataFrame, con: sqlalchemy.engine.Engine, **pandas_kwargs) -> ) pandas_kwargs["dtype"] = dtypes pandas_kwargs["con"] = con + if pandas_kwargs["con"].name.lower() == "redshift": # Redshift does not accept index + pandas_kwargs["index"] = False max_attempts: int = 3 for attempt in range(max_attempts): try: @@ -343,8 +350,8 @@ def get_redshift_temp_engine( res: Dict[str, Any] = client_redshift.get_cluster_credentials( DbUser=user, ClusterIdentifier=cluster_identifier, DurationSeconds=duration, AutoCreate=False ) - _user: str = quote_plus(res["DbUser"]) - password: str = quote_plus(res["DbPassword"]) + _user: str = _quote_plus(res["DbUser"]) + password: str = _quote_plus(res["DbPassword"]) cluster: Dict[str, Any] = client_redshift.describe_clusters(ClusterIdentifier=cluster_identifier)["Clusters"][0] host: str = cluster["Endpoint"]["Address"] port: str = cluster["Endpoint"]["Port"] @@ -649,7 +656,7 @@ def copy_files_to_redshift( # pylint: disable=too-many-locals,too-many-argument """ _varchar_lengths: Dict[str, int] = {} if varchar_lengths is None else varchar_lengths session: boto3.Session = _utils.ensure_session(session=boto3_session) - paths: List[str] = s3._path2list(path=path, boto3_session=session) # pylint: disable=protected-access + paths: List[str] = path2list(path=path, boto3_session=session) # pylint: disable=protected-access manifest_directory = manifest_directory if manifest_directory.endswith("/") else f"{manifest_directory}/" manifest_path: str = f"{manifest_directory}manifest.json" write_redshift_copy_manifest( diff --git a/awswrangler/quicksight/__init__.py b/awswrangler/quicksight/__init__.py new file mode 100644 index 000000000..4e81c2431 --- /dev/null +++ b/awswrangler/quicksight/__init__.py @@ -0,0 +1,44 @@ +"""Amazon QuickSight Module.""" + +from awswrangler.quicksight._cancel import cancel_ingestion # noqa +from 
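On the `wr.db.to_sql()` change above (Redshift connections now force `index=False`), a brief sketch of the call path it affects, reusing the Glue Catalog connection pattern from the README; connection, schema, and table names are placeholders:

```python
import pandas as pd

import awswrangler as wr

df = pd.DataFrame({"id": [1, 2, 3]})

# Engine resolved from a Glue Catalog connection, as in the README example.
engine = wr.catalog.get_engine("my-redshift-connection")

# For Redshift engines any index kwarg is now overridden to False internally.
wr.db.to_sql(df, engine, schema="test", name="my_table")
```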
awswrangler.quicksight._create import create_athena_data_source, create_athena_dataset, create_ingestion # noqa +from awswrangler.quicksight._delete import ( # noqa + delete_all_dashboards, + delete_all_data_sources, + delete_all_datasets, + delete_all_templates, + delete_dashboard, + delete_data_source, + delete_dataset, + delete_template, +) +from awswrangler.quicksight._describe import ( # noqa + describe_dashboard, + describe_data_source, + describe_data_source_permissions, + describe_dataset, + describe_ingestion, +) +from awswrangler.quicksight._get_list import ( # noqa + get_dashboard_id, + get_dashboard_ids, + get_data_source_arn, + get_data_source_arns, + get_data_source_id, + get_data_source_ids, + get_dataset_id, + get_dataset_ids, + get_template_id, + get_template_ids, + list_dashboards, + list_data_sources, + list_datasets, + list_group_memberships, + list_groups, + list_iam_policy_assignments, + list_iam_policy_assignments_for_user, + list_ingestions, + list_templates, + list_user_groups, + list_users, +) diff --git a/awswrangler/quicksight/_cancel.py b/awswrangler/quicksight/_cancel.py new file mode 100644 index 000000000..cf27cdf45 --- /dev/null +++ b/awswrangler/quicksight/_cancel.py @@ -0,0 +1,58 @@ +"""Amazon QuickSight Cancel Module.""" + +import logging +from typing import Optional + +import boto3 # type: ignore + +from awswrangler import _utils, exceptions +from awswrangler.quicksight._get_list import get_dataset_id + +_logger: logging.Logger = logging.getLogger(__name__) + + +def cancel_ingestion( + ingestion_id: str = None, + dataset_name: Optional[str] = None, + dataset_id: Optional[str] = None, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> None: + """Cancel an ongoing ingestion of data into SPICE. + + Note + ---- + You must pass a not None value for ``dataset_name`` or ``dataset_id`` argument. + + Parameters + ---------- + ingestion_id : str + Ingestion ID. + dataset_name : str, optional + Dataset name. + dataset_id : str, optional + Dataset ID. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + None + None. 
+ + Examples + -------- + >>> import awswrangler as wr + >>> wr.quicksight.cancel_ingestion(ingestion_id="...", dataset_name="...") + """ + if (dataset_name is None) and (dataset_id is None): + raise exceptions.InvalidArgument("You must pass a not None name or dataset_id argument.") + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if account_id is None: + account_id = _utils.get_account_id(boto3_session=session) + if (dataset_id is None) and (dataset_name is not None): + dataset_id = get_dataset_id(name=dataset_name, account_id=account_id, boto3_session=session) + client: boto3.client = _utils.client(service_name="quicksight", session=session) + client.cancel_ingestion(IngestionId=ingestion_id, AwsAccountId=account_id, DataSetId=dataset_id) diff --git a/awswrangler/quicksight/_create.py b/awswrangler/quicksight/_create.py new file mode 100644 index 000000000..34e2c3103 --- /dev/null +++ b/awswrangler/quicksight/_create.py @@ -0,0 +1,393 @@ +"""Amazon QuickSight Create Module.""" + +import logging +import uuid +from typing import Any, Dict, List, Optional, Union + +import boto3 # type: ignore + +from awswrangler import _utils, exceptions +from awswrangler.quicksight._get_list import get_data_source_arn, get_dataset_id +from awswrangler.quicksight._utils import extract_athena_query_columns, extract_athena_table_columns + +_logger: logging.Logger = logging.getLogger(__name__) + +_ALLOWED_ACTIONS: Dict[str, Dict[str, List[str]]] = { + "data_source": { + "allowed_to_use": [ + "quicksight:DescribeDataSource", + "quicksight:DescribeDataSourcePermissions", + "quicksight:PassDataSource", + ], + "allowed_to_manage": [ + "quicksight:DescribeDataSource", + "quicksight:DescribeDataSourcePermissions", + "quicksight:PassDataSource", + "quicksight:UpdateDataSource", + "quicksight:DeleteDataSource", + "quicksight:UpdateDataSourcePermissions", + ], + }, + "dataset": { + "allowed_to_use": [ + "quicksight:DescribeDataSet", + "quicksight:DescribeDataSetPermissions", + "quicksight:PassDataSet", + "quicksight:DescribeIngestion", + "quicksight:ListIngestions", + ], + "allowed_to_manage": [ + "quicksight:DescribeDataSet", + "quicksight:DescribeDataSetPermissions", + "quicksight:PassDataSet", + "quicksight:DescribeIngestion", + "quicksight:ListIngestions", + "quicksight:UpdateDataSet", + "quicksight:DeleteDataSet", + "quicksight:CreateIngestion", + "quicksight:CancelIngestion", + "quicksight:UpdateDataSetPermissions", + ], + }, +} + + +def _generate_principal(user_name: str, account_id: str, region: str) -> str: + user_name = user_name if "/" in user_name else f"default/{user_name}" + return f"arn:aws:quicksight:{region}:{account_id}:user/{user_name}" + + +def _generate_permissions( + resource: str, + account_id: str, + boto3_session: boto3.Session, + allowed_to_use: Optional[List[str]] = None, + allowed_to_manage: Optional[List[str]] = None, +) -> List[Dict[str, Union[str, List[str]]]]: + permissions: List[Dict[str, Union[str, List[str]]]] = [] + if (allowed_to_use is None) and (allowed_to_manage is None): + return permissions + + # Forcing same principal not be in both lists at the same time. 
+ if (allowed_to_use is not None) and (allowed_to_manage is not None): + allowed_to_use = list(set(allowed_to_use) - set(allowed_to_manage)) + + region: str = _utils.get_region_from_session(boto3_session=boto3_session) + if allowed_to_use is not None: + permissions += [ + { + "Principal": _generate_principal(user_name=user_name, account_id=account_id, region=region), + "Actions": _ALLOWED_ACTIONS[resource]["allowed_to_use"], + } + for user_name in allowed_to_use + ] + if allowed_to_manage is not None: + permissions += [ + { + "Principal": _generate_principal(user_name=user_name, account_id=account_id, region=region), + "Actions": _ALLOWED_ACTIONS[resource]["allowed_to_manage"], + } + for user_name in allowed_to_manage + ] + return permissions + + +def _generate_transformations( + rename_columns: Optional[Dict[str, str]], cast_columns_types: Optional[Dict[str, str]] = None +) -> List[Dict[str, Dict[str, Any]]]: + trans: List[Dict[str, Dict[str, Any]]] = [] + if rename_columns is not None: + for k, v in rename_columns.items(): + trans.append({"RenameColumnOperation": {"ColumnName": k, "NewColumnName": v}}) + if cast_columns_types is not None: + for k, v in cast_columns_types.items(): + trans.append({"CastColumnTypeOperation": {"ColumnName": k, "NewColumnType": v.upper()}}) + return trans + + +def create_athena_data_source( + name: str, + workgroup: str = "primary", + allowed_to_use: Optional[List[str]] = None, + allowed_to_manage: Optional[List[str]] = None, + tags: Optional[Dict[str, str]] = None, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> None: + """Create a QuickSight data source pointing to an Athena/Workgroup. + + Note + ---- + You will not be able to see the data source in the console + if you do not pass your user to one of the ``allowed_*`` arguments. + + Parameters + ---------- + name : str + Data source name. + workgroup : str + Athena workgroup. + tags : Dict[str, str], optional + Key/Value collection to put on the data source. + e.g. {"foo": "boo", "bar": "xoo"} + allowed_to_use : optional + List of principals that will be allowed to see and use the data source. + e.g. ["John"] + allowed_to_manage : optional + List of principals that will be allowed to see, use, update and delete the data source. + e.g. ["Mary"] + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + None + None. + + Examples + -------- + >>> import awswrangler as wr + >>> wr.quicksight.create_athena_data_source( + ... name="...", + ... allowed_to_manage=["john"] + ...
) + """ + session: boto3.Session = _utils.ensure_session(session=boto3_session) + client: boto3.client = _utils.client(service_name="quicksight", session=session) + if account_id is None: + account_id = _utils.get_account_id(boto3_session=session) + args: Dict[str, Any] = { + "AwsAccountId": account_id, + "DataSourceId": name, + "Name": name, + "Type": "ATHENA", + "DataSourceParameters": {"AthenaParameters": {"WorkGroup": workgroup}}, + "SslProperties": {"DisableSsl": True}, + } + permissions: List[Dict[str, Union[str, List[str]]]] = _generate_permissions( + resource="data_source", + account_id=account_id, + boto3_session=session, + allowed_to_use=allowed_to_use, + allowed_to_manage=allowed_to_manage, + ) + if permissions: + args["Permissions"] = permissions + if tags is not None: + _tags: List[Dict[str, str]] = [{"Key": k, "Value": v} for k, v in tags.items()] + args["Tags"] = _tags + client.create_data_source(**args) + + +def create_athena_dataset( + name: str, + database: Optional[str] = None, + table: Optional[str] = None, + sql: Optional[str] = None, + sql_name: str = "CustomSQL", + data_source_name: Optional[str] = None, + data_source_arn: Optional[str] = None, + import_mode: str = "DIRECT_QUERY", + allowed_to_use: Optional[List[str]] = None, + allowed_to_manage: Optional[List[str]] = None, + logical_table_alias: str = "LogicalTable", + rename_columns: Optional[Dict[str, str]] = None, + cast_columns_types: Optional[Dict[str, str]] = None, + tags: Optional[Dict[str, str]] = None, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> None: + """Create a QuickSight dataset. + + Note + ---- + You will not be able to see the the dataset in the console + if you not pass your user to one of the ``allowed_*`` arguments. + + Note + ---- + You must pass ``database``/``table`` OR ``sql`` argument. + + Note + ---- + You must pass ``data_source_name`` OR ``data_source_arn`` argument. + + Parameters + ---------- + name : str + Dataset name. + database : str + Athena's database name. + table : str + Athena's table name. + sql : str + Use a SQL query to define your table. + sql_name : str + Query name. + data_source_name : str, optional + QuickSight data source name. + data_source_arn : str, optional + QuickSight data source ARN. + import_mode : str + Indicates whether you want to import the data into SPICE. + 'SPICE'|'DIRECT_QUERY' + tags : Dict[str, str], optional + Key/Value collection to put on the Cluster. + e.g. {"foo": "boo", "bar": "xoo"}) + allowed_to_use : optional + List of principals that will be allowed to see and use the data source. + e.g. ["john", "Mary"] + allowed_to_manage : optional + List of principals that will be allowed to see, use, update and delete the data source. + e.g. ["Mary"] + logical_table_alias : str + A display name for the logical table. + rename_columns : Dict[str, str], optional + Dictionary to map column renames. e.g. {"old_name": "new_name", "old_name2": "new_name2"} + cast_columns_types : Dict[str, str], optional + Dictionary to map column casts. e.g. {"col_name": "STRING", "col_name2": "DECIMAL"} + Valid types: 'STRING'|'INTEGER'|'DECIMAL'|'DATETIME' + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + None + None. + + Examples + -------- + >>> import awswrangler as wr + >>> wr.quicksight.create_athena_dataset( + ... 
name="...", + ... database="..." + ... table="..." + ... data_source_name="..." + ... allowed_to_manage=["Mary"] + ... ) + """ + if (data_source_name is None) and (data_source_arn is None): + raise exceptions.InvalidArgument("You must pass a not None data_source_name or data_source_arn argument.") + if ((database is None) and (table is None)) and (sql is None): + raise exceptions.InvalidArgument("You must pass database/table OR sql argument.") + if (database is not None) and (sql is not None): + raise exceptions.InvalidArgument( + "If you provide sql argument, please include the database name inside the sql statement." + "Do NOT pass in with database argument." + ) + session: boto3.Session = _utils.ensure_session(session=boto3_session) + client: boto3.client = _utils.client(service_name="quicksight", session=session) + if account_id is None: + account_id = _utils.get_account_id(boto3_session=session) + if (data_source_arn is None) and (data_source_name is not None): + data_source_arn = get_data_source_arn(name=data_source_name, account_id=account_id, boto3_session=session) + if sql is not None: + physical_table: Dict[str, Dict[str, Any]] = { + "CustomSql": { + "DataSourceArn": data_source_arn, + "Name": sql_name, + "SqlQuery": sql, + "Columns": extract_athena_query_columns( + sql=sql, + data_source_arn=data_source_arn, # type: ignore + account_id=account_id, + boto3_session=session, + ), + } + } + else: + physical_table = { + "RelationalTable": { + "DataSourceArn": data_source_arn, + "Schema": database, + "Name": table, + "InputColumns": extract_athena_table_columns( + database=database, # type: ignore + table=table, # type: ignore + boto3_session=session, + ), + } + } + table_uuid: str = uuid.uuid4().hex + args: Dict[str, Any] = { + "AwsAccountId": account_id, + "DataSetId": name, + "Name": name, + "ImportMode": import_mode, + "PhysicalTableMap": {table_uuid: physical_table}, + "LogicalTableMap": {table_uuid: {"Alias": logical_table_alias, "Source": {"PhysicalTableId": table_uuid}}}, + } + trans: List[Dict[str, Dict[str, Any]]] = _generate_transformations( + rename_columns=rename_columns, cast_columns_types=cast_columns_types + ) + if trans: + args["LogicalTableMap"][table_uuid]["DataTransforms"] = trans + permissions: List[Dict[str, Union[str, List[str]]]] = _generate_permissions( + resource="dataset", + account_id=account_id, + boto3_session=session, + allowed_to_use=allowed_to_use, + allowed_to_manage=allowed_to_manage, + ) + if permissions: + args["Permissions"] = permissions + if tags is not None: + _tags: List[Dict[str, str]] = [{"Key": k, "Value": v} for k, v in tags.items()] + args["Tags"] = _tags + client.create_data_set(**args) + + +def create_ingestion( + dataset_name: Optional[str] = None, + dataset_id: Optional[str] = None, + ingestion_id: Optional[str] = None, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> str: + """Create and starts a new SPICE ingestion on a dataset. + + Note + ---- + You must pass ``dataset_name`` OR ``dataset_id`` argument. + + Parameters + ---------- + dataset_name : str, optional + Dataset name. + dataset_id : str, optional + Dataset ID. + ingestion_id : str, optional + Ingestion ID. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
+ + Returns + ------- + str + Ingestion ID + + Examples + -------- + >>> import awswrangler as wr + >>> status = wr.quicksight.create_ingestion("my_dataset") + """ + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if account_id is None: + account_id = _utils.get_account_id(boto3_session=session) + if (dataset_name is None) and (dataset_id is None): + raise exceptions.InvalidArgument("You must pass a not None dataset_name or dataset_id argument.") + if (dataset_id is None) and (dataset_name is not None): + dataset_id = get_dataset_id(name=dataset_name, account_id=account_id, boto3_session=session) + if ingestion_id is None: + ingestion_id = uuid.uuid4().hex + client: boto3.client = _utils.client(service_name="quicksight", session=session) + response: Dict[str, Any] = client.create_ingestion( + DataSetId=dataset_id, IngestionId=ingestion_id, AwsAccountId=account_id + ) + return response["IngestionId"] diff --git a/awswrangler/quicksight/_delete.py b/awswrangler/quicksight/_delete.py new file mode 100644 index 000000000..cc45e9108 --- /dev/null +++ b/awswrangler/quicksight/_delete.py @@ -0,0 +1,339 @@ +"""Amazon QuickSight Delete Module.""" + +import logging +from typing import Any, Callable, Dict, Optional + +import boto3 # type: ignore + +from awswrangler import _utils, exceptions +from awswrangler.quicksight._get_list import ( + get_dashboard_id, + get_data_source_id, + get_dataset_id, + get_template_id, + list_dashboards, + list_data_sources, + list_datasets, + list_templates, +) + +_logger: logging.Logger = logging.getLogger(__name__) + + +def _delete( + func_name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None, **kwargs +) -> None: + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if account_id is None: + account_id = _utils.get_account_id(boto3_session=session) + client: boto3.client = _utils.client(service_name="quicksight", session=session) + func: Callable = getattr(client, func_name) + func(AwsAccountId=account_id, **kwargs) + + +def delete_dashboard( + name: Optional[str] = None, + dashboard_id: Optional[str] = None, + version_number: Optional[int] = None, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> None: + """Delete a dashboard. + + Note + ---- + You must pass a not None ``name`` or ``dashboard_id`` argument. + + Parameters + ---------- + name : str, optional + Dashboard name. + dashboard_id : str, optional + The ID for the dashboard. + version_number : int, optional + The version number of the dashboard. If the version number property is provided, + only the specified version of the dashboard is deleted. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + None + None. 
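A short sketch of the targeted delete helpers introduced here: `delete_dashboard()` and `delete_template()` both take an optional `version_number`, so a single version can be removed while the rest is kept (names and version numbers are placeholders):

```python
import awswrangler as wr

# Remove only version 3 of a dashboard; omit version_number to delete it entirely.
wr.quicksight.delete_dashboard(name="sales-dashboard", version_number=3)

# Same pattern for templates.
wr.quicksight.delete_template(name="sales-template", version_number=1)
```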
+ + Examples + -------- + >>> import awswrangler as wr + >>> wr.quicksight.delete_dashboard(name="...") + """ + if (name is None) and (dashboard_id is None): + raise exceptions.InvalidArgument("You must pass a not None name or dashboard_id argument.") + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if (dashboard_id is None) and (name is not None): + dashboard_id = get_dashboard_id(name=name, account_id=account_id, boto3_session=session) + args: Dict[str, Any] = { + "func_name": "delete_dashboard", + "account_id": account_id, + "boto3_session": session, + "DashboardId": dashboard_id, + } + if version_number is not None: + args["VersionNumber"] = version_number + _delete(**args) + + +def delete_dataset( + name: Optional[str] = None, + dataset_id: Optional[str] = None, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> None: + """Delete a dataset. + + Note + ---- + You must pass a not None ``name`` or ``dataset_id`` argument. + + Parameters + ---------- + name : str, optional + Dashboard name. + dataset_id : str, optional + The ID for the dataset. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + None + None. + + Examples + -------- + >>> import awswrangler as wr + >>> wr.quicksight.delete_dataset(name="...") + """ + if (name is None) and (dataset_id is None): + raise exceptions.InvalidArgument("You must pass a not None name or dataset_id argument.") + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if (dataset_id is None) and (name is not None): + dataset_id = get_dataset_id(name=name, account_id=account_id, boto3_session=session) + args: Dict[str, Any] = { + "func_name": "delete_data_set", + "account_id": account_id, + "boto3_session": session, + "DataSetId": dataset_id, + } + _delete(**args) + + +def delete_data_source( + name: Optional[str] = None, + data_source_id: Optional[str] = None, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> None: + """Delete a data source. + + Note + ---- + You must pass a not None ``name`` or ``data_source_id`` argument. + + Parameters + ---------- + name : str, optional + Dashboard name. + data_source_id : str, optional + The ID for the data source. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + None + None. 
+ + Examples + -------- + >>> import awswrangler as wr + >>> wr.quicksight.delete_data_source(name="...") + """ + if (name is None) and (data_source_id is None): + raise exceptions.InvalidArgument("You must pass a not None name or data_source_id argument.") + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if (data_source_id is None) and (name is not None): + data_source_id = get_data_source_id(name=name, account_id=account_id, boto3_session=session) + args: Dict[str, Any] = { + "func_name": "delete_data_source", + "account_id": account_id, + "boto3_session": session, + "DataSourceId": data_source_id, + } + _delete(**args) + + +def delete_template( + name: Optional[str] = None, + template_id: Optional[str] = None, + version_number: Optional[int] = None, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> None: + """Delete a tamplate. + + Note + ---- + You must pass a not None ``name`` or ``template_id`` argument. + + Parameters + ---------- + name : str, optional + Dashboard name. + template_id : str, optional + The ID for the dashboard. + version_number : int, optional + Specifies the version of the template that you want to delete. + If you don't provide a version number, it deletes all versions of the template. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + None + None. + + Examples + -------- + >>> import awswrangler as wr + >>> wr.quicksight.delete_template(name="...") + """ + if (name is None) and (template_id is None): + raise exceptions.InvalidArgument("You must pass a not None name or template_id argument.") + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if (template_id is None) and (name is not None): + template_id = get_template_id(name=name, account_id=account_id, boto3_session=session) + args: Dict[str, Any] = { + "func_name": "delete_template", + "account_id": account_id, + "boto3_session": session, + "TemplateId": template_id, + } + if version_number is not None: + args["VersionNumber"] = version_number + _delete(**args) + + +def delete_all_dashboards(account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> None: + """Delete all dashboards. + + Parameters + ---------- + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + None + None. + + Examples + -------- + >>> import awswrangler as wr + >>> wr.quicksight.delete_all_dashboards() + """ + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if account_id is None: + account_id = _utils.get_account_id(boto3_session=session) + for dashboard in list_dashboards(account_id=account_id, boto3_session=session): + delete_dashboard(dashboard_id=dashboard["DashboardId"], account_id=account_id, boto3_session=session) + + +def delete_all_datasets(account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> None: + """Delete all datasets. + + Parameters + ---------- + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. 
The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + None + None. + + Examples + -------- + >>> import awswrangler as wr + >>> wr.quicksight.delete_all_datasets() + """ + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if account_id is None: + account_id = _utils.get_account_id(boto3_session=session) + for dataset in list_datasets(account_id=account_id, boto3_session=session): + delete_dataset(dataset_id=dataset["DataSetId"], account_id=account_id, boto3_session=session) + + +def delete_all_data_sources(account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> None: + """Delete all data sources. + + Parameters + ---------- + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + None + None. + + Examples + -------- + >>> import awswrangler as wr + >>> wr.quicksight.delete_all_data_sources() + """ + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if account_id is None: + account_id = _utils.get_account_id(boto3_session=session) + for data_source in list_data_sources(account_id=account_id, boto3_session=session): + delete_data_source(data_source_id=data_source["DataSourceId"], account_id=account_id, boto3_session=session) + + +def delete_all_templates(account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> None: + """Delete all templates. + + Parameters + ---------- + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + None + None. + + Examples + -------- + >>> import awswrangler as wr + >>> wr.quicksight.delete_all_templates() + """ + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if account_id is None: + account_id = _utils.get_account_id(boto3_session=session) + for template in list_templates(account_id=account_id, boto3_session=session): + delete_template(template_id=template["TemplateId"], account_id=account_id, boto3_session=session) diff --git a/awswrangler/quicksight/_describe.py b/awswrangler/quicksight/_describe.py new file mode 100644 index 000000000..d46b2bfb6 --- /dev/null +++ b/awswrangler/quicksight/_describe.py @@ -0,0 +1,236 @@ +"""Amazon QuickSight Describe Module.""" + +import logging +from typing import Any, Dict, Optional + +import boto3 # type: ignore + +from awswrangler import _utils, exceptions +from awswrangler.quicksight._get_list import get_dashboard_id, get_data_source_id, get_dataset_id + +_logger: logging.Logger = logging.getLogger(__name__) + + +def describe_dashboard( + name: Optional[str] = None, + dashboard_id: Optional[str] = None, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> Dict[str, Any]: + """Describe a QuickSight dashboard by name or ID. + + Note + ---- + You must pass a not None ``name`` or ``dashboard_id`` argument. + + Parameters + ---------- + name : str, optional + Dashboard name. + dashboard_id : str, optional + Dashboard ID. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
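Taken together, the `delete_all_*` helpers allow a full QuickSight teardown for the account; a brief sketch, shown only to illustrate a sensible call order (destructive):

```python
import awswrangler as wr

# WARNING: removes every QuickSight resource visible to the session.
wr.quicksight.delete_all_dashboards()
wr.quicksight.delete_all_datasets()
wr.quicksight.delete_all_data_sources()
wr.quicksight.delete_all_templates()
```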
+ + Returns + ------- + Dict[str, Any] + Dashboad Description. + + Examples + -------- + >>> import awswrangler as wr + >>> description = wr.quicksight.describe_dashboard(name="my-dashboard") + """ + if (name is None) and (dashboard_id is None): + raise exceptions.InvalidArgument("You must pass a not None name or dashboard_id argument.") + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if account_id is None: + account_id = _utils.get_account_id(boto3_session=session) + if (dashboard_id is None) and (name is not None): + dashboard_id = get_dashboard_id(name=name, account_id=account_id, boto3_session=session) + client: boto3.client = _utils.client(service_name="quicksight", session=session) + return client.describe_dashboard(AwsAccountId=account_id, DashboardId=dashboard_id)["Dashboard"] + + +def describe_data_source( + name: Optional[str] = None, + data_source_id: Optional[str] = None, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> Dict[str, Any]: + """Describe a QuickSight data source by name or ID. + + Note + ---- + You must pass a not None ``name`` or ``data_source_id`` argument. + + Parameters + ---------- + name : str, optional + Data source name. + data_source_id : str, optional + Data source ID. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + Dict[str, Any] + Data source Description. + + Examples + -------- + >>> import awswrangler as wr + >>> description = wr.quicksight.describe_data_source("...") + """ + if (name is None) and (data_source_id is None): + raise exceptions.InvalidArgument("You must pass a not None name or data_source_id argument.") + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if account_id is None: + account_id = _utils.get_account_id(boto3_session=session) + if (data_source_id is None) and (name is not None): + data_source_id = get_data_source_id(name=name, account_id=account_id, boto3_session=session) + client: boto3.client = _utils.client(service_name="quicksight", session=session) + return client.describe_data_source(AwsAccountId=account_id, DataSourceId=data_source_id)["DataSource"] + + +def describe_data_source_permissions( + name: Optional[str] = None, + data_source_id: Optional[str] = None, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> Dict[str, Any]: + """Describe a QuickSight data source permissions by name or ID. + + Note + ---- + You must pass a not None ``name`` or ``data_source_id`` argument. + + Parameters + ---------- + name : str, optional + Data source name. + data_source_id : str, optional + Data source ID. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + Dict[str, Any] + Data source Permissions Description. 
+ + Examples + -------- + >>> import awswrangler as wr + >>> description = wr.quicksight.describe_data_source_permissions("my-data-source") + """ + if (name is None) and (data_source_id is None): + raise exceptions.InvalidArgument("You must pass a not None name or data_source_id argument.") + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if account_id is None: + account_id = _utils.get_account_id(boto3_session=session) + if (data_source_id is None) and (name is not None): + data_source_id = get_data_source_id(name=name, account_id=account_id, boto3_session=session) + client: boto3.client = _utils.client(service_name="quicksight", session=session) + return client.describe_data_source_permissions(AwsAccountId=account_id, DataSourceId=data_source_id)["Permissions"] + + +def describe_dataset( + name: Optional[str] = None, + dataset_id: Optional[str] = None, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> Dict[str, Any]: + """Describe a QuickSight dataset by name or ID. + + Note + ---- + You must pass a not None ``name`` or ``dataset_id`` argument. + + Parameters + ---------- + name : str, optional + Dataset name. + dataset_id : str, optional + Dataset ID. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + Dict[str, Any] + Dataset Description. + + Examples + -------- + >>> import awswrangler as wr + >>> description = wr.quicksight.describe_dataset("my-dataset") + """ + if (name is None) and (dataset_id is None): + raise exceptions.InvalidArgument("You must pass a not None name or dataset_id argument.") + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if account_id is None: + account_id = _utils.get_account_id(boto3_session=session) + if (dataset_id is None) and (name is not None): + dataset_id = get_dataset_id(name=name, account_id=account_id, boto3_session=session) + client: boto3.client = _utils.client(service_name="quicksight", session=session) + return client.describe_data_set(AwsAccountId=account_id, DataSetId=dataset_id)["DataSet"] + + +def describe_ingestion( + ingestion_id: str = None, + dataset_name: Optional[str] = None, + dataset_id: Optional[str] = None, + account_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> Dict[str, Any]: + """Describe a QuickSight ingestion by ID. + + Note + ---- + You must pass a not None value for ``dataset_name`` or ``dataset_id`` argument. + + Parameters + ---------- + ingestion_id : str + Ingestion ID. + dataset_name : str, optional + Dataset name. + dataset_id : str, optional + Dataset ID. + account_id : str, optional + If None, the account ID will be inferred from your boto3 session. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + Dict[str, Any] + Ingestion Description. 
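Since `describe_ingestion()` returns the raw `Ingestion` payload, a common pattern is to poll it after `create_ingestion()`. The `IngestionStatus` field and its values below come from the QuickSight API, not from this diff, so treat this as an assumption-laden sketch:

```python
import time

import awswrangler as wr

ingestion_id = wr.quicksight.create_ingestion(dataset_name="sales-dataset")
while True:
    ingestion = wr.quicksight.describe_ingestion(ingestion_id=ingestion_id, dataset_name="sales-dataset")
    if ingestion["IngestionStatus"] not in ("INITIALIZED", "QUEUED", "RUNNING"):  # assumed status values
        break
    time.sleep(5)
```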
+def describe_ingestion(
+    ingestion_id: str = None,
+    dataset_name: Optional[str] = None,
+    dataset_id: Optional[str] = None,
+    account_id: Optional[str] = None,
+    boto3_session: Optional[boto3.Session] = None,
+) -> Dict[str, Any]:
+    """Describe a QuickSight ingestion by ID.
+
+    Note
+    ----
+    You must pass a not None value for ``dataset_name`` or ``dataset_id`` argument.
+
+    Parameters
+    ----------
+    ingestion_id : str
+        Ingestion ID.
+    dataset_name : str, optional
+        Dataset name.
+    dataset_id : str, optional
+        Dataset ID.
+    account_id : str, optional
+        If None, the account ID will be inferred from your boto3 session.
+    boto3_session : boto3.Session(), optional
+        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
+
+    Returns
+    -------
+    Dict[str, Any]
+        Ingestion Description.
+
+    Examples
+    --------
+    >>> import awswrangler as wr
+    >>> description = wr.quicksight.describe_ingestion(ingestion_id="...", dataset_name="...")
+    """
+    if (dataset_name is None) and (dataset_id is None):
+        raise exceptions.InvalidArgument("You must pass a not None dataset_name or dataset_id argument.")
+    session: boto3.Session = _utils.ensure_session(session=boto3_session)
+    if account_id is None:
+        account_id = _utils.get_account_id(boto3_session=session)
+    if (dataset_id is None) and (dataset_name is not None):
+        dataset_id = get_dataset_id(name=dataset_name, account_id=account_id, boto3_session=session)
+    client: boto3.client = _utils.client(service_name="quicksight", session=session)
+    return client.describe_ingestion(IngestionId=ingestion_id, AwsAccountId=account_id, DataSetId=dataset_id)[
+        "Ingestion"
+    ]
diff --git a/awswrangler/quicksight/_get_list.py b/awswrangler/quicksight/_get_list.py
new file mode 100644
index 000000000..98035e26e
--- /dev/null
+++ b/awswrangler/quicksight/_get_list.py
@@ -0,0 +1,778 @@
+"""
+Amazon QuickSight List and Get Module.
+
+List and Get MUST be together to avoid circular dependency.
+"""
+
+import logging
+from typing import Any, Callable, Dict, List, Optional
+
+import boto3  # type: ignore
+
+from awswrangler import _utils, exceptions
+
+_logger: logging.Logger = logging.getLogger(__name__)
+
+
+def _list(
+    func_name: str,
+    attr_name: str,
+    account_id: Optional[str] = None,
+    boto3_session: Optional[boto3.Session] = None,
+    **kwargs,
+) -> List[Dict[str, Any]]:
+    session: boto3.Session = _utils.ensure_session(session=boto3_session)
+    if account_id is None:
+        account_id = _utils.get_account_id(boto3_session=session)
+    client: boto3.client = _utils.client(service_name="quicksight", session=session)
+    func: Callable = getattr(client, func_name)
+    response = func(AwsAccountId=account_id, **kwargs)
+    next_token: str = response.get("NextToken", None)
+    result: List[Dict[str, Any]] = response[attr_name]
+    while next_token is not None:
+        response = func(AwsAccountId=account_id, NextToken=next_token, **kwargs)
+        next_token = response.get("NextToken", None)
+        result += response[attr_name]
+    return result
+
+
+def list_dashboards(
+    account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None
+) -> List[Dict[str, Any]]:
+    """List dashboards in an AWS account.
+
+    Parameters
+    ----------
+    account_id : str, optional
+        If None, the account ID will be inferred from your boto3 session.
+    boto3_session : boto3.Session(), optional
+        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
+
+    Returns
+    -------
+    List[Dict[str, Any]]
+        Dashboards.
+
+    Examples
+    --------
+    >>> import awswrangler as wr
+    >>> dashboards = wr.quicksight.list_dashboards()
+    """
+    return _list(
+        func_name="list_dashboards",
+        attr_name="DashboardSummaryList",
+        account_id=account_id,
+        boto3_session=boto3_session,
+    )
+
+
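The private `_list` helper above drains QuickSight's `NextToken` pagination and concatenates the per-page lists, so every public `list_*` wrapper returns the complete result set in one call. A small sketch of working with those summaries (field names follow the QuickSight `ListDashboards` response):

```py
import awswrangler as wr

# Build a name -> ID lookup from the full (paginated) dashboard listing.
dashboards = wr.quicksight.list_dashboards()
by_name = {d["Name"]: d["DashboardId"] for d in dashboards}

# QuickSight allows duplicated names, so the dict keeps only the last ID seen;
# get_dashboard_ids() (later in this module) returns all IDs for a given name.
print(f"{len(dashboards)} dashboards, {len(by_name)} distinct names")
```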
+def list_datasets(
+    account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None
+) -> List[Dict[str, Any]]:
+    """List all QuickSight dataset summaries.
+
+    Parameters
+    ----------
+    account_id : str, optional
+        If None, the account ID will be inferred from your boto3 session.
+    boto3_session : boto3.Session(), optional
+        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
+
+    Returns
+    -------
+    List[Dict[str, Any]]
+        Dataset summaries.
+
+    Examples
+    --------
+    >>> import awswrangler as wr
+    >>> datasets = wr.quicksight.list_datasets()
+    """
+    return _list(
+        func_name="list_data_sets", attr_name="DataSetSummaries", account_id=account_id, boto3_session=boto3_session
+    )
+
+
+def list_data_sources(
+    account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None
+) -> List[Dict[str, Any]]:
+    """List all QuickSight data source summaries.
+
+    Parameters
+    ----------
+    account_id : str, optional
+        If None, the account ID will be inferred from your boto3 session.
+    boto3_session : boto3.Session(), optional
+        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
+
+    Returns
+    -------
+    List[Dict[str, Any]]
+        Data source summaries.
+
+    Examples
+    --------
+    >>> import awswrangler as wr
+    >>> sources = wr.quicksight.list_data_sources()
+    """
+    return _list(
+        func_name="list_data_sources", attr_name="DataSources", account_id=account_id, boto3_session=boto3_session
+    )
+
+
+def list_templates(
+    account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None
+) -> List[Dict[str, Any]]:
+    """List all QuickSight templates.
+
+    Parameters
+    ----------
+    account_id : str, optional
+        If None, the account ID will be inferred from your boto3 session.
+    boto3_session : boto3.Session(), optional
+        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
+
+    Returns
+    -------
+    List[Dict[str, Any]]
+        Template summaries.
+
+    Examples
+    --------
+    >>> import awswrangler as wr
+    >>> templates = wr.quicksight.list_templates()
+    """
+    return _list(
+        func_name="list_templates", attr_name="TemplateSummaryList", account_id=account_id, boto3_session=boto3_session
+    )
+
+
+def list_group_memberships(
+    group_name: str,
+    namespace: str = "default",
+    account_id: Optional[str] = None,
+    boto3_session: Optional[boto3.Session] = None,
+) -> List[Dict[str, Any]]:
+    """List all QuickSight Group memberships.
+
+    Parameters
+    ----------
+    group_name : str
+        The name of the group that you want to see a membership list of.
+    namespace : str
+        The namespace. Currently, you should set this to default.
+    account_id : str, optional
+        If None, the account ID will be inferred from your boto3 session.
+    boto3_session : boto3.Session(), optional
+        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
+
+    Returns
+    -------
+    List[Dict[str, Any]]
+        Group memberships.
+
+    Examples
+    --------
+    >>> import awswrangler as wr
+    >>> memberships = wr.quicksight.list_group_memberships(group_name="...")
+    """
+    return _list(
+        func_name="list_group_memberships",
+        attr_name="GroupMemberList",
+        account_id=account_id,
+        boto3_session=boto3_session,
+        GroupName=group_name,
+        Namespace=namespace,
+    )
+
+
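For example, to inspect who belongs to a QuickSight group (the group name is a placeholder; `MemberName`/`Arn` follow the `ListGroupMemberships` response shape):

```py
import awswrangler as wr

# Members of a single group in the default namespace.
members = wr.quicksight.list_group_memberships(group_name="data-analysts")
for member in members:
    print(member["MemberName"], member["Arn"])
```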
+def list_groups(
+    namespace: str = "default", account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None
+) -> List[Dict[str, Any]]:
+    """List all QuickSight Groups.
+
+    Parameters
+    ----------
+    namespace : str
+        The namespace. Currently, you should set this to default.
+    account_id : str, optional
+        If None, the account ID will be inferred from your boto3 session.
+    boto3_session : boto3.Session(), optional
+        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
+
+    Returns
+    -------
+    List[Dict[str, Any]]
+        Groups.
+
+    Examples
+    --------
+    >>> import awswrangler as wr
+    >>> groups = wr.quicksight.list_groups()
+    """
+    return _list(
+        func_name="list_groups",
+        attr_name="GroupList",
+        account_id=account_id,
+        boto3_session=boto3_session,
+        Namespace=namespace,
+    )
+
+
+def list_iam_policy_assignments(
+    status: Optional[str] = None,
+    namespace: str = "default",
+    account_id: Optional[str] = None,
+    boto3_session: Optional[boto3.Session] = None,
+) -> List[Dict[str, Any]]:
+    """List IAM policy assignments in the current Amazon QuickSight account.
+
+    Parameters
+    ----------
+    status : str, optional
+        The status of the assignments.
+        'ENABLED'|'DRAFT'|'DISABLED'
+    namespace : str
+        The namespace. Currently, you should set this to default.
+    account_id : str, optional
+        If None, the account ID will be inferred from your boto3 session.
+    boto3_session : boto3.Session(), optional
+        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
+
+    Returns
+    -------
+    List[Dict[str, Any]]
+        IAM policy assignments.
+
+    Examples
+    --------
+    >>> import awswrangler as wr
+    >>> assigns = wr.quicksight.list_iam_policy_assignments()
+    """
+    args: Dict[str, Any] = {
+        "func_name": "list_iam_policy_assignments",
+        "attr_name": "IAMPolicyAssignments",
+        "account_id": account_id,
+        "boto3_session": boto3_session,
+        "Namespace": namespace,
+    }
+    if status is not None:
+        args["AssignmentStatus"] = status
+    return _list(**args)
+
+
+def list_iam_policy_assignments_for_user(
+    user_name: str,
+    namespace: str = "default",
+    account_id: Optional[str] = None,
+    boto3_session: Optional[boto3.Session] = None,
+) -> List[Dict[str, Any]]:
+    """List all the IAM policy assignments.
+
+    Including the Amazon Resource Names (ARNs) for the IAM policies assigned
+    to the specified user and group or groups that the user belongs to.
+
+    Parameters
+    ----------
+    user_name : str
+        The name of the user.
+    namespace : str
+        The namespace. Currently, you should set this to default.
+    account_id : str, optional
+        If None, the account ID will be inferred from your boto3 session.
+    boto3_session : boto3.Session(), optional
+        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
+
+    Returns
+    -------
+    List[Dict[str, Any]]
+        IAM policy assignments.
+
+    Examples
+    --------
+    >>> import awswrangler as wr
+    >>> assigns = wr.quicksight.list_iam_policy_assignments_for_user(user_name="...")
+    """
+    return _list(
+        func_name="list_iam_policy_assignments_for_user",
+        attr_name="ActiveAssignments",
+        account_id=account_id,
+        boto3_session=boto3_session,
+        UserName=user_name,
+        Namespace=namespace,
+    )
+
+
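A hedged sketch of auditing one user's active assignments with the helper above (the user name is a placeholder; `AssignmentName`/`PolicyArn` follow the QuickSight `ListIAMPolicyAssignmentsForUser` response):

```py
import awswrangler as wr

# Active IAM policy assignments for a single QuickSight user.
assignments = wr.quicksight.list_iam_policy_assignments_for_user(user_name="analyst-1")
for assignment in assignments:
    print(assignment["AssignmentName"], assignment["PolicyArn"])
```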
+def list_user_groups(
+    user_name: str,
+    namespace: str = "default",
+    account_id: Optional[str] = None,
+    boto3_session: Optional[boto3.Session] = None,
+) -> List[Dict[str, Any]]:
+    """List the Amazon QuickSight groups that an Amazon QuickSight user is a member of.
+
+    Parameters
+    ----------
+    user_name : str
+        The Amazon QuickSight user name that you want to list group memberships for.
+    namespace : str
+        The namespace. Currently, you should set this to default.
+    account_id : str, optional
+        If None, the account ID will be inferred from your boto3 session.
+    boto3_session : boto3.Session(), optional
+        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
+
+    Returns
+    -------
+    List[Dict[str, Any]]
+        Groups.
+
+    Examples
+    --------
+    >>> import awswrangler as wr
+    >>> groups = wr.quicksight.list_user_groups(user_name="...")
+    """
+    return _list(
+        func_name="list_user_groups",
+        attr_name="GroupList",
+        account_id=account_id,
+        boto3_session=boto3_session,
+        UserName=user_name,
+        Namespace=namespace,
+    )
+
+
+def list_users(
+    namespace: str = "default", account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None
+) -> List[Dict[str, Any]]:
+    """Return a list of all of the Amazon QuickSight users belonging to this account.
+
+    Parameters
+    ----------
+    namespace : str
+        The namespace. Currently, you should set this to default.
+    account_id : str, optional
+        If None, the account ID will be inferred from your boto3 session.
+    boto3_session : boto3.Session(), optional
+        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
+
+    Returns
+    -------
+    List[Dict[str, Any]]
+        Users.
+
+    Examples
+    --------
+    >>> import awswrangler as wr
+    >>> users = wr.quicksight.list_users()
+    """
+    return _list(
+        func_name="list_users",
+        attr_name="UserList",
+        account_id=account_id,
+        boto3_session=boto3_session,
+        Namespace=namespace,
+    )
+
+
+def list_ingestions(
+    dataset_name: Optional[str] = None,
+    dataset_id: Optional[str] = None,
+    account_id: Optional[str] = None,
+    boto3_session: Optional[boto3.Session] = None,
+) -> List[Dict[str, Any]]:
+    """List the history of SPICE ingestions for a dataset.
+
+    Parameters
+    ----------
+    dataset_name : str, optional
+        Dataset name.
+    dataset_id : str, optional
+        The ID of the dataset used in the ingestion.
+    account_id : str, optional
+        If None, the account ID will be inferred from your boto3 session.
+    boto3_session : boto3.Session(), optional
+        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
+
+    Returns
+    -------
+    List[Dict[str, Any]]
+        Ingestions.
+
+    Examples
+    --------
+    >>> import awswrangler as wr
+    >>> ingestions = wr.quicksight.list_ingestions(dataset_name="...")
+    """
+    if (dataset_name is None) and (dataset_id is None):
+        raise exceptions.InvalidArgument("You must pass a not None dataset_name or dataset_id argument.")
+    session: boto3.Session = _utils.ensure_session(session=boto3_session)
+    if account_id is None:
+        account_id = _utils.get_account_id(boto3_session=session)
+    if (dataset_id is None) and (dataset_name is not None):
+        dataset_id = get_dataset_id(name=dataset_name, account_id=account_id, boto3_session=session)
+    return _list(
+        func_name="list_ingestions",
+        attr_name="Ingestions",
+        account_id=account_id,
+        boto3_session=boto3_session,
+        DataSetId=dataset_id,
+    )
+
+
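Combined with `describe_ingestion` from the describe module, the listing makes it easy to check SPICE refresh history. A sketch (the dataset name is a placeholder; `IngestionId`/`IngestionStatus`/`ErrorInfo` follow the QuickSight `ListIngestions` response):

```py
import awswrangler as wr

# Recent SPICE ingestions for a dataset and their outcomes.
ingestions = wr.quicksight.list_ingestions(dataset_name="my-dataset")
for ingestion in ingestions[:5]:
    print(ingestion["IngestionId"], ingestion["IngestionStatus"])

# Failed runs can be inspected in more detail.
failed = [i for i in ingestions if i["IngestionStatus"] == "FAILED"]
if failed:
    detail = wr.quicksight.describe_ingestion(
        ingestion_id=failed[0]["IngestionId"], dataset_name="my-dataset"
    )
    print(detail.get("ErrorInfo"))
```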
+def _get_ids(
+    name: str,
+    func: Callable,
+    attr_name: str,
+    account_id: Optional[str] = None,
+    boto3_session: Optional[boto3.Session] = None,
+) -> List[str]:
+    ids: List[str] = []
+    for item in func(account_id=account_id, boto3_session=boto3_session):
+        if item["Name"] == name:
+            ids.append(item[attr_name])
+    return ids
+
+
+def _get_id(
+    name: str,
+    func: Callable,
+    attr_name: str,
+    account_id: Optional[str] = None,
+    boto3_session: Optional[boto3.Session] = None,
+) -> str:
+    ids: List[str] = _get_ids(
+        name=name, func=func, attr_name=attr_name, account_id=account_id, boto3_session=boto3_session
+    )
+    if len(ids) == 0:
+        raise exceptions.InvalidArgument(f"There is no {attr_name} related with name {name}")
+    if len(ids) > 1:
+        raise exceptions.InvalidArgument(
+            f"There are {len(ids)} {attr_name} with name {name}. "
+            f"Please pass the id argument to specify "
+            f"which one you would like to describe."
+        )
+    return ids[0]
+
+
+def get_dashboard_ids(
+    name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None
+) -> List[str]:
+    """Get QuickSight dashboard IDs given a name.
+
+    Note
+    ----
+    This function returns a list of IDs because QuickSight accepts duplicated dashboard names,
+    so you may have more than 1 ID for a given name.
+
+    Parameters
+    ----------
+    name : str
+        Dashboard name.
+    account_id : str, optional
+        If None, the account ID will be inferred from your boto3 session.
+    boto3_session : boto3.Session(), optional
+        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
+
+    Returns
+    -------
+    List[str]
+        Dashboard IDs.
+
+    Examples
+    --------
+    >>> import awswrangler as wr
+    >>> ids = wr.quicksight.get_dashboard_ids(name="...")
+    """
+    return _get_ids(
+        name=name, func=list_dashboards, attr_name="DashboardId", account_id=account_id, boto3_session=boto3_session
+    )
+
+
+def get_dashboard_id(name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> str:
+    """Get QuickSight dashboard ID given a name and fails if there is more than 1 ID associated with this name.
+
+    Parameters
+    ----------
+    name : str
+        Dashboard name.
+    account_id : str, optional
+        If None, the account ID will be inferred from your boto3 session.
+    boto3_session : boto3.Session(), optional
+        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
+
+    Returns
+    -------
+    str
+        Dashboard ID.
+
+    Examples
+    --------
+    >>> import awswrangler as wr
+    >>> my_id = wr.quicksight.get_dashboard_id(name="...")
+    """
+    return _get_id(
+        name=name, func=list_dashboards, attr_name="DashboardId", account_id=account_id, boto3_session=boto3_session
+    )
+
+
+def get_dataset_ids(
+    name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None
+) -> List[str]:
+    """Get QuickSight dataset IDs given a name.
+
+    Note
+    ----
+    This function returns a list of IDs because QuickSight accepts duplicated dataset names,
+    so you may have more than 1 ID for a given name.
+
+    Parameters
+    ----------
+    name : str
+        Dataset name.
+    account_id : str, optional
+        If None, the account ID will be inferred from your boto3 session.
+    boto3_session : boto3.Session(), optional
+        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
+
+    Returns
+    -------
+    List[str]
+        Dataset IDs.
+
+    Examples
+    --------
+    >>> import awswrangler as wr
+    >>> ids = wr.quicksight.get_dataset_ids(name="...")
+    """
+    return _get_ids(
+        name=name, func=list_datasets, attr_name="DataSetId", account_id=account_id, boto3_session=boto3_session
+    )
+
+
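Because QuickSight allows duplicated resource names, the singular `get_*_id` helpers raise `wr.exceptions.InvalidArgument` when a name matches zero or several resources, while the plural `get_*_ids` variants let callers recover. A sketch (the dashboard name and the tie-break rule are illustrative):

```py
import awswrangler as wr
from awswrangler import exceptions

try:
    dashboard_id = wr.quicksight.get_dashboard_id(name="sales-dashboard")
except exceptions.InvalidArgument:
    # Either no dashboard has this name or several share it: fall back to the full list.
    candidate_ids = wr.quicksight.get_dashboard_ids(name="sales-dashboard")
    if not candidate_ids:
        raise
    dashboard_id = sorted(candidate_ids)[0]  # arbitrary but deterministic choice
print(dashboard_id)
```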
+def get_dataset_id(name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> str:
+    """Get QuickSight Dataset ID given a name and fails if there is more than 1 ID associated with this name.
+
+    Parameters
+    ----------
+    name : str
+        Dataset name.
+    account_id : str, optional
+        If None, the account ID will be inferred from your boto3 session.
+    boto3_session : boto3.Session(), optional
+        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
+
+    Returns
+    -------
+    str
+        Dataset ID.
+
+    Examples
+    --------
+    >>> import awswrangler as wr
+    >>> my_id = wr.quicksight.get_dataset_id(name="...")
+    """
+    return _get_id(
+        name=name, func=list_datasets, attr_name="DataSetId", account_id=account_id, boto3_session=boto3_session
+    )
+
+
+def get_data_source_ids(
+    name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None
+) -> List[str]:
+    """Get QuickSight data source IDs given a name.
+
+    Note
+    ----
+    This function returns a list of IDs because QuickSight accepts duplicated data source names,
+    so you may have more than 1 ID for a given name.
+
+    Parameters
+    ----------
+    name : str
+        Data source name.
+    account_id : str, optional
+        If None, the account ID will be inferred from your boto3 session.
+    boto3_session : boto3.Session(), optional
+        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
+
+    Returns
+    -------
+    List[str]
+        Data source IDs.
+
+    Examples
+    --------
+    >>> import awswrangler as wr
+    >>> ids = wr.quicksight.get_data_source_ids(name="...")
+    """
+    return _get_ids(
+        name=name, func=list_data_sources, attr_name="DataSourceId", account_id=account_id, boto3_session=boto3_session
+    )
+
+
+def get_data_source_id(
+    name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None
+) -> str:
+    """Get QuickSight data source ID given a name and fails if there is more than 1 ID associated with this name.
+
+    Parameters
+    ----------
+    name : str
+        Data source name.
+    account_id : str, optional
+        If None, the account ID will be inferred from your boto3 session.
+    boto3_session : boto3.Session(), optional
+        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
+
+    Returns
+    -------
+    str
+        Data source ID.
+
+    Examples
+    --------
+    >>> import awswrangler as wr
+    >>> my_id = wr.quicksight.get_data_source_id(name="...")
+    """
+    return _get_id(
+        name=name, func=list_data_sources, attr_name="DataSourceId", account_id=account_id, boto3_session=boto3_session
+    )
+
+
+def get_template_ids(
+    name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None
+) -> List[str]:
+    """Get QuickSight template IDs given a name.
+
+    Note
+    ----
+    This function returns a list of IDs because QuickSight accepts duplicated template names,
+    so you may have more than 1 ID for a given name.
+
+    Parameters
+    ----------
+    name : str
+        Template name.
+    account_id : str, optional
+        If None, the account ID will be inferred from your boto3 session.
+    boto3_session : boto3.Session(), optional
+        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
+
+    Returns
+    -------
+    List[str]
+        Template IDs.
+
+    Examples
+    --------
+    >>> import awswrangler as wr
+    >>> ids = wr.quicksight.get_template_ids(name="...")
+    """
+    return _get_ids(
+        name=name, func=list_templates, attr_name="TemplateId", account_id=account_id, boto3_session=boto3_session
+    )
+
+
+def get_template_id(name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> str:
+    """Get QuickSight template ID given a name and fails if there is more than 1 ID associated with this name.
+
+    Parameters
+    ----------
+    name : str
+        Template name.
+    account_id : str, optional
+        If None, the account ID will be inferred from your boto3 session.
+    boto3_session : boto3.Session(), optional
+        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
+
+    Returns
+    -------
+    str
+        Template ID.
+
+    Examples
+    --------
+    >>> import awswrangler as wr
+    >>> my_id = wr.quicksight.get_template_id(name="...")
+    """
+    return _get_id(
+        name=name, func=list_templates, attr_name="TemplateId", account_id=account_id, boto3_session=boto3_session
+    )
+
+
+def get_data_source_arns(
+    name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None
+) -> List[str]:
+    """Get QuickSight Data source ARNs given a name.
+
+    Note
+    ----
+    This function returns a list of ARNs because QuickSight accepts duplicated data source names,
+    so you may have more than 1 ARN for a given name.
+
+    Parameters
+    ----------
+    name : str
+        Data source name.
+    account_id : str, optional
+        If None, the account ID will be inferred from your boto3 session.
+    boto3_session : boto3.Session(), optional
+        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
+
+    Returns
+    -------
+    List[str]
+        Data source ARNs.
+
+    Examples
+    --------
+    >>> import awswrangler as wr
+    >>> arns = wr.quicksight.get_data_source_arns(name="...")
+    """
+    arns: List[str] = []
+    for source in list_data_sources(account_id=account_id, boto3_session=boto3_session):
+        if source["Name"] == name:
+            arns.append(source["Arn"])
+    return arns
+
+
+def get_data_source_arn(
+    name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None
+) -> str:
+    """Get QuickSight data source ARN given a name and fails if there is more than 1 ARN associated with this name.
+
+    Parameters
+    ----------
+    name : str
+        Data source name.
+    account_id : str, optional
+        If None, the account ID will be inferred from your boto3 session.
+    boto3_session : boto3.Session(), optional
+        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
+
+    Returns
+    -------
+    str
+        Data source ARN.
+
+    Examples
+    --------
+    >>> import awswrangler as wr
+    >>> arn = wr.quicksight.get_data_source_arn("...")
+    """
+    arns: List[str] = get_data_source_arns(name=name, account_id=account_id, boto3_session=boto3_session)
+    if len(arns) == 0:
+        raise exceptions.InvalidArgument(f"There is no data source with name {name}")
+    if len(arns) > 1:
+        raise exceptions.InvalidArgument(
+            f"There is more than 1 data source with name {name}. "
+            f"Please pass the data_source_arn argument to specify "
+            f"which one you would like to describe."
+ ) + return arns[0] diff --git a/awswrangler/quicksight/_utils.py b/awswrangler/quicksight/_utils.py new file mode 100644 index 000000000..957cf9f53 --- /dev/null +++ b/awswrangler/quicksight/_utils.py @@ -0,0 +1,35 @@ +"""Internal (private) Amazon QuickSight Utilities Module.""" + +import logging +from typing import Any, Dict, List, Optional + +import boto3 # type: ignore + +from awswrangler import _data_types, athena, catalog, exceptions +from awswrangler.quicksight._get_list import list_data_sources + +_logger: logging.Logger = logging.getLogger(__name__) + + +def extract_athena_table_columns(database: str, table: str, boto3_session: boto3.Session) -> List[Dict[str, str]]: + """Extract athena columns data types from table and raising an exception if not exist.""" + dtypes: Optional[Dict[str, str]] = catalog.get_table_types( + database=database, table=table, boto3_session=boto3_session + ) + if dtypes is None: + raise exceptions.InvalidArgument(f"{database}.{table} does not exist on Athena.") + return [{"Name": name, "Type": _data_types.athena2quicksight(dtype=dtype)} for name, dtype in dtypes.items()] + + +def extract_athena_query_columns( + sql: str, data_source_arn: str, account_id: str, boto3_session: boto3.Session +) -> List[Dict[str, str]]: + """Extract athena columns data types from a SQL query.""" + data_sources: List[Dict[str, Any]] = list_data_sources(account_id=account_id, boto3_session=boto3_session) + data_source: Dict[str, Any] = [x for x in data_sources if x["Arn"] == data_source_arn][0] + workgroup: str = data_source["DataSourceParameters"]["AthenaParameters"]["WorkGroup"] + sql_wrapped: str = f"/* QuickSight */\nSELECT ds.* FROM ( {sql} ) ds LIMIT 0" + query_id: str = athena.start_query_execution(sql=sql_wrapped, workgroup=workgroup, boto3_session=boto3_session) + athena.wait_query(query_execution_id=query_id, boto3_session=boto3_session) + dtypes: Dict[str, str] = athena.get_query_columns_types(query_execution_id=query_id, boto3_session=boto3_session) + return [{"Name": name, "Type": _data_types.athena2quicksight(dtype=dtype)} for name, dtype in dtypes.items()] diff --git a/awswrangler/s3.py b/awswrangler/s3.py deleted file mode 100644 index d82df8567..000000000 --- a/awswrangler/s3.py +++ /dev/null @@ -1,2656 +0,0 @@ -"""Amazon S3 Module.""" - -import concurrent.futures -import csv -import logging -import time -import uuid -from itertools import repeat -from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union - -import boto3 # type: ignore -import botocore.exceptions # type: ignore -import pandas as pd # type: ignore -import pandas.io.parsers # type: ignore -import pyarrow as pa # type: ignore -import pyarrow.lib # type: ignore -import pyarrow.parquet # type: ignore -import s3fs # type: ignore -from boto3.s3.transfer import TransferConfig # type: ignore -from pandas.io.common import infer_compression # type: ignore - -from awswrangler import _data_types, _utils, catalog, exceptions - -_COMPRESSION_2_EXT: Dict[Optional[str], str] = {None: "", "gzip": ".gz", "snappy": ".snappy"} - -_logger: logging.Logger = logging.getLogger(__name__) - - -def get_bucket_region(bucket: str, boto3_session: Optional[boto3.Session] = None) -> str: - """Get bucket region name. - - Parameters - ---------- - bucket : str - Bucket name. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - str - Region code (e.g. 'us-east-1'). 
- - Examples - -------- - Using the default boto3 session - - >>> import awswrangler as wr - >>> region = wr.s3.get_bucket_region('bucket-name') - - Using a custom boto3 session - - >>> import boto3 - >>> import awswrangler as wr - >>> region = wr.s3.get_bucket_region('bucket-name', boto3_session=boto3.Session()) - - """ - client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) - _logger.debug("bucket: %s", bucket) - region: str = client_s3.get_bucket_location(Bucket=bucket)["LocationConstraint"] - region = "us-east-1" if region is None else region - _logger.debug("region: %s", region) - return region - - -def does_object_exist(path: str, boto3_session: Optional[boto3.Session] = None) -> bool: - """Check if object exists on S3. - - Parameters - ---------- - path: str - S3 path (e.g. s3://bucket/key). - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - bool - True if exists, False otherwise. - - Examples - -------- - Using the default boto3 session - - >>> import awswrangler as wr - >>> wr.s3.does_object_exist('s3://bucket/key_real') - True - >>> wr.s3.does_object_exist('s3://bucket/key_unreal') - False - - Using a custom boto3 session - - >>> import boto3 - >>> import awswrangler as wr - >>> wr.s3.does_object_exist('s3://bucket/key_real', boto3_session=boto3.Session()) - True - >>> wr.s3.does_object_exist('s3://bucket/key_unreal', boto3_session=boto3.Session()) - False - - """ - client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) - bucket: str - key: str - bucket, key = path.replace("s3://", "").split("/", 1) - try: - client_s3.head_object(Bucket=bucket, Key=key) - return True - except botocore.exceptions.ClientError as ex: - if ex.response["ResponseMetadata"]["HTTPStatusCode"] == 404: - return False - raise ex # pragma: no cover - - -def list_directories(path: str, boto3_session: Optional[boto3.Session] = None) -> List[str]: - """List Amazon S3 objects from a prefix. - - Parameters - ---------- - path : str - S3 path (e.g. s3://bucket/prefix). - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[str] - List of objects paths. - - Examples - -------- - Using the default boto3 session - - >>> import awswrangler as wr - >>> wr.s3.list_objects('s3://bucket/prefix/') - ['s3://bucket/prefix/dir0', 's3://bucket/prefix/dir1', 's3://bucket/prefix/dir2'] - - Using a custom boto3 session - - >>> import boto3 - >>> import awswrangler as wr - >>> wr.s3.list_objects('s3://bucket/prefix/', boto3_session=boto3.Session()) - ['s3://bucket/prefix/dir0', 's3://bucket/prefix/dir1', 's3://bucket/prefix/dir2'] - - """ - return _list_objects(path=path, delimiter="/", boto3_session=boto3_session) - - -def list_objects(path: str, suffix: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> List[str]: - """List Amazon S3 objects from a prefix. - - Parameters - ---------- - path : str - S3 path (e.g. s3://bucket/prefix). - suffix: str, optional - Suffix for filtering S3 keys. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[str] - List of objects paths. 
- - Examples - -------- - Using the default boto3 session - - >>> import awswrangler as wr - >>> wr.s3.list_objects('s3://bucket/prefix') - ['s3://bucket/prefix0', 's3://bucket/prefix1', 's3://bucket/prefix2'] - - Using a custom boto3 session - - >>> import boto3 - >>> import awswrangler as wr - >>> wr.s3.list_objects('s3://bucket/prefix', boto3_session=boto3.Session()) - ['s3://bucket/prefix0', 's3://bucket/prefix1', 's3://bucket/prefix2'] - - """ - paths: List[str] = _list_objects(path=path, delimiter=None, suffix=suffix, boto3_session=boto3_session) - return [p for p in paths if not p.endswith("/")] - - -def _list_objects( - path: str, - delimiter: Optional[str] = None, - suffix: Optional[str] = None, - boto3_session: Optional[boto3.Session] = None, -) -> List[str]: - client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) - paginator = client_s3.get_paginator("list_objects_v2") - bucket: str - prefix: str - bucket, prefix = _utils.parse_path(path=path) - args: Dict[str, Any] = {"Bucket": bucket, "Prefix": prefix, "PaginationConfig": {"PageSize": 1000}} - if delimiter is not None: - args["Delimiter"] = delimiter - response_iterator = paginator.paginate(**args) - paths: List[str] = [] - for page in response_iterator: # pylint: disable=too-many-nested-blocks - if delimiter is None: - contents: Optional[List] = page.get("Contents") - if contents is not None: - for content in contents: - if (content is not None) and ("Key" in content): - key: str = content["Key"] - if (suffix is None) or key.endswith(suffix): - paths.append(f"s3://{bucket}/{key}") - else: - prefixes: Optional[List[Optional[Dict[str, str]]]] = page.get("CommonPrefixes") - if prefixes is not None: - for pfx in prefixes: - if (pfx is not None) and ("Prefix" in pfx): - key = pfx["Prefix"] - paths.append(f"s3://{bucket}/{key}") - return paths - - -def _path2list(path: object, boto3_session: boto3.Session, suffix: str = None) -> List[str]: - if isinstance(path, str): # prefix - paths: List[str] = list_objects(path=path, suffix=suffix, boto3_session=boto3_session) - elif isinstance(path, list): - paths = path if suffix is None else [x for x in path if x.endswith(suffix)] - else: - raise exceptions.InvalidArgumentType(f"{type(path)} is not a valid path type. Please, use str or List[str].") - return paths - - -def delete_objects( - path: Union[str, List[str]], use_threads: bool = True, boto3_session: Optional[boto3.Session] = None -) -> None: - """Delete Amazon S3 objects from a received S3 prefix or list of S3 objects paths. - - Note - ---- - In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). - - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). - use_threads : bool - True to enable concurrent requests, False to disable multiple threads. - If enabled os.cpu_count() will be used as the max number of threads. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - None - None. 
- - Examples - -------- - >>> import awswrangler as wr - >>> wr.s3.delete_objects(['s3://bucket/key0', 's3://bucket/key1']) # Delete both objects - >>> wr.s3.delete_objects('s3://bucket/prefix') # Delete all objects under the received prefix - - """ - paths: List[str] = _path2list(path=path, boto3_session=boto3_session) - if len(paths) < 1: - return - client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) - buckets: Dict[str, List[str]] = _split_paths_by_bucket(paths=paths) - for bucket, keys in buckets.items(): - chunks: List[List[str]] = _utils.chunkify(lst=keys, max_length=1_000) - if use_threads is False: - for chunk in chunks: - _delete_objects(bucket=bucket, keys=chunk, client_s3=client_s3) - else: - cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) - with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor: - list(executor.map(_delete_objects, repeat(bucket), chunks, repeat(client_s3))) - - -def _split_paths_by_bucket(paths: List[str]) -> Dict[str, List[str]]: - buckets: Dict[str, List[str]] = {} - bucket: str - key: str - for path in paths: - bucket, key = _utils.parse_path(path=path) - if bucket not in buckets: - buckets[bucket] = [] - buckets[bucket].append(key) - return buckets - - -def _delete_objects(bucket: str, keys: List[str], client_s3: boto3.client) -> None: - _logger.debug("len(keys): %s", len(keys)) - batch: List[Dict[str, str]] = [{"Key": key} for key in keys] - res = client_s3.delete_objects(Bucket=bucket, Delete={"Objects": batch}) - deleted = res.get("Deleted") - if deleted is not None: - for i in deleted: - _logger.debug("s3://%s/%s has been deleted.", bucket, i.get("Key")) - errors = res.get("Errors") - if errors is not None: # pragma: no cover - raise exceptions.ServiceApiError(errors) - - -def describe_objects( - path: Union[str, List[str]], - wait_time: Optional[Union[int, float]] = None, - use_threads: bool = True, - boto3_session: Optional[boto3.Session] = None, -) -> Dict[str, Dict[str, Any]]: - """Describe Amazon S3 objects from a received S3 prefix or list of S3 objects paths. - - Fetch attributes like ContentLength, DeleteMarker, LastModified, ContentType, etc - The full list of attributes can be explored under the boto3 head_object documentation: - https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.head_object - - Note - ---- - In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). - - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). - wait_time : Union[int,float], optional - How much time (seconds) should Wrangler try to reach this objects. - Very useful to overcome eventual consistence issues. - `None` means only a single try will be done. - use_threads : bool - True to enable concurrent requests, False to disable multiple threads. - If enabled os.cpu_count() will be used as the max number of threads. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - Dict[str, Dict[str, Any]] - Return a dictionary of objects returned from head_objects where the key is the object path. 
- The response object can be explored here: - https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.head_object - - Examples - -------- - >>> import awswrangler as wr - >>> descs0 = wr.s3.describe_objects(['s3://bucket/key0', 's3://bucket/key1']) # Describe both objects - >>> descs1 = wr.s3.describe_objects('s3://bucket/prefix') # Describe all objects under the prefix - >>> descs2 = wr.s3.describe_objects('s3://bucket/prefix', wait_time=30) # Overcoming eventual consistence issues - - """ - paths: List[str] = _path2list(path=path, boto3_session=boto3_session) - if len(paths) < 1: - return {} - client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) - resp_list: List[Tuple[str, Dict[str, Any]]] - if use_threads is False: - resp_list = [_describe_object(path=p, wait_time=wait_time, client_s3=client_s3) for p in paths] - else: - cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) - with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor: - resp_list = list(executor.map(_describe_object, paths, repeat(wait_time), repeat(client_s3))) - desc_dict: Dict[str, Dict[str, Any]] = dict(resp_list) - return desc_dict - - -def _describe_object( - path: str, wait_time: Optional[Union[int, float]], client_s3: boto3.client -) -> Tuple[str, Dict[str, Any]]: - wait_time = int(wait_time) if isinstance(wait_time, float) else wait_time - tries: int = wait_time if (wait_time is not None) and (wait_time > 0) else 1 - bucket: str - key: str - bucket, key = _utils.parse_path(path=path) - desc: Dict[str, Any] = {} - for i in range(tries, 0, -1): - try: - desc = client_s3.head_object(Bucket=bucket, Key=key) - break - except botocore.exceptions.ClientError as e: # pragma: no cover - if e.response["ResponseMetadata"]["HTTPStatusCode"] == 404: # Not Found - _logger.debug("Object not found. %s seconds remaining to wait.", i) - if i == 1: # Last try, there is no more need to sleep - break - time.sleep(1) - else: - raise e - return path, desc - - -def size_objects( - path: Union[str, List[str]], - wait_time: Optional[Union[int, float]] = None, - use_threads: bool = True, - boto3_session: Optional[boto3.Session] = None, -) -> Dict[str, Optional[int]]: - """Get the size (ContentLength) in bytes of Amazon S3 objects from a received S3 prefix or list of S3 objects paths. - - Note - ---- - In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). - - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). - wait_time : Union[int,float], optional - How much time (seconds) should Wrangler try to reach this objects. - Very useful to overcome eventual consistence issues. - `None` means only a single try will be done. - use_threads : bool - True to enable concurrent requests, False to disable multiple threads. - If enabled os.cpu_count() will be used as the max number of threads. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - Dict[str, Optional[int]] - Dictionary where the key is the object path and the value is the object size. 
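For reference, a small sketch of the `wait_time` retry documented above, used to tolerate S3's eventual consistency right after writing new keys (bucket and prefix are placeholders):

```py
import awswrangler as wr

# Total size in bytes under a prefix, retrying head_object for up to 30 seconds
# so freshly written keys have time to become visible.
sizes = wr.s3.size_objects("s3://my-bucket/my-prefix/", wait_time=30)
total_bytes = sum(size for size in sizes.values() if size is not None)
print(f"{len(sizes)} objects, {total_bytes / 1024 ** 2:.1f} MiB")
```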
- - Examples - -------- - >>> import awswrangler as wr - >>> sizes0 = wr.s3.size_objects(['s3://bucket/key0', 's3://bucket/key1']) # Get the sizes of both objects - >>> sizes1 = wr.s3.size_objects('s3://bucket/prefix') # Get the sizes of all objects under the received prefix - >>> sizes2 = wr.s3.size_objects('s3://bucket/prefix', wait_time=30) # Overcoming eventual consistence issues - - """ - desc_list: Dict[str, Dict[str, Any]] = describe_objects( - path=path, wait_time=wait_time, use_threads=use_threads, boto3_session=boto3_session - ) - size_dict: Dict[str, Optional[int]] = {k: d.get("ContentLength", None) for k, d in desc_list.items()} - return size_dict - - -def to_csv( # pylint: disable=too-many-arguments - df: pd.DataFrame, - path: str, - sep: str = ",", - index: bool = True, - columns: Optional[List[str]] = None, - use_threads: bool = True, - boto3_session: Optional[boto3.Session] = None, - s3_additional_kwargs: Optional[Dict[str, str]] = None, - dataset: bool = False, - partition_cols: Optional[List[str]] = None, - mode: Optional[str] = None, - catalog_versioning: bool = False, - database: Optional[str] = None, - table: Optional[str] = None, - dtype: Optional[Dict[str, str]] = None, - description: Optional[str] = None, - parameters: Optional[Dict[str, str]] = None, - columns_comments: Optional[Dict[str, str]] = None, - regular_partitions: bool = True, - projection_enabled: bool = False, - projection_types: Optional[Dict[str, str]] = None, - projection_ranges: Optional[Dict[str, str]] = None, - projection_values: Optional[Dict[str, str]] = None, - projection_intervals: Optional[Dict[str, str]] = None, - projection_digits: Optional[Dict[str, str]] = None, - **pandas_kwargs, -) -> Dict[str, Union[List[str], Dict[str, List[str]]]]: - """Write CSV file or dataset on Amazon S3. - - The concept of Dataset goes beyond the simple idea of files and enable more - complex features like partitioning, casting and catalog integration (Amazon Athena/AWS Glue Catalog). - - Note - ---- - The table name and all column names will be automatically sanitize using - `wr.catalog.sanitize_table_name` and `wr.catalog.sanitize_column_name`. - - Note - ---- - On `append` mode, the `parameters` will be upsert on an existing table. - - Note - ---- - In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). - - Parameters - ---------- - df: pandas.DataFrame - Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html - path : str - Amazon S3 path (e.g. s3://bucket/filename.csv). - sep : str - String of length 1. Field delimiter for the output file. - index : bool - Write row names (index). - columns : List[str], optional - Columns to write. - use_threads : bool - True to enable concurrent requests, False to disable multiple threads. - If enabled os.cpu_count() will be used as the max number of threads. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 Session will be used if boto3_session receive None. - s3_additional_kwargs: - Forward to s3fs, useful for server side encryption - https://s3fs.readthedocs.io/en/latest/#serverside-encryption - dataset: bool - If True store a parquet dataset instead of a single file. - If True, enable all follow arguments: - partition_cols, mode, database, table, description, parameters, columns_comments, . - partition_cols: List[str], optional - List of column names that will be used to create partitions. Only takes effect if dataset=True. 
- mode: str, optional - ``append`` (Default), ``overwrite``, ``overwrite_partitions``. Only takes effect if dataset=True. - catalog_versioning : bool - If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. - database : str, optional - Glue/Athena catalog: Database name. - table : str, optional - Glue/Athena catalog: Table name. - dtype : Dict[str, str], optional - Dictionary of columns names and Athena/Glue types to be casted. - Useful when you have columns with undetermined or mixed data types. - (e.g. {'col name': 'bigint', 'col2 name': 'int'}) - description : str, optional - Glue/Athena catalog: Table description - parameters : Dict[str, str], optional - Glue/Athena catalog: Key/value pairs to tag the table. - columns_comments : Dict[str, str], optional - Glue/Athena catalog: - Columns names and the related comments (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}). - regular_partitions : bool - Create regular partitions (Non projected partitions) on Glue Catalog. - Disable when you will work only with Partition Projection. - Keep enabled even when working with projections is useful to keep - Redshift Spectrum working with the regular partitions. - projection_enabled : bool - Enable Partition Projection on Athena (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html) - projection_types : Optional[Dict[str, str]] - Dictionary of partitions names and Athena projections types. - Valid types: "enum", "integer", "date", "injected" - https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html - (e.g. {'col_name': 'enum', 'col2_name': 'integer'}) - projection_ranges: Optional[Dict[str, str]] - Dictionary of partitions names and Athena projections ranges. - https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html - (e.g. {'col_name': '0,10', 'col2_name': '-1,8675309'}) - projection_values: Optional[Dict[str, str]] - Dictionary of partitions names and Athena projections values. - https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html - (e.g. {'col_name': 'A,B,Unknown', 'col2_name': 'foo,boo,bar'}) - projection_intervals: Optional[Dict[str, str]] - Dictionary of partitions names and Athena projections intervals. - https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html - (e.g. {'col_name': '1', 'col2_name': '5'}) - projection_digits: Optional[Dict[str, str]] - Dictionary of partitions names and Athena projections digits. - https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html - (e.g. {'col_name': '1', 'col2_name': '2'}) - pandas_kwargs : - keyword arguments forwarded to pandas.DataFrame.to_csv() - https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html - - Returns - ------- - None - None. - - Examples - -------- - Writing single file - - >>> import awswrangler as wr - >>> import pandas as pd - >>> wr.s3.to_csv( - ... df=pd.DataFrame({'col': [1, 2, 3]}), - ... path='s3://bucket/prefix/my_file.csv', - ... ) - { - 'paths': ['s3://bucket/prefix/my_file.csv'], - 'partitions_values': {} - } - - Writing single file encrypted with a KMS key - - >>> import awswrangler as wr - >>> import pandas as pd - >>> wr.s3.to_csv( - ... df=pd.DataFrame({'col': [1, 2, 3]}), - ... path='s3://bucket/prefix/my_file.csv', - ... s3_additional_kwargs={ - ... 'ServerSideEncryption': 'aws:kms', - ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' - ... } - ... 
) - { - 'paths': ['s3://bucket/prefix/my_file.csv'], - 'partitions_values': {} - } - - Writing partitioned dataset - - >>> import awswrangler as wr - >>> import pandas as pd - >>> wr.s3.to_csv( - ... df=pd.DataFrame({ - ... 'col': [1, 2, 3], - ... 'col2': ['A', 'A', 'B'] - ... }), - ... path='s3://bucket/prefix', - ... dataset=True, - ... partition_cols=['col2'] - ... ) - { - 'paths': ['s3://.../col2=A/x.csv', 's3://.../col2=B/y.csv'], - 'partitions_values: { - 's3://.../col2=A/': ['A'], - 's3://.../col2=B/': ['B'] - } - } - - Writing dataset to S3 with metadata on Athena/Glue Catalog. - - >>> import awswrangler as wr - >>> import pandas as pd - >>> wr.s3.to_csv( - ... df=pd.DataFrame({ - ... 'col': [1, 2, 3], - ... 'col2': ['A', 'A', 'B'] - ... }), - ... path='s3://bucket/prefix', - ... dataset=True, - ... partition_cols=['col2'], - ... database='default', # Athena/Glue database - ... table='my_table' # Athena/Glue table - ... ) - { - 'paths': ['s3://.../col2=A/x.csv', 's3://.../col2=B/y.csv'], - 'partitions_values: { - 's3://.../col2=A/': ['A'], - 's3://.../col2=B/': ['B'] - } - } - - Writing dataset casting empty column data type - - >>> import awswrangler as wr - >>> import pandas as pd - >>> wr.s3.to_csv( - ... df=pd.DataFrame({ - ... 'col': [1, 2, 3], - ... 'col2': ['A', 'A', 'B'], - ... 'col3': [None, None, None] - ... }), - ... path='s3://bucket/prefix', - ... dataset=True, - ... database='default', # Athena/Glue database - ... table='my_table' # Athena/Glue table - ... dtype={'col3': 'date'} - ... ) - { - 'paths': ['s3://.../x.csv'], - 'partitions_values: {} - } - - """ - if (database is None) ^ (table is None): - raise exceptions.InvalidArgumentCombination( - "Please pass database and table arguments to be able to store the metadata into the Athena/Glue Catalog." - ) - if df.empty is True: - raise exceptions.EmptyDataFrame() - - # Sanitize table to respect Athena's standards - partition_cols = partition_cols if partition_cols else [] - dtype = dtype if dtype else {} - partitions_values: Dict[str, List[str]] = {} - df = catalog.sanitize_dataframe_columns_names(df=df) - partition_cols = [catalog.sanitize_column_name(p) for p in partition_cols] - dtype = {catalog.sanitize_column_name(k): v.lower() for k, v in dtype.items()} - - session: boto3.Session = _utils.ensure_session(session=boto3_session) - fs: s3fs.S3FileSystem = _utils.get_fs(session=session, s3_additional_kwargs=s3_additional_kwargs) - if dataset is False: - if partition_cols: - raise exceptions.InvalidArgumentCombination("Please, pass dataset=True to be able to use partition_cols.") - if mode is not None: - raise exceptions.InvalidArgumentCombination("Please pass dataset=True to be able to use mode.") - if columns_comments: - raise exceptions.InvalidArgumentCombination("Please pass dataset=True to be able to use columns_comments.") - if any(arg is not None for arg in (database, table, description, parameters)): - raise exceptions.InvalidArgumentCombination( - "Please pass dataset=True to be able to use any one of these " - "arguments: database, table, description, parameters, " - "columns_comments." 
- ) - pandas_kwargs["sep"] = sep - pandas_kwargs["index"] = index - pandas_kwargs["columns"] = columns - _to_text(file_format="csv", df=df, path=path, fs=fs, **pandas_kwargs) - paths = [path] - else: - mode = "append" if mode is None else mode - if columns: - df = df[columns] - if ( - (mode in ("append", "overwrite_partitions")) and (database is not None) and (table is not None) - ): # Fetching Catalog Types - catalog_types: Optional[Dict[str, str]] = catalog.get_table_types( - database=database, table=table, boto3_session=session - ) - if catalog_types is not None: - for k, v in catalog_types.items(): - dtype[k] = v - df = catalog.drop_duplicated_columns(df=df) - paths, partitions_values = _to_csv_dataset( - df=df, - path=path, - index=index, - sep=sep, - fs=fs, - use_threads=use_threads, - partition_cols=partition_cols, - dtype=dtype, - mode=mode, - boto3_session=session, - ) - if (database is not None) and (table is not None): - columns_types, partitions_types = _data_types.athena_types_from_pandas_partitioned( - df=df, index=index, partition_cols=partition_cols, dtype=dtype, index_left=True - ) - catalog.create_csv_table( - database=database, - table=table, - path=path, - columns_types=columns_types, - partitions_types=partitions_types, - description=description, - parameters=parameters, - columns_comments=columns_comments, - boto3_session=session, - mode=mode, - catalog_versioning=catalog_versioning, - sep=sep, - projection_enabled=projection_enabled, - projection_types=projection_types, - projection_ranges=projection_ranges, - projection_values=projection_values, - projection_intervals=projection_intervals, - projection_digits=projection_digits, - ) - if partitions_values and (regular_partitions is True): - _logger.debug("partitions_values:\n%s", partitions_values) - catalog.add_csv_partitions( - database=database, table=table, partitions_values=partitions_values, boto3_session=session, sep=sep - ) - return {"paths": paths, "partitions_values": partitions_values} - - -def _to_csv_dataset( - df: pd.DataFrame, - path: str, - index: bool, - sep: str, - fs: s3fs.S3FileSystem, - use_threads: bool, - mode: str, - dtype: Dict[str, str], - partition_cols: Optional[List[str]] = None, - boto3_session: Optional[boto3.Session] = None, -) -> Tuple[List[str], Dict[str, List[str]]]: - paths: List[str] = [] - partitions_values: Dict[str, List[str]] = {} - path = path if path[-1] == "/" else f"{path}/" - if mode not in ["append", "overwrite", "overwrite_partitions"]: - raise exceptions.InvalidArgumentValue( - f"{mode} is a invalid mode, please use append, overwrite or overwrite_partitions." 
- ) - if (mode == "overwrite") or ((mode == "overwrite_partitions") and (not partition_cols)): - delete_objects(path=path, use_threads=use_threads, boto3_session=boto3_session) - df = _data_types.cast_pandas_with_athena_types(df=df, dtype=dtype) - _logger.debug("dtypes: %s", df.dtypes) - if not partition_cols: - file_path: str = f"{path}{uuid.uuid4().hex}.csv" - _to_text( - file_format="csv", - df=df, - path=file_path, - fs=fs, - quoting=csv.QUOTE_NONE, - escapechar="\\", - header=False, - date_format="%Y-%m-%d %H:%M:%S.%f", - index=index, - sep=sep, - ) - paths.append(file_path) - else: - for keys, subgroup in df.groupby(by=partition_cols, observed=True): - subgroup = subgroup.drop(partition_cols, axis="columns") - keys = (keys,) if not isinstance(keys, tuple) else keys - subdir = "/".join([f"{name}={val}" for name, val in zip(partition_cols, keys)]) - prefix: str = f"{path}{subdir}/" - if mode == "overwrite_partitions": - delete_objects(path=prefix, use_threads=use_threads, boto3_session=boto3_session) - file_path = f"{prefix}{uuid.uuid4().hex}.csv" - _to_text( - file_format="csv", - df=subgroup, - path=file_path, - fs=fs, - quoting=csv.QUOTE_NONE, - escapechar="\\", - header=False, - date_format="%Y-%m-%d %H:%M:%S.%f", - index=index, - sep=sep, - ) - paths.append(file_path) - partitions_values[prefix] = [str(k) for k in keys] - return paths, partitions_values - - -def to_json( - df: pd.DataFrame, - path: str, - boto3_session: Optional[boto3.Session] = None, - s3_additional_kwargs: Optional[Dict[str, str]] = None, - **pandas_kwargs, -) -> None: - """Write JSON file on Amazon S3. - - Parameters - ---------- - df: pandas.DataFrame - Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html - path : str - Amazon S3 path (e.g. s3://bucket/filename.csv). - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 Session will be used if boto3_session receive None. - s3_additional_kwargs: - Forward to s3fs, useful for server side encryption - https://s3fs.readthedocs.io/en/latest/#serverside-encryption - pandas_kwargs: - keyword arguments forwarded to pandas.DataFrame.to_csv() - https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html - - Returns - ------- - None - None. - - Examples - -------- - Writing JSON file - - >>> import awswrangler as wr - >>> import pandas as pd - >>> wr.s3.to_json( - ... df=pd.DataFrame({'col': [1, 2, 3]}), - ... path='s3://bucket/filename.json', - ... ) - - Writing CSV file encrypted with a KMS key - - >>> import awswrangler as wr - >>> import pandas as pd - >>> wr.s3.to_json( - ... df=pd.DataFrame({'col': [1, 2, 3]}), - ... path='s3://bucket/filename.json', - ... s3_additional_kwargs={ - ... 'ServerSideEncryption': 'aws:kms', - ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' - ... } - ... 
) - - """ - return _to_text( - file_format="json", - df=df, - path=path, - boto3_session=boto3_session, - s3_additional_kwargs=s3_additional_kwargs, - **pandas_kwargs, - ) - - -def _to_text( - file_format: str, - df: pd.DataFrame, - path: str, - fs: Optional[s3fs.S3FileSystem] = None, - boto3_session: Optional[boto3.Session] = None, - s3_additional_kwargs: Optional[Dict[str, str]] = None, - **pandas_kwargs, -) -> None: - if df.empty is True: # pragma: no cover - raise exceptions.EmptyDataFrame() - if fs is None: - fs = _utils.get_fs(session=boto3_session, s3_additional_kwargs=s3_additional_kwargs) - encoding: Optional[str] = pandas_kwargs.get("encoding", None) - newline: Optional[str] = pandas_kwargs.get("line_terminator", None) - with fs.open(path=path, mode="w", encoding=encoding, newline=newline) as f: - if file_format == "csv": - df.to_csv(f, **pandas_kwargs) - elif file_format == "json": - df.to_json(f, **pandas_kwargs) - - -def to_parquet( # pylint: disable=too-many-arguments,too-many-locals - df: pd.DataFrame, - path: str, - index: bool = False, - compression: Optional[str] = "snappy", - use_threads: bool = True, - boto3_session: Optional[boto3.Session] = None, - s3_additional_kwargs: Optional[Dict[str, str]] = None, - dataset: bool = False, - partition_cols: Optional[List[str]] = None, - mode: Optional[str] = None, - catalog_versioning: bool = False, - database: Optional[str] = None, - table: Optional[str] = None, - dtype: Optional[Dict[str, str]] = None, - description: Optional[str] = None, - parameters: Optional[Dict[str, str]] = None, - columns_comments: Optional[Dict[str, str]] = None, - regular_partitions: bool = True, - projection_enabled: bool = False, - projection_types: Optional[Dict[str, str]] = None, - projection_ranges: Optional[Dict[str, str]] = None, - projection_values: Optional[Dict[str, str]] = None, - projection_intervals: Optional[Dict[str, str]] = None, - projection_digits: Optional[Dict[str, str]] = None, -) -> Dict[str, Union[List[str], Dict[str, List[str]]]]: - """Write Parquet file or dataset on Amazon S3. - - The concept of Dataset goes beyond the simple idea of files and enable more - complex features like partitioning, casting and catalog integration (Amazon Athena/AWS Glue Catalog). - - Note - ---- - The table name and all column names will be automatically sanitize using - `wr.catalog.sanitize_table_name` and `wr.catalog.sanitize_column_name`. - - Note - ---- - On `append` mode, the `parameters` will be upsert on an existing table. - - Note - ---- - In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). - - Parameters - ---------- - df: pandas.DataFrame - Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html - path : str - S3 path (for file e.g. ``s3://bucket/prefix/filename.parquet``) (for dataset e.g. ``s3://bucket/prefix``). - index : bool - True to store the DataFrame index in file, otherwise False to ignore it. - compression: str, optional - Compression style (``None``, ``snappy``, ``gzip``). - use_threads : bool - True to enable concurrent requests, False to disable multiple threads. - If enabled os.cpu_count() will be used as the max number of threads. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
- s3_additional_kwargs: - Forward to s3fs, useful for server side encryption - https://s3fs.readthedocs.io/en/latest/#serverside-encryption - dataset: bool - If True store a parquet dataset instead of a single file. - If True, enable all follow arguments: - partition_cols, mode, database, table, description, parameters, columns_comments, . - partition_cols: List[str], optional - List of column names that will be used to create partitions. Only takes effect if dataset=True. - mode: str, optional - ``append`` (Default), ``overwrite``, ``overwrite_partitions``. Only takes effect if dataset=True. - catalog_versioning : bool - If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. - database : str, optional - Glue/Athena catalog: Database name. - table : str, optional - Glue/Athena catalog: Table name. - dtype : Dict[str, str], optional - Dictionary of columns names and Athena/Glue types to be casted. - Useful when you have columns with undetermined or mixed data types. - (e.g. {'col name': 'bigint', 'col2 name': 'int'}) - description : str, optional - Glue/Athena catalog: Table description - parameters : Dict[str, str], optional - Glue/Athena catalog: Key/value pairs to tag the table. - columns_comments : Dict[str, str], optional - Glue/Athena catalog: - Columns names and the related comments (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}). - regular_partitions : bool - Create regular partitions (Non projected partitions) on Glue Catalog. - Disable when you will work only with Partition Projection. - Keep enabled even when working with projections is useful to keep - Redshift Spectrum working with the regular partitions. - projection_enabled : bool - Enable Partition Projection on Athena (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html) - projection_types : Optional[Dict[str, str]] - Dictionary of partitions names and Athena projections types. - Valid types: "enum", "integer", "date", "injected" - https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html - (e.g. {'col_name': 'enum', 'col2_name': 'integer'}) - projection_ranges: Optional[Dict[str, str]] - Dictionary of partitions names and Athena projections ranges. - https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html - (e.g. {'col_name': '0,10', 'col2_name': '-1,8675309'}) - projection_values: Optional[Dict[str, str]] - Dictionary of partitions names and Athena projections values. - https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html - (e.g. {'col_name': 'A,B,Unknown', 'col2_name': 'foo,boo,bar'}) - projection_intervals: Optional[Dict[str, str]] - Dictionary of partitions names and Athena projections intervals. - https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html - (e.g. {'col_name': '1', 'col2_name': '5'}) - projection_digits: Optional[Dict[str, str]] - Dictionary of partitions names and Athena projections digits. - https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html - (e.g. {'col_name': '1', 'col2_name': '2'}) - - Returns - ------- - Dict[str, Union[List[str], Dict[str, List[str]]]] - Dictionary with: - 'paths': List of all stored files paths on S3. - 'partitions_values': Dictionary of partitions added with keys as S3 path locations - and values as a list of partitions values as str. 
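To make the projection arguments above concrete, here is a minimal sketch (bucket, database, table, and column names are hypothetical) that writes a dataset partitioned by an integer `year` column and registers it with Partition Projection enabled; it complements the doctest examples that follow:

```python
import awswrangler as wr
import pandas as pd

df = pd.DataFrame({"value": [1.0, 2.0], "year": [2019, 2020]})

# Hypothetical names; the projection_* arguments mirror the parameters documented above.
wr.s3.to_parquet(
    df=df,
    path="s3://my-bucket/my-prefix/",
    dataset=True,
    partition_cols=["year"],
    database="my_db",
    table="my_table",
    regular_partitions=True,  # keep classic partitions so Redshift Spectrum still works
    projection_enabled=True,  # enable Athena Partition Projection
    projection_types={"year": "integer"},
    projection_ranges={"year": "2019,2020"},
)
```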
- - Examples - -------- - Writing single file - - >>> import awswrangler as wr - >>> import pandas as pd - >>> wr.s3.to_parquet( - ... df=pd.DataFrame({'col': [1, 2, 3]}), - ... path='s3://bucket/prefix/my_file.parquet', - ... ) - { - 'paths': ['s3://bucket/prefix/my_file.parquet'], - 'partitions_values': {} - } - - Writing single file encrypted with a KMS key - - >>> import awswrangler as wr - >>> import pandas as pd - >>> wr.s3.to_parquet( - ... df=pd.DataFrame({'col': [1, 2, 3]}), - ... path='s3://bucket/prefix/my_file.parquet', - ... s3_additional_kwargs={ - ... 'ServerSideEncryption': 'aws:kms', - ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' - ... } - ... ) - { - 'paths': ['s3://bucket/prefix/my_file.parquet'], - 'partitions_values': {} - } - - Writing partitioned dataset - - >>> import awswrangler as wr - >>> import pandas as pd - >>> wr.s3.to_parquet( - ... df=pd.DataFrame({ - ... 'col': [1, 2, 3], - ... 'col2': ['A', 'A', 'B'] - ... }), - ... path='s3://bucket/prefix', - ... dataset=True, - ... partition_cols=['col2'] - ... ) - { - 'paths': ['s3://.../col2=A/x.parquet', 's3://.../col2=B/y.parquet'], - 'partitions_values: { - 's3://.../col2=A/': ['A'], - 's3://.../col2=B/': ['B'] - } - } - - Writing dataset to S3 with metadata on Athena/Glue Catalog. - - >>> import awswrangler as wr - >>> import pandas as pd - >>> wr.s3.to_parquet( - ... df=pd.DataFrame({ - ... 'col': [1, 2, 3], - ... 'col2': ['A', 'A', 'B'] - ... }), - ... path='s3://bucket/prefix', - ... dataset=True, - ... partition_cols=['col2'], - ... database='default', # Athena/Glue database - ... table='my_table' # Athena/Glue table - ... ) - { - 'paths': ['s3://.../col2=A/x.parquet', 's3://.../col2=B/y.parquet'], - 'partitions_values: { - 's3://.../col2=A/': ['A'], - 's3://.../col2=B/': ['B'] - } - } - - Writing dataset casting empty column data type - - >>> import awswrangler as wr - >>> import pandas as pd - >>> wr.s3.to_parquet( - ... df=pd.DataFrame({ - ... 'col': [1, 2, 3], - ... 'col2': ['A', 'A', 'B'], - ... 'col3': [None, None, None] - ... }), - ... path='s3://bucket/prefix', - ... dataset=True, - ... database='default', # Athena/Glue database - ... table='my_table' # Athena/Glue table - ... dtype={'col3': 'date'} - ... ) - { - 'paths': ['s3://.../x.parquet'], - 'partitions_values: {} - } - - """ - if (database is None) ^ (table is None): - raise exceptions.InvalidArgumentCombination( - "Please pass database and table arguments to be able to store the metadata into the Athena/Glue Catalog." 
- ) - if df.empty is True: - raise exceptions.EmptyDataFrame() - - # Sanitize table to respect Athena's standards - partition_cols = partition_cols if partition_cols else [] - dtype = dtype if dtype else {} - partitions_values: Dict[str, List[str]] = {} - df = catalog.sanitize_dataframe_columns_names(df=df) - partition_cols = [catalog.sanitize_column_name(p) for p in partition_cols] - dtype = {catalog.sanitize_column_name(k): v.lower() for k, v in dtype.items()} - df = catalog.drop_duplicated_columns(df=df) - - session: boto3.Session = _utils.ensure_session(session=boto3_session) - cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) - fs: s3fs.S3FileSystem = _utils.get_fs(session=session, s3_additional_kwargs=s3_additional_kwargs) - compression_ext: Optional[str] = _COMPRESSION_2_EXT.get(compression, None) - if compression_ext is None: - raise exceptions.InvalidCompression(f"{compression} is invalid, please use None, snappy or gzip.") - if dataset is False: - if path.endswith("/"): # pragma: no cover - raise exceptions.InvalidArgumentValue( - "If , the argument should be a object path, not a directory." - ) - if partition_cols: - raise exceptions.InvalidArgumentCombination("Please, pass dataset=True to be able to use partition_cols.") - if mode is not None: - raise exceptions.InvalidArgumentCombination("Please pass dataset=True to be able to use mode.") - if any(arg is not None for arg in (database, table, description, parameters)): - raise exceptions.InvalidArgumentCombination( - "Please pass dataset=True to be able to use any one of these " - "arguments: database, table, description, parameters, " - "columns_comments." - ) - df = _data_types.cast_pandas_with_athena_types(df=df, dtype=dtype) - schema: pa.Schema = _data_types.pyarrow_schema_from_pandas( - df=df, index=index, ignore_cols=partition_cols, dtype=dtype - ) - _logger.debug("schema: \n%s", schema) - paths = [ - _to_parquet_file( - df=df, path=path, schema=schema, index=index, compression=compression, cpus=cpus, fs=fs, dtype=dtype - ) - ] - else: - mode = "append" if mode is None else mode - if ( - (mode in ("append", "overwrite_partitions")) and (database is not None) and (table is not None) - ): # Fetching Catalog Types - catalog_types: Optional[Dict[str, str]] = catalog.get_table_types( - database=database, table=table, boto3_session=session - ) - if catalog_types is not None: - for k, v in catalog_types.items(): - dtype[k] = v - paths, partitions_values = _to_parquet_dataset( - df=df, - path=path, - index=index, - compression=compression, - compression_ext=compression_ext, - cpus=cpus, - fs=fs, - use_threads=use_threads, - partition_cols=partition_cols, - dtype=dtype, - mode=mode, - boto3_session=session, - ) - if (database is not None) and (table is not None): - columns_types, partitions_types = _data_types.athena_types_from_pandas_partitioned( - df=df, index=index, partition_cols=partition_cols, dtype=dtype - ) - catalog.create_parquet_table( - database=database, - table=table, - path=path, - columns_types=columns_types, - partitions_types=partitions_types, - compression=compression, - description=description, - parameters=parameters, - columns_comments=columns_comments, - boto3_session=session, - mode=mode, - catalog_versioning=catalog_versioning, - projection_enabled=projection_enabled, - projection_types=projection_types, - projection_ranges=projection_ranges, - projection_values=projection_values, - projection_intervals=projection_intervals, - projection_digits=projection_digits, - ) - if partitions_values and 
(regular_partitions is True): - _logger.debug("partitions_values:\n%s", partitions_values) - catalog.add_parquet_partitions( - database=database, - table=table, - partitions_values=partitions_values, - compression=compression, - boto3_session=session, - ) - return {"paths": paths, "partitions_values": partitions_values} - - -def _to_parquet_dataset( - df: pd.DataFrame, - path: str, - index: bool, - compression: Optional[str], - compression_ext: str, - cpus: int, - fs: s3fs.S3FileSystem, - use_threads: bool, - mode: str, - dtype: Dict[str, str], - partition_cols: Optional[List[str]] = None, - boto3_session: Optional[boto3.Session] = None, -) -> Tuple[List[str], Dict[str, List[str]]]: - paths: List[str] = [] - partitions_values: Dict[str, List[str]] = {} - path = path if path[-1] == "/" else f"{path}/" - if mode not in ["append", "overwrite", "overwrite_partitions"]: - raise exceptions.InvalidArgumentValue( - f"{mode} is a invalid mode, please use append, overwrite or overwrite_partitions." - ) - if (mode == "overwrite") or ((mode == "overwrite_partitions") and (not partition_cols)): - delete_objects(path=path, use_threads=use_threads, boto3_session=boto3_session) - df = _data_types.cast_pandas_with_athena_types(df=df, dtype=dtype) - schema: pa.Schema = _data_types.pyarrow_schema_from_pandas( - df=df, index=index, ignore_cols=partition_cols, dtype=dtype - ) - _logger.debug("schema: \n%s", schema) - if not partition_cols: - file_path: str = f"{path}{uuid.uuid4().hex}{compression_ext}.parquet" - _to_parquet_file( - df=df, schema=schema, path=file_path, index=index, compression=compression, cpus=cpus, fs=fs, dtype=dtype - ) - paths.append(file_path) - else: - for keys, subgroup in df.groupby(by=partition_cols, observed=True): - subgroup = subgroup.drop(partition_cols, axis="columns") - keys = (keys,) if not isinstance(keys, tuple) else keys - subdir = "/".join([f"{name}={val}" for name, val in zip(partition_cols, keys)]) - prefix: str = f"{path}{subdir}/" - if mode == "overwrite_partitions": - delete_objects(path=prefix, use_threads=use_threads, boto3_session=boto3_session) - file_path = f"{prefix}{uuid.uuid4().hex}{compression_ext}.parquet" - _to_parquet_file( - df=subgroup, - schema=schema, - path=file_path, - index=index, - compression=compression, - cpus=cpus, - fs=fs, - dtype=dtype, - ) - paths.append(file_path) - partitions_values[prefix] = [str(k) for k in keys] - return paths, partitions_values - - -def _to_parquet_file( - df: pd.DataFrame, - path: str, - schema: pa.Schema, - index: bool, - compression: Optional[str], - cpus: int, - fs: s3fs.S3FileSystem, - dtype: Dict[str, str], -) -> str: - table: pa.Table = pyarrow.Table.from_pandas(df=df, schema=schema, nthreads=cpus, preserve_index=index, safe=True) - for col_name, col_type in dtype.items(): - if col_name in table.column_names: - col_index = table.column_names.index(col_name) - pyarrow_dtype = _data_types.athena2pyarrow(col_type) - field = pa.field(name=col_name, type=pyarrow_dtype) - table = table.set_column(col_index, field, table.column(col_name).cast(pyarrow_dtype)) - _logger.debug("Casting column %s (%s) to %s (%s)", col_name, col_index, col_type, pyarrow_dtype) - pyarrow.parquet.write_table( - table=table, - where=path, - write_statistics=True, - use_dictionary=True, - filesystem=fs, - coerce_timestamps="ms", - compression=compression, - flavor="spark", - ) - return path - - -def read_csv( - path: Union[str, List[str]], - use_threads: bool = True, - boto3_session: Optional[boto3.Session] = None, - s3_additional_kwargs: 
Optional[Dict[str, str]] = None, - chunksize: Optional[int] = None, - dataset: bool = False, - **pandas_kwargs, -) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: - """Read CSV file(s) from from a received S3 prefix or list of S3 objects paths. - - Note - ---- - For partial and gradual reading use the argument ``chunksize`` instead of ``iterator``. - - Note - ---- - In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). - - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. ``[s3://bucket/key0, s3://bucket/key1]``). - use_threads : bool - True to enable concurrent requests, False to disable multiple threads. - If enabled os.cpu_count() will be used as the max number of threads. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - s3_additional_kwargs: - Forward to s3fs, useful for server side encryption - https://s3fs.readthedocs.io/en/latest/#serverside-encryption - chunksize: int, optional - If specified, return an generator where chunksize is the number of rows to include in each chunk. - dataset: bool - If `True` read a CSV dataset instead of simple file(s) loading all the related partitions as columns. - pandas_kwargs: - keyword arguments forwarded to pandas.read_csv(). - https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html - - Returns - ------- - Union[pandas.DataFrame, Generator[pandas.DataFrame, None, None]] - Pandas DataFrame or a Generator in case of `chunksize != None`. - - Examples - -------- - Reading all CSV files under a prefix - - >>> import awswrangler as wr - >>> df = wr.s3.read_csv(path='s3://bucket/prefix/') - - Reading all CSV files under a prefix encrypted with a KMS key - - >>> import awswrangler as wr - >>> df = wr.s3.read_csv( - ... path='s3://bucket/prefix/', - ... s3_additional_kwargs={ - ... 'ServerSideEncryption': 'aws:kms', - ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' - ... } - ... ) - - Reading all CSV files from a list - - >>> import awswrangler as wr - >>> df = wr.s3.read_csv(path=['s3://bucket/filename0.csv', 's3://bucket/filename1.csv']) - - Reading in chunks of 100 lines - - >>> import awswrangler as wr - >>> dfs = wr.s3.read_csv(path=['s3://bucket/filename0.csv', 's3://bucket/filename1.csv'], chunksize=100) - >>> for df in dfs: - >>> print(df) # 100 lines Pandas DataFrame - - """ - return _read_text( - parser_func=pd.read_csv, - path=path, - use_threads=use_threads, - boto3_session=boto3_session, - s3_additional_kwargs=s3_additional_kwargs, - chunksize=chunksize, - dataset=dataset, - **pandas_kwargs, - ) - - -def read_fwf( - path: Union[str, List[str]], - use_threads: bool = True, - boto3_session: Optional[boto3.Session] = None, - s3_additional_kwargs: Optional[Dict[str, str]] = None, - chunksize: Optional[int] = None, - dataset: bool = False, - **pandas_kwargs, -) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: - """Read fixed-width formatted file(s) from from a received S3 prefix or list of S3 objects paths. - - Note - ---- - For partial and gradual reading use the argument ``chunksize`` instead of ``iterator``. - - Note - ---- - In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). - - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. ``[s3://bucket/key0, s3://bucket/key1]``). 
- use_threads : bool - True to enable concurrent requests, False to disable multiple threads. - If enabled os.cpu_count() will be used as the max number of threads. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - s3_additional_kwargs: - Forward to s3fs, useful for server side encryption - https://s3fs.readthedocs.io/en/latest/#serverside-encryption - chunksize: int, optional - If specified, return an generator where chunksize is the number of rows to include in each chunk. - dataset: bool - If `True` read a FWF dataset instead of simple file(s) loading all the related partitions as columns. - pandas_kwargs: - keyword arguments forwarded to pandas.read_fwf(). - https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_fwf.html - - Returns - ------- - Union[pandas.DataFrame, Generator[pandas.DataFrame, None, None]] - Pandas DataFrame or a Generator in case of `chunksize != None`. - - Examples - -------- - Reading all fixed-width formatted (FWF) files under a prefix - - >>> import awswrangler as wr - >>> df = wr.s3.read_fwf(path='s3://bucket/prefix/') - - Reading all fixed-width formatted (FWF) files under a prefix encrypted with a KMS key - - >>> import awswrangler as wr - >>> df = wr.s3.read_fwf( - ... path='s3://bucket/prefix/', - ... s3_additional_kwargs={ - ... 'ServerSideEncryption': 'aws:kms', - ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' - ... } - ... ) - - Reading all fixed-width formatted (FWF) files from a list - - >>> import awswrangler as wr - >>> df = wr.s3.read_fwf(path=['s3://bucket/filename0.txt', 's3://bucket/filename1.txt']) - - Reading in chunks of 100 lines - - >>> import awswrangler as wr - >>> dfs = wr.s3.read_fwf(path=['s3://bucket/filename0.txt', 's3://bucket/filename1.txt'], chunksize=100) - >>> for df in dfs: - >>> print(df) # 100 lines Pandas DataFrame - - """ - return _read_text( - parser_func=pd.read_fwf, - path=path, - use_threads=use_threads, - boto3_session=boto3_session, - s3_additional_kwargs=s3_additional_kwargs, - chunksize=chunksize, - dataset=dataset, - **pandas_kwargs, - ) - - -def read_json( - path: Union[str, List[str]], - use_threads: bool = True, - boto3_session: Optional[boto3.Session] = None, - s3_additional_kwargs: Optional[Dict[str, str]] = None, - chunksize: Optional[int] = None, - dataset: bool = False, - **pandas_kwargs, -) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: - """Read JSON file(s) from from a received S3 prefix or list of S3 objects paths. - - Note - ---- - For partial and gradual reading use the argument ``chunksize`` instead of ``iterator``. - - Note - ---- - In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). - - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. ``[s3://bucket/key0, s3://bucket/key1]``). - use_threads : bool - True to enable concurrent requests, False to disable multiple threads. - If enabled os.cpu_count() will be used as the max number of threads. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - s3_additional_kwargs: - Forward to s3fs, useful for server side encryption - https://s3fs.readthedocs.io/en/latest/#serverside-encryption - chunksize: int, optional - If specified, return an generator where chunksize is the number of rows to include in each chunk. 
- dataset: bool - If `True` read a JSON dataset instead of simple file(s) loading all the related partitions as columns. - If `True`, the `lines=True` will be assumed by default. - pandas_kwargs: - keyword arguments forwarded to pandas.read_json(). - https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_json.html - - Returns - ------- - Union[pandas.DataFrame, Generator[pandas.DataFrame, None, None]] - Pandas DataFrame or a Generator in case of `chunksize != None`. - - Examples - -------- - Reading all JSON files under a prefix - - >>> import awswrangler as wr - >>> df = wr.s3.read_json(path='s3://bucket/prefix/') - - Reading all JSON files under a prefix encrypted with a KMS key - - >>> import awswrangler as wr - >>> df = wr.s3.read_json( - ... path='s3://bucket/prefix/', - ... s3_additional_kwargs={ - ... 'ServerSideEncryption': 'aws:kms', - ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' - ... } - ... ) - - Reading all JSON files from a list - - >>> import awswrangler as wr - >>> df = wr.s3.read_json(path=['s3://bucket/filename0.json', 's3://bucket/filename1.json']) - - Reading in chunks of 100 lines - - >>> import awswrangler as wr - >>> dfs = wr.s3.read_json(path=['s3://bucket/filename0.json', 's3://bucket/filename1.json'], chunksize=100) - >>> for df in dfs: - >>> print(df) # 100 lines Pandas DataFrame - - """ - if (dataset is True) and ("lines" not in pandas_kwargs): - pandas_kwargs["lines"] = True - return _read_text( - parser_func=pd.read_json, - path=path, - use_threads=use_threads, - boto3_session=boto3_session, - s3_additional_kwargs=s3_additional_kwargs, - chunksize=chunksize, - dataset=dataset, - **pandas_kwargs, - ) - - -def _read_text( - parser_func: Callable, - path: Union[str, List[str]], - use_threads: bool = True, - boto3_session: Optional[boto3.Session] = None, - s3_additional_kwargs: Optional[Dict[str, str]] = None, - chunksize: Optional[int] = None, - dataset: bool = False, - **pandas_kwargs, -) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: - if "iterator" in pandas_kwargs: - raise exceptions.InvalidArgument("Please, use chunksize instead of iterator.") - session: boto3.Session = _utils.ensure_session(session=boto3_session) - if (dataset is True) and (not isinstance(path, str)): # pragma: no cover - raise exceptions.InvalidArgument("The path argument must be a string Amazon S3 prefix if dataset=True.") - if dataset is True: - path_root: str = str(path) - else: - path_root = "" - paths: List[str] = _path2list(path=path, boto3_session=session) - _logger.debug("paths:\n%s", paths) - if chunksize is not None: - dfs: Iterator[pd.DataFrame] = _read_text_chunksize( - parser_func=parser_func, - paths=paths, - boto3_session=session, - chunksize=chunksize, - pandas_args=pandas_kwargs, - s3_additional_kwargs=s3_additional_kwargs, - dataset=dataset, - path_root=path_root, - ) - return dfs - if (use_threads is False) or (boto3_session is not None): - df: pd.DataFrame = pd.concat( - objs=[ - _read_text_full( - parser_func=parser_func, - path=p, - boto3_session=session, - pandas_args=pandas_kwargs, - s3_additional_kwargs=s3_additional_kwargs, - dataset=dataset, - path_root=path_root, - ) - for p in paths - ], - ignore_index=True, - sort=False, - ) - else: - cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) - with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor: - df = pd.concat( - objs=executor.map( - _read_text_full, - repeat(parser_func), - repeat(path_root), - paths, - repeat(None), # Boto3.Session - repeat(pandas_kwargs), - 
repeat(s3_additional_kwargs), - repeat(dataset), - ), - ignore_index=True, - sort=False, - ) - return df - - -def _read_text_chunksize( - parser_func: Callable, - path_root: str, - paths: List[str], - boto3_session: boto3.Session, - chunksize: int, - pandas_args: Dict[str, Any], - s3_additional_kwargs: Optional[Dict[str, str]] = None, - dataset: bool = False, -) -> Iterator[pd.DataFrame]: - fs: s3fs.S3FileSystem = _utils.get_fs(session=boto3_session, s3_additional_kwargs=s3_additional_kwargs) - for path in paths: - _logger.debug("path: %s", path) - partitions: Dict[str, Any] = {} - if dataset is True: - partitions = _utils.extract_partitions_from_path(path_root=path_root, path=path) - if pandas_args.get("compression", "infer") == "infer": - pandas_args["compression"] = infer_compression(path, compression="infer") - mode: str = "r" if pandas_args.get("compression") is None else "rb" - with fs.open(path, mode) as f: - reader: pandas.io.parsers.TextFileReader = parser_func(f, chunksize=chunksize, **pandas_args) - for df in reader: - if dataset is True: - for column_name, value in partitions.items(): - df[column_name] = value - yield df - - -def _read_text_full( - parser_func: Callable, - path_root: str, - path: str, - boto3_session: boto3.Session, - pandas_args: Dict[str, Any], - s3_additional_kwargs: Optional[Dict[str, str]] = None, - dataset: bool = False, -) -> pd.DataFrame: - fs: s3fs.S3FileSystem = _utils.get_fs(session=boto3_session, s3_additional_kwargs=s3_additional_kwargs) - if pandas_args.get("compression", "infer") == "infer": - pandas_args["compression"] = infer_compression(path, compression="infer") - mode: str = "r" if pandas_args.get("compression") is None else "rb" - encoding: Optional[str] = pandas_args.get("encoding", None) - newline: Optional[str] = pandas_args.get("lineterminator", None) - with fs.open(path=path, mode=mode, encoding=encoding, newline=newline) as f: - df: pd.DataFrame = parser_func(f, **pandas_args) - if dataset is True: - partitions: Dict[str, Any] = _utils.extract_partitions_from_path(path_root=path_root, path=path) - for column_name, value in partitions.items(): - df[column_name] = value - return df - - -def _read_parquet_init( - path: Union[str, List[str]], - filters: Optional[Union[List[Tuple], List[List[Tuple]]]] = None, - categories: List[str] = None, - validate_schema: bool = True, - dataset: bool = False, - use_threads: bool = True, - boto3_session: Optional[boto3.Session] = None, - s3_additional_kwargs: Optional[Dict[str, str]] = None, -) -> pyarrow.parquet.ParquetDataset: - """Encapsulate all initialization before the use of the pyarrow.parquet.ParquetDataset.""" - session: boto3.Session = _utils.ensure_session(session=boto3_session) - if dataset is False: - path_or_paths: Union[str, List[str]] = _path2list(path=path, boto3_session=session) - elif isinstance(path, str): - path_or_paths = path[:-1] if path.endswith("/") else path - else: - path_or_paths = path - _logger.debug("path_or_paths: %s", path_or_paths) - fs: s3fs.S3FileSystem = _utils.get_fs(session=session, s3_additional_kwargs=s3_additional_kwargs) - cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) - data: pyarrow.parquet.ParquetDataset = pyarrow.parquet.ParquetDataset( - path_or_paths=path_or_paths, - filesystem=fs, - metadata_nthreads=cpus, - filters=filters, - read_dictionary=categories, - validate_schema=validate_schema, - split_row_groups=False, - use_legacy_dataset=True, - ) - return data - - -def read_parquet( - path: Union[str, List[str]], - filters: 
Optional[Union[List[Tuple], List[List[Tuple]]]] = None,
- columns: Optional[List[str]] = None,
- validate_schema: bool = True,
- chunked: Union[bool, int] = False,
- dataset: bool = False,
- categories: List[str] = None,
- use_threads: bool = True,
- boto3_session: Optional[boto3.Session] = None,
- s3_additional_kwargs: Optional[Dict[str, str]] = None,
-) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
- """Read Apache Parquet file(s) from a received S3 prefix or list of S3 objects paths.
-
- The concept of Dataset goes beyond the simple idea of files and enables more
- complex features like partitioning and catalog integration (AWS Glue Catalog).
-
- Note
- ----
- ``Batching`` (`chunked` argument) (Memory Friendly):
-
- Enables the function to return an Iterable of DataFrames instead of a regular DataFrame.
-
- There are two batching strategies on Wrangler:
-
- - If **chunked=True**, a new DataFrame will be returned for each file in your path/dataset.
-
- - If **chunked=INTEGER**, Wrangler will iterate on the data by number of rows equal to the received INTEGER.
-
- `P.S.` `chunked=True` is faster and uses less memory while `chunked=INTEGER` is more precise
- in number of rows for each DataFrame.
-
-
- Note
- ----
- In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().
-
- Parameters
- ----------
- path : Union[str, List[str]]
- S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]).
- filters: Union[List[Tuple], List[List[Tuple]]], optional
- List of filters to apply on PARTITION columns (PUSH-DOWN filter), like ``[[('x', '=', 0), ...], ...]``.
- Ignored if `dataset=False`.
- columns : List[str], optional
- Names of columns to read from the file(s).
- validate_schema:
- Check that individual file schemas are all the same / compatible. Schemas within a
- folder prefix should all be the same. Disable if you have schemas that are different
- and want to disable this check.
- chunked : Union[int, bool]
- If passed, will split the data in an Iterable of DataFrames (Memory friendly).
- If `True`, Wrangler will iterate on the data by files in the most efficient way without guarantee of chunksize.
- If an `INTEGER` is passed, Wrangler will iterate on the data by number of rows equal to the received INTEGER.
- dataset: bool
- If `True` read a parquet dataset instead of simple file(s) loading all the related partitions as columns.
- categories: List[str], optional
- List of column names that should be returned as pandas.Categorical.
- Recommended for memory restricted environments.
- use_threads : bool
- True to enable concurrent requests, False to disable multiple threads.
- If enabled os.cpu_count() will be used as the max number of threads.
- boto3_session : boto3.Session(), optional
- Boto3 Session. The default boto3 session will be used if boto3_session receives None.
- s3_additional_kwargs:
- Forward to s3fs, useful for server side encryption
- https://s3fs.readthedocs.io/en/latest/#serverside-encryption
-
- Returns
- -------
- Union[pandas.DataFrame, Generator[pandas.DataFrame, None, None]]
- Pandas DataFrame or a Generator in case of `chunked=True`.
-
- Examples
- --------
- Reading all Parquet files under a prefix
-
- >>> import awswrangler as wr
- >>> df = wr.s3.read_parquet(path='s3://bucket/prefix/')
-
- Reading all Parquet files under a prefix encrypted with a KMS key
-
- >>> import awswrangler as wr
- >>> df = wr.s3.read_parquet(
- ... path='s3://bucket/prefix/',
- ...
s3_additional_kwargs={ - ... 'ServerSideEncryption': 'aws:kms', - ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' - ... } - ... ) - - Reading all Parquet files from a list - - >>> import awswrangler as wr - >>> df = wr.s3.read_parquet(path=['s3://bucket/filename0.parquet', 's3://bucket/filename1.parquet']) - - Reading in chunks (Chunk by file) - - >>> import awswrangler as wr - >>> dfs = wr.s3.read_parquet(path=['s3://bucket/filename0.csv', 's3://bucket/filename1.csv'], chunked=True) - >>> for df in dfs: - >>> print(df) # Smaller Pandas DataFrame - - Reading in chunks (Chunk by 1MM rows) - - >>> import awswrangler as wr - >>> dfs = wr.s3.read_parquet(path=['s3://bucket/filename0.csv', 's3://bucket/filename1.csv'], chunked=1_000_000) - >>> for df in dfs: - >>> print(df) # 1MM Pandas DataFrame - - """ - data: pyarrow.parquet.ParquetDataset = _read_parquet_init( - path=path, - filters=filters, - dataset=dataset, - categories=categories, - validate_schema=validate_schema, - use_threads=use_threads, - boto3_session=boto3_session, - s3_additional_kwargs=s3_additional_kwargs, - ) - _logger.debug("pyarrow.parquet.ParquetDataset initialized.") - if chunked is False: - return _read_parquet( - data=data, columns=columns, categories=categories, use_threads=use_threads, validate_schema=validate_schema - ) - return _read_parquet_chunked( - data=data, columns=columns, categories=categories, chunked=chunked, use_threads=use_threads - ) - - -def _read_parquet( - data: pyarrow.parquet.ParquetDataset, - columns: Optional[List[str]] = None, - categories: List[str] = None, - use_threads: bool = True, - validate_schema: bool = True, -) -> pd.DataFrame: - tables: List[pa.Table] = [] - _logger.debug("Reading pieces...") - for piece in data.pieces: - table: pa.Table = piece.read( - columns=columns, use_threads=use_threads, partitions=data.partitions, use_pandas_metadata=False - ) - _logger.debug("Appending piece in the list...") - tables.append(table) - promote: bool = not validate_schema - _logger.debug("Concating pieces...") - table = pa.lib.concat_tables(tables, promote=promote) - _logger.debug("Converting PyArrow table to Pandas DataFrame...") - return table.to_pandas( - use_threads=use_threads, - split_blocks=True, - self_destruct=True, - integer_object_nulls=False, - date_as_object=True, - ignore_metadata=True, - categories=categories, - types_mapper=_data_types.pyarrow2pandas_extension, - ) - - -def _read_parquet_chunked( - data: pyarrow.parquet.ParquetDataset, - columns: Optional[List[str]] = None, - categories: List[str] = None, - chunked: Union[bool, int] = True, - use_threads: bool = True, -) -> Iterator[pd.DataFrame]: - next_slice: Optional[pd.DataFrame] = None - for piece in data.pieces: - df: pd.DataFrame = _table2df( - table=piece.read( - columns=columns, use_threads=use_threads, partitions=data.partitions, use_pandas_metadata=False - ), - categories=categories, - use_threads=use_threads, - ) - if chunked is True: - yield df - else: - if next_slice is not None: - df = pd.concat(objs=[next_slice, df], ignore_index=True, sort=False) - while len(df.index) >= chunked: - yield df.iloc[:chunked] - df = df.iloc[chunked:] - if df.empty: - next_slice = None - else: - next_slice = df - if next_slice is not None: - yield next_slice - - -def _table2df(table: pa.Table, categories: List[str] = None, use_threads: bool = True) -> pd.DataFrame: - return table.to_pandas( - use_threads=use_threads, - split_blocks=True, - self_destruct=True, - integer_object_nulls=False, - date_as_object=True, - ignore_metadata=True, - 
categories=categories, - types_mapper=_data_types.pyarrow2pandas_extension, - ) - - -def read_parquet_metadata( - path: Union[str, List[str]], - dtype: Optional[Dict[str, str]] = None, - sampling: float = 1.0, - dataset: bool = False, - use_threads: bool = True, - boto3_session: Optional[boto3.Session] = None, -) -> Tuple[Dict[str, str], Optional[Dict[str, str]]]: - """Read Apache Parquet file(s) metadata from from a received S3 prefix or list of S3 objects paths. - - The concept of Dataset goes beyond the simple idea of files and enable more - complex features like partitioning and catalog integration (AWS Glue Catalog). - - Note - ---- - In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). - - Parameters - ---------- - path : Union[str, List[str]] - S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). - dtype : Dict[str, str], optional - Dictionary of columns names and Athena/Glue types to be casted. - Useful when you have columns with undetermined data types as partitions columns. - (e.g. {'col name': 'bigint', 'col2 name': 'int'}) - sampling : float - Random sample ratio of files that will have the metadata inspected. - Must be `0.0 < sampling <= 1.0`. - The higher, the more accurate. - The lower, the faster. - dataset: bool - If True read a parquet dataset instead of simple file(s) loading all the related partitions as columns. - use_threads : bool - True to enable concurrent requests, False to disable multiple threads. - If enabled os.cpu_count() will be used as the max number of threads. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - Tuple[Dict[str, str], Optional[Dict[str, str]]] - columns_types: Dictionary with keys as column names and vales as - data types (e.g. {'col0': 'bigint', 'col1': 'double'}). / - partitions_types: Dictionary with keys as partition names - and values as data types (e.g. {'col2': 'date'}). - - Examples - -------- - Reading all Parquet files (with partitions) metadata under a prefix - - >>> import awswrangler as wr - >>> columns_types, partitions_types = wr.s3.read_parquet_metadata(path='s3://bucket/prefix/', dataset=True) - - Reading all Parquet files metadata from a list - - >>> import awswrangler as wr - >>> columns_types, partitions_types = wr.s3.read_parquet_metadata(path=[ - ... 's3://bucket/filename0.parquet', - ... 's3://bucket/filename1.parquet' - ... 
]) - - """ - return _read_parquet_metadata( - path=path, dtype=dtype, sampling=sampling, dataset=dataset, use_threads=use_threads, boto3_session=boto3_session - )[:2] - - -def _read_parquet_metadata( - path: Union[str, List[str]], - dtype: Optional[Dict[str, str]], - sampling: float, - dataset: bool, - use_threads: bool, - boto3_session: Optional[boto3.Session], -) -> Tuple[Dict[str, str], Optional[Dict[str, str]], Optional[Dict[str, List[str]]]]: - session: boto3.Session = _utils.ensure_session(session=boto3_session) - if dataset is True: - if isinstance(path, str): - _path: Optional[str] = path if path.endswith("/") else f"{path}/" - paths: List[str] = _path2list(path=_path, boto3_session=session) - else: # pragma: no cover - raise exceptions.InvalidArgumentType("Argument must be str if dataset=True.") - else: - if isinstance(path, str): - _path = None - paths = _path2list(path=path, boto3_session=session) - elif isinstance(path, list): - _path = None - paths = path - else: # pragma: no cover - raise exceptions.InvalidArgumentType(f"Argument path must be str or List[str] instead of {type(path)}.") - schemas: List[Dict[str, str]] = [ - _read_parquet_metadata_file(path=x, use_threads=use_threads, boto3_session=session) - for x in _utils.list_sampling(lst=paths, sampling=sampling) - ] - _logger.debug("schemas: %s", schemas) - columns_types: Dict[str, str] = {} - for schema in schemas: - for column, _dtype in schema.items(): - if (column in columns_types) and (columns_types[column] != _dtype): # pragma: no cover - raise exceptions.InvalidSchemaConvergence( - f"Was detect at least 2 different types in column {column} ({columns_types[column]} and {dtype})." - ) - columns_types[column] = _dtype - partitions_types: Optional[Dict[str, str]] = None - partitions_values: Optional[Dict[str, List[str]]] = None - if (dataset is True) and (_path is not None): - partitions_types, partitions_values = _utils.extract_partitions_metadata_from_paths(path=_path, paths=paths) - if dtype: - for k, v in dtype.items(): - if columns_types and k in columns_types: - columns_types[k] = v - if partitions_types and k in partitions_types: - partitions_types[k] = v - _logger.debug("columns_types: %s", columns_types) - return columns_types, partitions_types, partitions_values - - -def _read_parquet_metadata_file(path: str, use_threads: bool, boto3_session: boto3.Session) -> Dict[str, str]: - data: pyarrow.parquet.ParquetDataset = _read_parquet_init( - path=path, filters=None, dataset=False, use_threads=use_threads, boto3_session=boto3_session - ) - return _data_types.athena_types_from_pyarrow_schema(schema=data.schema.to_arrow_schema(), partitions=None)[0] - - -def store_parquet_metadata( # pylint: disable=too-many-arguments - path: str, - database: str, - table: str, - dtype: Optional[Dict[str, str]] = None, - sampling: float = 1.0, - dataset: bool = False, - use_threads: bool = True, - description: Optional[str] = None, - parameters: Optional[Dict[str, str]] = None, - columns_comments: Optional[Dict[str, str]] = None, - compression: Optional[str] = None, - mode: str = "overwrite", - catalog_versioning: bool = False, - regular_partitions: bool = True, - projection_enabled: bool = False, - projection_types: Optional[Dict[str, str]] = None, - projection_ranges: Optional[Dict[str, str]] = None, - projection_values: Optional[Dict[str, str]] = None, - projection_intervals: Optional[Dict[str, str]] = None, - projection_digits: Optional[Dict[str, str]] = None, - boto3_session: Optional[boto3.Session] = None, -) -> 
Tuple[Dict[str, str], Optional[Dict[str, str]], Optional[Dict[str, List[str]]]]:
- """Infer and store parquet metadata on AWS Glue Catalog.
-
- Infer Apache Parquet file(s) metadata from a received S3 prefix or list of S3 objects paths
- and then stores it on AWS Glue Catalog, including all inferred partitions
- (no need of 'MSCK REPAIR TABLE').
-
- The concept of Dataset goes beyond the simple idea of files and enables more
- complex features like partitioning and catalog integration (AWS Glue Catalog).
-
- Note
- ----
- On `append` mode, the `parameters` will be upserted on an existing table.
-
- Note
- ----
- In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().
-
- Parameters
- ----------
- path : Union[str, List[str]]
- S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]).
- database : str
- Glue/Athena catalog: Database name.
- table : str
- Glue/Athena catalog: Table name.
- dtype : Dict[str, str], optional
- Dictionary of columns names and Athena/Glue types to be casted.
- Useful when you have columns with undetermined data types as partitions columns.
- (e.g. {'col name': 'bigint', 'col2 name': 'int'})
- sampling : float
- Random sample ratio of files that will have the metadata inspected.
- Must be `0.0 < sampling <= 1.0`.
- The higher, the more accurate.
- The lower, the faster.
- dataset: bool
- If True read a parquet dataset instead of simple file(s) loading all the related partitions as columns.
- use_threads : bool
- True to enable concurrent requests, False to disable multiple threads.
- If enabled os.cpu_count() will be used as the max number of threads.
- description: str, optional
- Glue/Athena catalog: Table description
- parameters: Dict[str, str], optional
- Glue/Athena catalog: Key/value pairs to tag the table.
- columns_comments: Dict[str, str], optional
- Glue/Athena catalog:
- Columns names and the related comments (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}).
- compression: str, optional
- Compression style (``None``, ``snappy``, ``gzip``, etc).
- mode: str
- 'overwrite' to recreate any possible existing table or 'append' to keep any possible existing table.
- catalog_versioning : bool
- If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it.
- regular_partitions : bool
- Create regular partitions (Non projected partitions) on Glue Catalog.
- Disable it if you will work only with Partition Projection.
- Keeping it enabled even when working with projections is useful to keep
- Redshift Spectrum working with the regular partitions.
- projection_enabled : bool
- Enable Partition Projection on Athena (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html)
- projection_types : Optional[Dict[str, str]]
- Dictionary of partitions names and Athena projections types.
- Valid types: "enum", "integer", "date", "injected"
- https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
- (e.g. {'col_name': 'enum', 'col2_name': 'integer'})
- projection_ranges: Optional[Dict[str, str]]
- Dictionary of partitions names and Athena projections ranges.
- https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
- (e.g. {'col_name': '0,10', 'col2_name': '-1,8675309'})
- projection_values: Optional[Dict[str, str]]
- Dictionary of partitions names and Athena projections values.
- https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html - (e.g. {'col_name': 'A,B,Unknown', 'col2_name': 'foo,boo,bar'}) - projection_intervals: Optional[Dict[str, str]] - Dictionary of partitions names and Athena projections intervals. - https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html - (e.g. {'col_name': '1', 'col2_name': '5'}) - projection_digits: Optional[Dict[str, str]] - Dictionary of partitions names and Athena projections digits. - https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html - (e.g. {'col_name': '1', 'col2_name': '2'}) - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - Tuple[Dict[str, str], Optional[Dict[str, str]], Optional[Dict[str, List[str]]]] - The metadata used to create the Glue Table. - columns_types: Dictionary with keys as column names and vales as - data types (e.g. {'col0': 'bigint', 'col1': 'double'}). / - partitions_types: Dictionary with keys as partition names - and values as data types (e.g. {'col2': 'date'}). / - partitions_values: Dictionary with keys as S3 path locations and values as a - list of partitions values as str (e.g. {'s3://bucket/prefix/y=2020/m=10/': ['2020', '10']}). - - Examples - -------- - Reading all Parquet files metadata under a prefix - - >>> import awswrangler as wr - >>> columns_types, partitions_types, partitions_values = wr.s3.store_parquet_metadata( - ... path='s3://bucket/prefix/', - ... database='...', - ... table='...', - ... dataset=True - ... ) - - """ - session: boto3.Session = _utils.ensure_session(session=boto3_session) - columns_types: Dict[str, str] - partitions_types: Optional[Dict[str, str]] - partitions_values: Optional[Dict[str, List[str]]] - columns_types, partitions_types, partitions_values = _read_parquet_metadata( - path=path, dtype=dtype, sampling=sampling, dataset=dataset, use_threads=use_threads, boto3_session=session - ) - _logger.debug("columns_types: %s", columns_types) - _logger.debug("partitions_types: %s", partitions_types) - _logger.debug("partitions_values: %s", partitions_values) - catalog.create_parquet_table( - database=database, - table=table, - path=path, - columns_types=columns_types, - partitions_types=partitions_types, - description=description, - parameters=parameters, - columns_comments=columns_comments, - mode=mode, - catalog_versioning=catalog_versioning, - projection_enabled=projection_enabled, - projection_types=projection_types, - projection_ranges=projection_ranges, - projection_values=projection_values, - projection_intervals=projection_intervals, - projection_digits=projection_digits, - boto3_session=session, - ) - if (partitions_types is not None) and (partitions_values is not None) and (regular_partitions is True): - catalog.add_parquet_partitions( - database=database, - table=table, - partitions_values=partitions_values, - compression=compression, - boto3_session=session, - ) - return columns_types, partitions_types, partitions_values - - -def wait_objects_exist( - paths: List[str], - delay: Optional[Union[int, float]] = None, - max_attempts: Optional[int] = None, - use_threads: bool = True, - boto3_session: Optional[boto3.Session] = None, -) -> None: - """Wait Amazon S3 objects exist. - - Polls S3.Client.head_object() every 5 seconds (default) until a successful - state is reached. An error is returned after 20 (default) failed checks. 
- https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Waiter.ObjectExists - - Note - ---- - In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). - - Parameters - ---------- - paths : List[str] - List of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). - delay : Union[int,float], optional - The amount of time in seconds to wait between attempts. Default: 5 - max_attempts : int, optional - The maximum number of attempts to be made. Default: 20 - use_threads : bool - True to enable concurrent requests, False to disable multiple threads. - If enabled os.cpu_count() will be used as the max number of threads. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - None - None. - - Examples - -------- - >>> import awswrangler as wr - >>> wr.s3.wait_objects_exist(['s3://bucket/key0', 's3://bucket/key1']) # wait both objects - - """ - return _wait_objects( - waiter_name="object_exists", - paths=paths, - delay=delay, - max_attempts=max_attempts, - use_threads=use_threads, - boto3_session=boto3_session, - ) - - -def wait_objects_not_exist( - paths: List[str], - delay: Optional[Union[int, float]] = None, - max_attempts: Optional[int] = None, - use_threads: bool = True, - boto3_session: Optional[boto3.Session] = None, -) -> None: - """Wait Amazon S3 objects not exist. - - Polls S3.Client.head_object() every 5 seconds (default) until a successful - state is reached. An error is returned after 20 (default) failed checks. - https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Waiter.ObjectNotExists - - Note - ---- - In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). - - Parameters - ---------- - paths : List[str] - List of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). - delay : Union[int,float], optional - The amount of time in seconds to wait between attempts. Default: 5 - max_attempts : int, optional - The maximum number of attempts to be made. Default: 20 - use_threads : bool - True to enable concurrent requests, False to disable multiple threads. - If enabled os.cpu_count() will be used as the max number of threads. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - None - None. 
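A natural companion to these waiters is a write-then-wait pattern: `wr.s3.to_parquet` returns the written object paths under its `'paths'` key, which can be fed straight into `wr.s3.wait_objects_exist` before the data is queried. A minimal sketch, assuming a hypothetical bucket:

```python
import awswrangler as wr
import pandas as pd

# Write a dataset, then block until every written object is visible on S3.
result = wr.s3.to_parquet(
    df=pd.DataFrame({"col": [1, 2, 3]}),
    path="s3://my-bucket/my-prefix/",
    dataset=True,
)
wr.s3.wait_objects_exist(paths=result["paths"], delay=5, max_attempts=20)
```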
- - Examples - -------- - >>> import awswrangler as wr - >>> wr.s3.wait_objects_not_exist(['s3://bucket/key0', 's3://bucket/key1']) # wait both objects not exist - - """ - return _wait_objects( - waiter_name="object_not_exists", - paths=paths, - delay=delay, - max_attempts=max_attempts, - use_threads=use_threads, - boto3_session=boto3_session, - ) - - -def _wait_objects( - waiter_name: str, - paths: List[str], - delay: Optional[Union[int, float]] = None, - max_attempts: Optional[int] = None, - use_threads: bool = True, - boto3_session: Optional[boto3.Session] = None, -) -> None: - delay = 5 if delay is None else delay - max_attempts = 20 if max_attempts is None else max_attempts - _delay: int = int(delay) if isinstance(delay, float) else delay - - if len(paths) < 1: - return None - client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) - waiter = client_s3.get_waiter(waiter_name) - _paths: List[Tuple[str, str]] = [_utils.parse_path(path=p) for p in paths] - if use_threads is False: - for bucket, key in _paths: - waiter.wait(Bucket=bucket, Key=key, WaiterConfig={"Delay": _delay, "MaxAttempts": max_attempts}) - else: - cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) - with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor: - futures: List[concurrent.futures.Future] = [] - for bucket, key in _paths: - future: concurrent.futures.Future = executor.submit( - fn=waiter.wait, Bucket=bucket, Key=key, WaiterConfig={"Delay": _delay, "MaxAttempts": max_attempts} - ) - futures.append(future) - for future in futures: - future.result() - return None - - -def read_parquet_table( - table: str, - database: str, - filters: Optional[Union[List[Tuple], List[List[Tuple]]]] = None, - columns: Optional[List[str]] = None, - categories: List[str] = None, - chunked: Union[bool, int] = False, - use_threads: bool = True, - boto3_session: Optional[boto3.Session] = None, - s3_additional_kwargs: Optional[Dict[str, str]] = None, -) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: - """Read Apache Parquet table registered on AWS Glue Catalog. - - Note - ---- - ``Batching`` (`chunked` argument) (Memory Friendly): - - Will anable the function to return a Iterable of DataFrames instead of a regular DataFrame. - - There are two batching strategies on Wrangler: - - - If **chunked=True**, a new DataFrame will be returned for each file in your path/dataset. - - - If **chunked=INTEGER**, Wrangler will paginate through files slicing and concatenating - to return DataFrames with the number of row igual the received INTEGER. - - `P.S.` `chunked=True` if faster and uses less memory while `chunked=INTEGER` is more precise - in number of rows for each Dataframe. - - - Note - ---- - In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). - - Parameters - ---------- - table : str - AWS Glue Catalog table name. - database : str - AWS Glue Catalog database name. - filters: Union[List[Tuple], List[List[Tuple]]], optional - List of filters to apply, like ``[[('x', '=', 0), ...], ...]``. - columns : List[str], optional - Names of columns to read from the file(s). - categories: List[str], optional - List of columns names that should be returned as pandas.Categorical. - Recommended for memory restricted environments. - chunked : bool - If True will break the data in smaller DataFrames (Non deterministic number of lines). - Otherwise return a single DataFrame with the whole data. 
- use_threads : bool - True to enable concurrent requests, False to disable multiple threads. - If enabled os.cpu_count() will be used as the max number of threads. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - s3_additional_kwargs: - Forward to s3fs, useful for server side encryption - https://s3fs.readthedocs.io/en/latest/#serverside-encryption - - Returns - ------- - Union[pandas.DataFrame, Generator[pandas.DataFrame, None, None]] - Pandas DataFrame or a Generator in case of `chunked=True`. - - Examples - -------- - Reading Parquet Table - - >>> import awswrangler as wr - >>> df = wr.s3.read_parquet_table(database='...', table='...') - - Reading Parquet Table encrypted - - >>> import awswrangler as wr - >>> df = wr.s3.read_parquet_table( - ... database='...', - ... table='...' - ... s3_additional_kwargs={ - ... 'ServerSideEncryption': 'aws:kms', - ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' - ... } - ... ) - - Reading Parquet Table in chunks (Chunk by file) - - >>> import awswrangler as wr - >>> dfs = wr.s3.read_parquet_table(database='...', table='...', chunked=True) - >>> for df in dfs: - >>> print(df) # Smaller Pandas DataFrame - - Reading in chunks (Chunk by 1MM rows) - - >>> import awswrangler as wr - >>> dfs = wr.s3.read_parquet(path=['s3://bucket/filename0.csv', 's3://bucket/filename1.csv'], chunked=1_000_000) - >>> for df in dfs: - >>> print(df) # 1MM Pandas DataFrame - - """ - path: str = catalog.get_table_location(database=database, table=table, boto3_session=boto3_session) - return read_parquet( - path=path, - filters=filters, - columns=columns, - categories=categories, - chunked=chunked, - dataset=True, - use_threads=use_threads, - boto3_session=boto3_session, - s3_additional_kwargs=s3_additional_kwargs, - ) - - -def merge_datasets( - source_path: str, - target_path: str, - mode: str = "append", - use_threads: bool = True, - boto3_session: Optional[boto3.Session] = None, -) -> List[str]: - """Merge a source dataset into a target dataset. - - Note - ---- - If you are merging tables (S3 datasets + Glue Catalog metadata), - remember that you will also need to update your partitions metadata in some cases. - (e.g. wr.athena.repair_table(table='...', database='...')) - - Note - ---- - In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). - - Parameters - ---------- - source_path : str, - S3 Path for the source directory. - target_path : str, - S3 Path for the target directory. - mode: str, optional - ``append`` (Default), ``overwrite``, ``overwrite_partitions``. - use_threads : bool - True to enable concurrent requests, False to disable multiple threads. - If enabled os.cpu_count() will be used as the max number of threads. - boto3_session : boto3.Session(), optional - Boto3 Session. The default boto3 session will be used if boto3_session receive None. - - Returns - ------- - List[str] - List of new objects paths. - - Examples - -------- - >>> import awswrangler as wr - >>> wr.s3.merge_datasets( - ... source_path="s3://bucket0/dir0/", - ... target_path="s3://bucket1/dir1/", - ... mode="append" - ... 
)
- ["s3://bucket1/dir1/key0", "s3://bucket1/dir1/key1"]
-
- """
- source_path = source_path[:-1] if source_path[-1] == "/" else source_path
- target_path = target_path[:-1] if target_path[-1] == "/" else target_path
- session: boto3.Session = _utils.ensure_session(session=boto3_session)
-
- paths: List[str] = list_objects(path=f"{source_path}/", boto3_session=session)
- _logger.debug("len(paths): %s", len(paths))
- if len(paths) < 1:
- return []
-
- if mode == "overwrite":
- _logger.debug("Deleting to overwrite: %s/", target_path)
- delete_objects(path=f"{target_path}/", use_threads=use_threads, boto3_session=session)
- elif mode == "overwrite_partitions":
- paths_wo_prefix: List[str] = [x.replace(f"{source_path}/", "") for x in paths]
- paths_wo_filename: List[str] = [f"{x.rpartition('/')[0]}/" for x in paths_wo_prefix]
- partitions_paths: List[str] = list(set(paths_wo_filename))
- target_partitions_paths = [f"{target_path}/{x}" for x in partitions_paths]
- for path in target_partitions_paths:
- _logger.debug("Deleting to overwrite_partitions: %s", path)
- delete_objects(path=path, use_threads=use_threads, boto3_session=session)
- elif mode != "append":
- raise exceptions.InvalidArgumentValue(f"{mode} is an invalid mode option.")
-
- new_objects: List[str] = copy_objects(
- paths=paths, source_path=source_path, target_path=target_path, use_threads=use_threads, boto3_session=session
- )
- _logger.debug("len(new_objects): %s", len(new_objects))
- return new_objects
-
-
-def copy_objects(
- paths: List[str],
- source_path: str,
- target_path: str,
- replace_filenames: Optional[Dict[str, str]] = None,
- use_threads: bool = True,
- boto3_session: Optional[boto3.Session] = None,
-) -> List[str]:
- """Copy a list of S3 objects to another S3 directory.
-
- Note
- ----
- In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().
-
- Parameters
- ----------
- paths : List[str]
- List of S3 objects paths (e.g. [s3://bucket/dir0/key0, s3://bucket/dir0/key1]).
- source_path : str
- S3 Path for the source directory.
- target_path : str
- S3 Path for the target directory.
- replace_filenames : Dict[str, str], optional
- Mapping of source filenames to new filenames (e.g. {'old_name.csv': 'new_name.csv'}). Only matching filenames are renamed.
- use_threads : bool
- True to enable concurrent requests, False to disable multiple threads.
- If enabled os.cpu_count() will be used as the max number of threads.
- boto3_session : boto3.Session(), optional
- Boto3 Session. The default boto3 session will be used if boto3_session receives None.
-
- Returns
- -------
- List[str]
- List of new objects paths.
-
- Examples
- --------
- >>> import awswrangler as wr
- >>> wr.s3.copy_objects(
- ... paths=["s3://bucket0/dir0/key0", "s3://bucket0/dir0/key1"],
- ... source_path="s3://bucket0/dir0/",
- ... target_path="s3://bucket1/dir1/",
- ...
) - ["s3://bucket1/dir1/key0", "s3://bucket1/dir1/key1"] - - """ - _logger.debug("len(paths): %s", len(paths)) - if len(paths) < 1: - return [] - source_path = source_path[:-1] if source_path[-1] == "/" else source_path - target_path = target_path[:-1] if target_path[-1] == "/" else target_path - session: boto3.Session = _utils.ensure_session(session=boto3_session) - batch: List[Tuple[str, str]] = [] - new_objects: List[str] = [] - for path in paths: - path_wo_prefix: str = path.replace(f"{source_path}/", "") - path_final: str = f"{target_path}/{path_wo_prefix}" - if replace_filenames is not None: - parts: List[str] = path_final.rsplit(sep="/", maxsplit=1) - if len(parts) == 2: - path_wo_filename: str = parts[0] - filename: str = parts[1] - if filename in replace_filenames: - new_filename: str = replace_filenames[filename] - _logger.debug("Replacing filename: %s -> %s", filename, new_filename) - path_final = f"{path_wo_filename}/{new_filename}" - new_objects.append(path_final) - batch.append((path, path_final)) - _logger.debug("len(new_objects): %s", len(new_objects)) - _copy_objects(batch=batch, use_threads=use_threads, boto3_session=session) - return new_objects - - -def _copy_objects(batch: List[Tuple[str, str]], use_threads: bool, boto3_session: boto3.Session) -> None: - _logger.debug("len(batch): %s", len(batch)) - client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) - resource_s3: boto3.resource = _utils.resource(service_name="s3", session=boto3_session) - for source, target in batch: - source_bucket, source_key = _utils.parse_path(path=source) - copy_source: Dict[str, str] = {"Bucket": source_bucket, "Key": source_key} - target_bucket, target_key = _utils.parse_path(path=target) - resource_s3.meta.client.copy( - CopySource=copy_source, - Bucket=target_bucket, - Key=target_key, - SourceClient=client_s3, - Config=TransferConfig(num_download_attempts=15, use_threads=use_threads), - ) diff --git a/awswrangler/s3/__init__.py b/awswrangler/s3/__init__.py new file mode 100644 index 000000000..234dbc718 --- /dev/null +++ b/awswrangler/s3/__init__.py @@ -0,0 +1,16 @@ +"""Amazon S3 Read Module.""" + +from awswrangler.s3._copy import copy_objects, merge_datasets # noqa +from awswrangler.s3._delete import delete_objects # noqa +from awswrangler.s3._describe import describe_objects, get_bucket_region, size_objects # noqa +from awswrangler.s3._list import does_object_exist, list_directories, list_objects # noqa +from awswrangler.s3._read import ( # noqa + read_csv, + read_fwf, + read_json, + read_parquet, + read_parquet_metadata, + read_parquet_table, +) +from awswrangler.s3._wait import wait_objects_exist, wait_objects_not_exist # noqa +from awswrangler.s3._write import store_parquet_metadata, to_csv, to_json, to_parquet # noqa diff --git a/awswrangler/s3/_copy.py b/awswrangler/s3/_copy.py new file mode 100644 index 000000000..f8fedd5ca --- /dev/null +++ b/awswrangler/s3/_copy.py @@ -0,0 +1,182 @@ +"""Amazon S3 Copy Module (PRIVATE).""" + +import logging +from typing import Dict, List, Optional, Tuple + +import boto3 # type: ignore +from boto3.s3.transfer import TransferConfig # type: ignore + +from awswrangler import _utils, exceptions +from awswrangler.s3._delete import delete_objects +from awswrangler.s3._list import list_objects + +_logger: logging.Logger = logging.getLogger(__name__) + + +def _copy_objects(batch: List[Tuple[str, str]], use_threads: bool, boto3_session: boto3.Session) -> None: + _logger.debug("len(batch): %s", len(batch)) + client_s3: 
boto3.client = _utils.client(service_name="s3", session=boto3_session) + resource_s3: boto3.resource = _utils.resource(service_name="s3", session=boto3_session) + for source, target in batch: + source_bucket, source_key = _utils.parse_path(path=source) + copy_source: Dict[str, str] = {"Bucket": source_bucket, "Key": source_key} + target_bucket, target_key = _utils.parse_path(path=target) + resource_s3.meta.client.copy( + CopySource=copy_source, + Bucket=target_bucket, + Key=target_key, + SourceClient=client_s3, + Config=TransferConfig(num_download_attempts=15, use_threads=use_threads), + ) + + +def merge_datasets( + source_path: str, + target_path: str, + mode: str = "append", + use_threads: bool = True, + boto3_session: Optional[boto3.Session] = None, +) -> List[str]: + """Merge a source dataset into a target dataset. + + Note + ---- + If you are merging tables (S3 datasets + Glue Catalog metadata), + remember that you will also need to update your partitions metadata in some cases. + (e.g. wr.athena.repair_table(table='...', database='...')) + + Note + ---- + In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). + + Parameters + ---------- + source_path : str, + S3 Path for the source directory. + target_path : str, + S3 Path for the target directory. + mode: str, optional + ``append`` (Default), ``overwrite``, ``overwrite_partitions``. + use_threads : bool + True to enable concurrent requests, False to disable multiple threads. + If enabled os.cpu_count() will be used as the max number of threads. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[str] + List of new objects paths. + + Examples + -------- + >>> import awswrangler as wr + >>> wr.s3.merge_datasets( + ... source_path="s3://bucket0/dir0/", + ... target_path="s3://bucket1/dir1/", + ... mode="append" + ... 
) + ["s3://bucket1/dir1/key0", "s3://bucket1/dir1/key1"] + + """ + source_path = source_path[:-1] if source_path[-1] == "/" else source_path + target_path = target_path[:-1] if target_path[-1] == "/" else target_path + session: boto3.Session = _utils.ensure_session(session=boto3_session) + + paths: List[str] = list_objects(path=f"{source_path}/", boto3_session=session) + _logger.debug("len(paths): %s", len(paths)) + if len(paths) < 1: + return [] + + if mode == "overwrite": + _logger.debug("Deleting to overwrite: %s/", target_path) + delete_objects(path=f"{target_path}/", use_threads=use_threads, boto3_session=session) + elif mode == "overwrite_partitions": + paths_wo_prefix: List[str] = [x.replace(f"{source_path}/", "") for x in paths] + paths_wo_filename: List[str] = [f"{x.rpartition('/')[0]}/" for x in paths_wo_prefix] + partitions_paths: List[str] = list(set(paths_wo_filename)) + target_partitions_paths = [f"{target_path}/{x}" for x in partitions_paths] + for path in target_partitions_paths: + _logger.debug("Deleting to overwrite_partitions: %s", path) + delete_objects(path=path, use_threads=use_threads, boto3_session=session) + elif mode != "append": + raise exceptions.InvalidArgumentValue(f"{mode} is a invalid mode option.") + + new_objects: List[str] = copy_objects( + paths=paths, source_path=source_path, target_path=target_path, use_threads=use_threads, boto3_session=session + ) + _logger.debug("len(new_objects): %s", len(new_objects)) + return new_objects + + +def copy_objects( + paths: List[str], + source_path: str, + target_path: str, + replace_filenames: Optional[Dict[str, str]] = None, + use_threads: bool = True, + boto3_session: Optional[boto3.Session] = None, +) -> List[str]: + """Copy a list of S3 objects to another S3 directory. + + Note + ---- + In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). + + Parameters + ---------- + paths : List[str] + List of S3 objects paths (e.g. [s3://bucket/dir0/key0, s3://bucket/dir0/key1]). + source_path : str, + S3 Path for the source directory. + target_path : str, + S3 Path for the target directory. + replace_filenames : Dict[str, str], optional + e.g. {"old_name.csv": "new_name.csv", "old_name2.csv": "new_name2.csv"} + use_threads : bool + True to enable concurrent requests, False to disable multiple threads. + If enabled os.cpu_count() will be used as the max number of threads. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[str] + List of new objects paths. + + Examples + -------- + >>> import awswrangler as wr + >>> wr.s3.copy_objects( + ... paths=["s3://bucket0/dir0/key0", "s3://bucket0/dir0/key1"]) + ... source_path="s3://bucket0/dir0/", + ... target_path="s3://bucket1/dir1/", + ... 
) + ["s3://bucket1/dir1/key0", "s3://bucket1/dir1/key1"] + + """ + _logger.debug("len(paths): %s", len(paths)) + if len(paths) < 1: + return [] + source_path = source_path[:-1] if source_path[-1] == "/" else source_path + target_path = target_path[:-1] if target_path[-1] == "/" else target_path + session: boto3.Session = _utils.ensure_session(session=boto3_session) + batch: List[Tuple[str, str]] = [] + new_objects: List[str] = [] + for path in paths: + path_wo_prefix: str = path.replace(f"{source_path}/", "") + path_final: str = f"{target_path}/{path_wo_prefix}" + if replace_filenames is not None: + parts: List[str] = path_final.rsplit(sep="/", maxsplit=1) + if len(parts) == 2: + path_wo_filename: str = parts[0] + filename: str = parts[1] + if filename in replace_filenames: + new_filename: str = replace_filenames[filename] + _logger.debug("Replacing filename: %s -> %s", filename, new_filename) + path_final = f"{path_wo_filename}/{new_filename}" + new_objects.append(path_final) + batch.append((path, path_final)) + _logger.debug("len(new_objects): %s", len(new_objects)) + _copy_objects(batch=batch, use_threads=use_threads, boto3_session=session) + return new_objects diff --git a/awswrangler/s3/_delete.py b/awswrangler/s3/_delete.py new file mode 100644 index 000000000..b2d53759a --- /dev/null +++ b/awswrangler/s3/_delete.py @@ -0,0 +1,85 @@ +"""Amazon S3 CopDeletey Module (PRIVATE).""" + +import concurrent.futures +import itertools +import logging +from typing import Dict, List, Optional, Union + +import boto3 # type: ignore + +from awswrangler import _utils, exceptions +from awswrangler.s3._list import path2list + +_logger: logging.Logger = logging.getLogger(__name__) + + +def _split_paths_by_bucket(paths: List[str]) -> Dict[str, List[str]]: + buckets: Dict[str, List[str]] = {} + bucket: str + key: str + for path in paths: + bucket, key = _utils.parse_path(path=path) + if bucket not in buckets: + buckets[bucket] = [] + buckets[bucket].append(key) + return buckets + + +def _delete_objects(bucket: str, keys: List[str], client_s3: boto3.client) -> None: + _logger.debug("len(keys): %s", len(keys)) + batch: List[Dict[str, str]] = [{"Key": key} for key in keys] + res = client_s3.delete_objects(Bucket=bucket, Delete={"Objects": batch}) + deleted = res.get("Deleted") + if deleted is not None: + for i in deleted: + _logger.debug("s3://%s/%s has been deleted.", bucket, i.get("Key")) + errors = res.get("Errors") + if errors is not None: # pragma: no cover + raise exceptions.ServiceApiError(errors) + + +def delete_objects( + path: Union[str, List[str]], use_threads: bool = True, boto3_session: Optional[boto3.Session] = None +) -> None: + """Delete Amazon S3 objects from a received S3 prefix or list of S3 objects paths. + + Note + ---- + In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + use_threads : bool + True to enable concurrent requests, False to disable multiple threads. + If enabled os.cpu_count() will be used as the max number of threads. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + None + None. 
+ + Examples + -------- + >>> import awswrangler as wr + >>> wr.s3.delete_objects(['s3://bucket/key0', 's3://bucket/key1']) # Delete both objects + >>> wr.s3.delete_objects('s3://bucket/prefix') # Delete all objects under the received prefix + + """ + paths: List[str] = path2list(path=path, boto3_session=boto3_session) + if len(paths) < 1: + return + client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) + buckets: Dict[str, List[str]] = _split_paths_by_bucket(paths=paths) + for bucket, keys in buckets.items(): + chunks: List[List[str]] = _utils.chunkify(lst=keys, max_length=1_000) + if use_threads is False: + for chunk in chunks: + _delete_objects(bucket=bucket, keys=chunk, client_s3=client_s3) + else: + cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) + with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor: + list(executor.map(_delete_objects, itertools.repeat(bucket), chunks, itertools.repeat(client_s3))) diff --git a/awswrangler/s3/_describe.py b/awswrangler/s3/_describe.py new file mode 100644 index 000000000..c2e09bad8 --- /dev/null +++ b/awswrangler/s3/_describe.py @@ -0,0 +1,184 @@ +"""Amazon S3 Describe Module (INTERNAL).""" + +import concurrent.futures +import itertools +import logging +import time +from typing import Any, Dict, List, Optional, Tuple, Union + +import boto3 # type: ignore +import botocore.exceptions # type: ignore + +from awswrangler import _utils +from awswrangler.s3._list import path2list + +_logger: logging.Logger = logging.getLogger(__name__) + + +def _describe_object( + path: str, wait_time: Optional[Union[int, float]], client_s3: boto3.client +) -> Tuple[str, Dict[str, Any]]: + wait_time = int(wait_time) if isinstance(wait_time, float) else wait_time + tries: int = wait_time if (wait_time is not None) and (wait_time > 0) else 1 + bucket: str + key: str + bucket, key = _utils.parse_path(path=path) + desc: Dict[str, Any] = {} + for i in range(tries, 0, -1): + try: + desc = client_s3.head_object(Bucket=bucket, Key=key) + break + except botocore.exceptions.ClientError as e: # pragma: no cover + if e.response["ResponseMetadata"]["HTTPStatusCode"] == 404: # Not Found + _logger.debug("Object not found. %s seconds remaining to wait.", i) + if i == 1: # Last try, there is no more need to sleep + break + time.sleep(1) + else: + raise e + return path, desc + + +def describe_objects( + path: Union[str, List[str]], + wait_time: Optional[Union[int, float]] = None, + use_threads: bool = True, + boto3_session: Optional[boto3.Session] = None, +) -> Dict[str, Dict[str, Any]]: + """Describe Amazon S3 objects from a received S3 prefix or list of S3 objects paths. + + Fetch attributes like ContentLength, DeleteMarker, LastModified, ContentType, etc + The full list of attributes can be explored under the boto3 head_object documentation: + https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.head_object + + Note + ---- + In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + wait_time : Union[int,float], optional + How much time (seconds) should Wrangler try to reach this objects. + Very useful to overcome eventual consistence issues. + `None` means only a single try will be done. 
+ use_threads : bool + True to enable concurrent requests, False to disable multiple threads. + If enabled os.cpu_count() will be used as the max number of threads. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + Dict[str, Dict[str, Any]] + Return a dictionary of objects returned from head_objects where the key is the object path. + The response object can be explored here: + https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.head_object + + Examples + -------- + >>> import awswrangler as wr + >>> descs0 = wr.s3.describe_objects(['s3://bucket/key0', 's3://bucket/key1']) # Describe both objects + >>> descs1 = wr.s3.describe_objects('s3://bucket/prefix') # Describe all objects under the prefix + >>> descs2 = wr.s3.describe_objects('s3://bucket/prefix', wait_time=30) # Overcoming eventual consistence issues + + """ + paths: List[str] = path2list(path=path, boto3_session=boto3_session) + if len(paths) < 1: + return {} + client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) + resp_list: List[Tuple[str, Dict[str, Any]]] + if use_threads is False: + resp_list = [_describe_object(path=p, wait_time=wait_time, client_s3=client_s3) for p in paths] + else: + cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) + with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor: + resp_list = list( + executor.map(_describe_object, paths, itertools.repeat(wait_time), itertools.repeat(client_s3)) + ) + desc_dict: Dict[str, Dict[str, Any]] = dict(resp_list) + return desc_dict + + +def size_objects( + path: Union[str, List[str]], + wait_time: Optional[Union[int, float]] = None, + use_threads: bool = True, + boto3_session: Optional[boto3.Session] = None, +) -> Dict[str, Optional[int]]: + """Get the size (ContentLength) in bytes of Amazon S3 objects from a received S3 prefix or list of S3 objects paths. + + Note + ---- + In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + wait_time : Union[int,float], optional + How much time (seconds) should Wrangler try to reach this objects. + Very useful to overcome eventual consistence issues. + `None` means only a single try will be done. + use_threads : bool + True to enable concurrent requests, False to disable multiple threads. + If enabled os.cpu_count() will be used as the max number of threads. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + Dict[str, Optional[int]] + Dictionary where the key is the object path and the value is the object size. 
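A small hedged example (prefix is a placeholder) of turning the size dictionary described above into a total for a prefix:

```python
import awswrangler as wr

sizes = wr.s3.size_objects("s3://my-bucket/my-prefix/")                 # Dict[str, Optional[int]]
total_bytes = sum(size for size in sizes.values() if size is not None)
print(f"{len(sizes)} objects, {total_bytes / 1024 ** 2:.1f} MiB total")
```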
+ + Examples + -------- + >>> import awswrangler as wr + >>> sizes0 = wr.s3.size_objects(['s3://bucket/key0', 's3://bucket/key1']) # Get the sizes of both objects + >>> sizes1 = wr.s3.size_objects('s3://bucket/prefix') # Get the sizes of all objects under the received prefix + >>> sizes2 = wr.s3.size_objects('s3://bucket/prefix', wait_time=30) # Overcoming eventual consistence issues + + """ + desc_list: Dict[str, Dict[str, Any]] = describe_objects( + path=path, wait_time=wait_time, use_threads=use_threads, boto3_session=boto3_session + ) + size_dict: Dict[str, Optional[int]] = {k: d.get("ContentLength", None) for k, d in desc_list.items()} + return size_dict + + +def get_bucket_region(bucket: str, boto3_session: Optional[boto3.Session] = None) -> str: + """Get bucket region name. + + Parameters + ---------- + bucket : str + Bucket name. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + str + Region code (e.g. 'us-east-1'). + + Examples + -------- + Using the default boto3 session + + >>> import awswrangler as wr + >>> region = wr.s3.get_bucket_region('bucket-name') + + Using a custom boto3 session + + >>> import boto3 + >>> import awswrangler as wr + >>> region = wr.s3.get_bucket_region('bucket-name', boto3_session=boto3.Session()) + + """ + client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) + _logger.debug("bucket: %s", bucket) + region: str = client_s3.get_bucket_location(Bucket=bucket)["LocationConstraint"] + region = "us-east-1" if region is None else region + _logger.debug("region: %s", region) + return region diff --git a/awswrangler/s3/_list.py b/awswrangler/s3/_list.py new file mode 100644 index 000000000..2eebea1ca --- /dev/null +++ b/awswrangler/s3/_list.py @@ -0,0 +1,176 @@ +"""Amazon S3 List Module (PRIVATE).""" + +import logging +from typing import Any, Dict, List, Optional + +import boto3 # type: ignore +import botocore.exceptions # type: ignore + +from awswrangler import _utils, exceptions + +_logger: logging.Logger = logging.getLogger(__name__) + + +def path2list(path: object, boto3_session: boto3.Session, suffix: str = None) -> List[str]: + """Convert Amazon S3 path to list of objects.""" + if isinstance(path, str): # prefix + paths: List[str] = list_objects(path=path, suffix=suffix, boto3_session=boto3_session) + elif isinstance(path, list): + paths = path if suffix is None else [x for x in path if x.endswith(suffix)] + else: + raise exceptions.InvalidArgumentType(f"{type(path)} is not a valid path type. 
Please, use str or List[str].") + return paths + + +def _list_objects( + path: str, + delimiter: Optional[str] = None, + suffix: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> List[str]: + bucket: str + prefix: str + bucket, prefix = _utils.parse_path(path=path) + client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) + paginator = client_s3.get_paginator("list_objects_v2") + args: Dict[str, Any] = {"Bucket": bucket, "Prefix": prefix, "PaginationConfig": {"PageSize": 1000}} + if delimiter is not None: + args["Delimiter"] = delimiter + response_iterator = paginator.paginate(**args) + paths: List[str] = [] + for page in response_iterator: # pylint: disable=too-many-nested-blocks + if delimiter is None: + contents: Optional[List] = page.get("Contents") + if contents is not None: + for content in contents: + if (content is not None) and ("Key" in content): + key: str = content["Key"] + if (suffix is None) or key.endswith(suffix): + paths.append(f"s3://{bucket}/{key}") + else: + prefixes: Optional[List[Optional[Dict[str, str]]]] = page.get("CommonPrefixes") + if prefixes is not None: + for pfx in prefixes: + if (pfx is not None) and ("Prefix" in pfx): + key = pfx["Prefix"] + paths.append(f"s3://{bucket}/{key}") + return paths + + +def does_object_exist(path: str, boto3_session: Optional[boto3.Session] = None) -> bool: + """Check if object exists on S3. + + Parameters + ---------- + path: str + S3 path (e.g. s3://bucket/key). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + bool + True if exists, False otherwise. + + Examples + -------- + Using the default boto3 session + + >>> import awswrangler as wr + >>> wr.s3.does_object_exist('s3://bucket/key_real') + True + >>> wr.s3.does_object_exist('s3://bucket/key_unreal') + False + + Using a custom boto3 session + + >>> import boto3 + >>> import awswrangler as wr + >>> wr.s3.does_object_exist('s3://bucket/key_real', boto3_session=boto3.Session()) + True + >>> wr.s3.does_object_exist('s3://bucket/key_unreal', boto3_session=boto3.Session()) + False + + """ + client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) + bucket: str + key: str + bucket, key = path.replace("s3://", "").split("/", 1) + try: + client_s3.head_object(Bucket=bucket, Key=key) + return True + except botocore.exceptions.ClientError as ex: + if ex.response["ResponseMetadata"]["HTTPStatusCode"] == 404: + return False + raise ex # pragma: no cover + + +def list_directories(path: str, boto3_session: Optional[boto3.Session] = None) -> List[str]: + """List Amazon S3 objects from a prefix. + + Parameters + ---------- + path : str + S3 path (e.g. s3://bucket/prefix). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[str] + List of objects paths. 
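`_list_objects` above lists "directories" by passing an S3 delimiter of "/" and plain objects by omitting it. A hedged sketch (bucket and prefix are placeholders) combining the two public wrappers:

```python
import awswrangler as wr

# Common prefixes directly under the dataset prefix (e.g. partition "directories").
dirs = wr.s3.list_directories("s3://my-bucket/dataset/")

# Objects under the first prefix, optionally filtered by suffix.
files = wr.s3.list_objects(dirs[0], suffix=".parquet") if dirs else []
print(dirs, files)
```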
+ + Examples + -------- + Using the default boto3 session + + >>> import awswrangler as wr + >>> wr.s3.list_objects('s3://bucket/prefix/') + ['s3://bucket/prefix/dir0', 's3://bucket/prefix/dir1', 's3://bucket/prefix/dir2'] + + Using a custom boto3 session + + >>> import boto3 + >>> import awswrangler as wr + >>> wr.s3.list_objects('s3://bucket/prefix/', boto3_session=boto3.Session()) + ['s3://bucket/prefix/dir0', 's3://bucket/prefix/dir1', 's3://bucket/prefix/dir2'] + + """ + return _list_objects(path=path, delimiter="/", boto3_session=boto3_session) + + +def list_objects(path: str, suffix: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> List[str]: + """List Amazon S3 objects from a prefix. + + Parameters + ---------- + path : str + S3 path (e.g. s3://bucket/prefix). + suffix: str, optional + Suffix for filtering S3 keys. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[str] + List of objects paths. + + Examples + -------- + Using the default boto3 session + + >>> import awswrangler as wr + >>> wr.s3.list_objects('s3://bucket/prefix') + ['s3://bucket/prefix0', 's3://bucket/prefix1', 's3://bucket/prefix2'] + + Using a custom boto3 session + + >>> import boto3 + >>> import awswrangler as wr + >>> wr.s3.list_objects('s3://bucket/prefix', boto3_session=boto3.Session()) + ['s3://bucket/prefix0', 's3://bucket/prefix1', 's3://bucket/prefix2'] + + """ + paths: List[str] = _list_objects(path=path, delimiter=None, suffix=suffix, boto3_session=boto3_session) + return [p for p in paths if not p.endswith("/")] diff --git a/awswrangler/s3/_read.py b/awswrangler/s3/_read.py new file mode 100644 index 000000000..e249aca82 --- /dev/null +++ b/awswrangler/s3/_read.py @@ -0,0 +1,885 @@ +"""Amazon S3 Read Module (PRIVATE).""" + +import concurrent.futures +import itertools +import logging +from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union + +import boto3 # type: ignore +import pandas as pd # type: ignore +import pandas.io.parsers # type: ignore +import pyarrow as pa # type: ignore +import pyarrow.lib # type: ignore +import pyarrow.parquet # type: ignore +import s3fs # type: ignore +from pandas.io.common import infer_compression # type: ignore + +from awswrangler import _data_types, _utils, catalog, exceptions +from awswrangler.s3._list import path2list + +_logger: logging.Logger = logging.getLogger(__name__) + + +def read_parquet_metadata_internal( + path: Union[str, List[str]], + dtype: Optional[Dict[str, str]], + sampling: float, + dataset: bool, + use_threads: bool, + boto3_session: Optional[boto3.Session], +) -> Tuple[Dict[str, str], Optional[Dict[str, str]], Optional[Dict[str, List[str]]]]: + """Handle wr.s3.read_parquet_metadata internally.""" + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if dataset is True: + if isinstance(path, str): + _path: Optional[str] = path if path.endswith("/") else f"{path}/" + paths: List[str] = path2list(path=_path, boto3_session=session) + else: # pragma: no cover + raise exceptions.InvalidArgumentType("Argument must be str if dataset=True.") + else: + if isinstance(path, str): + _path = None + paths = path2list(path=path, boto3_session=session) + elif isinstance(path, list): + _path = None + paths = path + else: # pragma: no cover + raise exceptions.InvalidArgumentType(f"Argument path must be str or List[str] instead of {type(path)}.") + schemas: List[Dict[str, str]] = [ + 
_read_parquet_metadata_file(path=x, use_threads=use_threads, boto3_session=session) + for x in _utils.list_sampling(lst=paths, sampling=sampling) + ] + _logger.debug("schemas: %s", schemas) + columns_types: Dict[str, str] = {} + for schema in schemas: + for column, _dtype in schema.items(): + if (column in columns_types) and (columns_types[column] != _dtype): # pragma: no cover + raise exceptions.InvalidSchemaConvergence( + f"Was detect at least 2 different types in column {column} ({columns_types[column]} and {dtype})." + ) + columns_types[column] = _dtype + partitions_types: Optional[Dict[str, str]] = None + partitions_values: Optional[Dict[str, List[str]]] = None + if (dataset is True) and (_path is not None): + partitions_types, partitions_values = _utils.extract_partitions_metadata_from_paths(path=_path, paths=paths) + if dtype: + for k, v in dtype.items(): + if columns_types and k in columns_types: + columns_types[k] = v + if partitions_types and k in partitions_types: + partitions_types[k] = v + _logger.debug("columns_types: %s", columns_types) + return columns_types, partitions_types, partitions_values + + +def _read_text( + parser_func: Callable, + path: Union[str, List[str]], + use_threads: bool = True, + boto3_session: Optional[boto3.Session] = None, + s3_additional_kwargs: Optional[Dict[str, str]] = None, + chunksize: Optional[int] = None, + dataset: bool = False, + **pandas_kwargs, +) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: + if "iterator" in pandas_kwargs: + raise exceptions.InvalidArgument("Please, use chunksize instead of iterator.") + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if (dataset is True) and (not isinstance(path, str)): # pragma: no cover + raise exceptions.InvalidArgument("The path argument must be a string Amazon S3 prefix if dataset=True.") + if dataset is True: + path_root: str = str(path) + else: + path_root = "" + paths: List[str] = path2list(path=path, boto3_session=session) + _logger.debug("paths:\n%s", paths) + if chunksize is not None: + dfs: Iterator[pd.DataFrame] = _read_text_chunksize( + parser_func=parser_func, + paths=paths, + boto3_session=session, + chunksize=chunksize, + pandas_args=pandas_kwargs, + s3_additional_kwargs=s3_additional_kwargs, + dataset=dataset, + path_root=path_root, + ) + return dfs + if use_threads is False: + df: pd.DataFrame = pd.concat( + objs=[ + _read_text_full( + parser_func=parser_func, + path=p, + boto3_session=session, + pandas_args=pandas_kwargs, + s3_additional_kwargs=s3_additional_kwargs, + dataset=dataset, + path_root=path_root, + ) + for p in paths + ], + ignore_index=True, + sort=False, + ) + else: + cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) + with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor: + df = pd.concat( + objs=executor.map( + _read_text_full, + itertools.repeat(parser_func), + itertools.repeat(path_root), + paths, + itertools.repeat(_utils.boto3_to_primitives(boto3_session=session)), # Boto3.Session + itertools.repeat(pandas_kwargs), + itertools.repeat(s3_additional_kwargs), + itertools.repeat(dataset), + ), + ignore_index=True, + sort=False, + ) + return df + + +def _read_text_chunksize( + parser_func: Callable, + path_root: str, + paths: List[str], + boto3_session: boto3.Session, + chunksize: int, + pandas_args: Dict[str, Any], + s3_additional_kwargs: Optional[Dict[str, str]] = None, + dataset: bool = False, +) -> Iterator[pd.DataFrame]: + fs: s3fs.S3FileSystem = _utils.get_fs(session=boto3_session, 
s3_additional_kwargs=s3_additional_kwargs) + for path in paths: + _logger.debug("path: %s", path) + partitions: Dict[str, Any] = {} + if dataset is True: + partitions = _utils.extract_partitions_from_path(path_root=path_root, path=path) + if pandas_args.get("compression", "infer") == "infer": + pandas_args["compression"] = infer_compression(path, compression="infer") + mode: str = "r" if pandas_args.get("compression") is None else "rb" + with fs.open(path, mode) as f: + reader: pandas.io.parsers.TextFileReader = parser_func(f, chunksize=chunksize, **pandas_args) + for df in reader: + if dataset is True: + for column_name, value in partitions.items(): + df[column_name] = value + yield df + + +def _read_text_full( + parser_func: Callable, + path_root: str, + path: str, + boto3_session: Union[boto3.Session, Dict[str, Optional[str]]], + pandas_args: Dict[str, Any], + s3_additional_kwargs: Optional[Dict[str, str]] = None, + dataset: bool = False, +) -> pd.DataFrame: + fs: s3fs.S3FileSystem = _utils.get_fs(session=boto3_session, s3_additional_kwargs=s3_additional_kwargs) + if pandas_args.get("compression", "infer") == "infer": + pandas_args["compression"] = infer_compression(path, compression="infer") + mode: str = "r" if pandas_args.get("compression") is None else "rb" + encoding: Optional[str] = pandas_args.get("encoding", None) + newline: Optional[str] = pandas_args.get("lineterminator", None) + with fs.open(path=path, mode=mode, encoding=encoding, newline=newline) as f: + df: pd.DataFrame = parser_func(f, **pandas_args) + if dataset is True: + partitions: Dict[str, Any] = _utils.extract_partitions_from_path(path_root=path_root, path=path) + for column_name, value in partitions.items(): + df[column_name] = value + return df + + +def _read_parquet_init( + path: Union[str, List[str]], + filters: Optional[Union[List[Tuple], List[List[Tuple]]]] = None, + categories: List[str] = None, + validate_schema: bool = True, + dataset: bool = False, + use_threads: bool = True, + boto3_session: Optional[boto3.Session] = None, + s3_additional_kwargs: Optional[Dict[str, str]] = None, +) -> pyarrow.parquet.ParquetDataset: + """Encapsulate all initialization before the use of the pyarrow.parquet.ParquetDataset.""" + session: boto3.Session = _utils.ensure_session(session=boto3_session) + if dataset is False: + path_or_paths: Union[str, List[str]] = path2list(path=path, boto3_session=session) + elif isinstance(path, str): + path_or_paths = path[:-1] if path.endswith("/") else path + else: + path_or_paths = path + _logger.debug("path_or_paths: %s", path_or_paths) + fs: s3fs.S3FileSystem = _utils.get_fs(session=session, s3_additional_kwargs=s3_additional_kwargs) + cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) + data: pyarrow.parquet.ParquetDataset = pyarrow.parquet.ParquetDataset( + path_or_paths=path_or_paths, + filesystem=fs, + metadata_nthreads=cpus, + filters=filters, + read_dictionary=categories, + validate_schema=validate_schema, + split_row_groups=False, + use_legacy_dataset=True, + ) + return data + + +def _read_parquet( + data: pyarrow.parquet.ParquetDataset, + columns: Optional[List[str]] = None, + categories: List[str] = None, + use_threads: bool = True, + validate_schema: bool = True, +) -> pd.DataFrame: + tables: List[pa.Table] = [] + _logger.debug("Reading pieces...") + for piece in data.pieces: + table: pa.Table = piece.read( + columns=columns, use_threads=use_threads, partitions=data.partitions, use_pandas_metadata=False + ) + _logger.debug("Appending piece in the list...") + 
tables.append(table) + promote: bool = not validate_schema + _logger.debug("Concating pieces...") + table = pa.lib.concat_tables(tables, promote=promote) + _logger.debug("Converting PyArrow table to Pandas DataFrame...") + return table.to_pandas( + use_threads=use_threads, + split_blocks=True, + self_destruct=True, + integer_object_nulls=False, + date_as_object=True, + ignore_metadata=True, + categories=categories, + types_mapper=_data_types.pyarrow2pandas_extension, + ) + + +def _read_parquet_chunked( + data: pyarrow.parquet.ParquetDataset, + columns: Optional[List[str]] = None, + categories: List[str] = None, + chunked: Union[bool, int] = True, + use_threads: bool = True, +) -> Iterator[pd.DataFrame]: + next_slice: Optional[pd.DataFrame] = None + for piece in data.pieces: + df: pd.DataFrame = _table2df( + table=piece.read( + columns=columns, use_threads=use_threads, partitions=data.partitions, use_pandas_metadata=False + ), + categories=categories, + use_threads=use_threads, + ) + if chunked is True: + yield df + else: + if next_slice is not None: + df = pd.concat(objs=[next_slice, df], ignore_index=True, sort=False) + while len(df.index) >= chunked: + yield df.iloc[:chunked] + df = df.iloc[chunked:] + if df.empty: + next_slice = None + else: + next_slice = df + if next_slice is not None: + yield next_slice + + +def _table2df(table: pa.Table, categories: List[str] = None, use_threads: bool = True) -> pd.DataFrame: + return table.to_pandas( + use_threads=use_threads, + split_blocks=True, + self_destruct=True, + integer_object_nulls=False, + date_as_object=True, + ignore_metadata=True, + categories=categories, + types_mapper=_data_types.pyarrow2pandas_extension, + ) + + +def _read_parquet_metadata_file(path: str, use_threads: bool, boto3_session: boto3.Session) -> Dict[str, str]: + data: pyarrow.parquet.ParquetDataset = _read_parquet_init( + path=path, filters=None, dataset=False, use_threads=use_threads, boto3_session=boto3_session + ) + return _data_types.athena_types_from_pyarrow_schema(schema=data.schema.to_arrow_schema(), partitions=None)[0] + + +def read_csv( + path: Union[str, List[str]], + use_threads: bool = True, + boto3_session: Optional[boto3.Session] = None, + s3_additional_kwargs: Optional[Dict[str, str]] = None, + chunksize: Optional[int] = None, + dataset: bool = False, + **pandas_kwargs, +) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: + """Read CSV file(s) from from a received S3 prefix or list of S3 objects paths. + + Note + ---- + For partial and gradual reading use the argument ``chunksize`` instead of ``iterator``. + + Note + ---- + In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. ``[s3://bucket/key0, s3://bucket/key1]``). + use_threads : bool + True to enable concurrent requests, False to disable multiple threads. + If enabled os.cpu_count() will be used as the max number of threads. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + s3_additional_kwargs: + Forward to s3fs, useful for server side encryption + https://s3fs.readthedocs.io/en/latest/#serverside-encryption + chunksize: int, optional + If specified, return an generator where chunksize is the number of rows to include in each chunk. 
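The row-count batching used by `_read_parquet_chunked` above (carry a leftover slice forward, emit fixed-size frames) can be summarized as a small standalone generator; this is a sketch of the pattern, not the private function itself:

```python
from typing import Iterator, List

import pandas as pd


def rechunk(frames: List[pd.DataFrame], rows: int) -> Iterator[pd.DataFrame]:
    """Yield DataFrames with `rows` rows each (the last chunk may be smaller)."""
    leftover = None
    for df in frames:
        if leftover is not None:
            df = pd.concat([leftover, df], ignore_index=True, sort=False)
        while len(df.index) >= rows:
            yield df.iloc[:rows]
            df = df.iloc[rows:]
        leftover = None if df.empty else df
    if leftover is not None:
        yield leftover


# Two "files" of 3 and 4 rows rechunked into 5-row frames -> sizes [5, 2].
chunks = list(rechunk([pd.DataFrame({"x": range(3)}), pd.DataFrame({"x": range(4)})], rows=5))
print([len(c) for c in chunks])
```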
+ dataset: bool + If `True` read a CSV dataset instead of simple file(s) loading all the related partitions as columns. + pandas_kwargs: + keyword arguments forwarded to pandas.read_csv(). + https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html + + Returns + ------- + Union[pandas.DataFrame, Generator[pandas.DataFrame, None, None]] + Pandas DataFrame or a Generator in case of `chunksize != None`. + + Examples + -------- + Reading all CSV files under a prefix + + >>> import awswrangler as wr + >>> df = wr.s3.read_csv(path='s3://bucket/prefix/') + + Reading all CSV files under a prefix encrypted with a KMS key + + >>> import awswrangler as wr + >>> df = wr.s3.read_csv( + ... path='s3://bucket/prefix/', + ... s3_additional_kwargs={ + ... 'ServerSideEncryption': 'aws:kms', + ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' + ... } + ... ) + + Reading all CSV files from a list + + >>> import awswrangler as wr + >>> df = wr.s3.read_csv(path=['s3://bucket/filename0.csv', 's3://bucket/filename1.csv']) + + Reading in chunks of 100 lines + + >>> import awswrangler as wr + >>> dfs = wr.s3.read_csv(path=['s3://bucket/filename0.csv', 's3://bucket/filename1.csv'], chunksize=100) + >>> for df in dfs: + >>> print(df) # 100 lines Pandas DataFrame + + """ + return _read_text( + parser_func=pd.read_csv, + path=path, + use_threads=use_threads, + boto3_session=boto3_session, + s3_additional_kwargs=s3_additional_kwargs, + chunksize=chunksize, + dataset=dataset, + **pandas_kwargs, + ) + + +def read_fwf( + path: Union[str, List[str]], + use_threads: bool = True, + boto3_session: Optional[boto3.Session] = None, + s3_additional_kwargs: Optional[Dict[str, str]] = None, + chunksize: Optional[int] = None, + dataset: bool = False, + **pandas_kwargs, +) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: + """Read fixed-width formatted file(s) from from a received S3 prefix or list of S3 objects paths. + + Note + ---- + For partial and gradual reading use the argument ``chunksize`` instead of ``iterator``. + + Note + ---- + In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. ``[s3://bucket/key0, s3://bucket/key1]``). + use_threads : bool + True to enable concurrent requests, False to disable multiple threads. + If enabled os.cpu_count() will be used as the max number of threads. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + s3_additional_kwargs: + Forward to s3fs, useful for server side encryption + https://s3fs.readthedocs.io/en/latest/#serverside-encryption + chunksize: int, optional + If specified, return an generator where chunksize is the number of rows to include in each chunk. + dataset: bool + If `True` read a FWF dataset instead of simple file(s) loading all the related partitions as columns. + pandas_kwargs: + keyword arguments forwarded to pandas.read_fwf(). + https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_fwf.html + + Returns + ------- + Union[pandas.DataFrame, Generator[pandas.DataFrame, None, None]] + Pandas DataFrame or a Generator in case of `chunksize != None`. 
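`dataset=True` also applies to CSV reads: partition values encoded in the object keys are added back as DataFrame columns. A hedged sketch assuming a Hive-style layout with placeholder names:

```python
import awswrangler as wr

# Assuming keys like s3://my-bucket/sales/year=2020/month=1/file0.csv (placeholders),
# dataset=True adds "year" and "month" back as columns of the resulting DataFrame.
df = wr.s3.read_csv(path="s3://my-bucket/sales/", dataset=True)
print(df.columns.tolist())
```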
+ + Examples + -------- + Reading all fixed-width formatted (FWF) files under a prefix + + >>> import awswrangler as wr + >>> df = wr.s3.read_fwf(path='s3://bucket/prefix/') + + Reading all fixed-width formatted (FWF) files under a prefix encrypted with a KMS key + + >>> import awswrangler as wr + >>> df = wr.s3.read_fwf( + ... path='s3://bucket/prefix/', + ... s3_additional_kwargs={ + ... 'ServerSideEncryption': 'aws:kms', + ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' + ... } + ... ) + + Reading all fixed-width formatted (FWF) files from a list + + >>> import awswrangler as wr + >>> df = wr.s3.read_fwf(path=['s3://bucket/filename0.txt', 's3://bucket/filename1.txt']) + + Reading in chunks of 100 lines + + >>> import awswrangler as wr + >>> dfs = wr.s3.read_fwf(path=['s3://bucket/filename0.txt', 's3://bucket/filename1.txt'], chunksize=100) + >>> for df in dfs: + >>> print(df) # 100 lines Pandas DataFrame + + """ + return _read_text( + parser_func=pd.read_fwf, + path=path, + use_threads=use_threads, + boto3_session=boto3_session, + s3_additional_kwargs=s3_additional_kwargs, + chunksize=chunksize, + dataset=dataset, + **pandas_kwargs, + ) + + +def read_json( + path: Union[str, List[str]], + use_threads: bool = True, + boto3_session: Optional[boto3.Session] = None, + s3_additional_kwargs: Optional[Dict[str, str]] = None, + chunksize: Optional[int] = None, + dataset: bool = False, + **pandas_kwargs, +) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: + """Read JSON file(s) from from a received S3 prefix or list of S3 objects paths. + + Note + ---- + For partial and gradual reading use the argument ``chunksize`` instead of ``iterator``. + + Note + ---- + In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. ``[s3://bucket/key0, s3://bucket/key1]``). + use_threads : bool + True to enable concurrent requests, False to disable multiple threads. + If enabled os.cpu_count() will be used as the max number of threads. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + s3_additional_kwargs: + Forward to s3fs, useful for server side encryption + https://s3fs.readthedocs.io/en/latest/#serverside-encryption + chunksize: int, optional + If specified, return an generator where chunksize is the number of rows to include in each chunk. + dataset: bool + If `True` read a JSON dataset instead of simple file(s) loading all the related partitions as columns. + If `True`, the `lines=True` will be assumed by default. + pandas_kwargs: + keyword arguments forwarded to pandas.read_json(). + https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_json.html + + Returns + ------- + Union[pandas.DataFrame, Generator[pandas.DataFrame, None, None]] + Pandas DataFrame or a Generator in case of `chunksize != None`. + + Examples + -------- + Reading all JSON files under a prefix + + >>> import awswrangler as wr + >>> df = wr.s3.read_json(path='s3://bucket/prefix/') + + Reading all JSON files under a prefix encrypted with a KMS key + + >>> import awswrangler as wr + >>> df = wr.s3.read_json( + ... path='s3://bucket/prefix/', + ... s3_additional_kwargs={ + ... 'ServerSideEncryption': 'aws:kms', + ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' + ... } + ... 
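As documented above for `read_json`, `dataset=True` also assumes `lines=True` unless it is passed explicitly. A hedged sketch with a placeholder prefix:

```python
import awswrangler as wr

# Partitions become columns and lines=True is assumed by default (placeholder prefix).
df = wr.s3.read_json(path="s3://my-bucket/events/", dataset=True)
```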
) + + Reading all JSON files from a list + + >>> import awswrangler as wr + >>> df = wr.s3.read_json(path=['s3://bucket/filename0.json', 's3://bucket/filename1.json']) + + Reading in chunks of 100 lines + + >>> import awswrangler as wr + >>> dfs = wr.s3.read_json(path=['s3://bucket/filename0.json', 's3://bucket/filename1.json'], chunksize=100) + >>> for df in dfs: + >>> print(df) # 100 lines Pandas DataFrame + + """ + if (dataset is True) and ("lines" not in pandas_kwargs): + pandas_kwargs["lines"] = True + return _read_text( + parser_func=pd.read_json, + path=path, + use_threads=use_threads, + boto3_session=boto3_session, + s3_additional_kwargs=s3_additional_kwargs, + chunksize=chunksize, + dataset=dataset, + **pandas_kwargs, + ) + + +def read_parquet( + path: Union[str, List[str]], + filters: Optional[Union[List[Tuple], List[List[Tuple]]]] = None, + columns: Optional[List[str]] = None, + validate_schema: bool = True, + chunked: Union[bool, int] = False, + dataset: bool = False, + categories: List[str] = None, + use_threads: bool = True, + boto3_session: Optional[boto3.Session] = None, + s3_additional_kwargs: Optional[Dict[str, str]] = None, +) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: + """Read Apache Parquet file(s) from from a received S3 prefix or list of S3 objects paths. + + The concept of Dataset goes beyond the simple idea of files and enable more + complex features like partitioning and catalog integration (AWS Glue Catalog). + + Note + ---- + ``Batching`` (`chunked` argument) (Memory Friendly): + + Will anable the function to return a Iterable of DataFrames instead of a regular DataFrame. + + There are two batching strategies on Wrangler: + + - If **chunked=True**, a new DataFrame will be returned for each file in your path/dataset. + + - If **chunked=INTEGER**, Wrangler will iterate on the data by number of rows igual the received INTEGER. + + `P.S.` `chunked=True` if faster and uses less memory while `chunked=INTEGER` is more precise + in number of rows for each Dataframe. + + + Note + ---- + In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + filters: Union[List[Tuple], List[List[Tuple]]], optional + List of filters to apply on PARTITION columns (PUSH-DOWN filter), like ``[[('x', '=', 0), ...], ...]``. + Ignored if `dataset=False`. + columns : List[str], optional + Names of columns to read from the file(s). + validate_schema: + Check that individual file schemas are all the same / compatible. Schemas within a + folder prefix should all be the same. Disable if you have schemas that are different + and want to disable this check. + chunked : Union[int, bool] + If passed will split the data in a Iterable of DataFrames (Memory friendly). + If `True` wrangler will iterate on the data by files in the most efficient way without guarantee of chunksize. + If an `INTEGER` is passed Wrangler will iterate on the data by number of rows igual the received INTEGER. + dataset: bool + If `True` read a parquet dataset instead of simple file(s) loading all the related partitions as columns. + categories: List[str], optional + List of columns names that should be returned as pandas.Categorical. + Recommended for memory restricted environments. + use_threads : bool + True to enable concurrent requests, False to disable multiple threads. 
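A hedged example of the partition push-down `filters` described above (bucket, partition, and column names are placeholders); note that filters are ignored when `dataset=False`:

```python
import awswrangler as wr

df = wr.s3.read_parquet(
    path="s3://my-bucket/sales/",
    dataset=True,                                           # filters only apply to datasets
    filters=[[("year", "=", "2020"), ("month", "=", "1")]],
    columns=["customer_id", "amount"],                      # column projection (placeholders)
)
```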
+ If enabled os.cpu_count() will be used as the max number of threads. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + s3_additional_kwargs: + Forward to s3fs, useful for server side encryption + https://s3fs.readthedocs.io/en/latest/#serverside-encryption + + Returns + ------- + Union[pandas.DataFrame, Generator[pandas.DataFrame, None, None]] + Pandas DataFrame or a Generator in case of `chunked=True`. + + Examples + -------- + Reading all Parquet files under a prefix + + >>> import awswrangler as wr + >>> df = wr.s3.read_parquet(path='s3://bucket/prefix/') + + Reading all Parquet files under a prefix encrypted with a KMS key + + >>> import awswrangler as wr + >>> df = wr.s3.read_parquet( + ... path='s3://bucket/prefix/', + ... s3_additional_kwargs={ + ... 'ServerSideEncryption': 'aws:kms', + ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' + ... } + ... ) + + Reading all Parquet files from a list + + >>> import awswrangler as wr + >>> df = wr.s3.read_parquet(path=['s3://bucket/filename0.parquet', 's3://bucket/filename1.parquet']) + + Reading in chunks (Chunk by file) + + >>> import awswrangler as wr + >>> dfs = wr.s3.read_parquet(path=['s3://bucket/filename0.csv', 's3://bucket/filename1.csv'], chunked=True) + >>> for df in dfs: + >>> print(df) # Smaller Pandas DataFrame + + Reading in chunks (Chunk by 1MM rows) + + >>> import awswrangler as wr + >>> dfs = wr.s3.read_parquet(path=['s3://bucket/filename0.csv', 's3://bucket/filename1.csv'], chunked=1_000_000) + >>> for df in dfs: + >>> print(df) # 1MM Pandas DataFrame + + """ + data: pyarrow.parquet.ParquetDataset = _read_parquet_init( + path=path, + filters=filters, + dataset=dataset, + categories=categories, + validate_schema=validate_schema, + use_threads=use_threads, + boto3_session=boto3_session, + s3_additional_kwargs=s3_additional_kwargs, + ) + _logger.debug("pyarrow.parquet.ParquetDataset initialized.") + if chunked is False: + return _read_parquet( + data=data, columns=columns, categories=categories, use_threads=use_threads, validate_schema=validate_schema + ) + return _read_parquet_chunked( + data=data, columns=columns, categories=categories, chunked=chunked, use_threads=use_threads + ) + + +def read_parquet_metadata( + path: Union[str, List[str]], + dtype: Optional[Dict[str, str]] = None, + sampling: float = 1.0, + dataset: bool = False, + use_threads: bool = True, + boto3_session: Optional[boto3.Session] = None, +) -> Tuple[Dict[str, str], Optional[Dict[str, str]]]: + """Read Apache Parquet file(s) metadata from from a received S3 prefix or list of S3 objects paths. + + The concept of Dataset goes beyond the simple idea of files and enable more + complex features like partitioning and catalog integration (AWS Glue Catalog). + + Note + ---- + In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). + + Parameters + ---------- + path : Union[str, List[str]] + S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + dtype : Dict[str, str], optional + Dictionary of columns names and Athena/Glue types to be casted. + Useful when you have columns with undetermined data types as partitions columns. + (e.g. {'col name': 'bigint', 'col2 name': 'int'}) + sampling : float + Random sample ratio of files that will have the metadata inspected. + Must be `0.0 < sampling <= 1.0`. + The higher, the more accurate. + The lower, the faster. 
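A hedged example of the `sampling` and `dtype` knobs documented above (path and column names are placeholders): inspect roughly a quarter of the files and force the type of an undetermined partition column:

```python
import awswrangler as wr

columns_types, partitions_types = wr.s3.read_parquet_metadata(
    path="s3://my-bucket/sales/",
    dataset=True,
    sampling=0.25,              # inspect ~25% of the files (0.0 < sampling <= 1.0)
    dtype={"year": "int"},      # cast an undetermined partition column (placeholder)
)
print(columns_types, partitions_types)
```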
+ dataset: bool + If True read a parquet dataset instead of simple file(s) loading all the related partitions as columns. + use_threads : bool + True to enable concurrent requests, False to disable multiple threads. + If enabled os.cpu_count() will be used as the max number of threads. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + Tuple[Dict[str, str], Optional[Dict[str, str]]] + columns_types: Dictionary with keys as column names and vales as + data types (e.g. {'col0': 'bigint', 'col1': 'double'}). / + partitions_types: Dictionary with keys as partition names + and values as data types (e.g. {'col2': 'date'}). + + Examples + -------- + Reading all Parquet files (with partitions) metadata under a prefix + + >>> import awswrangler as wr + >>> columns_types, partitions_types = wr.s3.read_parquet_metadata(path='s3://bucket/prefix/', dataset=True) + + Reading all Parquet files metadata from a list + + >>> import awswrangler as wr + >>> columns_types, partitions_types = wr.s3.read_parquet_metadata(path=[ + ... 's3://bucket/filename0.parquet', + ... 's3://bucket/filename1.parquet' + ... ]) + + """ + return read_parquet_metadata_internal( + path=path, dtype=dtype, sampling=sampling, dataset=dataset, use_threads=use_threads, boto3_session=boto3_session + )[:2] + + +def read_parquet_table( + table: str, + database: str, + filters: Optional[Union[List[Tuple], List[List[Tuple]]]] = None, + columns: Optional[List[str]] = None, + categories: List[str] = None, + chunked: Union[bool, int] = False, + use_threads: bool = True, + boto3_session: Optional[boto3.Session] = None, + s3_additional_kwargs: Optional[Dict[str, str]] = None, +) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: + """Read Apache Parquet table registered on AWS Glue Catalog. + + Note + ---- + ``Batching`` (`chunked` argument) (Memory Friendly): + + Will anable the function to return a Iterable of DataFrames instead of a regular DataFrame. + + There are two batching strategies on Wrangler: + + - If **chunked=True**, a new DataFrame will be returned for each file in your path/dataset. + + - If **chunked=INTEGER**, Wrangler will paginate through files slicing and concatenating + to return DataFrames with the number of row igual the received INTEGER. + + `P.S.` `chunked=True` if faster and uses less memory while `chunked=INTEGER` is more precise + in number of rows for each Dataframe. + + + Note + ---- + In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). + + Parameters + ---------- + table : str + AWS Glue Catalog table name. + database : str + AWS Glue Catalog database name. + filters: Union[List[Tuple], List[List[Tuple]]], optional + List of filters to apply, like ``[[('x', '=', 0), ...], ...]``. + columns : List[str], optional + Names of columns to read from the file(s). + categories: List[str], optional + List of columns names that should be returned as pandas.Categorical. + Recommended for memory restricted environments. + chunked : bool + If True will break the data in smaller DataFrames (Non deterministic number of lines). + Otherwise return a single DataFrame with the whole data. + use_threads : bool + True to enable concurrent requests, False to disable multiple threads. + If enabled os.cpu_count() will be used as the max number of threads. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
+ s3_additional_kwargs: + Forward to s3fs, useful for server side encryption + https://s3fs.readthedocs.io/en/latest/#serverside-encryption + + Returns + ------- + Union[pandas.DataFrame, Generator[pandas.DataFrame, None, None]] + Pandas DataFrame or a Generator in case of `chunked=True`. + + Examples + -------- + Reading Parquet Table + + >>> import awswrangler as wr + >>> df = wr.s3.read_parquet_table(database='...', table='...') + + Reading Parquet Table encrypted + + >>> import awswrangler as wr + >>> df = wr.s3.read_parquet_table( + ... database='...', + ... table='...' + ... s3_additional_kwargs={ + ... 'ServerSideEncryption': 'aws:kms', + ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' + ... } + ... ) + + Reading Parquet Table in chunks (Chunk by file) + + >>> import awswrangler as wr + >>> dfs = wr.s3.read_parquet_table(database='...', table='...', chunked=True) + >>> for df in dfs: + >>> print(df) # Smaller Pandas DataFrame + + Reading in chunks (Chunk by 1MM rows) + + >>> import awswrangler as wr + >>> dfs = wr.s3.read_parquet(path=['s3://bucket/filename0.csv', 's3://bucket/filename1.csv'], chunked=1_000_000) + >>> for df in dfs: + >>> print(df) # 1MM Pandas DataFrame + + """ + path: str = catalog.get_table_location(database=database, table=table, boto3_session=boto3_session) + return read_parquet( + path=path, + filters=filters, + columns=columns, + categories=categories, + chunked=chunked, + dataset=True, + use_threads=use_threads, + boto3_session=boto3_session, + s3_additional_kwargs=s3_additional_kwargs, + ) diff --git a/awswrangler/s3/_wait.py b/awswrangler/s3/_wait.py new file mode 100644 index 000000000..45487db61 --- /dev/null +++ b/awswrangler/s3/_wait.py @@ -0,0 +1,159 @@ +"""Amazon S3 Wait Module (PRIVATE).""" + +import concurrent.futures +import itertools +import logging +from typing import List, Optional, Tuple, Union + +import boto3 # type: ignore + +from awswrangler import _utils + +_logger: logging.Logger = logging.getLogger(__name__) + + +def _wait_objects( + waiter_name: str, + paths: List[str], + delay: Optional[Union[int, float]] = None, + max_attempts: Optional[int] = None, + use_threads: bool = True, + boto3_session: Optional[boto3.Session] = None, +) -> None: + delay = 5 if delay is None else delay + max_attempts = 20 if max_attempts is None else max_attempts + _delay: int = int(delay) if isinstance(delay, float) else delay + if len(paths) < 1: + return None + client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session) + _paths: List[Tuple[str, str]] = [_utils.parse_path(path=p) for p in paths] + if use_threads is False: + waiter = client_s3.get_waiter(waiter_name) + for bucket, key in _paths: + waiter.wait(Bucket=bucket, Key=key, WaiterConfig={"Delay": _delay, "MaxAttempts": max_attempts}) + else: + cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) + with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor: + list( + executor.map( + _wait_objects_concurrent, + _paths, + itertools.repeat(waiter_name), + itertools.repeat(client_s3), + itertools.repeat(_delay), + itertools.repeat(max_attempts), + ) + ) + return None + + +def _wait_objects_concurrent( + path: Tuple[str, str], waiter_name: str, client_s3: boto3.client, delay: int, max_attempts: int +) -> None: + waiter = client_s3.get_waiter(waiter_name) + bucket, key = path + waiter.wait(Bucket=bucket, Key=key, WaiterConfig={"Delay": delay, "MaxAttempts": max_attempts}) + + +def wait_objects_exist( + paths: List[str], + delay: Optional[Union[int, float]] = None, + 
max_attempts: Optional[int] = None, + use_threads: bool = True, + boto3_session: Optional[boto3.Session] = None, +) -> None: + """Wait Amazon S3 objects exist. + + Polls S3.Client.head_object() every 5 seconds (default) until a successful + state is reached. An error is returned after 20 (default) failed checks. + https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Waiter.ObjectExists + + Note + ---- + In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). + + Parameters + ---------- + paths : List[str] + List of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + delay : Union[int,float], optional + The amount of time in seconds to wait between attempts. Default: 5 + max_attempts : int, optional + The maximum number of attempts to be made. Default: 20 + use_threads : bool + True to enable concurrent requests, False to disable multiple threads. + If enabled os.cpu_count() will be used as the max number of threads. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + None + None. + + Examples + -------- + >>> import awswrangler as wr + >>> wr.s3.wait_objects_exist(['s3://bucket/key0', 's3://bucket/key1']) # wait both objects + + """ + return _wait_objects( + waiter_name="object_exists", + paths=paths, + delay=delay, + max_attempts=max_attempts, + use_threads=use_threads, + boto3_session=boto3_session, + ) + + +def wait_objects_not_exist( + paths: List[str], + delay: Optional[Union[int, float]] = None, + max_attempts: Optional[int] = None, + use_threads: bool = True, + boto3_session: Optional[boto3.Session] = None, +) -> None: + """Wait Amazon S3 objects not exist. + + Polls S3.Client.head_object() every 5 seconds (default) until a successful + state is reached. An error is returned after 20 (default) failed checks. + https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Waiter.ObjectNotExists + + Note + ---- + In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). + + Parameters + ---------- + paths : List[str] + List of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). + delay : Union[int,float], optional + The amount of time in seconds to wait between attempts. Default: 5 + max_attempts : int, optional + The maximum number of attempts to be made. Default: 20 + use_threads : bool + True to enable concurrent requests, False to disable multiple threads. + If enabled os.cpu_count() will be used as the max number of threads. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + None + None. 
+ + Examples + -------- + >>> import awswrangler as wr + >>> wr.s3.wait_objects_not_exist(['s3://bucket/key0', 's3://bucket/key1']) # wait both objects not exist + + """ + return _wait_objects( + waiter_name="object_not_exists", + paths=paths, + delay=delay, + max_attempts=max_attempts, + use_threads=use_threads, + boto3_session=boto3_session, + ) diff --git a/awswrangler/s3/_write.py b/awswrangler/s3/_write.py new file mode 100644 index 000000000..c51d4ea14 --- /dev/null +++ b/awswrangler/s3/_write.py @@ -0,0 +1,1093 @@ +"""Amazon S3 Write Module (PRIVATE).""" + +import csv +import logging +import uuid +from typing import Dict, List, Optional, Tuple, Union + +import boto3 # type: ignore +import pandas as pd # type: ignore +import pyarrow as pa # type: ignore +import pyarrow.lib # type: ignore +import pyarrow.parquet # type: ignore +import s3fs # type: ignore + +from awswrangler import _data_types, _utils, catalog, exceptions +from awswrangler.s3._delete import delete_objects +from awswrangler.s3._read import read_parquet_metadata_internal + +_COMPRESSION_2_EXT: Dict[Optional[str], str] = {None: "", "gzip": ".gz", "snappy": ".snappy"} + +_logger: logging.Logger = logging.getLogger(__name__) + + +def _to_csv_dataset( + df: pd.DataFrame, + path: str, + index: bool, + sep: str, + fs: s3fs.S3FileSystem, + use_threads: bool, + mode: str, + dtype: Dict[str, str], + partition_cols: Optional[List[str]] = None, + boto3_session: Optional[boto3.Session] = None, +) -> Tuple[List[str], Dict[str, List[str]]]: + paths: List[str] = [] + partitions_values: Dict[str, List[str]] = {} + path = path if path[-1] == "/" else f"{path}/" + if mode not in ["append", "overwrite", "overwrite_partitions"]: + raise exceptions.InvalidArgumentValue( + f"{mode} is a invalid mode, please use append, overwrite or overwrite_partitions." 
+ ) + if (mode == "overwrite") or ((mode == "overwrite_partitions") and (not partition_cols)): + delete_objects(path=path, use_threads=use_threads, boto3_session=boto3_session) + df = _data_types.cast_pandas_with_athena_types(df=df, dtype=dtype) + _logger.debug("dtypes: %s", df.dtypes) + if not partition_cols: + file_path: str = f"{path}{uuid.uuid4().hex}.csv" + _to_text( + file_format="csv", + df=df, + path=file_path, + fs=fs, + quoting=csv.QUOTE_NONE, + escapechar="\\", + header=False, + date_format="%Y-%m-%d %H:%M:%S.%f", + index=index, + sep=sep, + ) + paths.append(file_path) + else: + for keys, subgroup in df.groupby(by=partition_cols, observed=True): + subgroup = subgroup.drop(partition_cols, axis="columns") + keys = (keys,) if not isinstance(keys, tuple) else keys + subdir = "/".join([f"{name}={val}" for name, val in zip(partition_cols, keys)]) + prefix: str = f"{path}{subdir}/" + if mode == "overwrite_partitions": + delete_objects(path=prefix, use_threads=use_threads, boto3_session=boto3_session) + file_path = f"{prefix}{uuid.uuid4().hex}.csv" + _to_text( + file_format="csv", + df=subgroup, + path=file_path, + fs=fs, + quoting=csv.QUOTE_NONE, + escapechar="\\", + header=False, + date_format="%Y-%m-%d %H:%M:%S.%f", + index=index, + sep=sep, + ) + paths.append(file_path) + partitions_values[prefix] = [str(k) for k in keys] + return paths, partitions_values + + +def _to_text( + file_format: str, + df: pd.DataFrame, + path: str, + fs: Optional[s3fs.S3FileSystem] = None, + boto3_session: Optional[boto3.Session] = None, + s3_additional_kwargs: Optional[Dict[str, str]] = None, + **pandas_kwargs, +) -> None: + if df.empty is True: # pragma: no cover + raise exceptions.EmptyDataFrame() + if fs is None: + fs = _utils.get_fs(session=boto3_session, s3_additional_kwargs=s3_additional_kwargs) + encoding: Optional[str] = pandas_kwargs.get("encoding", None) + newline: Optional[str] = pandas_kwargs.get("line_terminator", None) + with fs.open(path=path, mode="w", encoding=encoding, newline=newline) as f: + if file_format == "csv": + df.to_csv(f, **pandas_kwargs) + elif file_format == "json": + df.to_json(f, **pandas_kwargs) + + +def _to_parquet_dataset( + df: pd.DataFrame, + path: str, + index: bool, + compression: Optional[str], + compression_ext: str, + cpus: int, + fs: s3fs.S3FileSystem, + use_threads: bool, + mode: str, + dtype: Dict[str, str], + partition_cols: Optional[List[str]] = None, + boto3_session: Optional[boto3.Session] = None, +) -> Tuple[List[str], Dict[str, List[str]]]: + paths: List[str] = [] + partitions_values: Dict[str, List[str]] = {} + path = path if path[-1] == "/" else f"{path}/" + if mode not in ["append", "overwrite", "overwrite_partitions"]: + raise exceptions.InvalidArgumentValue( + f"{mode} is a invalid mode, please use append, overwrite or overwrite_partitions." 
+ ) + if (mode == "overwrite") or ((mode == "overwrite_partitions") and (not partition_cols)): + delete_objects(path=path, use_threads=use_threads, boto3_session=boto3_session) + df = _data_types.cast_pandas_with_athena_types(df=df, dtype=dtype) + schema: pa.Schema = _data_types.pyarrow_schema_from_pandas( + df=df, index=index, ignore_cols=partition_cols, dtype=dtype + ) + _logger.debug("schema: \n%s", schema) + if not partition_cols: + file_path: str = f"{path}{uuid.uuid4().hex}{compression_ext}.parquet" + _to_parquet_file( + df=df, schema=schema, path=file_path, index=index, compression=compression, cpus=cpus, fs=fs, dtype=dtype + ) + paths.append(file_path) + else: + for keys, subgroup in df.groupby(by=partition_cols, observed=True): + subgroup = subgroup.drop(partition_cols, axis="columns") + keys = (keys,) if not isinstance(keys, tuple) else keys + subdir = "/".join([f"{name}={val}" for name, val in zip(partition_cols, keys)]) + prefix: str = f"{path}{subdir}/" + if mode == "overwrite_partitions": + delete_objects(path=prefix, use_threads=use_threads, boto3_session=boto3_session) + file_path = f"{prefix}{uuid.uuid4().hex}{compression_ext}.parquet" + _to_parquet_file( + df=subgroup, + schema=schema, + path=file_path, + index=index, + compression=compression, + cpus=cpus, + fs=fs, + dtype=dtype, + ) + paths.append(file_path) + partitions_values[prefix] = [str(k) for k in keys] + return paths, partitions_values + + +def _to_parquet_file( + df: pd.DataFrame, + path: str, + schema: pa.Schema, + index: bool, + compression: Optional[str], + cpus: int, + fs: s3fs.S3FileSystem, + dtype: Dict[str, str], +) -> str: + table: pa.Table = pyarrow.Table.from_pandas(df=df, schema=schema, nthreads=cpus, preserve_index=index, safe=True) + for col_name, col_type in dtype.items(): + if col_name in table.column_names: + col_index = table.column_names.index(col_name) + pyarrow_dtype = _data_types.athena2pyarrow(col_type) + field = pa.field(name=col_name, type=pyarrow_dtype) + table = table.set_column(col_index, field, table.column(col_name).cast(pyarrow_dtype)) + _logger.debug("Casting column %s (%s) to %s (%s)", col_name, col_index, col_type, pyarrow_dtype) + pyarrow.parquet.write_table( + table=table, + where=path, + write_statistics=True, + use_dictionary=True, + filesystem=fs, + coerce_timestamps="ms", + compression=compression, + flavor="spark", + ) + return path + + +def to_csv( # pylint: disable=too-many-arguments,too-many-locals + df: pd.DataFrame, + path: str, + sep: str = ",", + index: bool = True, + columns: Optional[List[str]] = None, + use_threads: bool = True, + boto3_session: Optional[boto3.Session] = None, + s3_additional_kwargs: Optional[Dict[str, str]] = None, + sanitize_columns: bool = False, + dataset: bool = False, + partition_cols: Optional[List[str]] = None, + mode: Optional[str] = None, + catalog_versioning: bool = False, + database: Optional[str] = None, + table: Optional[str] = None, + dtype: Optional[Dict[str, str]] = None, + description: Optional[str] = None, + parameters: Optional[Dict[str, str]] = None, + columns_comments: Optional[Dict[str, str]] = None, + regular_partitions: bool = True, + projection_enabled: bool = False, + projection_types: Optional[Dict[str, str]] = None, + projection_ranges: Optional[Dict[str, str]] = None, + projection_values: Optional[Dict[str, str]] = None, + projection_intervals: Optional[Dict[str, str]] = None, + projection_digits: Optional[Dict[str, str]] = None, + **pandas_kwargs, +) -> Dict[str, Union[List[str], Dict[str, List[str]]]]: + """Write 
CSV file or dataset on Amazon S3.
+
+    The concept of Dataset goes beyond the simple idea of files and enables more
+    complex features like partitioning, casting and catalog integration (Amazon Athena/AWS Glue Catalog).
+
+    Note
+    ----
+    If `dataset=True`, the table name and all column names will be automatically sanitized using
+    `wr.catalog.sanitize_table_name` and `wr.catalog.sanitize_column_name`.
+    Please, pass `sanitize_columns=True` to force the same behaviour for `dataset=False`.
+
+    Note
+    ----
+    On `append` mode, the `parameters` will be upserted on an existing table.
+
+    Note
+    ----
+    In case of `use_threads=True` the number of threads that will be spawned is obtained from os.cpu_count().
+
+    Parameters
+    ----------
+    df: pandas.DataFrame
+        Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
+    path : str
+        Amazon S3 path (e.g. s3://bucket/filename.csv).
+    sep : str
+        String of length 1. Field delimiter for the output file.
+    index : bool
+        Write row names (index).
+    columns : List[str], optional
+        Columns to write.
+    use_threads : bool
+        True to enable concurrent requests, False to disable multiple threads.
+        If enabled os.cpu_count() will be used as the max number of threads.
+    boto3_session : boto3.Session(), optional
+        Boto3 Session. The default boto3 Session will be used if boto3_session receives None.
+    s3_additional_kwargs:
+        Forward to s3fs, useful for server side encryption
+        https://s3fs.readthedocs.io/en/latest/#serverside-encryption
+    sanitize_columns : bool
+        True to sanitize column names or False to keep them as is.
+        True value is forced if `dataset=True`.
+    dataset : bool
+        If True, store a CSV dataset instead of a single file.
+        If True, enables all the following arguments:
+        partition_cols, mode, database, table, description, parameters, columns_comments.
+    partition_cols: List[str], optional
+        List of column names that will be used to create partitions. Only takes effect if dataset=True.
+    mode : str, optional
+        ``append`` (Default), ``overwrite``, ``overwrite_partitions``. Only takes effect if dataset=True.
+    catalog_versioning : bool
+        If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it.
+    database : str, optional
+        Glue/Athena catalog: Database name.
+    table : str, optional
+        Glue/Athena catalog: Table name.
+    dtype : Dict[str, str], optional
+        Dictionary of column names and Athena/Glue types to be cast.
+        Useful when you have columns with undetermined or mixed data types.
+        (e.g. {'col name': 'bigint', 'col2 name': 'int'})
+    description : str, optional
+        Glue/Athena catalog: Table description
+    parameters : Dict[str, str], optional
+        Glue/Athena catalog: Key/value pairs to tag the table.
+    columns_comments : Dict[str, str], optional
+        Glue/Athena catalog:
+        Column names and the related comments (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}).
+    regular_partitions : bool
+        Create regular partitions (non-projected partitions) on the Glue Catalog.
+        Disable when you will work only with Partition Projection.
+        Keeping it enabled even when working with projections is useful to keep
+        Redshift Spectrum working with the regular partitions.
+    projection_enabled : bool
+        Enable Partition Projection on Athena (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html)
+    projection_types : Optional[Dict[str, str]]
+        Dictionary of partitions names and Athena projections types.
+ Valid types: "enum", "integer", "date", "injected" + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html + (e.g. {'col_name': 'enum', 'col2_name': 'integer'}) + projection_ranges: Optional[Dict[str, str]] + Dictionary of partitions names and Athena projections ranges. + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html + (e.g. {'col_name': '0,10', 'col2_name': '-1,8675309'}) + projection_values: Optional[Dict[str, str]] + Dictionary of partitions names and Athena projections values. + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html + (e.g. {'col_name': 'A,B,Unknown', 'col2_name': 'foo,boo,bar'}) + projection_intervals: Optional[Dict[str, str]] + Dictionary of partitions names and Athena projections intervals. + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html + (e.g. {'col_name': '1', 'col2_name': '5'}) + projection_digits: Optional[Dict[str, str]] + Dictionary of partitions names and Athena projections digits. + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html + (e.g. {'col_name': '1', 'col2_name': '2'}) + pandas_kwargs : + keyword arguments forwarded to pandas.DataFrame.to_csv() + https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html + + Returns + ------- + None + None. + + Examples + -------- + Writing single file + + >>> import awswrangler as wr + >>> import pandas as pd + >>> wr.s3.to_csv( + ... df=pd.DataFrame({'col': [1, 2, 3]}), + ... path='s3://bucket/prefix/my_file.csv', + ... ) + { + 'paths': ['s3://bucket/prefix/my_file.csv'], + 'partitions_values': {} + } + + Writing single file encrypted with a KMS key + + >>> import awswrangler as wr + >>> import pandas as pd + >>> wr.s3.to_csv( + ... df=pd.DataFrame({'col': [1, 2, 3]}), + ... path='s3://bucket/prefix/my_file.csv', + ... s3_additional_kwargs={ + ... 'ServerSideEncryption': 'aws:kms', + ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' + ... } + ... ) + { + 'paths': ['s3://bucket/prefix/my_file.csv'], + 'partitions_values': {} + } + + Writing partitioned dataset + + >>> import awswrangler as wr + >>> import pandas as pd + >>> wr.s3.to_csv( + ... df=pd.DataFrame({ + ... 'col': [1, 2, 3], + ... 'col2': ['A', 'A', 'B'] + ... }), + ... path='s3://bucket/prefix', + ... dataset=True, + ... partition_cols=['col2'] + ... ) + { + 'paths': ['s3://.../col2=A/x.csv', 's3://.../col2=B/y.csv'], + 'partitions_values: { + 's3://.../col2=A/': ['A'], + 's3://.../col2=B/': ['B'] + } + } + + Writing dataset to S3 with metadata on Athena/Glue Catalog. + + >>> import awswrangler as wr + >>> import pandas as pd + >>> wr.s3.to_csv( + ... df=pd.DataFrame({ + ... 'col': [1, 2, 3], + ... 'col2': ['A', 'A', 'B'] + ... }), + ... path='s3://bucket/prefix', + ... dataset=True, + ... partition_cols=['col2'], + ... database='default', # Athena/Glue database + ... table='my_table' # Athena/Glue table + ... ) + { + 'paths': ['s3://.../col2=A/x.csv', 's3://.../col2=B/y.csv'], + 'partitions_values: { + 's3://.../col2=A/': ['A'], + 's3://.../col2=B/': ['B'] + } + } + + Writing dataset casting empty column data type + + >>> import awswrangler as wr + >>> import pandas as pd + >>> wr.s3.to_csv( + ... df=pd.DataFrame({ + ... 'col': [1, 2, 3], + ... 'col2': ['A', 'A', 'B'], + ... 'col3': [None, None, None] + ... }), + ... path='s3://bucket/prefix', + ... dataset=True, + ... database='default', # Athena/Glue database + ... 
table='my_table' # Athena/Glue table + ... dtype={'col3': 'date'} + ... ) + { + 'paths': ['s3://.../x.csv'], + 'partitions_values: {} + } + + """ + if (database is None) ^ (table is None): + raise exceptions.InvalidArgumentCombination( + "Please pass database and table arguments to be able to store the metadata into the Athena/Glue Catalog." + ) + if df.empty is True: + raise exceptions.EmptyDataFrame() + + partition_cols = partition_cols if partition_cols else [] + dtype = dtype if dtype else {} + partitions_values: Dict[str, List[str]] = {} + + # Sanitize table to respect Athena's standards + if (sanitize_columns is True) or (dataset is True): + df = catalog.sanitize_dataframe_columns_names(df=df) + partition_cols = [catalog.sanitize_column_name(p) for p in partition_cols] + dtype = {catalog.sanitize_column_name(k): v.lower() for k, v in dtype.items()} + catalog.drop_duplicated_columns(df=df) + + session: boto3.Session = _utils.ensure_session(session=boto3_session) + fs: s3fs.S3FileSystem = _utils.get_fs(session=session, s3_additional_kwargs=s3_additional_kwargs) + if dataset is False: + if partition_cols: + raise exceptions.InvalidArgumentCombination("Please, pass dataset=True to be able to use partition_cols.") + if mode is not None: + raise exceptions.InvalidArgumentCombination("Please pass dataset=True to be able to use mode.") + if columns_comments: + raise exceptions.InvalidArgumentCombination("Please pass dataset=True to be able to use columns_comments.") + if any(arg is not None for arg in (database, table, description, parameters)): + raise exceptions.InvalidArgumentCombination( + "Please pass dataset=True to be able to use any one of these " + "arguments: database, table, description, parameters, " + "columns_comments." + ) + pandas_kwargs["sep"] = sep + pandas_kwargs["index"] = index + pandas_kwargs["columns"] = columns + _to_text(file_format="csv", df=df, path=path, fs=fs, **pandas_kwargs) + paths = [path] + else: + mode = "append" if mode is None else mode + if columns: + df = df[columns] + if ( + (mode in ("append", "overwrite_partitions")) and (database is not None) and (table is not None) + ): # Fetching Catalog Types + catalog_types: Optional[Dict[str, str]] = catalog.get_table_types( + database=database, table=table, boto3_session=session + ) + if catalog_types is not None: + for k, v in catalog_types.items(): + dtype[k] = v + paths, partitions_values = _to_csv_dataset( + df=df, + path=path, + index=index, + sep=sep, + fs=fs, + use_threads=use_threads, + partition_cols=partition_cols, + dtype=dtype, + mode=mode, + boto3_session=session, + ) + if (database is not None) and (table is not None): + columns_types, partitions_types = _data_types.athena_types_from_pandas_partitioned( + df=df, index=index, partition_cols=partition_cols, dtype=dtype, index_left=True + ) + catalog.create_csv_table( + database=database, + table=table, + path=path, + columns_types=columns_types, + partitions_types=partitions_types, + description=description, + parameters=parameters, + columns_comments=columns_comments, + boto3_session=session, + mode=mode, + catalog_versioning=catalog_versioning, + sep=sep, + projection_enabled=projection_enabled, + projection_types=projection_types, + projection_ranges=projection_ranges, + projection_values=projection_values, + projection_intervals=projection_intervals, + projection_digits=projection_digits, + ) + if partitions_values and (regular_partitions is True): + _logger.debug("partitions_values:\n%s", partitions_values) + catalog.add_csv_partitions( + 
database=database, table=table, partitions_values=partitions_values, boto3_session=session, sep=sep
+                )
+    return {"paths": paths, "partitions_values": partitions_values}
+
+
+def to_json(
+    df: pd.DataFrame,
+    path: str,
+    boto3_session: Optional[boto3.Session] = None,
+    s3_additional_kwargs: Optional[Dict[str, str]] = None,
+    **pandas_kwargs,
+) -> None:
+    """Write JSON file on Amazon S3.
+
+    Parameters
+    ----------
+    df: pandas.DataFrame
+        Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
+    path : str
+        Amazon S3 path (e.g. s3://bucket/filename.json).
+    boto3_session : boto3.Session(), optional
+        Boto3 Session. The default boto3 Session will be used if boto3_session receives None.
+    s3_additional_kwargs:
+        Forward to s3fs, useful for server side encryption
+        https://s3fs.readthedocs.io/en/latest/#serverside-encryption
+    pandas_kwargs:
+        keyword arguments forwarded to pandas.DataFrame.to_json()
+        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html
+
+    Returns
+    -------
+    None
+        None.
+
+    Examples
+    --------
+    Writing JSON file
+
+    >>> import awswrangler as wr
+    >>> import pandas as pd
+    >>> wr.s3.to_json(
+    ...     df=pd.DataFrame({'col': [1, 2, 3]}),
+    ...     path='s3://bucket/filename.json',
+    ... )
+
+    Writing JSON file encrypted with a KMS key
+
+    >>> import awswrangler as wr
+    >>> import pandas as pd
+    >>> wr.s3.to_json(
+    ...     df=pd.DataFrame({'col': [1, 2, 3]}),
+    ...     path='s3://bucket/filename.json',
+    ...     s3_additional_kwargs={
+    ...         'ServerSideEncryption': 'aws:kms',
+    ...         'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'
+    ...     }
+    ... )
+
+    """
+    return _to_text(
+        file_format="json",
+        df=df,
+        path=path,
+        boto3_session=boto3_session,
+        s3_additional_kwargs=s3_additional_kwargs,
+        **pandas_kwargs,
+    )
+
+
+def to_parquet(  # pylint: disable=too-many-arguments,too-many-locals
+    df: pd.DataFrame,
+    path: str,
+    index: bool = False,
+    compression: Optional[str] = "snappy",
+    use_threads: bool = True,
+    boto3_session: Optional[boto3.Session] = None,
+    s3_additional_kwargs: Optional[Dict[str, str]] = None,
+    sanitize_columns: bool = False,
+    dataset: bool = False,
+    partition_cols: Optional[List[str]] = None,
+    mode: Optional[str] = None,
+    catalog_versioning: bool = False,
+    database: Optional[str] = None,
+    table: Optional[str] = None,
+    dtype: Optional[Dict[str, str]] = None,
+    description: Optional[str] = None,
+    parameters: Optional[Dict[str, str]] = None,
+    columns_comments: Optional[Dict[str, str]] = None,
+    regular_partitions: bool = True,
+    projection_enabled: bool = False,
+    projection_types: Optional[Dict[str, str]] = None,
+    projection_ranges: Optional[Dict[str, str]] = None,
+    projection_values: Optional[Dict[str, str]] = None,
+    projection_intervals: Optional[Dict[str, str]] = None,
+    projection_digits: Optional[Dict[str, str]] = None,
+) -> Dict[str, Union[List[str], Dict[str, List[str]]]]:
+    """Write Parquet file or dataset on Amazon S3.
+
+    The concept of Dataset goes beyond the simple idea of files and enables more
+    complex features like partitioning, casting and catalog integration (Amazon Athena/AWS Glue Catalog).
+
+    Note
+    ----
+    If `dataset=True`, the table name and all column names will be automatically sanitized using
+    `wr.catalog.sanitize_table_name` and `wr.catalog.sanitize_column_name`.
+    Please, pass `sanitize_columns=True` to force the same behaviour for `dataset=False`.
+
+    Note
+    ----
+    On `append` mode, the `parameters` will be upserted on an existing table.
+ + Note + ---- + In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). + + Parameters + ---------- + df: pandas.DataFrame + Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html + path : str + S3 path (for file e.g. ``s3://bucket/prefix/filename.parquet``) (for dataset e.g. ``s3://bucket/prefix``). + index : bool + True to store the DataFrame index in file, otherwise False to ignore it. + compression: str, optional + Compression style (``None``, ``snappy``, ``gzip``). + use_threads : bool + True to enable concurrent requests, False to disable multiple threads. + If enabled os.cpu_count() will be used as the max number of threads. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + s3_additional_kwargs: + Forward to s3fs, useful for server side encryption + https://s3fs.readthedocs.io/en/latest/#serverside-encryption + sanitize_columns : bool + True to sanitize columns names or False to keep it as is. + True value is forced if `dataset=True`. + dataset : bool + If True store a parquet dataset instead of a single file. + If True, enable all follow arguments: + partition_cols, mode, database, table, description, parameters, columns_comments, . + partition_cols: List[str], optional + List of column names that will be used to create partitions. Only takes effect if dataset=True. + mode: str, optional + ``append`` (Default), ``overwrite``, ``overwrite_partitions``. Only takes effect if dataset=True. + catalog_versioning : bool + If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. + database : str, optional + Glue/Athena catalog: Database name. + table : str, optional + Glue/Athena catalog: Table name. + dtype : Dict[str, str], optional + Dictionary of columns names and Athena/Glue types to be casted. + Useful when you have columns with undetermined or mixed data types. + (e.g. {'col name': 'bigint', 'col2 name': 'int'}) + description : str, optional + Glue/Athena catalog: Table description + parameters : Dict[str, str], optional + Glue/Athena catalog: Key/value pairs to tag the table. + columns_comments : Dict[str, str], optional + Glue/Athena catalog: + Columns names and the related comments (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}). + regular_partitions : bool + Create regular partitions (Non projected partitions) on Glue Catalog. + Disable when you will work only with Partition Projection. + Keep enabled even when working with projections is useful to keep + Redshift Spectrum working with the regular partitions. + projection_enabled : bool + Enable Partition Projection on Athena (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html) + projection_types : Optional[Dict[str, str]] + Dictionary of partitions names and Athena projections types. + Valid types: "enum", "integer", "date", "injected" + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html + (e.g. {'col_name': 'enum', 'col2_name': 'integer'}) + projection_ranges: Optional[Dict[str, str]] + Dictionary of partitions names and Athena projections ranges. + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html + (e.g. {'col_name': '0,10', 'col2_name': '-1,8675309'}) + projection_values: Optional[Dict[str, str]] + Dictionary of partitions names and Athena projections values. 
+ https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html + (e.g. {'col_name': 'A,B,Unknown', 'col2_name': 'foo,boo,bar'}) + projection_intervals: Optional[Dict[str, str]] + Dictionary of partitions names and Athena projections intervals. + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html + (e.g. {'col_name': '1', 'col2_name': '5'}) + projection_digits: Optional[Dict[str, str]] + Dictionary of partitions names and Athena projections digits. + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html + (e.g. {'col_name': '1', 'col2_name': '2'}) + + Returns + ------- + Dict[str, Union[List[str], Dict[str, List[str]]]] + Dictionary with: + 'paths': List of all stored files paths on S3. + 'partitions_values': Dictionary of partitions added with keys as S3 path locations + and values as a list of partitions values as str. + + Examples + -------- + Writing single file + + >>> import awswrangler as wr + >>> import pandas as pd + >>> wr.s3.to_parquet( + ... df=pd.DataFrame({'col': [1, 2, 3]}), + ... path='s3://bucket/prefix/my_file.parquet', + ... ) + { + 'paths': ['s3://bucket/prefix/my_file.parquet'], + 'partitions_values': {} + } + + Writing single file encrypted with a KMS key + + >>> import awswrangler as wr + >>> import pandas as pd + >>> wr.s3.to_parquet( + ... df=pd.DataFrame({'col': [1, 2, 3]}), + ... path='s3://bucket/prefix/my_file.parquet', + ... s3_additional_kwargs={ + ... 'ServerSideEncryption': 'aws:kms', + ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' + ... } + ... ) + { + 'paths': ['s3://bucket/prefix/my_file.parquet'], + 'partitions_values': {} + } + + Writing partitioned dataset + + >>> import awswrangler as wr + >>> import pandas as pd + >>> wr.s3.to_parquet( + ... df=pd.DataFrame({ + ... 'col': [1, 2, 3], + ... 'col2': ['A', 'A', 'B'] + ... }), + ... path='s3://bucket/prefix', + ... dataset=True, + ... partition_cols=['col2'] + ... ) + { + 'paths': ['s3://.../col2=A/x.parquet', 's3://.../col2=B/y.parquet'], + 'partitions_values: { + 's3://.../col2=A/': ['A'], + 's3://.../col2=B/': ['B'] + } + } + + Writing dataset to S3 with metadata on Athena/Glue Catalog. + + >>> import awswrangler as wr + >>> import pandas as pd + >>> wr.s3.to_parquet( + ... df=pd.DataFrame({ + ... 'col': [1, 2, 3], + ... 'col2': ['A', 'A', 'B'] + ... }), + ... path='s3://bucket/prefix', + ... dataset=True, + ... partition_cols=['col2'], + ... database='default', # Athena/Glue database + ... table='my_table' # Athena/Glue table + ... ) + { + 'paths': ['s3://.../col2=A/x.parquet', 's3://.../col2=B/y.parquet'], + 'partitions_values: { + 's3://.../col2=A/': ['A'], + 's3://.../col2=B/': ['B'] + } + } + + Writing dataset casting empty column data type + + >>> import awswrangler as wr + >>> import pandas as pd + >>> wr.s3.to_parquet( + ... df=pd.DataFrame({ + ... 'col': [1, 2, 3], + ... 'col2': ['A', 'A', 'B'], + ... 'col3': [None, None, None] + ... }), + ... path='s3://bucket/prefix', + ... dataset=True, + ... database='default', # Athena/Glue database + ... table='my_table' # Athena/Glue table + ... dtype={'col3': 'date'} + ... ) + { + 'paths': ['s3://.../x.parquet'], + 'partitions_values: {} + } + + """ + if (database is None) ^ (table is None): + raise exceptions.InvalidArgumentCombination( + "Please pass database and table arguments to be able to store the metadata into the Athena/Glue Catalog." 
+ ) + if df.empty is True: + raise exceptions.EmptyDataFrame() + + partition_cols = partition_cols if partition_cols else [] + dtype = dtype if dtype else {} + partitions_values: Dict[str, List[str]] = {} + + # Sanitize table to respect Athena's standards + if (sanitize_columns is True) or (dataset is True): + df = catalog.sanitize_dataframe_columns_names(df=df) + partition_cols = [catalog.sanitize_column_name(p) for p in partition_cols] + dtype = {catalog.sanitize_column_name(k): v.lower() for k, v in dtype.items()} + catalog.drop_duplicated_columns(df=df) + + session: boto3.Session = _utils.ensure_session(session=boto3_session) + cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) + fs: s3fs.S3FileSystem = _utils.get_fs(session=session, s3_additional_kwargs=s3_additional_kwargs) + compression_ext: Optional[str] = _COMPRESSION_2_EXT.get(compression, None) + if compression_ext is None: + raise exceptions.InvalidCompression(f"{compression} is invalid, please use None, snappy or gzip.") + if dataset is False: + if path.endswith("/"): # pragma: no cover + raise exceptions.InvalidArgumentValue( + "If , the argument should be a object path, not a directory." + ) + if partition_cols: + raise exceptions.InvalidArgumentCombination("Please, pass dataset=True to be able to use partition_cols.") + if mode is not None: + raise exceptions.InvalidArgumentCombination("Please pass dataset=True to be able to use mode.") + if any(arg is not None for arg in (database, table, description, parameters)): + raise exceptions.InvalidArgumentCombination( + "Please pass dataset=True to be able to use any one of these " + "arguments: database, table, description, parameters, " + "columns_comments." + ) + df = _data_types.cast_pandas_with_athena_types(df=df, dtype=dtype) + schema: pa.Schema = _data_types.pyarrow_schema_from_pandas( + df=df, index=index, ignore_cols=partition_cols, dtype=dtype + ) + _logger.debug("schema: \n%s", schema) + paths = [ + _to_parquet_file( + df=df, path=path, schema=schema, index=index, compression=compression, cpus=cpus, fs=fs, dtype=dtype + ) + ] + else: + mode = "append" if mode is None else mode + if ( + (mode in ("append", "overwrite_partitions")) and (database is not None) and (table is not None) + ): # Fetching Catalog Types + catalog_types: Optional[Dict[str, str]] = catalog.get_table_types( + database=database, table=table, boto3_session=session + ) + if catalog_types is not None: + for k, v in catalog_types.items(): + dtype[k] = v + paths, partitions_values = _to_parquet_dataset( + df=df, + path=path, + index=index, + compression=compression, + compression_ext=compression_ext, + cpus=cpus, + fs=fs, + use_threads=use_threads, + partition_cols=partition_cols, + dtype=dtype, + mode=mode, + boto3_session=session, + ) + if (database is not None) and (table is not None): + columns_types, partitions_types = _data_types.athena_types_from_pandas_partitioned( + df=df, index=index, partition_cols=partition_cols, dtype=dtype + ) + catalog.create_parquet_table( + database=database, + table=table, + path=path, + columns_types=columns_types, + partitions_types=partitions_types, + compression=compression, + description=description, + parameters=parameters, + columns_comments=columns_comments, + boto3_session=session, + mode=mode, + catalog_versioning=catalog_versioning, + projection_enabled=projection_enabled, + projection_types=projection_types, + projection_ranges=projection_ranges, + projection_values=projection_values, + projection_intervals=projection_intervals, + 
projection_digits=projection_digits,
+            )
+            if partitions_values and (regular_partitions is True):
+                _logger.debug("partitions_values:\n%s", partitions_values)
+                catalog.add_parquet_partitions(
+                    database=database,
+                    table=table,
+                    partitions_values=partitions_values,
+                    compression=compression,
+                    boto3_session=session,
+                )
+    return {"paths": paths, "partitions_values": partitions_values}
+
+
+def store_parquet_metadata(  # pylint: disable=too-many-arguments
+    path: str,
+    database: str,
+    table: str,
+    dtype: Optional[Dict[str, str]] = None,
+    sampling: float = 1.0,
+    dataset: bool = False,
+    use_threads: bool = True,
+    description: Optional[str] = None,
+    parameters: Optional[Dict[str, str]] = None,
+    columns_comments: Optional[Dict[str, str]] = None,
+    compression: Optional[str] = None,
+    mode: str = "overwrite",
+    catalog_versioning: bool = False,
+    regular_partitions: bool = True,
+    projection_enabled: bool = False,
+    projection_types: Optional[Dict[str, str]] = None,
+    projection_ranges: Optional[Dict[str, str]] = None,
+    projection_values: Optional[Dict[str, str]] = None,
+    projection_intervals: Optional[Dict[str, str]] = None,
+    projection_digits: Optional[Dict[str, str]] = None,
+    boto3_session: Optional[boto3.Session] = None,
+) -> Tuple[Dict[str, str], Optional[Dict[str, str]], Optional[Dict[str, List[str]]]]:
+    """Infer and store parquet metadata on AWS Glue Catalog.
+
+    Infer Apache Parquet file(s) metadata from a received S3 prefix or list of S3 object paths
+    and then store it on the AWS Glue Catalog, including all inferred partitions
+    (no need to run 'MSCK REPAIR TABLE').
+
+    The concept of Dataset goes beyond the simple idea of files and enables more
+    complex features like partitioning and catalog integration (AWS Glue Catalog).
+
+    Note
+    ----
+    On `append` mode, the `parameters` will be upserted on an existing table.
+
+    Note
+    ----
+    In case of `use_threads=True` the number of threads that will be spawned is obtained from os.cpu_count().
+
+    Parameters
+    ----------
+    path : Union[str, List[str]]
+        S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]).
+    database : str
+        Glue/Athena catalog: Database name.
+    table : str
+        Glue/Athena catalog: Table name.
+    dtype : Dict[str, str], optional
+        Dictionary of column names and Athena/Glue types to be cast.
+        Useful when you have columns with undetermined data types as partition columns.
+        (e.g. {'col name': 'bigint', 'col2 name': 'int'})
+    sampling : float
+        Random sample ratio of files that will have the metadata inspected.
+        Must be `0.0 < sampling <= 1.0`.
+        The higher, the more accurate.
+        The lower, the faster.
+    dataset: bool
+        If True, read a parquet dataset instead of simple file(s), loading all the related partitions as columns.
+    use_threads : bool
+        True to enable concurrent requests, False to disable multiple threads.
+        If enabled os.cpu_count() will be used as the max number of threads.
+    description: str, optional
+        Glue/Athena catalog: Table description
+    parameters: Dict[str, str], optional
+        Glue/Athena catalog: Key/value pairs to tag the table.
+    columns_comments: Dict[str, str], optional
+        Glue/Athena catalog:
+        Column names and the related comments (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}).
+    compression: str, optional
+        Compression style (``None``, ``snappy``, ``gzip``, etc).
+    mode: str
+        'overwrite' to recreate any possible existing table or 'append' to keep any possible existing table.
+ catalog_versioning : bool + If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. + regular_partitions : bool + Create regular partitions (Non projected partitions) on Glue Catalog. + Disable when you will work only with Partition Projection. + Keep enabled even when working with projections is useful to keep + Redshift Spectrum working with the regular partitions. + projection_enabled : bool + Enable Partition Projection on Athena (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html) + projection_types : Optional[Dict[str, str]] + Dictionary of partitions names and Athena projections types. + Valid types: "enum", "integer", "date", "injected" + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html + (e.g. {'col_name': 'enum', 'col2_name': 'integer'}) + projection_ranges: Optional[Dict[str, str]] + Dictionary of partitions names and Athena projections ranges. + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html + (e.g. {'col_name': '0,10', 'col2_name': '-1,8675309'}) + projection_values: Optional[Dict[str, str]] + Dictionary of partitions names and Athena projections values. + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html + (e.g. {'col_name': 'A,B,Unknown', 'col2_name': 'foo,boo,bar'}) + projection_intervals: Optional[Dict[str, str]] + Dictionary of partitions names and Athena projections intervals. + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html + (e.g. {'col_name': '1', 'col2_name': '5'}) + projection_digits: Optional[Dict[str, str]] + Dictionary of partitions names and Athena projections digits. + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html + (e.g. {'col_name': '1', 'col2_name': '2'}) + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + Tuple[Dict[str, str], Optional[Dict[str, str]], Optional[Dict[str, List[str]]]] + The metadata used to create the Glue Table. + columns_types: Dictionary with keys as column names and vales as + data types (e.g. {'col0': 'bigint', 'col1': 'double'}). / + partitions_types: Dictionary with keys as partition names + and values as data types (e.g. {'col2': 'date'}). / + partitions_values: Dictionary with keys as S3 path locations and values as a + list of partitions values as str (e.g. {'s3://bucket/prefix/y=2020/m=10/': ['2020', '10']}). + + Examples + -------- + Reading all Parquet files metadata under a prefix + + >>> import awswrangler as wr + >>> columns_types, partitions_types, partitions_values = wr.s3.store_parquet_metadata( + ... path='s3://bucket/prefix/', + ... database='...', + ... table='...', + ... dataset=True + ... 
) + + """ + session: boto3.Session = _utils.ensure_session(session=boto3_session) + columns_types: Dict[str, str] + partitions_types: Optional[Dict[str, str]] + partitions_values: Optional[Dict[str, List[str]]] + columns_types, partitions_types, partitions_values = read_parquet_metadata_internal( + path=path, dtype=dtype, sampling=sampling, dataset=dataset, use_threads=use_threads, boto3_session=session + ) + _logger.debug("columns_types: %s", columns_types) + _logger.debug("partitions_types: %s", partitions_types) + _logger.debug("partitions_values: %s", partitions_values) + catalog.create_parquet_table( + database=database, + table=table, + path=path, + columns_types=columns_types, + partitions_types=partitions_types, + description=description, + parameters=parameters, + columns_comments=columns_comments, + mode=mode, + catalog_versioning=catalog_versioning, + projection_enabled=projection_enabled, + projection_types=projection_types, + projection_ranges=projection_ranges, + projection_values=projection_values, + projection_intervals=projection_intervals, + projection_digits=projection_digits, + boto3_session=session, + ) + if (partitions_types is not None) and (partitions_values is not None) and (regular_partitions is True): + catalog.add_parquet_partitions( + database=database, + table=table, + partitions_values=partitions_values, + compression=compression, + boto3_session=session, + ) + return columns_types, partitions_types, partitions_values diff --git a/docs/source/api.rst b/docs/source/api.rst index 5cd8e9e3c..16bb6ed0c 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -42,8 +42,10 @@ AWS Glue Catalog add_csv_partitions add_parquet_partitions create_csv_table + create_database create_parquet_table databases + delete_database delete_table_if_exists does_table_exist drop_duplicated_columns @@ -135,3 +137,50 @@ CloudWatch Logs run_query start_query wait_query + +Amazon QuickSight +----------------- + +.. currentmodule:: awswrangler.quicksight + +.. 
autosummary:: + :toctree: stubs + + cancel_ingestion + create_athena_data_source + create_athena_dataset + create_ingestion + delete_all_dashboards + delete_all_data_sources + delete_all_datasets + delete_all_templates + delete_dashboard + delete_data_source + delete_dataset + delete_template + describe_dashboard + describe_data_source + describe_data_source_permissions + describe_dataset + describe_ingestion + get_dashboard_id + get_dashboard_ids + get_data_source_arn + get_data_source_arns + get_data_source_id + get_data_source_ids + get_dataset_id + get_dataset_ids + get_template_id + get_template_ids + list_dashboards + list_data_sources + list_datasets + list_groups + list_group_memberships + list_iam_policy_assignments + list_iam_policy_assignments_for_user + list_ingestions + list_templates + list_users + list_user_groups diff --git a/docs/source/index.rst b/docs/source/index.rst index 6c0380007..ccd45d34c 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -25,12 +25,28 @@ Quick Start # Retrieving the data from Amazon Athena df = wr.athena.read_sql_query("SELECT * FROM my_table", database="my_db") - # Getting Redshift connection (SQLAlchemy) from Glue Catalog Connections + # Get Redshift connection (SQLAlchemy) from Glue and retrieving data from Redshift Spectrum engine = wr.catalog.get_engine("my-redshift-connection") - - # Retrieving the data from Amazon Redshift Spectrum df = wr.db.read_sql_query("SELECT * FROM external_schema.my_table", con=engine) + # Creating QuickSight Data Source and Dataset to reflect our new table + wr.quicksight.create_athena_data_source("athena-source", allowed_to_manage=["username"]) + wr.quicksight.create_athena_dataset( + name="my-dataset", + database="my_db", + table="my_table", + data_source_name="athena-source", + allowed_to_manage=["username"] + ) + + # Get MySQL connection (SQLAlchemy) from Glue Catalog and LOAD the data into MySQL + engine = wr.catalog.get_engine("my-mysql-connection") + wr.db.to_sql(df, engine, schema="test", name="my_table") + + # Get PostgreSQL connection (SQLAlchemy) from Glue Catalog and LOAD the data into PostgreSQL + engine = wr.catalog.get_engine("my-postgresql-connection") + wr.db.to_sql(df, engine, schema="test", name="my_table") + Read The Docs ------------- diff --git a/docs/source/what.rst b/docs/source/what.rst index 0a169b74d..71c721782 100644 --- a/docs/source/what.rst +++ b/docs/source/what.rst @@ -1,7 +1,7 @@ What is AWS Data Wrangler? ========================== -An `open-source `_ Python package that extends the power of `Pandas `_ library to AWS connecting **DataFrames** and AWS data related services (**Amazon Redshift**, **AWS Glue**, **Amazon Athena**, **Amazon EMR**, etc). +An `open-source `_ Python package that extends the power of `Pandas `_ library to AWS connecting **DataFrames** and AWS data related services (**Amazon Redshift**, **AWS Glue**, **Amazon Athena**, **Amazon EMR**, **Amazon QuickSight**, etc). Built on top of other open-source projects like `Pandas `_, `Apache Arrow `_, `Boto3 `_, `s3fs `_, `SQLAlchemy `_, `Psycopg2 `_ and `PyMySQL `_, it offers abstracted functions to execute usual ETL tasks like load/unload data from **Data Lakes**, **Data Warehouses** and **Databases**. 
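A minimal sketch of that load/read flow, using the S3 write, wait and read helpers documented in this change (`wr.s3.to_parquet`, `wr.s3.wait_objects_exist` and `wr.s3.read_parquet_table`). The bucket, Glue database and table names below are placeholders, and the database is assumed to already exist (it can be created with `wr.catalog.create_database`):

```python
import pandas as pd

import awswrangler as wr

df = pd.DataFrame({"id": [1, 2, 3], "col2": ["A", "A", "B"]})

# Write a partitioned Parquet dataset and register it in the Glue Catalog.
res = wr.s3.to_parquet(
    df=df,
    path="s3://my-bucket/my-prefix/",  # placeholder bucket/prefix
    dataset=True,
    partition_cols=["col2"],
    database="my_db",                  # placeholder database, assumed to exist
    table="my_table",
    mode="overwrite",
)

# Block until the new objects are visible on S3.
wr.s3.wait_objects_exist(paths=res["paths"], use_threads=False)

# Read the table back through its Glue Catalog registration.
df2 = wr.s3.read_parquet_table(database="my_db", table="my_table")
```

Waiting on the returned paths before reading is the same pattern the test suite in this change follows after every write.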
diff --git a/requirements-dev.txt b/requirements-dev.txt index e0abc8e4a..3cb805c67 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,22 +1,22 @@ awscli>=1.18.0,<2.0.0 black~=19.3b0 -pylint~=2.5.2 -flake8~=3.8.2 -mypy~=0.770 +pylint~=2.5.3 +flake8~=3.8.3 +mypy~=0.780 isort~=4.3.21 pydocstyle~=5.0.2 -doc8~=0.8.0 -tox~=3.15.1 -pytest~=5.4.2 +doc8~=0.8.1 +tox~=3.15.2 +pytest~=5.4.3 pytest-cov~=2.9.0 pytest-xdist~=1.32.0 pytest-timeout~=1.3.4 scikit-learn~=0.23.1 -cfn-lint~=0.32.1 +cfn-lint~=0.33.0 cfn-flip~=1.2.3 twine~=3.1.1 wheel~=0.34.2 sphinx~=3.0.4 sphinx_bootstrap_theme~=0.7.1 moto~=1.3.14 -jupyterlab~=2.1.4 \ No newline at end of file +jupyterlab~=2.1.4 diff --git a/requirements.txt b/requirements.txt index c8eedd045..273fff794 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,4 @@ s3fs~=0.4.2 psycopg2-binary~=2.8.0 pymysql~=0.9.0 sqlalchemy-redshift~=0.7.0 -SQLAlchemy>=1.3.10,<1.3.16 \ No newline at end of file +SQLAlchemy>=1.3.10,<1.3.14 diff --git a/testing/test_awswrangler/test_data_lake.py b/testing/test_awswrangler/test_data_lake.py index 19fb1ca19..e0c43de9d 100644 --- a/testing/test_awswrangler/test_data_lake.py +++ b/testing/test_awswrangler/test_data_lake.py @@ -1078,15 +1078,12 @@ def test_catalog(path, database, table): if tbl["Name"] == table: assert tbl["TableType"] == "EXTERNAL_TABLE" # prefix & suffix & name_contains - tables = list( - wr.catalog.get_tables( - name_prefix=table[0], name_contains=table[3], name_suffix=table[-1], catalog_id=account_id + with pytest.raises(wr.exceptions.InvalidArgumentCombination): + list( + wr.catalog.get_tables( + name_prefix=table[0], name_contains=table[3], name_suffix=table[-1], catalog_id=account_id + ) ) - ) - assert len(tables) > 0 - for tbl in tables: - if tbl["Name"] == table: - assert tbl["TableType"] == "EXTERNAL_TABLE" # prefix & suffix tables = list(wr.catalog.get_tables(name_prefix=table[0], name_suffix=table[-1], catalog_id=account_id)) assert len(tables) > 0 diff --git a/testing/test_awswrangler/test_data_lake2.py b/testing/test_awswrangler/test_data_lake2.py index cbb78fb41..05400a615 100644 --- a/testing/test_awswrangler/test_data_lake2.py +++ b/testing/test_awswrangler/test_data_lake2.py @@ -103,11 +103,10 @@ def test_json_chunksize(path): def test_parquet_cast_string(path): df = pd.DataFrame({"id": [1, 2, 3], "value": ["foo", "boo", "bar"]}) path_file = f"{path}0.parquet" - wr.s3.to_parquet(df, path_file, dtype={"id": "string"}) + wr.s3.to_parquet(df, path_file, dtype={"id": "string"}, sanitize_columns=False) wr.s3.wait_objects_exist([path_file]) df2 = wr.s3.read_parquet(path_file) assert str(df2.id.dtypes) == "string" - df2["id"] = df2["id"].astype(int) assert df.shape == df2.shape for col, row in tuple(itertools.product(df.columns, range(3))): assert df[col].iloc[row] == df2[col].iloc[row] @@ -123,8 +122,6 @@ def test_parquet_cast_string_dataset(path, partition_cols): df2 = wr.s3.read_parquet(path, dataset=True).sort_values("id", ignore_index=True) assert str(df2.id.dtypes) == "string" assert str(df2.c3.dtypes) == "string" - df2["id"] = df2["id"].astype(int) - df2["c3"] = df2["c3"].astype(float) assert df.shape == df2.shape for col, row in tuple(itertools.product(df.columns, range(3))): assert df[col].iloc[row] == df2[col].iloc[row] @@ -158,7 +155,7 @@ def test_athena_undefined_column(database): def test_to_parquet_file_sanitize(path): df = pd.DataFrame({"C0": [0, 1], "camelCase": [2, 3], "c**--2": [4, 5]}) path_file = f"{path}0.parquet" - wr.s3.to_parquet(df, path_file) + 
wr.s3.to_parquet(df, path_file, sanitize_columns=True) wr.s3.wait_objects_exist([path_file]) df2 = wr.s3.read_parquet(path_file) assert df.shape == df2.shape @@ -423,3 +420,66 @@ def test_read_partitioned_fwf(path, use_threads, chunksize): else: for d in df2: assert d.shape == (1, 4) + + +def test_glue_database(): + + # Round 1 - Create Database + database_name = f"database_{get_time_str_with_random_suffix()}" + print(f"Database Name: {database_name}") + wr.catalog.create_database(name=database_name, description="Database Description") + databases = wr.catalog.get_databases() + test_database_name = "" + test_database_description = "" + + for database in databases: + if database["Name"] == database_name: + test_database_name = database["Name"] + test_database_description = database["Description"] + + assert test_database_name == database_name + assert test_database_description == "Database Description" + + # Round 2 - Delete Database + print(f"Database Name: {database_name}") + wr.catalog.delete_database(name=database_name) + databases = wr.catalog.get_databases() + test_database_name = "" + test_database_description = "" + + for database in databases: + if database["Name"] == database_name: + test_database_name = database["Name"] + test_database_description = database["Description"] + + assert test_database_name == "" + assert test_database_description == "" + + +def test_list_wrong_path(path): + wrong_path = path.replace("s3://", "") + with pytest.raises(wr.exceptions.InvalidArgumentValue): + wr.s3.list_objects(wrong_path) + + +@pytest.mark.parametrize("sanitize_columns,col", [(True, "foo_boo"), (False, "FooBoo")]) +def test_sanitize_columns(path, sanitize_columns, col): + df = pd.DataFrame({"FooBoo": [1, 2, 3]}) + + # Parquet + file_path = f"{path}0.parquet" + wr.s3.to_parquet(df, path=file_path, sanitize_columns=sanitize_columns) + wr.s3.wait_objects_exist([file_path]) + df = wr.s3.read_parquet(file_path) + assert len(df.index) == 3 + assert len(df.columns) == 1 + assert df.columns == [col] + + # CSV + file_path = f"{path}0.csv" + wr.s3.to_csv(df, path=file_path, sanitize_columns=sanitize_columns, index=False) + wr.s3.wait_objects_exist([file_path]) + df = wr.s3.read_csv(file_path) + assert len(df.index) == 3 + assert len(df.columns) == 1 + assert df.columns == [col] diff --git a/testing/test_awswrangler/test_db.py b/testing/test_awswrangler/test_db.py index 4ff1e68ed..2775c7187 100644 --- a/testing/test_awswrangler/test_db.py +++ b/testing/test_awswrangler/test_db.py @@ -1,5 +1,6 @@ import logging import random +import string import boto3 import pandas as pd @@ -15,6 +16,8 @@ extract_cloudformation_outputs, get_df, get_df_category, + get_time_str_with_random_suffix, + path_generator, ) logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s][%(name)s][%(funcName)s] %(message)s") @@ -27,6 +30,11 @@ def cloudformation_outputs(): yield extract_cloudformation_outputs() +@pytest.fixture(scope="function") +def path(bucket): + yield from path_generator(bucket) + + @pytest.fixture(scope="module") def bucket(cloudformation_outputs): if "BucketName" in cloudformation_outputs: @@ -63,6 +71,15 @@ def glue_database(cloudformation_outputs): yield cloudformation_outputs["GlueDatabaseName"] +@pytest.fixture(scope="function") +def glue_table(glue_database): + name = f"tbl_{get_time_str_with_random_suffix()}" + print(f"Table name: {name}") + wr.catalog.delete_table_if_exists(database=glue_database, table=name) + yield name + wr.catalog.delete_table_if_exists(database=glue_database, 
table=name) + + @pytest.fixture(scope="module") def external_schema(cloudformation_outputs, parameters, glue_database): region = cloudformation_outputs.get("Region") @@ -89,13 +106,14 @@ def test_sql(parameters, db_type): if db_type == "redshift": df.drop(["binary"], axis=1, inplace=True) engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}") + index = True if engine.name == "redshift" else False wr.db.to_sql( df=df, con=engine, name="test_sql", schema=parameters[db_type]["schema"], if_exists="replace", - index=False, + index=index, index_label=None, chunksize=None, method=None, @@ -528,3 +546,58 @@ def test_null(parameters, db_type): df2 = wr.db.read_sql_table(table=table, schema=schema, con=engine) df["id"] = df["id"].astype("Int64") assert pd.concat(objs=[df, df], ignore_index=True).equals(df2) + + +def test_redshift_spectrum_long_string(path, glue_table, glue_database, external_schema): + df = pd.DataFrame( + { + "id": [1, 2], + "col_str": [ + "".join(random.choice(string.ascii_letters) for _ in range(300)), + "".join(random.choice(string.ascii_letters) for _ in range(300)), + ], + } + ) + paths = wr.s3.to_parquet( + df=df, path=path, database=glue_database, table=glue_table, mode="overwrite", index=False, dataset=True + )["paths"] + wr.s3.wait_objects_exist(paths=paths, use_threads=False) + engine = wr.catalog.get_engine(connection="aws-data-wrangler-redshift") + with engine.connect() as con: + cursor = con.execute(f"SELECT * FROM {external_schema}.{glue_table}") + rows = cursor.fetchall() + assert len(rows) == len(df.index) + for row in rows: + assert len(row) == len(df.columns) + + +def test_redshift_copy_unload_long_string(path, parameters): + df = pd.DataFrame( + { + "id": [1, 2], + "col_str": [ + "".join(random.choice(string.ascii_letters) for _ in range(300)), + "".join(random.choice(string.ascii_letters) for _ in range(300)), + ], + } + ) + engine = wr.catalog.get_engine(connection="aws-data-wrangler-redshift") + wr.db.copy_to_redshift( + df=df, + path=path, + con=engine, + schema="public", + table="test_redshift_copy_unload_long_string", + mode="overwrite", + varchar_lengths={"col_str": 300}, + iam_role=parameters["redshift"]["role"], + ) + df2 = wr.db.unload_redshift( + sql="SELECT * FROM public.test_redshift_copy_unload_long_string", + con=engine, + iam_role=parameters["redshift"]["role"], + path=path, + keep_files=False, + ) + assert len(df2.index) == 2 + assert len(df2.columns) == 2 diff --git a/testing/test_awswrangler/test_metadata.py b/testing/test_awswrangler/test_metadata.py index c8f0bc067..d4084dff1 100644 --- a/testing/test_awswrangler/test_metadata.py +++ b/testing/test_awswrangler/test_metadata.py @@ -2,7 +2,7 @@ def test_metadata(): - assert wr.__version__ == "1.4.0" + assert wr.__version__ == "1.5.0" assert wr.__title__ == "awswrangler" assert wr.__description__ == "Pandas on AWS." 
assert wr.__license__ == "Apache License 2.0" diff --git a/tox.ini b/tox.ini index 288b563dc..620eeab7e 100644 --- a/tox.ini +++ b/tox.ini @@ -2,15 +2,17 @@ envlist = py{37,38,36} [testenv] +passenv = AWS_PROFILE AWS_DEFAULT_REGION AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY deps = pytest pytest-xdist pytest-timeout moto commands = - pytest --timeout=900 -n 8 testing/test_awswrangler + pytest --timeout=600 -n 8 testing/test_awswrangler [testenv:py36] +passenv = AWS_PROFILE AWS_DEFAULT_REGION AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY deps = {[testenv]deps} pytest-cov diff --git a/tutorials/002 - Sessions.ipynb b/tutorials/002 - Sessions.ipynb index 2ff88ad1a..b305ed429 100644 --- a/tutorials/002 - Sessions.ipynb +++ b/tutorials/002 - Sessions.ipynb @@ -124,28 +124,6 @@ "\n", "wr.s3.does_object_exist(\"s3://noaa-ghcn-pds/fake\", boto3_session=my_session)" ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "False" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "my_session = boto3.Session(region_name=\"us-east-2\")\n", - "\n", - "wr.s3.does_object_exist(\"s3://noaa-ghcn-pds/fake\", boto3_session=my_session)" - ] } ], "metadata": { diff --git a/tutorials/005 - Glue Catalog.ipynb b/tutorials/005 - Glue Catalog.ipynb index 1e48a20c9..4c3b02540 100644 --- a/tutorials/005 - Glue Catalog.ipynb +++ b/tutorials/005 - Glue Catalog.ipynb @@ -36,10 +36,10 @@ "metadata": {}, "outputs": [ { - "name": "stdin", + "name": "stdout", "output_type": "stream", "text": [ - " ··········································\n" + "········\n" ] } ], @@ -192,77 +192,62 @@ "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
DatabaseDescription
0awswrangler_testAWS Data Wrangler Test Arena - Glue Database
1defaultDefault Hive database
2sampledbSample database
\n", - "
" - ], - "text/plain": [ - " Database Description\n", - "0 awswrangler_test AWS Data Wrangler Test Arena - Glue Database\n", - "1 default Default Hive database\n", - "2 sampledb Sample database" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + " Database Description\n", + "0 aws_data_wrangler AWS Data Wrangler Test Arena - Glue Database\n", + "1 default Default Hive database\n" + ] } ], "source": [ - "wr.catalog.databases()" + "databases = wr.catalog.databases()\n", + "print(databases)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Checking the empty database" + "### Create the database awswrangler_test if not exists" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Database Description\n", + "0 aws_data_wrangler AWS Data Wrangler Test Arena - Glue Database\n", + "1 awswrangler_test \n", + "2 default Default Hive database\n" + ] + } + ], + "source": [ + "if \"awswrangler_test\" not in databases.values:\n", + " wr.catalog.create_database(\"awswrangler_test\")\n", + " print(wr.catalog.databases())\n", + "else:\n", + " print(\"Database awswrangler_test already exists\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Checking the empty database" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, "outputs": [ { "data": { @@ -293,37 +278,17 @@ " \n", " \n", " \n", - " \n", - " 0\n", - " awswrangler_test\n", - " lambda\n", - " \n", - " col1, col2\n", - " \n", - " \n", - " \n", - " 1\n", - " awswrangler_test\n", - " noaa\n", - " \n", - " id, dt, element, value, m_flag, q_flag, s_flag...\n", - " \n", - " \n", " \n", "\n", "" ], "text/plain": [ - " Database Table Description \\\n", - "0 awswrangler_test lambda \n", - "1 awswrangler_test noaa \n", - "\n", - " Columns Partitions \n", - "0 col1, col2 \n", - "1 id, dt, element, value, m_flag, q_flag, s_flag... " + "Empty DataFrame\n", + "Columns: [Database, Table, Description, Columns, Partitions]\n", + "Index: []" ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -341,7 +306,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -408,7 +373,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -463,7 +428,7 @@ "0 crim, zn, indus, chas, nox, rm, age, dis, rad,... " ] }, - "execution_count": 7, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -474,7 +439,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -529,7 +494,7 @@ "0 crim, zn, indus, chas, nox, rm, age, dis, rad,... " ] }, - "execution_count": 8, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -540,7 +505,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -595,7 +560,7 @@ "0 crim, zn, indus, chas, nox, rm, age, dis, rad,... " ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -606,7 +571,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -661,7 +626,7 @@ "0 crim, zn, indus, chas, nox, rm, age, dis, rad,... 
" ] }, - "execution_count": 10, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -679,7 +644,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -846,7 +811,7 @@ "13 " ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -864,7 +829,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -872,19 +837,28 @@ " wr.catalog.delete_table_if_exists(database=\"awswrangler_test\", table=table[\"Name\"])" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Delete Database" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "wr.catalog.delete_database('awswrangler_test')" + ] } ], "metadata": { "kernelspec": { - "display_name": "conda_python3", + "display_name": "Python 3", "language": "python", - "name": "conda_python3" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -896,9 +870,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.7.7" } }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/tutorials/006 - Amazon Athena.ipynb b/tutorials/006 - Amazon Athena.ipynb index c4216346a..a3fcb221d 100644 --- a/tutorials/006 - Amazon Athena.ipynb +++ b/tutorials/006 - Amazon Athena.ipynb @@ -49,10 +49,10 @@ "metadata": {}, "outputs": [ { - "name": "stdin", + "name": "stdout", "output_type": "stream", "text": [ - " ··········································\n" + "········\n" ] } ], @@ -62,6 +62,57 @@ "path = f\"s3://{bucket}/data/\"" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Checking Glue Catalog Databases" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Database Description\n", + "0 aws_data_wrangler AWS Data Wrangler Test Arena - Glue Database\n", + "1 default Default Hive database\n" + ] + } + ], + "source": [ + "databases = wr.catalog.databases()\n", + "print(databases)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Database Description\n", + "0 aws_data_wrangler AWS Data Wrangler Test Arena - Glue Database\n", + "1 awswrangler_test \n", + "2 default Default Hive database\n" + ] + } + ], + "source": [ + "if \"awswrangler_test\" not in databases.values:\n", + " wr.catalog.create_database(\"awswrangler_test\")\n", + " print(wr.catalog.databases())\n", + "else:\n", + " print(\"Database awswrangler_test already exists\")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -73,7 +124,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -175,20 +226,20 @@ " ...\n", " \n", " \n", - " 29240012\n", - " USC00181790\n", - " 1899-12-31\n", - " PRCP\n", + " 1276241\n", + " CA001167635\n", + " 1890-12-31\n", + " SNOW\n", " 0\n", - " P\n", " NaN\n", - " 6\n", - " 1830\n", + " NaN\n", + " C\n", + " NaN\n", " \n", " \n", - " 29240013\n", - " ASN00061000\n", - " 1899-12-31\n", + " 1276242\n", + " ASN00019053\n", + " 1890-12-31\n", " PRCP\n", " 0\n", " NaN\n", @@ -197,9 +248,9 @@ " NaN\n", " \n", " \n", - " 29240014\n", - " ASN00040284\n", - " 1899-12-31\n", + " 
1276243\n", + " ASN00024501\n", + " 1890-12-31\n", " PRCP\n", " 0\n", " NaN\n", @@ -208,22 +259,22 @@ " NaN\n", " \n", " \n", - " 29240015\n", - " ASN00048117\n", - " 1899-12-31\n", + " 1276244\n", + " SF001035700\n", + " 1890-12-31\n", " PRCP\n", " 0\n", " NaN\n", " NaN\n", - " a\n", + " I\n", " NaN\n", " \n", " \n", - " 29240016\n", + " 1276245\n", " ASN00054092\n", - " 1899-12-31\n", + " 1890-12-31\n", " PRCP\n", - " 0\n", + " 15\n", " NaN\n", " NaN\n", " a\n", @@ -231,27 +282,27 @@ " \n", " \n", "\n", - "

29240017 rows × 8 columns

\n", + "

1276246 rows × 8 columns

\n", "" ], "text/plain": [ - " id dt element value m_flag q_flag s_flag obs_time\n", - "0 ASN00070200 1890-01-01 PRCP 0 NaN NaN a NaN\n", - "1 SF000782720 1890-01-01 PRCP 0 NaN NaN I NaN\n", - "2 CA005022790 1890-01-01 TMAX -222 NaN NaN C NaN\n", - "3 CA005022790 1890-01-01 TMIN -261 NaN NaN C NaN\n", - "4 CA005022790 1890-01-01 PRCP 0 NaN NaN C NaN\n", - "... ... ... ... ... ... ... ... ...\n", - "29240012 USC00181790 1899-12-31 PRCP 0 P NaN 6 1830\n", - "29240013 ASN00061000 1899-12-31 PRCP 0 NaN NaN a NaN\n", - "29240014 ASN00040284 1899-12-31 PRCP 0 NaN NaN a NaN\n", - "29240015 ASN00048117 1899-12-31 PRCP 0 NaN NaN a NaN\n", - "29240016 ASN00054092 1899-12-31 PRCP 0 NaN NaN a NaN\n", + " id dt element value m_flag q_flag s_flag obs_time\n", + "0 ASN00070200 1890-01-01 PRCP 0 NaN NaN a NaN\n", + "1 SF000782720 1890-01-01 PRCP 0 NaN NaN I NaN\n", + "2 CA005022790 1890-01-01 TMAX -222 NaN NaN C NaN\n", + "3 CA005022790 1890-01-01 TMIN -261 NaN NaN C NaN\n", + "4 CA005022790 1890-01-01 PRCP 0 NaN NaN C NaN\n", + "... ... ... ... ... ... ... ... ...\n", + "1276241 CA001167635 1890-12-31 SNOW 0 NaN NaN C NaN\n", + "1276242 ASN00019053 1890-12-31 PRCP 0 NaN NaN a NaN\n", + "1276243 ASN00024501 1890-12-31 PRCP 0 NaN NaN a NaN\n", + "1276244 SF001035700 1890-12-31 PRCP 0 NaN NaN I NaN\n", + "1276245 ASN00054092 1890-12-31 PRCP 15 NaN NaN a NaN\n", "\n", - "[29240017 rows x 8 columns]" + "[1276246 rows x 8 columns]" ] }, - "execution_count": 3, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -269,7 +320,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -285,7 +336,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -388,7 +439,7 @@ "7 obs_time string False " ] }, - "execution_count": 5, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -406,15 +457,15 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 19.7 s, sys: 5.2 s, total: 24.9 s\n", - "Wall time: 45 s\n" + "CPU times: user 1.57 s, sys: 454 ms, total: 2.02 s\n", + "Wall time: 46.6 s\n" ] }, { @@ -451,8 +502,8 @@ " \n", " \n", " 0\n", - " ASN00047014\n", - " 1892-01-16\n", + " ASN00061069\n", + " 1890-01-01\n", " PRCP\n", " 0\n", " <NA>\n", @@ -462,46 +513,46 @@ " \n", " \n", " 1\n", - " ASN00056032\n", - " 1892-01-16\n", + " USC00212904\n", + " 1890-01-01\n", " PRCP\n", " 0\n", " <NA>\n", " <NA>\n", - " a\n", + " 6\n", " <NA>\n", " \n", " \n", " 2\n", - " KG000036948\n", - " 1892-01-16\n", - " PRCP\n", - " 0\n", + " USC00212904\n", + " 1890-01-01\n", + " SNWD\n", + " 305\n", " <NA>\n", " <NA>\n", - " I\n", + " 6\n", " <NA>\n", " \n", " \n", " 3\n", - " CA005010868\n", - " 1892-01-16\n", + " ASN00019052\n", + " 1890-01-01\n", " PRCP\n", " 0\n", " <NA>\n", " <NA>\n", - " C\n", + " a\n", " <NA>\n", " \n", " \n", " 4\n", - " CA005010868\n", - " 1892-01-16\n", - " SNOW\n", + " RSM00022112\n", + " 1890-01-01\n", + " PRCP\n", " 0\n", " <NA>\n", " <NA>\n", - " C\n", + " I\n", " <NA>\n", " \n", " \n", @@ -516,83 +567,83 @@ " ...\n", " \n", " \n", - " 29240012\n", - " USC00303773\n", - " 1899-12-31\n", - " SNOW\n", + " 1276241\n", + " ASN00075035\n", + " 1890-10-28\n", + " PRCP\n", " 0\n", " <NA>\n", " <NA>\n", - " 6\n", + " a\n", " <NA>\n", " \n", " \n", - " 29240013\n", - " USC00165090\n", - " 1899-12-31\n", - " TMAX\n", - " 100\n", 
+ " 1276242\n", + " SF001988360\n", + " 1890-10-28\n", + " PRCP\n", + " 51\n", " <NA>\n", " <NA>\n", - " 6\n", + " I\n", " <NA>\n", " \n", " \n", - " 29240014\n", - " USC00165090\n", - " 1899-12-31\n", - " TMIN\n", - " -33\n", + " 1276243\n", + " ASN00048021\n", + " 1890-10-28\n", + " PRCP\n", + " 0\n", " <NA>\n", " <NA>\n", - " 6\n", + " a\n", " <NA>\n", " \n", " \n", - " 29240015\n", - " USC00165090\n", - " 1899-12-31\n", + " 1276244\n", + " USC00412758\n", + " 1890-10-28\n", " PRCP\n", - " 51\n", + " 0\n", " <NA>\n", " <NA>\n", " 6\n", " <NA>\n", " \n", " \n", - " 29240016\n", - " USC00165090\n", - " 1899-12-31\n", - " SNOW\n", - " 51\n", + " 1276245\n", + " SF000440500\n", + " 1890-10-28\n", + " PRCP\n", + " 0\n", " <NA>\n", " <NA>\n", - " 6\n", + " I\n", " <NA>\n", " \n", " \n", "\n", - "

29240017 rows × 8 columns

\n", + "

1276246 rows × 8 columns

\n", "" ], "text/plain": [ - " id dt element value m_flag q_flag s_flag obs_time\n", - "0 ASN00047014 1892-01-16 PRCP 0 a \n", - "1 ASN00056032 1892-01-16 PRCP 0 a \n", - "2 KG000036948 1892-01-16 PRCP 0 I \n", - "3 CA005010868 1892-01-16 PRCP 0 C \n", - "4 CA005010868 1892-01-16 SNOW 0 C \n", - "... ... ... ... ... ... ... ... ...\n", - "29240012 USC00303773 1899-12-31 SNOW 0 6 \n", - "29240013 USC00165090 1899-12-31 TMAX 100 6 \n", - "29240014 USC00165090 1899-12-31 TMIN -33 6 \n", - "29240015 USC00165090 1899-12-31 PRCP 51 6 \n", - "29240016 USC00165090 1899-12-31 SNOW 51 6 \n", + " id dt element value m_flag q_flag s_flag obs_time\n", + "0 ASN00061069 1890-01-01 PRCP 0 a \n", + "1 USC00212904 1890-01-01 PRCP 0 6 \n", + "2 USC00212904 1890-01-01 SNWD 305 6 \n", + "3 ASN00019052 1890-01-01 PRCP 0 a \n", + "4 RSM00022112 1890-01-01 PRCP 0 I \n", + "... ... ... ... ... ... ... ... ...\n", + "1276241 ASN00075035 1890-10-28 PRCP 0 a \n", + "1276242 SF001988360 1890-10-28 PRCP 51 I \n", + "1276243 ASN00048021 1890-10-28 PRCP 0 a \n", + "1276244 USC00412758 1890-10-28 PRCP 0 6 \n", + "1276245 SF000440500 1890-10-28 PRCP 0 I \n", "\n", - "[29240017 rows x 8 columns]" + "[1276246 rows x 8 columns]" ] }, - "execution_count": 6, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -612,15 +663,15 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 8min 33s, sys: 6.11 s, total: 8min 39s\n", - "Wall time: 12min 28s\n" + "CPU times: user 21.2 s, sys: 1.8 s, total: 23 s\n", + "Wall time: 6min 22s\n" ] }, { @@ -722,83 +773,83 @@ " ...\n", " \n", " \n", - " 29240012\n", - " USC00181790\n", - " 1899-12-31\n", - " PRCP\n", + " 1276241\n", + " CA006131910\n", + " 1890-12-31\n", + " SNOW\n", " 0\n", - " P\n", " <NA>\n", - " 6\n", - " 1830\n", + " <NA>\n", + " C\n", + " <NA>\n", " \n", " \n", - " 29240013\n", - " ASN00061000\n", - " 1899-12-31\n", - " PRCP\n", - " 0\n", + " 1276242\n", + " USC00174230\n", + " 1890-12-31\n", + " TMAX\n", + " -106\n", " <NA>\n", " <NA>\n", - " a\n", + " 6\n", " <NA>\n", " \n", " \n", - " 29240014\n", - " ASN00040284\n", - " 1899-12-31\n", - " PRCP\n", - " 0\n", + " 1276243\n", + " USC00174230\n", + " 1890-12-31\n", + " TMIN\n", + " -244\n", " <NA>\n", " <NA>\n", - " a\n", + " 6\n", " <NA>\n", " \n", " \n", - " 29240015\n", - " ASN00048117\n", - " 1899-12-31\n", + " 1276244\n", + " USC00174230\n", + " 1890-12-31\n", " PRCP\n", " 0\n", + " P\n", " <NA>\n", - " <NA>\n", - " a\n", + " 6\n", " <NA>\n", " \n", " \n", - " 29240016\n", - " ASN00054092\n", - " 1899-12-31\n", - " PRCP\n", + " 1276245\n", + " USC00174230\n", + " 1890-12-31\n", + " SNOW\n", " 0\n", " <NA>\n", " <NA>\n", - " a\n", + " 6\n", " <NA>\n", " \n", " \n", "\n", - "

29240017 rows × 8 columns

\n", + "

1276246 rows × 8 columns

\n", "" ], "text/plain": [ - " id dt element value m_flag q_flag s_flag obs_time\n", - "0 ASN00070200 1890-01-01 PRCP 0 a \n", - "1 SF000782720 1890-01-01 PRCP 0 I \n", - "2 CA005022790 1890-01-01 TMAX -222 C \n", - "3 CA005022790 1890-01-01 TMIN -261 C \n", - "4 CA005022790 1890-01-01 PRCP 0 C \n", - "... ... ... ... ... ... ... ... ...\n", - "29240012 USC00181790 1899-12-31 PRCP 0 P 6 1830\n", - "29240013 ASN00061000 1899-12-31 PRCP 0 a \n", - "29240014 ASN00040284 1899-12-31 PRCP 0 a \n", - "29240015 ASN00048117 1899-12-31 PRCP 0 a \n", - "29240016 ASN00054092 1899-12-31 PRCP 0 a \n", + " id dt element value m_flag q_flag s_flag obs_time\n", + "0 ASN00070200 1890-01-01 PRCP 0 a \n", + "1 SF000782720 1890-01-01 PRCP 0 I \n", + "2 CA005022790 1890-01-01 TMAX -222 C \n", + "3 CA005022790 1890-01-01 TMIN -261 C \n", + "4 CA005022790 1890-01-01 PRCP 0 C \n", + "... ... ... ... ... ... ... ... ...\n", + "1276241 CA006131910 1890-12-31 SNOW 0 C \n", + "1276242 USC00174230 1890-12-31 TMAX -106 6 \n", + "1276243 USC00174230 1890-12-31 TMIN -244 6 \n", + "1276244 USC00174230 1890-12-31 PRCP 0 P 6 \n", + "1276245 USC00174230 1890-12-31 SNOW 0 6 \n", "\n", - "[29240017 rows x 8 columns]" + "[1276246 rows x 8 columns]" ] }, - "execution_count": 7, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -818,15 +869,15 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 4.12 s, sys: 1.75 s, total: 5.87 s\n", - "Wall time: 31 s\n" + "CPU times: user 748 ms, sys: 279 ms, total: 1.03 s\n", + "Wall time: 48.8 s\n" ] }, { @@ -863,58 +914,58 @@ " \n", " \n", " 0\n", - " CA008101170\n", + " ASN00061069\n", " 1890-01-01\n", - " TMIN\n", - " -217\n", + " PRCP\n", + " 0\n", " NaN\n", " NaN\n", - " C\n", + " a\n", " NaN\n", " \n", " \n", " 1\n", - " CA008101170\n", + " USC00212904\n", " 1890-01-01\n", " PRCP\n", " 0\n", " NaN\n", " NaN\n", - " C\n", + " 6\n", " NaN\n", " \n", " \n", " 2\n", - " CA008101170\n", + " USC00212904\n", " 1890-01-01\n", - " SNOW\n", - " 0\n", + " SNWD\n", + " 305\n", " NaN\n", " NaN\n", - " C\n", + " 6\n", " NaN\n", " \n", " \n", " 3\n", - " USC00435733\n", + " ASN00019052\n", " 1890-01-01\n", - " TMAX\n", - " 33\n", + " PRCP\n", + " 0\n", " NaN\n", " NaN\n", - " 6\n", - " 1700\n", + " a\n", + " NaN\n", " \n", " \n", " 4\n", - " USC00435733\n", + " RSM00022112\n", " 1890-01-01\n", - " TMIN\n", - " -122\n", + " PRCP\n", + " 0\n", " NaN\n", " NaN\n", - " 6\n", - " 1700\n", + " I\n", + " NaN\n", " \n", " \n", " ...\n", @@ -928,83 +979,83 @@ " ...\n", " \n", " \n", - " 29240012\n", - " USC00395481\n", - " 1899-12-31\n", - " SNOW\n", + " 1276241\n", + " SF004323870\n", + " 1890-01-03\n", + " PRCP\n", " 0\n", " NaN\n", " NaN\n", - " 6\n", + " I\n", " NaN\n", " \n", " \n", - " 29240013\n", - " ASN00063055\n", - " 1899-12-31\n", + " 1276242\n", + " SF001018040\n", + " 1890-01-03\n", " PRCP\n", " 0\n", " NaN\n", " NaN\n", - " a\n", + " I\n", " NaN\n", " \n", " \n", - " 29240014\n", - " USC00357814\n", - " 1899-12-31\n", - " TMAX\n", - " 78\n", + " 1276243\n", + " LG000026314\n", + " 1890-01-03\n", + " PRCP\n", + " 0\n", " NaN\n", " NaN\n", - " 6\n", + " I\n", " NaN\n", " \n", " \n", - " 29240015\n", - " USC00357814\n", - " 1899-12-31\n", - " TMIN\n", - " 0\n", + " 1276244\n", + " CA004016320\n", + " 1890-01-03\n", + " TMAX\n", + " -278\n", " NaN\n", " NaN\n", - " 6\n", + " C\n", " NaN\n", " \n", " \n", - " 29240016\n", - " USC00357814\n", - " 
1899-12-31\n", - " PRCP\n", - " 102\n", + " 1276245\n", + " CA004016320\n", + " 1890-01-03\n", + " TMIN\n", + " -383\n", " NaN\n", " NaN\n", - " 6\n", + " C\n", " NaN\n", " \n", " \n", "\n", - "

29240017 rows × 8 columns

\n", + "

1276246 rows × 8 columns

\n", "" ], "text/plain": [ - " id dt element value m_flag q_flag s_flag obs_time\n", - "0 CA008101170 1890-01-01 TMIN -217 NaN NaN C NaN\n", - "1 CA008101170 1890-01-01 PRCP 0 NaN NaN C NaN\n", - "2 CA008101170 1890-01-01 SNOW 0 NaN NaN C NaN\n", - "3 USC00435733 1890-01-01 TMAX 33 NaN NaN 6 1700\n", - "4 USC00435733 1890-01-01 TMIN -122 NaN NaN 6 1700\n", - "... ... ... ... ... ... ... ... ...\n", - "29240012 USC00395481 1899-12-31 SNOW 0 NaN NaN 6 NaN\n", - "29240013 ASN00063055 1899-12-31 PRCP 0 NaN NaN a NaN\n", - "29240014 USC00357814 1899-12-31 TMAX 78 NaN NaN 6 NaN\n", - "29240015 USC00357814 1899-12-31 TMIN 0 NaN NaN 6 NaN\n", - "29240016 USC00357814 1899-12-31 PRCP 102 NaN NaN 6 NaN\n", + " id dt element value m_flag q_flag s_flag obs_time\n", + "0 ASN00061069 1890-01-01 PRCP 0 NaN NaN a NaN\n", + "1 USC00212904 1890-01-01 PRCP 0 NaN NaN 6 NaN\n", + "2 USC00212904 1890-01-01 SNWD 305 NaN NaN 6 NaN\n", + "3 ASN00019052 1890-01-01 PRCP 0 NaN NaN a NaN\n", + "4 RSM00022112 1890-01-01 PRCP 0 NaN NaN I NaN\n", + "... ... ... ... ... ... ... ... ...\n", + "1276241 SF004323870 1890-01-03 PRCP 0 NaN NaN I NaN\n", + "1276242 SF001018040 1890-01-03 PRCP 0 NaN NaN I NaN\n", + "1276243 LG000026314 1890-01-03 PRCP 0 NaN NaN I NaN\n", + "1276244 CA004016320 1890-01-03 TMAX -278 NaN NaN C NaN\n", + "1276245 CA004016320 1890-01-03 TMIN -383 NaN NaN C NaN\n", "\n", - "[29240017 rows x 8 columns]" + "[1276246 rows x 8 columns]" ] }, - "execution_count": 8, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -1024,19 +1075,18 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ + "110592\n", + "150870\n", + "1024\n", "1024\n", - "50176\n", - "2278400\n", - "9641681\n", - "9716736\n", - "7552000\n" + "1012736\n" ] } ], @@ -1054,16 +1104,14 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "10000000\n", - "10000000\n", - "9240017\n" + "1276246\n" ] } ], @@ -1088,7 +1136,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -1104,7 +1152,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -1112,19 +1160,28 @@ " wr.catalog.delete_table_if_exists(database=\"awswrangler_test\", table=table[\"Name\"])" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Delete Database" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "wr.catalog.delete_database('awswrangler_test')" + ] } ], "metadata": { "kernelspec": { - "display_name": "conda_python3", + "display_name": "Python 3", "language": "python", - "name": "conda_python3" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -1136,7 +1193,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.7.7" }, "pycharm": { "stem_cell": { diff --git a/tutorials/008 - Redshift - Copy & Unload.ipynb b/tutorials/008 - Redshift - Copy & Unload.ipynb index 7bb23da14..14133f101 100644 --- a/tutorials/008 - Redshift - Copy & Unload.ipynb +++ b/tutorials/008 - Redshift - Copy & Unload.ipynb @@ -14,7 +14,7 @@ "\n", "2 - [UNLOAD](https://docs.aws.amazon.com/redshift/latest/dg/r_UNLOAD.html)\n", 
"\n", - "Let's take a look and how Wrangler could can use it." + "Let's take a look and how Wrangler can use it." ] }, { @@ -781,17 +781,8 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" - }, - "pycharm": { - "stem_cell": { - "cell_type": "raw", - "metadata": { - "collapsed": false - }, - "source": [] - } } }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/tutorials/018 - QuickSight.ipynb b/tutorials/018 - QuickSight.ipynb new file mode 100644 index 000000000..a90fe6573 --- /dev/null +++ b/tutorials/018 - QuickSight.ipynb @@ -0,0 +1,1298 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![AWS Data Wrangler](_static/logo.png \"AWS Data Wrangler\")](https://github.com/awslabs/aws-data-wrangler)\n", + "\n", + "# 18 - QuickSight\n", + "\n", + "For this tutorial we will use the public AWS COVID-19 data lake.\n", + "\n", + "References:\n", + "\n", + "* [A public data lake for analysis of COVID-19 data](https://aws.amazon.com/blogs/big-data/a-public-data-lake-for-analysis-of-covid-19-data/)\n", + "* [Exploring the public AWS COVID-19 data lake](https://aws.amazon.com/blogs/big-data/exploring-the-public-aws-covid-19-data-lake/)\n", + "* [CloudFormation template](https://covid19-lake.s3.us-east-2.amazonaws.com/cfn/CovidLakeStack.template.json)\n", + "\n", + "*Please, install the Cloudformation template above to have access to the public data lake.*\n", + "\n", + "*P.S. To be able to access the public data lake, you must allow explicitly QuickSight to access the related external bucket.*" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "pycharm": { + "is_executing": false + } + }, + "outputs": [], + "source": [ + "import awswrangler as wr\n", + "from time import sleep" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "List users of QuickSight account" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'username': 'dev', 'role': 'ADMIN'}]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[{\"username\": user[\"UserName\"], \"role\": user[\"Role\"]} for user in wr.quicksight.list_users('default')]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DatabaseDescription
0aws_data_wranglerAWS Data Wrangler Test Arena - Glue Database
1awswrangler_test
2covid-19
3defaultDefault Hive database
\n", + "
" + ], + "text/plain": [ + " Database Description\n", + "0 aws_data_wrangler AWS Data Wrangler Test Arena - Glue Database\n", + "1 awswrangler_test \n", + "2 covid-19 \n", + "3 default Default Hive database" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wr.catalog.databases()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DatabaseTableDescriptionColumnsPartitions
0covid-19alleninstitute_comprehend_medicalComprehend Medical results run against Allen I...paper_id, date, dx_name, test_name, procedure_...
1covid-19alleninstitute_metadataMetadata on papers pulled from the Allen Insti...cord_uid, sha, source_x, title, doi, pmcid, pu...
2covid-19country_codesLookup table for country codescountry, alpha-2 code, alpha-3 code, numeric c...
3covid-19county_populationsLookup table for population for each county ba...id, id2, county, state, population estimate 2018
4covid-19covid_knowledge_graph_edgesAWS Knowledge Graph for COVID-19 dataid, label, from, to, score
5covid-19covid_knowledge_graph_nodes_authorAWS Knowledge Graph for COVID-19 dataid, label, first, last, full_name
6covid-19covid_knowledge_graph_nodes_conceptAWS Knowledge Graph for COVID-19 dataid, label, entity, concept
7covid-19covid_knowledge_graph_nodes_institutionAWS Knowledge Graph for COVID-19 dataid, label, institution, country, settlement
8covid-19covid_knowledge_graph_nodes_paperAWS Knowledge Graph for COVID-19 dataid, label, doi, sha_code, publish_time, source...
9covid-19covid_knowledge_graph_nodes_topicAWS Knowledge Graph for COVID-19 dataid, label, topic, topic_num
10covid-19covid_testing_states_dailyUSA total test daily trend by state. Sourced ...date, state, positive, negative, pending, hosp...
11covid-19covid_testing_us_dailyUSA total test daily trend. Sourced from covi...date, states, positive, negative, posneg, pend...
12covid-19covid_testing_us_totalUSA total tests. Sourced from covidtracking.c...positive, negative, posneg, hospitalized, deat...
13covid-19covidcast_dataCMU Delphi's COVID-19 Surveillance Datadata_source, signal, geo_type, time_value, geo...
14covid-19covidcast_metadataCMU Delphi's COVID-19 Surveillance Metadatadata_source, signal, time_type, geo_type, min_...
15covid-19enigma_jhuJohns Hopkins University Consolidated data on ...fips, admin2, province_state, country_region, ...
16covid-19enigma_jhu_timeseriesJohns Hopkins University data on COVID-19 case...uid, fips, iso2, iso3, code3, admin2, latitude...
17covid-19hospital_bedsData on hospital beds and their utilization in...objectid, hospital_name, hospital_type, hq_add...
18covid-19nytimes_countiesData on COVID-19 cases from NY Times at US cou...date, county, state, fips, cases, deaths
19covid-19nytimes_statesData on COVID-19 cases from NY Times at US sta...date, state, fips, cases, deaths
20covid-19prediction_models_county_predictionsCounty-level Predictions Data. Sourced from Yu...countyfips, countyname, statename, severity_co...
21covid-19prediction_models_severity_indexSeverity Index models. Sourced from Yu Group a...severity_1-day, severity_2-day, severity_3-day...
22covid-19tableau_covid_datahubCOVID-19 data that has been gathered and unifi...country_short_name, country_alpha_3_code, coun...
23covid-19tableau_jhuJohns Hopkins University data on COVID-19 case...case_type, cases, difference, date, country_re...
24covid-19us_state_abbreviationsLookup table for US state abbreviationsstate, abbreviation
25covid-19world_cases_deaths_testingData on confirmed cases, deaths, and testing. ...iso_code, location, date, total_cases, new_cas...
\n", + "
" + ], + "text/plain": [ + " Database Table \\\n", + "0 covid-19 alleninstitute_comprehend_medical \n", + "1 covid-19 alleninstitute_metadata \n", + "2 covid-19 country_codes \n", + "3 covid-19 county_populations \n", + "4 covid-19 covid_knowledge_graph_edges \n", + "5 covid-19 covid_knowledge_graph_nodes_author \n", + "6 covid-19 covid_knowledge_graph_nodes_concept \n", + "7 covid-19 covid_knowledge_graph_nodes_institution \n", + "8 covid-19 covid_knowledge_graph_nodes_paper \n", + "9 covid-19 covid_knowledge_graph_nodes_topic \n", + "10 covid-19 covid_testing_states_daily \n", + "11 covid-19 covid_testing_us_daily \n", + "12 covid-19 covid_testing_us_total \n", + "13 covid-19 covidcast_data \n", + "14 covid-19 covidcast_metadata \n", + "15 covid-19 enigma_jhu \n", + "16 covid-19 enigma_jhu_timeseries \n", + "17 covid-19 hospital_beds \n", + "18 covid-19 nytimes_counties \n", + "19 covid-19 nytimes_states \n", + "20 covid-19 prediction_models_county_predictions \n", + "21 covid-19 prediction_models_severity_index \n", + "22 covid-19 tableau_covid_datahub \n", + "23 covid-19 tableau_jhu \n", + "24 covid-19 us_state_abbreviations \n", + "25 covid-19 world_cases_deaths_testing \n", + "\n", + " Description \\\n", + "0 Comprehend Medical results run against Allen I... \n", + "1 Metadata on papers pulled from the Allen Insti... \n", + "2 Lookup table for country codes \n", + "3 Lookup table for population for each county ba... \n", + "4 AWS Knowledge Graph for COVID-19 data \n", + "5 AWS Knowledge Graph for COVID-19 data \n", + "6 AWS Knowledge Graph for COVID-19 data \n", + "7 AWS Knowledge Graph for COVID-19 data \n", + "8 AWS Knowledge Graph for COVID-19 data \n", + "9 AWS Knowledge Graph for COVID-19 data \n", + "10 USA total test daily trend by state. Sourced ... \n", + "11 USA total test daily trend. Sourced from covi... \n", + "12 USA total tests. Sourced from covidtracking.c... \n", + "13 CMU Delphi's COVID-19 Surveillance Data \n", + "14 CMU Delphi's COVID-19 Surveillance Metadata \n", + "15 Johns Hopkins University Consolidated data on ... \n", + "16 Johns Hopkins University data on COVID-19 case... \n", + "17 Data on hospital beds and their utilization in... \n", + "18 Data on COVID-19 cases from NY Times at US cou... \n", + "19 Data on COVID-19 cases from NY Times at US sta... \n", + "20 County-level Predictions Data. Sourced from Yu... \n", + "21 Severity Index models. Sourced from Yu Group a... \n", + "22 COVID-19 data that has been gathered and unifi... \n", + "23 Johns Hopkins University data on COVID-19 case... \n", + "24 Lookup table for US state abbreviations \n", + "25 Data on confirmed cases, deaths, and testing. ... \n", + "\n", + " Columns Partitions \n", + "0 paper_id, date, dx_name, test_name, procedure_... \n", + "1 cord_uid, sha, source_x, title, doi, pmcid, pu... \n", + "2 country, alpha-2 code, alpha-3 code, numeric c... \n", + "3 id, id2, county, state, population estimate 2018 \n", + "4 id, label, from, to, score \n", + "5 id, label, first, last, full_name \n", + "6 id, label, entity, concept \n", + "7 id, label, institution, country, settlement \n", + "8 id, label, doi, sha_code, publish_time, source... \n", + "9 id, label, topic, topic_num \n", + "10 date, state, positive, negative, pending, hosp... \n", + "11 date, states, positive, negative, posneg, pend... \n", + "12 positive, negative, posneg, hospitalized, deat... \n", + "13 data_source, signal, geo_type, time_value, geo... \n", + "14 data_source, signal, time_type, geo_type, min_... 
\n", + "15 fips, admin2, province_state, country_region, ... \n", + "16 uid, fips, iso2, iso3, code3, admin2, latitude... \n", + "17 objectid, hospital_name, hospital_type, hq_add... \n", + "18 date, county, state, fips, cases, deaths \n", + "19 date, state, fips, cases, deaths \n", + "20 countyfips, countyname, statename, severity_co... \n", + "21 severity_1-day, severity_2-day, severity_3-day... \n", + "22 country_short_name, country_alpha_3_code, coun... \n", + "23 case_type, cases, difference, date, country_re... \n", + "24 state, abbreviation \n", + "25 iso_code, location, date, total_cases, new_cas... " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wr.catalog.tables(database=\"covid-19\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create data source of QuickSight\n", + "Note: data source stores the connection information." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "wr.quicksight.create_athena_data_source(\n", + " name=\"covid-19\",\n", + " workgroup=\"primary\",\n", + " allowed_to_manage=[\"dev\"]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DatabaseTableDescriptionColumnsPartitions
0covid-19nytimes_countiesData on COVID-19 cases from NY Times at US cou...date, county, state, fips, cases, deaths
1covid-19nytimes_statesData on COVID-19 cases from NY Times at US sta...date, state, fips, cases, deaths
\n", + "
" + ], + "text/plain": [ + " Database Table \\\n", + "0 covid-19 nytimes_counties \n", + "1 covid-19 nytimes_states \n", + "\n", + " Description \\\n", + "0 Data on COVID-19 cases from NY Times at US cou... \n", + "1 Data on COVID-19 cases from NY Times at US sta... \n", + "\n", + " Columns Partitions \n", + "0 date, county, state, fips, cases, deaths \n", + "1 date, state, fips, cases, deaths " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wr.catalog.tables(database=\"covid-19\", name_contains=\"nyt\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datecountystatefipscasesdeaths
02020-01-21SnohomishWashington5306110
12020-01-22SnohomishWashington5306110
22020-01-23SnohomishWashington5306110
32020-01-24CookIllinois1703110
42020-01-24SnohomishWashington5306110
52020-01-25OrangeCalifornia0605910
62020-01-25CookIllinois1703110
72020-01-25SnohomishWashington5306110
82020-01-26MaricopaArizona0401310
92020-01-26Los AngelesCalifornia0603710
\n", + "
" + ], + "text/plain": [ + " date county state fips cases deaths\n", + "0 2020-01-21 Snohomish Washington 53061 1 0\n", + "1 2020-01-22 Snohomish Washington 53061 1 0\n", + "2 2020-01-23 Snohomish Washington 53061 1 0\n", + "3 2020-01-24 Cook Illinois 17031 1 0\n", + "4 2020-01-24 Snohomish Washington 53061 1 0\n", + "5 2020-01-25 Orange California 06059 1 0\n", + "6 2020-01-25 Cook Illinois 17031 1 0\n", + "7 2020-01-25 Snohomish Washington 53061 1 0\n", + "8 2020-01-26 Maricopa Arizona 04013 1 0\n", + "9 2020-01-26 Los Angeles California 06037 1 0" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wr.athena.read_sql_query(\"SELECT * FROM nytimes_counties limit 10\", database=\"covid-19\", ctas_approach=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datecountystatefipsconfirmeddeathspopulationcounty2Hospitalhospital_fipslicensed_bedsstaffed_bedsicu_bedsbed_utilizationpotential_increase_bed_capacity
02020-04-12ParkMontana300677016736Park030067252540.4325480
12020-04-12RavalliMontana300813043172Ravalli030081252550.5677810
22020-04-12Silver BowMontana3009311034993Silver Bow0300939871110.55145727
32020-04-12ClayNebraska31035206214Clay<NA><NA><NA><NA><NA>NaN<NA>
42020-04-12CumingNebraska31039208940Cuming031039252540.2044930
................................................
2276842020-06-11HockleyTexas4821928122980Hockley048219484880.1206050
2276852020-06-11HudspethTexas482291104795Hudspeth<NA><NA><NA><NA><NA>NaN<NA>
2276862020-06-11JonesTexas48253633019817Jones04825345710.71859138
2276872020-06-11La SalleTexas48283407531La Salle<NA><NA><NA><NA><NA>NaN<NA>
2276882020-06-11LimestoneTexas4829336123519Limestone048293786990.1639409
\n", + "

227689 rows × 15 columns

\n", + "
" + ], + "text/plain": [ + " date county state fips confirmed deaths population \\\n", + "0 2020-04-12 Park Montana 30067 7 0 16736 \n", + "1 2020-04-12 Ravalli Montana 30081 3 0 43172 \n", + "2 2020-04-12 Silver Bow Montana 30093 11 0 34993 \n", + "3 2020-04-12 Clay Nebraska 31035 2 0 6214 \n", + "4 2020-04-12 Cuming Nebraska 31039 2 0 8940 \n", + "... ... ... ... ... ... ... ... \n", + "227684 2020-06-11 Hockley Texas 48219 28 1 22980 \n", + "227685 2020-06-11 Hudspeth Texas 48229 11 0 4795 \n", + "227686 2020-06-11 Jones Texas 48253 633 0 19817 \n", + "227687 2020-06-11 La Salle Texas 48283 4 0 7531 \n", + "227688 2020-06-11 Limestone Texas 48293 36 1 23519 \n", + "\n", + " county2 Hospital hospital_fips licensed_beds staffed_beds \\\n", + "0 Park 0 30067 25 25 \n", + "1 Ravalli 0 30081 25 25 \n", + "2 Silver Bow 0 30093 98 71 \n", + "3 Clay \n", + "4 Cuming 0 31039 25 25 \n", + "... ... ... ... ... ... \n", + "227684 Hockley 0 48219 48 48 \n", + "227685 Hudspeth \n", + "227686 Jones 0 48253 45 7 \n", + "227687 La Salle \n", + "227688 Limestone 0 48293 78 69 \n", + "\n", + " icu_beds bed_utilization potential_increase_bed_capacity \n", + "0 4 0.432548 0 \n", + "1 5 0.567781 0 \n", + "2 11 0.551457 27 \n", + "3 NaN \n", + "4 4 0.204493 0 \n", + "... ... ... ... \n", + "227684 8 0.120605 0 \n", + "227685 NaN \n", + "227686 1 0.718591 38 \n", + "227687 NaN \n", + "227688 9 0.163940 9 \n", + "\n", + "[227689 rows x 15 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sql = \"\"\"\n", + "SELECT \n", + " j.*, \n", + " co.Population, \n", + " co.county AS county2, \n", + " hb.* \n", + "FROM \n", + " (\n", + " SELECT \n", + " date, \n", + " county, \n", + " state, \n", + " fips, \n", + " cases as confirmed, \n", + " deaths \n", + " FROM \"covid-19\".nytimes_counties\n", + " ) j \n", + " LEFT OUTER JOIN (\n", + " SELECT \n", + " DISTINCT county, \n", + " state, \n", + " \"population estimate 2018\" AS Population \n", + " FROM \n", + " \"covid-19\".county_populations \n", + " WHERE \n", + " state IN (\n", + " SELECT \n", + " DISTINCT state \n", + " FROM \n", + " \"covid-19\".nytimes_counties\n", + " ) \n", + " AND county IN (\n", + " SELECT \n", + " DISTINCT county as county \n", + " FROM \"covid-19\".nytimes_counties\n", + " )\n", + " ) co ON co.county = j.county \n", + " AND co.state = j.state \n", + " LEFT OUTER JOIN (\n", + " SELECT \n", + " count(objectid) as Hospital, \n", + " fips as hospital_fips, \n", + " sum(num_licensed_beds) as licensed_beds, \n", + " sum(num_staffed_beds) as staffed_beds, \n", + " sum(num_icu_beds) as icu_beds, \n", + " avg(bed_utilization) as bed_utilization, \n", + " sum(\n", + " potential_increase_in_bed_capac\n", + " ) as potential_increase_bed_capacity \n", + " FROM \"covid-19\".hospital_beds \n", + " WHERE \n", + " fips in (\n", + " SELECT \n", + " DISTINCT fips \n", + " FROM \n", + " \"covid-19\".nytimes_counties\n", + " ) \n", + " GROUP BY \n", + " 2\n", + " ) hb ON hb.hospital_fips = j.fips\n", + "\"\"\"\n", + "\n", + "wr.athena.read_sql_query(sql, database=\"covid-19\", ctas_approach=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create Dataset with custom SQL option" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "wr.quicksight.create_athena_dataset(\n", + " name=\"covid19-nytimes-usa\",\n", + " sql=sql,\n", + " sql_name='CustomSQL',\n", + " data_source_name=\"covid-19\",\n", + " 
import_mode='SPICE',\n", + " allowed_to_manage=[\"dev\"]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "ingestion_id = wr.quicksight.create_ingestion(\"covid19-nytimes-usa\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Wait ingestion" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "while wr.quicksight.describe_ingestion(ingestion_id=ingestion_id, dataset_name=\"covid19-nytimes-usa\")[\"IngestionStatus\"] not in [\"COMPLETED\", \"FAILED\"]:\n", + " sleep(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Describe last ingestion" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'RowsIngested': 227689, 'RowsDropped': 0}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wr.quicksight.describe_ingestion(ingestion_id=ingestion_id, dataset_name=\"covid19-nytimes-usa\")[\"RowInfo\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "List all ingestions" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'time': datetime.datetime(2020, 6, 12, 15, 13, 46, 996000, tzinfo=tzlocal()),\n", + " 'source': 'MANUAL'},\n", + " {'time': datetime.datetime(2020, 6, 12, 15, 13, 42, 344000, tzinfo=tzlocal()),\n", + " 'source': 'MANUAL'}]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[{\"time\": user[\"CreatedTime\"], \"source\": user[\"RequestSource\"]} for user in wr.quicksight.list_ingestions(\"covid19-nytimes-usa\")]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create new dataset from a table directly" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "wr.quicksight.create_athena_dataset(\n", + " name=\"covid-19-tableau_jhu\",\n", + " table=\"tableau_jhu\",\n", + " data_source_name=\"covid-19\",\n", + " database=\"covid-19\",\n", + " import_mode='DIRECT_QUERY',\n", + " rename_columns={\n", + " \"cases\": \"Count_of_Cases\", \n", + " \"combined_key\": \"County\"\n", + " },\n", + " cast_columns_types={\n", + " \"Count_of_Cases\": \"INTEGER\"\n", + " },\n", + " allowed_to_manage=[\"dev\"]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Cleaning up" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "wr.quicksight.delete_data_source(\"covid-19\")\n", + "wr.quicksight.delete_dataset(\"covid19-nytimes-usa\")\n", + "wr.quicksight.delete_dataset(\"covid-19-tableau_jhu\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "awswrangler", + "language": "python", + "name": "awswrangler" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.10" + }, + "pycharm": { + "stem_cell": { + "cell_type": "raw", + "metadata": { + "collapsed": false + }, + "source": [] + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}
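
A note on the `wr.quicksight` flow introduced by the 018 tutorial above: the module is exercised as create data source → create dataset → trigger and poll an ingestion → delete. The sketch below condenses that flow outside the notebook. It is only a sketch: the data source name `my-athena-source`, dataset name `my-dataset`, Glue database `my_db`, table `my_table`, and user `dev` are placeholder assumptions, while every `wr.quicksight` call and parameter comes from the tutorial itself.

```python
# Minimal sketch of the QuickSight flow from tutorial 018, assuming an existing
# Athena/Glue table "my_db"."my_table" and a QuickSight user named "dev".
from time import sleep

import awswrangler as wr

# Data source: stores the Athena connection information inside QuickSight.
wr.quicksight.create_athena_data_source(
    name="my-athena-source",
    workgroup="primary",
    allowed_to_manage=["dev"],
)

# Dataset pointing directly at the Glue Catalog table.
wr.quicksight.create_athena_dataset(
    name="my-dataset",
    database="my_db",
    table="my_table",
    data_source_name="my-athena-source",
    import_mode="SPICE",
    allowed_to_manage=["dev"],
)

# Trigger a SPICE ingestion and poll until it finishes.
ingestion_id = wr.quicksight.create_ingestion("my-dataset")
while wr.quicksight.describe_ingestion(
    ingestion_id=ingestion_id, dataset_name="my-dataset"
)["IngestionStatus"] not in ("COMPLETED", "FAILED"):
    sleep(1)

# Clean up the QuickSight resources created above.
wr.quicksight.delete_dataset("my-dataset")
wr.quicksight.delete_data_source("my-athena-source")
```

Using `SPICE` import mode for the table-backed dataset is an assumption made so that `create_ingestion` has something to ingest; the tutorial itself uses `SPICE` for the custom-SQL dataset and `DIRECT_QUERY` for the table-backed one.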
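The updated tutorials 005 and 006 now provision and tear down the `awswrangler_test` database themselves instead of assuming it exists. Below is a standalone sketch of that idempotent pattern, assuming the same database name; `wr.catalog.databases`, `create_database`, `delete_table_if_exists`, and `delete_database` appear in the diff, while `wr.catalog.get_tables` is assumed to be the iterator behind the tutorials' cleanup loop.

```python
# Idempotent create/use/drop pattern for a Glue database, as the updated
# tutorials do for "awswrangler_test".
import awswrangler as wr

# wr.catalog.databases() returns a pandas DataFrame with Database/Description columns.
databases = wr.catalog.databases()

if "awswrangler_test" not in databases.values:
    wr.catalog.create_database("awswrangler_test")
else:
    print("Database awswrangler_test already exists")

# ... create tables, run queries ...

# Drop every table first, then the database itself.
for table in wr.catalog.get_tables(database="awswrangler_test"):
    wr.catalog.delete_table_if_exists(database="awswrangler_test", table=table["Name"])
wr.catalog.delete_database("awswrangler_test")
```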
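`test_redshift_copy_unload_long_string` in `test_db.py` pins down the behaviour for strings longer than Redshift's default VARCHAR width: the target length is passed explicitly through `varchar_lengths`. The condensed round trip below mirrors that test; the Glue connection name, S3 staging path, table name, and IAM role ARN are placeholders to replace with your own.

```python
# Sketch of a Redshift COPY/UNLOAD round trip with a 300-character column,
# mirroring test_redshift_copy_unload_long_string. Connection name, S3 path,
# table name, and IAM role below are placeholders.
import random
import string

import pandas as pd
import awswrangler as wr

df = pd.DataFrame({
    "id": [1, 2],
    "col_str": [
        "".join(random.choice(string.ascii_letters) for _ in range(300))
        for _ in range(2)
    ],
})

engine = wr.catalog.get_engine(connection="aws-data-wrangler-redshift")

# col_str holds 300-character values, longer than the library's default
# VARCHAR width, so the column length is set explicitly.
wr.db.copy_to_redshift(
    df=df,
    path="s3://my-bucket/stage/",
    con=engine,
    schema="public",
    table="long_string_demo",
    mode="overwrite",
    varchar_lengths={"col_str": 300},
    iam_role="arn:aws:iam::123456789012:role/my-redshift-role",
)

# Read it back through UNLOAD, dropping the staged files afterwards.
df2 = wr.db.unload_redshift(
    sql="SELECT * FROM public.long_string_demo",
    con=engine,
    iam_role="arn:aws:iam::123456789012:role/my-redshift-role",
    path="s3://my-bucket/stage/",
    keep_files=False,
)
assert df2.shape == (2, 2)
```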