diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 2d971513a..0ecda8668 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -13,4 +13,4 @@ A clear and concise description of what the bug is. **To Reproduce** Steps to reproduce the behavior. Also add details about Python and Wrangler's version and how the library was installed. -*P.S. Don't attach file. Please, prefer add code snippets directly in the message body.* +*P.S. Don't attach files. Please prefer adding code snippets directly in the message body.* diff --git a/.github/ISSUE_TEMPLATE/enhancement-request.md b/.github/ISSUE_TEMPLATE/enhancement-request.md index 5cd92f408..483ab790a 100644 --- a/.github/ISSUE_TEMPLATE/enhancement-request.md +++ b/.github/ISSUE_TEMPLATE/enhancement-request.md @@ -13,4 +13,4 @@ A clear and concise description of what the problem is. Ex. I'm always frustrate **Describe the solution you'd like** A clear and concise description of what you want to happen. -*P.S. Don't attach file. Please, prefer add code snippets directly in the message body.* +*P.S. Don't attach files. Please prefer adding code snippets directly in the message body.* diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index 9f013d329..54e5a4d90 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -13,4 +13,4 @@ A clear and concise description of what the problem is. Ex. I'm always frustrate **Describe the solution you'd like** A clear and concise description of what you want to happen. -*P.S. Don't attach file. Please, prefer add code snippets directly in the message body.* +*P.S. Don't attach files. Please prefer adding code snippets directly in the message body.* diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md index aa32131b1..051f953b2 100644 --- a/.github/ISSUE_TEMPLATE/question.md +++ b/.github/ISSUE_TEMPLATE/question.md @@ -7,4 +7,4 @@ assignees: '' --- -*P.S. Don't attach file. Please, prefer add code snippets directly in the message body.* +*P.S. Don't attach files. Please prefer adding code snippets directly in the message body.* diff --git a/.github/workflows/static-checking.yml b/.github/workflows/static-checking.yml index 5dd6f5d21..814ac0091 100644 --- a/.github/workflows/static-checking.yml +++ b/.github/workflows/static-checking.yml @@ -41,4 +41,4 @@ jobs: - name: Black style run: black --check --line-length 120 --target-version py36 awswrangler tests - name: Imports order check (isort) - run: isort -rc --check-only awswrangler tests + run: isort --check-only awswrangler tests diff --git a/.gitignore b/.gitignore index b9ab4fb78..ffdd7d184 100644 --- a/.gitignore +++ b/.gitignore @@ -128,7 +128,7 @@ dmypy.json output/ # Development -dev/ +/dev/ metrics/ python/ diff --git a/.pylintrc b/.pylintrc index f92ace361..daa1c3241 100644 --- a/.pylintrc +++ b/.pylintrc @@ -332,7 +332,7 @@ indent-string=' ' max-line-length=120 # Maximum number of lines in a module. -max-module-lines=1250 +max-module-lines=1500 # List of optional constructs for which whitespace checking is disabled. `dict- # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}.
diff --git a/README.md b/README.md index ddcc771de..a171f445e 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ > An [AWS Professional Service](https://aws.amazon.com/professional-services/) open source initiative | aws-proserve-opensource@amazon.com -[![Release](https://img.shields.io/badge/release-1.8.1-brightgreen.svg)](https://pypi.org/project/awswrangler/) +[![Release](https://img.shields.io/badge/release-1.9.0-brightgreen.svg)](https://pypi.org/project/awswrangler/) [![Python Version](https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8-brightgreen.svg)](https://anaconda.org/conda-forge/awswrangler) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) @@ -15,10 +15,12 @@ ![Static Checking](https://github.com/awslabs/aws-data-wrangler/workflows/Static%20Checking/badge.svg?branch=master) [![Documentation Status](https://readthedocs.org/projects/aws-data-wrangler/badge/?version=latest)](https://aws-data-wrangler.readthedocs.io/?badge=latest) -| Source | Downloads | Page | Installation Command | -|-----------|---------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------|--------------------------------------------| -| **PyPi** | [![PyPI Downloads](https://pepy.tech/badge/awswrangler)](https://pypi.org/project/awswrangler/) | [Link](https://pypi.org/project/awswrangler/) | `pip install awswrangler` | -| **Conda** | [![Conda Downloads](https://img.shields.io/conda/dn/conda-forge/awswrangler.svg)](https://anaconda.org/conda-forge/awswrangler) | [Link](https://anaconda.org/conda-forge/awswrangler) | `conda install -c conda-forge awswrangler` | +| Source | Downloads | Installation Command | +|--------|-----------|----------------------| +| **[PyPi](https://pypi.org/project/awswrangler/)** | [![PyPI Downloads](https://pepy.tech/badge/awswrangler)](https://pypi.org/project/awswrangler/) | `pip install awswrangler` | +| **[Conda](https://anaconda.org/conda-forge/awswrangler)** | [![Conda Downloads](https://img.shields.io/conda/dn/conda-forge/awswrangler.svg)](https://anaconda.org/conda-forge/awswrangler) | `conda install -c conda-forge awswrangler` | + +Powered By [](https://arrow.apache.org/powered_by/) ## Table of contents @@ -121,10 +123,13 @@ Knowing which companies are using this library is important to help prioritize t Please send a PR with your company name and @githubhandle if you may. -1. [Digio](https://www.digio.com.br/) [[@afonsomy](https://github.com/afonsomy)] -2. [Pier](https://www.pier.digital/) [[@flaviomax](https://github.com/flaviomax)] -3. [M4U](https://www.m4u.com.br/) [[@Thiago-Dantas](https://github.com/Thiago-Dantas)] -4. [Serasa Experian](https://www.serasaexperian.com.br/) [[@andre-marcos-perez](https://github.com/andre-marcos-perez)] -5. [LINE TV](https://www.linetv.tw/) [[@bryanyang0528](https://github.com/bryanyang0528)] -6. [OKRA Technologies](https://okra.ai) [[@JPFrancoia](https://github.com/JPFrancoia), [@schot](https://github.com/schot)] -7. 
[DNX](https://www.dnx.solutions/) [[@DNXLabs](https://github.com/DNXLabs)] +* [Amazon](https://www.amazon.com/) +* [AWS](https://aws.amazon.com/) +* [Cepsa](https://cepsa.com) [[@alvaropc](https://github.com/alvaropc)] +* [Digio](https://www.digio.com.br/) [[@afonsomy](https://github.com/afonsomy)] +* [DNX](https://www.dnx.solutions/) [[@DNXLabs](https://github.com/DNXLabs)] +* [LINE TV](https://www.linetv.tw/) [[@bryanyang0528](https://github.com/bryanyang0528)] +* [M4U](https://www.m4u.com.br/) [[@Thiago-Dantas](https://github.com/Thiago-Dantas)] +* [OKRA Technologies](https://okra.ai) [[@JPFrancoia](https://github.com/JPFrancoia), [@schot](https://github.com/schot)] +* [Pier](https://www.pier.digital/) [[@flaviomax](https://github.com/flaviomax)] +* [Serasa Experian](https://www.serasaexperian.com.br/) [[@andre-marcos-perez](https://github.com/andre-marcos-perez)] \ No newline at end of file diff --git a/THIRD_PARTY.txt b/THIRD_PARTY.txt index 978276f04..1108c912a 100644 --- a/THIRD_PARTY.txt +++ b/THIRD_PARTY.txt @@ -296,9 +296,6 @@ Copyright 2013-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. ** pandas; version 1.1.0 -- https://pandas.pydata.org/ Copyright (c) 2008-2012, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team All rights reserved. -** s3fs; version 4.2.0 -- https://s3fs.readthedocs.io/en/latest/ -Copyright (c) 2016, Continuum Analytics, Inc. and contributors -All rights reserved. ** numpy; version 1.19.1 -- https://numpy.org/ Copyright (c) 2005-2020, NumPy Developers. All rights reserved. diff --git a/awswrangler/__metadata__.py b/awswrangler/__metadata__.py index b92119688..b4b469757 100644 --- a/awswrangler/__metadata__.py +++ b/awswrangler/__metadata__.py @@ -7,5 +7,5 @@ __title__: str = "awswrangler" __description__: str = "Pandas on AWS." -__version__: str = "1.8.1" +__version__: str = "1.9.0" __license__: str = "Apache License 2.0" diff --git a/awswrangler/_config.py b/awswrangler/_config.py index 7ebaa7ae6..eb5bb1506 100644 --- a/awswrangler/_config.py +++ b/awswrangler/_config.py @@ -5,7 +5,7 @@ import os from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple, Type, Union, cast -import pandas as pd # type: ignore +import pandas as pd from awswrangler import _utils, exceptions @@ -29,7 +29,7 @@ class _ConfigArg(NamedTuple): "database": _ConfigArg(dtype=str, nullable=True), "max_cache_query_inspections": _ConfigArg(dtype=int, nullable=False), "max_cache_seconds": _ConfigArg(dtype=int, nullable=False), - "s3fs_block_size": _ConfigArg(dtype=int, nullable=False, enforced=True), + "s3_block_size": _ConfigArg(dtype=int, nullable=False, enforced=True), } @@ -138,8 +138,8 @@ def _apply_type(name: str, value: Any, dtype: Type[Union[str, bool, int]], nulla exceptions.InvalidArgumentValue(f"{name} configuration does not accept a null value. 
Please pass {dtype}.") try: return dtype(value) if isinstance(value, dtype) is False else value - except ValueError: - raise exceptions.InvalidConfiguration(f"Config {name} must receive a {dtype} value.") + except ValueError as ex: + raise exceptions.InvalidConfiguration(f"Config {name} must receive a {dtype} value.") from ex @staticmethod def _is_null(value: _ConfigValueType) -> bool: @@ -206,13 +206,13 @@ def max_cache_seconds(self, value: int) -> None: self._set_config_value(key="max_cache_seconds", value=value) @property - def s3fs_block_size(self) -> int: - """Property s3fs_block_size.""" - return cast(int, self["s3fs_block_size"]) + def s3_block_size(self) -> int: + """Property s3_block_size.""" + return cast(int, self["s3_block_size"]) - @s3fs_block_size.setter - def s3fs_block_size(self, value: int) -> None: - self._set_config_value(key="s3fs_block_size", value=value) + @s3_block_size.setter + def s3_block_size(self, value: int) -> None: + self._set_config_value(key="s3_block_size", value=value) def _inject_config_doc(doc: Optional[str], available_configs: Tuple[str, ...]) -> str: diff --git a/awswrangler/_data_types.py b/awswrangler/_data_types.py index fa36292f0..84e750ccd 100644 --- a/awswrangler/_data_types.py +++ b/awswrangler/_data_types.py @@ -1,18 +1,20 @@ """Internal (private) Data Types Module.""" +import datetime import logging import re from decimal import Decimal from typing import Any, Dict, List, Match, Optional, Sequence, Tuple -import pandas as pd # type: ignore -import pyarrow as pa # type: ignore -import pyarrow.parquet # type: ignore -import sqlalchemy # type: ignore -import sqlalchemy.dialects.mysql # type: ignore -import sqlalchemy.dialects.postgresql # type: ignore -import sqlalchemy_redshift.dialect # type: ignore -from sqlalchemy.sql.visitors import VisitableType # type: ignore +import numpy as np +import pandas as pd +import pyarrow as pa +import pyarrow.parquet +import sqlalchemy +import sqlalchemy.dialects.mysql +import sqlalchemy.dialects.postgresql +import sqlalchemy_redshift.dialect +from sqlalchemy.sql.visitors import VisitableType from awswrangler import _utils, exceptions @@ -444,11 +446,21 @@ def _normalize_pandas_dtype_name(dtype: str) -> str: return dtype +def _cast2date(value: Any) -> Any: + if isinstance(value, float) and (np.isnan(value) or np.isinf(value)): + return None + if pd.isna(value) or value is None: + return None + if isinstance(value, datetime.date): + return value + return pd.to_datetime(value).date() + + def _cast_pandas_column(df: pd.DataFrame, col: str, current_type: str, desired_type: str) -> pd.DataFrame: if desired_type == "datetime64": df[col] = pd.to_datetime(df[col]) elif desired_type == "date": - df[col] = pd.to_datetime(df[col]).dt.date.replace(to_replace={pd.NaT: None}) + df[col] = df[col].apply(lambda x: _cast2date(value=x)).replace(to_replace={pd.NaT: None}) elif desired_type == "bytes": df[col] = df[col].astype("string").str.encode(encoding="utf-8").replace(to_replace={pd.NA: None}) elif desired_type == "decimal": @@ -456,15 +468,6 @@ def _cast_pandas_column(df: pd.DataFrame, col: str, current_type: str, desired_t df = _cast_pandas_column(df=df, col=col, current_type=current_type, desired_type="string") # Then cast to decimal df[col] = df[col].apply(lambda x: Decimal(str(x)) if str(x) not in ("", "none", "None", " ", "") else None) - elif desired_type == "string": - if current_type.lower().startswith("int") is True: - df[col] = df[col].astype(str).astype("string") - elif current_type.startswith("float") is True: - 
df[col] = df[col].astype(str).astype("string") - elif current_type in ("object", "category"): - df[col] = df[col].astype(str).astype("string") - else: - df[col] = df[col].astype("string") else: try: df[col] = df[col].astype(desired_type) diff --git a/awswrangler/_utils.py b/awswrangler/_utils.py index 8f684e417..caec9dbf9 100644 --- a/awswrangler/_utils.py +++ b/awswrangler/_utils.py @@ -1,22 +1,22 @@ """Internal (private) Utilities Module.""" import copy +import itertools import logging import math import os import random import time -from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union, cast +from concurrent.futures import FIRST_COMPLETED, Future, wait +from typing import Any, Callable, Dict, Generator, List, Optional, Sequence, Tuple, Union, cast -import boto3 # type: ignore -import botocore.config # type: ignore -import numpy as np # type: ignore -import pandas as pd # type: ignore -import psycopg2 # type: ignore -import s3fs # type: ignore +import boto3 +import botocore.config +import numpy as np +import pandas as pd +import psycopg2 from awswrangler import exceptions -from awswrangler._config import apply_configs _logger: logging.Logger = logging.getLogger(__name__) @@ -63,14 +63,20 @@ def boto3_from_primitives(primitives: Optional[Boto3PrimitivesType] = None) -> b def client(service_name: str, session: Optional[boto3.Session] = None) -> boto3.client: """Create a valid boto3.client.""" return ensure_session(session=session).client( - service_name=service_name, use_ssl=True, config=botocore.config.Config(retries={"max_attempts": 15}) + service_name=service_name, + use_ssl=True, + config=botocore.config.Config(retries={"max_attempts": 10}, connect_timeout=10, max_pool_connections=30), ) def resource(service_name: str, session: Optional[boto3.Session] = None) -> boto3.resource: """Create a valid boto3.resource.""" return ensure_session(session=session).resource( - service_name=service_name, use_ssl=True, config=botocore.config.Config(retries={"max_attempts": 15}) + service_name=service_name, + use_ssl=True, + config=botocore.config.Config( + retries={"max_attempts": 10, "mode": "adaptive"}, connect_timeout=10, max_pool_connections=30 + ), ) @@ -172,37 +178,6 @@ def chunkify(lst: List[Any], num_chunks: int = 1, max_length: Optional[int] = No return [arr.tolist() for arr in np_chunks if len(arr) > 0] -@apply_configs -def get_fs( - s3fs_block_size: int, - session: Optional[Union[boto3.Session, Dict[str, Optional[str]]]] = None, - s3_additional_kwargs: Optional[Dict[str, str]] = None, -) -> s3fs.S3FileSystem: - """Build a S3FileSystem from a given boto3 session.""" - fs: s3fs.S3FileSystem = s3fs.S3FileSystem( - anon=False, - use_ssl=True, - default_cache_type="readahead", - default_fill_cache=False, - default_block_size=s3fs_block_size, - config_kwargs={"retries": {"max_attempts": 15}}, - session=ensure_session(session=session)._session, # pylint: disable=protected-access - s3_additional_kwargs=s3_additional_kwargs, - use_listings_cache=False, - skip_instance_cache=True, - ) - fs.invalidate_cache() - fs.clear_instance_cache() - return fs - - -def open_file(fs: s3fs.S3FileSystem, **kwargs: Any) -> Any: - """Open s3fs file with retries to overcome eventual consistency.""" - fs.invalidate_cache() - fs.clear_instance_cache() - return try_it(f=fs.open, ex=FileNotFoundError, **kwargs) - - def empty_generator() -> Generator[None, None, None]: """Empty Generator.""" yield from () @@ -276,7 +251,13 @@ def check_duplicated_columns(df: pd.DataFrame) -> Any: """Raise an 
exception if there are duplicated columns names.""" duplicated: List[str] = df.loc[:, df.columns.duplicated()].columns.to_list() if duplicated: - raise exceptions.InvalidDataFrame(f"There is duplicated column names in your DataFrame: {duplicated}") + raise exceptions.InvalidDataFrame( + f"There are duplicated column names in your DataFrame: {duplicated}. " + f"Note that your columns may have been sanitized and it can be the cause of " + f"the duplicity. Wrangler sanitization removes all special characters and " + f"also converts CamelCase to snake_case. So you must avoid columns like " + f"['MyCol', 'my_col'] in your DataFrame." + ) def try_it(f: Callable[..., Any], ex: Any, base: float = 1.0, max_num_tries: int = 3, **kwargs: Any) -> Any: @@ -294,3 +275,35 @@ def try_it(f: Callable[..., Any], ex: Any, base: float = 1.0, max_num_tries: int delay = random.uniform(base, delay * 3) _logger.error("Retrying %s | Fail number %s/%s | Exception: %s", f, i + 1, max_num_tries, exception) time.sleep(delay) + + +def get_even_chunks_sizes(total_size: int, chunk_size: int, upper_bound: bool) -> Tuple[int, ...]: + """Calculate even chunks sizes (Best effort).""" + round_func: Callable[[float], float] = math.ceil if upper_bound is True else math.floor + num_chunks: int = int(round_func(float(total_size) / float(chunk_size))) + num_chunks = 1 if num_chunks < 1 else num_chunks + base_size: int = int(total_size / num_chunks) + rest: int = total_size % num_chunks + sizes: List[int] = list(itertools.repeat(base_size, num_chunks)) + for i in range(rest): + i_cycled: int = i % len(sizes) + sizes[i_cycled] += 1 + return tuple(sizes) + + +def get_running_futures(seq: Sequence[Future]) -> Tuple[Future, ...]: # type: ignore + """Filter only running futures.""" + return tuple(f for f in seq if f.running()) + + +def wait_any_future_available(seq: Sequence[Future]) -> None: # type: ignore + """Wait until any future became available.""" + wait(fs=seq, timeout=None, return_when=FIRST_COMPLETED) + + +def block_waiting_available_thread(seq: Sequence[Future], max_workers: int) -> None: # type: ignore + """Block until any thread became available.""" + running: Tuple[Future, ...] = get_running_futures(seq=seq) # type: ignore + while len(running) >= max_workers: + wait_any_future_available(seq=running) + running = get_running_futures(seq=running) diff --git a/awswrangler/athena/_read.py b/awswrangler/athena/_read.py index 8a03b3a60..7aa209dd4 100644 --- a/awswrangler/athena/_read.py +++ b/awswrangler/athena/_read.py @@ -7,9 +7,9 @@ import uuid from typing import Any, Dict, Iterator, List, Match, NamedTuple, Optional, Union -import boto3 # type: ignore -import botocore.exceptions # type: ignore -import pandas as pd # type: ignore +import boto3 +import botocore.exceptions +import pandas as pd from awswrangler import _utils, catalog, exceptions, s3 from awswrangler._config import apply_configs @@ -365,6 +365,10 @@ def _resolve_query_without_cache_ctas( ) except botocore.exceptions.ClientError as ex: error: Dict[str, Any] = ex.response["Error"] + if error["Code"] == "InvalidRequestException" and "Exception parsing query" in error["Message"]: + raise exceptions.InvalidCtasApproachQuery( + "Is not possible to wrap this query into a CTAS statement. Please use ctas_approach=False." + ) if error["Code"] == "InvalidRequestException" and "extraneous input" in error["Message"]: raise exceptions.InvalidCtasApproachQuery( "Is not possible to wrap this query into a CTAS statement. Please use ctas_approach=False." 
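A minimal usage sketch (not part of the patch) tying together two changes above: the configuration key renamed to s3_block_size in _config.py and the new Athena error path that recommends ctas_approach=False. The 8 MB block size and the SHOW COLUMNS statement are illustrative assumptions, not values taken from the diff:

>>> import awswrangler as wr
>>> wr.config.s3_block_size = 8_388_608  # hypothetical 8 MB read block; replaces the old s3fs_block_size key
>>> df = wr.athena.read_sql_query(  # statements that cannot be wrapped into a CTAS query need ctas_approach=False
...     sql="SHOW COLUMNS IN my_table",
...     database="my_database",
...     ctas_approach=False,
... )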
diff --git a/awswrangler/athena/_utils.py b/awswrangler/athena/_utils.py index 72d5740e2..31a73c0e1 100644 --- a/awswrangler/athena/_utils.py +++ b/awswrangler/athena/_utils.py @@ -7,8 +7,8 @@ from decimal import Decimal from typing import Any, Dict, Generator, List, NamedTuple, Optional, Union, cast -import boto3 # type: ignore -import pandas as pd # type: ignore +import boto3 +import pandas as pd from awswrangler import _data_types, _utils, exceptions, s3, sts from awswrangler._config import apply_configs diff --git a/awswrangler/catalog/__init__.py b/awswrangler/catalog/__init__.py index 8c3416bd1..11be6645d 100644 --- a/awswrangler/catalog/__init__.py +++ b/awswrangler/catalog/__init__.py @@ -10,7 +10,12 @@ overwrite_table_parameters, upsert_table_parameters, ) -from awswrangler.catalog._delete import delete_database, delete_table_if_exists # noqa +from awswrangler.catalog._delete import ( # noqa + delete_all_partitions, + delete_database, + delete_partitions, + delete_table_if_exists, +) from awswrangler.catalog._get import ( # noqa _get_table_input, databases, diff --git a/awswrangler/catalog/_add.py b/awswrangler/catalog/_add.py index ad7595361..98d4f6c0d 100644 --- a/awswrangler/catalog/_add.py +++ b/awswrangler/catalog/_add.py @@ -3,12 +3,12 @@ import logging from typing import Any, Dict, List, Optional -import boto3 # type: ignore +import boto3 from awswrangler import _utils, exceptions from awswrangler._config import apply_configs from awswrangler.catalog._definitions import _csv_partition_definition, _parquet_partition_definition -from awswrangler.catalog._utils import _catalog_id +from awswrangler.catalog._utils import _catalog_id, sanitize_table_name _logger: logging.Logger = logging.getLogger(__name__) @@ -134,6 +134,7 @@ def add_parquet_partitions( ... ) """ + table = sanitize_table_name(table=table) if partitions_values: inputs: List[Dict[str, Any]] = [ _parquet_partition_definition(location=k, values=v, compression=compression) diff --git a/awswrangler/catalog/_create.py b/awswrangler/catalog/_create.py index c8f05373a..7c595b140 100644 --- a/awswrangler/catalog/_create.py +++ b/awswrangler/catalog/_create.py @@ -1,15 +1,15 @@ """AWS Glue Catalog Module.""" import logging -from typing import Any, Dict, List, Optional +from typing import Any, Dict, Optional -import boto3 # type: ignore +import boto3 from awswrangler import _utils, exceptions from awswrangler._config import apply_configs from awswrangler.catalog._definitions import _csv_table_definition, _parquet_table_definition -from awswrangler.catalog._delete import delete_table_if_exists -from awswrangler.catalog._get import _get_partitions, _get_table_input +from awswrangler.catalog._delete import delete_all_partitions, delete_table_if_exists +from awswrangler.catalog._get import _get_table_input from awswrangler.catalog._utils import _catalog_id, sanitize_column_name, sanitize_table_name _logger: logging.Logger = logging.getLogger(__name__) @@ -118,21 +118,8 @@ def _create_table( # pylint: disable=too-many-branches,too-many-statements f"{mode} is not a valid mode. It must be 'overwrite', 'append' or 'overwrite_partitions'." 
) if table_exist is True and mode == "overwrite": - _logger.debug("Fetching existing partitions...") - partitions_values: List[List[str]] = list( - _get_partitions(database=database, table=table, boto3_session=session, catalog_id=catalog_id).values() - ) - _logger.debug("Number of old partitions: %s", len(partitions_values)) - _logger.debug("Deleting existing partitions...") - client_glue.batch_delete_partition( - **_catalog_id( - catalog_id=catalog_id, - DatabaseName=database, - TableName=table, - PartitionsToDelete=[{"Values": v} for v in partitions_values], - ) - ) - _logger.debug("Updating table...") + delete_all_partitions(table=table, database=database, catalog_id=catalog_id, boto3_session=session) + _logger.debug("Updating table (%s)...", mode) client_glue.update_table( **_catalog_id( catalog_id=catalog_id, DatabaseName=database, TableInput=table_input, SkipArchive=skip_archive @@ -140,6 +127,7 @@ def _create_table( # pylint: disable=too-many-branches,too-many-statements ) elif (table_exist is True) and (mode in ("append", "overwrite_partitions", "update")): if mode == "update": + _logger.debug("Updating table (%s)...", mode) client_glue.update_table( **_catalog_id( catalog_id=catalog_id, DatabaseName=database, TableInput=table_input, SkipArchive=skip_archive @@ -147,17 +135,35 @@ def _create_table( # pylint: disable=too-many-branches,too-many-statements ) elif table_exist is False: try: + _logger.debug("Creating table (%s)...", mode) client_glue.create_table( **_catalog_id(catalog_id=catalog_id, DatabaseName=database, TableInput=table_input) ) - except client_glue.exceptions.AlreadyExistsException as ex: + except client_glue.exceptions.AlreadyExistsException: if mode == "overwrite": - delete_table_if_exists(database=database, table=table, boto3_session=session, catalog_id=catalog_id) - client_glue.create_table( - **_catalog_id(catalog_id=catalog_id, DatabaseName=database, TableInput=table_input) + _utils.try_it( + f=_overwrite_table, + ex=client_glue.exceptions.AlreadyExistsException, + client_glue=client_glue, + catalog_id=catalog_id, + database=database, + table=table, + table_input=table_input, + boto3_session=boto3_session, ) - else: - raise ex + _logger.debug("Leaving table as is (%s)...", mode) + + +def _overwrite_table( + client_glue: boto3.client, + catalog_id: Optional[str], + database: str, + table: str, + table_input: Dict[str, Any], + boto3_session: boto3.Session, +) -> None: + delete_table_if_exists(database=database, table=table, boto3_session=boto3_session, catalog_id=catalog_id) + client_glue.create_table(**_catalog_id(catalog_id=catalog_id, DatabaseName=database, TableInput=table_input)) def _upsert_table_parameters( @@ -231,19 +237,16 @@ def _create_parquet_table( table_input: Dict[str, Any] if (catalog_table_input is not None) and (mode in ("append", "overwrite_partitions")): table_input = catalog_table_input - updated: bool = False catalog_cols: Dict[str, str] = {x["Name"]: x["Type"] for x in table_input["StorageDescriptor"]["Columns"]} for c, t in columns_types.items(): if c not in catalog_cols: _logger.debug("New column %s with type %s.", c, t) table_input["StorageDescriptor"]["Columns"].append({"Name": c, "Type": t}) - updated = True + mode = "update" elif t != catalog_cols[c]: # Data type change detected! raise exceptions.InvalidArgumentValue( - f"Data type change detected on column {c}. Old type: {catalog_cols[c]}. New type {t}." + f"Data type change detected on column {c} (Old type: {catalog_cols[c]} / New type {t})." 
) - if updated is True: - mode = "update" else: table_input = _parquet_table_definition( table=table, diff --git a/awswrangler/catalog/_delete.py b/awswrangler/catalog/_delete.py index 000e16272..b67c8f390 100644 --- a/awswrangler/catalog/_delete.py +++ b/awswrangler/catalog/_delete.py @@ -1,12 +1,13 @@ """AWS Glue Catalog Delete Module.""" import logging -from typing import Optional +from typing import List, Optional -import boto3 # type: ignore +import boto3 from awswrangler import _utils from awswrangler._config import apply_configs +from awswrangler.catalog._get import _get_partitions from awswrangler.catalog._utils import _catalog_id _logger: logging.Logger = logging.getLogger(__name__) @@ -39,11 +40,7 @@ def delete_database(name: str, catalog_id: Optional[str] = None, boto3_session: ... ) """ client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session) - - if catalog_id is not None: - client_glue.delete_database(CatalogId=catalog_id, Name=name) - else: - client_glue.delete_database(Name=name) + client_glue.delete_database(**_catalog_id(Name=name, catalog_id=catalog_id)) @apply_configs @@ -84,3 +81,103 @@ def delete_table_if_exists( return True except client_glue.exceptions.EntityNotFoundException: return False + + +@apply_configs +def delete_partitions( + table: str, + database: str, + partitions_values: List[List[str]], + catalog_id: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, +) -> None: + """Delete specified partitions in an AWS Glue Catalog table. + + Parameters + ---------- + table : str + Table name. + database : str + Database name. + catalog_id : str, optional + The ID of the Data Catalog from which to retrieve Databases. + If none is provided, the AWS account ID is used by default. + partitions_values : List[List[str]] + List of lists of partitions values as strings. + (e.g. [['2020', '10', '25'], ['2020', '11', '16'], ['2020', '12', '19']]). + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + None + None. + + Examples + -------- + >>> import awswrangler as wr + >>> wr.catalog.delete_partitions( + ... table='my_table', + ... database='awswrangler_test', + ... partitions_values=[['2020', '10', '25'], ['2020', '11', '16'], ['2020', '12', '19']] + ... ) + """ + client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session) + chunks: List[List[List[str]]] = _utils.chunkify(lst=partitions_values, max_length=25) + for chunk in chunks: + client_glue.batch_delete_partition( + **_catalog_id( + catalog_id=catalog_id, + DatabaseName=database, + TableName=table, + PartitionsToDelete=[{"Values": v} for v in chunk], + ) + ) + + +@apply_configs +def delete_all_partitions( + table: str, database: str, catalog_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None +) -> List[List[str]]: + """Delete all partitions in an AWS Glue Catalog table. + + Parameters + ---------- + table : str + Table name. + database : str + Database name. + catalog_id : str, optional + The ID of the Data Catalog from which to retrieve Databases. + If none is provided, the AWS account ID is used by default. + boto3_session : boto3.Session(), optional + Boto3 Session. The default boto3 session will be used if boto3_session receive None. + + Returns + ------- + List[List[str]] + Partitions values. + + Examples + -------- + >>> import awswrangler as wr + >>> partitions = wr.catalog.delete_all_partitions( + ... table='my_table', + ...
database='awswrangler_test', + ... ) + """ + session: boto3.Session = _utils.ensure_session(session=boto3_session) + _logger.debug("Fetching existing partitions...") + partitions_values: List[List[str]] = list( + _get_partitions(database=database, table=table, boto3_session=session, catalog_id=catalog_id).values() + ) + _logger.debug("Number of old partitions: %s", len(partitions_values)) + _logger.debug("Deleting existing partitions...") + delete_partitions( + table=table, + database=database, + catalog_id=catalog_id, + partitions_values=partitions_values, + boto3_session=boto3_session, + ) + return partitions_values diff --git a/awswrangler/catalog/_get.py b/awswrangler/catalog/_get.py index 8b3ff1635..f5ece8619 100644 --- a/awswrangler/catalog/_get.py +++ b/awswrangler/catalog/_get.py @@ -6,9 +6,9 @@ from typing import Any, Dict, Iterator, List, Optional, Union, cast from urllib.parse import quote_plus as _quote_plus -import boto3 # type: ignore -import pandas as pd # type: ignore -import sqlalchemy # type: ignore +import boto3 +import pandas as pd +import sqlalchemy from awswrangler import _utils, exceptions from awswrangler._config import apply_configs @@ -119,7 +119,7 @@ def get_table_types( Examples -------- >>> import awswrangler as wr - >>> wr.catalog.get_table_types(database='default', name='my_table') + >>> wr.catalog.get_table_types(database='default', table='my_table') {'col0': 'int', 'col1': double} """ @@ -479,8 +479,8 @@ def get_table_location(database: str, table: str, boto3_session: Optional[boto3. res: Dict[str, Any] = client_glue.get_table(DatabaseName=database, Name=table) try: return cast(str, res["Table"]["StorageDescriptor"]["Location"]) - except KeyError: - raise exceptions.InvalidTable(f"{database}.{table}") + except KeyError as ex: + raise exceptions.InvalidTable(f"{database}.{table}") from ex def get_connection( diff --git a/awswrangler/catalog/_utils.py b/awswrangler/catalog/_utils.py index 2ade681cf..348244c1e 100644 --- a/awswrangler/catalog/_utils.py +++ b/awswrangler/catalog/_utils.py @@ -4,8 +4,8 @@ import unicodedata from typing import Any, Dict, List, Optional, Tuple -import boto3 # type: ignore -import pandas as pd # type: ignore +import boto3 +import pandas as pd from awswrangler import _data_types, _utils, exceptions from awswrangler._config import apply_configs diff --git a/awswrangler/cloudwatch.py b/awswrangler/cloudwatch.py index 9b2a490b9..7da49b0a9 100644 --- a/awswrangler/cloudwatch.py +++ b/awswrangler/cloudwatch.py @@ -5,8 +5,8 @@ import time from typing import Any, Dict, List, Optional, cast -import boto3 # type: ignore -import pandas as pd # type: ignore +import boto3 +import pandas as pd from awswrangler import _utils, exceptions diff --git a/awswrangler/db.py b/awswrangler/db.py index 78b30d590..097d893ad 100644 --- a/awswrangler/db.py +++ b/awswrangler/db.py @@ -6,11 +6,11 @@ from typing import Any, Dict, Iterator, List, Optional, Tuple, Union from urllib.parse import quote_plus as _quote_plus -import boto3 # type: ignore -import pandas as pd # type: ignore -import pyarrow as pa # type: ignore -import sqlalchemy # type: ignore -from sqlalchemy.sql.visitors import VisitableType # type: ignore +import boto3 +import pandas as pd +import pyarrow as pa +import sqlalchemy +from sqlalchemy.sql.visitors import VisitableType from awswrangler import _data_types, _utils, exceptions, s3 from awswrangler.s3._list import _path2list # noqa @@ -274,7 +274,9 @@ def to_sql(df: pd.DataFrame, con: sqlalchemy.engine.Engine, **pandas_kwargs: Any SQLAlchemy 
Engine. Please use, wr.db.get_engine(), wr.db.get_redshift_temp_engine() or wr.catalog.get_engine() pandas_kwargs - keyword arguments forwarded to pandas.DataFrame.to_csv() + KEYWORD arguments forwarded to pandas.DataFrame.to_sql(). You can NOT pass `pandas_kwargs` explicitly, just add + valid Pandas arguments in the function call and Wrangler will accept it. + e.g. wr.db.to_sql(df, con=con, name="table_name", schema="schema_name", if_exists="replace", index=False) https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_sql.html Returns ------- @@ -295,6 +297,19 @@ def to_sql(df: pd.DataFrame, con: sqlalchemy.engine.Engine, **pandas_kwargs: Any ... schema="schema_name" ... ) + Writing to Redshift with temporary credentials and using pandas_kwargs + + >>> import awswrangler as wr + >>> import pandas as pd + >>> wr.db.to_sql( + ... df=pd.DataFrame({'col': [1, 2, 3]}), + ... con=wr.db.get_redshift_temp_engine(cluster_identifier="...", user="..."), + ... name="table_name", + ... schema="schema_name", + ... if_exists="replace", + ... index=False, + ... ) + Writing to Redshift from Glue Catalog Connections >>> import awswrangler as wr @@ -307,6 +322,12 @@ def to_sql(df: pd.DataFrame, con: sqlalchemy.engine.Engine, **pandas_kwargs: Any ... ) """ + if "pandas_kwargs" in pandas_kwargs: + raise exceptions.InvalidArgument( + "You can NOT pass `pandas_kwargs` explicitly, just add valid " + "Pandas arguments in the function call and Wrangler will accept it. " + "e.g. wr.db.to_sql(df, con, name='...', schema='...', if_exists='replace')" + ) if df.empty is True: raise exceptions.EmptyDataFrame() if not isinstance(con, sqlalchemy.engine.Engine): @@ -698,8 +719,9 @@ def copy_to_redshift( # pylint: disable=too-many-arguments boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. s3_additional_kwargs: - Forward to s3fs, useful for server side encryption - https://s3fs.readthedocs.io/en/latest/#serverside-encryption + Forward to botocore requests. Valid parameters: "ACL", "Metadata", "ServerSideEncryption", "StorageClass", + "SSECustomerAlgorithm", "SSECustomerKey", "SSEKMSKeyId", "SSEKMSEncryptionContext", "Tagging". + e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'} max_rows_by_file : int Max number of rows in each file. Default is None i.e. dont split the files. @@ -847,7 +869,9 @@ def copy_files_to_redshift( # pylint: disable=too-many-locals,too-many-argument boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. s3_additional_kwargs: - Forward to boto3.client('s3').put_object when writing manifest, useful for server side encryption + Forward to botocore requests. Valid parameters: "ACL", "Metadata", "ServerSideEncryption", "StorageClass", + "SSECustomerAlgorithm", "SSECustomerKey", "SSEKMSKeyId", "SSEKMSEncryptionContext", "Tagging". + e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'} Returns ------- @@ -941,7 +965,9 @@ def write_redshift_copy_manifest( boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. s3_additional_kwargs: - Forward to boto3.client('s3').put_object when writing manifest, useful for server side encryption + Forward to botocore requests.
Valid parameters: "ACL", "Metadata", "ServerSideEncryption", "StorageClass", + "SSECustomerAlgorithm", "SSECustomerKey", "SSEKMSKeyId", "SSEKMSEncryptionContext", "Tagging". + e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'} Returns ------- @@ -1073,8 +1099,7 @@ def unload_redshift( boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. s3_additional_kwargs: - Forward to s3fs, useful for server side encryption - https://s3fs.readthedocs.io/en/latest/#serverside-encryption + Forward to botocore requests, only "SSECustomerAlgorithm" and "SSECustomerKey" arguments will be considered. Returns ------- diff --git a/awswrangler/emr.py b/awswrangler/emr.py index 914c1e859..1e730649b 100644 --- a/awswrangler/emr.py +++ b/awswrangler/emr.py @@ -5,7 +5,7 @@ import pprint from typing import Any, Dict, List, Optional, Union, cast -import boto3 # type: ignore +import boto3 from awswrangler import _utils, exceptions, sts diff --git a/awswrangler/quicksight/_cancel.py b/awswrangler/quicksight/_cancel.py index 7b9d0c79f..db3028c91 100644 --- a/awswrangler/quicksight/_cancel.py +++ b/awswrangler/quicksight/_cancel.py @@ -3,7 +3,7 @@ import logging from typing import Optional -import boto3 # type: ignore +import boto3 from awswrangler import _utils, exceptions, sts from awswrangler.quicksight._get_list import get_dataset_id diff --git a/awswrangler/quicksight/_create.py b/awswrangler/quicksight/_create.py index ebd6ed7b4..c85b2a6d4 100644 --- a/awswrangler/quicksight/_create.py +++ b/awswrangler/quicksight/_create.py @@ -4,7 +4,7 @@ import uuid from typing import Any, Dict, List, Optional, Union, cast -import boto3 # type: ignore +import boto3 from awswrangler import _utils, exceptions, sts from awswrangler.quicksight._get_list import get_data_source_arn, get_dataset_id diff --git a/awswrangler/quicksight/_delete.py b/awswrangler/quicksight/_delete.py index d865cdb48..848cc283c 100644 --- a/awswrangler/quicksight/_delete.py +++ b/awswrangler/quicksight/_delete.py @@ -3,7 +3,7 @@ import logging from typing import Any, Callable, Dict, Optional -import boto3 # type: ignore +import boto3 from awswrangler import _utils, exceptions, sts from awswrangler.quicksight._get_list import ( diff --git a/awswrangler/quicksight/_describe.py b/awswrangler/quicksight/_describe.py index 0cb4eb51e..3165df81c 100644 --- a/awswrangler/quicksight/_describe.py +++ b/awswrangler/quicksight/_describe.py @@ -3,7 +3,7 @@ import logging from typing import Any, Dict, Optional, cast -import boto3 # type: ignore +import boto3 from awswrangler import _utils, exceptions, sts from awswrangler.quicksight._get_list import get_dashboard_id, get_data_source_id, get_dataset_id diff --git a/awswrangler/quicksight/_get_list.py b/awswrangler/quicksight/_get_list.py index 71a8bf8ab..a11339e98 100644 --- a/awswrangler/quicksight/_get_list.py +++ b/awswrangler/quicksight/_get_list.py @@ -7,7 +7,7 @@ import logging from typing import Any, Callable, Dict, List, Optional -import boto3 # type: ignore +import boto3 from awswrangler import _utils, exceptions, sts diff --git a/awswrangler/quicksight/_utils.py b/awswrangler/quicksight/_utils.py index 957cf9f53..f376bb889 100644 --- a/awswrangler/quicksight/_utils.py +++ b/awswrangler/quicksight/_utils.py @@ -3,7 +3,7 @@ import logging from typing import Any, Dict, List, Optional -import boto3 # type: ignore +import boto3 from awswrangler import _data_types, athena, catalog, exceptions from
awswrangler.quicksight._get_list import list_data_sources diff --git a/awswrangler/s3/_copy.py b/awswrangler/s3/_copy.py index e0bd82209..4235e9882 100644 --- a/awswrangler/s3/_copy.py +++ b/awswrangler/s3/_copy.py @@ -3,8 +3,8 @@ import logging from typing import Dict, List, Optional, Tuple -import boto3 # type: ignore -from boto3.s3.transfer import TransferConfig # type: ignore +import boto3 +from boto3.s3.transfer import TransferConfig from awswrangler import _utils, exceptions from awswrangler.s3._delete import delete_objects diff --git a/awswrangler/s3/_delete.py b/awswrangler/s3/_delete.py index 9c89ee2bb..8cce7d7a6 100644 --- a/awswrangler/s3/_delete.py +++ b/awswrangler/s3/_delete.py @@ -8,7 +8,7 @@ from typing import Any, Dict, List, Optional, Union from urllib.parse import unquote_plus as _unquote_plus -import boto3 # type: ignore +import boto3 from awswrangler import _utils, exceptions from awswrangler.s3._list import _path2list diff --git a/awswrangler/s3/_describe.py b/awswrangler/s3/_describe.py index d26fe9952..2024b7dd7 100644 --- a/awswrangler/s3/_describe.py +++ b/awswrangler/s3/_describe.py @@ -6,7 +6,7 @@ import logging from typing import Any, Dict, List, Optional, Tuple, Union -import boto3 # type: ignore +import boto3 from awswrangler import _utils from awswrangler.s3._list import _path2list diff --git a/awswrangler/s3/_fs.py b/awswrangler/s3/_fs.py new file mode 100644 index 000000000..6a5ef35b0 --- /dev/null +++ b/awswrangler/s3/_fs.py @@ -0,0 +1,560 @@ +"""Amazon S3 filesystem abstraction layer (PRIVATE).""" + +import concurrent.futures +import io +import itertools +import logging +import math +import socket +from contextlib import contextmanager +from typing import Any, BinaryIO, Dict, Iterator, List, Optional, Set, Tuple, Union, cast + +import boto3 +from botocore.exceptions import ClientError, ReadTimeoutError + +from awswrangler import _utils, exceptions +from awswrangler._config import apply_configs +from awswrangler.s3._describe import size_objects + +_logger: logging.Logger = logging.getLogger(__name__) + +_S3_RETRYABLE_ERRORS: Tuple[Any, Any, Any] = (socket.timeout, ConnectionError, ReadTimeoutError) + +_MIN_WRITE_BLOCK: int = 5_242_880 # 5 MB (5 * 2**20) +_MIN_PARALLEL_READ_BLOCK: int = 5_242_880 # 5 MB (5 * 2**20) + +_BOTOCORE_ACCEPTED_KWARGS: Dict[str, Set[str]] = { + "get_object": {"SSECustomerAlgorithm", "SSECustomerKey"}, + "create_multipart_upload": { + "ACL", + "Metadata", + "ServerSideEncryption", + "StorageClass", + "SSECustomerAlgorithm", + "SSECustomerKey", + "SSEKMSKeyId", + "SSEKMSEncryptionContext", + "Tagging", + }, + "upload_part": {"SSECustomerAlgorithm", "SSECustomerKey"}, + "complete_multipart_upload": set(), + "put_object": { + "ACL", + "Metadata", + "ServerSideEncryption", + "StorageClass", + "SSECustomerAlgorithm", + "SSECustomerKey", + "SSEKMSKeyId", + "SSEKMSEncryptionContext", + "Tagging", + }, +} + + +def _fetch_range( + range_values: Tuple[int, int], + bucket: str, + key: str, + boto3_primitives: _utils.Boto3PrimitivesType, + boto3_kwargs: Dict[str, Any], +) -> Tuple[int, bytes]: + start, end = range_values + _logger.debug("Fetching: s3://%s/%s - Range: %s-%s", bucket, key, start, end) + boto3_session: boto3.Session = _utils.boto3_from_primitives(primitives=boto3_primitives) + client: boto3.client = _utils.client(service_name="s3", session=boto3_session) + try: + resp: Dict[str, Any] = _utils.try_it( + f=client.get_object, + ex=_S3_RETRYABLE_ERRORS, + base=0.5, + max_num_tries=6, + Bucket=bucket, + Key=key, + 
Range=f"bytes={start}-{end - 1}", + **boto3_kwargs, + ) + except ClientError as ex: + if ex.response["Error"].get("Code", "Unknown") in ("416", "InvalidRange"): + return start, b"" + raise ex + return start, cast(bytes, resp["Body"].read()) + + +class _UploadProxy: + def __init__(self, use_threads: bool): + self.closed = False + self._exec: Optional[concurrent.futures.ThreadPoolExecutor] + self._results: List[Dict[str, Union[str, int]]] = [] + self._cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) + if self._cpus > 1: + self._exec = concurrent.futures.ThreadPoolExecutor(max_workers=self._cpus) + self._futures: List[Any] = [] + else: + self._exec = None + + @staticmethod + def _sort_by_part_number(parts: List[Dict[str, Union[str, int]]]) -> List[Dict[str, Union[str, int]]]: + return sorted(parts, key=lambda k: k["PartNumber"]) + + @staticmethod + def _caller( + bucket: str, + key: str, + part: int, + upload_id: str, + data: bytes, + boto3_primitives: _utils.Boto3PrimitivesType, + boto3_kwargs: Dict[str, Any], + ) -> Dict[str, Union[str, int]]: + _logger.debug("Upload part %s started.", part) + boto3_session: boto3.Session = _utils.boto3_from_primitives(primitives=boto3_primitives) + client: boto3.client = _utils.client(service_name="s3", session=boto3_session) + resp: Dict[str, Any] = _utils.try_it( + f=client.upload_part, + ex=_S3_RETRYABLE_ERRORS, + base=0.5, + max_num_tries=6, + Bucket=bucket, + Key=key, + Body=data, + PartNumber=part, + UploadId=upload_id, + **boto3_kwargs, + ) + _logger.debug("Upload part %s done.", part) + return {"PartNumber": part, "ETag": resp["ETag"]} + + def upload( + self, + bucket: str, + key: str, + part: int, + upload_id: str, + data: bytes, + boto3_session: boto3.Session, + boto3_kwargs: Dict[str, Any], + ) -> None: + """Upload Part.""" + if self._exec is not None: + _utils.block_waiting_available_thread(seq=self._futures, max_workers=self._cpus) + future = self._exec.submit( + _UploadProxy._caller, + bucket=bucket, + key=key, + part=part, + upload_id=upload_id, + data=data, + boto3_primitives=_utils.boto3_to_primitives(boto3_session=boto3_session), + boto3_kwargs=boto3_kwargs, + ) + self._futures.append(future) + else: + self._results.append( + self._caller( + bucket=bucket, + key=key, + part=part, + upload_id=upload_id, + data=data, + boto3_primitives=_utils.boto3_to_primitives(boto3_session=boto3_session), + boto3_kwargs=boto3_kwargs, + ) + ) + + def close(self) -> List[Dict[str, Union[str, int]]]: + """Close the proxy.""" + if self.closed is True: + return [] + if self._exec is not None: + for future in concurrent.futures.as_completed(self._futures): + self._results.append(future.result()) + self._exec.shutdown(wait=True) + self.closed = True + return self._sort_by_part_number(parts=self._results) + + +class _S3Object: # pylint: disable=too-many-instance-attributes + """Class to abstract S3 objects as ordinary files.""" + + def __init__( + self, + path: str, + s3_block_size: int, + mode: str, + use_threads: bool, + s3_additional_kwargs: Optional[Dict[str, str]], + boto3_session: Optional[boto3.Session], + newline: Optional[str], + encoding: Optional[str], + ) -> None: + self.closed: bool = False + self._use_threads = use_threads + self._newline: str = "\n" if newline is None else newline + self._encoding: str = "utf-8" if encoding is None else encoding + self._bucket, self._key = _utils.parse_path(path=path) + self._boto3_session: boto3.Session = _utils.ensure_session(session=boto3_session) + if mode not in {"rb", "wb", "r", "w"}: + raise 
NotImplementedError("File mode must be {'rb', 'wb', 'r', 'w'}, not %s" % mode) + self._mode: str = "rb" if mode is None else mode + if s3_block_size < 2: + raise exceptions.InvalidArgumentValue("s3_block_size MUST > 1") + self._s3_block_size: int = s3_block_size + self._s3_half_block_size: int = s3_block_size // 2 + self._s3_additional_kwargs: Dict[str, str] = {} if s3_additional_kwargs is None else s3_additional_kwargs + self._client: boto3.client = _utils.client(service_name="s3", session=self._boto3_session) + self._loc: int = 0 + + if self.readable() is True: + self._cache: bytes = b"" + self._start: int = 0 + self._end: int = 0 + size: Optional[int] = size_objects(path=[path], use_threads=False, boto3_session=self._boto3_session)[path] + if size is None: + raise exceptions.InvalidArgumentValue(f"S3 object w/o defined size: {path}") + self._size: int = size + _logger.debug("self._size: %s", self._size) + _logger.debug("self._s3_block_size: %s", self._s3_block_size) + elif self.writable() is True: + self._mpu: Dict[str, Any] = {} + self._buffer: io.BytesIO = io.BytesIO() + self._parts_count: int = 0 + self._size = 0 + self._upload_proxy: _UploadProxy = _UploadProxy(use_threads=self._use_threads) + else: + raise RuntimeError(f"Invalid mode: {self._mode}") + + def __enter__(self) -> Union["_S3Object", io.TextIOWrapper]: + return self + + def __exit__(self, exc_type: Any, exc_value: Any, exc_traceback: Any) -> None: + """Close the context.""" + _logger.debug("exc_type: %s", exc_type) + _logger.debug("exc_value: %s", exc_value) + _logger.debug("exc_traceback: %s", exc_traceback) + self.close() + + def __del__(self) -> None: + """Delete object tear down.""" + self.close() + + def __next__(self) -> Union[bytes, str]: + """Next line.""" + out: Union[bytes, str, None] = self.readline() + if not out: + raise StopIteration + return out + + next = __next__ + + def __iter__(self) -> "_S3Object": + """Iterate over lines.""" + return self + + def _get_botocore_valid_kwargs(self, function_name: str) -> Dict[str, Any]: + return {k: v for k, v in self._s3_additional_kwargs.items() if k in _BOTOCORE_ACCEPTED_KWARGS[function_name]} + + @staticmethod + def _merge_range(ranges: List[Tuple[int, bytes]]) -> bytes: + return b"".join(data for start, data in sorted(ranges, key=lambda r: r[0])) + + def _fetch_range_proxy(self, start: int, end: int) -> bytes: + _logger.debug("Fetching: s3://%s/%s - Range: %s-%s", self._bucket, self._key, start, end) + boto3_primitives: _utils.Boto3PrimitivesType = _utils.boto3_to_primitives(boto3_session=self._boto3_session) + boto3_kwargs: Dict[str, Any] = self._get_botocore_valid_kwargs(function_name="get_object") + cpus: int = _utils.ensure_cpu_count(use_threads=self._use_threads) + range_size: int = end - start + if cpus < 2 or range_size < (2 * _MIN_PARALLEL_READ_BLOCK): + return _fetch_range( + range_values=(start, end), + bucket=self._bucket, + key=self._key, + boto3_primitives=boto3_primitives, + boto3_kwargs=boto3_kwargs, + )[1] + sizes: Tuple[int, ...] 
= _utils.get_even_chunks_sizes( + total_size=range_size, chunk_size=_MIN_PARALLEL_READ_BLOCK, upper_bound=False + ) + ranges: List[Tuple[int, int]] = [] + chunk_start: int = start + for size in sizes: + ranges.append((chunk_start, chunk_start + size)) + chunk_start += size + with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor: + return self._merge_range( + ranges=list( + executor.map( + _fetch_range, + ranges, + itertools.repeat(self._bucket), + itertools.repeat(self._key), + itertools.repeat(boto3_primitives), + itertools.repeat(boto3_kwargs), + ) + ), + ) + + def _fetch(self, start: int, end: int) -> None: + end = self._size if end > self._size else end + start = 0 if start < 0 else start + + if start >= self._start and end <= self._end: + return None # Does not require download + + if end - start >= self._s3_block_size: # Fetching length greater than cache length + self._cache = self._fetch_range_proxy(start, end) + self._start = start + self._end = end + return None + + # Calculating block START and END positions + _logger.debug("Downloading: %s (start) / %s (end)", start, end) + mid: int = int(math.ceil((start + end) / 2)) + new_block_start: int = mid - self._s3_half_block_size + new_block_end: int = mid + self._s3_half_block_size + _logger.debug("new_block_start: %s / new_block_end: %s / mid: %s", new_block_start, new_block_end, mid) + if new_block_start < 0 and new_block_end > self._size: # both ends overflowing + new_block_start = 0 + new_block_end = self._size + elif new_block_end > self._size: # right overflow + new_block_start = new_block_start - (new_block_end - self._size) + new_block_start = 0 if new_block_start < 0 else new_block_start + new_block_end = self._size + elif new_block_start < 0: # left overflow + new_block_end = new_block_end + (0 - new_block_start) + new_block_end = self._size if new_block_end > self._size else new_block_end + new_block_start = 0 + _logger.debug( + "new_block_start: %s / new_block_end: %s/ self._start: %s / self._end: %s", + new_block_start, + new_block_end, + self._start, + self._end, + ) + + # Calculating missing bytes in cache + if ( # Full block download + (new_block_start < self._start and new_block_end > self._end) + or new_block_start > self._end + or new_block_end < self._start + ): + self._cache = self._fetch_range_proxy(new_block_start, new_block_end) + elif new_block_end > self._end: + prune_diff: int = new_block_start - self._start + self._cache = self._cache[prune_diff:] + self._fetch_range_proxy(self._end, new_block_end) + elif new_block_start < self._start: + prune_diff = new_block_end - self._end + self._cache = self._cache[:-prune_diff] + self._fetch_range_proxy(new_block_start, self._start) + else: + raise RuntimeError("Wrangler's cache calculation error.") + self._start = new_block_start + self._end = new_block_end + + return None + + def read(self, length: int = -1) -> Union[bytes, str]: + """Return cached data and fetch on demand chunks.""" + _logger.debug("Reading: %s bytes at %s", length, self._loc) + if self.readable() is False: + raise ValueError("File not in read mode.") + if length < 0: + length = self._size - self._loc + if self.closed is True: + raise ValueError("I/O operation on closed file.") + + self._fetch(self._loc, self._loc + length) + out: bytes = self._cache[self._loc - self._start : self._loc - self._start + length] + self._loc += len(out) + return out + + def readline(self, length: int = -1) -> Union[bytes, str]: + """Read until the next line terminator.""" + self._fetch(self._loc, 
self._loc + self._s3_block_size) + while True: + found: int = self._cache[self._loc - self._start :].find(self._newline.encode(encoding=self._encoding)) + + if 0 < length < found: + return self.read(length + 1) + if found >= 0: + return self.read(found + 1) + if self._end >= self._size: + return self.read(length) + + self._fetch(self._loc, self._end + self._s3_half_block_size) + + def readlines(self) -> List[Union[bytes, str]]: + """Return all lines as list.""" + return list(self) + + def tell(self) -> int: + """Return the current file location.""" + return self._loc + + def seek(self, loc: int, whence: int = 0) -> int: + """Set current file location.""" + if self.readable() is False: + raise ValueError("Seek only available in read mode") + if whence == 0: + loc_tmp: int = loc + elif whence == 1: + loc_tmp = self._loc + loc + elif whence == 2: + loc_tmp = self._size + loc + else: + raise ValueError(f"invalid whence ({whence}, should be 0, 1 or 2).") + if loc_tmp < 0: + raise ValueError("Seek before start of file") + self._loc = loc_tmp + return self._loc + + def write(self, data: bytes) -> int: + """Write data to buffer and only upload on close() or if buffer is greater than or equal to _MIN_WRITE_BLOCK.""" + if self.writable() is False: + raise RuntimeError("File not in write mode.") + if self.closed: + raise RuntimeError("I/O operation on closed file.") + n: int = self._buffer.write(data) + self._loc += n + if self._buffer.tell() >= _MIN_WRITE_BLOCK: + self.flush() + return n + + def flush(self, force: bool = False) -> None: + """Write buffered data to S3.""" + if self.closed: + raise RuntimeError("I/O operation on closed file.") + if self.writable(): + total_size: int = self._buffer.tell() + if total_size < _MIN_WRITE_BLOCK and force is False: + return None + if total_size == 0: + return None + _logger.debug("Flushing: %s bytes", total_size) + self._mpu = self._mpu or _utils.try_it( + f=self._client.create_multipart_upload, + ex=_S3_RETRYABLE_ERRORS, + base=0.5, + max_num_tries=6, + Bucket=self._bucket, + Key=self._key, + **self._get_botocore_valid_kwargs(function_name="create_multipart_upload"), + ) + self._buffer.seek(0) + for chunk_size in _utils.get_even_chunks_sizes( + total_size=total_size, chunk_size=_MIN_WRITE_BLOCK, upper_bound=False + ): + _logger.debug("chunk_size: %s bytes", chunk_size) + self._parts_count += 1 + self._upload_proxy.upload( + bucket=self._bucket, + key=self._key, + part=self._parts_count, + upload_id=self._mpu["UploadId"], + data=self._buffer.read(chunk_size), + boto3_session=self._boto3_session, + boto3_kwargs=self._get_botocore_valid_kwargs(function_name="upload_part"), + ) + self._buffer = io.BytesIO() + return None + + def readable(self) -> bool: + """Return whether this object is opened for reading.""" + return "r" in self._mode + + def seekable(self) -> bool: + """Return whether this object is opened for seeking.""" + return self.readable() + + def writable(self) -> bool: + """Return whether this object is opened for writing.""" + return "w" in self._mode + + def close(self) -> None: + """Clean up the cache.""" + if self.closed: + return None + if self.writable(): + _logger.debug("Closing: %s parts", self._parts_count) + _logger.debug("Buffer tell: %s", self._buffer.tell()) + if self._parts_count > 0: + self.flush(force=True) + pasts: List[Dict[str, Union[str, int]]] = self._upload_proxy.close() + part_info: Dict[str, List[Dict[str, Any]]] = {"Parts": pasts} + _logger.debug("complete_multipart_upload") + _utils.try_it( + 
f=self._client.complete_multipart_upload, + ex=_S3_RETRYABLE_ERRORS, + base=0.5, + max_num_tries=6, + Bucket=self._bucket, + Key=self._key, + UploadId=self._mpu["UploadId"], + MultipartUpload=part_info, + **self._get_botocore_valid_kwargs(function_name="complete_multipart_upload"), + ) + elif self._buffer.tell() > 0: + _logger.debug("put_object") + _utils.try_it( + f=self._client.put_object, + ex=_S3_RETRYABLE_ERRORS, + base=0.5, + max_num_tries=6, + Bucket=self._bucket, + Key=self._key, + Body=self._buffer.getvalue(), + **self._get_botocore_valid_kwargs(function_name="put_object"), + ) + self._parts_count = 0 + self._buffer.seek(0) + self._buffer.truncate(0) + self._upload_proxy.close() + elif self.readable(): + self._cache = b"" + else: + raise RuntimeError(f"Invalid mode: {self._mode}") + self.closed = True + return None + + +@contextmanager +@apply_configs +def open_s3_object( + path: str, + mode: str, + use_threads: bool = False, + s3_additional_kwargs: Optional[Dict[str, str]] = None, + s3_block_size: int = 4_194_304, # 4 MB (4 * 2**20) + boto3_session: Optional[boto3.Session] = None, + newline: Optional[str] = "\n", + encoding: Optional[str] = "utf-8", +) -> Iterator[Union[_S3Object, io.TextIOWrapper]]: + """Return a _S3Object or TextIOWrapper based in the received mode.""" + s3obj: Optional[_S3Object] = None + text_s3obj: Optional[io.TextIOWrapper] = None + try: + s3obj = _S3Object( + path=path, + s3_block_size=s3_block_size, + mode=mode, + use_threads=use_threads, + s3_additional_kwargs=s3_additional_kwargs, + boto3_session=boto3_session, + encoding=encoding, + newline=newline, + ) + if "b" in mode: # binary + yield s3obj + else: # text + text_s3obj = io.TextIOWrapper( + buffer=cast(BinaryIO, s3obj), + encoding=encoding, + newline=newline, + line_buffering=False, + write_through=False, + ) + yield text_s3obj + finally: + if text_s3obj is not None and text_s3obj.closed is False: + text_s3obj.close() + if s3obj is not None and s3obj.closed is False: + s3obj.close() diff --git a/awswrangler/s3/_list.py b/awswrangler/s3/_list.py index a0b5e13a0..4406a38e8 100644 --- a/awswrangler/s3/_list.py +++ b/awswrangler/s3/_list.py @@ -5,8 +5,8 @@ import logging from typing import Any, Dict, List, Optional, Sequence, Union -import boto3 # type: ignore -import botocore.exceptions # type: ignore +import boto3 +import botocore.exceptions from awswrangler import _utils, exceptions diff --git a/awswrangler/s3/_read.py b/awswrangler/s3/_read.py index 6bef0a144..14d7d1ef1 100644 --- a/awswrangler/s3/_read.py +++ b/awswrangler/s3/_read.py @@ -3,11 +3,12 @@ import logging from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union, cast -import numpy as np # type: ignore -import pandas as pd # type: ignore -from pandas.api.types import union_categoricals # type: ignore +import numpy as np +import pandas as pd +from pandas.api.types import union_categoricals from awswrangler import exceptions +from awswrangler.s3._list import _prefix_cleanup _logger: logging.Logger = logging.getLogger(__name__) @@ -15,7 +16,7 @@ def _get_path_root(path: Union[str, List[str]], dataset: bool) -> Optional[str]: if (dataset is True) and (not isinstance(path, str)): raise exceptions.InvalidArgument("The path argument must be a string if dataset=True (Amazon S3 prefix).") - return str(path) if dataset is True else None + return _prefix_cleanup(str(path)) if dataset is True else None def _get_path_ignore_suffix(path_ignore_suffix: Union[str, List[str], None]) -> Union[List[str], None]: @@ -107,7 +108,14 @@ def 
_extract_partitions_dtypes_from_table_details(response: Dict[str, Any]) -> D return dtypes -def _union(dfs: List[pd.DataFrame], ignore_index: bool) -> pd.DataFrame: +def _union(dfs: List[pd.DataFrame], ignore_index: Optional[bool]) -> pd.DataFrame: + if ignore_index is None: + ignore_index = False + for df in dfs: + if hasattr(df, "_awswrangler_ignore_index"): + if df._awswrangler_ignore_index is True: # pylint: disable=protected-access + ignore_index = True + break cats: Tuple[Set[str], ...] = tuple(set(df.select_dtypes(include="category").columns) for df in dfs) for col in set.intersection(*cats): cat = union_categoricals([df[col] for df in dfs]) diff --git a/awswrangler/s3/_read_concurrent.py b/awswrangler/s3/_read_concurrent.py index d6b569c83..08b90c042 100644 --- a/awswrangler/s3/_read_concurrent.py +++ b/awswrangler/s3/_read_concurrent.py @@ -3,10 +3,10 @@ import concurrent.futures import itertools import logging -from typing import Any, Callable, Dict, List +from typing import Any, Callable, Dict, List, Optional -import boto3 # type: ignore -import pandas as pd # type: ignore +import boto3 +import pandas as pd from awswrangler import _utils from awswrangler.s3._read import _union @@ -29,7 +29,7 @@ def _caller( def _read_concurrent( func: Callable[..., pd.DataFrame], paths: List[str], - ignore_index: bool, + ignore_index: Optional[bool], boto3_session: boto3.Session, **func_kwargs: Any, ) -> pd.DataFrame: diff --git a/awswrangler/s3/_read_parquet.py b/awswrangler/s3/_read_parquet.py index a9e825d8b..38713f0de 100644 --- a/awswrangler/s3/_read_parquet.py +++ b/awswrangler/s3/_read_parquet.py @@ -3,19 +3,21 @@ import concurrent.futures import datetime import itertools +import json import logging import pprint +import warnings from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union, cast -import boto3 # type: ignore -import pandas as pd # type: ignore -import pyarrow as pa # type: ignore -import pyarrow.lib # type: ignore -import pyarrow.parquet # type: ignore -import s3fs # type: ignore +import boto3 +import pandas as pd +import pyarrow as pa +import pyarrow.lib +import pyarrow.parquet from awswrangler import _data_types, _utils, exceptions from awswrangler._config import apply_configs +from awswrangler.s3._fs import open_s3_object from awswrangler.s3._list import _path2list from awswrangler.s3._read import ( _apply_partition_filter, @@ -32,12 +34,16 @@ def _read_parquet_metadata_file( - path: str, boto3_session: boto3.Session, s3_additional_kwargs: Optional[Dict[str, str]], + path: str, boto3_session: boto3.Session, s3_additional_kwargs: Optional[Dict[str, str]], use_threads: bool ) -> Dict[str, str]: - fs: s3fs.S3FileSystem = _utils.get_fs( - s3fs_block_size=4_194_304, session=boto3_session, s3_additional_kwargs=s3_additional_kwargs # 4 MB (4 * 2**20) - ) - with _utils.open_file(fs=fs, path=path, mode="rb") as f: + with open_s3_object( + path=path, + mode="rb", + use_threads=use_threads, + s3_block_size=131_072, # 128 KB (128 * 2**10) + s3_additional_kwargs=s3_additional_kwargs, + boto3_session=boto3_session, + ) as f: pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f) return _data_types.athena_types_from_pyarrow_schema(schema=pq_file.schema.to_arrow_schema(), partitions=None)[0] @@ -54,7 +60,9 @@ def _read_schemas_from_files( n_paths: int = len(paths) if use_threads is False or n_paths == 1: schemas = tuple( - _read_parquet_metadata_file(path=p, boto3_session=boto3_session, s3_additional_kwargs=s3_additional_kwargs,) + 
_read_parquet_metadata_file( + path=p, boto3_session=boto3_session, s3_additional_kwargs=s3_additional_kwargs, use_threads=use_threads + ) for p in paths ) elif n_paths > 1: @@ -66,6 +74,7 @@ def _read_schemas_from_files( paths, itertools.repeat(_utils.boto3_to_primitives(boto3_session=boto3_session)), # Boto3.Session itertools.repeat(s3_additional_kwargs), + itertools.repeat(use_threads), ) ) _logger.debug("schemas: %s", schemas) @@ -160,6 +169,36 @@ def _read_parquet_metadata( return columns_types, partitions_types, partitions_values +def _apply_index(df: pd.DataFrame, metadata: Dict[str, Any]) -> pd.DataFrame: + index_columns: List[Any] = metadata["index_columns"] + if index_columns: + if isinstance(index_columns[0], str): + df = df.set_index(keys=index_columns, drop=True, inplace=False, verify_integrity=False) + elif isinstance(index_columns[0], dict) and index_columns[0]["kind"] == "range": + col = index_columns[0] + if col["kind"] == "range": + df.index = pd.RangeIndex(start=col["start"], stop=col["stop"], step=col["step"]) + if col["name"] is not None and col["name"].startswith("__index_level_") is False: + df.index.name = col["name"] + df.index.names = [None if n is not None and n.startswith("__index_level_") else n for n in df.index.names] + ignore_index: bool = False + else: + ignore_index = True + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=UserWarning) + df._awswrangler_ignore_index = ignore_index # pylint: disable=protected-access + return df + + +def _apply_timezone(df: pd.DataFrame, metadata: Dict[str, Any]) -> pd.DataFrame: + for c in metadata["columns"]: + if c["pandas_type"] == "datetimetz": + _logger.debug("applying timezone (%s) on column %s", c["metadata"]["timezone"], c["field_name"]) + df[c["field_name"]] = df[c["field_name"]].dt.tz_localize(tz="UTC") + df[c["field_name"]] = df[c["field_name"]].dt.tz_convert(tz=c["metadata"]["timezone"]) + return df + + def _arrowtable2df( table: pa.Table, categories: Optional[List[str]], @@ -169,6 +208,9 @@ def _arrowtable2df( path: str, path_root: Optional[str], ) -> pd.DataFrame: + metadata: Dict[str, Any] = {} + if table.schema.metadata is not None and b"pandas" in table.schema.metadata: + metadata = json.loads(table.schema.metadata[b"pandas"]) df: pd.DataFrame = _apply_partitions( df=table.to_pandas( use_threads=use_threads, @@ -177,15 +219,21 @@ def _arrowtable2df( integer_object_nulls=False, date_as_object=True, ignore_metadata=True, - categories=categories, + strings_to_categorical=False, safe=safe, + categories=categories, types_mapper=_data_types.pyarrow2pandas_extension, ), dataset=dataset, path=path, path_root=path_root, ) - return _utils.ensure_df_is_mutable(df=df) + df = _utils.ensure_df_is_mutable(df=df) + if metadata: + _logger.debug("metadata: %s", metadata) + df = _apply_index(df=df, metadata=metadata) + df = _apply_timezone(df=df, metadata=metadata) + return df def _read_parquet_chunked( @@ -202,13 +250,17 @@ def _read_parquet_chunked( use_threads: bool, ) -> Iterator[pd.DataFrame]: next_slice: Optional[pd.DataFrame] = None - fs: s3fs.S3FileSystem = _utils.get_fs( - s3fs_block_size=8_388_608, session=boto3_session, s3_additional_kwargs=s3_additional_kwargs # 8 MB (8 * 2**20) - ) last_schema: Optional[Dict[str, str]] = None last_path: str = "" for path in paths: - with _utils.open_file(fs=fs, path=path, mode="rb") as f: + with open_s3_object( + path=path, + mode="rb", + use_threads=use_threads, + s3_block_size=10_485_760, # 10 MB (10 * 2**20) + 
s3_additional_kwargs=s3_additional_kwargs, + boto3_session=boto3_session, + ) as f: pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f, read_dictionary=categories) schema: Dict[str, str] = _data_types.athena_types_from_pyarrow_schema( schema=pq_file.schema.to_arrow_schema(), partitions=None @@ -241,7 +293,7 @@ def _read_parquet_chunked( yield df elif isinstance(chunked, int) and chunked > 0: if next_slice is not None: - df = pd.concat(objs=[next_slice, df], ignore_index=True, sort=False, copy=False) + df = _union(dfs=[next_slice, df], ignore_index=None) while len(df.index) >= chunked: yield df.iloc[:chunked] df = df.iloc[chunked:] @@ -261,13 +313,16 @@ def _read_parquet_file( categories: Optional[List[str]], boto3_session: boto3.Session, s3_additional_kwargs: Optional[Dict[str, str]], + use_threads: bool, ) -> pa.Table: - fs: s3fs.S3FileSystem = _utils.get_fs( - s3fs_block_size=134_217_728, - session=boto3_session, - s3_additional_kwargs=s3_additional_kwargs, # 128 MB (128 * 2**20) - ) - with _utils.open_file(fs=fs, path=path, mode="rb") as f: + with open_s3_object( + path=path, + mode="rb", + use_threads=use_threads, + s3_block_size=134_217_728, # 128 MB (128 * 2**20) + s3_additional_kwargs=s3_additional_kwargs, + boto3_session=boto3_session, + ) as f: pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f, read_dictionary=categories) return pq_file.read(columns=columns, use_threads=False, use_pandas_metadata=False) @@ -277,13 +332,21 @@ def _count_row_groups( categories: Optional[List[str]], boto3_session: boto3.Session, s3_additional_kwargs: Optional[Dict[str, str]], + use_threads: bool, ) -> int: - fs: s3fs.S3FileSystem = _utils.get_fs( - s3fs_block_size=4_194_304, session=boto3_session, s3_additional_kwargs=s3_additional_kwargs # 4 MB (4 * 2**20) - ) - with _utils.open_file(fs=fs, path=path, mode="rb") as f: + _logger.debug("Counting row groups...") + with open_s3_object( + path=path, + mode="rb", + use_threads=use_threads, + s3_block_size=131_072, # 128 KB (128 * 2**10) + s3_additional_kwargs=s3_additional_kwargs, + boto3_session=boto3_session, + ) as f: pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f, read_dictionary=categories) - return cast(int, pq_file.num_row_groups) + n: int = cast(int, pq_file.num_row_groups) + _logger.debug("Row groups count: %d", n) + return n def _read_parquet_row_group( @@ -293,14 +356,17 @@ def _read_parquet_row_group( categories: Optional[List[str]], boto3_primitives: _utils.Boto3PrimitivesType, s3_additional_kwargs: Optional[Dict[str, str]], + use_threads: bool, ) -> pa.Table: boto3_session: boto3.Session = _utils.boto3_from_primitives(primitives=boto3_primitives) - fs: s3fs.S3FileSystem = _utils.get_fs( - s3fs_block_size=134_217_728, - session=boto3_session, - s3_additional_kwargs=s3_additional_kwargs, # 128 MB (128 * 2**20) - ) - with _utils.open_file(fs=fs, path=path, mode="rb") as f: + with open_s3_object( + path=path, + mode="rb", + use_threads=use_threads, + s3_block_size=10_485_760, # 10 MB (10 * 2**20) + s3_additional_kwargs=s3_additional_kwargs, + boto3_session=boto3_session, + ) as f: pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f, read_dictionary=categories) num_row_groups: int = pq_file.num_row_groups _logger.debug("Reading Row Group %s/%s [multi-threaded]", row_group + 1, num_row_groups) @@ -325,11 +391,16 @@ def _read_parquet( categories=categories, boto3_session=boto3_session, s3_additional_kwargs=s3_additional_kwargs, + 
use_threads=use_threads, ) else: cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) num_row_groups: int = _count_row_groups( - path=path, categories=categories, boto3_session=boto3_session, s3_additional_kwargs=s3_additional_kwargs + path=path, + categories=categories, + boto3_session=boto3_session, + s3_additional_kwargs=s3_additional_kwargs, + use_threads=use_threads, ) with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor: tables: Tuple[pa.Table, ...] = tuple( @@ -341,6 +412,7 @@ def _read_parquet( itertools.repeat(categories), itertools.repeat(_utils.boto3_to_primitives(boto3_session=boto3_session)), itertools.repeat(s3_additional_kwargs), + itertools.repeat(use_threads), ) ) table = pa.lib.concat_tables(tables, promote=False) @@ -453,9 +525,8 @@ def read_parquet( The filter is applied only after list all s3 files. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. - s3_additional_kwargs: - Forward to s3fs, useful for server side encryption - https://s3fs.readthedocs.io/en/latest/#serverside-encryption + s3_additional_kwargs : Dict[str, str] + Forward to botocore requests, only "SSECustomerAlgorithm" and "SSECustomerKey" arguments will be considered. Returns ------- @@ -469,17 +540,6 @@ def read_parquet( >>> import awswrangler as wr >>> df = wr.s3.read_parquet(path='s3://bucket/prefix/') - Reading all Parquet files under a prefix encrypted with a KMS key - - >>> import awswrangler as wr - >>> df = wr.s3.read_parquet( - ... path='s3://bucket/prefix/', - ... s3_additional_kwargs={ - ... 'ServerSideEncryption': 'aws:kms', - ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' - ... } - ... ) - Reading all Parquet files from a list >>> import awswrangler as wr @@ -546,8 +606,8 @@ def read_parquet( ) if use_threads is True: args["use_threads"] = True - return _read_concurrent(func=_read_parquet, ignore_index=True, paths=paths, **args) - return _union(dfs=[_read_parquet(path=p, **args) for p in paths], ignore_index=True) + return _read_concurrent(func=_read_parquet, paths=paths, ignore_index=None, **args) + return _union(dfs=[_read_parquet(path=p, **args) for p in paths], ignore_index=None) @apply_configs @@ -629,8 +689,7 @@ def read_parquet_table( boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. s3_additional_kwargs: - Forward to s3fs, useful for server side encryption - https://s3fs.readthedocs.io/en/latest/#serverside-encryption + Forward to botocore requests, only "SSECustomerAlgorithm" and "SSECustomerKey" arguments will be considered. Returns ------- @@ -677,8 +736,8 @@ def read_parquet_table( res: Dict[str, Any] = client_glue.get_table(**args) try: path: str = res["Table"]["StorageDescriptor"]["Location"] - except KeyError: - raise exceptions.InvalidTable(f"Missing s3 location for {database}.{table}.") + except KeyError as ex: + raise exceptions.InvalidTable(f"Missing s3 location for {database}.{table}.") from ex return _data_types.cast_pandas_with_athena_types( df=read_parquet( path=path, @@ -749,8 +808,7 @@ def read_parquet_metadata( boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. s3_additional_kwargs: - Forward to s3fs, useful for server side encryption - https://s3fs.readthedocs.io/en/latest/#serverside-encryption + Forward to botocore requests, only "SSECustomerAlgorithm" and "SSECustomerKey" arguments will be considered. 
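The docstrings above narrow `s3_additional_kwargs` for read calls down to the SSE-C parameters ("SSECustomerAlgorithm" and "SSECustomerKey"). A hypothetical sketch of what such a read could look like, not taken from this patch; the bucket path and key material below are placeholders:

```python
# Hypothetical sketch: reading Parquet objects that were written with SSE-C
# (customer-provided keys). Per the docstring above, only "SSECustomerAlgorithm"
# and "SSECustomerKey" are forwarded on reads. Path and key are placeholders.
import awswrangler as wr

sse_c_key = "my-32-byte-long-secret-key-12345"  # placeholder 32-byte key material

df = wr.s3.read_parquet(
    path="s3://bucket/prefix/",  # placeholder prefix
    dataset=True,
    s3_additional_kwargs={
        "SSECustomerAlgorithm": "AES256",
        "SSECustomerKey": sse_c_key,
    },
)
```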
Returns ------- diff --git a/awswrangler/s3/_read_text.py b/awswrangler/s3/_read_text.py index e784e4eb1..49bbcbf57 100644 --- a/awswrangler/s3/_read_text.py +++ b/awswrangler/s3/_read_text.py @@ -5,13 +5,13 @@ import pprint from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union -import boto3 # type: ignore -import pandas as pd # type: ignore -import pandas.io.parsers # type: ignore -import s3fs # type: ignore -from pandas.io.common import infer_compression # type: ignore +import boto3 +import pandas as pd +import pandas.io.parsers +from pandas.io.common import infer_compression from awswrangler import _utils, exceptions +from awswrangler.s3._fs import open_s3_object from awswrangler.s3._list import _path2list from awswrangler.s3._read import ( _apply_partition_filter, @@ -43,16 +43,21 @@ def _read_text_chunked( pandas_kwargs: Dict[str, Any], s3_additional_kwargs: Optional[Dict[str, str]], dataset: bool, + use_threads: bool, ) -> Iterator[pd.DataFrame]: for path in paths: _logger.debug("path: %s", path) - fs: s3fs.S3FileSystem = _utils.get_fs( - s3fs_block_size=8_388_608, - session=boto3_session, - s3_additional_kwargs=s3_additional_kwargs, # 8 MB (8 * 2**20) - ) mode, encoding, newline = _get_read_details(path=path, pandas_kwargs=pandas_kwargs) - with _utils.open_file(fs=fs, path=path, mode=mode, encoding=encoding, newline=newline) as f: + with open_s3_object( + path=path, + mode=mode, + s3_block_size=10_485_760, # 10 MB (10 * 2**20) + encoding=encoding, + use_threads=use_threads, + s3_additional_kwargs=s3_additional_kwargs, + newline=newline, + boto3_session=boto3_session, + ) as f: reader: pandas.io.parsers.TextFileReader = parser_func(f, chunksize=chunksize, **pandas_kwargs) for df in reader: yield _apply_partitions(df=df, dataset=dataset, path=path, path_root=path_root) @@ -66,14 +71,19 @@ def _read_text_file( pandas_kwargs: Dict[str, Any], s3_additional_kwargs: Optional[Dict[str, str]], dataset: bool, + use_threads: bool, ) -> pd.DataFrame: - fs: s3fs.S3FileSystem = _utils.get_fs( - s3fs_block_size=134_217_728, - session=boto3_session, - s3_additional_kwargs=s3_additional_kwargs, # 128 MB (128 * 2**20) - ) mode, encoding, newline = _get_read_details(path=path, pandas_kwargs=pandas_kwargs) - with _utils.open_file(fs=fs, path=path, mode=mode, encoding=encoding, newline=newline) as f: + with open_s3_object( + path=path, + mode=mode, + use_threads=use_threads, + s3_block_size=134_217_728, # 128 MB (128 * 2**20) + encoding=encoding, + s3_additional_kwargs=s3_additional_kwargs, + newline=newline, + boto3_session=boto3_session, + ) as f: df: pd.DataFrame = parser_func(f, **pandas_kwargs) return _apply_partitions(df=df, dataset=dataset, path=path, path_root=path_root) @@ -119,6 +129,7 @@ def _read_text( "path_root": path_root, "pandas_kwargs": pandas_kwargs, "s3_additional_kwargs": s3_additional_kwargs, + "use_threads": use_threads, } _logger.debug("args:\n%s", pprint.pformat(args)) ret: Union[pd.DataFrame, Iterator[pd.DataFrame]] @@ -186,14 +197,13 @@ def read_csv( The filter is applied only after list all s3 files. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. - s3_additional_kwargs: - Forward to s3fs, useful for server side encryption - https://s3fs.readthedocs.io/en/latest/#serverside-encryption + s3_additional_kwargs : Dict[str, str] + Forward to botocore requests, only "SSECustomerAlgorithm" and "SSECustomerKey" arguments will be considered. 
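The text readers above now stream objects through the new `open_s3_object` context manager instead of s3fs. A minimal sketch of that chunked pattern, assuming the internal module path `awswrangler.s3._fs` imported elsewhere in this patch; the S3 path, block size and chunk size are illustrative placeholders:

```python
# Minimal sketch of the chunked text-read pattern used in _read_text_chunked:
# open the S3 object through open_s3_object and let pandas iterate in chunks.
# The S3 path below is a placeholder.
import pandas as pd

from awswrangler.s3._fs import open_s3_object

with open_s3_object(
    path="s3://bucket/prefix/file.csv",  # placeholder
    mode="r",
    s3_block_size=10_485_760,  # 10 MB blocks, as used above
    encoding="utf-8",
    use_threads=False,
) as f:
    for chunk in pd.read_csv(f, chunksize=1_000):
        print(chunk.shape)  # each chunk is a regular pandas DataFrame
```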
chunksize: int, optional If specified, return an generator where chunksize is the number of rows to include in each chunk. - dataset: bool + dataset : bool If `True` read a CSV dataset instead of simple file(s) loading all the related partitions as columns. - partition_filter: Optional[Callable[[Dict[str, str]], bool]] + partition_filter : Optional[Callable[[Dict[str, str]], bool]] Callback Function filters to apply on PARTITION columns (PUSH-DOWN filter). This function MUST receive a single argument (Dict[str, str]) where keys are partitions names and values are partitions values. Partitions values will be always strings extracted from S3. @@ -201,8 +211,10 @@ def read_csv( Ignored if `dataset=False`. E.g ``lambda x: True if x["year"] == "2020" and x["month"] == "1" else False`` https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/023%20-%20Flexible%20Partitions%20Filter.ipynb - pandas_kwargs: - keyword arguments forwarded to pandas.read_csv(). + pandas_kwargs : + KEYWORD arguments forwarded to pandas.read_csv(). You can NOT pass `pandas_kwargs` explicit, just add valid + Pandas arguments in the function call and Wrangler will accept it. + e.g. wr.s3.read_csv('s3://bucket/prefix/', sep='|', na_values=['null', 'none'], skip_blank_lines=True) https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html Returns @@ -217,16 +229,10 @@ def read_csv( >>> import awswrangler as wr >>> df = wr.s3.read_csv(path='s3://bucket/prefix/') - Reading all CSV files under a prefix encrypted with a KMS key + Reading all CSV files under a prefix and using pandas_kwargs >>> import awswrangler as wr - >>> df = wr.s3.read_csv( - ... path='s3://bucket/prefix/', - ... s3_additional_kwargs={ - ... 'ServerSideEncryption': 'aws:kms', - ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' - ... } - ... ) + >>> df = wr.s3.read_csv('s3://bucket/prefix/', sep='|', na_values=['null', 'none'], skip_blank_lines=True) Reading all CSV files from a list @@ -247,6 +253,12 @@ def read_csv( >>> df = wr.s3.read_csv(path, dataset=True, partition_filter=my_filter) """ + if "pandas_kwargs" in pandas_kwargs: + raise exceptions.InvalidArgument( + "You can NOT pass `pandas_kwargs` explicit, just add valid " + "Pandas arguments in the function call and Wrangler will accept it." + "e.g. wr.s3.read_csv('s3://bucket/prefix/', sep='|', skip_blank_lines=True)" + ) ignore_index: bool = "index_col" not in pandas_kwargs return _read_text( parser_func=pd.read_csv, @@ -320,8 +332,7 @@ def read_fwf( boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. s3_additional_kwargs: - Forward to s3fs, useful for server side encryption - https://s3fs.readthedocs.io/en/latest/#serverside-encryption + Forward to botocore requests, only "SSECustomerAlgorithm" and "SSECustomerKey" arguments will be considered. chunksize: int, optional If specified, return an generator where chunksize is the number of rows to include in each chunk. dataset: bool @@ -335,7 +346,9 @@ def read_fwf( E.g ``lambda x: True if x["year"] == "2020" and x["month"] == "1" else False`` https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/023%20-%20Flexible%20Partitions%20Filter.ipynb pandas_kwargs: - keyword arguments forwarded to pandas.read_fwf(). + KEYWORD arguments forwarded to pandas.read_fwf(). You can NOT pass `pandas_kwargs` explicit, just add valid + Pandas arguments in the function call and Wrangler will accept it. + e.g. 
wr.s3.read_fwf(path='s3://bucket/prefix/', widths=[1, 3], names=["c0", "c1"]) https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_fwf.html Returns @@ -348,28 +361,22 @@ def read_fwf( Reading all fixed-width formatted (FWF) files under a prefix >>> import awswrangler as wr - >>> df = wr.s3.read_fwf(path='s3://bucket/prefix/') - - Reading all fixed-width formatted (FWF) files under a prefix encrypted with a KMS key - - >>> import awswrangler as wr - >>> df = wr.s3.read_fwf( - ... path='s3://bucket/prefix/', - ... s3_additional_kwargs={ - ... 'ServerSideEncryption': 'aws:kms', - ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' - ... } - ... ) + >>> df = wr.s3.read_fwf(path='s3://bucket/prefix/', widths=[1, 3], names=['c0', 'c1]) Reading all fixed-width formatted (FWF) files from a list >>> import awswrangler as wr - >>> df = wr.s3.read_fwf(path=['s3://bucket/filename0.txt', 's3://bucket/filename1.txt']) + >>> df = wr.s3.read_fwf(path=['s3://bucket/0.txt', 's3://bucket/1.txt'], widths=[1, 3], names=['c0', 'c1']) Reading in chunks of 100 lines >>> import awswrangler as wr - >>> dfs = wr.s3.read_fwf(path=['s3://bucket/filename0.txt', 's3://bucket/filename1.txt'], chunksize=100) + >>> dfs = wr.s3.read_fwf( + ... path=['s3://bucket/0.txt', 's3://bucket/1.txt'], + ... chunksize=100, + ... widths=[1, 3], + ... names=["c0", "c1"] + ... ) >>> for df in dfs: >>> print(df) # 100 lines Pandas DataFrame @@ -377,9 +384,15 @@ def read_fwf( >>> import awswrangler as wr >>> my_filter = lambda x: True if x["city"].startswith("new") else False - >>> df = wr.s3.read_fwf(path, dataset=True, partition_filter=my_filter) + >>> df = wr.s3.read_fwf(path, dataset=True, partition_filter=my_filter, widths=[1, 3], names=["c0", "c1"]) """ + if "pandas_kwargs" in pandas_kwargs: + raise exceptions.InvalidArgument( + "You can NOT pass `pandas_kwargs` explicit, just add valid " + "Pandas arguments in the function call and Wrangler will accept it." + "e.g. wr.s3.read_fwf(path, widths=[1, 3], names=['c0', 'c1'])" + ) return _read_text( parser_func=pd.read_fwf, path=path, @@ -456,8 +469,7 @@ def read_json( boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. s3_additional_kwargs: - Forward to s3fs, useful for server side encryption - https://s3fs.readthedocs.io/en/latest/#serverside-encryption + Forward to botocore requests, only "SSECustomerAlgorithm" and "SSECustomerKey" arguments will be considered. chunksize: int, optional If specified, return an generator where chunksize is the number of rows to include in each chunk. dataset: bool @@ -472,7 +484,9 @@ def read_json( E.g ``lambda x: True if x["year"] == "2020" and x["month"] == "1" else False`` https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/023%20-%20Flexible%20Partitions%20Filter.ipynb pandas_kwargs: - keyword arguments forwarded to pandas.read_json(). + KEYWORD arguments forwarded to pandas.read_json(). You can NOT pass `pandas_kwargs` explicit, just add valid + Pandas arguments in the function call and Wrangler will accept it. + e.g. 
wr.s3.read_json('s3://bucket/prefix/', lines=True, keep_default_dates=True) https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_json.html Returns @@ -487,16 +501,10 @@ def read_json( >>> import awswrangler as wr >>> df = wr.s3.read_json(path='s3://bucket/prefix/') - Reading all JSON files under a prefix encrypted with a KMS key + Reading all CSV files under a prefix and using pandas_kwargs >>> import awswrangler as wr - >>> df = wr.s3.read_json( - ... path='s3://bucket/prefix/', - ... s3_additional_kwargs={ - ... 'ServerSideEncryption': 'aws:kms', - ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' - ... } - ... ) + >>> df = wr.s3.read_json('s3://bucket/prefix/', lines=True, keep_default_dates=True) Reading all JSON files from a list @@ -506,7 +514,7 @@ def read_json( Reading in chunks of 100 lines >>> import awswrangler as wr - >>> dfs = wr.s3.read_json(path=['s3://bucket/filename0.json', 's3://bucket/filename1.json'], chunksize=100) + >>> dfs = wr.s3.read_json(path=['s3://bucket/0.json', 's3://bucket/1.json'], chunksize=100, lines=True) >>> for df in dfs: >>> print(df) # 100 lines Pandas DataFrame @@ -517,6 +525,12 @@ def read_json( >>> df = wr.s3.read_json(path, dataset=True, partition_filter=my_filter) """ + if "pandas_kwargs" in pandas_kwargs: + raise exceptions.InvalidArgument( + "You can NOT pass `pandas_kwargs` explicit, just add valid " + "Pandas arguments in the function call and Wrangler will accept it." + "e.g. wr.s3.read_json(path, lines=True, keep_default_dates=True)" + ) if (dataset is True) and ("lines" not in pandas_kwargs): pandas_kwargs["lines"] = True pandas_kwargs["orient"] = orient diff --git a/awswrangler/s3/_wait.py b/awswrangler/s3/_wait.py index 4b2dcf048..1094901f0 100644 --- a/awswrangler/s3/_wait.py +++ b/awswrangler/s3/_wait.py @@ -5,7 +5,7 @@ import logging from typing import List, Optional, Tuple, Union -import boto3 # type: ignore +import boto3 from awswrangler import _utils diff --git a/awswrangler/s3/_write.py b/awswrangler/s3/_write.py index 2490e9d6d..370a713ed 100644 --- a/awswrangler/s3/_write.py +++ b/awswrangler/s3/_write.py @@ -3,7 +3,7 @@ import logging from typing import Any, Dict, List, Optional, Tuple -import pandas as pd # type: ignore +import pandas as pd from awswrangler import _data_types, _utils, catalog, exceptions @@ -38,6 +38,7 @@ def _apply_dtype( def _validate_args( df: pd.DataFrame, table: Optional[str], + database: Optional[str], dataset: bool, path: str, partition_cols: Optional[List[str]], @@ -63,6 +64,11 @@ def _validate_args( "arguments: database, table, description, parameters, " "columns_comments." ) + elif (database is None) != (table is None): + raise exceptions.InvalidArgumentCombination( + "Arguments database and table must be passed together. If you want to store your dataset in the Glue " + "Catalog, please ensure you are passing both." 
+ ) def _sanitize( diff --git a/awswrangler/s3/_write_concurrent.py b/awswrangler/s3/_write_concurrent.py index cd1bbd48e..a2fc7e8fc 100644 --- a/awswrangler/s3/_write_concurrent.py +++ b/awswrangler/s3/_write_concurrent.py @@ -4,8 +4,8 @@ import logging from typing import Any, Callable, Dict, List, Optional -import boto3 # type: ignore -import pandas as pd # type: ignore +import boto3 +import pandas as pd from awswrangler import _utils @@ -16,9 +16,9 @@ class _WriteProxy: def __init__(self, use_threads: bool): self._exec: Optional[concurrent.futures.ThreadPoolExecutor] self._results: List[str] = [] - cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) - if cpus > 1: - self._exec = concurrent.futures.ThreadPoolExecutor(max_workers=cpus) + self._cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) + if self._cpus > 1: + self._exec = concurrent.futures.ThreadPoolExecutor(max_workers=self._cpus) self._futures: List[Any] = [] else: self._exec = None @@ -35,6 +35,7 @@ def _caller( def write(self, func: Callable[..., List[str]], boto3_session: boto3.Session, **func_kwargs: Any) -> None: """Write File.""" if self._exec is not None: + _utils.block_waiting_available_thread(seq=self._futures, max_workers=self._cpus) _logger.debug("Submitting: %s", func) future = self._exec.submit( _WriteProxy._caller, diff --git a/awswrangler/s3/_write_dataset.py b/awswrangler/s3/_write_dataset.py index b36c6f351..58cc2b9e6 100644 --- a/awswrangler/s3/_write_dataset.py +++ b/awswrangler/s3/_write_dataset.py @@ -3,8 +3,8 @@ import logging from typing import Any, Callable, Dict, List, Optional, Tuple -import boto3 # type: ignore -import pandas as pd # type: ignore +import boto3 +import pandas as pd from awswrangler import exceptions from awswrangler.s3._delete import delete_objects @@ -33,7 +33,14 @@ def _to_partitions( prefix: str = f"{path_root}{subdir}/" if mode == "overwrite_partitions": delete_objects(path=prefix, use_threads=use_threads, boto3_session=boto3_session) - proxy.write(func=func, df=subgroup, path_root=prefix, boto3_session=boto3_session, **func_kwargs) + proxy.write( + func=func, + df=subgroup, + path_root=prefix, + boto3_session=boto3_session, + use_threads=use_threads, + **func_kwargs, + ) partitions_values[prefix] = [str(k) for k in keys] paths: List[str] = proxy.close() # blocking return paths, partitions_values @@ -51,7 +58,7 @@ def _to_dataset( boto3_session: boto3.Session, **func_kwargs: Any, ) -> Tuple[List[str], Dict[str, List[str]]]: - path_root = path_root if path_root[-1] == "/" else f"{path_root}/" + path_root = path_root if path_root.endswith("/") else f"{path_root}/" # Evaluate mode if mode not in ["append", "overwrite", "overwrite_partitions"]: @@ -64,7 +71,9 @@ def _to_dataset( # Writing partitions_values: Dict[str, List[str]] = {} if not partition_cols: - paths: List[str] = func(df=df, path_root=path_root, boto3_session=boto3_session, index=index, **func_kwargs) + paths: List[str] = func( + df=df, path_root=path_root, use_threads=use_threads, boto3_session=boto3_session, index=index, **func_kwargs + ) else: paths, partitions_values = _to_partitions( func=func, diff --git a/awswrangler/s3/_write_parquet.py b/awswrangler/s3/_write_parquet.py index c67de80d8..a801db375 100644 --- a/awswrangler/s3/_write_parquet.py +++ b/awswrangler/s3/_write_parquet.py @@ -3,17 +3,18 @@ import logging import math import uuid -from typing import Any, Dict, List, Optional, Tuple, Union +from contextlib import contextmanager +from typing import Any, Dict, Iterator, List, Optional, Tuple, 
Union -import boto3 # type: ignore -import pandas as pd # type: ignore -import pyarrow as pa # type: ignore -import pyarrow.lib # type: ignore -import pyarrow.parquet # type: ignore -import s3fs # type: ignore +import boto3 +import pandas as pd +import pyarrow as pa +import pyarrow.lib +import pyarrow.parquet from awswrangler import _data_types, _utils, catalog, exceptions from awswrangler._config import apply_configs +from awswrangler.s3._fs import open_s3_object from awswrangler.s3._read_parquet import _read_parquet_metadata from awswrangler.s3._write import _COMPRESSION_2_EXT, _apply_dtype, _sanitize, _validate_args from awswrangler.s3._write_concurrent import _WriteProxy @@ -22,6 +23,23 @@ _logger: logging.Logger = logging.getLogger(__name__) +def _check_schema_changes(columns_types: Dict[str, str], table_input: Optional[Dict[str, Any]], mode: str) -> None: + if (table_input is not None) and (mode in ("append", "overwrite_partitions")): + catalog_cols: Dict[str, str] = {x["Name"]: x["Type"] for x in table_input["StorageDescriptor"]["Columns"]} + for c, t in columns_types.items(): + if c not in catalog_cols: + raise exceptions.InvalidArgumentValue( + f"Schema change detected: New column {c} with type {t}. " + "Please pass schema_evolution=True to allow new columns " + "behaviour." + ) + if t != catalog_cols[c]: # Data type change detected! + raise exceptions.InvalidArgumentValue( + f"Schema change detected: Data type change on column {c} " + f"(Old type: {catalog_cols[c]} / New type {t})." + ) + + def _get_file_path(file_counter: int, file_path: str) -> str: slash_index: int = file_path.rfind("/") dot_index: int = file_path.find(".", slash_index) @@ -33,29 +51,37 @@ def _get_file_path(file_counter: int, file_path: str) -> str: return file_path -def _get_fs( - boto3_session: Optional[boto3.Session], s3_additional_kwargs: Optional[Dict[str, str]] -) -> s3fs.S3FileSystem: - return _utils.get_fs( - s3fs_block_size=33_554_432, # 32 MB (32 * 2**20) - session=boto3_session, - s3_additional_kwargs=s3_additional_kwargs, - ) - - +@contextmanager def _new_writer( - file_path: str, fs: s3fs.S3FileSystem, compression: Optional[str], schema: pa.Schema -) -> pyarrow.parquet.ParquetWriter: - return pyarrow.parquet.ParquetWriter( - where=file_path, - write_statistics=True, - use_dictionary=True, - filesystem=fs, - coerce_timestamps="ms", - compression=compression, - flavor="spark", - schema=schema, - ) + file_path: str, + compression: Optional[str], + schema: pa.Schema, + boto3_session: boto3.Session, + s3_additional_kwargs: Optional[Dict[str, str]], + use_threads: bool, +) -> Iterator[pyarrow.parquet.ParquetWriter]: + writer: Optional[pyarrow.parquet.ParquetWriter] = None + with open_s3_object( + path=file_path, + mode="wb", + use_threads=use_threads, + s3_additional_kwargs=s3_additional_kwargs, + boto3_session=boto3_session, + ) as f: + try: + writer = pyarrow.parquet.ParquetWriter( + where=f, + write_statistics=True, + use_dictionary=True, + coerce_timestamps="ms", + compression=compression, + flavor="spark", + schema=schema, + ) + yield writer + finally: + if writer is not None and writer.is_open is True: + writer.close() def _write_chunk( @@ -66,9 +92,16 @@ def _write_chunk( table: pa.Table, offset: int, chunk_size: int, + use_threads: bool, ) -> List[str]: - fs = _get_fs(boto3_session=boto3_session, s3_additional_kwargs=s3_additional_kwargs) - with _new_writer(file_path=file_path, fs=fs, compression=compression, schema=table.schema) as writer: + with _new_writer( + file_path=file_path, + 
compression=compression, + schema=table.schema, + boto3_session=boto3_session, + s3_additional_kwargs=s3_additional_kwargs, + use_threads=use_threads, + ) as writer: writer.write_table(table.slice(offset, chunk_size)) return [file_path] @@ -98,6 +131,7 @@ def _to_parquet_chunked( table=table, offset=offset, chunk_size=max_rows_by_file, + use_threads=use_threads, ) return proxy.close() # blocking @@ -112,6 +146,7 @@ def _to_parquet( dtype: Dict[str, str], boto3_session: Optional[boto3.Session], s3_additional_kwargs: Optional[Dict[str, str]], + use_threads: bool, path: Optional[str] = None, path_root: Optional[str] = None, max_rows_by_file: Optional[int] = 0, @@ -143,8 +178,14 @@ def _to_parquet( cpus=cpus, ) else: - fs = _get_fs(boto3_session=boto3_session, s3_additional_kwargs=s3_additional_kwargs) - with _new_writer(file_path=file_path, fs=fs, compression=compression, schema=table.schema) as writer: + with _new_writer( + file_path=file_path, + compression=compression, + schema=table.schema, + boto3_session=boto3_session, + s3_additional_kwargs=s3_additional_kwargs, + use_threads=use_threads, + ) as writer: writer.write_table(table) paths = [file_path] return paths @@ -166,6 +207,7 @@ def to_parquet( # pylint: disable=too-many-arguments,too-many-locals concurrent_partitioning: bool = False, mode: Optional[str] = None, catalog_versioning: bool = False, + schema_evolution: bool = True, database: Optional[str] = None, table: Optional[str] = None, dtype: Optional[Dict[str, str]] = None, @@ -221,8 +263,9 @@ def to_parquet( # pylint: disable=too-many-arguments,too-many-locals boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. s3_additional_kwargs: - Forward to s3fs, useful for server side encryption - https://s3fs.readthedocs.io/en/latest/#serverside-encryption + Forward to botocore requests. Valid parameters: "ACL", "Metadata", "ServerSideEncryption", "StorageClass", + "SSECustomerAlgorithm", "SSECustomerKey", "SSEKMSKeyId", "SSEKMSEncryptionContext", "Tagging". + e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN'} sanitize_columns : bool True to sanitize columns names or False to keep it as is. True value is forced if `dataset=True`. @@ -242,6 +285,11 @@ def to_parquet( # pylint: disable=too-many-arguments,too-many-locals https://aws-data-wrangler.readthedocs.io/en/latest/stubs/awswrangler.s3.to_parquet.html#awswrangler.s3.to_parquet catalog_versioning : bool If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. + schema_evolution : bool + If True allows schema evolution (new or missing columns), otherwise a exception will be raised. + (Only considered if dataset=True and mode in ("append", "overwrite_partitions")) + Related tutorial: + https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/014%20-%20Schema%20Evolution.ipynb database : str, optional Glue/Athena catalog: Database name. 
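The `schema_evolution` flag documented above is enforced by the new `_check_schema_changes` helper when appending to a cataloged dataset. A hedged caller-side sketch, mirroring the `test_schema_evolution_disabled` test added later in this patch; bucket, database and table names are placeholders:

```python
# Sketch of schema_evolution=False: appending a DataFrame whose columns no
# longer match the Glue table raises InvalidArgumentValue instead of silently
# evolving the schema. Bucket, database and table names are placeholders.
import pandas as pd

import awswrangler as wr

wr.s3.to_parquet(
    df=pd.DataFrame({"c0": [1]}),
    path="s3://bucket/prefix/",
    dataset=True,
    database="my_db",
    table="my_table",
    schema_evolution=False,
)

try:
    wr.s3.to_parquet(
        df=pd.DataFrame({"c0": [2], "c1": [2]}),  # new column c1
        path="s3://bucket/prefix/",
        dataset=True,
        database="my_db",
        table="my_table",
        schema_evolution=False,
    )
except wr.exceptions.InvalidArgumentValue as ex:
    print(f"Schema change rejected: {ex}")
```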
table : str, optional @@ -398,6 +446,7 @@ def to_parquet( # pylint: disable=too-many-arguments,too-many-locals _validate_args( df=df, table=table, + database=database, dataset=dataset, path=path, partition_cols=partition_cols, @@ -449,8 +498,17 @@ def to_parquet( # pylint: disable=too-many-arguments,too-many-locals s3_additional_kwargs=s3_additional_kwargs, dtype=dtype, max_rows_by_file=max_rows_by_file, + use_threads=use_threads, ) else: + columns_types: Dict[str, str] = {} + partitions_types: Dict[str, str] = {} + if (database is not None) and (table is not None): + columns_types, partitions_types = _data_types.athena_types_from_pandas_partitioned( + df=df, index=index, partition_cols=partition_cols, dtype=dtype + ) + if schema_evolution is False: + _check_schema_changes(columns_types=columns_types, table_input=catalog_table_input, mode=mode) paths, partitions_values = _to_dataset( func=_to_parquet, concurrent_partitioning=concurrent_partitioning, @@ -470,9 +528,6 @@ def to_parquet( # pylint: disable=too-many-arguments,too-many-locals max_rows_by_file=max_rows_by_file, ) if (database is not None) and (table is not None): - columns_types, partitions_types = _data_types.athena_types_from_pandas_partitioned( - df=df, index=index, partition_cols=partition_cols, dtype=dtype - ) catalog._create_parquet_table( # pylint: disable=protected-access database=database, table=table, @@ -631,8 +686,9 @@ def store_parquet_metadata( # pylint: disable=too-many-arguments https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '2'}) s3_additional_kwargs: - Forward to s3fs, useful for server side encryption - https://s3fs.readthedocs.io/en/latest/#serverside-encryption + Forward to botocore requests. Valid parameters: "ACL", "Metadata", "ServerSideEncryption", "StorageClass", + "SSECustomerAlgorithm", "SSECustomerKey", "SSEKMSKeyId", "SSEKMSEncryptionContext", "Tagging". + e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN'} boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. 
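With this patch, `_new_writer` streams Parquet bytes through `open_s3_object` in binary write mode, where writes are buffered in memory and flushed as multipart-upload parts once the buffer crosses `_MIN_WRITE_BLOCK`, or sent as a single `put_object` on close for small payloads. A minimal sketch of that write path using the internal context manager; the S3 path is a placeholder:

```python
# Sketch of the binary write path introduced in this diff: open_s3_object("wb")
# buffers writes and turns them into multipart-upload parts on flush()/close();
# small payloads fall back to a single put_object on close(). Path is a placeholder.
from awswrangler.s3._fs import open_s3_object

payload = b"some bytes to store"

with open_s3_object(
    path="s3://bucket/prefix/object.bin",  # placeholder
    mode="wb",
    use_threads=False,
) as f:
    f.write(payload)  # buffered; uploaded when the context manager closes
```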
diff --git a/awswrangler/s3/_write_text.py b/awswrangler/s3/_write_text.py index 64405ce70..aaa50df95 100644 --- a/awswrangler/s3/_write_text.py +++ b/awswrangler/s3/_write_text.py @@ -5,12 +5,12 @@ import uuid from typing import Any, Dict, List, Optional, Union -import boto3 # type: ignore -import pandas as pd # type: ignore -import s3fs # type: ignore +import boto3 +import pandas as pd from awswrangler import _data_types, _utils, catalog, exceptions from awswrangler._config import apply_configs +from awswrangler.s3._fs import open_s3_object from awswrangler.s3._write import _apply_dtype, _sanitize, _validate_args from awswrangler.s3._write_dataset import _to_dataset @@ -20,6 +20,7 @@ def _to_text( file_format: str, df: pd.DataFrame, + use_threads: bool, boto3_session: Optional[boto3.Session], s3_additional_kwargs: Optional[Dict[str, str]], path: Optional[str] = None, @@ -34,14 +35,17 @@ def _to_text( file_path = path else: raise RuntimeError("path and path_root received at the same time.") - fs: s3fs.S3FileSystem = _utils.get_fs( - s3fs_block_size=33_554_432, - session=boto3_session, - s3_additional_kwargs=s3_additional_kwargs, # 32 MB (32 * 2**20) - ) encoding: Optional[str] = pandas_kwargs.get("encoding", None) newline: Optional[str] = pandas_kwargs.get("line_terminator", None) - with _utils.open_file(fs=fs, path=file_path, mode="w", encoding=encoding, newline=newline) as f: + with open_s3_object( + path=file_path, + mode="w", + use_threads=use_threads, + s3_additional_kwargs=s3_additional_kwargs, + boto3_session=boto3_session, + encoding=encoding, + newline=newline, + ) as f: _logger.debug("pandas_kwargs: %s", pandas_kwargs) if file_format == "csv": df.to_csv(f, **pandas_kwargs) @@ -131,8 +135,9 @@ def to_csv( # pylint: disable=too-many-arguments,too-many-locals boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 Session will be used if boto3_session receive None. s3_additional_kwargs: - Forward to s3fs, useful for server side encryption - https://s3fs.readthedocs.io/en/latest/#serverside-encryption + Forward to botocore requests. Valid parameters: "ACL", "Metadata", "ServerSideEncryption", "StorageClass", + "SSECustomerAlgorithm", "SSECustomerKey", "SSEKMSKeyId", "SSEKMSEncryptionContext", "Tagging". + e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN'} sanitize_columns : bool True to sanitize columns names or False to keep it as is. True value is forced if `dataset=True`. @@ -199,7 +204,9 @@ def to_csv( # pylint: disable=too-many-arguments,too-many-locals The ID of the Data Catalog from which to retrieve Databases. If none is provided, the AWS account ID is used by default. pandas_kwargs : - keyword arguments forwarded to pandas.DataFrame.to_csv() + KEYWORD arguments forwarded to pandas.DataFrame.to_csv(). You can NOT pass `pandas_kwargs` explicit, just add + valid Pandas arguments in the function call and Wrangler will accept it. + e.g. wr.s3.to_csv(df, path, sep='|', na_rep='NULL', decimal=',') https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html Returns @@ -225,6 +232,22 @@ def to_csv( # pylint: disable=too-many-arguments,too-many-locals 'partitions_values': {} } + Writing single file with pandas_kwargs + + >>> import awswrangler as wr + >>> import pandas as pd + >>> wr.s3.to_csv( + ... df=pd.DataFrame({'col': [1, 2, 3]}), + ... path='s3://bucket/prefix/my_file.csv', + ... sep='|', + ... na_rep='NULL', + ... decimal=',' + ... 
) + { + 'paths': ['s3://bucket/prefix/my_file.csv'], + 'partitions_values': {} + } + Writing single file encrypted with a KMS key >>> import awswrangler as wr @@ -308,9 +331,16 @@ def to_csv( # pylint: disable=too-many-arguments,too-many-locals } """ + if "pandas_kwargs" in pandas_kwargs: + raise exceptions.InvalidArgument( + "You can NOT pass `pandas_kwargs` explicit, just add valid " + "Pandas arguments in the function call and Wrangler will accept it." + "e.g. wr.s3.to_csv(df, path, sep='|', na_rep='NULL', decimal=',')" + ) _validate_args( df=df, table=table, + database=database, dataset=dataset, path=path, partition_cols=partition_cols, @@ -346,6 +376,7 @@ def to_csv( # pylint: disable=too-many-arguments,too-many-locals _to_text( file_format="csv", df=df, + use_threads=use_threads, path=path, boto3_session=session, s3_additional_kwargs=s3_additional_kwargs, @@ -413,10 +444,16 @@ def to_json( path: str, boto3_session: Optional[boto3.Session] = None, s3_additional_kwargs: Optional[Dict[str, str]] = None, + use_threads: bool = True, **pandas_kwargs: Any, ) -> None: """Write JSON file on Amazon S3. + Note + ---- + In case of `use_threads=True` the number of threads + that will be spawned will be gotten from os.cpu_count(). + Parameters ---------- df: pandas.DataFrame @@ -426,10 +463,16 @@ def to_json( boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 Session will be used if boto3_session receive None. s3_additional_kwargs: - Forward to s3fs, useful for server side encryption - https://s3fs.readthedocs.io/en/latest/#serverside-encryption + Forward to botocore requests. Valid parameters: "ACL", "Metadata", "ServerSideEncryption", "StorageClass", + "SSECustomerAlgorithm", "SSECustomerKey", "SSEKMSKeyId", "SSEKMSEncryptionContext", "Tagging". + e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN'} + use_threads : bool + True to enable concurrent requests, False to disable multiple threads. + If enabled os.cpu_count() will be used as the max number of threads. pandas_kwargs: - keyword arguments forwarded to pandas.DataFrame.to_csv() + KEYWORD arguments forwarded to pandas.DataFrame.to_json(). You can NOT pass `pandas_kwargs` explicit, just add + valid Pandas arguments in the function call and Wrangler will accept it. + e.g. wr.s3.to_json(df, path, lines=True, date_format='iso') https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html Returns @@ -448,6 +491,17 @@ def to_json( ... path='s3://bucket/filename.json', ... ) + Writing JSON file using pandas_kwargs + + >>> import awswrangler as wr + >>> import pandas as pd + >>> wr.s3.to_json( + ... df=pd.DataFrame({'col': [1, 2, 3]}), + ... path='s3://bucket/filename.json', + ... lines=True, + ... date_format='iso' + ... ) + Writing CSV file encrypted with a KMS key >>> import awswrangler as wr @@ -462,10 +516,17 @@ def to_json( ... ) """ + if "pandas_kwargs" in pandas_kwargs: + raise exceptions.InvalidArgument( + "You can NOT pass `pandas_kwargs` explicit, just add valid " + "Pandas arguments in the function call and Wrangler will accept it." + "e.g. 
wr.s3.to_json(df, path, lines=True, date_format='iso')" + ) _to_text( file_format="json", df=df, path=path, + use_threads=use_threads, boto3_session=boto3_session, s3_additional_kwargs=s3_additional_kwargs, **pandas_kwargs, diff --git a/awswrangler/sts.py b/awswrangler/sts.py index 6203ee204..095abc29b 100644 --- a/awswrangler/sts.py +++ b/awswrangler/sts.py @@ -3,7 +3,7 @@ import logging from typing import Optional, cast -import boto3 # type: ignore +import boto3 from awswrangler import _utils diff --git a/building/lambda/build-lambda-layer.sh b/building/lambda/build-lambda-layer.sh index 2e95bf317..ad8446610 100644 --- a/building/lambda/build-lambda-layer.sh +++ b/building/lambda/build-lambda-layer.sh @@ -14,7 +14,7 @@ export ARROW_HOME=$(pwd)/dist export LD_LIBRARY_PATH=$(pwd)/dist/lib:$LD_LIBRARY_PATH git clone \ - --branch apache-arrow-1.0.0 \ + --branch apache-arrow-1.0.1 \ --single-branch \ https://github.com/apache/arrow.git diff --git a/docs/source/api.rst b/docs/source/api.rst index d01f3b788..e679bc15e 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -56,6 +56,8 @@ AWS Glue Catalog create_parquet_table databases delete_database + delete_partitions + delete_all_partitions delete_table_if_exists does_table_exist drop_duplicated_columns diff --git a/docs/source/what.rst b/docs/source/what.rst index 148b66bd5..50f5aa9e0 100644 --- a/docs/source/what.rst +++ b/docs/source/what.rst @@ -3,6 +3,6 @@ What is AWS Data Wrangler? An `AWS Professional Service `_ `open source `_ python initiative that extends the power of `Pandas `_ library to AWS connecting **DataFrames** and AWS data related services (**Amazon Redshift**, **AWS Glue**, **Amazon Athena**, **Amazon EMR**, **Amazon QuickSight**, etc). -Built on top of other open-source projects like `Pandas `_, `Apache Arrow `_, `Boto3 `_, `s3fs `_, `SQLAlchemy `_, `Psycopg2 `_ and `PyMySQL `_, it offers abstracted functions to execute usual ETL tasks like load/unload data from **Data Lakes**, **Data Warehouses** and **Databases**. +Built on top of other open-source projects like `Pandas `_, `Apache Arrow `_, `Boto3 `_, `SQLAlchemy `_, `Psycopg2 `_ and `PyMySQL `_, it offers abstracted functions to execute usual ETL tasks like load/unload data from **Data Lakes**, **Data Warehouses** and **Databases**. Check our `tutorials `_ or the `list of functionalities `_. 
\ No newline at end of file diff --git a/pytest.ini b/pytest.ini index 7aadc6fda..0186f7b75 100644 --- a/pytest.ini +++ b/pytest.ini @@ -3,6 +3,6 @@ log_cli=False filterwarnings = ignore::DeprecationWarning addopts = - --log-cli-format "[%(asctime)s][%(levelname)s][%(name)s][%(funcName)s] %(message)s" + --log-cli-format "[%(asctime)s][%(levelname)s][%(name)s][%(funcName)s][%(thread)d] %(message)s" --verbose --capture=sys diff --git a/requirements-dev.txt b/requirements-dev.txt index 1881e4a3e..c770ffe54 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,21 +1,23 @@ +wheel==0.35.1 awscli>=1.18.0,<2.0.0 -isort>=4.2.5,<5 -wheel==0.34.2 +isort==5.4.2 black==19.10b0 -pylint==2.5.3 +pylint==2.6.0 flake8==3.8.3 mypy==0.782 -pydocstyle==5.0.2 +pydocstyle==5.1.0 doc8==0.8.1 tox==3.19.0 pytest==6.0.1 -pytest-cov==2.10.0 -pytest-xdist==1.34.0 +pytest-cov==2.10.1 +pytest-xdist==2.1.0 +pytest-timeout==1.4.2 scikit-learn==0.23.2 -cfn-lint==0.34.1 +cfn-lint==0.35.0 cfn-flip==1.2.3 twine==3.2.0 -sphinx==3.2.0 +sphinx==3.2.1 sphinx_bootstrap_theme==0.7.1 moto==1.3.14 -jupyterlab==2.2.4 +jupyterlab==2.2.6 +s3fs==0.4.2 diff --git a/requirements.txt b/requirements.txt index 5a4777d91..27283f297 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,3 @@ pandas>=1.0.0,<1.2.0 SQLAlchemy~=1.3.10 pyarrow~=1.0.0 psycopg2-binary~=2.8.0 -s3fs==0.4.2 diff --git a/setup.cfg b/setup.cfg index 5c9781f44..3706d0108 100644 --- a/setup.cfg +++ b/setup.cfg @@ -6,6 +6,7 @@ license_files = [flake8] max-line-length = 120 +ignore = E203,W503 [isort] multi_line_output=3 @@ -17,3 +18,4 @@ line_length=120 [mypy] python_version = 3.6 strict = True +ignore_missing_imports=True diff --git a/tests/_utils.py b/tests/_utils.py index 072432744..5c6574966 100644 --- a/tests/_utils.py +++ b/tests/_utils.py @@ -574,7 +574,9 @@ def create_workgroup(wkg_name, config): wkgs = [x["Name"] for x in wkgs["WorkGroups"]] deleted = False if wkg_name in wkgs: - wkg = client.get_work_group(WorkGroup=wkg_name)["WorkGroup"] + wkg = try_it(client.get_work_group, botocore.exceptions.ClientError, max_num_tries=5, WorkGroup=wkg_name)[ + "WorkGroup" + ] if validate_workgroup_key(workgroup=wkg) is False: client.delete_work_group(WorkGroup=wkg_name, RecursiveDeleteOption=True) deleted = True diff --git a/tests/test_athena_csv.py b/tests/test_athena_csv.py index dd1287810..99d89354d 100644 --- a/tests/test_athena_csv.py +++ b/tests/test_athena_csv.py @@ -166,6 +166,30 @@ def test_to_csv_modes(glue_database, glue_table, path, use_threads, concurrent_p assert comments["c1"] == "one" +@pytest.mark.parametrize("use_threads", [True, False]) +def test_csv_overwrite_several_partitions(path, glue_database, glue_table, use_threads): + df0 = pd.DataFrame({"id": list(range(27)), "par": list(range(27))}) + df1 = pd.DataFrame({"id": list(range(26)), "par": list(range(26))}) + for df in (df0, df1): + paths = wr.s3.to_csv( + df=df, + path=path, + index=False, + use_threads=use_threads, + dataset=True, + partition_cols=["par"], + mode="overwrite", + table=glue_table, + database=glue_database, + concurrent_partitioning=True, + )["paths"] + wr.s3.wait_objects_exist(paths=paths, use_threads=use_threads) + df2 = wr.athena.read_sql_table(glue_table, glue_database, use_threads=use_threads) + assert df2.shape == df.shape + assert df2["id"].sum() == df["id"].sum() + assert df2["par"].sum() == df["par"].sum() + + def test_csv_dataset(path, glue_database): with pytest.raises(wr.exceptions.UndetectedType): wr.s3.to_csv(pd.DataFrame({"A": [None]}), path, 
dataset=True, database=glue_database, table="test_csv_dataset") diff --git a/tests/test_athena_parquet.py b/tests/test_athena_parquet.py index 6e142fb1b..d5af8a66d 100644 --- a/tests/test_athena_parquet.py +++ b/tests/test_athena_parquet.py @@ -2,6 +2,7 @@ import logging import math +import numpy as np import pandas as pd import pytest @@ -134,6 +135,29 @@ def test_parquet_catalog_casting(path, glue_database): assert wr.catalog.delete_table_if_exists(database=glue_database, table="__test_parquet_catalog_casting") is True +def test_parquet_catalog_casting_to_string_with_null(path, glue_table, glue_database): + data = [{"A": "foo"}, {"A": "boo", "B": "bar"}] + df = pd.DataFrame(data) + paths = wr.s3.to_parquet( + df, path, dataset=True, database=glue_database, table=glue_table, dtype={"A": "string", "B": "string"} + )["paths"] + wr.s3.wait_objects_exist(paths=paths) + df = wr.s3.read_parquet(path=path) + assert df.shape == (2, 2) + for dtype in df.dtypes.values: + assert str(dtype) == "string" + assert pd.isna(df[df["a"] == "foo"].b.iloc[0]) + df = wr.athena.read_sql_table(table=glue_table, database=glue_database, ctas_approach=True) + assert df.shape == (2, 2) + for dtype in df.dtypes.values: + assert str(dtype) == "string" + assert pd.isna(df[df["a"] == "foo"].b.iloc[0]) + df = wr.athena.read_sql_query( + f"SELECT count(*) as counter FROM {glue_table} WHERE b is NULL ", database=glue_database + ) + assert df.counter.iloc[0] == 1 + + @pytest.mark.parametrize("compression", [None, "gzip", "snappy"]) def test_parquet_compress(path, glue_table, glue_database, compression): paths = wr.s3.to_parquet( @@ -482,3 +506,124 @@ def test_sanitize_index(path, glue_table, glue_database): assert df2.shape == (4, 2) assert df2.id.sum() == 6 assert list(df2.columns) == ["id", "date"] + + +def test_to_parquet_sanitize(path, glue_database): + df = pd.DataFrame({"C0": [0, 1], "camelCase": [2, 3], "c**--2": [4, 5]}) + table_name = "TableName*!" 
+ paths = wr.s3.to_parquet( + df, path, dataset=True, database=glue_database, table=table_name, mode="overwrite", partition_cols=["c**--2"] + )["paths"] + wr.s3.wait_objects_exist(paths) + df2 = wr.athena.read_sql_table(database=glue_database, table=table_name) + assert df.shape == df2.shape + assert list(df2.columns) == ["c0", "camel_case", "c_2"] + assert df2.c0.sum() == 1 + assert df2.camel_case.sum() == 5 + assert df2.c_2.sum() == 9 + + +def test_schema_evolution_disabled(path, glue_table, glue_database): + wr.s3.to_parquet( + df=pd.DataFrame({"c0": [1]}), + path=path, + dataset=True, + database=glue_database, + table=glue_table, + schema_evolution=False, + ) + with pytest.raises(wr.exceptions.InvalidArgumentValue): + wr.s3.to_parquet( + df=pd.DataFrame({"c0": [2], "c1": [2]}), + path=path, + dataset=True, + database=glue_database, + table=glue_table, + schema_evolution=False, + ) + paths = wr.s3.to_parquet( + df=pd.DataFrame({"c0": [2]}), + path=path, + dataset=True, + database=glue_database, + table=glue_table, + schema_evolution=False, + )["paths"] + wr.s3.wait_objects_exist(paths) + df2 = wr.athena.read_sql_table(database=glue_database, table=glue_table) + assert df2.shape == (2, 1) + assert df2.c0.sum() == 3 + + +def test_date_cast(path, glue_table, glue_database): + df = pd.DataFrame( + { + "c0": [ + datetime.date(4000, 1, 1), + datetime.datetime(2000, 1, 1, 10), + "2020", + "2020-01", + 1, + None, + pd.NA, + pd.NaT, + np.nan, + np.inf, + ] + } + ) + df_expected = pd.DataFrame( + { + "c0": [ + datetime.date(4000, 1, 1), + datetime.date(2000, 1, 1), + datetime.date(2020, 1, 1), + datetime.date(2020, 1, 1), + datetime.date(1970, 1, 1), + None, + None, + None, + None, + None, + ] + } + ) + paths = wr.s3.to_parquet( + df=df, path=path, dataset=True, database=glue_database, table=glue_table, dtype={"c0": "date"} + )["paths"] + wr.s3.wait_objects_exist(paths) + df2 = wr.s3.read_parquet(path=path) + assert df_expected.equals(df2) + df3 = wr.athena.read_sql_table(database=glue_database, table=glue_table) + assert df_expected.equals(df3) + + +@pytest.mark.parametrize("use_threads", [True, False]) +@pytest.mark.parametrize("partition_cols", [None, ["par0"], ["par0", "par1"]]) +def test_partitions_overwrite(path, glue_table, glue_database, use_threads, partition_cols): + df = get_df_list() + wr.s3.to_parquet( + df=df, + path=path, + dataset=True, + database=glue_database, + table=glue_table, + use_threads=use_threads, + partition_cols=partition_cols, + mode="overwrite_partitions", + ) + paths = wr.s3.to_parquet( + df=df, + path=path, + dataset=True, + database=glue_database, + table=glue_table, + use_threads=use_threads, + partition_cols=partition_cols, + mode="overwrite_partitions", + )["paths"] + wr.s3.wait_objects_exist(paths, use_threads=use_threads) + df2 = wr.athena.read_sql_table(database=glue_database, table=glue_table, use_threads=use_threads) + ensure_data_types(df2, has_list=True) + assert df2.shape == (3, 19) + assert df.iint8.sum() == df2.iint8.sum() diff --git a/tests/test_config.py b/tests/test_config.py index cfc31e864..2e0444f1d 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,14 +1,10 @@ -import copy import logging import os -from unittest.mock import patch -import boto3 -import pandas as pd import pytest -import s3fs import awswrangler as wr +from awswrangler.s3._fs import open_s3_object logging.getLogger("awswrangler").setLevel(logging.DEBUG) @@ -27,28 +23,12 @@ def test_basics(path, glue_database, glue_table): wr.catalog.create_parquet_table(**args) # 
Testing configured s3 block size - size = 5 * 2 ** 20 # 5 MB - wr.config.s3fs_block_size = size - df = pd.DataFrame({"id": [1, 2, 3]}) - file_path = path + "0.csv" - args = dict( - anon=False, - config_kwargs={"retries": {"max_attempts": 15}}, - default_block_size=size, - default_cache_type="readahead", - default_fill_cache=False, - s3_additional_kwargs=None, - skip_instance_cache=True, - use_listings_cache=False, - use_ssl=True, - ) - with patch( - "s3fs.S3FileSystem", - return_value=s3fs.S3FileSystem(session=boto3.DEFAULT_SESSION._session, **copy.deepcopy(args)), - ) as mock: - wr.s3.to_csv(df, file_path, index=False) - mock.assert_called_with(session=boto3.DEFAULT_SESSION._session, **args) - wr.s3.read_csv([file_path]) + size = 1 * 2 ** 20 # 1 MB + wr.config.s3_block_size = size + with open_s3_object(path, mode="wb") as s3obj: + s3obj.write(b"foo") + with open_s3_object(path, mode="rb") as s3obj: + assert s3obj._s3_block_size == size # Resetting all configs wr.config.reset() diff --git a/tests/test_fs.py b/tests/test_fs.py new file mode 100644 index 000000000..e9d990154 --- /dev/null +++ b/tests/test_fs.py @@ -0,0 +1,188 @@ +import logging + +import boto3 +import pytest + +import awswrangler as wr +from awswrangler.s3._fs import open_s3_object + +from ._utils import ensure_data_types, get_df_list + +logging.getLogger("awswrangler").setLevel(logging.DEBUG) + + +@pytest.mark.parametrize("use_threads", [True, False]) +def test_io_intense(path, use_threads): + path = f"{path}0.txt" + data = b"0" * 10_000_000 + b"1" * 10_000_000 + b"2" * 10_000_000 + + with open_s3_object(path, mode="wb", use_threads=use_threads) as s3obj: + s3obj.write(data) + + with open_s3_object(path, mode="rb", use_threads=use_threads) as s3obj: + assert s3obj.read() == data + + bucket, key = wr._utils.parse_path(path) + assert boto3.client("s3").get_object(Bucket=bucket, Key=key)["Body"].read() == data + + +@pytest.mark.parametrize("use_threads", [True, False]) +@pytest.mark.parametrize("mode", ["r", "rb"]) +def test_read_full(path, mode, use_threads): + client_s3 = boto3.client("s3") + path = f"{path}0.txt" + bucket, key = wr._utils.parse_path(path) + text = "AHDG*AWY&GD*A&WGd*AWgd87AGWD*GA*G*g*AGˆˆ&ÂDTW&ˆˆD&ÂTW7ˆˆTAWˆˆDAW&ˆˆAWGDIUHWOD#N" + client_s3.put_object(Body=text, Bucket=bucket, Key=key) + with open_s3_object(path, mode=mode, s3_block_size=100, newline="\n", use_threads=use_threads) as s3obj: + if mode == "r": + assert s3obj.read() == text + else: + assert s3obj.read() == text.encode("utf-8") + if "b" in mode: + assert s3obj._cache == b"" + + +@pytest.mark.parametrize("use_threads", [True, False]) +@pytest.mark.parametrize("mode", ["r", "rb"]) +@pytest.mark.parametrize("block_size", [100, 2]) +def test_read_chunked(path, mode, block_size, use_threads): + client_s3 = boto3.client("s3") + path = f"{path}0.txt" + bucket, key = wr._utils.parse_path(path) + text = "0123456789" + client_s3.put_object(Body=text, Bucket=bucket, Key=key) + with open_s3_object(path, mode=mode, s3_block_size=block_size, newline="\n", use_threads=use_threads) as s3obj: + if mode == "r": + for i in range(3): + assert s3obj.read(1) == text[i] + else: + for i in range(3): + assert s3obj.read(1) == text[i].encode("utf-8") + if "b" in mode: + assert len(s3obj._cache) <= block_size + if "b" in mode: + assert s3obj._cache == b"" + + +@pytest.mark.parametrize("use_threads", [True, False]) +@pytest.mark.parametrize("mode", ["r", "rb"]) +@pytest.mark.parametrize("block_size", [2, 3, 10, 23, 48, 65, 100]) +def test_read_line(path, mode, block_size, 
use_threads): + client_s3 = boto3.client("s3") + path = f"{path}0.txt" + bucket, key = wr._utils.parse_path(path) + text = "0\n11\n22222\n33333333333333\n44444444444444444444444444444444444444444444\n55555" + expected = ["0\n", "11\n", "22222\n", "33333333333333\n", "44444444444444444444444444444444444444444444\n", "55555"] + client_s3.put_object(Body=text, Bucket=bucket, Key=key) + with open_s3_object(path, mode=mode, s3_block_size=block_size, newline="\n", use_threads=use_threads) as s3obj: + for i, line in enumerate(s3obj): + if mode == "r": + assert line == expected[i] + else: + assert line == expected[i].encode("utf-8") + s3obj.seek(0) + lines = s3obj.readlines() + print(lines) + if mode == "r": + assert lines == expected + else: + assert [line.decode("utf-8") for line in lines] == expected + if "b" in mode: + assert s3obj._cache == b"" + + +@pytest.mark.parametrize("use_threads", [True, False]) +@pytest.mark.parametrize("mode", ["wb", "w"]) +def test_write_full(path, mode, use_threads): + client_s3 = boto3.client("s3") + path = f"{path}0.txt" + bucket, key = wr._utils.parse_path(path) + text = "ajdaebdiebdkibaekdbekfbksbfksebkfjebkfjbekjfbkjebfkebwkfbewkjfbkjwebf" + with open_s3_object(path, mode=mode, newline="\n", use_threads=use_threads) as s3obj: + if mode == "wb": + s3obj.write(text.encode("utf-8")) + else: + s3obj.write(text) + assert client_s3.get_object(Bucket=bucket, Key=key)["Body"].read() == text.encode("utf-8") + + +@pytest.mark.parametrize("use_threads", [True, False]) +@pytest.mark.parametrize("mode", ["wb", "w"]) +@pytest.mark.parametrize("data_size", [6_000_000, 10_000_000, 12_000_000]) +def test_write_chunked(path, mode, data_size, use_threads): + client_s3 = boto3.client("s3") + path = f"{path}0.txt" + bucket, key = wr._utils.parse_path(path) + chunks = ["a", "jdae", "bdiebdkibaekdbekfbksbfk", "sebkf", "jebkfjbekjfbkjebfkebwkfbe", "f", "0" * data_size] + expected = b"ajdaebdiebdkibaekdbekfbksbfksebkfjebkfjbekjfbkjebfkebwkfbef" + (b"0" * data_size) + with open_s3_object(path, mode=mode, newline="\n", use_threads=use_threads) as s3obj: + for chunk in chunks: + if mode == "wb": + s3obj.write(chunk.encode("utf-8")) + else: + s3obj.write(chunk) + assert client_s3.get_object(Bucket=bucket, Key=key)["Body"].read() == expected + + +@pytest.mark.parametrize("use_threads", [True, False]) +@pytest.mark.parametrize( + "s3_additional_kwargs", + [None, {"ServerSideEncryption": "AES256"}, {"ServerSideEncryption": "aws:kms", "SSEKMSKeyId": None}], +) +def test_additional_kwargs(path, kms_key_id, s3_additional_kwargs, use_threads): + if s3_additional_kwargs is not None and "SSEKMSKeyId" in s3_additional_kwargs: + s3_additional_kwargs["SSEKMSKeyId"] = kms_key_id + path = f"{path}0.txt" + with open_s3_object(path, mode="w", s3_additional_kwargs=s3_additional_kwargs, use_threads=use_threads) as s3obj: + s3obj.write("foo") + with open_s3_object( + path, mode="r", s3_block_size=10_000_000, s3_additional_kwargs=s3_additional_kwargs, use_threads=use_threads, + ) as s3obj: + assert s3obj.read() == "foo" + desc = wr.s3.describe_objects([path])[path] + if s3_additional_kwargs is None: + assert desc.get("ServerSideEncryption") is None + elif s3_additional_kwargs["ServerSideEncryption"] == "aws:kms": + assert desc.get("ServerSideEncryption") == "aws:kms" + elif s3_additional_kwargs["ServerSideEncryption"] == "AES256": + assert desc.get("ServerSideEncryption") == "AES256" + + +def test_pyarrow(path, glue_table, glue_database): + df = get_df_list() + paths = wr.s3.to_parquet(df, path, 
dataset=True, database=glue_database, table=glue_table)["paths"] + wr.s3.wait_objects_exist(paths) + df2 = wr.athena.read_sql_table(database=glue_database, table=glue_table) + ensure_data_types(df2, has_list=True) + assert df2.shape == (3, 19) + assert df.iint8.sum() == df2.iint8.sum() + + +@pytest.mark.parametrize("use_threads", [True, False]) +@pytest.mark.parametrize("block_size", [2, 3, 5, 8, 9, 15]) +@pytest.mark.parametrize("text", ["012345678", "0123456789"]) +def test_cache(path, use_threads, block_size, text): + client_s3 = boto3.client("s3") + path = f"{path}0.txt" + bucket, key = wr._utils.parse_path(path) + client_s3.put_object(Body=text, Bucket=bucket, Key=key) + with open_s3_object(path, mode="rb", s3_block_size=block_size, use_threads=use_threads) as s3obj: + for i in range(len(text)): + value = s3obj.read(1) + print(value) + assert value == text[i].encode("utf-8") + assert len(s3obj._cache) in (block_size, block_size - 1, len(text)) + assert s3obj._cache == b"" + + +def test_cache_seek(path): + client_s3 = boto3.client("s3") + path = f"{path}0.txt" + bucket, key = wr._utils.parse_path(path) + text = "0" * 1_000_000 + "1" * 4 + client_s3.put_object(Body=text, Bucket=bucket, Key=key) + with open_s3_object(path, mode="rb", s3_block_size=1_000) as s3obj: + s3obj.seek(1_000_000) + assert s3obj.read(100).decode("utf-8") == "1" * 4 + assert s3obj._cache == b"" diff --git a/tests/test_metadata.py b/tests/test_metadata.py index be7685063..70bc315ee 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -2,7 +2,7 @@ def test_metadata(): - assert wr.__version__ == "1.8.1" + assert wr.__version__ == "1.9.0" assert wr.__title__ == "awswrangler" assert wr.__description__ == "Pandas on AWS." assert wr.__license__ == "Apache License 2.0" diff --git a/tests/test_moto.py b/tests/test_moto.py index 523cf34b5..e42824d7a 100644 --- a/tests/test_moto.py +++ b/tests/test_moto.py @@ -244,15 +244,13 @@ def test_read_csv_with_chucksize_and_pandas_arguments(moto_s3): @mock.patch("pandas.read_csv") -@mock.patch("s3fs.S3FileSystem.open") -def test_read_csv_pass_pandas_arguments_and_encoding_succeed(mock_open, mock_read_csv, moto_s3): +def test_read_csv_pass_pandas_arguments_and_encoding_succeed(mock_read_csv, moto_s3): bucket = "bucket" key = "foo/foo.csv" path = "s3://{}/{}".format(bucket, key) s3_object = moto_s3.Object(bucket, key) s3_object.put(Body=b"foo") wr.s3.read_csv(path=path, encoding="ISO-8859-1", sep=",", lineterminator="\r\n") - mock_open.assert_called_with(path="s3://bucket/foo/foo.csv", mode="r", encoding="ISO-8859-1", newline="\r\n") mock_read_csv.assert_called_with(ANY, compression=None, encoding="ISO-8859-1", sep=",", lineterminator="\r\n") diff --git a/tests/test_s3_parquet.py b/tests/test_s3_parquet.py index be6e8940c..3e1c7cbc7 100644 --- a/tests/test_s3_parquet.py +++ b/tests/test_s3_parquet.py @@ -1,8 +1,10 @@ import itertools import logging import math +from datetime import datetime import boto3 +import numpy as np import pandas as pd import pytest @@ -225,3 +227,99 @@ def test_parquet_with_size(path, use_threads, max_rows_by_file): ensure_data_types(df2, has_list=True) assert df2.shape == (300, 19) assert df.iint8.sum() == df2.iint8.sum() + + +@pytest.mark.parametrize("use_threads", [True, False]) +def test_index_and_timezone(path, use_threads): + df = pd.DataFrame({"c0": [datetime.utcnow(), datetime.utcnow()], "par": ["a", "b"]}, index=["foo", "boo"]) + df["c1"] = pd.DatetimeIndex(df.c0).tz_localize(tz="US/Eastern") + paths = wr.s3.to_parquet(df, path, 
index=True, use_threads=use_threads, dataset=True, partition_cols=["par"])[ + "paths" + ] + wr.s3.wait_objects_exist(paths=paths, use_threads=use_threads) + df2 = wr.s3.read_parquet(path, use_threads=use_threads, dataset=True) + assert df[["c0", "c1"]].equals(df2[["c0", "c1"]]) + + +@pytest.mark.parametrize("use_threads", [True, False]) +def test_index_recovery_simple_int(path, use_threads): + df = pd.DataFrame({"c0": np.arange(10, 1_010, 1)}, dtype="Int64") + paths = wr.s3.to_parquet(df, path, index=True, use_threads=use_threads, dataset=True, max_rows_by_file=300)["paths"] + assert len(paths) == 4 + wr.s3.wait_objects_exist(paths=paths, use_threads=use_threads) + df2 = wr.s3.read_parquet(f"{path}*.parquet", use_threads=use_threads) + assert df.equals(df2) + + +@pytest.mark.parametrize("use_threads", [True, False]) +def test_index_recovery_simple_str(path, use_threads): + df = pd.DataFrame({"c0": [0, 1, 2, 3, 4]}, index=["a", "b", "c", "d", "e"], dtype="Int64") + paths = wr.s3.to_parquet(df, path, index=True, use_threads=use_threads, dataset=True, max_rows_by_file=1)["paths"] + assert len(paths) == 5 + wr.s3.wait_objects_exist(paths=paths, use_threads=use_threads) + df2 = wr.s3.read_parquet(f"{path}*.parquet", use_threads=use_threads) + assert df.equals(df2) + + +@pytest.mark.parametrize("use_threads", [True, False]) +def test_index_recovery_partitioned_str(path, use_threads): + df = pd.DataFrame( + {"c0": [0, 1, 2, 3, 4], "par": ["foo", "boo", "bar", "foo", "boo"]}, index=["a", "b", "c", "d", "e"] + ) + df["c0"] = df["c0"].astype("Int64") + df["par"] = df["c0"].astype("category") + paths = wr.s3.to_parquet( + df, path, index=True, use_threads=use_threads, dataset=True, partition_cols=["par"], max_rows_by_file=1 + )["paths"] + assert len(paths) == 5 + wr.s3.wait_objects_exist(paths=paths, use_threads=use_threads) + df2 = wr.s3.read_parquet(f"{path}*.parquet", use_threads=use_threads, dataset=True) + assert df.shape == df2.shape + assert df.c0.equals(df2.c0) + assert df.dtypes.equals(df2.dtypes) + assert df.index.equals(df2.index) + + +@pytest.mark.parametrize("use_threads", [True, False]) +def test_range_index_recovery_simple(path, use_threads): + df = pd.DataFrame({"c0": np.arange(10, 15, 1)}, dtype="Int64", index=pd.RangeIndex(start=5, stop=30, step=5)) + paths = wr.s3.to_parquet(df, path, index=True, use_threads=use_threads, dataset=True, max_rows_by_file=3)["paths"] + assert len(paths) == 2 + wr.s3.wait_objects_exist(paths=paths, use_threads=use_threads) + df2 = wr.s3.read_parquet(f"{path}*.parquet", use_threads=use_threads) + assert df.reset_index(level=0).equals(df2.reset_index(level=0)) + + +@pytest.mark.parametrize("use_threads", [True, False]) +@pytest.mark.parametrize("name", [None, "foo"]) +def test_range_index_recovery_pandas(path, use_threads, name): + df = pd.DataFrame({"c0": np.arange(10, 15, 1)}, dtype="Int64", index=pd.RangeIndex(start=5, stop=30, step=5)) + df.index.name = name + path_file = f"{path}0.parquet" + df.to_parquet(path_file) + wr.s3.wait_objects_exist(paths=[path_file], use_threads=use_threads) + df2 = wr.s3.read_parquet([path_file], use_threads=use_threads) + assert df.reset_index(level=0).equals(df2.reset_index(level=0)) + + +@pytest.mark.parametrize("use_threads", [True, False]) +def test_multi_index_recovery_simple(path, use_threads): + df = pd.DataFrame({"c0": [0, 1, 2], "c1": ["a", "b", "c"], "c2": [True, False, True], "c3": [0, 1, 2]}) + df["c3"] = df["c3"].astype("Int64") + df = df.set_index(["c0", "c1", "c2"]) + paths = wr.s3.to_parquet(df, path, 
index=True, use_threads=use_threads, dataset=True, max_rows_by_file=1)["paths"] + assert len(paths) == 3 + wr.s3.wait_objects_exist(paths=paths, use_threads=use_threads) + df2 = wr.s3.read_parquet(f"{path}*.parquet", use_threads=use_threads) + assert df.reset_index().equals(df2.reset_index()) + + +@pytest.mark.parametrize("use_threads", [True, False]) +def test_multi_index_recovery_nameless(path, use_threads): + df = pd.DataFrame({"c0": np.arange(10, 13, 1)}, dtype="Int64") + df = df.set_index([[1, 2, 3], [1, 2, 3]]) + paths = wr.s3.to_parquet(df, path, index=True, use_threads=use_threads, dataset=True, max_rows_by_file=1)["paths"] + assert len(paths) == 3 + wr.s3.wait_objects_exist(paths=paths, use_threads=use_threads) + df2 = wr.s3.read_parquet(f"{path}*.parquet", use_threads=use_threads) + assert df.reset_index().equals(df2.reset_index()) diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 000000000..1549b5c32 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,29 @@ +import logging + +import pytest + +from awswrangler._utils import get_even_chunks_sizes + +logging.getLogger("awswrangler").setLevel(logging.DEBUG) + + +@pytest.mark.parametrize( + "total_size,chunk_size,upper_bound,result", + [ + (10, 4, True, (4, 3, 3)), + (2, 3, True, (2,)), + (1, 1, True, (1,)), + (2, 1, True, (1, 1)), + (11, 4, True, (4, 4, 3)), + (1_001, 500, True, (334, 334, 333)), + (1_002, 500, True, (334, 334, 334)), + (10, 4, False, (5, 5)), + (1, 1, False, (1,)), + (2, 1, False, (1, 1)), + (11, 4, False, (6, 5)), + (1_001, 500, False, (501, 500)), + (1_002, 500, False, (501, 501)), + ], +) +def test_get_even_chunks_sizes(total_size, chunk_size, upper_bound, result): + assert get_even_chunks_sizes(total_size, chunk_size, upper_bound) == result diff --git a/tox.ini b/tox.ini index 0c5ce3022..dc81582be 100644 --- a/tox.ini +++ b/tox.ini @@ -5,15 +5,17 @@ envlist = py{36,37,38} passenv = AWS_PROFILE AWS_DEFAULT_REGION AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY deps = pytest==6.0.1 - pytest-xdist==1.34.0 + pytest-xdist==2.1.0 + pytest-timeout==1.4.2 moto==1.3.14 + s3fs==0.4.2 commands = - pytest -n 32 tests + pytest -n 16 --timeout=300 tests [testenv:py38] passenv = AWS_PROFILE AWS_DEFAULT_REGION AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY deps = {[testenv]deps} - pytest-cov==2.10.0 + pytest-cov==2.10.1 commands = - pytest -n 32 --cov=awswrangler tests + pytest -n 16 --timeout=300 --cov=awswrangler tests diff --git a/tutorials/001 - Introduction.ipynb b/tutorials/001 - Introduction.ipynb index deb93139c..b6b1dea38 100644 --- a/tutorials/001 - Introduction.ipynb +++ b/tutorials/001 - Introduction.ipynb @@ -17,7 +17,7 @@ "\n", "An [open-source](https://github.com/awslabs/aws-data-wrangler>) Python package that extends the power of [Pandas](https://github.com/pandas-dev/pandas>) library to AWS connecting **DataFrames** and AWS data related services (**Amazon Redshift**, **AWS Glue**, **Amazon Athena**, **Amazon EMR**, etc).\n", "\n", - "Built on top of other open-source projects like [Pandas](https://github.com/pandas-dev/pandas), [Apache Arrow](https://github.com/apache/arrow), [Boto3](https://github.com/boto/boto3), [s3fs](https://github.com/dask/s3fs), [SQLAlchemy](https://github.com/sqlalchemy/sqlalchemy), [Psycopg2](https://github.com/psycopg/psycopg2) and [PyMySQL](https://github.com/PyMySQL/PyMySQL), it offers abstracted functions to execute usual ETL tasks like load/unload data from **Data Lakes**, **Data Warehouses** and **Databases**.\n", + "Built on top of other open-source projects 
like [Pandas](https://github.com/pandas-dev/pandas), [Apache Arrow](https://github.com/apache/arrow), [Boto3](https://github.com/boto/boto3), [SQLAlchemy](https://github.com/sqlalchemy/sqlalchemy), [Psycopg2](https://github.com/psycopg/psycopg2) and [PyMySQL](https://github.com/PyMySQL/PyMySQL), it offers abstracted functions to execute usual ETL tasks like load/unload data from **Data Lakes**, **Data Warehouses** and **Databases**.\n", "\n", "Check our [list of functionalities](https://aws-data-wrangler.readthedocs.io/en/latest/api.html)." ] @@ -33,7 +33,8 @@ " - [PyPi (pip)](https://aws-data-wrangler.readthedocs.io/en/latest/install.html#pypi-pip)\n", " - [Conda](https://aws-data-wrangler.readthedocs.io/en/latest/install.html#conda)\n", " - [AWS Lambda Layer](https://aws-data-wrangler.readthedocs.io/en/latest/install.html#aws-lambda-layer)\n", - " - [AWS Glue Wheel](https://aws-data-wrangler.readthedocs.io/en/latest/install.html#aws-glue-wheel)\n", + " - [AWS Glue Python Shell Jobs](https://aws-data-wrangler.readthedocs.io/en/latest/install.html#aws-glue-python-shell-jobs)\n", + " - [AWS Glue PySpark Jobs](https://aws-data-wrangler.readthedocs.io/en/latest/install.html#aws-glue-pyspark-jobs)\n", " - [Amazon SageMaker Notebook](https://aws-data-wrangler.readthedocs.io/en/latest/install.html#amazon-sagemaker-notebook)\n", " - [Amazon SageMaker Notebook Lifecycle](https://aws-data-wrangler.readthedocs.io/en/latest/install.html#amazon-sagemaker-notebook-lifecycle)\n", " - [EMR Cluster](https://aws-data-wrangler.readthedocs.io/en/latest/install.html#emr-cluster)\n", @@ -69,16 +70,16 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'1.7.0'" + "'1.9.0'" ] }, - "execution_count": 1, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } diff --git a/tutorials/002 - Sessions.ipynb b/tutorials/002 - Sessions.ipynb index 4147958d5..baca14e2a 100644 --- a/tutorials/002 - Sessions.ipynb +++ b/tutorials/002 - Sessions.ipynb @@ -17,7 +17,7 @@ "\n", "After version 1.0.0 Wrangler absolutely relies on [Boto3.Session()](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html) to manage AWS credentials and configurations.\n", "\n", - "Wrangler will not store any kind of state internally, and users is in charge of all the Sessions management, if necessary.\n", + "Wrangler will not store any kind of state internally. Users are in charge of managing Sessions.\n", "\n", "Most Wrangler functions receive the optional `boto3_session` argument. If None is received, the default boto3 Session will be used." 
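A minimal sketch of the `boto3_session` argument described in the tutorial cell above, assuming a hypothetical AWS profile and bucket (both placeholders, not part of this change set):

    import boto3
    import awswrangler as wr

    # Custom session; the region and profile names are illustrative only.
    my_session = boto3.Session(region_name="us-east-2", profile_name="my-profile")

    # Most Wrangler functions accept the optional boto3_session argument;
    # when it is omitted, the default Boto3 session is used instead.
    wr.s3.does_object_exist("s3://my-bucket/key.txt", boto3_session=my_session)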
] @@ -36,7 +36,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Using the default Session" + "## Using the default Boto3 Session" ] }, { @@ -63,7 +63,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Customizing and using the default Session" + "## Customizing and using the default Boto3 Session" ] }, { @@ -92,7 +92,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Using a new custom Session" + "## Using a new custom Boto3 Session" ] }, { diff --git a/tutorials/003 - Amazon S3.ipynb b/tutorials/003 - Amazon S3.ipynb index a3060af43..98e7007f1 100644 --- a/tutorials/003 - Amazon S3.ipynb +++ b/tutorials/003 - Amazon S3.ipynb @@ -1180,8 +1180,8 @@ "metadata": {}, "outputs": [], "source": [ - "begin = datetime.strptime(\"05/06/20 16:30\", \"%d/%m/%y %H:%M\")\n", - "end = datetime.strptime(\"15/06/21 16:30\", \"%d/%m/%y %H:%M\")\n", + "begin = datetime.strptime(\"20-07-31 20:30\", \"%y-%m-%d %H:%M\")\n", + "end = datetime.strptime(\"21-07-31 20:30\", \"%y-%m-%d %H:%M\")\n", "\n", "begin_utc = pytz.utc.localize(begin)\n", "end_utc = pytz.utc.localize(end)" @@ -1200,8 +1200,8 @@ "metadata": {}, "outputs": [], "source": [ - "begin = datetime.strptime(\"05/06/20 16:30\", \"%d/%m/%y %H:%M\")\n", - "end = datetime.strptime(\"10/06/21 16:30\", \"%d/%m/%y %H:%M\")\n", + "begin = datetime.strptime(\"20-07-31 20:30\", \"%y-%m-%d %H:%M\")\n", + "end = datetime.strptime(\"21-07-31 20:30\", \"%y-%m-%d %H:%M\")\n", "\n", "timezone = pytz.timezone(\"America/Los_Angeles\")\n", "\n", @@ -1209,189 +1209,23 @@ "end_Los_Angeles = timezone.localize(end)" ] }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2020-06-05 16:30:00+00:00\n", - "2021-06-15 16:30:00+00:00\n", - "2020-06-05 16:30:00-07:00\n", - "2021-06-10 16:30:00-07:00\n" - ] - } - ], - "source": [ - "print(begin_utc)\n", - "print(end_utc)\n", - "print(begin_Los_Angeles)\n", - "print(end_Los_Angeles)" - ] - }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### 5.3 Read json with no LastModified filter " + "### 5.3 Read json using the LastModified filters " ] }, { "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "# read_fwf\n", - " id name date\n", - "0 1 Herfelingen 27-12-18\n", - "1 2 Lambusart 14-06-18\n", - "2 3 Spormaggiore 15-04-18\n", - "3 4 Buizingen 05-09-19\n", - "4 5 San Rafael 04-09-19\n", - "\n", - " read_json\n", - " id name\n", - "0 1 foo\n", - "1 2 boo\n", - "0 3 bar\n", - "\n", - " read_csv\n", - " id name\n", - "0 1 foo\n", - "1 2 boo\n", - "2 3 bar\n", - "\n", - " read_parquet\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idname
01foo
12boo
23bar
\n", - "
" - ], - "text/plain": [ - " id name\n", - "0 1 foo\n", - "1 2 boo\n", - "2 3 bar" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "print('# read_fwf')\n", - "print(wr.s3.read_fwf(f\"s3://{bucket}/fwf/\", names=[\"id\", \"name\", \"date\"]))\n", - "print('\\n read_json')\n", - "print(wr.s3.read_json(f\"s3://{bucket}/json/\"))\n", - "print('\\n read_csv')\n", - "print(wr.s3.read_csv(f\"s3://{bucket}/csv/\"))\n", - "print('\\n read_parquet')\n", - "wr.s3.read_parquet(f\"s3://{bucket}/parquet/\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 5.4 Read json using the LastModified filter " - ] - }, - { - "cell_type": "code", - "execution_count": 23, + "execution_count": 21, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "# read_fwf\n", - " id name date\n", - "0 1 Herfelingen 27-12-18\n", - "1 2 Lambusart 14-06-18\n", - "2 3 Spormaggiore 15-04-18\n", - "3 4 Buizingen 05-09-19\n", - "4 5 San Rafael 04-09-19\n", - "\n", - " read_json\n", - " id name\n", - "0 1 foo\n", - "1 2 boo\n", - "0 3 bar\n", - "\n", - " read_csv\n", - " id name\n", - "0 1 foo\n", - "1 2 boo\n", - "2 3 bar\n", - "\n", - " read_parquet\n", - " id name\n", - "0 1 foo\n", - "1 2 boo\n", - "2 3 bar\n" - ] - } - ], + "outputs": [], "source": [ - "print('# read_fwf')\n", - "print(wr.s3.read_fwf(f\"s3://{bucket}/fwf/\", names=[\"id\", \"name\", \"date\"], last_modified_begin=begin_utc, last_modified_end=end_utc))\n", - "print('\\n read_json')\n", - "print(wr.s3.read_json(f\"s3://{bucket}/json/\", last_modified_begin=begin_utc, last_modified_end=end_utc))\n", - "print('\\n read_csv')\n", - "print(wr.s3.read_csv(f\"s3://{bucket}/csv/\", last_modified_begin=begin_utc, last_modified_end=end_utc))\n", - "print('\\n read_parquet')\n", - "print(wr.s3.read_parquet(f\"s3://{bucket}/parquet/\", last_modified_begin=begin_utc, last_modified_end=end_utc))" + "wr.s3.read_fwf(f\"s3://{bucket}/fwf/\", names=[\"id\", \"name\", \"date\"], last_modified_begin=begin_utc, last_modified_end=end_utc)\n", + "wr.s3.read_json(f\"s3://{bucket}/json/\", last_modified_begin=begin_utc, last_modified_end=end_utc)\n", + "wr.s3.read_csv(f\"s3://{bucket}/csv/\", last_modified_begin=begin_utc, last_modified_end=end_utc)\n", + "wr.s3.read_parquet(f\"s3://{bucket}/parquet/\", last_modified_begin=begin_utc, last_modified_end=end_utc);" ] }, { @@ -1403,7 +1237,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ diff --git a/tutorials/004 - Parquet Datasets.ipynb b/tutorials/004 - Parquet Datasets.ipynb index 1918a9143..5b9d7b9eb 100644 --- a/tutorials/004 - Parquet Datasets.ipynb +++ b/tutorials/004 - Parquet Datasets.ipynb @@ -50,7 +50,7 @@ "name": "stdin", "output_type": "stream", "text": [ - " ···········································\n" + " ············\n" ] } ], @@ -184,31 +184,31 @@ " \n", " \n", " 0\n", - " 3\n", - " bar\n", - " 2020-01-03\n", - " \n", - " \n", - " 1\n", " 1\n", " foo\n", " 2020-01-01\n", " \n", " \n", - " 2\n", + " 1\n", " 2\n", " boo\n", " 2020-01-02\n", " \n", + " \n", + " 2\n", + " 3\n", + " bar\n", + " 2020-01-03\n", + " \n", " \n", "\n", "" ], "text/plain": [ " id value date\n", - "0 3 bar 2020-01-03\n", - "1 1 foo 2020-01-01\n", - "2 2 boo 2020-01-02" + "0 1 foo 2020-01-01\n", + "1 2 boo 2020-01-02\n", + "2 3 bar 2020-01-03" ] }, "execution_count": 4, diff --git a/tutorials/005 - Glue Catalog.ipynb b/tutorials/005 - Glue 
Catalog.ipynb index bb2e11c12..097841ba2 100644 --- a/tutorials/005 - Glue Catalog.ipynb +++ b/tutorials/005 - Glue Catalog.ipynb @@ -39,7 +39,7 @@ "name": "stdin", "output_type": "stream", "text": [ - " ···········································\n" + " ············\n" ] } ], @@ -197,9 +197,7 @@ "text": [ " Database Description\n", "0 aws_data_wrangler AWS Data Wrangler Test Arena - Glue Database\n", - "1 aws_dataframes AWS DataFrames Test Arena - Glue Database\n", - "2 covid-19 \n", - "3 default Default Hive database\n" + "1 default Default Hive database\n" ] } ], @@ -226,10 +224,8 @@ "text": [ " Database Description\n", "0 aws_data_wrangler AWS Data Wrangler Test Arena - Glue Database\n", - "1 aws_dataframes AWS DataFrames Test Arena - Glue Database\n", - "2 awswrangler_test \n", - "3 covid-19 \n", - "4 default Default Hive database\n" + "1 awswrangler_test \n", + "2 default Default Hive database\n" ] } ], diff --git a/tutorials/007 - Redshift, MySQL, PostgreSQL.ipynb b/tutorials/007 - Redshift, MySQL, PostgreSQL.ipynb index 0566090db..8c049e6a6 100644 --- a/tutorials/007 - Redshift, MySQL, PostgreSQL.ipynb +++ b/tutorials/007 - Redshift, MySQL, PostgreSQL.ipynb @@ -168,13 +168,6 @@ "wr.db.read_sql_query(\"SELECT * FROM test.tutorial\", con=eng_mysql) # MySQL\n", "wr.db.read_sql_query(\"SELECT * FROM public.tutorial\", con=eng_redshift) # Redshift" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/tutorials/008 - Redshift - Copy & Unload.ipynb b/tutorials/008 - Redshift - Copy & Unload.ipynb index 8d263f879..faeca4bbb 100644 --- a/tutorials/008 - Redshift - Copy & Unload.ipynb +++ b/tutorials/008 - Redshift - Copy & Unload.ipynb @@ -44,7 +44,7 @@ "name": "stdin", "output_type": "stream", "text": [ - " ···········································\n" + " ············\n" ] } ], @@ -191,7 +191,7 @@ " ...\n", " \n", " \n", - " 3898086\n", + " 3899515\n", " UZM00038457\n", " 1897-12-31\n", " TMIN\n", @@ -202,7 +202,7 @@ " NaN\n", " \n", " \n", - " 3898087\n", + " 3899516\n", " UZM00038457\n", " 1897-12-31\n", " PRCP\n", @@ -213,7 +213,7 @@ " NaN\n", " \n", " \n", - " 3898088\n", + " 3899517\n", " UZM00038457\n", " 1897-12-31\n", " TAVG\n", @@ -224,7 +224,7 @@ " NaN\n", " \n", " \n", - " 3898089\n", + " 3899518\n", " UZM00038618\n", " 1897-12-31\n", " PRCP\n", @@ -235,7 +235,7 @@ " NaN\n", " \n", " \n", - " 3898090\n", + " 3899519\n", " UZM00038618\n", " 1897-12-31\n", " TAVG\n", @@ -247,7 +247,7 @@ " \n", " \n", "\n", - "
<p>3898091 rows × 8 columns</p>
\n", + "
<p>3899520 rows × 8 columns</p>
\n", "" ], "text/plain": [ @@ -258,13 +258,13 @@ "3 AGE00135039 1897-01-01 TMAX 140 NaN NaN E NaN\n", "4 AGE00135039 1897-01-01 TMIN 40 NaN NaN E NaN\n", "... ... ... ... ... ... ... ... ...\n", - "3898086 UZM00038457 1897-12-31 TMIN -145 NaN NaN r NaN\n", - "3898087 UZM00038457 1897-12-31 PRCP 4 NaN NaN r NaN\n", - "3898088 UZM00038457 1897-12-31 TAVG -95 NaN NaN r NaN\n", - "3898089 UZM00038618 1897-12-31 PRCP 66 NaN NaN r NaN\n", - "3898090 UZM00038618 1897-12-31 TAVG -45 NaN NaN r NaN\n", + "3899515 UZM00038457 1897-12-31 TMIN -145 NaN NaN r NaN\n", + "3899516 UZM00038457 1897-12-31 PRCP 4 NaN NaN r NaN\n", + "3899517 UZM00038457 1897-12-31 TAVG -95 NaN NaN r NaN\n", + "3899518 UZM00038618 1897-12-31 PRCP 66 NaN NaN r NaN\n", + "3899519 UZM00038618 1897-12-31 TAVG -45 NaN NaN r NaN\n", "\n", - "[3898091 rows x 8 columns]" + "[3899520 rows x 8 columns]" ] }, "execution_count": 4, @@ -299,7 +299,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 1min 5s, sys: 2.62 s, total: 1min 8s\n", + "CPU times: user 1min 7s, sys: 2.45 s, total: 1min 9s\n", "Wall time: 4min 29s\n" ] } @@ -326,8 +326,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 15.3 s, sys: 2.01 s, total: 17.3 s\n", - "Wall time: 27.3 s\n" + "CPU times: user 15.8 s, sys: 1.93 s, total: 17.7 s\n", + "Wall time: 28.9 s\n" ] }, { @@ -408,10 +408,10 @@ " \n", " \n", " 4\n", - " AGE00147708\n", + " AGE00147706\n", " 1897-01-01\n", " TMAX\n", - " 170\n", + " 130\n", " <NA>\n", " <NA>\n", " E\n", @@ -429,7 +429,7 @@ " ...\n", " \n", " \n", - " 3898086\n", + " 3899515\n", " USW00094967\n", " 1897-12-31\n", " TMAX\n", @@ -440,7 +440,7 @@ " <NA>\n", " \n", " \n", - " 3898087\n", + " 3899516\n", " USW00094967\n", " 1897-12-31\n", " PRCP\n", @@ -451,7 +451,7 @@ " <NA>\n", " \n", " \n", - " 3898088\n", + " 3899517\n", " UZM00038457\n", " 1897-12-31\n", " TMAX\n", @@ -462,7 +462,7 @@ " <NA>\n", " \n", " \n", - " 3898089\n", + " 3899518\n", " UZM00038457\n", " 1897-12-31\n", " PRCP\n", @@ -473,7 +473,7 @@ " <NA>\n", " \n", " \n", - " 3898090\n", + " 3899519\n", " UZM00038618\n", " 1897-12-31\n", " PRCP\n", @@ -485,7 +485,7 @@ " \n", " \n", "\n", - "
<p>3898091 rows × 8 columns</p>
\n", + "
<p>3899520 rows × 8 columns</p>
\n", "" ], "text/plain": [ @@ -494,15 +494,15 @@ "1 AGE00135039 1897-01-01 TMAX 140 E \n", "2 AGE00135039 1897-01-01 PRCP 0 E \n", "3 AGE00147705 1897-01-01 TMIN 98 E \n", - "4 AGE00147708 1897-01-01 TMAX 170 E \n", + "4 AGE00147706 1897-01-01 TMAX 130 E \n", "... ... ... ... ... ... ... ... ...\n", - "3898086 USW00094967 1897-12-31 TMAX -144 6 \n", - "3898087 USW00094967 1897-12-31 PRCP 0 P 6 \n", - "3898088 UZM00038457 1897-12-31 TMAX -49 r \n", - "3898089 UZM00038457 1897-12-31 PRCP 4 r \n", - "3898090 UZM00038618 1897-12-31 PRCP 66 r \n", + "3899515 USW00094967 1897-12-31 TMAX -144 6 \n", + "3899516 USW00094967 1897-12-31 PRCP 0 P 6 \n", + "3899517 UZM00038457 1897-12-31 TMAX -49 r \n", + "3899518 UZM00038457 1897-12-31 PRCP 4 r \n", + "3899519 UZM00038618 1897-12-31 PRCP 66 r \n", "\n", - "[3898091 rows x 8 columns]" + "[3899520 rows x 8 columns]" ] }, "execution_count": 6, @@ -532,8 +532,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 2.23 s, sys: 201 ms, total: 2.43 s\n", - "Wall time: 9.51 s\n" + "CPU times: user 3.07 s, sys: 200 ms, total: 3.27 s\n", + "Wall time: 19.6 s\n" ] } ], @@ -560,8 +560,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 3.65 s, sys: 671 ms, total: 4.32 s\n", - "Wall time: 12.5 s\n" + "CPU times: user 3.99 s, sys: 682 ms, total: 4.67 s\n", + "Wall time: 16 s\n" ] }, { @@ -600,8 +600,8 @@ " 0\n", " AG000060590\n", " 1897-01-01\n", - " TMAX\n", - " 170\n", + " TMIN\n", + " -14\n", " <NA>\n", " <NA>\n", " E\n", @@ -609,10 +609,10 @@ " \n", " \n", " 1\n", - " AG000060590\n", + " AGE00135039\n", " 1897-01-01\n", - " PRCP\n", - " 0\n", + " TMAX\n", + " 140\n", " <NA>\n", " <NA>\n", " E\n", @@ -622,8 +622,8 @@ " 2\n", " AGE00135039\n", " 1897-01-01\n", - " TMIN\n", - " 40\n", + " PRCP\n", + " 0\n", " <NA>\n", " <NA>\n", " E\n", @@ -633,8 +633,8 @@ " 3\n", " AGE00147705\n", " 1897-01-01\n", - " TMAX\n", - " 164\n", + " TMIN\n", + " 98\n", " <NA>\n", " <NA>\n", " E\n", @@ -642,10 +642,10 @@ " \n", " \n", " 4\n", - " AGE00147705\n", + " AGE00147706\n", " 1897-01-01\n", - " PRCP\n", - " 0\n", + " TMAX\n", + " 130\n", " <NA>\n", " <NA>\n", " E\n", @@ -663,7 +663,7 @@ " ...\n", " \n", " \n", - " 3898086\n", + " 1949755\n", " USW00094967\n", " 1897-12-31\n", " TMAX\n", @@ -674,7 +674,7 @@ " <NA>\n", " \n", " \n", - " 3898087\n", + " 1949756\n", " USW00094967\n", " 1897-12-31\n", " PRCP\n", @@ -685,7 +685,7 @@ " <NA>\n", " \n", " \n", - " 3898088\n", + " 1949757\n", " UZM00038457\n", " 1897-12-31\n", " TMAX\n", @@ -696,7 +696,7 @@ " <NA>\n", " \n", " \n", - " 3898089\n", + " 1949758\n", " UZM00038457\n", " 1897-12-31\n", " PRCP\n", @@ -707,7 +707,7 @@ " <NA>\n", " \n", " \n", - " 3898090\n", + " 1949759\n", " UZM00038618\n", " 1897-12-31\n", " PRCP\n", @@ -719,24 +719,24 @@ " \n", " \n", "\n", - "
<p>3898091 rows × 8 columns</p>
\n", + "
<p>3899520 rows × 8 columns</p>
\n", "" ], "text/plain": [ " id dt element value m_flag q_flag s_flag obs_time\n", - "0 AG000060590 1897-01-01 TMAX 170 E \n", - "1 AG000060590 1897-01-01 PRCP 0 E \n", - "2 AGE00135039 1897-01-01 TMIN 40 E \n", - "3 AGE00147705 1897-01-01 TMAX 164 E \n", - "4 AGE00147705 1897-01-01 PRCP 0 E \n", + "0 AG000060590 1897-01-01 TMIN -14 E \n", + "1 AGE00135039 1897-01-01 TMAX 140 E \n", + "2 AGE00135039 1897-01-01 PRCP 0 E \n", + "3 AGE00147705 1897-01-01 TMIN 98 E \n", + "4 AGE00147706 1897-01-01 TMAX 130 E \n", "... ... ... ... ... ... ... ... ...\n", - "3898086 USW00094967 1897-12-31 TMAX -144 6 \n", - "3898087 USW00094967 1897-12-31 PRCP 0 P 6 \n", - "3898088 UZM00038457 1897-12-31 TMAX -49 r \n", - "3898089 UZM00038457 1897-12-31 PRCP 4 r \n", - "3898090 UZM00038618 1897-12-31 PRCP 66 r \n", + "1949755 USW00094967 1897-12-31 TMAX -144 6 \n", + "1949756 USW00094967 1897-12-31 PRCP 0 P 6 \n", + "1949757 UZM00038457 1897-12-31 TMAX -49 r \n", + "1949758 UZM00038457 1897-12-31 PRCP 4 r \n", + "1949759 UZM00038618 1897-12-31 PRCP 66 r \n", "\n", - "[3898091 rows x 8 columns]" + "[3899520 rows x 8 columns]" ] }, "execution_count": 8, @@ -755,13 +755,6 @@ " keep_files=True,\n", ")" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/tutorials/009 - Redshift - Append, Overwrite, Upsert.ipynb b/tutorials/009 - Redshift - Append, Overwrite, Upsert.ipynb index dcf62739e..0873bc7f7 100644 --- a/tutorials/009 - Redshift - Append, Overwrite, Upsert.ipynb +++ b/tutorials/009 - Redshift - Append, Overwrite, Upsert.ipynb @@ -46,7 +46,7 @@ "name": "stdin", "output_type": "stream", "text": [ - " ···········································\n" + " ············\n" ] } ], @@ -302,31 +302,31 @@ " \n", " \n", " 0\n", - " 2\n", - " xoo\n", - " 2020-01-02\n", - " \n", - " \n", - " 1\n", " 1\n", " foo\n", " 2020-01-01\n", " \n", " \n", - " 2\n", + " 1\n", " 3\n", " bar\n", " 2020-01-03\n", " \n", + " \n", + " 2\n", + " 2\n", + " xoo\n", + " 2020-01-02\n", + " \n", " \n", "\n", "" ], "text/plain": [ " id value date\n", - "0 2 xoo 2020-01-02\n", - "1 1 foo 2020-01-01\n", - "2 3 bar 2020-01-03" + "0 1 foo 2020-01-01\n", + "1 3 bar 2020-01-03\n", + "2 2 xoo 2020-01-02" ] }, "execution_count": 6, @@ -371,13 +371,6 @@ "with engine.connect() as con:\n", " con.execute(\"DROP TABLE public.my_table\")" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/tutorials/010 - Parquet Crawler.ipynb b/tutorials/010 - Parquet Crawler.ipynb index 0768b7726..e618e2d96 100644 --- a/tutorials/010 - Parquet Crawler.ipynb +++ b/tutorials/010 - Parquet Crawler.ipynb @@ -36,7 +36,7 @@ "name": "stdin", "output_type": "stream", "text": [ - " ···········································\n" + " ············\n" ] } ], @@ -159,7 +159,7 @@ " ...\n", " \n", " \n", - " 29240014\n", + " 29249753\n", " UZM00038457\n", " 1899-12-31\n", " PRCP\n", @@ -170,7 +170,7 @@ " NaN\n", " \n", " \n", - " 29240015\n", + " 29249754\n", " UZM00038457\n", " 1899-12-31\n", " TAVG\n", @@ -181,7 +181,7 @@ " NaN\n", " \n", " \n", - " 29240016\n", + " 29249755\n", " UZM00038618\n", " 1899-12-31\n", " TMIN\n", @@ -192,7 +192,7 @@ " NaN\n", " \n", " \n", - " 29240017\n", + " 29249756\n", " UZM00038618\n", " 1899-12-31\n", " PRCP\n", @@ -203,7 +203,7 @@ " NaN\n", " \n", " \n", - " 29240018\n", + " 29249757\n", " UZM00038618\n", " 1899-12-31\n", " TAVG\n", @@ -215,7 +215,7 @@ " \n", " \n", "\n", - "
<p>29240019 rows × 8 columns</p>
\n", + "
<p>29249758 rows × 8 columns</p>
\n", "" ], "text/plain": [ @@ -226,13 +226,13 @@ "3 AGE00147705 1890-01-01 TMAX 140 NaN NaN E NaN\n", "4 AGE00147705 1890-01-01 TMIN 74 NaN NaN E NaN\n", "... ... ... ... ... ... ... ... ...\n", - "29240014 UZM00038457 1899-12-31 PRCP 16 NaN NaN r NaN\n", - "29240015 UZM00038457 1899-12-31 TAVG -73 NaN NaN r NaN\n", - "29240016 UZM00038618 1899-12-31 TMIN -76 NaN NaN r NaN\n", - "29240017 UZM00038618 1899-12-31 PRCP 0 NaN NaN r NaN\n", - "29240018 UZM00038618 1899-12-31 TAVG -60 NaN NaN r NaN\n", + "29249753 UZM00038457 1899-12-31 PRCP 16 NaN NaN r NaN\n", + "29249754 UZM00038457 1899-12-31 TAVG -73 NaN NaN r NaN\n", + "29249755 UZM00038618 1899-12-31 TMIN -76 NaN NaN r NaN\n", + "29249756 UZM00038618 1899-12-31 PRCP 0 NaN NaN r NaN\n", + "29249757 UZM00038618 1899-12-31 TAVG -60 NaN NaN r NaN\n", "\n", - "[29240019 rows x 8 columns]" + "[29249758 rows x 8 columns]" ] }, "execution_count": 3, @@ -370,16 +370,16 @@ { "data": { "text/plain": [ - "['year=1890/f66834ded9314208908667b40ccb5b54.snappy.parquet',\n", - " 'year=1891/73ee737ebb9144929ee63f6cd2725b8b.snappy.parquet',\n", - " 'year=1892/aee80df68614404d957d54f8b36a6143.snappy.parquet',\n", - " 'year=1893/159ae23b89b14de499b0312f03aca345.snappy.parquet',\n", - " 'year=1894/1694a1fe48194862803d8494c5405ad1.snappy.parquet',\n", - " 'year=1895/ba4d698250364922971a7b7dce96dc67.snappy.parquet',\n", - " 'year=1896/c2e422d32b2e4cb4a9d38b398845a976.snappy.parquet',\n", - " 'year=1897/2ec3223d6f284bfe9b604abbac225996.snappy.parquet',\n", - " 'year=1898/ffc78ab36f954d4ba6890892767a3cfb.snappy.parquet',\n", - " 'year=1899/c05cd01236a94b158b2b49e924e71431.snappy.parquet']" + "['year=1890/06a519afcf8e48c9b08c8908f30adcfe.snappy.parquet',\n", + " 'year=1891/5a99c28dbef54008bfc770c946099e02.snappy.parquet',\n", + " 'year=1892/9b1ea5d1cfad40f78c920f93540ca8ec.snappy.parquet',\n", + " 'year=1893/92259b49c134401eaf772506ee802af6.snappy.parquet',\n", + " 'year=1894/c734469ffff944f69dc277c630064a16.snappy.parquet',\n", + " 'year=1895/cf7ccde86aaf4d138f86c379c0817aa6.snappy.parquet',\n", + " 'year=1896/ce02f4c2c554438786b766b33db451b6.snappy.parquet',\n", + " 'year=1897/e04de04ad3c444deadcc9c410ab97ca1.snappy.parquet',\n", + " 'year=1898/acb0e02878f04b56a6200f4b5a97be0e.snappy.parquet',\n", + " 'year=1899/a269bdbb0f6a48faac55f3bcfef7df7a.snappy.parquet']" ] }, "execution_count": 6, @@ -407,8 +407,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 862 ms, sys: 382 ms, total: 1.24 s\n", - "Wall time: 1.45 s\n" + "CPU times: user 1.81 s, sys: 528 ms, total: 2.33 s\n", + "Wall time: 3.21 s\n" ] } ], @@ -563,8 +563,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 2.08 s, sys: 423 ms, total: 2.5 s\n", - "Wall time: 7.23 s\n" + "CPU times: user 3.52 s, sys: 811 ms, total: 4.33 s\n", + "Wall time: 9.6 s\n" ] }, { @@ -602,61 +602,61 @@ " \n", " \n", " 0\n", - " CA006116254\n", + " USC00195145\n", " 1890-01-01\n", " TMIN\n", - " -61\n", + " -28\n", " <NA>\n", " <NA>\n", - " C\n", + " 6\n", " <NA>\n", " 1890\n", " \n", " \n", " 1\n", - " CA006116254\n", + " USC00196770\n", " 1890-01-01\n", " PRCP\n", - " 127\n", - " <NA>\n", + " 0\n", + " P\n", " <NA>\n", - " C\n", + " 6\n", " <NA>\n", " 1890\n", " \n", " \n", " 2\n", - " CA006116254\n", + " USC00196770\n", " 1890-01-01\n", " SNOW\n", " 0\n", " <NA>\n", " <NA>\n", - " C\n", + " 6\n", " <NA>\n", " 1890\n", " \n", " \n", " 3\n", - " CA006116705\n", + " USC00196915\n", " 1890-01-01\n", " PRCP\n", " 0\n", + " P\n", " <NA>\n", - " <NA>\n", - " C\n", + " 6\n", " 
<NA>\n", " 1890\n", " \n", " \n", " 4\n", - " CA006116705\n", + " USC00196915\n", " 1890-01-01\n", " SNOW\n", " 0\n", " <NA>\n", " <NA>\n", - " C\n", + " 6\n", " <NA>\n", " 1890\n", " \n", @@ -673,62 +673,62 @@ " ...\n", " \n", " \n", - " 1276241\n", - " CA006120315\n", - " 1890-12-31\n", - " TMAX\n", - " 11\n", + " 6139\n", + " ASN00022006\n", + " 1890-12-03\n", + " PRCP\n", + " 0\n", " <NA>\n", " <NA>\n", - " C\n", + " a\n", " <NA>\n", " 1890\n", " \n", " \n", - " 1276242\n", - " CA006120315\n", - " 1890-12-31\n", - " TMIN\n", + " 6140\n", + " ASN00022007\n", + " 1890-12-03\n", + " PRCP\n", " 0\n", " <NA>\n", " <NA>\n", - " C\n", + " a\n", " <NA>\n", " 1890\n", " \n", " \n", - " 1276243\n", - " CA006120315\n", - " 1890-12-31\n", + " 6141\n", + " ASN00022008\n", + " 1890-12-03\n", " PRCP\n", - " 5\n", + " 0\n", " <NA>\n", " <NA>\n", - " C\n", + " a\n", " <NA>\n", " 1890\n", " \n", " \n", - " 1276244\n", - " CA006120315\n", - " 1890-12-31\n", - " SNOW\n", + " 6142\n", + " ASN00022009\n", + " 1890-12-03\n", + " PRCP\n", " 0\n", " <NA>\n", " <NA>\n", - " C\n", + " a\n", " <NA>\n", " 1890\n", " \n", " \n", - " 1276245\n", - " CA006120795\n", - " 1890-12-31\n", + " 6143\n", + " ASN00022011\n", + " 1890-12-03\n", " PRCP\n", " 0\n", " <NA>\n", " <NA>\n", - " C\n", + " a\n", " <NA>\n", " 1890\n", " \n", @@ -738,31 +738,31 @@ "" ], "text/plain": [ - " id dt element value m_flag q_flag s_flag obs_time \\\n", - "0 CA006116254 1890-01-01 TMIN -61 C \n", - "1 CA006116254 1890-01-01 PRCP 127 C \n", - "2 CA006116254 1890-01-01 SNOW 0 C \n", - "3 CA006116705 1890-01-01 PRCP 0 C \n", - "4 CA006116705 1890-01-01 SNOW 0 C \n", - "... ... ... ... ... ... ... ... ... \n", - "1276241 CA006120315 1890-12-31 TMAX 11 C \n", - "1276242 CA006120315 1890-12-31 TMIN 0 C \n", - "1276243 CA006120315 1890-12-31 PRCP 5 C \n", - "1276244 CA006120315 1890-12-31 SNOW 0 C \n", - "1276245 CA006120795 1890-12-31 PRCP 0 C \n", + " id dt element value m_flag q_flag s_flag obs_time \\\n", + "0 USC00195145 1890-01-01 TMIN -28 6 \n", + "1 USC00196770 1890-01-01 PRCP 0 P 6 \n", + "2 USC00196770 1890-01-01 SNOW 0 6 \n", + "3 USC00196915 1890-01-01 PRCP 0 P 6 \n", + "4 USC00196915 1890-01-01 SNOW 0 6 \n", + "... ... ... ... ... ... ... ... ... \n", + "6139 ASN00022006 1890-12-03 PRCP 0 a \n", + "6140 ASN00022007 1890-12-03 PRCP 0 a \n", + "6141 ASN00022008 1890-12-03 PRCP 0 a \n", + "6142 ASN00022009 1890-12-03 PRCP 0 a \n", + "6143 ASN00022011 1890-12-03 PRCP 0 a \n", "\n", - " year \n", - "0 1890 \n", - "1 1890 \n", - "2 1890 \n", - "3 1890 \n", - "4 1890 \n", - "... ... \n", - "1276241 1890 \n", - "1276242 1890 \n", - "1276243 1890 \n", - "1276244 1890 \n", - "1276245 1890 \n", + " year \n", + "0 1890 \n", + "1 1890 \n", + "2 1890 \n", + "3 1890 \n", + "4 1890 \n", + "... ... 
\n", + "6139 1890 \n", + "6140 1890 \n", + "6141 1890 \n", + "6142 1890 \n", + "6143 1890 \n", "\n", "[1276246 rows x 9 columns]" ] diff --git a/tutorials/011 - CSV Datasets.ipynb b/tutorials/011 - CSV Datasets.ipynb index 28e9cef9d..23a93aa8a 100644 --- a/tutorials/011 - CSV Datasets.ipynb +++ b/tutorials/011 - CSV Datasets.ipynb @@ -50,7 +50,7 @@ "name": "stdin", "output_type": "stream", "text": [ - " ···········································\n" + " ············\n" ] } ], @@ -204,31 +204,31 @@ " \n", " \n", " 0\n", - " 3\n", - " bar\n", - " 2020-01-03\n", - " \n", - " \n", - " 1\n", " 1\n", " foo\n", " 2020-01-01\n", " \n", " \n", - " 2\n", + " 1\n", " 2\n", " boo\n", " 2020-01-02\n", " \n", + " \n", + " 2\n", + " 3\n", + " bar\n", + " 2020-01-03\n", + " \n", " \n", "\n", "" ], "text/plain": [ " id value date\n", - "0 3 bar 2020-01-03\n", - "1 1 foo 2020-01-01\n", - "2 2 boo 2020-01-02" + "0 1 foo 2020-01-01\n", + "1 2 boo 2020-01-02\n", + "2 3 bar 2020-01-03" ] }, "execution_count": 5, @@ -370,24 +370,24 @@ " \n", " \n", " 0\n", - " 1\n", - " foo\n", - " 2020-01-01\n", - " \n", - " \n", - " 1\n", " 2\n", " boo\n", " 2020-01-02\n", " \n", + " \n", + " 1\n", + " 1\n", + " foo\n", + " 2020-01-01\n", + " \n", " \n", "\n", "" ], "text/plain": [ " id value date\n", - "0 1 foo 2020-01-01\n", - "1 2 boo 2020-01-02" + "0 2 boo 2020-01-02\n", + "1 1 foo 2020-01-01" ] }, "execution_count": 7, @@ -457,17 +457,17 @@ " \n", " \n", " 0\n", - " 3\n", - " bar\n", - " 2020-01-03\n", - " \n", - " \n", - " 1\n", " 2\n", " xoo\n", " 2020-01-02\n", " \n", " \n", + " 1\n", + " 3\n", + " bar\n", + " 2020-01-03\n", + " \n", + " \n", " 2\n", " 1\n", " foo\n", @@ -479,8 +479,8 @@ ], "text/plain": [ " id value date\n", - "0 3 bar 2020-01-03\n", - "1 2 xoo 2020-01-02\n", + "0 2 xoo 2020-01-02\n", + "1 3 bar 2020-01-03\n", "2 1 foo 2020-01-01" ] }, diff --git a/tutorials/012 - CSV Crawler.ipynb b/tutorials/012 - CSV Crawler.ipynb index 7ad0a5b94..bfa2d48fd 100644 --- a/tutorials/012 - CSV Crawler.ipynb +++ b/tutorials/012 - CSV Crawler.ipynb @@ -38,7 +38,7 @@ "name": "stdin", "output_type": "stream", "text": [ - " ···········································\n" + " ············\n" ] } ], @@ -557,7 +557,7 @@ " 1\n", " 1\n", " 1.0\n", - " 2020-01-01\n", + " None\n", " 2020-01-02\n", " True\n", " 1\n", @@ -568,7 +568,7 @@ " 1\n", " 1\n", " 1.0\n", - " 2020-01-01\n", + " None\n", " 2020-01-02\n", " True\n", " 1\n", @@ -579,9 +579,9 @@ "" ], "text/plain": [ - " id string float date timestamp bool par0 par1\n", - "0 1 1 1.0 2020-01-01 2020-01-02 True 1 a\n", - "1 1 1 1.0 2020-01-01 2020-01-02 True 1 a" + " id string float date timestamp bool par0 par1\n", + "0 1 1 1.0 None 2020-01-02 True 1 a\n", + "1 1 1 1.0 None 2020-01-02 True 1 a" ] }, "execution_count": 13, @@ -665,13 +665,6 @@ "source": [ "wr.catalog.delete_table_if_exists(database=\"awswrangler_test\", table=\"csv_crawler\")" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/tutorials/013 - Merging Datasets on S3.ipynb b/tutorials/013 - Merging Datasets on S3.ipynb index f77ba6201..5dfc091d0 100644 --- a/tutorials/013 - Merging Datasets on S3.ipynb +++ b/tutorials/013 - Merging Datasets on S3.ipynb @@ -50,7 +50,7 @@ "name": "stdin", "output_type": "stream", "text": [ - " ···········································\n" + " ············\n" ] } ], @@ -277,13 +277,13 @@ " \n", " 1\n", " 2\n", - " boo\n", + " xoo\n", " 2020-01-02\n", " \n", " \n", " 2\n", " 2\n", - " xoo\n", + " 
boo\n", " 2020-01-02\n", " \n", " \n", @@ -299,8 +299,8 @@ "text/plain": [ " id value date\n", "0 1 foo 2020-01-01\n", - "1 2 boo 2020-01-02\n", - "2 2 xoo 2020-01-02\n", + "1 2 xoo 2020-01-02\n", + "2 2 boo 2020-01-02\n", "3 3 bar 2020-01-03" ] }, @@ -494,13 +494,6 @@ "wr.s3.delete_objects(path1)\n", "wr.s3.delete_objects(path2)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/tutorials/021 - Global Configurations.ipynb b/tutorials/021 - Global Configurations.ipynb index adab38a28..09ba0efba 100644 --- a/tutorials/021 - Global Configurations.ipynb +++ b/tutorials/021 - Global Configurations.ipynb @@ -288,8 +288,8 @@ " \n", " \n", " 6\n", - " s3fs_block_size\n", - " WR_S3FS_BLOCK_SIZE\n", + " s3_block_size\n", + " WR_S3_BLOCK_SIZE\n", " <class 'int'>\n", " False\n", " True\n", @@ -300,7 +300,7 @@ "" ], "text/plain": [ - "" + "" ] }, "execution_count": 7, @@ -315,9 +315,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "conda_python3", "language": "python", - "name": "python3" + "name": "conda_python3" }, "language_info": { "codemirror_mode": { diff --git a/validate.sh b/validate.sh index b37d102aa..2a7807d2b 100755 --- a/validate.sh +++ b/validate.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash set -ex -isort -rc awswrangler tests +isort awswrangler tests black --line-length 120 --target-version py36 awswrangler tests pydocstyle awswrangler/ --convention=numpy mypy awswrangler