diff --git a/.github/workflows/static-checking.yml b/.github/workflows/static-checking.yml index 76291af88..11133f8da 100644 --- a/.github/workflows/static-checking.yml +++ b/.github/workflows/static-checking.yml @@ -36,3 +36,7 @@ jobs: run: flake8 setup.py awswrangler testing/test_awswrangler - name: Pylint Lint run: pylint -j 0 awswrangler + - name: Black style + run: black --check --line-length 120 --target-version py36 awswrangler testing/test_awswrangler + - name: Imports order check (isort) + run: isort -rc --check-only awswrangler testing/test_awswrangler diff --git a/.isort.cfg b/.isort.cfg new file mode 100644 index 000000000..f8e61f2f3 --- /dev/null +++ b/.isort.cfg @@ -0,0 +1,6 @@ +[settings] +multi_line_output=3 +include_trailing_comma=True +force_grid_wrap=0 +use_parentheses=True +line_length=120 diff --git a/README.md b/README.md index 0f01a2d7c..cabd20ea3 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ ![AWS Data Wrangler](docs/source/_static/logo2.png?raw=true "AWS Data Wrangler") -[![Release](https://img.shields.io/badge/release-1.3.0-brightgreen.svg)](https://pypi.org/project/awswrangler/) +[![Release](https://img.shields.io/badge/release-1.4.0-brightgreen.svg)](https://pypi.org/project/awswrangler/) [![Python Version](https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8-brightgreen.svg)](https://anaconda.org/conda-forge/awswrangler) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) @@ -63,23 +63,23 @@ df = wr.db.read_sql_query("SELECT * FROM external_schema.my_table", con=engine) - [EMR](https://aws-data-wrangler.readthedocs.io/en/latest/install.html#emr) - [From source](https://aws-data-wrangler.readthedocs.io/en/latest/install.html#from-source) - [**Tutorials**](https://github.com/awslabs/aws-data-wrangler/tree/master/tutorials) - - [01 - Introduction](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/01%20-%20Introduction.ipynb) - - [02 - Sessions](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/02%20-%20Sessions.ipynb) - - [03 - Amazon S3](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/03%20-%20Amazon%20S3.ipynb) - - [04 - Parquet Datasets](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/04%20-%20Parquet%20Datasets.ipynb) - - [05 - Glue Catalog](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/05%20-%20Glue%20Catalog.ipynb) - - [06 - Amazon Athena](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/06%20-%20Amazon%20Athena.ipynb) - - [07 - Databases (Redshift, MySQL and PostgreSQL)](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/07%20-%20Redshift%2C%20MySQL%2C%20PostgreSQL.ipynb) - - [08 - Redshift - Copy & Unload.ipynb](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/08%20-%20Redshift%20-%20Copy%20%26%20Unload.ipynb) - - [09 - Redshift - Append, Overwrite and Upsert](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/09%20-%20Redshift%20-%20Append%2C%20Overwrite%2C%20Upsert.ipynb) - - [10 - Parquet Crawler](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/10%20-%20Parquet%20Crawler.ipynb) - - [11 - CSV Datasets](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/11%20-%20CSV%20Datasets.ipynb) - - [12 - CSV 
Crawler](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/12%20-%20CSV%20Crawler.ipynb) - - [13 - Merging Datasets on S3](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/13%20-%20Merging%20Datasets%20on%20S3.ipynb) - - [14 - Schema Evolution](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/14%20-%20Schema%20Evolution.ipynb) - - [15 - EMR](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/15%20-%20EMR.ipynb) - - [16 - EMR & Docker](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/16%20-%20EMR%20%26%20Docker.ipynb) - - [17 - Partition Projection](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/17%20-%20Partition%20Projection.ipynb) + - [001 - Introduction](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/001%20-%20Introduction.ipynb) + - [002 - Sessions](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/002%20-%20Sessions.ipynb) + - [003 - Amazon S3](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/003%20-%20Amazon%20S3.ipynb) + - [004 - Parquet Datasets](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/004%20-%20Parquet%20Datasets.ipynb) + - [005 - Glue Catalog](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/005%20-%20Glue%20Catalog.ipynb) + - [006 - Amazon Athena](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/006%20-%20Amazon%20Athena.ipynb) + - [007 - Databases (Redshift, MySQL and PostgreSQL)](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/007%20-%20Redshift%2C%20MySQL%2C%20PostgreSQL.ipynb) + - [008 - Redshift - Copy & Unload.ipynb](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/008%20-%20Redshift%20-%20Copy%20%26%20Unload.ipynb) + - [009 - Redshift - Append, Overwrite and Upsert](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/009%20-%20Redshift%20-%20Append%2C%20Overwrite%2C%20Upsert.ipynb) + - [010 - Parquet Crawler](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/010%20-%20Parquet%20Crawler.ipynb) + - [011 - CSV Datasets](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/011%20-%20CSV%20Datasets.ipynb) + - [012 - CSV Crawler](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/012%20-%20CSV%20Crawler.ipynb) + - [013 - Merging Datasets on S3](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/013%20-%20Merging%20Datasets%20on%20S3.ipynb) + - [014 - Schema Evolution](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/014%20-%20Schema%20Evolution.ipynb) + - [015 - EMR](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/015%20-%20EMR.ipynb) + - [016 - EMR & Docker](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/016%20-%20EMR%20%26%20Docker.ipynb) + - [017 - Partition Projection](https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/017%20-%20Partition%20Projection.ipynb) - [**API Reference**](https://aws-data-wrangler.readthedocs.io/en/latest/api.html) - [Amazon S3](https://aws-data-wrangler.readthedocs.io/en/latest/api.html#amazon-s3) - [AWS Glue Catalog](https://aws-data-wrangler.readthedocs.io/en/latest/api.html#aws-glue-catalog) diff --git a/awswrangler/__metadata__.py b/awswrangler/__metadata__.py index 26d9ff44e..dc3dcb059 100644 --- a/awswrangler/__metadata__.py +++ b/awswrangler/__metadata__.py @@ -7,5 +7,5 @@ __title__ = "awswrangler" __description__ = "Pandas on AWS." 
-__version__ = "1.3.0" +__version__ = "1.4.0" __license__ = "Apache License 2.0" diff --git a/awswrangler/_utils.py b/awswrangler/_utils.py index b869d78c1..c399701f8 100644 --- a/awswrangler/_utils.py +++ b/awswrangler/_utils.py @@ -203,10 +203,10 @@ def get_region_from_session(boto3_session: Optional[boto3.Session] = None, defau ) # pragma: no cover -def extract_partitions_from_paths( +def extract_partitions_metadata_from_paths( path: str, paths: List[str] ) -> Tuple[Optional[Dict[str, str]], Optional[Dict[str, List[str]]]]: - """Extract partitions from Amazon S3 paths.""" + """Extract partitions metadata from Amazon S3 paths.""" path = path if path.endswith("/") else f"{path}/" partitions_types: Dict[str, str] = {} partitions_values: Dict[str, List[str]] = {} @@ -217,7 +217,7 @@ def extract_partitions_from_paths( ) # pragma: no cover path_wo_filename: str = p.rpartition("/")[0] + "/" if path_wo_filename not in partitions_values: - path_wo_prefix: str = p.replace(f"{path}/", "") + path_wo_prefix: str = path_wo_filename.replace(f"{path}/", "") dirs: List[str] = [x for x in path_wo_prefix.split("/") if (x != "") and ("=" in x)] if dirs: values_tups: List[Tuple[str, str]] = [tuple(x.split("=")[:2]) for x in dirs] # type: ignore @@ -238,6 +238,23 @@ def extract_partitions_from_paths( return partitions_types, partitions_values +def extract_partitions_from_path(path_root: str, path: str) -> Dict[str, Any]: + """Extract partitions values and names from Amazon S3 path.""" + path_root = path_root if path_root.endswith("/") else f"{path_root}/" + if path_root not in path: + raise exceptions.InvalidArgumentValue( + f"Object {path} is not under the root path ({path_root})." + ) # pragma: no cover + path_wo_filename: str = path.rpartition("/")[0] + "/" + path_wo_prefix: str = path_wo_filename.replace(f"{path_root}/", "") + dirs: List[str] = [x for x in path_wo_prefix.split("/") if (x != "") and ("=" in x)] + if not dirs: + return {} # pragma: no cover + values_tups: List[Tuple[str, str]] = [tuple(x.split("=")[:2]) for x in dirs] # type: ignore + values_dics: Dict[str, str] = dict(values_tups) + return values_dics + + def list_sampling(lst: List[Any], sampling: float) -> List[Any]: """Random List sampling.""" if sampling > 1.0 or sampling <= 0.0: # pragma: no cover diff --git a/awswrangler/s3.py b/awswrangler/s3.py index b13ccff63..d82df8567 100644 --- a/awswrangler/s3.py +++ b/awswrangler/s3.py @@ -677,6 +677,8 @@ def to_csv( # pylint: disable=too-many-arguments raise exceptions.InvalidArgumentCombination("Please, pass dataset=True to be able to use partition_cols.") if mode is not None: raise exceptions.InvalidArgumentCombination("Please pass dataset=True to be able to use mode.") + if columns_comments: + raise exceptions.InvalidArgumentCombination("Please pass dataset=True to be able to use columns_comments.") if any(arg is not None for arg in (database, table, description, parameters)): raise exceptions.InvalidArgumentCombination( "Please pass dataset=True to be able to use any one of these " @@ -887,14 +889,16 @@ def _to_text( raise exceptions.EmptyDataFrame() if fs is None: fs = _utils.get_fs(session=boto3_session, s3_additional_kwargs=s3_additional_kwargs) - with fs.open(path, "w") as f: + encoding: Optional[str] = pandas_kwargs.get("encoding", None) + newline: Optional[str] = pandas_kwargs.get("line_terminator", None) + with fs.open(path=path, mode="w", encoding=encoding, newline=newline) as f: if file_format == "csv": df.to_csv(f, **pandas_kwargs) elif file_format == "json": df.to_json(f, 
**pandas_kwargs) -def to_parquet( # pylint: disable=too-many-arguments +def to_parquet( # pylint: disable=too-many-arguments,too-many-locals df: pd.DataFrame, path: str, index: bool = False, @@ -1153,9 +1157,14 @@ def to_parquet( # pylint: disable=too-many-arguments "arguments: database, table, description, parameters, " "columns_comments." ) + df = _data_types.cast_pandas_with_athena_types(df=df, dtype=dtype) + schema: pa.Schema = _data_types.pyarrow_schema_from_pandas( + df=df, index=index, ignore_cols=partition_cols, dtype=dtype + ) + _logger.debug("schema: \n%s", schema) paths = [ _to_parquet_file( - df=df, path=path, schema=None, index=index, compression=compression, cpus=cpus, fs=fs, dtype=dtype + df=df, path=path, schema=schema, index=index, compression=compression, cpus=cpus, fs=fs, dtype=dtype ) ] else: @@ -1314,6 +1323,7 @@ def read_csv( boto3_session: Optional[boto3.Session] = None, s3_additional_kwargs: Optional[Dict[str, str]] = None, chunksize: Optional[int] = None, + dataset: bool = False, **pandas_kwargs, ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: """Read CSV file(s) from from a received S3 prefix or list of S3 objects paths. @@ -1340,6 +1350,8 @@ def read_csv( https://s3fs.readthedocs.io/en/latest/#serverside-encryption chunksize: int, optional If specified, return an generator where chunksize is the number of rows to include in each chunk. + dataset: bool + If `True` read a CSV dataset instead of simple file(s) loading all the related partitions as columns. pandas_kwargs: keyword arguments forwarded to pandas.read_csv(). https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html @@ -1387,6 +1399,7 @@ def read_csv( boto3_session=boto3_session, s3_additional_kwargs=s3_additional_kwargs, chunksize=chunksize, + dataset=dataset, **pandas_kwargs, ) @@ -1397,6 +1410,7 @@ def read_fwf( boto3_session: Optional[boto3.Session] = None, s3_additional_kwargs: Optional[Dict[str, str]] = None, chunksize: Optional[int] = None, + dataset: bool = False, **pandas_kwargs, ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: """Read fixed-width formatted file(s) from from a received S3 prefix or list of S3 objects paths. @@ -1423,6 +1437,8 @@ def read_fwf( https://s3fs.readthedocs.io/en/latest/#serverside-encryption chunksize: int, optional If specified, return an generator where chunksize is the number of rows to include in each chunk. + dataset: bool + If `True` read a FWF dataset instead of simple file(s) loading all the related partitions as columns. pandas_kwargs: keyword arguments forwarded to pandas.read_fwf(). https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_fwf.html @@ -1470,6 +1486,7 @@ def read_fwf( boto3_session=boto3_session, s3_additional_kwargs=s3_additional_kwargs, chunksize=chunksize, + dataset=dataset, **pandas_kwargs, ) @@ -1480,6 +1497,7 @@ def read_json( boto3_session: Optional[boto3.Session] = None, s3_additional_kwargs: Optional[Dict[str, str]] = None, chunksize: Optional[int] = None, + dataset: bool = False, **pandas_kwargs, ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: """Read JSON file(s) from from a received S3 prefix or list of S3 objects paths. @@ -1506,6 +1524,9 @@ def read_json( https://s3fs.readthedocs.io/en/latest/#serverside-encryption chunksize: int, optional If specified, return an generator where chunksize is the number of rows to include in each chunk. + dataset: bool + If `True` read a JSON dataset instead of simple file(s) loading all the related partitions as columns. 
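
The `dataset` flag added to `read_csv`, `read_fwf`, and `read_json` above makes the readers treat the path as a Hive-partitioned prefix: each object's key is run through the new `extract_partitions_from_path` helper and the recovered partition values are appended to the resulting DataFrame as columns. A minimal sketch, assuming a hypothetical bucket that already holds a partitioned CSV layout:

```python
import awswrangler as wr

# Hypothetical, pre-existing layout (bucket and keys are illustrative only):
#   s3://my-bucket/csv_dataset/year=2019/part-0.csv
#   s3://my-bucket/csv_dataset/year=2020/part-0.csv

# dataset=True loads every CSV under the prefix and adds a "year" column
# parsed from the partition directories (e.g. {"year": "2020"}).
df = wr.s3.read_csv(path="s3://my-bucket/csv_dataset/", dataset=True)

# For JSON datasets the same flag also implies lines=True unless overridden:
# df = wr.s3.read_json(path="s3://my-bucket/json_dataset/", dataset=True)
print(df["year"].unique())
```
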
+ If `True`, the `lines=True` will be assumed by default. pandas_kwargs: keyword arguments forwarded to pandas.read_json(). https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_json.html @@ -1546,6 +1567,8 @@ def read_json( >>> print(df) # 100 lines Pandas DataFrame """ + if (dataset is True) and ("lines" not in pandas_kwargs): + pandas_kwargs["lines"] = True return _read_text( parser_func=pd.read_json, path=path, @@ -1553,6 +1576,7 @@ def read_json( boto3_session=boto3_session, s3_additional_kwargs=s3_additional_kwargs, chunksize=chunksize, + dataset=dataset, **pandas_kwargs, ) @@ -1564,11 +1588,18 @@ def _read_text( boto3_session: Optional[boto3.Session] = None, s3_additional_kwargs: Optional[Dict[str, str]] = None, chunksize: Optional[int] = None, + dataset: bool = False, **pandas_kwargs, ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: if "iterator" in pandas_kwargs: raise exceptions.InvalidArgument("Please, use chunksize instead of iterator.") session: boto3.Session = _utils.ensure_session(session=boto3_session) + if (dataset is True) and (not isinstance(path, str)): # pragma: no cover + raise exceptions.InvalidArgument("The path argument must be a string Amazon S3 prefix if dataset=True.") + if dataset is True: + path_root: str = str(path) + else: + path_root = "" paths: List[str] = _path2list(path=path, boto3_session=session) _logger.debug("paths:\n%s", paths) if chunksize is not None: @@ -1579,6 +1610,8 @@ def _read_text( chunksize=chunksize, pandas_args=pandas_kwargs, s3_additional_kwargs=s3_additional_kwargs, + dataset=dataset, + path_root=path_root, ) return dfs if (use_threads is False) or (boto3_session is not None): @@ -1590,6 +1623,8 @@ def _read_text( boto3_session=session, pandas_args=pandas_kwargs, s3_additional_kwargs=s3_additional_kwargs, + dataset=dataset, + path_root=path_root, ) for p in paths ], @@ -1603,10 +1638,12 @@ def _read_text( objs=executor.map( _read_text_full, repeat(parser_func), + repeat(path_root), paths, repeat(None), # Boto3.Session repeat(pandas_kwargs), repeat(s3_additional_kwargs), + repeat(dataset), ), ignore_index=True, sort=False, @@ -1616,37 +1653,54 @@ def _read_text( def _read_text_chunksize( parser_func: Callable, + path_root: str, paths: List[str], boto3_session: boto3.Session, chunksize: int, pandas_args: Dict[str, Any], s3_additional_kwargs: Optional[Dict[str, str]] = None, + dataset: bool = False, ) -> Iterator[pd.DataFrame]: fs: s3fs.S3FileSystem = _utils.get_fs(session=boto3_session, s3_additional_kwargs=s3_additional_kwargs) for path in paths: _logger.debug("path: %s", path) + partitions: Dict[str, Any] = {} + if dataset is True: + partitions = _utils.extract_partitions_from_path(path_root=path_root, path=path) if pandas_args.get("compression", "infer") == "infer": pandas_args["compression"] = infer_compression(path, compression="infer") mode: str = "r" if pandas_args.get("compression") is None else "rb" with fs.open(path, mode) as f: reader: pandas.io.parsers.TextFileReader = parser_func(f, chunksize=chunksize, **pandas_args) for df in reader: + if dataset is True: + for column_name, value in partitions.items(): + df[column_name] = value yield df def _read_text_full( parser_func: Callable, + path_root: str, path: str, boto3_session: boto3.Session, pandas_args: Dict[str, Any], s3_additional_kwargs: Optional[Dict[str, str]] = None, + dataset: bool = False, ) -> pd.DataFrame: fs: s3fs.S3FileSystem = _utils.get_fs(session=boto3_session, s3_additional_kwargs=s3_additional_kwargs) if pandas_args.get("compression", 
"infer") == "infer": pandas_args["compression"] = infer_compression(path, compression="infer") mode: str = "r" if pandas_args.get("compression") is None else "rb" - with fs.open(path, mode) as f: - return parser_func(f, **pandas_args) + encoding: Optional[str] = pandas_args.get("encoding", None) + newline: Optional[str] = pandas_args.get("lineterminator", None) + with fs.open(path=path, mode=mode, encoding=encoding, newline=newline) as f: + df: pd.DataFrame = parser_func(f, **pandas_args) + if dataset is True: + partitions: Dict[str, Any] = _utils.extract_partitions_from_path(path_root=path_root, path=path) + for column_name, value in partitions.items(): + df[column_name] = value + return df def _read_parquet_init( @@ -1660,14 +1714,15 @@ def _read_parquet_init( s3_additional_kwargs: Optional[Dict[str, str]] = None, ) -> pyarrow.parquet.ParquetDataset: """Encapsulate all initialization before the use of the pyarrow.parquet.ParquetDataset.""" + session: boto3.Session = _utils.ensure_session(session=boto3_session) if dataset is False: - path_or_paths: Union[str, List[str]] = _path2list(path=path, boto3_session=boto3_session) + path_or_paths: Union[str, List[str]] = _path2list(path=path, boto3_session=session) elif isinstance(path, str): path_or_paths = path[:-1] if path.endswith("/") else path else: path_or_paths = path _logger.debug("path_or_paths: %s", path_or_paths) - fs: s3fs.S3FileSystem = _utils.get_fs(session=boto3_session, s3_additional_kwargs=s3_additional_kwargs) + fs: s3fs.S3FileSystem = _utils.get_fs(session=session, s3_additional_kwargs=s3_additional_kwargs) cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) data: pyarrow.parquet.ParquetDataset = pyarrow.parquet.ParquetDataset( path_or_paths=path_or_paths, @@ -1677,6 +1732,7 @@ def _read_parquet_init( read_dictionary=categories, validate_schema=validate_schema, split_row_groups=False, + use_legacy_dataset=True, ) return data @@ -1723,7 +1779,8 @@ def read_parquet( path : Union[str, List[str]] S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). filters: Union[List[Tuple], List[List[Tuple]]], optional - List of filters to apply, like ``[[('x', '=', 0), ...], ...]``. + List of filters to apply on PARTITION columns (PUSH-DOWN filter), like ``[[('x', '=', 0), ...], ...]``. + Ignored if `dataset=False`. columns : List[str], optional Names of columns to read from the file(s). 
validate_schema: @@ -1994,7 +2051,7 @@ def _read_parquet_metadata( partitions_types: Optional[Dict[str, str]] = None partitions_values: Optional[Dict[str, List[str]]] = None if (dataset is True) and (_path is not None): - partitions_types, partitions_values = _utils.extract_partitions_from_paths(path=_path, paths=paths) + partitions_types, partitions_values = _utils.extract_partitions_metadata_from_paths(path=_path, paths=paths) if dtype: for k, v in dtype.items(): if columns_types and k in columns_types: diff --git a/requirements-dev.txt b/requirements-dev.txt index 4850658c4..e0abc8e4a 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -19,4 +19,4 @@ wheel~=0.34.2 sphinx~=3.0.4 sphinx_bootstrap_theme~=0.7.1 moto~=1.3.14 -jupyterlab~=2.1.3 \ No newline at end of file +jupyterlab~=2.1.4 \ No newline at end of file diff --git a/testing/test_awswrangler/_utils.py b/testing/test_awswrangler/_utils.py index 40481c689..4375e7ab5 100644 --- a/testing/test_awswrangler/_utils.py +++ b/testing/test_awswrangler/_utils.py @@ -1,9 +1,13 @@ import random +import time from datetime import datetime from decimal import Decimal +import boto3 import pandas as pd +import awswrangler as wr + ts = lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S.%f") # noqa dt = lambda x: datetime.strptime(x, "%Y-%m-%d").date() # noqa @@ -407,3 +411,26 @@ def ensure_data_types_csv(df): def get_time_str_with_random_suffix(): time_str = datetime.utcnow().strftime("%Y%m%d%H%M%S%f") return f"{time_str}_{random.randrange(16**4):04x}" + + +def path_generator(bucket): + s3_path = f"s3://{bucket}/{get_time_str_with_random_suffix()}/" + print(f"S3 Path: {s3_path}") + time.sleep(1) + objs = wr.s3.list_objects(s3_path) + wr.s3.delete_objects(path=objs) + wr.s3.wait_objects_not_exist(objs) + yield s3_path + time.sleep(1) + objs = wr.s3.list_objects(s3_path) + wr.s3.delete_objects(path=objs) + wr.s3.wait_objects_not_exist(objs) + + +def extract_cloudformation_outputs(): + response = boto3.client("cloudformation").describe_stacks(StackName="aws-data-wrangler") + stack = [x for x in response.get("Stacks") if x["StackStatus"] in CFN_VALID_STATUS][0] + outputs = {} + for output in stack.get("Outputs"): + outputs[output.get("OutputKey")] = output.get("OutputValue") + return outputs diff --git a/testing/test_awswrangler/test_athena_projection.py b/testing/test_awswrangler/test_athena_projection.py deleted file mode 100644 index 66506ac37..000000000 --- a/testing/test_awswrangler/test_athena_projection.py +++ /dev/null @@ -1,170 +0,0 @@ -import logging -import time - -import boto3 -import pandas as pd -import pytest - -import awswrangler as wr - -from ._utils import CFN_VALID_STATUS, dt, get_time_str_with_random_suffix, ts - -logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s][%(name)s][%(funcName)s] %(message)s") -logging.getLogger("awswrangler").setLevel(logging.DEBUG) -logging.getLogger("botocore.credentials").setLevel(logging.CRITICAL) - - -@pytest.fixture(scope="module") -def cloudformation_outputs(): - response = boto3.client("cloudformation").describe_stacks(StackName="aws-data-wrangler") - stack = [x for x in response.get("Stacks") if x["StackStatus"] in CFN_VALID_STATUS][0] - outputs = {} - for output in stack.get("Outputs"): - outputs[output.get("OutputKey")] = output.get("OutputValue") - yield outputs - - -@pytest.fixture(scope="module") -def region(cloudformation_outputs): - yield cloudformation_outputs["Region"] - - -@pytest.fixture(scope="module") -def bucket(cloudformation_outputs): - yield 
cloudformation_outputs["BucketName"] - - -@pytest.fixture(scope="module") -def database(cloudformation_outputs): - yield cloudformation_outputs["GlueDatabaseName"] - - -@pytest.fixture(scope="module") -def external_schema(cloudformation_outputs, database): - region = cloudformation_outputs.get("Region") - sql = f""" - CREATE EXTERNAL SCHEMA IF NOT EXISTS aws_data_wrangler_external FROM data catalog - DATABASE '{database}' - IAM_ROLE '{cloudformation_outputs["RedshiftRole"]}' - REGION '{region}'; - """ - engine = wr.catalog.get_engine(connection="aws-data-wrangler-redshift") - with engine.connect() as con: - con.execute(sql) - yield "aws_data_wrangler_external" - - -@pytest.fixture(scope="function") -def path(bucket): - s3_path = f"s3://{bucket}/{get_time_str_with_random_suffix()}/" - print(f"S3 Path: {s3_path}") - time.sleep(1) - objs = wr.s3.list_objects(s3_path) - wr.s3.delete_objects(path=objs) - wr.s3.wait_objects_not_exist(objs) - yield s3_path - time.sleep(1) - objs = wr.s3.list_objects(s3_path) - wr.s3.delete_objects(path=objs) - wr.s3.wait_objects_not_exist(objs) - - -@pytest.fixture(scope="function") -def table(database): - name = f"tbl_{get_time_str_with_random_suffix()}" - print(f"Table name: {name}") - wr.catalog.delete_table_if_exists(database=database, table=name) - yield name - wr.catalog.delete_table_if_exists(database=database, table=name) - - -def test_to_parquet_projection_integer(database, table, path): - df = pd.DataFrame({"c0": [0, 1, 2], "c1": [0, 1, 2], "c2": [0, 100, 200], "c3": [0, 1, 2]}) - paths = wr.s3.to_parquet( - df=df, - path=path, - dataset=True, - database=database, - table=table, - partition_cols=["c1", "c2", "c3"], - regular_partitions=False, - projection_enabled=True, - projection_types={"c1": "integer", "c2": "integer", "c3": "integer"}, - projection_ranges={"c1": "0,2", "c2": "0,200", "c3": "0,2"}, - projection_intervals={"c2": "100"}, - projection_digits={"c3": "1"}, - )["paths"] - wr.s3.wait_objects_exist(paths=paths, use_threads=False) - df2 = wr.athena.read_sql_table(table, database) - assert df.shape == df2.shape - assert df.c0.sum() == df2.c0.sum() - assert df.c1.sum() == df2.c1.sum() - assert df.c2.sum() == df2.c2.sum() - assert df.c3.sum() == df2.c3.sum() - - -def test_to_parquet_projection_enum(database, table, path): - df = pd.DataFrame({"c0": [0, 1, 2], "c1": [1, 2, 3], "c2": ["foo", "boo", "bar"]}) - paths = wr.s3.to_parquet( - df=df, - path=path, - dataset=True, - database=database, - table=table, - partition_cols=["c1", "c2"], - regular_partitions=False, - projection_enabled=True, - projection_types={"c1": "enum", "c2": "enum"}, - projection_values={"c1": "1,2,3", "c2": "foo,boo,bar"}, - )["paths"] - wr.s3.wait_objects_exist(paths=paths, use_threads=False) - df2 = wr.athena.read_sql_table(table, database) - assert df.shape == df2.shape - assert df.c0.sum() == df2.c0.sum() - assert df.c1.sum() == df2.c1.sum() - - -def test_to_parquet_projection_date(database, table, path): - df = pd.DataFrame( - { - "c0": [0, 1, 2], - "c1": [dt("2020-01-01"), dt("2020-01-02"), dt("2020-01-03")], - "c2": [ts("2020-01-01 01:01:01.0"), ts("2020-01-01 01:01:02.0"), ts("2020-01-01 01:01:03.0")], - } - ) - paths = wr.s3.to_parquet( - df=df, - path=path, - dataset=True, - database=database, - table=table, - partition_cols=["c1", "c2"], - regular_partitions=False, - projection_enabled=True, - projection_types={"c1": "date", "c2": "date"}, - projection_ranges={"c1": "2020-01-01,2020-01-03", "c2": "2020-01-01 01:01:00,2020-01-01 01:01:03"}, - )["paths"] - 
wr.s3.wait_objects_exist(paths=paths, use_threads=False) - df2 = wr.athena.read_sql_table(table, database) - print(df2) - assert df.shape == df2.shape - assert df.c0.sum() == df2.c0.sum() - - -def test_to_parquet_projection_injected(database, table, path): - df = pd.DataFrame({"c0": [0, 1, 2], "c1": ["foo", "boo", "bar"], "c2": ["0", "1", "2"]}) - paths = wr.s3.to_parquet( - df=df, - path=path, - dataset=True, - database=database, - table=table, - partition_cols=["c1", "c2"], - regular_partitions=False, - projection_enabled=True, - projection_types={"c1": "injected", "c2": "injected"}, - )["paths"] - wr.s3.wait_objects_exist(paths=paths, use_threads=False) - df2 = wr.athena.read_sql_query(f"SELECT * FROM {table} WHERE c1='foo' AND c2='0'", database) - assert df2.shape == (1, 3) - assert df2.c0.iloc[0] == 0 diff --git a/testing/test_awswrangler/test_cloudwatch.py b/testing/test_awswrangler/test_cloudwatch.py index 6ac4e527a..592080510 100644 --- a/testing/test_awswrangler/test_cloudwatch.py +++ b/testing/test_awswrangler/test_cloudwatch.py @@ -7,7 +7,7 @@ import awswrangler as wr from awswrangler import exceptions -from ._utils import CFN_VALID_STATUS +from ._utils import extract_cloudformation_outputs logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s][%(name)s][%(funcName)s] %(message)s") logging.getLogger("awswrangler").setLevel(logging.DEBUG) @@ -16,12 +16,7 @@ @pytest.fixture(scope="module") def cloudformation_outputs(): - response = boto3.client("cloudformation").describe_stacks(StackName="aws-data-wrangler") - stack = [x for x in response.get("Stacks") if x["StackStatus"] in CFN_VALID_STATUS][0] - outputs = {} - for output in stack.get("Outputs"): - outputs[output.get("OutputKey")] = output.get("OutputValue") - yield outputs + yield extract_cloudformation_outputs() @pytest.fixture(scope="module") diff --git a/testing/test_awswrangler/test_data_lake.py b/testing/test_awswrangler/test_data_lake.py index bcd6e3f6d..19fb1ca19 100644 --- a/testing/test_awswrangler/test_data_lake.py +++ b/testing/test_awswrangler/test_data_lake.py @@ -1,11 +1,9 @@ import bz2 import datetime import gzip -import itertools import logging import lzma import math -import time from io import BytesIO, TextIOWrapper import boto3 @@ -14,9 +12,20 @@ import awswrangler as wr -from ._utils import (CFN_VALID_STATUS, ensure_data_types, ensure_data_types_category, ensure_data_types_csv, get_df, - get_df_cast, get_df_category, get_df_csv, get_df_list, get_query_long, - get_time_str_with_random_suffix) +from ._utils import ( + ensure_data_types, + ensure_data_types_category, + ensure_data_types_csv, + extract_cloudformation_outputs, + get_df, + get_df_cast, + get_df_category, + get_df_csv, + get_df_list, + get_query_long, + get_time_str_with_random_suffix, + path_generator, +) logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s][%(name)s][%(funcName)s] %(message)s") logging.getLogger("awswrangler").setLevel(logging.DEBUG) @@ -25,12 +34,7 @@ @pytest.fixture(scope="module") def cloudformation_outputs(): - response = boto3.client("cloudformation").describe_stacks(StackName="aws-data-wrangler") - stack = [x for x in response.get("Stacks") if x["StackStatus"] in CFN_VALID_STATUS][0] - outputs = {} - for output in stack.get("Outputs"): - outputs[output.get("OutputKey")] = output.get("OutputValue") - yield outputs + yield extract_cloudformation_outputs() @pytest.fixture(scope="module") @@ -161,21 +165,6 @@ def workgroup3(bucket, kms_key): yield wkg_name 
-@pytest.fixture(scope="function") -def path(bucket): - s3_path = f"s3://{bucket}/{get_time_str_with_random_suffix()}/" - print(f"S3 Path: {s3_path}") - time.sleep(1) - objs = wr.s3.list_objects(s3_path) - wr.s3.delete_objects(path=objs) - wr.s3.wait_objects_not_exist(objs) - yield s3_path - time.sleep(1) - objs = wr.s3.list_objects(s3_path) - wr.s3.delete_objects(path=objs) - wr.s3.wait_objects_not_exist(objs) - - @pytest.fixture(scope="function") def table(database): name = f"tbl_{get_time_str_with_random_suffix()}" @@ -185,21 +174,6 @@ def table(database): wr.catalog.delete_table_if_exists(database=database, table=name) -@pytest.fixture(scope="function") -def path2(bucket): - s3_path = f"s3://{bucket}/{get_time_str_with_random_suffix()}/" - print(f"S3 Path: {s3_path}") - time.sleep(1) - objs = wr.s3.list_objects(s3_path) - wr.s3.delete_objects(path=objs) - wr.s3.wait_objects_not_exist(objs) - yield s3_path - time.sleep(1) - objs = wr.s3.list_objects(s3_path) - wr.s3.delete_objects(path=objs) - wr.s3.wait_objects_not_exist(objs) - - @pytest.fixture(scope="function") def table2(database): name = f"tbl_{get_time_str_with_random_suffix()}" @@ -209,1976 +183,1815 @@ def table2(database): wr.catalog.delete_table_if_exists(database=database, table=name) -def test_athena_ctas(bucket, database, kms_key): - wr.s3.delete_objects(path=f"s3://{bucket}/test_athena_ctas/") - wr.s3.delete_objects(path=f"s3://{bucket}/test_athena_ctas_result/") - df = get_df_list() - columns_types, partitions_types = wr.catalog.extract_athena_types(df=df, partition_cols=["par0", "par1"]) - assert len(columns_types) == 16 - assert len(partitions_types) == 2 - with pytest.raises(wr.exceptions.InvalidArgumentValue): - wr.catalog.extract_athena_types(df=df, file_format="avro") +@pytest.fixture(scope="function") +def path(bucket): + yield from path_generator(bucket) + + +@pytest.fixture(scope="function") +def path2(bucket): + yield from path_generator(bucket) + + +@pytest.fixture(scope="function") +def path3(bucket): + yield from path_generator(bucket) + + +def test_to_parquet_modes(database, table, path, external_schema): + + # Round 1 - Warm up + df = pd.DataFrame({"c0": [0, None]}, dtype="Int64") paths = wr.s3.to_parquet( - df=get_df_list(), - path=f"s3://{bucket}/test_athena_ctas", - index=True, - use_threads=True, + df=df, + path=path, dataset=True, mode="overwrite", database=database, - table="test_athena_ctas", - partition_cols=["par0", "par1"], + table=table, + description="c0", + parameters={"num_cols": str(len(df.columns)), "num_rows": str(len(df.index))}, + columns_comments={"c0": "0"}, )["paths"] wr.s3.wait_objects_exist(paths=paths) - dirs = wr.s3.list_directories(path=f"s3://{bucket}/test_athena_ctas/") - for d in dirs: - assert d.startswith(f"s3://{bucket}/test_athena_ctas/par0=") - df = wr.s3.read_parquet_table(table="test_athena_ctas", database=database) - assert len(df.index) == 3 - ensure_data_types(df=df, has_list=True) - df = wr.athena.read_sql_table( - table="test_athena_ctas", - database=database, - ctas_approach=True, - encryption="SSE_KMS", - kms_key=kms_key, - s3_output=f"s3://{bucket}/test_athena_ctas_result", - keep_files=False, - ) - assert len(df.index) == 3 - ensure_data_types(df=df, has_list=True) - temp_table = "test_athena_ctas2" - s3_output = f"s3://{bucket}/s3_output/" - final_destination = f"{s3_output}{temp_table}/" + df2 = wr.athena.read_sql_table(table, database) + assert df.shape == df2.shape + assert df.c0.sum() == df2.c0.sum() + parameters = wr.catalog.get_table_parameters(database, 
table) + assert len(parameters) >= 5 + assert parameters["num_cols"] == str(len(df2.columns)) + assert parameters["num_rows"] == str(len(df2.index)) + assert wr.catalog.get_table_description(database, table) == "c0" + comments = wr.catalog.get_columns_comments(database, table) + assert len(comments) == len(df.columns) + assert comments["c0"] == "0" - # keep_files=False - wr.s3.delete_objects(path=s3_output) - dfs = wr.athena.read_sql_query( - sql="SELECT * FROM test_athena_ctas", + # Round 2 - Overwrite + df = pd.DataFrame({"c1": [None, 1, None]}, dtype="Int16") + paths = wr.s3.to_parquet( + df=df, + path=path, + dataset=True, + mode="overwrite", database=database, - ctas_approach=True, - chunksize=1, - keep_files=False, - ctas_temp_table_name=temp_table, - s3_output=s3_output, - ) - assert wr.catalog.does_table_exist(database=database, table=temp_table) is False - assert len(wr.s3.list_objects(path=s3_output)) > 2 - assert len(wr.s3.list_objects(path=final_destination)) > 0 - for df in dfs: - ensure_data_types(df=df, has_list=True) - assert len(wr.s3.list_objects(path=s3_output)) == 0 + table=table, + description="c1", + parameters={"num_cols": str(len(df.columns)), "num_rows": str(len(df.index))}, + columns_comments={"c1": "1"}, + )["paths"] + wr.s3.wait_objects_exist(paths=paths) + df2 = wr.athena.read_sql_table(table, database) + assert df.shape == df2.shape + assert df.c1.sum() == df2.c1.sum() + parameters = wr.catalog.get_table_parameters(database, table) + assert len(parameters) >= 5 + assert parameters["num_cols"] == str(len(df2.columns)) + assert parameters["num_rows"] == str(len(df2.index)) + assert wr.catalog.get_table_description(database, table) == "c1" + comments = wr.catalog.get_columns_comments(database, table) + assert len(comments) == len(df.columns) + assert comments["c1"] == "1" - # keep_files=True - wr.s3.delete_objects(path=s3_output) - dfs = wr.athena.read_sql_query( - sql="SELECT * FROM test_athena_ctas", + # Round 3 - Append + df = pd.DataFrame({"c1": [None, 2, None]}, dtype="Int8") + paths = wr.s3.to_parquet( + df=df, + path=path, + dataset=True, + mode="append", database=database, - ctas_approach=True, - chunksize=2, - keep_files=True, - ctas_temp_table_name=temp_table, - s3_output=s3_output, - ) - assert wr.catalog.does_table_exist(database=database, table=temp_table) is False - assert len(wr.s3.list_objects(path=s3_output)) > 2 - assert len(wr.s3.list_objects(path=final_destination)) > 0 - for df in dfs: - ensure_data_types(df=df, has_list=True) - assert len(wr.s3.list_objects(path=s3_output)) > 2 + table=table, + description="c1", + parameters={"num_cols": str(len(df.columns)), "num_rows": str(len(df.index) * 2)}, + columns_comments={"c1": "1"}, + )["paths"] + wr.s3.wait_objects_exist(paths=paths) + df2 = wr.athena.read_sql_table(table, database) + assert len(df.columns) == len(df2.columns) + assert len(df.index) * 2 == len(df2.index) + assert df.c1.sum() + 1 == df2.c1.sum() + parameters = wr.catalog.get_table_parameters(database, table) + assert len(parameters) >= 5 + assert parameters["num_cols"] == str(len(df2.columns)) + assert parameters["num_rows"] == str(len(df2.index)) + assert wr.catalog.get_table_description(database, table) == "c1" + comments = wr.catalog.get_columns_comments(database, table) + assert len(comments) == len(df.columns) + assert comments["c1"] == "1" - # Cleaning Up - wr.catalog.delete_table_if_exists(database=database, table="test_athena_ctas") - wr.s3.delete_objects(path=paths) - wr.s3.wait_objects_not_exist(paths=paths) - 
wr.s3.delete_objects(path=f"s3://{bucket}/test_athena_ctas_result/") + # Round 4 - Append + New Column + df = pd.DataFrame({"c2": ["a", None, "b"], "c1": [None, None, None]}) + paths = wr.s3.to_parquet( + df=df, + path=path, + dataset=True, + mode="append", + database=database, + table=table, + description="c1+c2", + parameters={"num_cols": "2", "num_rows": "9"}, + columns_comments={"c1": "1", "c2": "2"}, + )["paths"] + wr.s3.wait_objects_exist(paths=paths) + df2 = wr.athena.read_sql_table(table, database) + assert len(df2.columns) == 2 + assert len(df2.index) == 9 + assert df2.c1.sum() == 3 + parameters = wr.catalog.get_table_parameters(database, table) + assert len(parameters) >= 5 + assert parameters["num_cols"] == "2" + assert parameters["num_rows"] == "9" + assert wr.catalog.get_table_description(database, table) == "c1+c2" + comments = wr.catalog.get_columns_comments(database, table) + assert len(comments) == len(df.columns) + assert comments["c1"] == "1" + assert comments["c2"] == "2" + # Round 5 - Append + New Column + Wrong Types + df = pd.DataFrame({"c2": [1], "c3": [True], "c1": ["1"]}) + paths = wr.s3.to_parquet( + df=df, + path=path, + dataset=True, + mode="append", + database=database, + table=table, + description="c1+c2+c3", + parameters={"num_cols": "3", "num_rows": "10"}, + columns_comments={"c1": "1!", "c2": "2!", "c3": "3"}, + )["paths"] + wr.s3.wait_objects_exist(paths=paths) + df2 = wr.athena.read_sql_table(table, database) + assert len(df2.columns) == 3 + assert len(df2.index) == 10 + assert df2.c1.sum() == 4 + parameters = wr.catalog.get_table_parameters(database, table) + assert len(parameters) >= 5 + assert parameters["num_cols"] == "3" + assert parameters["num_rows"] == "10" + assert wr.catalog.get_table_description(database, table) == "c1+c2+c3" + comments = wr.catalog.get_columns_comments(database, table) + assert len(comments) == len(df.columns) + assert comments["c1"] == "1!" + assert comments["c2"] == "2!" 
+ assert comments["c3"] == "3" + engine = wr.catalog.get_engine("aws-data-wrangler-redshift") + df3 = wr.db.read_sql_table(con=engine, table=table, schema=external_schema) + assert len(df3.columns) == 3 + assert len(df3.index) == 10 + assert df3.c1.sum() == 4 -def test_athena(path, database, kms_key, workgroup0, workgroup1): - wr.catalog.delete_table_if_exists(database=database, table="__test_athena") + # Round 6 - Overwrite Partitioned + df = pd.DataFrame({"c0": ["foo", None], "c1": [0, 1]}) paths = wr.s3.to_parquet( - df=get_df(), + df=df, path=path, - index=True, - use_threads=True, dataset=True, mode="overwrite", database=database, - table="__test_athena", - partition_cols=["par0", "par1"], + table=table, + partition_cols=["c1"], + description="c0+c1", + parameters={"num_cols": "2", "num_rows": "2"}, + columns_comments={"c0": "zero", "c1": "one"}, )["paths"] - wr.s3.wait_objects_exist(paths=paths, use_threads=False) - dfs = wr.athena.read_sql_query( - sql="SELECT * FROM __test_athena", - database=database, - ctas_approach=False, - chunksize=1, - encryption="SSE_KMS", - kms_key=kms_key, - workgroup=workgroup0, - keep_files=False, - ) - for df2 in dfs: - print(df2) - ensure_data_types(df=df2) - df = wr.athena.read_sql_query( - sql="SELECT * FROM __test_athena", - database=database, - ctas_approach=False, - workgroup=workgroup1, - keep_files=False, - ) - assert len(df.index) == 3 - ensure_data_types(df=df) - wr.athena.repair_table(table="__test_athena", database=database) - wr.catalog.delete_table_if_exists(database=database, table="__test_athena") - + wr.s3.wait_objects_exist(paths=paths) + df2 = wr.athena.read_sql_table(table, database) + assert df.shape == df2.shape + assert df.c1.sum() == df2.c1.sum() + parameters = wr.catalog.get_table_parameters(database, table) + assert len(parameters) >= 5 + assert parameters["num_cols"] == "2" + assert parameters["num_rows"] == "2" + assert wr.catalog.get_table_description(database, table) == "c0+c1" + comments = wr.catalog.get_columns_comments(database, table) + assert len(comments) == len(df.columns) + assert comments["c0"] == "zero" + assert comments["c1"] == "one" -def test_csv(bucket): - session = boto3.Session() - df = pd.DataFrame({"id": [1, 2, 3]}) - path0 = f"s3://{bucket}/test_csv0.csv" - path1 = f"s3://{bucket}/test_csv1.csv" - path2 = f"s3://{bucket}/test_csv2.csv" - wr.s3.to_csv(df=df, path=path0, index=False) - wr.s3.wait_objects_exist(paths=[path0]) - assert wr.s3.does_object_exist(path=path0) is True - assert wr.s3.size_objects(path=[path0], use_threads=False)[path0] == 9 - assert wr.s3.size_objects(path=[path0], use_threads=True)[path0] == 9 - wr.s3.to_csv(df=df, path=path1, index=False, boto3_session=None) - wr.s3.to_csv(df=df, path=path2, index=False, boto3_session=session) - assert df.equals(wr.s3.read_csv(path=path0, use_threads=False)) - assert df.equals(wr.s3.read_csv(path=path0, use_threads=True)) - assert df.equals(wr.s3.read_csv(path=path0, use_threads=False, boto3_session=session)) - assert df.equals(wr.s3.read_csv(path=path0, use_threads=True, boto3_session=session)) - paths = [path0, path1, path2] - df2 = pd.concat(objs=[df, df, df], sort=False, ignore_index=True) - assert df2.equals(wr.s3.read_csv(path=paths, use_threads=False)) - assert df2.equals(wr.s3.read_csv(path=paths, use_threads=True)) - assert df2.equals(wr.s3.read_csv(path=paths, use_threads=False, boto3_session=session)) - assert df2.equals(wr.s3.read_csv(path=paths, use_threads=True, boto3_session=session)) - with 
pytest.raises(wr.exceptions.InvalidArgumentType): - wr.s3.read_csv(path=1) - with pytest.raises(wr.exceptions.InvalidArgument): - wr.s3.read_csv(path=paths, iterator=True) - wr.s3.delete_objects(path=paths, use_threads=False) - wr.s3.wait_objects_not_exist(paths=paths, use_threads=False) - - -def test_json(bucket): - df0 = pd.DataFrame({"id": [1, 2, 3]}) - path0 = f"s3://{bucket}/test_json0.json" - path1 = f"s3://{bucket}/test_json1.json" - wr.s3.to_json(df=df0, path=path0) - wr.s3.to_json(df=df0, path=path1) - wr.s3.wait_objects_exist(paths=[path0, path1]) - assert df0.equals(wr.s3.read_json(path=path0, use_threads=False)) - df1 = pd.concat(objs=[df0, df0], sort=False, ignore_index=True) - assert df1.equals(wr.s3.read_json(path=[path0, path1], use_threads=True)) - wr.s3.delete_objects(path=[path0, path1], use_threads=False) - - -def test_fwf(path): - text = "1 Herfelingen27-12-18\n2 Lambusart14-06-18\n3Spormaggiore15-04-18" - client_s3 = boto3.client("s3") - path0 = f"{path}/0.txt" - bucket, key = wr._utils.parse_path(path0) - client_s3.put_object(Body=text, Bucket=bucket, Key=key) - path1 = f"{path}/1.txt" - bucket, key = wr._utils.parse_path(path1) - client_s3.put_object(Body=text, Bucket=bucket, Key=key) - wr.s3.wait_objects_exist(paths=[path0, path1]) - df = wr.s3.read_fwf(path=path0, use_threads=False, widths=[1, 12, 8], names=["id", "name", "date"]) - assert len(df.index) == 3 - assert len(df.columns) == 3 - df = wr.s3.read_fwf(path=[path0, path1], use_threads=True, widths=[1, 12, 8], names=["id", "name", "date"]) - assert len(df.index) == 6 - assert len(df.columns) == 3 - - -def test_parquet(bucket): - wr.s3.delete_objects(path=f"s3://{bucket}/test_parquet/") - df_file = pd.DataFrame({"id": [1, 2, 3]}) - path_file = f"s3://{bucket}/test_parquet/test_parquet_file.parquet" - df_dataset = pd.DataFrame({"id": [1, 2, 3], "partition": ["A", "A", "B"]}) - df_dataset["partition"] = df_dataset["partition"].astype("category") - path_dataset = f"s3://{bucket}/test_parquet/test_parquet_dataset" - with pytest.raises(wr.exceptions.InvalidArgumentCombination): - wr.s3.to_parquet(df=df_file, path=path_file, mode="append") - with pytest.raises(wr.exceptions.InvalidCompression): - wr.s3.to_parquet(df=df_file, path=path_file, compression="WRONG") - with pytest.raises(wr.exceptions.InvalidArgumentCombination): - wr.s3.to_parquet(df=df_dataset, path=path_dataset, partition_cols=["col2"]) - with pytest.raises(wr.exceptions.InvalidArgumentCombination): - wr.s3.to_parquet(df=df_dataset, path=path_dataset, description="foo") - with pytest.raises(wr.exceptions.InvalidArgumentValue): - wr.s3.to_parquet(df=df_dataset, path=path_dataset, partition_cols=["col2"], dataset=True, mode="WRONG") - paths = wr.s3.to_parquet(df=df_file, path=path_file)["paths"] - wr.s3.wait_objects_exist(paths=paths) - assert len(wr.s3.read_parquet(path=path_file, use_threads=True, boto3_session=None).index) == 3 - assert len(wr.s3.read_parquet(path=[path_file], use_threads=False, boto3_session=boto3.Session()).index) == 3 - paths = wr.s3.to_parquet(df=df_dataset, path=path_dataset, dataset=True)["paths"] - wr.s3.wait_objects_exist(paths=paths) - assert len(wr.s3.read_parquet(path=paths, dataset=True).index) == 3 - assert len(wr.s3.read_parquet(path=path_dataset, use_threads=True, boto3_session=boto3.Session()).index) == 3 - dataset_paths = wr.s3.to_parquet( - df=df_dataset, path=path_dataset, dataset=True, partition_cols=["partition"], mode="overwrite" + # Round 7 - Overwrite Partitions + df = pd.DataFrame({"c0": [None, None], "c1": 
[0, 2]}) + paths = wr.s3.to_parquet( + df=df, + path=path, + dataset=True, + mode="overwrite_partitions", + database=database, + table=table, + partition_cols=["c1"], + description="c0+c1", + parameters={"num_cols": "2", "num_rows": "3"}, + columns_comments={"c0": "zero", "c1": "one"}, )["paths"] - wr.s3.wait_objects_exist(paths=dataset_paths) - assert len(wr.s3.read_parquet(path=path_dataset, use_threads=True, boto3_session=None).index) == 3 - assert len(wr.s3.read_parquet(path=dataset_paths, use_threads=True).index) == 3 - assert len(wr.s3.read_parquet(path=path_dataset, dataset=True, use_threads=True).index) == 3 - wr.s3.to_parquet(df=df_dataset, path=path_dataset, dataset=True, partition_cols=["partition"], mode="overwrite") - wr.s3.to_parquet( - df=df_dataset, path=path_dataset, dataset=True, partition_cols=["partition"], mode="overwrite_partitions" - ) - wr.s3.delete_objects(path=f"s3://{bucket}/test_parquet/") - + wr.s3.wait_objects_exist(paths=paths) + df2 = wr.athena.read_sql_table(table, database) + assert len(df2.columns) == 2 + assert len(df2.index) == 3 + assert df2.c1.sum() == 3 + parameters = wr.catalog.get_table_parameters(database, table) + assert len(parameters) >= 5 + assert parameters["num_cols"] == "2" + assert parameters["num_rows"] == "3" + assert wr.catalog.get_table_description(database, table) == "c0+c1" + comments = wr.catalog.get_columns_comments(database, table) + assert len(comments) == len(df.columns) + assert comments["c0"] == "zero" + assert comments["c1"] == "one" -def test_parquet_catalog(bucket, database): - with pytest.raises(wr.exceptions.UndetectedType): - wr.s3.to_parquet( - df=pd.DataFrame({"A": [None]}), - path=f"s3://{bucket}/test_parquet_catalog", - dataset=True, - database=database, - table="test_parquet_catalog", - ) - df = get_df_list() - with pytest.raises(wr.exceptions.InvalidArgumentCombination): - wr.s3.to_parquet( - df=df, - path=f"s3://{bucket}/test_parquet_catalog", - use_threads=True, - dataset=False, - mode="overwrite", - database=database, - table="test_parquet_catalog", - ) - with pytest.raises(wr.exceptions.InvalidArgumentCombination): - wr.s3.to_parquet( - df=df, - path=f"s3://{bucket}/test_parquet_catalog", - use_threads=True, - dataset=False, - table="test_parquet_catalog", - ) - with pytest.raises(wr.exceptions.InvalidArgumentCombination): - wr.s3.to_parquet( - df=df, - path=f"s3://{bucket}/test_parquet_catalog", - use_threads=True, - dataset=True, - mode="overwrite", - database=database, - ) - wr.s3.to_parquet( + # Round 8 - Overwrite Partitions + New Column + Wrong Type + df = pd.DataFrame({"c0": [1, 2], "c1": ["1", "3"], "c2": [True, False]}) + paths = wr.s3.to_parquet( df=df, - path=f"s3://{bucket}/test_parquet_catalog", - use_threads=True, + path=path, dataset=True, - mode="overwrite", + mode="overwrite_partitions", database=database, - table="test_parquet_catalog", - ) - wr.s3.to_parquet( - df=df, - path=f"s3://{bucket}/test_parquet_catalog2", - index=True, - use_threads=True, + table=table, + partition_cols=["c1"], + description="c0+c1+c2", + parameters={"num_cols": "3", "num_rows": "4"}, + columns_comments={"c0": "zero", "c1": "one", "c2": "two"}, + )["paths"] + wr.s3.wait_objects_exist(paths=paths) + df2 = wr.athena.read_sql_table(table, database) + assert len(df2.columns) == 3 + assert len(df2.index) == 4 + assert df2.c1.sum() == 6 + parameters = wr.catalog.get_table_parameters(database, table) + assert len(parameters) >= 5 + assert parameters["num_cols"] == "3" + assert parameters["num_rows"] == "4" + assert 
wr.catalog.get_table_description(database, table) == "c0+c1+c2" + comments = wr.catalog.get_columns_comments(database, table) + assert len(comments) == len(df.columns) + assert comments["c0"] == "zero" + assert comments["c1"] == "one" + assert comments["c2"] == "two" + engine = wr.catalog.get_engine("aws-data-wrangler-redshift") + df3 = wr.db.read_sql_table(con=engine, table=table, schema=external_schema) + assert len(df3.columns) == 3 + assert len(df3.index) == 4 + assert df3.c1.sum() == 6 + + +def test_store_parquet_metadata_modes(database, table, path, external_schema): + + # Round 1 - Warm up + df = pd.DataFrame({"c0": [0, None]}, dtype="Int64") + paths = wr.s3.to_parquet(df=df, path=path, dataset=True, mode="overwrite")["paths"] + wr.s3.wait_objects_exist(paths=paths) + wr.s3.store_parquet_metadata( + path=path, dataset=True, mode="overwrite", database=database, - table="test_parquet_catalog2", - partition_cols=["iint8", "iint16"], - ) - columns_types, partitions_types = wr.s3.read_parquet_metadata( - path=f"s3://{bucket}/test_parquet_catalog2", dataset=True - ) - assert len(columns_types) == 17 - assert len(partitions_types) == 2 - columns_types, partitions_types, partitions_values = wr.s3.store_parquet_metadata( - path=f"s3://{bucket}/test_parquet_catalog2", database=database, table="test_parquet_catalog2", dataset=True + table=table, + description="c0", + parameters={"num_cols": str(len(df.columns)), "num_rows": str(len(df.index))}, + columns_comments={"c0": "0"}, ) - assert len(columns_types) == 17 - assert len(partitions_types) == 2 - assert len(partitions_values) == 2 - wr.s3.delete_objects(path=f"s3://{bucket}/test_parquet_catalog/") - wr.s3.delete_objects(path=f"s3://{bucket}/test_parquet_catalog2/") - assert wr.catalog.delete_table_if_exists(database=database, table="test_parquet_catalog") is True - assert wr.catalog.delete_table_if_exists(database=database, table="test_parquet_catalog2") is True - + df2 = wr.athena.read_sql_table(table, database) + assert df.shape == df2.shape + assert df.c0.sum() == df2.c0.sum() + parameters = wr.catalog.get_table_parameters(database, table) + assert len(parameters) >= 5 + assert parameters["num_cols"] == str(len(df2.columns)) + assert parameters["num_rows"] == str(len(df2.index)) + assert wr.catalog.get_table_description(database, table) == "c0" + comments = wr.catalog.get_columns_comments(database, table) + assert len(comments) == len(df.columns) + assert comments["c0"] == "0" -def test_parquet_catalog_duplicated(bucket, database): - path = f"s3://{bucket}/test_parquet_catalog_dedup/" - df = pd.DataFrame({"A": [1], "a": [1]}) - wr.s3.to_parquet( - df=df, + # Round 2 - Overwrite + df = pd.DataFrame({"c1": [None, 1, None]}, dtype="Int16") + paths = wr.s3.to_parquet(df=df, path=path, dataset=True, mode="overwrite")["paths"] + wr.s3.wait_objects_exist(paths=paths) + wr.s3.store_parquet_metadata( path=path, - index=False, dataset=True, mode="overwrite", database=database, - table="test_parquet_catalog_dedup", + table=table, + description="c1", + parameters={"num_cols": str(len(df.columns)), "num_rows": str(len(df.index))}, + columns_comments={"c1": "1"}, ) - df = wr.s3.read_parquet(path=path) - assert len(df.index) == 1 - assert len(df.columns) == 1 - wr.s3.delete_objects(path=path) - assert wr.catalog.delete_table_if_exists(database=database, table="test_parquet_catalog_dedup") is True - + df2 = wr.athena.read_sql_table(table, database) + assert df.shape == df2.shape + assert df.c1.sum() == df2.c1.sum() + parameters = 
wr.catalog.get_table_parameters(database, table) + assert len(parameters) >= 5 + assert parameters["num_cols"] == str(len(df2.columns)) + assert parameters["num_rows"] == str(len(df2.index)) + assert wr.catalog.get_table_description(database, table) == "c1" + comments = wr.catalog.get_columns_comments(database, table) + assert len(comments) == len(df.columns) + assert comments["c1"] == "1" -def test_parquet_catalog_casting(bucket, database): - path = f"s3://{bucket}/test_parquet_catalog_casting/" - paths = wr.s3.to_parquet( - df=get_df_cast(), + # Round 3 - Append + df = pd.DataFrame({"c1": [None, 2, None]}, dtype="Int16") + paths = wr.s3.to_parquet(df=df, path=path, dataset=True, mode="append")["paths"] + wr.s3.wait_objects_exist(paths=paths) + wr.s3.store_parquet_metadata( path=path, - index=False, dataset=True, - mode="overwrite", + mode="append", database=database, - table="__test_parquet_catalog_casting", - dtype={ - "iint8": "tinyint", - "iint16": "smallint", - "iint32": "int", - "iint64": "bigint", - "float": "float", - "double": "double", - "decimal": "decimal(3,2)", - "string": "string", - "date": "date", - "timestamp": "timestamp", - "bool": "boolean", - "binary": "binary", - "category": "double", - "par0": "bigint", - "par1": "string", - }, - )["paths"] - wr.s3.wait_objects_exist(paths=paths) - df = wr.s3.read_parquet(path=path) - assert len(df.index) == 3 - assert len(df.columns) == 15 - ensure_data_types(df=df, has_list=False) - df = wr.athena.read_sql_table(table="__test_parquet_catalog_casting", database=database, ctas_approach=True) - assert len(df.index) == 3 - assert len(df.columns) == 15 - ensure_data_types(df=df, has_list=False) - df = wr.athena.read_sql_table(table="__test_parquet_catalog_casting", database=database, ctas_approach=False) - assert len(df.index) == 3 - assert len(df.columns) == 15 - ensure_data_types(df=df, has_list=False) - wr.s3.delete_objects(path=path) - assert wr.catalog.delete_table_if_exists(database=database, table="__test_parquet_catalog_casting") is True + table=table, + description="c1", + parameters={"num_cols": str(len(df.columns)), "num_rows": str(len(df.index) * 2)}, + columns_comments={"c1": "1"}, + ) + df2 = wr.athena.read_sql_table(table, database) + assert len(df.columns) == len(df2.columns) + assert len(df.index) * 2 == len(df2.index) + assert df.c1.sum() + 1 == df2.c1.sum() + parameters = wr.catalog.get_table_parameters(database, table) + assert len(parameters) >= 5 + assert parameters["num_cols"] == str(len(df2.columns)) + assert parameters["num_rows"] == str(len(df2.index)) + assert wr.catalog.get_table_description(database, table) == "c1" + comments = wr.catalog.get_columns_comments(database, table) + assert len(comments) == len(df.columns) + assert comments["c1"] == "1" + # Round 4 - Append + New Column + df = pd.DataFrame({"c2": ["a", None, "b"], "c1": [None, 1, None]}) + df["c1"] = df["c1"].astype("Int16") + paths = wr.s3.to_parquet(df=df, path=path, dataset=True, mode="append")["paths"] + wr.s3.wait_objects_exist(paths=paths) + wr.s3.store_parquet_metadata( + path=path, + dataset=True, + mode="append", + database=database, + table=table, + description="c1+c2", + parameters={"num_cols": "2", "num_rows": "9"}, + columns_comments={"c1": "1", "c2": "2"}, + ) + df2 = wr.athena.read_sql_table(table, database) + assert len(df2.columns) == 2 + assert len(df2.index) == 9 + assert df2.c1.sum() == 4 + parameters = wr.catalog.get_table_parameters(database, table) + assert len(parameters) >= 5 + assert parameters["num_cols"] == "2" + 
+    assert parameters["num_rows"] == "9"
+    assert wr.catalog.get_table_description(database, table) == "c1+c2"
+    comments = wr.catalog.get_columns_comments(database, table)
+    assert len(comments) == len(df.columns)
+    assert comments["c1"] == "1"
+    assert comments["c2"] == "2"
-def test_catalog(path, database, table):
-    account_id = boto3.client("sts").get_caller_identity().get("Account")
-    assert wr.catalog.does_table_exist(database=database, table=table) is False
-    wr.catalog.create_parquet_table(
+    # Round 5 - Overwrite Partitioned
+    df = pd.DataFrame({"c0": ["foo", None], "c1": [0, 1]})
+    paths = wr.s3.to_parquet(df=df, path=path, dataset=True, mode="overwrite", partition_cols=["c1"])["paths"]
+    wr.s3.wait_objects_exist(paths=paths)
+    wr.s3.store_parquet_metadata(
+        path=path,
+        dataset=True,
+        mode="overwrite",
         database=database,
         table=table,
+        description="c0+c1",
+        parameters={"num_cols": "2", "num_rows": "2"},
+        columns_comments={"c0": "zero", "c1": "one"},
+    )
+    df2 = wr.athena.read_sql_table(table, database)
+    assert df.shape == df2.shape
+    assert df.c1.sum() == df2.c1.astype(int).sum()
+    parameters = wr.catalog.get_table_parameters(database, table)
+    assert len(parameters) >= 5
+    assert parameters["num_cols"] == "2"
+    assert parameters["num_rows"] == "2"
+    assert wr.catalog.get_table_description(database, table) == "c0+c1"
+    comments = wr.catalog.get_columns_comments(database, table)
+    assert len(comments) == len(df.columns)
+    assert comments["c0"] == "zero"
+    assert comments["c1"] == "one"
+
+    # Round 6 - Overwrite Partitions
+    df = pd.DataFrame({"c0": [None, "boo"], "c1": [0, 2]})
+    paths = wr.s3.to_parquet(df=df, path=path, dataset=True, mode="overwrite_partitions", partition_cols=["c1"])[
+        "paths"
+    ]
+    wr.s3.wait_objects_exist(paths=paths)
+    wr.s3.store_parquet_metadata(
         path=path,
-        columns_types={"col0": "int", "col1": "double"},
-        partitions_types={"y": "int", "m": "int"},
-        compression="snappy",
+        dataset=True,
+        mode="append",
+        database=database,
+        table=table,
+        description="c0+c1",
+        parameters={"num_cols": "2", "num_rows": "3"},
+        columns_comments={"c0": "zero", "c1": "one"},
     )
-    with pytest.raises(wr.exceptions.InvalidArgumentValue):
-        wr.catalog.create_parquet_table(
-            database=database, table=table, path=path, columns_types={"col0": "string"}, mode="append"
-        )
-    assert wr.catalog.does_table_exist(database=database, table=table) is True
-    assert wr.catalog.delete_table_if_exists(database=database, table=table) is True
-    assert wr.catalog.delete_table_if_exists(database=database, table=table) is False
-    wr.catalog.create_parquet_table(
+    df2 = wr.athena.read_sql_table(table, database)
+    assert len(df2.columns) == 2
+    assert len(df2.index) == 3
+    assert df2.c1.astype(int).sum() == 3
+    parameters = wr.catalog.get_table_parameters(database, table)
+    assert len(parameters) >= 5
+    assert parameters["num_cols"] == "2"
+    assert parameters["num_rows"] == "3"
+    assert wr.catalog.get_table_description(database, table) == "c0+c1"
+    comments = wr.catalog.get_columns_comments(database, table)
+    assert len(comments) == len(df.columns)
+    assert comments["c0"] == "zero"
+    assert comments["c1"] == "one"
+
+    # Round 7 - Overwrite Partitions + New Column
+    df = pd.DataFrame({"c0": ["bar", None], "c1": [1, 3], "c2": [True, False]})
+    paths = wr.s3.to_parquet(df=df, path=path, dataset=True, mode="overwrite_partitions", partition_cols=["c1"])[
+        "paths"
+    ]
+    wr.s3.wait_objects_exist(paths=paths)
+    wr.s3.store_parquet_metadata(
+        path=path,
+        dataset=True,
+        mode="append",
         database=database,
         table=table,
+        description="c0+c1+c2",
+        parameters={"num_cols": "3", "num_rows": "4"},
+        columns_comments={"c0": "zero", "c1": "one", "c2": "two"},
+    )
+    df2 = wr.athena.read_sql_table(table, database)
+    assert len(df2.columns) == 3
+    assert len(df2.index) == 4
+    assert df2.c1.astype(int).sum() == 6
+    parameters = wr.catalog.get_table_parameters(database, table)
+    assert len(parameters) >= 5
+    assert parameters["num_cols"] == "3"
+    assert parameters["num_rows"] == "4"
+    assert wr.catalog.get_table_description(database, table) == "c0+c1+c2"
+    comments = wr.catalog.get_columns_comments(database, table)
+    assert len(comments) == len(df.columns)
+    assert comments["c0"] == "zero"
+    assert comments["c1"] == "one"
+    assert comments["c2"] == "two"
+    engine = wr.catalog.get_engine("aws-data-wrangler-redshift")
+    df3 = wr.db.read_sql_table(con=engine, table=table, schema=external_schema)
+    assert len(df3.columns) == 3
+    assert len(df3.index) == 4
+    assert df3.c1.astype(int).sum() == 6
+
+
+def test_athena_ctas(path, path2, path3, table, table2, database, kms_key):
+    df = get_df_list()
+    columns_types, partitions_types = wr.catalog.extract_athena_types(df=df, partition_cols=["par0", "par1"])
+    assert len(columns_types) == 16
+    assert len(partitions_types) == 2
+    with pytest.raises(wr.exceptions.InvalidArgumentValue):
+        wr.catalog.extract_athena_types(df=df, file_format="avro")
+    paths = wr.s3.to_parquet(
+        df=get_df_list(),
         path=path,
-        columns_types={"col0": "int", "col1": "double"},
-        partitions_types={"y": "int", "m": "int"},
-        compression="snappy",
-        description="Foo boo bar",
-        parameters={"tag": "test"},
-        columns_comments={"col0": "my int", "y": "year"},
+        index=True,
+        use_threads=True,
+        dataset=True,
         mode="overwrite",
-    )
-    wr.catalog.add_parquet_partitions(
         database=database,
         table=table,
-        partitions_values={f"{path}y=2020/m=1/": ["2020", "1"], f"{path}y=2021/m=2/": ["2021", "2"]},
-        compression="snappy",
-    )
-    assert wr.catalog.get_table_location(database=database, table=table) == path
-    partitions_values = wr.catalog.get_parquet_partitions(database=database, table=table)
-    assert len(partitions_values) == 2
-    partitions_values = wr.catalog.get_parquet_partitions(
-        database=database, table=table, catalog_id=account_id, expression="y = 2021 AND m = 2"
+        partition_cols=["par0", "par1"],
+    )["paths"]
+    wr.s3.wait_objects_exist(paths=paths)
+    dirs = wr.s3.list_directories(path=path)
+    for d in dirs:
+        assert d.startswith(f"{path}par0=")
+    df = wr.s3.read_parquet_table(table=table, database=database)
+    assert len(df.index) == 3
+    ensure_data_types(df=df, has_list=True)
+    df = wr.athena.read_sql_table(
+        table=table,
+        database=database,
+        ctas_approach=True,
+        encryption="SSE_KMS",
+        kms_key=kms_key,
+        s3_output=path2,
+        keep_files=False,
     )
-    assert len(partitions_values) == 1
-    assert len(set(partitions_values[f"{path}y=2021/m=2/"]) & {"2021", "2"}) == 2
-    dtypes = wr.catalog.get_table_types(database=database, table=table)
-    assert dtypes["col0"] == "int"
-    assert dtypes["col1"] == "double"
-    assert dtypes["y"] == "int"
-    assert dtypes["m"] == "int"
-    df_dbs = wr.catalog.databases()
-    assert len(wr.catalog.databases(catalog_id=account_id)) == len(df_dbs)
-    assert database in df_dbs["Database"].to_list()
-    tables = list(wr.catalog.get_tables())
-    assert len(tables) > 0
-    for tbl in tables:
-        if tbl["Name"] == table:
-            assert tbl["TableType"] == "EXTERNAL_TABLE"
-    tables = list(wr.catalog.get_tables(database=database))
-    assert len(tables) > 0
-    for tbl in tables:
-        assert
tbl["DatabaseName"] == database - # search - tables = list(wr.catalog.search_tables(text="parquet", catalog_id=account_id)) - assert len(tables) > 0 - for tbl in tables: - if tbl["Name"] == table: - assert tbl["TableType"] == "EXTERNAL_TABLE" - # prefix - tables = list(wr.catalog.get_tables(name_prefix=table[:4], catalog_id=account_id)) - assert len(tables) > 0 - for tbl in tables: - if tbl["Name"] == table: - assert tbl["TableType"] == "EXTERNAL_TABLE" - # suffix - tables = list(wr.catalog.get_tables(name_suffix=table[-4:], catalog_id=account_id)) - assert len(tables) > 0 - for tbl in tables: - if tbl["Name"] == table: - assert tbl["TableType"] == "EXTERNAL_TABLE" - # name_contains - tables = list(wr.catalog.get_tables(name_contains=table[4:-4], catalog_id=account_id)) - assert len(tables) > 0 - for tbl in tables: - if tbl["Name"] == table: - assert tbl["TableType"] == "EXTERNAL_TABLE" - # prefix & suffix & name_contains - tables = list( - wr.catalog.get_tables( - name_prefix=table[0], name_contains=table[3], name_suffix=table[-1], catalog_id=account_id - ) - ) - assert len(tables) > 0 - for tbl in tables: - if tbl["Name"] == table: - assert tbl["TableType"] == "EXTERNAL_TABLE" - # prefix & suffix - tables = list(wr.catalog.get_tables(name_prefix=table[0], name_suffix=table[-1], catalog_id=account_id)) - assert len(tables) > 0 - for tbl in tables: - if tbl["Name"] == table: - assert tbl["TableType"] == "EXTERNAL_TABLE" - # DataFrames - assert len(wr.catalog.databases().index) > 0 - assert len(wr.catalog.tables().index) > 0 - assert ( - len( - wr.catalog.tables( - database=database, - search_text="parquet", - name_prefix=table[0], - name_contains=table[3], - name_suffix=table[-1], - catalog_id=account_id, - ).index - ) - > 0 - ) - assert len(wr.catalog.table(database=database, table=table).index) > 0 - assert len(wr.catalog.table(database=database, table=table, catalog_id=account_id).index) > 0 - with pytest.raises(wr.exceptions.InvalidTable): - wr.catalog.overwrite_table_parameters({"foo": "boo"}, database, "fake_table") - - -def test_s3_get_bucket_region(bucket, region): - assert wr.s3.get_bucket_region(bucket=bucket) == region - assert wr.s3.get_bucket_region(bucket=bucket, boto3_session=boto3.Session()) == region - - -def test_catalog_get_databases(database): - dbs = list(wr.catalog.get_databases()) - assert len(dbs) > 0 - for db in dbs: - if db["Name"] == database: - assert db["Description"] == "AWS Data Wrangler Test Arena - Glue Database" - - -def test_athena_query_cancelled(database): - session = boto3.Session() - query_execution_id = wr.athena.start_query_execution(sql=get_query_long(), database=database, boto3_session=session) - wr.athena.stop_query_execution(query_execution_id=query_execution_id, boto3_session=session) - with pytest.raises(wr.exceptions.QueryCancelled): - assert wr.athena.wait_query(query_execution_id=query_execution_id) - - -def test_athena_query_failed(database): - query_execution_id = wr.athena.start_query_execution(sql="SELECT random(-1)", database=database) - with pytest.raises(wr.exceptions.QueryFailed): - assert wr.athena.wait_query(query_execution_id=query_execution_id) - - -def test_athena_read_list(database): - with pytest.raises(wr.exceptions.UnsupportedType): - wr.athena.read_sql_query(sql="SELECT ARRAY[1, 2, 3]", database=database, ctas_approach=False) - - -def test_sanitize_names(): - assert wr.catalog.sanitize_column_name("CamelCase") == "camel_case" - assert wr.catalog.sanitize_column_name("CamelCase2") == "camel_case2" - assert 
wr.catalog.sanitize_column_name("Camel_Case3") == "camel_case3" - assert wr.catalog.sanitize_column_name("Cámël_Casë4仮") == "camel_case4_" - assert wr.catalog.sanitize_column_name("Camel__Case5") == "camel__case5" - assert wr.catalog.sanitize_column_name("Camel{}Case6") == "camel_case6" - assert wr.catalog.sanitize_column_name("Camel.Case7") == "camel_case7" - assert wr.catalog.sanitize_column_name("xyz_cd") == "xyz_cd" - assert wr.catalog.sanitize_column_name("xyz_Cd") == "xyz_cd" - assert wr.catalog.sanitize_table_name("CamelCase") == "camel_case" - assert wr.catalog.sanitize_table_name("CamelCase2") == "camel_case2" - assert wr.catalog.sanitize_table_name("Camel_Case3") == "camel_case3" - assert wr.catalog.sanitize_table_name("Cámël_Casë4仮") == "camel_case4_" - assert wr.catalog.sanitize_table_name("Camel__Case5") == "camel__case5" - assert wr.catalog.sanitize_table_name("Camel{}Case6") == "camel_case6" - assert wr.catalog.sanitize_table_name("Camel.Case7") == "camel_case7" - assert wr.catalog.sanitize_table_name("xyz_cd") == "xyz_cd" - assert wr.catalog.sanitize_table_name("xyz_Cd") == "xyz_cd" - - -def test_athena_ctas_empty(database): - sql = """ - WITH dataset AS ( - SELECT 0 AS id - ) - SELECT id - FROM dataset - WHERE id != 0 - """ - assert wr.athena.read_sql_query(sql=sql, database=database).empty is True - assert len(list(wr.athena.read_sql_query(sql=sql, database=database, chunksize=1))) == 0 - - -def test_s3_empty_dfs(): - df = pd.DataFrame() - with pytest.raises(wr.exceptions.EmptyDataFrame): - wr.s3.to_parquet(df=df, path="") - with pytest.raises(wr.exceptions.EmptyDataFrame): - wr.s3.to_csv(df=df, path="") - - -def test_absent_object(bucket): - path = f"s3://{bucket}/test_absent_object" - assert wr.s3.does_object_exist(path=path) is False - assert len(wr.s3.size_objects(path=path)) == 0 - assert wr.s3.wait_objects_exist(paths=[]) is None - - -def test_athena_struct(database): - sql = "SELECT CAST(ROW(1, 'foo') AS ROW(id BIGINT, value VARCHAR)) AS col0" - with pytest.raises(wr.exceptions.UnsupportedType): - wr.athena.read_sql_query(sql=sql, database=database, ctas_approach=False) - df = wr.athena.read_sql_query(sql=sql, database=database, ctas_approach=True) - assert len(df.index) == 1 - assert len(df.columns) == 1 - assert df["col0"].iloc[0]["id"] == 1 - assert df["col0"].iloc[0]["value"] == "foo" - sql = "SELECT ROW(1, ROW(2, ROW(3, '4'))) AS col0" - df = wr.athena.read_sql_query(sql=sql, database=database, ctas_approach=True) - assert len(df.index) == 1 - assert len(df.columns) == 1 - assert df["col0"].iloc[0]["field0"] == 1 - assert df["col0"].iloc[0]["field1"]["field0"] == 2 - assert df["col0"].iloc[0]["field1"]["field1"]["field0"] == 3 - assert df["col0"].iloc[0]["field1"]["field1"]["field1"] == "4" + assert len(df.index) == 3 + ensure_data_types(df=df, has_list=True) + final_destination = f"{path3}{table2}/" + # keep_files=False + wr.s3.delete_objects(path=path3) + dfs = wr.athena.read_sql_query( + sql=f"SELECT * FROM {table}", + database=database, + ctas_approach=True, + chunksize=1, + keep_files=False, + ctas_temp_table_name=table2, + s3_output=path3, + ) + assert wr.catalog.does_table_exist(database=database, table=table2) is False + assert len(wr.s3.list_objects(path=path3)) > 2 + assert len(wr.s3.list_objects(path=final_destination)) > 0 + for df in dfs: + ensure_data_types(df=df, has_list=True) + assert len(wr.s3.list_objects(path=path3)) == 0 -def test_athena_time_zone(database): - sql = "SELECT current_timestamp AS value, typeof(current_timestamp) AS type" - 
df = wr.athena.read_sql_query(sql=sql, database=database, ctas_approach=False) - assert len(df.index) == 1 - assert len(df.columns) == 2 - assert df["type"][0] == "timestamp with time zone" - assert df["value"][0].year == datetime.datetime.utcnow().year + # keep_files=True + wr.s3.delete_objects(path=path3) + dfs = wr.athena.read_sql_query( + sql=f"SELECT * FROM {table}", + database=database, + ctas_approach=True, + chunksize=2, + keep_files=True, + ctas_temp_table_name=table2, + s3_output=path3, + ) + assert wr.catalog.does_table_exist(database=database, table=table2) is False + assert len(wr.s3.list_objects(path=path3)) > 2 + assert len(wr.s3.list_objects(path=final_destination)) > 0 + for df in dfs: + ensure_data_types(df=df, has_list=True) + assert len(wr.s3.list_objects(path=path3)) > 2 -def test_category(bucket, database): - df = get_df_category() - path = f"s3://{bucket}/test_category/" +def test_athena(path, database, kms_key, workgroup0, workgroup1): + wr.catalog.delete_table_if_exists(database=database, table="__test_athena") paths = wr.s3.to_parquet( - df=df, + df=get_df(), path=path, + index=True, + use_threads=True, dataset=True, - database=database, - table="test_category", mode="overwrite", + database=database, + table="__test_athena", partition_cols=["par0", "par1"], )["paths"] wr.s3.wait_objects_exist(paths=paths, use_threads=False) - df2 = wr.s3.read_parquet(path=path, dataset=True, categories=[c for c in df.columns if c not in ["par0", "par1"]]) - ensure_data_types_category(df2) - df2 = wr.athena.read_sql_query("SELECT * FROM test_category", database=database, categories=list(df.columns)) - ensure_data_types_category(df2) - df2 = wr.athena.read_sql_table(table="test_category", database=database, categories=list(df.columns)) - ensure_data_types_category(df2) - df2 = wr.athena.read_sql_query( - "SELECT * FROM test_category", database=database, categories=list(df.columns), ctas_approach=False - ) - ensure_data_types_category(df2) dfs = wr.athena.read_sql_query( - "SELECT * FROM test_category", database=database, categories=list(df.columns), ctas_approach=False, chunksize=1 + sql="SELECT * FROM __test_athena", + database=database, + ctas_approach=False, + chunksize=1, + encryption="SSE_KMS", + kms_key=kms_key, + workgroup=workgroup0, + keep_files=False, ) for df2 in dfs: - ensure_data_types_category(df2) - dfs = wr.athena.read_sql_query( - "SELECT * FROM test_category", database=database, categories=list(df.columns), ctas_approach=True, chunksize=1 + print(df2) + ensure_data_types(df=df2) + df = wr.athena.read_sql_query( + sql="SELECT * FROM __test_athena", + database=database, + ctas_approach=False, + workgroup=workgroup1, + keep_files=False, ) - for df2 in dfs: - ensure_data_types_category(df2) - wr.s3.delete_objects(path=paths) - assert wr.catalog.delete_table_if_exists(database=database, table="test_category") is True + assert len(df.index) == 3 + ensure_data_types(df=df) + wr.athena.repair_table(table="__test_athena", database=database) + wr.catalog.delete_table_if_exists(database=database, table="__test_athena") -def test_parquet_validate_schema(path): +def test_csv(bucket): + session = boto3.Session() df = pd.DataFrame({"id": [1, 2, 3]}) - path_file = f"{path}0.parquet" - wr.s3.to_parquet(df=df, path=path_file) - wr.s3.wait_objects_exist(paths=[path_file]) - df2 = pd.DataFrame({"id2": [1, 2, 3], "val": ["foo", "boo", "bar"]}) - path_file2 = f"{path}1.parquet" - wr.s3.to_parquet(df=df2, path=path_file2) - wr.s3.wait_objects_exist(paths=[path_file2], use_threads=False) 
- df3 = wr.s3.read_parquet(path=path, validate_schema=False) - assert len(df3.index) == 6 - assert len(df3.columns) == 3 - with pytest.raises(ValueError): - wr.s3.read_parquet(path=path, validate_schema=True) + path0 = f"s3://{bucket}/test_csv0.csv" + path1 = f"s3://{bucket}/test_csv1.csv" + path2 = f"s3://{bucket}/test_csv2.csv" + wr.s3.to_csv(df=df, path=path0, index=False) + wr.s3.wait_objects_exist(paths=[path0]) + assert wr.s3.does_object_exist(path=path0) is True + assert wr.s3.size_objects(path=[path0], use_threads=False)[path0] == 9 + assert wr.s3.size_objects(path=[path0], use_threads=True)[path0] == 9 + wr.s3.to_csv(df=df, path=path1, index=False, boto3_session=None) + wr.s3.to_csv(df=df, path=path2, index=False, boto3_session=session) + assert df.equals(wr.s3.read_csv(path=path0, use_threads=False)) + assert df.equals(wr.s3.read_csv(path=path0, use_threads=True)) + assert df.equals(wr.s3.read_csv(path=path0, use_threads=False, boto3_session=session)) + assert df.equals(wr.s3.read_csv(path=path0, use_threads=True, boto3_session=session)) + paths = [path0, path1, path2] + df2 = pd.concat(objs=[df, df, df], sort=False, ignore_index=True) + assert df2.equals(wr.s3.read_csv(path=paths, use_threads=False)) + assert df2.equals(wr.s3.read_csv(path=paths, use_threads=True)) + assert df2.equals(wr.s3.read_csv(path=paths, use_threads=False, boto3_session=session)) + assert df2.equals(wr.s3.read_csv(path=paths, use_threads=True, boto3_session=session)) + with pytest.raises(wr.exceptions.InvalidArgumentType): + wr.s3.read_csv(path=1) + with pytest.raises(wr.exceptions.InvalidArgument): + wr.s3.read_csv(path=paths, iterator=True) + wr.s3.delete_objects(path=paths, use_threads=False) + wr.s3.wait_objects_not_exist(paths=paths, use_threads=False) -def test_csv_dataset(bucket, database): - path = f"s3://{bucket}/test_csv_dataset/" - with pytest.raises(wr.exceptions.UndetectedType): - wr.s3.to_csv(pd.DataFrame({"A": [None]}), path, dataset=True, database=database, table="test_csv_dataset") - df = get_df_csv() +def test_json(bucket): + df0 = pd.DataFrame({"id": [1, 2, 3]}) + path0 = f"s3://{bucket}/test_json0.json" + path1 = f"s3://{bucket}/test_json1.json" + wr.s3.to_json(df=df0, path=path0) + wr.s3.to_json(df=df0, path=path1) + wr.s3.wait_objects_exist(paths=[path0, path1]) + assert df0.equals(wr.s3.read_json(path=path0, use_threads=False)) + df1 = pd.concat(objs=[df0, df0], sort=False, ignore_index=True) + assert df1.equals(wr.s3.read_json(path=[path0, path1], use_threads=True)) + wr.s3.delete_objects(path=[path0, path1], use_threads=False) + + +def test_fwf(path): + text = "1 Herfelingen27-12-18\n2 Lambusart14-06-18\n3Spormaggiore15-04-18" + client_s3 = boto3.client("s3") + path0 = f"{path}/0.txt" + bucket, key = wr._utils.parse_path(path0) + client_s3.put_object(Body=text, Bucket=bucket, Key=key) + path1 = f"{path}/1.txt" + bucket, key = wr._utils.parse_path(path1) + client_s3.put_object(Body=text, Bucket=bucket, Key=key) + wr.s3.wait_objects_exist(paths=[path0, path1]) + df = wr.s3.read_fwf(path=path0, use_threads=False, widths=[1, 12, 8], names=["id", "name", "date"]) + assert len(df.index) == 3 + assert len(df.columns) == 3 + df = wr.s3.read_fwf(path=[path0, path1], use_threads=True, widths=[1, 12, 8], names=["id", "name", "date"]) + assert len(df.index) == 6 + assert len(df.columns) == 3 + + +def test_parquet(bucket): + wr.s3.delete_objects(path=f"s3://{bucket}/test_parquet/") + df_file = pd.DataFrame({"id": [1, 2, 3]}) + path_file = f"s3://{bucket}/test_parquet/test_parquet_file.parquet" 
+ df_dataset = pd.DataFrame({"id": [1, 2, 3], "partition": ["A", "A", "B"]}) + df_dataset["partition"] = df_dataset["partition"].astype("category") + path_dataset = f"s3://{bucket}/test_parquet/test_parquet_dataset" with pytest.raises(wr.exceptions.InvalidArgumentCombination): - wr.s3.to_csv(df, path, dataset=False, mode="overwrite", database=database, table="test_csv_dataset") + wr.s3.to_parquet(df=df_file, path=path_file, mode="append") + with pytest.raises(wr.exceptions.InvalidCompression): + wr.s3.to_parquet(df=df_file, path=path_file, compression="WRONG") with pytest.raises(wr.exceptions.InvalidArgumentCombination): - wr.s3.to_csv(df, path, dataset=False, table="test_csv_dataset") + wr.s3.to_parquet(df=df_dataset, path=path_dataset, partition_cols=["col2"]) with pytest.raises(wr.exceptions.InvalidArgumentCombination): - wr.s3.to_csv(df, path, dataset=True, mode="overwrite", database=database) + wr.s3.to_parquet(df=df_dataset, path=path_dataset, description="foo") + with pytest.raises(wr.exceptions.InvalidArgumentValue): + wr.s3.to_parquet(df=df_dataset, path=path_dataset, partition_cols=["col2"], dataset=True, mode="WRONG") + paths = wr.s3.to_parquet(df=df_file, path=path_file)["paths"] + wr.s3.wait_objects_exist(paths=paths) + assert len(wr.s3.read_parquet(path=path_file, use_threads=True, boto3_session=None).index) == 3 + assert len(wr.s3.read_parquet(path=[path_file], use_threads=False, boto3_session=boto3.Session()).index) == 3 + paths = wr.s3.to_parquet(df=df_dataset, path=path_dataset, dataset=True)["paths"] + wr.s3.wait_objects_exist(paths=paths) + assert len(wr.s3.read_parquet(path=paths, dataset=True).index) == 3 + assert len(wr.s3.read_parquet(path=path_dataset, use_threads=True, boto3_session=boto3.Session()).index) == 3 + dataset_paths = wr.s3.to_parquet( + df=df_dataset, path=path_dataset, dataset=True, partition_cols=["partition"], mode="overwrite" + )["paths"] + wr.s3.wait_objects_exist(paths=dataset_paths) + assert len(wr.s3.read_parquet(path=path_dataset, use_threads=True, boto3_session=None).index) == 3 + assert len(wr.s3.read_parquet(path=dataset_paths, use_threads=True).index) == 3 + assert len(wr.s3.read_parquet(path=path_dataset, dataset=True, use_threads=True).index) == 3 + wr.s3.to_parquet(df=df_dataset, path=path_dataset, dataset=True, partition_cols=["partition"], mode="overwrite") + wr.s3.to_parquet( + df=df_dataset, path=path_dataset, dataset=True, partition_cols=["partition"], mode="overwrite_partitions" + ) + wr.s3.delete_objects(path=f"s3://{bucket}/test_parquet/") + + +def test_parquet_catalog(bucket, database): + with pytest.raises(wr.exceptions.UndetectedType): + wr.s3.to_parquet( + df=pd.DataFrame({"A": [None]}), + path=f"s3://{bucket}/test_parquet_catalog", + dataset=True, + database=database, + table="test_parquet_catalog", + ) + df = get_df_list() with pytest.raises(wr.exceptions.InvalidArgumentCombination): - wr.s3.to_csv(df=df, path=path, mode="append") + wr.s3.to_parquet( + df=df, + path=f"s3://{bucket}/test_parquet_catalog", + use_threads=True, + dataset=False, + mode="overwrite", + database=database, + table="test_parquet_catalog", + ) with pytest.raises(wr.exceptions.InvalidArgumentCombination): - wr.s3.to_csv(df=df, path=path, partition_cols=["col2"]) + wr.s3.to_parquet( + df=df, + path=f"s3://{bucket}/test_parquet_catalog", + use_threads=True, + dataset=False, + table="test_parquet_catalog", + ) with pytest.raises(wr.exceptions.InvalidArgumentCombination): - wr.s3.to_csv(df=df, path=path, description="foo") - with 
pytest.raises(wr.exceptions.InvalidArgumentValue): - wr.s3.to_csv(df=df, path=path, partition_cols=["col2"], dataset=True, mode="WRONG") - paths = wr.s3.to_csv( + wr.s3.to_parquet( + df=df, + path=f"s3://{bucket}/test_parquet_catalog", + use_threads=True, + dataset=True, + mode="overwrite", + database=database, + ) + wr.s3.to_parquet( df=df, - path=path, - sep="|", - index=False, + path=f"s3://{bucket}/test_parquet_catalog", use_threads=True, - boto3_session=None, - s3_additional_kwargs=None, dataset=True, - partition_cols=["par0", "par1"], mode="overwrite", - )["paths"] - wr.s3.wait_objects_exist(paths=paths) - df2 = wr.s3.read_csv(path=paths, sep="|", header=None) - assert len(df2.index) == 3 - assert len(df2.columns) == 8 - assert df2[0].sum() == 6 - wr.s3.delete_objects(path=paths) - - -def test_csv_catalog(bucket, database): - path = f"s3://{bucket}/test_csv_catalog/" - df = get_df_csv() - paths = wr.s3.to_csv( + database=database, + table="test_parquet_catalog", + ) + wr.s3.to_parquet( df=df, - path=path, - sep="\t", + path=f"s3://{bucket}/test_parquet_catalog2", index=True, use_threads=True, - boto3_session=None, - s3_additional_kwargs=None, - dataset=True, - partition_cols=["par0", "par1"], - mode="overwrite", - table="test_csv_catalog", - database=database, - )["paths"] - wr.s3.wait_objects_exist(paths=paths) - df2 = wr.athena.read_sql_table("test_csv_catalog", database) - assert len(df2.index) == 3 - assert len(df2.columns) == 11 - assert df2["id"].sum() == 6 - ensure_data_types_csv(df2) - wr.s3.delete_objects(path=paths) - assert wr.catalog.delete_table_if_exists(database=database, table="test_csv_catalog") is True - - -def test_csv_catalog_columns(bucket, database): - path = f"s3://{bucket}/test_csv_catalog_columns /" - paths = wr.s3.to_csv( - df=get_df_csv(), - path=path, - sep="|", - columns=["id", "date", "timestamp", "par0", "par1"], - index=False, - use_threads=False, - boto3_session=None, - s3_additional_kwargs=None, dataset=True, - partition_cols=["par0", "par1"], mode="overwrite", - table="test_csv_catalog_columns", - database=database, - )["paths"] - wr.s3.wait_objects_exist(paths=paths) - df2 = wr.athena.read_sql_table("test_csv_catalog_columns", database) - assert len(df2.index) == 3 - assert len(df2.columns) == 5 - assert df2["id"].sum() == 6 - ensure_data_types_csv(df2) - - paths = wr.s3.to_csv( - df=pd.DataFrame({"id": [4], "date": [None], "timestamp": [None], "par0": [1], "par1": ["a"]}), - path=path, - sep="|", - index=False, - use_threads=False, - boto3_session=None, - s3_additional_kwargs=None, - dataset=True, - partition_cols=["par0", "par1"], - mode="overwrite_partitions", - table="test_csv_catalog_columns", database=database, - )["paths"] - wr.s3.wait_objects_exist(paths=paths) - df2 = wr.athena.read_sql_table("test_csv_catalog_columns", database) - assert len(df2.index) == 3 - assert len(df2.columns) == 5 - assert df2["id"].sum() == 9 - ensure_data_types_csv(df2) - - wr.s3.delete_objects(path=path) - assert wr.catalog.delete_table_if_exists(database=database, table="test_csv_catalog_columns") is True - - -def test_athena_types(bucket, database): - path = f"s3://{bucket}/test_athena_types/" - df = get_df_csv() - paths = wr.s3.to_csv( - df=df, - path=path, - sep=",", - index=False, - use_threads=True, - boto3_session=None, - s3_additional_kwargs=None, - dataset=True, - partition_cols=["par0", "par1"], - mode="overwrite", - )["paths"] - wr.s3.wait_objects_exist(paths=paths) - columns_types, partitions_types = wr.catalog.extract_athena_types( - df=df, 
index=False, partition_cols=["par0", "par1"], file_format="csv" - ) - wr.catalog.create_csv_table( - table="test_athena_types", - database=database, - path=path, - partitions_types=partitions_types, - columns_types=columns_types, - ) - wr.catalog.create_csv_table( - database=database, table="test_athena_types", path=path, columns_types={"col0": "string"}, mode="append" - ) - wr.athena.repair_table("test_athena_types", database) - assert len(wr.catalog.get_csv_partitions(database, "test_athena_types")) == 3 - df2 = wr.athena.read_sql_table("test_athena_types", database) - assert len(df2.index) == 3 - assert len(df2.columns) == 10 - assert df2["id"].sum() == 6 - ensure_data_types_csv(df2) - wr.s3.delete_objects(path=paths) - assert wr.catalog.delete_table_if_exists(database=database, table="test_athena_types") is True - - -def test_parquet_catalog_columns(bucket, database): - path = f"s3://{bucket}/test_parquet_catalog_columns/" - paths = wr.s3.to_parquet( - df=get_df_csv()[["id", "date", "timestamp", "par0", "par1"]], - path=path, - index=False, - use_threads=False, - boto3_session=None, - s3_additional_kwargs=None, - dataset=True, - partition_cols=["par0", "par1"], - mode="overwrite", - table="test_parquet_catalog_columns", - database=database, - )["paths"] - wr.s3.wait_objects_exist(paths=paths) - df2 = wr.athena.read_sql_table("test_parquet_catalog_columns", database) - assert len(df2.index) == 3 - assert len(df2.columns) == 5 - assert df2["id"].sum() == 6 - ensure_data_types_csv(df2) - - paths = wr.s3.to_parquet( - df=pd.DataFrame({"id": [4], "date": [None], "timestamp": [None], "par0": [1], "par1": ["a"]}), - path=path, - index=False, - use_threads=False, - boto3_session=None, - s3_additional_kwargs=None, - dataset=True, - partition_cols=["par0", "par1"], - mode="overwrite_partitions", - table="test_parquet_catalog_columns", - database=database, - )["paths"] - wr.s3.wait_objects_exist(paths=paths) - df2 = wr.athena.read_sql_table("test_parquet_catalog_columns", database) - assert len(df2.index) == 3 - assert len(df2.columns) == 5 - assert df2["id"].sum() == 9 - ensure_data_types_csv(df2) - - wr.s3.delete_objects(path=path) - assert wr.catalog.delete_table_if_exists(database=database, table="test_parquet_catalog_columns") is True - - -@pytest.mark.parametrize("compression", [None, "gzip", "snappy"]) -def test_parquet_compress(bucket, database, compression): - path = f"s3://{bucket}/test_parquet_compress_{compression}/" - paths = wr.s3.to_parquet( - df=get_df(), - path=path, - compression=compression, - dataset=True, - database=database, - table=f"test_parquet_compress_{compression}", - mode="overwrite", - )["paths"] - wr.s3.wait_objects_exist(paths=paths) - df2 = wr.athena.read_sql_table(f"test_parquet_compress_{compression}", database) - ensure_data_types(df2) - df2 = wr.s3.read_parquet(path=path) - wr.s3.delete_objects(path=path) - assert wr.catalog.delete_table_if_exists(database=database, table=f"test_parquet_compress_{compression}") is True - ensure_data_types(df2) - - -@pytest.mark.parametrize("compression", ["gzip", "bz2", "xz"]) -def test_csv_compress(bucket, compression): - path = f"s3://{bucket}/test_csv_compress_{compression}/" - wr.s3.delete_objects(path=path) - df = get_df_csv() - if compression == "gzip": - buffer = BytesIO() - with gzip.GzipFile(mode="w", fileobj=buffer) as zipped_file: - df.to_csv(TextIOWrapper(zipped_file, "utf8"), index=False, header=None) - s3_resource = boto3.resource("s3") - s3_object = s3_resource.Object(bucket, 
f"test_csv_compress_{compression}/test.csv.gz") - s3_object.put(Body=buffer.getvalue()) - file_path = f"s3://{bucket}/test_csv_compress_{compression}/test.csv.gz" - elif compression == "bz2": - buffer = BytesIO() - with bz2.BZ2File(mode="w", filename=buffer) as zipped_file: - df.to_csv(TextIOWrapper(zipped_file, "utf8"), index=False, header=None) - s3_resource = boto3.resource("s3") - s3_object = s3_resource.Object(bucket, f"test_csv_compress_{compression}/test.csv.bz2") - s3_object.put(Body=buffer.getvalue()) - file_path = f"s3://{bucket}/test_csv_compress_{compression}/test.csv.bz2" - elif compression == "xz": - buffer = BytesIO() - with lzma.LZMAFile(mode="w", filename=buffer) as zipped_file: - df.to_csv(TextIOWrapper(zipped_file, "utf8"), index=False, header=None) - s3_resource = boto3.resource("s3") - s3_object = s3_resource.Object(bucket, f"test_csv_compress_{compression}/test.csv.xz") - s3_object.put(Body=buffer.getvalue()) - file_path = f"s3://{bucket}/test_csv_compress_{compression}/test.csv.xz" - else: - file_path = f"s3://{bucket}/test_csv_compress_{compression}/test.csv" - wr.s3.to_csv(df=df, path=file_path, index=False, header=None) - - wr.s3.wait_objects_exist(paths=[file_path]) - df2 = wr.s3.read_csv(path=[file_path], names=df.columns) - assert len(df2.index) == 3 - assert len(df2.columns) == 10 - dfs = wr.s3.read_csv(path=[file_path], names=df.columns, chunksize=1) - for df3 in dfs: - assert len(df3.columns) == 10 - wr.s3.delete_objects(path=path) - - -def test_parquet_char_length(path, database, table, external_schema): - df = pd.DataFrame( - {"id": [1, 2], "cchar": ["foo", "boo"], "date": [datetime.date(2020, 1, 1), datetime.date(2020, 1, 2)]} - ) - wr.s3.to_parquet( - df=df, - path=path, - dataset=True, - database=database, - table=table, - mode="overwrite", - partition_cols=["date"], - dtype={"cchar": "char(3)"}, + table="test_parquet_catalog2", + partition_cols=["iint8", "iint16"], ) - - df2 = wr.s3.read_parquet(path, dataset=True) - assert len(df2.index) == 2 - assert len(df2.columns) == 3 - assert df2.id.sum() == 3 - - df2 = wr.athena.read_sql_table(table=table, database=database) - assert len(df2.index) == 2 - assert len(df2.columns) == 3 - assert df2.id.sum() == 3 - - engine = wr.catalog.get_engine("aws-data-wrangler-redshift") - df2 = wr.db.read_sql_table(con=engine, table=table, schema=external_schema) - assert len(df2.index) == 2 - assert len(df2.columns) == 3 - assert df2.id.sum() == 3 - - -def test_merge(bucket): - path = f"s3://{bucket}/test_merge/" - df = pd.DataFrame({"id": [1, 2, 3], "par": [1, 2, 3]}) - paths = wr.s3.to_parquet(df=df, path=path, dataset=True, partition_cols=["par"], mode="overwrite")["paths"] - wr.s3.wait_objects_exist(paths=paths) - df = wr.s3.read_parquet(path=path, dataset=True) - assert df.id.sum() == 6 - assert df.par.astype("Int64").sum() == 6 - - path2 = f"s3://{bucket}/test_merge2/" - df = pd.DataFrame({"id": [1, 2, 3], "par": [1, 2, 3]}) - paths = wr.s3.to_parquet(df=df, path=path2, dataset=True, partition_cols=["par"], mode="overwrite")["paths"] - wr.s3.wait_objects_exist(paths=paths) - paths = wr.s3.merge_datasets(source_path=path2, target_path=path, mode="append", use_threads=True) - wr.s3.wait_objects_exist(paths=paths, use_threads=False) - df = wr.s3.read_parquet(path=path, dataset=True) - assert df.id.sum() == 12 - assert df.par.astype("Int64").sum() == 12 - - paths = wr.s3.merge_datasets(source_path=path2, target_path=path, mode="overwrite", use_threads=False) - wr.s3.wait_objects_exist(paths=paths, use_threads=False) - df 
= wr.s3.read_parquet(path=path, dataset=True) - assert df.id.sum() == 6 - assert df.par.astype("Int64").sum() == 6 - - df = pd.DataFrame({"id": [4], "par": [3]}) - paths = wr.s3.to_parquet(df=df, path=path2, dataset=True, partition_cols=["par"], mode="overwrite")["paths"] - wr.s3.wait_objects_exist(paths=paths) - paths = wr.s3.merge_datasets(source_path=path2, target_path=path, mode="overwrite_partitions", use_threads=True) - wr.s3.wait_objects_exist(paths=paths, use_threads=False) - df = wr.s3.read_parquet(path=path, dataset=True) - assert df.id.sum() == 7 - assert df.par.astype("Int64").sum() == 6 - - with pytest.raises(wr.exceptions.InvalidArgumentValue): - wr.s3.merge_datasets(source_path=path, target_path="bar", mode="WRONG") - - assert len(wr.s3.merge_datasets(source_path=f"s3://{bucket}/empty/", target_path="bar")) == 0 - - wr.s3.delete_objects(path=path) - wr.s3.delete_objects(path=path2) - - -def test_copy(bucket): - path = f"s3://{bucket}/test_copy/" - df = pd.DataFrame({"id": [1, 2, 3], "par": [1, 2, 3]}) - paths = wr.s3.to_parquet(df=df, path=path, dataset=True, partition_cols=["par"], mode="overwrite")["paths"] - wr.s3.wait_objects_exist(paths=paths) - df = wr.s3.read_parquet(path=path, dataset=True) - assert df.id.sum() == 6 - assert df.par.astype("Int64").sum() == 6 - - path2 = f"s3://{bucket}/test_copy2/" - df = pd.DataFrame({"id": [1, 2, 3], "par": [1, 2, 3]}) - paths = wr.s3.to_parquet(df=df, path=path2, dataset=True, partition_cols=["par"], mode="overwrite")["paths"] - wr.s3.wait_objects_exist(paths=paths) - paths = wr.s3.copy_objects(paths, source_path=path2, target_path=path, use_threads=True) - wr.s3.wait_objects_exist(paths=paths, use_threads=False) - df = wr.s3.read_parquet(path=path, dataset=True) - assert df.id.sum() == 12 - assert df.par.astype("Int64").sum() == 12 - - assert len(wr.s3.copy_objects([], source_path="boo", target_path="bar")) == 0 - - wr.s3.delete_objects(path=path) - wr.s3.delete_objects(path=path2) - - -@pytest.mark.parametrize("col2", [[1, 1, 1, 1, 1], [1, 2, 3, 4, 5], [1, 1, 1, 1, 2], [1, 2, 2, 2, 2]]) -@pytest.mark.parametrize("chunked", [True, 1, 2, 100]) -def test_parquet_chunked(bucket, database, col2, chunked): - table = f"test_parquet_chunked_{chunked}_{''.join([str(x) for x in col2])}" - path = f"s3://{bucket}/{table}/" - wr.s3.delete_objects(path=path) - values = list(range(5)) - df = pd.DataFrame({"col1": values, "col2": col2}) - paths = wr.s3.to_parquet( - df, path, index=False, dataset=True, database=database, table=table, partition_cols=["col2"], mode="overwrite" - )["paths"] - wr.s3.wait_objects_exist(paths=paths) - - dfs = list(wr.s3.read_parquet(path=path, dataset=True, chunked=chunked)) - assert sum(values) == pd.concat(dfs, ignore_index=True).col1.sum() - if chunked is not True: - assert len(dfs) == int(math.ceil(len(df) / chunked)) - for df2 in dfs[:-1]: - assert chunked == len(df2) - assert chunked >= len(dfs[-1]) - else: - assert len(dfs) == len(set(col2)) - - dfs = list(wr.athena.read_sql_table(database=database, table=table, chunksize=chunked)) - assert sum(values) == pd.concat(dfs, ignore_index=True).col1.sum() - if chunked is not True: - assert len(dfs) == int(math.ceil(len(df) / chunked)) - for df2 in dfs[:-1]: - assert chunked == len(df2) - assert chunked >= len(dfs[-1]) - - wr.s3.delete_objects(path=paths) - assert wr.catalog.delete_table_if_exists(database=database, table=table) is True - - -@pytest.mark.parametrize("workgroup", [None, 0, 1, 2, 3]) -@pytest.mark.parametrize("encryption", [None, "SSE_S3", "SSE_KMS"]) 
-# @pytest.mark.parametrize("workgroup", [3]) -# @pytest.mark.parametrize("encryption", [None]) -def test_athena_encryption( - path, path2, database, table, table2, kms_key, encryption, workgroup, workgroup0, workgroup1, workgroup2, workgroup3 -): - kms_key = None if (encryption == "SSE_S3") or (encryption is None) else kms_key - if workgroup == 0: - workgroup = workgroup0 - elif workgroup == 1: - workgroup = workgroup1 - elif workgroup == 2: - workgroup = workgroup2 - elif workgroup == 3: - workgroup = workgroup3 - df = pd.DataFrame({"a": [1, 2], "b": ["foo", "boo"]}) - paths = wr.s3.to_parquet( - df=df, path=path, dataset=True, mode="overwrite", database=database, table=table, s3_additional_kwargs=None - )["paths"] - wr.s3.wait_objects_exist(paths=paths, use_threads=False) - df2 = wr.athena.read_sql_table( - table=table, - ctas_approach=True, - database=database, - encryption=encryption, - workgroup=workgroup, - kms_key=kms_key, - keep_files=True, - ctas_temp_table_name=table2, - s3_output=path2, + columns_types, partitions_types = wr.s3.read_parquet_metadata( + path=f"s3://{bucket}/test_parquet_catalog2", dataset=True ) - assert wr.catalog.does_table_exist(database=database, table=table2) is False - assert len(df2.index) == 2 - assert len(df2.columns) == 2 - - -def test_athena_nested(path, database, table): - df = pd.DataFrame( - { - "c0": [[1, 2, 3], [4, 5, 6]], - "c1": [[[1, 2], [3, 4]], [[5, 6], [7, 8]]], - "c2": [[["a", "b"], ["c", "d"]], [["e", "f"], ["g", "h"]]], - "c3": [[], [[[[[[[[1]]]]]]]]], - "c4": [{"a": 1}, {"a": 1}], - "c5": [{"a": {"b": {"c": [1, 2]}}}, {"a": {"b": {"c": [3, 4]}}}], - } + assert len(columns_types) == 17 + assert len(partitions_types) == 2 + columns_types, partitions_types, partitions_values = wr.s3.store_parquet_metadata( + path=f"s3://{bucket}/test_parquet_catalog2", database=database, table="test_parquet_catalog2", dataset=True ) - paths = wr.s3.to_parquet( - df=df, path=path, index=False, use_threads=True, dataset=True, mode="overwrite", database=database, table=table - )["paths"] - wr.s3.wait_objects_exist(paths=paths) - df2 = wr.athena.read_sql_query(sql=f"SELECT c0, c1, c2, c4 FROM {table}", database=database) - assert len(df2.index) == 2 - assert len(df2.columns) == 4 - - -def test_catalog_versioning(bucket, database): - table = "test_catalog_versioning" - wr.catalog.delete_table_if_exists(database=database, table=table) - path = f"s3://{bucket}/{table}/" - wr.s3.delete_objects(path=path) - - # Version 0 - df = pd.DataFrame({"c0": [1, 2]}) - paths = wr.s3.to_parquet(df=df, path=path, dataset=True, database=database, table=table, mode="overwrite")["paths"] - wr.s3.wait_objects_exist(paths=paths, use_threads=False) - df = wr.athena.read_sql_table(table=table, database=database) - assert len(df.index) == 2 - assert len(df.columns) == 1 - assert str(df.c0.dtype).startswith("Int") - - # Version 1 - df = pd.DataFrame({"c1": ["foo", "boo"]}) - paths1 = wr.s3.to_parquet( - df=df, path=path, dataset=True, database=database, table=table, mode="overwrite", catalog_versioning=True - )["paths"] - wr.s3.wait_objects_exist(paths=paths1, use_threads=False) - df = wr.athena.read_sql_table(table=table, database=database) - assert len(df.index) == 2 - assert len(df.columns) == 1 - assert str(df.c1.dtype) == "string" + assert len(columns_types) == 17 + assert len(partitions_types) == 2 + assert len(partitions_values) == 2 + wr.s3.delete_objects(path=f"s3://{bucket}/test_parquet_catalog/") + wr.s3.delete_objects(path=f"s3://{bucket}/test_parquet_catalog2/") + assert 
wr.catalog.delete_table_if_exists(database=database, table="test_parquet_catalog") is True + assert wr.catalog.delete_table_if_exists(database=database, table="test_parquet_catalog2") is True - # Version 2 - df = pd.DataFrame({"c1": [1.0, 2.0]}) - paths2 = wr.s3.to_csv( + +def test_parquet_catalog_duplicated(bucket, database): + path = f"s3://{bucket}/test_parquet_catalog_dedup/" + df = pd.DataFrame({"A": [1], "a": [1]}) + wr.s3.to_parquet( df=df, path=path, + index=False, dataset=True, - database=database, - table=table, mode="overwrite", - catalog_versioning=True, - index=False, - )["paths"] - wr.s3.wait_objects_exist(paths=paths2, use_threads=False) - wr.s3.wait_objects_not_exist(paths=paths1, use_threads=False) - df = wr.athena.read_sql_table(table=table, database=database) - assert len(df.index) == 2 + database=database, + table="test_parquet_catalog_dedup", + ) + df = wr.s3.read_parquet(path=path) + assert len(df.index) == 1 assert len(df.columns) == 1 - assert str(df.c1.dtype).startswith("float") + wr.s3.delete_objects(path=path) + assert wr.catalog.delete_table_if_exists(database=database, table="test_parquet_catalog_dedup") is True - # Version 3 (removing version 2) - df = pd.DataFrame({"c1": [True, False]}) - paths3 = wr.s3.to_csv( - df=df, + +def test_parquet_catalog_casting(bucket, database): + path = f"s3://{bucket}/test_parquet_catalog_casting/" + paths = wr.s3.to_parquet( + df=get_df_cast(), path=path, + index=False, dataset=True, - database=database, - table=table, mode="overwrite", - catalog_versioning=False, - index=False, + database=database, + table="__test_parquet_catalog_casting", + dtype={ + "iint8": "tinyint", + "iint16": "smallint", + "iint32": "int", + "iint64": "bigint", + "float": "float", + "double": "double", + "decimal": "decimal(3,2)", + "string": "string", + "date": "date", + "timestamp": "timestamp", + "bool": "boolean", + "binary": "binary", + "category": "double", + "par0": "bigint", + "par1": "string", + }, )["paths"] - wr.s3.wait_objects_exist(paths=paths3, use_threads=False) - wr.s3.wait_objects_not_exist(paths=paths2, use_threads=False) - df = wr.athena.read_sql_table(table=table, database=database) - assert len(df.index) == 2 - assert len(df.columns) == 1 - assert str(df.c1.dtype).startswith("boolean") - - # Cleaning Up - wr.catalog.delete_table_if_exists(database=database, table=table) + wr.s3.wait_objects_exist(paths=paths) + df = wr.s3.read_parquet(path=path) + assert len(df.index) == 3 + assert len(df.columns) == 15 + ensure_data_types(df=df, has_list=False) + df = wr.athena.read_sql_table(table="__test_parquet_catalog_casting", database=database, ctas_approach=True) + assert len(df.index) == 3 + assert len(df.columns) == 15 + ensure_data_types(df=df, has_list=False) + df = wr.athena.read_sql_table(table="__test_parquet_catalog_casting", database=database, ctas_approach=False) + assert len(df.index) == 3 + assert len(df.columns) == 15 + ensure_data_types(df=df, has_list=False) wr.s3.delete_objects(path=path) + assert wr.catalog.delete_table_if_exists(database=database, table="__test_parquet_catalog_casting") is True -def test_copy_replacing_filename(bucket): - path = f"s3://{bucket}/test_copy_replacing_filename/" - wr.s3.delete_objects(path=path) - df = pd.DataFrame({"c0": [1, 2]}) - file_path = f"{path}myfile.parquet" - wr.s3.to_parquet(df=df, path=file_path) - wr.s3.wait_objects_exist(paths=[file_path], use_threads=False) - path2 = f"s3://{bucket}/test_copy_replacing_filename2/" - wr.s3.copy_objects( - paths=[file_path], source_path=path, 
target_path=path2, replace_filenames={"myfile.parquet": "myfile2.parquet"} +def test_catalog(path, database, table): + account_id = boto3.client("sts").get_caller_identity().get("Account") + assert wr.catalog.does_table_exist(database=database, table=table) is False + wr.catalog.create_parquet_table( + database=database, + table=table, + path=path, + columns_types={"col0": "int", "col1": "double"}, + partitions_types={"y": "int", "m": "int"}, + compression="snappy", ) - expected_file = f"{path2}myfile2.parquet" - wr.s3.wait_objects_exist(paths=[expected_file], use_threads=False) - objs = wr.s3.list_objects(path=path2) - assert objs[0] == expected_file - wr.s3.delete_objects(path=path) - wr.s3.delete_objects(path=path2) - - -def test_unsigned_parquet(bucket, database, external_schema): - table = "test_unsigned_parquet" - path = f"s3://{bucket}/{table}/" - wr.s3.delete_objects(path=path) - df = pd.DataFrame({"c0": [0, 0, (2 ** 8) - 1], "c1": [0, 0, (2 ** 16) - 1], "c2": [0, 0, (2 ** 32) - 1]}) - df["c0"] = df.c0.astype("uint8") - df["c1"] = df.c1.astype("uint16") - df["c2"] = df.c2.astype("uint32") - paths = wr.s3.to_parquet(df=df, path=path, dataset=True, database=database, table=table, mode="overwrite")["paths"] - wr.s3.wait_objects_exist(paths=paths, use_threads=False) - df = wr.athena.read_sql_table(table=table, database=database) - assert df.c0.sum() == (2 ** 8) - 1 - assert df.c1.sum() == (2 ** 16) - 1 - assert df.c2.sum() == (2 ** 32) - 1 - schema = wr.s3.read_parquet_metadata(path=path)[0] - assert schema["c0"] == "smallint" - assert schema["c1"] == "int" - assert schema["c2"] == "bigint" - df = wr.s3.read_parquet(path=path) - assert df.c0.sum() == (2 ** 8) - 1 - assert df.c1.sum() == (2 ** 16) - 1 - assert df.c2.sum() == (2 ** 32) - 1 - engine = wr.catalog.get_engine("aws-data-wrangler-redshift") - df = wr.db.read_sql_table(con=engine, table=table, schema=external_schema) - assert df.c0.sum() == (2 ** 8) - 1 - assert df.c1.sum() == (2 ** 16) - 1 - assert df.c2.sum() == (2 ** 32) - 1 - - df = pd.DataFrame({"c0": [0, 0, (2 ** 64) - 1]}) - df["c0"] = df.c0.astype("uint64") - with pytest.raises(wr.exceptions.UnsupportedType): - wr.s3.to_parquet(df=df, path=path, dataset=True, database=database, table=table, mode="overwrite") - - wr.s3.delete_objects(path=path) - wr.catalog.delete_table_if_exists(database=database, table=table) + with pytest.raises(wr.exceptions.InvalidArgumentValue): + wr.catalog.create_parquet_table( + database=database, table=table, path=path, columns_types={"col0": "string"}, mode="append" + ) + assert wr.catalog.does_table_exist(database=database, table=table) is True + assert wr.catalog.delete_table_if_exists(database=database, table=table) is True + assert wr.catalog.delete_table_if_exists(database=database, table=table) is False + wr.catalog.create_parquet_table( + database=database, + table=table, + path=path, + columns_types={"col0": "int", "col1": "double"}, + partitions_types={"y": "int", "m": "int"}, + compression="snappy", + description="Foo boo bar", + parameters={"tag": "test"}, + columns_comments={"col0": "my int", "y": "year"}, + mode="overwrite", + ) + wr.catalog.add_parquet_partitions( + database=database, + table=table, + partitions_values={f"{path}y=2020/m=1/": ["2020", "1"], f"{path}y=2021/m=2/": ["2021", "2"]}, + compression="snappy", + ) + assert wr.catalog.get_table_location(database=database, table=table) == path + partitions_values = wr.catalog.get_parquet_partitions(database=database, table=table) + assert len(partitions_values) == 2 + 
partitions_values = wr.catalog.get_parquet_partitions( + database=database, table=table, catalog_id=account_id, expression="y = 2021 AND m = 2" + ) + assert len(partitions_values) == 1 + assert len(set(partitions_values[f"{path}y=2021/m=2/"]) & {"2021", "2"}) == 2 + dtypes = wr.catalog.get_table_types(database=database, table=table) + assert dtypes["col0"] == "int" + assert dtypes["col1"] == "double" + assert dtypes["y"] == "int" + assert dtypes["m"] == "int" + df_dbs = wr.catalog.databases() + assert len(wr.catalog.databases(catalog_id=account_id)) == len(df_dbs) + assert database in df_dbs["Database"].to_list() + tables = list(wr.catalog.get_tables()) + assert len(tables) > 0 + for tbl in tables: + if tbl["Name"] == table: + assert tbl["TableType"] == "EXTERNAL_TABLE" + tables = list(wr.catalog.get_tables(database=database)) + assert len(tables) > 0 + for tbl in tables: + assert tbl["DatabaseName"] == database + # search + tables = list(wr.catalog.search_tables(text="parquet", catalog_id=account_id)) + assert len(tables) > 0 + for tbl in tables: + if tbl["Name"] == table: + assert tbl["TableType"] == "EXTERNAL_TABLE" + # prefix + tables = list(wr.catalog.get_tables(name_prefix=table[:4], catalog_id=account_id)) + assert len(tables) > 0 + for tbl in tables: + if tbl["Name"] == table: + assert tbl["TableType"] == "EXTERNAL_TABLE" + # suffix + tables = list(wr.catalog.get_tables(name_suffix=table[-4:], catalog_id=account_id)) + assert len(tables) > 0 + for tbl in tables: + if tbl["Name"] == table: + assert tbl["TableType"] == "EXTERNAL_TABLE" + # name_contains + tables = list(wr.catalog.get_tables(name_contains=table[4:-4], catalog_id=account_id)) + assert len(tables) > 0 + for tbl in tables: + if tbl["Name"] == table: + assert tbl["TableType"] == "EXTERNAL_TABLE" + # prefix & suffix & name_contains + tables = list( + wr.catalog.get_tables( + name_prefix=table[0], name_contains=table[3], name_suffix=table[-1], catalog_id=account_id + ) + ) + assert len(tables) > 0 + for tbl in tables: + if tbl["Name"] == table: + assert tbl["TableType"] == "EXTERNAL_TABLE" + # prefix & suffix + tables = list(wr.catalog.get_tables(name_prefix=table[0], name_suffix=table[-1], catalog_id=account_id)) + assert len(tables) > 0 + for tbl in tables: + if tbl["Name"] == table: + assert tbl["TableType"] == "EXTERNAL_TABLE" + # DataFrames + assert len(wr.catalog.databases().index) > 0 + assert len(wr.catalog.tables().index) > 0 + assert ( + len( + wr.catalog.tables( + database=database, + search_text="parquet", + name_prefix=table[0], + name_contains=table[3], + name_suffix=table[-1], + catalog_id=account_id, + ).index + ) + > 0 + ) + assert len(wr.catalog.table(database=database, table=table).index) > 0 + assert len(wr.catalog.table(database=database, table=table, catalog_id=account_id).index) > 0 + with pytest.raises(wr.exceptions.InvalidTable): + wr.catalog.overwrite_table_parameters({"foo": "boo"}, database, "fake_table") -def test_parquet_uint64(bucket): - path = f"s3://{bucket}/test_parquet_uint64/" - wr.s3.delete_objects(path=path) - df = pd.DataFrame( - { - "c0": [0, 0, (2 ** 8) - 1], - "c1": [0, 0, (2 ** 16) - 1], - "c2": [0, 0, (2 ** 32) - 1], - "c3": [0, 0, (2 ** 64) - 1], - "c4": [0, 1, 2], - } - ) - print(df) - df["c0"] = df.c0.astype("uint8") - df["c1"] = df.c1.astype("uint16") - df["c2"] = df.c2.astype("uint32") - df["c3"] = df.c3.astype("uint64") - paths = wr.s3.to_parquet(df=df, path=path, dataset=True, mode="overwrite", partition_cols=["c4"])["paths"] - wr.s3.wait_objects_exist(paths=paths, 
use_threads=False) - df = wr.s3.read_parquet(path=path, dataset=True) - print(df) - print(df.dtypes) - assert len(df.index) == 3 - assert len(df.columns) == 5 - assert df.c0.max() == (2 ** 8) - 1 - assert df.c1.max() == (2 ** 16) - 1 - assert df.c2.max() == (2 ** 32) - 1 - assert df.c3.max() == (2 ** 64) - 1 - assert df.c4.astype("uint8").sum() == 3 - wr.s3.delete_objects(path=path) +def test_s3_get_bucket_region(bucket, region): + assert wr.s3.get_bucket_region(bucket=bucket) == region + assert wr.s3.get_bucket_region(bucket=bucket, boto3_session=boto3.Session()) == region -def test_parquet_overwrite_partition_cols(path, database, table, external_schema): - df = pd.DataFrame({"c0": [1, 2, 1, 2], "c1": [1, 2, 1, 2], "c2": [2, 1, 2, 1]}) +def test_catalog_get_databases(database): + dbs = list(wr.catalog.get_databases()) + assert len(dbs) > 0 + for db in dbs: + if db["Name"] == database: + assert db["Description"] == "AWS Data Wrangler Test Arena - Glue Database" - paths = wr.s3.to_parquet( - df=df, path=path, dataset=True, database=database, table=table, mode="overwrite", partition_cols=["c2"] - )["paths"] - wr.s3.wait_objects_exist(paths=paths, use_threads=False) - df = wr.athena.read_sql_table(table=table, database=database) - assert len(df.index) == 4 - assert len(df.columns) == 3 - assert df.c0.sum() == 6 - assert df.c1.sum() == 6 - assert df.c2.sum() == 6 - paths = wr.s3.to_parquet( - df=df, path=path, dataset=True, database=database, table=table, mode="overwrite", partition_cols=["c1", "c2"] - )["paths"] - wr.s3.wait_objects_exist(paths=paths, use_threads=False) - df = wr.athena.read_sql_table(table=table, database=database) - assert len(df.index) == 4 - assert len(df.columns) == 3 - assert df.c0.sum() == 6 - assert df.c1.sum() == 6 - assert df.c2.sum() == 6 +def test_athena_query_cancelled(database): + session = boto3.Session() + query_execution_id = wr.athena.start_query_execution(sql=get_query_long(), database=database, boto3_session=session) + wr.athena.stop_query_execution(query_execution_id=query_execution_id, boto3_session=session) + with pytest.raises(wr.exceptions.QueryCancelled): + assert wr.athena.wait_query(query_execution_id=query_execution_id) - engine = wr.catalog.get_engine("aws-data-wrangler-redshift") - df = wr.db.read_sql_table(con=engine, table=table, schema=external_schema) - assert len(df.index) == 4 - assert len(df.columns) == 3 - assert df.c0.sum() == 6 - assert df.c1.sum() == 6 - assert df.c2.sum() == 6 +def test_athena_query_failed(database): + query_execution_id = wr.athena.start_query_execution(sql="SELECT random(-1)", database=database) + with pytest.raises(wr.exceptions.QueryFailed): + assert wr.athena.wait_query(query_execution_id=query_execution_id) -def test_catalog_parameters(bucket, database): - table = "test_catalog_parameters" - path = f"s3://{bucket}/{table}/" - wr.s3.delete_objects(path=path) - wr.catalog.delete_table_if_exists(database=database, table=table) - wr.s3.to_parquet( - df=pd.DataFrame({"c0": [1, 2]}), - path=path, - dataset=True, - database=database, - table=table, - mode="overwrite", - parameters={"a": "1", "b": "2"}, - ) - pars = wr.catalog.get_table_parameters(database=database, table=table) - assert pars["a"] == "1" - assert pars["b"] == "2" - pars["a"] = "0" - pars["c"] = "3" - wr.catalog.upsert_table_parameters(parameters=pars, database=database, table=table) - pars = wr.catalog.get_table_parameters(database=database, table=table) - assert pars["a"] == "0" - assert pars["b"] == "2" - assert pars["c"] == "3" - 
wr.catalog.overwrite_table_parameters(parameters={"d": "4"}, database=database, table=table) - pars = wr.catalog.get_table_parameters(database=database, table=table) - assert pars.get("a") is None - assert pars.get("b") is None - assert pars.get("c") is None - assert pars["d"] == "4" - df = wr.athena.read_sql_table(table=table, database=database) - assert len(df.index) == 2 - assert len(df.columns) == 1 - assert df.c0.sum() == 3 +def test_athena_read_list(database): + with pytest.raises(wr.exceptions.UnsupportedType): + wr.athena.read_sql_query(sql="SELECT ARRAY[1, 2, 3]", database=database, ctas_approach=False) - wr.s3.to_parquet( - df=pd.DataFrame({"c0": [3, 4]}), - path=path, - dataset=True, - database=database, - table=table, - mode="append", - parameters={"e": "5"}, - ) - pars = wr.catalog.get_table_parameters(database=database, table=table) - assert pars.get("a") is None - assert pars.get("b") is None - assert pars.get("c") is None - assert pars["d"] == "4" - assert pars["e"] == "5" - df = wr.athena.read_sql_table(table=table, database=database) - assert len(df.index) == 4 - assert len(df.columns) == 1 - assert df.c0.sum() == 10 - wr.s3.delete_objects(path=path) - wr.catalog.delete_table_if_exists(database=database, table=table) +def test_sanitize_names(): + assert wr.catalog.sanitize_column_name("CamelCase") == "camel_case" + assert wr.catalog.sanitize_column_name("CamelCase2") == "camel_case2" + assert wr.catalog.sanitize_column_name("Camel_Case3") == "camel_case3" + assert wr.catalog.sanitize_column_name("Cámël_Casë4仮") == "camel_case4_" + assert wr.catalog.sanitize_column_name("Camel__Case5") == "camel__case5" + assert wr.catalog.sanitize_column_name("Camel{}Case6") == "camel_case6" + assert wr.catalog.sanitize_column_name("Camel.Case7") == "camel_case7" + assert wr.catalog.sanitize_column_name("xyz_cd") == "xyz_cd" + assert wr.catalog.sanitize_column_name("xyz_Cd") == "xyz_cd" + assert wr.catalog.sanitize_table_name("CamelCase") == "camel_case" + assert wr.catalog.sanitize_table_name("CamelCase2") == "camel_case2" + assert wr.catalog.sanitize_table_name("Camel_Case3") == "camel_case3" + assert wr.catalog.sanitize_table_name("Cámël_Casë4仮") == "camel_case4_" + assert wr.catalog.sanitize_table_name("Camel__Case5") == "camel__case5" + assert wr.catalog.sanitize_table_name("Camel{}Case6") == "camel_case6" + assert wr.catalog.sanitize_table_name("Camel.Case7") == "camel_case7" + assert wr.catalog.sanitize_table_name("xyz_cd") == "xyz_cd" + assert wr.catalog.sanitize_table_name("xyz_Cd") == "xyz_cd" -def test_metadata_partitions(path): - path = f"{path}0.parquet" - df = pd.DataFrame({"c0": [0, 1, 2], "c1": ["3", "4", "5"], "c2": [6.0, 7.0, 8.0]}) - paths = wr.s3.to_parquet(df=df, path=path, dataset=False)["paths"] - wr.s3.wait_objects_exist(paths=paths, use_threads=False) - columns_types, partitions_types = wr.s3.read_parquet_metadata(path=path, dataset=False) - assert len(columns_types) == len(df.columns) - assert columns_types.get("c0") == "bigint" - assert columns_types.get("c1") == "string" - assert columns_types.get("c2") == "double" +def test_athena_ctas_empty(database): + sql = """ + WITH dataset AS ( + SELECT 0 AS id + ) + SELECT id + FROM dataset + WHERE id != 0 + """ + assert wr.athena.read_sql_query(sql=sql, database=database).empty is True + assert len(list(wr.athena.read_sql_query(sql=sql, database=database, chunksize=1))) == 0 -@pytest.mark.parametrize("partition_cols", [None, ["c2"], ["c1", "c2"]]) -def test_metadata_partitions_dataset(path, partition_cols): - df = 
pd.DataFrame({"c0": [0, 1, 2], "c1": [3, 4, 5], "c2": [6, 7, 8]}) - paths = wr.s3.to_parquet(df=df, path=path, dataset=True, partition_cols=partition_cols)["paths"] - wr.s3.wait_objects_exist(paths=paths, use_threads=False) - columns_types, partitions_types = wr.s3.read_parquet_metadata(path=path, dataset=True) - partitions_types = partitions_types if partitions_types is not None else {} - assert len(columns_types) + len(partitions_types) == len(df.columns) - assert columns_types.get("c0") == "bigint" - assert (columns_types.get("c1") == "bigint") or (partitions_types.get("c1") == "string") - assert (columns_types.get("c1") == "bigint") or (partitions_types.get("c1") == "string") +def test_s3_empty_dfs(): + df = pd.DataFrame() + with pytest.raises(wr.exceptions.EmptyDataFrame): + wr.s3.to_parquet(df=df, path="") + with pytest.raises(wr.exceptions.EmptyDataFrame): + wr.s3.to_csv(df=df, path="") -@pytest.mark.parametrize("partition_cols", [None, ["c2"], ["c1", "c2"]]) -def test_store_metadata_partitions_dataset(database, table, path, partition_cols): - df = pd.DataFrame({"c0": [0, 1, 2], "c1": [3, 4, 5], "c2": [6, 7, 8]}) - paths = wr.s3.to_parquet(df=df, path=path, dataset=True, partition_cols=partition_cols)["paths"] - wr.s3.wait_objects_exist(paths=paths, use_threads=False) - wr.s3.store_parquet_metadata(path=path, database=database, table=table, dataset=True) - df2 = wr.athena.read_sql_table(table=table, database=database) - assert len(df.index) == len(df2.index) - assert len(df.columns) == len(df2.columns) - assert df.c0.sum() == df2.c0.sum() - assert df.c1.sum() == df2.c1.astype(int).sum() - assert df.c2.sum() == df2.c2.astype(int).sum() - - -def test_json_chunksize(path): - num_files = 10 - df = pd.DataFrame({"id": [1, 2, 3], "value": ["foo", "boo", "bar"]}) - paths = [f"{path}{i}.json" for i in range(num_files)] - for p in paths: - wr.s3.to_json(df, p, orient="records", lines=True) - wr.s3.wait_objects_exist(paths) - dfs = list(wr.s3.read_json(paths, lines=True, chunksize=1)) - assert len(dfs) == (3 * num_files) - for d in dfs: - assert len(d.columns) == 2 - assert d.id.iloc[0] in (1, 2, 3) - assert d.value.iloc[0] in ("foo", "boo", "bar") - - -def test_parquet_cast_string(path): - df = pd.DataFrame({"id": [1, 2, 3], "value": ["foo", "boo", "bar"]}) - path_file = f"{path}0.parquet" - wr.s3.to_parquet(df, path_file, dtype={"id": "string"}) - wr.s3.wait_objects_exist([path_file]) - df2 = wr.s3.read_parquet(path_file) - assert str(df2.id.dtypes) == "string" - df2["id"] = df2["id"].astype(int) - assert df.shape == df2.shape - for col, row in tuple(itertools.product(df.columns, range(3))): - assert df[col].iloc[row] == df2[col].iloc[row] +def test_absent_object(bucket): + path = f"s3://{bucket}/test_absent_object" + assert wr.s3.does_object_exist(path=path) is False + assert len(wr.s3.size_objects(path=path)) == 0 + assert wr.s3.wait_objects_exist(paths=[]) is None -@pytest.mark.parametrize("partition_cols", [None, ["c2"], ["value", "c2"]]) -def test_parquet_cast_string_dataset(path, partition_cols): - df = pd.DataFrame({"id": [1, 2, 3], "value": ["foo", "boo", "bar"], "c2": [4, 5, 6], "c3": [7.0, 8.0, 9.0]}) - paths = wr.s3.to_parquet( - df, path, dataset=True, partition_cols=partition_cols, dtype={"id": "string", "c3": "string"} - )["paths"] - wr.s3.wait_objects_exist(paths) - df2 = wr.s3.read_parquet(path, dataset=True).sort_values("id", ignore_index=True) - assert str(df2.id.dtypes) == "string" - assert str(df2.c3.dtypes) == "string" - df2["id"] = df2["id"].astype(int) - df2["c3"] = 
df2["c3"].astype(float) - assert df.shape == df2.shape - for col, row in tuple(itertools.product(df.columns, range(3))): - assert df[col].iloc[row] == df2[col].iloc[row] +def test_athena_struct(database): + sql = "SELECT CAST(ROW(1, 'foo') AS ROW(id BIGINT, value VARCHAR)) AS col0" + with pytest.raises(wr.exceptions.UnsupportedType): + wr.athena.read_sql_query(sql=sql, database=database, ctas_approach=False) + df = wr.athena.read_sql_query(sql=sql, database=database, ctas_approach=True) + assert len(df.index) == 1 + assert len(df.columns) == 1 + assert df["col0"].iloc[0]["id"] == 1 + assert df["col0"].iloc[0]["value"] == "foo" + sql = "SELECT ROW(1, ROW(2, ROW(3, '4'))) AS col0" + df = wr.athena.read_sql_query(sql=sql, database=database, ctas_approach=True) + assert len(df.index) == 1 + assert len(df.columns) == 1 + assert df["col0"].iloc[0]["field0"] == 1 + assert df["col0"].iloc[0]["field1"]["field0"] == 2 + assert df["col0"].iloc[0]["field1"]["field1"]["field0"] == 3 + assert df["col0"].iloc[0]["field1"]["field1"]["field1"] == "4" -@pytest.mark.parametrize("partition_cols", [None, ["c2"], ["c1", "c2"]]) -def test_store_metadata_partitions_sample_dataset(database, table, path, partition_cols): - num_files = 10 - df = pd.DataFrame({"c0": [0, 1, 2], "c1": [3, 4, 5], "c2": [6, 7, 8]}) - for _ in range(num_files): - paths = wr.s3.to_parquet(df=df, path=path, dataset=True, partition_cols=partition_cols)["paths"] - wr.s3.wait_objects_exist(paths=paths, use_threads=False) - wr.s3.store_parquet_metadata( - path=path, database=database, table=table, dtype={"c1": "bigint", "c2": "smallint"}, sampling=0.25, dataset=True - ) - df2 = wr.athena.read_sql_table(table=table, database=database) - assert len(df.index) * num_files == len(df2.index) - assert len(df.columns) == len(df2.columns) - assert df.c0.sum() * num_files == df2.c0.sum() - assert df.c1.sum() * num_files == df2.c1.sum() - assert df.c2.sum() * num_files == df2.c2.sum() +def test_athena_time_zone(database): + sql = "SELECT current_timestamp AS value, typeof(current_timestamp) AS type" + df = wr.athena.read_sql_query(sql=sql, database=database, ctas_approach=False) + assert len(df.index) == 1 + assert len(df.columns) == 2 + assert df["type"][0] == "timestamp with time zone" + assert df["value"][0].year == datetime.datetime.utcnow().year -def test_athena_undefined_column(database): - with pytest.raises(wr.exceptions.InvalidArgumentValue): - wr.athena.read_sql_query("SELECT 1", database) - with pytest.raises(wr.exceptions.InvalidArgumentValue): - wr.athena.read_sql_query("SELECT NULL AS my_null", database) +def test_category(bucket, database): + df = get_df_category() + path = f"s3://{bucket}/test_category/" + paths = wr.s3.to_parquet( + df=df, + path=path, + dataset=True, + database=database, + table="test_category", + mode="overwrite", + partition_cols=["par0", "par1"], + )["paths"] + wr.s3.wait_objects_exist(paths=paths, use_threads=False) + df2 = wr.s3.read_parquet(path=path, dataset=True, categories=[c for c in df.columns if c not in ["par0", "par1"]]) + ensure_data_types_category(df2) + df2 = wr.athena.read_sql_query("SELECT * FROM test_category", database=database, categories=list(df.columns)) + ensure_data_types_category(df2) + df2 = wr.athena.read_sql_table(table="test_category", database=database, categories=list(df.columns)) + ensure_data_types_category(df2) + df2 = wr.athena.read_sql_query( + "SELECT * FROM test_category", database=database, categories=list(df.columns), ctas_approach=False + ) + ensure_data_types_category(df2) + 
dfs = wr.athena.read_sql_query( + "SELECT * FROM test_category", database=database, categories=list(df.columns), ctas_approach=False, chunksize=1 + ) + for df2 in dfs: + ensure_data_types_category(df2) + dfs = wr.athena.read_sql_query( + "SELECT * FROM test_category", database=database, categories=list(df.columns), ctas_approach=True, chunksize=1 + ) + for df2 in dfs: + ensure_data_types_category(df2) + wr.s3.delete_objects(path=paths) + assert wr.catalog.delete_table_if_exists(database=database, table="test_category") is True -def test_to_parquet_file_sanitize(path): - df = pd.DataFrame({"C0": [0, 1], "camelCase": [2, 3], "c**--2": [4, 5]}) +def test_parquet_validate_schema(path): + df = pd.DataFrame({"id": [1, 2, 3]}) path_file = f"{path}0.parquet" - wr.s3.to_parquet(df, path_file) - wr.s3.wait_objects_exist([path_file]) - df2 = wr.s3.read_parquet(path_file) - assert df.shape == df2.shape - assert list(df2.columns) == ["c0", "camel_case", "c_2"] - assert df2.c0.sum() == 1 - assert df2.camel_case.sum() == 5 - assert df2.c_2.sum() == 9 - + wr.s3.to_parquet(df=df, path=path_file) + wr.s3.wait_objects_exist(paths=[path_file]) + df2 = pd.DataFrame({"id2": [1, 2, 3], "val": ["foo", "boo", "bar"]}) + path_file2 = f"{path}1.parquet" + wr.s3.to_parquet(df=df2, path=path_file2) + wr.s3.wait_objects_exist(paths=[path_file2], use_threads=False) + df3 = wr.s3.read_parquet(path=path, validate_schema=False) + assert len(df3.index) == 6 + assert len(df3.columns) == 3 + with pytest.raises(ValueError): + wr.s3.read_parquet(path=path, validate_schema=True) -def test_to_parquet_modes(database, table, path, external_schema): - # Round 1 - Warm up - df = pd.DataFrame({"c0": [0, None]}, dtype="Int64") - paths = wr.s3.to_parquet( +def test_csv_dataset(bucket, database): + path = f"s3://{bucket}/test_csv_dataset/" + with pytest.raises(wr.exceptions.UndetectedType): + wr.s3.to_csv(pd.DataFrame({"A": [None]}), path, dataset=True, database=database, table="test_csv_dataset") + df = get_df_csv() + with pytest.raises(wr.exceptions.InvalidArgumentCombination): + wr.s3.to_csv(df, path, dataset=False, mode="overwrite", database=database, table="test_csv_dataset") + with pytest.raises(wr.exceptions.InvalidArgumentCombination): + wr.s3.to_csv(df, path, dataset=False, table="test_csv_dataset") + with pytest.raises(wr.exceptions.InvalidArgumentCombination): + wr.s3.to_csv(df, path, dataset=True, mode="overwrite", database=database) + with pytest.raises(wr.exceptions.InvalidArgumentCombination): + wr.s3.to_csv(df=df, path=path, mode="append") + with pytest.raises(wr.exceptions.InvalidArgumentCombination): + wr.s3.to_csv(df=df, path=path, partition_cols=["col2"]) + with pytest.raises(wr.exceptions.InvalidArgumentCombination): + wr.s3.to_csv(df=df, path=path, description="foo") + with pytest.raises(wr.exceptions.InvalidArgumentValue): + wr.s3.to_csv(df=df, path=path, partition_cols=["col2"], dataset=True, mode="WRONG") + paths = wr.s3.to_csv( df=df, path=path, + sep="|", + index=False, + use_threads=True, + boto3_session=None, + s3_additional_kwargs=None, dataset=True, + partition_cols=["par0", "par1"], mode="overwrite", - database=database, - table=table, - description="c0", - parameters={"num_cols": str(len(df.columns)), "num_rows": str(len(df.index))}, - columns_comments={"c0": "0"}, )["paths"] wr.s3.wait_objects_exist(paths=paths) - df2 = wr.athena.read_sql_table(table, database) - assert df.shape == df2.shape - assert df.c0.sum() == df2.c0.sum() - parameters = wr.catalog.get_table_parameters(database, table) - assert 
len(parameters) >= 5 - assert parameters["num_cols"] == str(len(df2.columns)) - assert parameters["num_rows"] == str(len(df2.index)) - assert wr.catalog.get_table_description(database, table) == "c0" - comments = wr.catalog.get_columns_comments(database, table) - assert len(comments) == len(df.columns) - assert comments["c0"] == "0" + df2 = wr.s3.read_csv(path=paths, sep="|", header=None) + assert len(df2.index) == 3 + assert len(df2.columns) == 8 + assert df2[0].sum() == 6 + wr.s3.delete_objects(path=paths) - # Round 2 - Overwrite - df = pd.DataFrame({"c1": [None, 1, None]}, dtype="Int16") - paths = wr.s3.to_parquet( + +def test_csv_catalog(bucket, database): + path = f"s3://{bucket}/test_csv_catalog/" + df = get_df_csv() + paths = wr.s3.to_csv( df=df, path=path, + sep="\t", + index=True, + use_threads=True, + boto3_session=None, + s3_additional_kwargs=None, dataset=True, + partition_cols=["par0", "par1"], mode="overwrite", + table="test_csv_catalog", database=database, - table=table, - description="c1", - parameters={"num_cols": str(len(df.columns)), "num_rows": str(len(df.index))}, - columns_comments={"c1": "1"}, )["paths"] wr.s3.wait_objects_exist(paths=paths) - df2 = wr.athena.read_sql_table(table, database) - assert df.shape == df2.shape - assert df.c1.sum() == df2.c1.sum() - parameters = wr.catalog.get_table_parameters(database, table) - assert len(parameters) >= 5 - assert parameters["num_cols"] == str(len(df2.columns)) - assert parameters["num_rows"] == str(len(df2.index)) - assert wr.catalog.get_table_description(database, table) == "c1" - comments = wr.catalog.get_columns_comments(database, table) - assert len(comments) == len(df.columns) - assert comments["c1"] == "1" + df2 = wr.athena.read_sql_table("test_csv_catalog", database) + assert len(df2.index) == 3 + assert len(df2.columns) == 11 + assert df2["id"].sum() == 6 + ensure_data_types_csv(df2) + wr.s3.delete_objects(path=paths) + assert wr.catalog.delete_table_if_exists(database=database, table="test_csv_catalog") is True - # Round 3 - Append - df = pd.DataFrame({"c1": [None, 2, None]}, dtype="Int8") - paths = wr.s3.to_parquet( - df=df, + +def test_csv_catalog_columns(bucket, database): + path = f"s3://{bucket}/test_csv_catalog_columns /" + paths = wr.s3.to_csv( + df=get_df_csv(), path=path, + sep="|", + columns=["id", "date", "timestamp", "par0", "par1"], + index=False, + use_threads=False, + boto3_session=None, + s3_additional_kwargs=None, dataset=True, - mode="append", + partition_cols=["par0", "par1"], + mode="overwrite", + table="test_csv_catalog_columns", database=database, - table=table, - description="c1", - parameters={"num_cols": str(len(df.columns)), "num_rows": str(len(df.index) * 2)}, - columns_comments={"c1": "1"}, )["paths"] wr.s3.wait_objects_exist(paths=paths) - df2 = wr.athena.read_sql_table(table, database) - assert len(df.columns) == len(df2.columns) - assert len(df.index) * 2 == len(df2.index) - assert df.c1.sum() + 1 == df2.c1.sum() - parameters = wr.catalog.get_table_parameters(database, table) - assert len(parameters) >= 5 - assert parameters["num_cols"] == str(len(df2.columns)) - assert parameters["num_rows"] == str(len(df2.index)) - assert wr.catalog.get_table_description(database, table) == "c1" - comments = wr.catalog.get_columns_comments(database, table) - assert len(comments) == len(df.columns) - assert comments["c1"] == "1" + df2 = wr.athena.read_sql_table("test_csv_catalog_columns", database) + assert len(df2.index) == 3 + assert len(df2.columns) == 5 + assert df2["id"].sum() == 6 + 
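# Validate that Athena returns the selected CSV columns with the expected dtypes.
+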
ensure_data_types_csv(df2) - # Round 4 - Append + New Column - df = pd.DataFrame({"c2": ["a", None, "b"], "c1": [None, None, None]}) - paths = wr.s3.to_parquet( - df=df, + paths = wr.s3.to_csv( + df=pd.DataFrame({"id": [4], "date": [None], "timestamp": [None], "par0": [1], "par1": ["a"]}), path=path, + sep="|", + index=False, + use_threads=False, + boto3_session=None, + s3_additional_kwargs=None, dataset=True, - mode="append", + partition_cols=["par0", "par1"], + mode="overwrite_partitions", + table="test_csv_catalog_columns", database=database, - table=table, - description="c1+c2", - parameters={"num_cols": "2", "num_rows": "9"}, - columns_comments={"c1": "1", "c2": "2"}, )["paths"] wr.s3.wait_objects_exist(paths=paths) - df2 = wr.athena.read_sql_table(table, database) - assert len(df2.columns) == 2 - assert len(df2.index) == 9 - assert df2.c1.sum() == 3 - parameters = wr.catalog.get_table_parameters(database, table) - assert len(parameters) >= 5 - assert parameters["num_cols"] == "2" - assert parameters["num_rows"] == "9" - assert wr.catalog.get_table_description(database, table) == "c1+c2" - comments = wr.catalog.get_columns_comments(database, table) - assert len(comments) == len(df.columns) - assert comments["c1"] == "1" - assert comments["c2"] == "2" + df2 = wr.athena.read_sql_table("test_csv_catalog_columns", database) + assert len(df2.index) == 3 + assert len(df2.columns) == 5 + assert df2["id"].sum() == 9 + ensure_data_types_csv(df2) - # Round 5 - Append + New Column + Wrong Types - df = pd.DataFrame({"c2": [1], "c3": [True], "c1": ["1"]}) - paths = wr.s3.to_parquet( + wr.s3.delete_objects(path=path) + assert wr.catalog.delete_table_if_exists(database=database, table="test_csv_catalog_columns") is True + + +def test_athena_types(bucket, database): + path = f"s3://{bucket}/test_athena_types/" + df = get_df_csv() + paths = wr.s3.to_csv( df=df, path=path, + sep=",", + index=False, + use_threads=True, + boto3_session=None, + s3_additional_kwargs=None, dataset=True, - mode="append", - database=database, - table=table, - description="c1+c2+c3", - parameters={"num_cols": "3", "num_rows": "10"}, - columns_comments={"c1": "1!", "c2": "2!", "c3": "3"}, + partition_cols=["par0", "par1"], + mode="overwrite", )["paths"] wr.s3.wait_objects_exist(paths=paths) - df2 = wr.athena.read_sql_table(table, database) - assert len(df2.columns) == 3 - assert len(df2.index) == 10 - assert df2.c1.sum() == 4 - parameters = wr.catalog.get_table_parameters(database, table) - assert len(parameters) >= 5 - assert parameters["num_cols"] == "3" - assert parameters["num_rows"] == "10" - assert wr.catalog.get_table_description(database, table) == "c1+c2+c3" - comments = wr.catalog.get_columns_comments(database, table) - assert len(comments) == len(df.columns) - assert comments["c1"] == "1!" - assert comments["c2"] == "2!" 
- assert comments["c3"] == "3" - engine = wr.catalog.get_engine("aws-data-wrangler-redshift") - df3 = wr.db.read_sql_table(con=engine, table=table, schema=external_schema) - assert len(df3.columns) == 3 - assert len(df3.index) == 10 - assert df3.c1.sum() == 4 + columns_types, partitions_types = wr.catalog.extract_athena_types( + df=df, index=False, partition_cols=["par0", "par1"], file_format="csv" + ) + wr.catalog.create_csv_table( + table="test_athena_types", + database=database, + path=path, + partitions_types=partitions_types, + columns_types=columns_types, + ) + wr.catalog.create_csv_table( + database=database, table="test_athena_types", path=path, columns_types={"col0": "string"}, mode="append" + ) + wr.athena.repair_table("test_athena_types", database) + assert len(wr.catalog.get_csv_partitions(database, "test_athena_types")) == 3 + df2 = wr.athena.read_sql_table("test_athena_types", database) + assert len(df2.index) == 3 + assert len(df2.columns) == 10 + assert df2["id"].sum() == 6 + ensure_data_types_csv(df2) + wr.s3.delete_objects(path=paths) + assert wr.catalog.delete_table_if_exists(database=database, table="test_athena_types") is True - # Round 6 - Overwrite Partitioned - df = pd.DataFrame({"c0": ["foo", None], "c1": [0, 1]}) + +def test_parquet_catalog_columns(bucket, database): + path = f"s3://{bucket}/test_parquet_catalog_columns/" paths = wr.s3.to_parquet( - df=df, + df=get_df_csv()[["id", "date", "timestamp", "par0", "par1"]], path=path, + index=False, + use_threads=False, + boto3_session=None, + s3_additional_kwargs=None, dataset=True, + partition_cols=["par0", "par1"], mode="overwrite", + table="test_parquet_catalog_columns", database=database, - table=table, - partition_cols=["c1"], - description="c0+c1", - parameters={"num_cols": "2", "num_rows": "2"}, - columns_comments={"c0": "zero", "c1": "one"}, )["paths"] wr.s3.wait_objects_exist(paths=paths) - df2 = wr.athena.read_sql_table(table, database) - assert df.shape == df2.shape - assert df.c1.sum() == df2.c1.sum() - parameters = wr.catalog.get_table_parameters(database, table) - assert len(parameters) >= 5 - assert parameters["num_cols"] == "2" - assert parameters["num_rows"] == "2" - assert wr.catalog.get_table_description(database, table) == "c0+c1" - comments = wr.catalog.get_columns_comments(database, table) - assert len(comments) == len(df.columns) - assert comments["c0"] == "zero" - assert comments["c1"] == "one" + df2 = wr.athena.read_sql_table("test_parquet_catalog_columns", database) + assert len(df2.index) == 3 + assert len(df2.columns) == 5 + assert df2["id"].sum() == 6 + ensure_data_types_csv(df2) - # Round 7 - Overwrite Partitions - df = pd.DataFrame({"c0": [None, None], "c1": [0, 2]}) paths = wr.s3.to_parquet( - df=df, + df=pd.DataFrame({"id": [4], "date": [None], "timestamp": [None], "par0": [1], "par1": ["a"]}), path=path, + index=False, + use_threads=False, + boto3_session=None, + s3_additional_kwargs=None, dataset=True, + partition_cols=["par0", "par1"], mode="overwrite_partitions", + table="test_parquet_catalog_columns", database=database, - table=table, - partition_cols=["c1"], - description="c0+c1", - parameters={"num_cols": "2", "num_rows": "3"}, - columns_comments={"c0": "zero", "c1": "one"}, )["paths"] wr.s3.wait_objects_exist(paths=paths) - df2 = wr.athena.read_sql_table(table, database) - assert len(df2.columns) == 2 + df2 = wr.athena.read_sql_table("test_parquet_catalog_columns", database) assert len(df2.index) == 3 - assert df2.c1.sum() == 3 - parameters = 
wr.catalog.get_table_parameters(database, table) - assert len(parameters) >= 5 - assert parameters["num_cols"] == "2" - assert parameters["num_rows"] == "3" - assert wr.catalog.get_table_description(database, table) == "c0+c1" - comments = wr.catalog.get_columns_comments(database, table) - assert len(comments) == len(df.columns) - assert comments["c0"] == "zero" - assert comments["c1"] == "one" + assert len(df2.columns) == 5 + assert df2["id"].sum() == 9 + ensure_data_types_csv(df2) - # Round 8 - Overwrite Partitions + New Column + Wrong Type - df = pd.DataFrame({"c0": [1, 2], "c1": ["1", "3"], "c2": [True, False]}) + wr.s3.delete_objects(path=path) + assert wr.catalog.delete_table_if_exists(database=database, table="test_parquet_catalog_columns") is True + + +@pytest.mark.parametrize("compression", [None, "gzip", "snappy"]) +def test_parquet_compress(bucket, database, compression): + path = f"s3://{bucket}/test_parquet_compress_{compression}/" paths = wr.s3.to_parquet( - df=df, + df=get_df(), path=path, + compression=compression, dataset=True, - mode="overwrite_partitions", database=database, - table=table, - partition_cols=["c1"], - description="c0+c1+c2", - parameters={"num_cols": "3", "num_rows": "4"}, - columns_comments={"c0": "zero", "c1": "one", "c2": "two"}, + table=f"test_parquet_compress_{compression}", + mode="overwrite", )["paths"] wr.s3.wait_objects_exist(paths=paths) - df2 = wr.athena.read_sql_table(table, database) + df2 = wr.athena.read_sql_table(f"test_parquet_compress_{compression}", database) + ensure_data_types(df2) + df2 = wr.s3.read_parquet(path=path) + wr.s3.delete_objects(path=path) + assert wr.catalog.delete_table_if_exists(database=database, table=f"test_parquet_compress_{compression}") is True + ensure_data_types(df2) + + +@pytest.mark.parametrize("compression", ["gzip", "bz2", "xz"]) +def test_csv_compress(bucket, compression): + path = f"s3://{bucket}/test_csv_compress_{compression}/" + wr.s3.delete_objects(path=path) + df = get_df_csv() + if compression == "gzip": + buffer = BytesIO() + with gzip.GzipFile(mode="w", fileobj=buffer) as zipped_file: + df.to_csv(TextIOWrapper(zipped_file, "utf8"), index=False, header=None) + s3_resource = boto3.resource("s3") + s3_object = s3_resource.Object(bucket, f"test_csv_compress_{compression}/test.csv.gz") + s3_object.put(Body=buffer.getvalue()) + file_path = f"s3://{bucket}/test_csv_compress_{compression}/test.csv.gz" + elif compression == "bz2": + buffer = BytesIO() + with bz2.BZ2File(mode="w", filename=buffer) as zipped_file: + df.to_csv(TextIOWrapper(zipped_file, "utf8"), index=False, header=None) + s3_resource = boto3.resource("s3") + s3_object = s3_resource.Object(bucket, f"test_csv_compress_{compression}/test.csv.bz2") + s3_object.put(Body=buffer.getvalue()) + file_path = f"s3://{bucket}/test_csv_compress_{compression}/test.csv.bz2" + elif compression == "xz": + buffer = BytesIO() + with lzma.LZMAFile(mode="w", filename=buffer) as zipped_file: + df.to_csv(TextIOWrapper(zipped_file, "utf8"), index=False, header=None) + s3_resource = boto3.resource("s3") + s3_object = s3_resource.Object(bucket, f"test_csv_compress_{compression}/test.csv.xz") + s3_object.put(Body=buffer.getvalue()) + file_path = f"s3://{bucket}/test_csv_compress_{compression}/test.csv.xz" + else: + file_path = f"s3://{bucket}/test_csv_compress_{compression}/test.csv" + wr.s3.to_csv(df=df, path=file_path, index=False, header=None) + + wr.s3.wait_objects_exist(paths=[file_path]) + df2 = wr.s3.read_csv(path=[file_path], names=df.columns) + assert 
len(df2.index) == 3 + assert len(df2.columns) == 10 + dfs = wr.s3.read_csv(path=[file_path], names=df.columns, chunksize=1) + for df3 in dfs: + assert len(df3.columns) == 10 + wr.s3.delete_objects(path=path) + + +def test_parquet_char_length(path, database, table, external_schema): + df = pd.DataFrame( + {"id": [1, 2], "cchar": ["foo", "boo"], "date": [datetime.date(2020, 1, 1), datetime.date(2020, 1, 2)]} + ) + wr.s3.to_parquet( + df=df, + path=path, + dataset=True, + database=database, + table=table, + mode="overwrite", + partition_cols=["date"], + dtype={"cchar": "char(3)"}, + ) + + df2 = wr.s3.read_parquet(path, dataset=True) + assert len(df2.index) == 2 assert len(df2.columns) == 3 - assert len(df2.index) == 4 - assert df2.c1.sum() == 6 - parameters = wr.catalog.get_table_parameters(database, table) - assert len(parameters) >= 5 - assert parameters["num_cols"] == "3" - assert parameters["num_rows"] == "4" - assert wr.catalog.get_table_description(database, table) == "c0+c1+c2" - comments = wr.catalog.get_columns_comments(database, table) - assert len(comments) == len(df.columns) - assert comments["c0"] == "zero" - assert comments["c1"] == "one" - assert comments["c2"] == "two" + assert df2.id.sum() == 3 + + df2 = wr.athena.read_sql_table(table=table, database=database) + assert len(df2.index) == 2 + assert len(df2.columns) == 3 + assert df2.id.sum() == 3 + engine = wr.catalog.get_engine("aws-data-wrangler-redshift") - df3 = wr.db.read_sql_table(con=engine, table=table, schema=external_schema) - assert len(df3.columns) == 3 - assert len(df3.index) == 4 - assert df3.c1.sum() == 6 + df2 = wr.db.read_sql_table(con=engine, table=table, schema=external_schema) + assert len(df2.index) == 2 + assert len(df2.columns) == 3 + assert df2.id.sum() == 3 + + +def test_merge(bucket): + path = f"s3://{bucket}/test_merge/" + df = pd.DataFrame({"id": [1, 2, 3], "par": [1, 2, 3]}) + paths = wr.s3.to_parquet(df=df, path=path, dataset=True, partition_cols=["par"], mode="overwrite")["paths"] + wr.s3.wait_objects_exist(paths=paths) + df = wr.s3.read_parquet(path=path, dataset=True) + assert df.id.sum() == 6 + assert df.par.astype("Int64").sum() == 6 + + path2 = f"s3://{bucket}/test_merge2/" + df = pd.DataFrame({"id": [1, 2, 3], "par": [1, 2, 3]}) + paths = wr.s3.to_parquet(df=df, path=path2, dataset=True, partition_cols=["par"], mode="overwrite")["paths"] + wr.s3.wait_objects_exist(paths=paths) + paths = wr.s3.merge_datasets(source_path=path2, target_path=path, mode="append", use_threads=True) + wr.s3.wait_objects_exist(paths=paths, use_threads=False) + df = wr.s3.read_parquet(path=path, dataset=True) + assert df.id.sum() == 12 + assert df.par.astype("Int64").sum() == 12 + + paths = wr.s3.merge_datasets(source_path=path2, target_path=path, mode="overwrite", use_threads=False) + wr.s3.wait_objects_exist(paths=paths, use_threads=False) + df = wr.s3.read_parquet(path=path, dataset=True) + assert df.id.sum() == 6 + assert df.par.astype("Int64").sum() == 6 + + df = pd.DataFrame({"id": [4], "par": [3]}) + paths = wr.s3.to_parquet(df=df, path=path2, dataset=True, partition_cols=["par"], mode="overwrite")["paths"] + wr.s3.wait_objects_exist(paths=paths) + paths = wr.s3.merge_datasets(source_path=path2, target_path=path, mode="overwrite_partitions", use_threads=True) + wr.s3.wait_objects_exist(paths=paths, use_threads=False) + df = wr.s3.read_parquet(path=path, dataset=True) + assert df.id.sum() == 7 + assert df.par.astype("Int64").sum() == 6 + + with pytest.raises(wr.exceptions.InvalidArgumentValue): + 
wr.s3.merge_datasets(source_path=path, target_path="bar", mode="WRONG") + + assert len(wr.s3.merge_datasets(source_path=f"s3://{bucket}/empty/", target_path="bar")) == 0 + + wr.s3.delete_objects(path=path) + wr.s3.delete_objects(path=path2) + + +def test_copy(bucket): + path = f"s3://{bucket}/test_copy/" + df = pd.DataFrame({"id": [1, 2, 3], "par": [1, 2, 3]}) + paths = wr.s3.to_parquet(df=df, path=path, dataset=True, partition_cols=["par"], mode="overwrite")["paths"] + wr.s3.wait_objects_exist(paths=paths) + df = wr.s3.read_parquet(path=path, dataset=True) + assert df.id.sum() == 6 + assert df.par.astype("Int64").sum() == 6 + + path2 = f"s3://{bucket}/test_copy2/" + df = pd.DataFrame({"id": [1, 2, 3], "par": [1, 2, 3]}) + paths = wr.s3.to_parquet(df=df, path=path2, dataset=True, partition_cols=["par"], mode="overwrite")["paths"] + wr.s3.wait_objects_exist(paths=paths) + paths = wr.s3.copy_objects(paths, source_path=path2, target_path=path, use_threads=True) + wr.s3.wait_objects_exist(paths=paths, use_threads=False) + df = wr.s3.read_parquet(path=path, dataset=True) + assert df.id.sum() == 12 + assert df.par.astype("Int64").sum() == 12 + + assert len(wr.s3.copy_objects([], source_path="boo", target_path="bar")) == 0 + + wr.s3.delete_objects(path=path) + wr.s3.delete_objects(path=path2) + + +@pytest.mark.parametrize("col2", [[1, 1, 1, 1, 1], [1, 2, 3, 4, 5], [1, 1, 1, 1, 2], [1, 2, 2, 2, 2]]) +@pytest.mark.parametrize("chunked", [True, 1, 2, 100]) +def test_parquet_chunked(bucket, database, col2, chunked): + table = f"test_parquet_chunked_{chunked}_{''.join([str(x) for x in col2])}" + path = f"s3://{bucket}/{table}/" + wr.s3.delete_objects(path=path) + values = list(range(5)) + df = pd.DataFrame({"col1": values, "col2": col2}) + paths = wr.s3.to_parquet( + df, path, index=False, dataset=True, database=database, table=table, partition_cols=["col2"], mode="overwrite" + )["paths"] + wr.s3.wait_objects_exist(paths=paths) + dfs = list(wr.s3.read_parquet(path=path, dataset=True, chunked=chunked)) + assert sum(values) == pd.concat(dfs, ignore_index=True).col1.sum() + if chunked is not True: + assert len(dfs) == int(math.ceil(len(df) / chunked)) + for df2 in dfs[:-1]: + assert chunked == len(df2) + assert chunked >= len(dfs[-1]) + else: + assert len(dfs) == len(set(col2)) -def test_store_parquet_metadata_modes(database, table, path, external_schema): + dfs = list(wr.athena.read_sql_table(database=database, table=table, chunksize=chunked)) + assert sum(values) == pd.concat(dfs, ignore_index=True).col1.sum() + if chunked is not True: + assert len(dfs) == int(math.ceil(len(df) / chunked)) + for df2 in dfs[:-1]: + assert chunked == len(df2) + assert chunked >= len(dfs[-1]) - # Round 1 - Warm up - df = pd.DataFrame({"c0": [0, None]}, dtype="Int64") - paths = wr.s3.to_parquet(df=df, path=path, dataset=True, mode="overwrite")["paths"] - wr.s3.wait_objects_exist(paths=paths) - wr.s3.store_parquet_metadata( - path=path, - dataset=True, - mode="overwrite", - database=database, - table=table, - description="c0", - parameters={"num_cols": str(len(df.columns)), "num_rows": str(len(df.index))}, - columns_comments={"c0": "0"}, - ) - df2 = wr.athena.read_sql_table(table, database) - assert df.shape == df2.shape - assert df.c0.sum() == df2.c0.sum() - parameters = wr.catalog.get_table_parameters(database, table) - assert len(parameters) >= 5 - assert parameters["num_cols"] == str(len(df2.columns)) - assert parameters["num_rows"] == str(len(df2.index)) - assert wr.catalog.get_table_description(database, table) == 
"c0" - comments = wr.catalog.get_columns_comments(database, table) - assert len(comments) == len(df.columns) - assert comments["c0"] == "0" + wr.s3.delete_objects(path=paths) + assert wr.catalog.delete_table_if_exists(database=database, table=table) is True - # Round 2 - Overwrite - df = pd.DataFrame({"c1": [None, 1, None]}, dtype="Int16") - paths = wr.s3.to_parquet(df=df, path=path, dataset=True, mode="overwrite")["paths"] - wr.s3.wait_objects_exist(paths=paths) - wr.s3.store_parquet_metadata( - path=path, - dataset=True, - mode="overwrite", - database=database, - table=table, - description="c1", - parameters={"num_cols": str(len(df.columns)), "num_rows": str(len(df.index))}, - columns_comments={"c1": "1"}, - ) - df2 = wr.athena.read_sql_table(table, database) - assert df.shape == df2.shape - assert df.c1.sum() == df2.c1.sum() - parameters = wr.catalog.get_table_parameters(database, table) - assert len(parameters) >= 5 - assert parameters["num_cols"] == str(len(df2.columns)) - assert parameters["num_rows"] == str(len(df2.index)) - assert wr.catalog.get_table_description(database, table) == "c1" - comments = wr.catalog.get_columns_comments(database, table) - assert len(comments) == len(df.columns) - assert comments["c1"] == "1" - # Round 3 - Append - df = pd.DataFrame({"c1": [None, 2, None]}, dtype="Int16") - paths = wr.s3.to_parquet(df=df, path=path, dataset=True, mode="append")["paths"] - wr.s3.wait_objects_exist(paths=paths) - wr.s3.store_parquet_metadata( - path=path, - dataset=True, - mode="append", - database=database, +@pytest.mark.parametrize("workgroup", [None, 0, 1, 2, 3]) +@pytest.mark.parametrize("encryption", [None, "SSE_S3", "SSE_KMS"]) +# @pytest.mark.parametrize("workgroup", [3]) +# @pytest.mark.parametrize("encryption", [None]) +def test_athena_encryption( + path, path2, database, table, table2, kms_key, encryption, workgroup, workgroup0, workgroup1, workgroup2, workgroup3 +): + kms_key = None if (encryption == "SSE_S3") or (encryption is None) else kms_key + if workgroup == 0: + workgroup = workgroup0 + elif workgroup == 1: + workgroup = workgroup1 + elif workgroup == 2: + workgroup = workgroup2 + elif workgroup == 3: + workgroup = workgroup3 + df = pd.DataFrame({"a": [1, 2], "b": ["foo", "boo"]}) + paths = wr.s3.to_parquet( + df=df, path=path, dataset=True, mode="overwrite", database=database, table=table, s3_additional_kwargs=None + )["paths"] + wr.s3.wait_objects_exist(paths=paths, use_threads=False) + df2 = wr.athena.read_sql_table( table=table, - description="c1", - parameters={"num_cols": str(len(df.columns)), "num_rows": str(len(df.index) * 2)}, - columns_comments={"c1": "1"}, - ) - df2 = wr.athena.read_sql_table(table, database) - assert len(df.columns) == len(df2.columns) - assert len(df.index) * 2 == len(df2.index) - assert df.c1.sum() + 1 == df2.c1.sum() - parameters = wr.catalog.get_table_parameters(database, table) - assert len(parameters) >= 5 - assert parameters["num_cols"] == str(len(df2.columns)) - assert parameters["num_rows"] == str(len(df2.index)) - assert wr.catalog.get_table_description(database, table) == "c1" - comments = wr.catalog.get_columns_comments(database, table) - assert len(comments) == len(df.columns) - assert comments["c1"] == "1" - - # Round 4 - Append + New Column - df = pd.DataFrame({"c2": ["a", None, "b"], "c1": [None, 1, None]}) - df["c1"] = df["c1"].astype("Int16") - paths = wr.s3.to_parquet(df=df, path=path, dataset=True, mode="append")["paths"] - wr.s3.wait_objects_exist(paths=paths) - wr.s3.store_parquet_metadata( - path=path, - 
dataset=True, - mode="append", + ctas_approach=True, database=database, - table=table, - description="c1+c2", - parameters={"num_cols": "2", "num_rows": "9"}, - columns_comments={"c1": "1", "c2": "2"}, + encryption=encryption, + workgroup=workgroup, + kms_key=kms_key, + keep_files=True, + ctas_temp_table_name=table2, + s3_output=path2, ) - df2 = wr.athena.read_sql_table(table, database) + assert wr.catalog.does_table_exist(database=database, table=table2) is False + assert len(df2.index) == 2 assert len(df2.columns) == 2 - assert len(df2.index) == 9 - assert df2.c1.sum() == 4 - parameters = wr.catalog.get_table_parameters(database, table) - assert len(parameters) >= 5 - assert parameters["num_cols"] == "2" - assert parameters["num_rows"] == "9" - assert wr.catalog.get_table_description(database, table) == "c1+c2" - comments = wr.catalog.get_columns_comments(database, table) - assert len(comments) == len(df.columns) - assert comments["c1"] == "1" - assert comments["c2"] == "2" - # Round 5 - Overwrite Partitioned - df = pd.DataFrame({"c0": ["foo", None], "c1": [0, 1]}) - paths = wr.s3.to_parquet(df=df, path=path, dataset=True, mode="overwrite", partition_cols=["c1"])["paths"] - wr.s3.wait_objects_exist(paths=paths) - wr.s3.store_parquet_metadata( - path=path, - dataset=True, - mode="overwrite", - database=database, - table=table, - description="c0+c1", - parameters={"num_cols": "2", "num_rows": "2"}, - columns_comments={"c0": "zero", "c1": "one"}, - ) - df2 = wr.athena.read_sql_table(table, database) - assert df.shape == df2.shape - assert df.c1.sum() == df2.c1.astype(int).sum() - parameters = wr.catalog.get_table_parameters(database, table) - assert len(parameters) >= 5 - assert parameters["num_cols"] == "2" - assert parameters["num_rows"] == "2" - assert wr.catalog.get_table_description(database, table) == "c0+c1" - comments = wr.catalog.get_columns_comments(database, table) - assert len(comments) == len(df.columns) - assert comments["c0"] == "zero" - assert comments["c1"] == "one" - # Round 6 - Overwrite Partitions - df = pd.DataFrame({"c0": [None, "boo"], "c1": [0, 2]}) - paths = wr.s3.to_parquet(df=df, path=path, dataset=True, mode="overwrite_partitions", partition_cols=["c1"])[ - "paths" - ] +def test_athena_nested(path, database, table): + df = pd.DataFrame( + { + "c0": [[1, 2, 3], [4, 5, 6]], + "c1": [[[1, 2], [3, 4]], [[5, 6], [7, 8]]], + "c2": [[["a", "b"], ["c", "d"]], [["e", "f"], ["g", "h"]]], + "c3": [[], [[[[[[[[1]]]]]]]]], + "c4": [{"a": 1}, {"a": 1}], + "c5": [{"a": {"b": {"c": [1, 2]}}}, {"a": {"b": {"c": [3, 4]}}}], + } + ) + paths = wr.s3.to_parquet( + df=df, path=path, index=False, use_threads=True, dataset=True, mode="overwrite", database=database, table=table + )["paths"] wr.s3.wait_objects_exist(paths=paths) - wr.s3.store_parquet_metadata( + df2 = wr.athena.read_sql_query(sql=f"SELECT c0, c1, c2, c4 FROM {table}", database=database) + assert len(df2.index) == 2 + assert len(df2.columns) == 4 + + +def test_catalog_versioning(bucket, database): + table = "test_catalog_versioning" + wr.catalog.delete_table_if_exists(database=database, table=table) + path = f"s3://{bucket}/{table}/" + wr.s3.delete_objects(path=path) + + # Version 0 + df = pd.DataFrame({"c0": [1, 2]}) + paths = wr.s3.to_parquet(df=df, path=path, dataset=True, database=database, table=table, mode="overwrite")["paths"] + wr.s3.wait_objects_exist(paths=paths, use_threads=False) + df = wr.athena.read_sql_table(table=table, database=database) + assert len(df.index) == 2 + assert len(df.columns) == 1 + assert 
str(df.c0.dtype).startswith("Int") + + # Version 1 + df = pd.DataFrame({"c1": ["foo", "boo"]}) + paths1 = wr.s3.to_parquet( + df=df, path=path, dataset=True, database=database, table=table, mode="overwrite", catalog_versioning=True + )["paths"] + wr.s3.wait_objects_exist(paths=paths1, use_threads=False) + df = wr.athena.read_sql_table(table=table, database=database) + assert len(df.index) == 2 + assert len(df.columns) == 1 + assert str(df.c1.dtype) == "string" + + # Version 2 + df = pd.DataFrame({"c1": [1.0, 2.0]}) + paths2 = wr.s3.to_csv( + df=df, path=path, dataset=True, - mode="append", database=database, table=table, - description="c0+c1", - parameters={"num_cols": "2", "num_rows": "3"}, - columns_comments={"c0": "zero", "c1": "one"}, - ) - df2 = wr.athena.read_sql_table(table, database) - assert len(df2.columns) == 2 - assert len(df2.index) == 3 - assert df2.c1.astype(int).sum() == 3 - parameters = wr.catalog.get_table_parameters(database, table) - assert len(parameters) >= 5 - assert parameters["num_cols"] == "2" - assert parameters["num_rows"] == "3" - assert wr.catalog.get_table_description(database, table) == "c0+c1" - comments = wr.catalog.get_columns_comments(database, table) - assert len(comments) == len(df.columns) - assert comments["c0"] == "zero" - assert comments["c1"] == "one" + mode="overwrite", + catalog_versioning=True, + index=False, + )["paths"] + wr.s3.wait_objects_exist(paths=paths2, use_threads=False) + wr.s3.wait_objects_not_exist(paths=paths1, use_threads=False) + df = wr.athena.read_sql_table(table=table, database=database) + assert len(df.index) == 2 + assert len(df.columns) == 1 + assert str(df.c1.dtype).startswith("float") - # Round 7 - Overwrite Partitions + New Column - df = pd.DataFrame({"c0": ["bar", None], "c1": [1, 3], "c2": [True, False]}) - paths = wr.s3.to_parquet(df=df, path=path, dataset=True, mode="overwrite_partitions", partition_cols=["c1"])[ - "paths" - ] - wr.s3.wait_objects_exist(paths=paths) - wr.s3.store_parquet_metadata( + # Version 3 (removing version 2) + df = pd.DataFrame({"c1": [True, False]}) + paths3 = wr.s3.to_csv( + df=df, path=path, dataset=True, - mode="append", database=database, table=table, - description="c0+c1+c2", - parameters={"num_cols": "3", "num_rows": "4"}, - columns_comments={"c0": "zero", "c1": "one", "c2": "two"}, + mode="overwrite", + catalog_versioning=False, + index=False, + )["paths"] + wr.s3.wait_objects_exist(paths=paths3, use_threads=False) + wr.s3.wait_objects_not_exist(paths=paths2, use_threads=False) + df = wr.athena.read_sql_table(table=table, database=database) + assert len(df.index) == 2 + assert len(df.columns) == 1 + assert str(df.c1.dtype).startswith("boolean") + + # Cleaning Up + wr.catalog.delete_table_if_exists(database=database, table=table) + wr.s3.delete_objects(path=path) + + +def test_copy_replacing_filename(bucket): + path = f"s3://{bucket}/test_copy_replacing_filename/" + wr.s3.delete_objects(path=path) + df = pd.DataFrame({"c0": [1, 2]}) + file_path = f"{path}myfile.parquet" + wr.s3.to_parquet(df=df, path=file_path) + wr.s3.wait_objects_exist(paths=[file_path], use_threads=False) + path2 = f"s3://{bucket}/test_copy_replacing_filename2/" + wr.s3.copy_objects( + paths=[file_path], source_path=path, target_path=path2, replace_filenames={"myfile.parquet": "myfile2.parquet"} ) - df2 = wr.athena.read_sql_table(table, database) - assert len(df2.columns) == 3 - assert len(df2.index) == 4 - assert df2.c1.astype(int).sum() == 6 - parameters = wr.catalog.get_table_parameters(database, table) - assert 
len(parameters) >= 5 - assert parameters["num_cols"] == "3" - assert parameters["num_rows"] == "4" - assert wr.catalog.get_table_description(database, table) == "c0+c1+c2" - comments = wr.catalog.get_columns_comments(database, table) - assert len(comments) == len(df.columns) - assert comments["c0"] == "zero" - assert comments["c1"] == "one" - assert comments["c2"] == "two" - engine = wr.catalog.get_engine("aws-data-wrangler-redshift") - df3 = wr.db.read_sql_table(con=engine, table=table, schema=external_schema) - assert len(df3.columns) == 3 - assert len(df3.index) == 4 - assert df3.c1.astype(int).sum() == 6 + expected_file = f"{path2}myfile2.parquet" + wr.s3.wait_objects_exist(paths=[expected_file], use_threads=False) + objs = wr.s3.list_objects(path=path2) + assert objs[0] == expected_file + wr.s3.delete_objects(path=path) + wr.s3.delete_objects(path=path2) -@pytest.mark.parametrize("partition_cols", [None, ["c1"], ["c2"], ["c1", "c2"], ["c2", "c1"]]) -def test_to_parquet_reverse_partitions(database, table, path, partition_cols): - df = pd.DataFrame({"c0": [0, 1, 2], "c1": [3, 4, 5], "c2": [6, 7, 8]}) - paths = wr.s3.to_parquet( - df=df, path=path, dataset=True, database=database, table=table, partition_cols=partition_cols - )["paths"] +def test_unsigned_parquet(bucket, database, external_schema): + table = "test_unsigned_parquet" + path = f"s3://{bucket}/{table}/" + wr.s3.delete_objects(path=path) + df = pd.DataFrame({"c0": [0, 0, (2 ** 8) - 1], "c1": [0, 0, (2 ** 16) - 1], "c2": [0, 0, (2 ** 32) - 1]}) + df["c0"] = df.c0.astype("uint8") + df["c1"] = df.c1.astype("uint16") + df["c2"] = df.c2.astype("uint32") + paths = wr.s3.to_parquet(df=df, path=path, dataset=True, database=database, table=table, mode="overwrite")["paths"] wr.s3.wait_objects_exist(paths=paths, use_threads=False) - df2 = wr.athena.read_sql_table(table=table, database=database) - assert df.shape == df2.shape - assert df.c0.sum() == df2.c0.sum() - assert df.c1.sum() == df2.c1.sum() - assert df.c2.sum() == df2.c2.sum() + df = wr.athena.read_sql_table(table=table, database=database) + assert df.c0.sum() == (2 ** 8) - 1 + assert df.c1.sum() == (2 ** 16) - 1 + assert df.c2.sum() == (2 ** 32) - 1 + schema = wr.s3.read_parquet_metadata(path=path)[0] + assert schema["c0"] == "smallint" + assert schema["c1"] == "int" + assert schema["c2"] == "bigint" + df = wr.s3.read_parquet(path=path) + assert df.c0.sum() == (2 ** 8) - 1 + assert df.c1.sum() == (2 ** 16) - 1 + assert df.c2.sum() == (2 ** 32) - 1 + engine = wr.catalog.get_engine("aws-data-wrangler-redshift") + df = wr.db.read_sql_table(con=engine, table=table, schema=external_schema) + assert df.c0.sum() == (2 ** 8) - 1 + assert df.c1.sum() == (2 ** 16) - 1 + assert df.c2.sum() == (2 ** 32) - 1 + + df = pd.DataFrame({"c0": [0, 0, (2 ** 64) - 1]}) + df["c0"] = df.c0.astype("uint64") + with pytest.raises(wr.exceptions.UnsupportedType): + wr.s3.to_parquet(df=df, path=path, dataset=True, database=database, table=table, mode="overwrite") + wr.s3.delete_objects(path=path) + wr.catalog.delete_table_if_exists(database=database, table=table) -def test_to_parquet_nested_append(database, table, path): + +def test_parquet_uint64(bucket): + path = f"s3://{bucket}/test_parquet_uint64/" + wr.s3.delete_objects(path=path) df = pd.DataFrame( { - "c0": [[1, 2, 3], [4, 5, 6]], - "c1": [[[1, 2], [3, 4]], [[5, 6], [7, 8]]], - "c2": [[["a", "b"], ["c", "d"]], [["e", "f"], ["g", "h"]]], - "c3": [[], [[[[[[[[1]]]]]]]]], - "c4": [{"a": 1}, {"a": 1}], - "c5": [{"a": {"b": {"c": [1, 2]}}}, {"a": {"b": 
{"c": [3, 4]}}}], + "c0": [0, 0, (2 ** 8) - 1], + "c1": [0, 0, (2 ** 16) - 1], + "c2": [0, 0, (2 ** 32) - 1], + "c3": [0, 0, (2 ** 64) - 1], + "c4": [0, 1, 2], } ) - paths = wr.s3.to_parquet(df=df, path=path, dataset=True, database=database, table=table)["paths"] - wr.s3.wait_objects_exist(paths=paths, use_threads=False) - df2 = wr.athena.read_sql_query(sql=f"SELECT c0, c1, c2, c4 FROM {table}", database=database) - assert len(df2.index) == 2 - assert len(df2.columns) == 4 - paths = wr.s3.to_parquet(df=df, path=path, dataset=True, database=database, table=table)["paths"] + print(df) + df["c0"] = df.c0.astype("uint8") + df["c1"] = df.c1.astype("uint16") + df["c2"] = df.c2.astype("uint32") + df["c3"] = df.c3.astype("uint64") + paths = wr.s3.to_parquet(df=df, path=path, dataset=True, mode="overwrite", partition_cols=["c4"])["paths"] wr.s3.wait_objects_exist(paths=paths, use_threads=False) - df2 = wr.athena.read_sql_query(sql=f"SELECT c0, c1, c2, c4 FROM {table}", database=database) - assert len(df2.index) == 4 - assert len(df2.columns) == 4 + df = wr.s3.read_parquet(path=path, dataset=True) + print(df) + print(df.dtypes) + assert len(df.index) == 3 + assert len(df.columns) == 5 + assert df.c0.max() == (2 ** 8) - 1 + assert df.c1.max() == (2 ** 16) - 1 + assert df.c2.max() == (2 ** 32) - 1 + assert df.c3.max() == (2 ** 64) - 1 + assert df.c4.astype("uint8").sum() == 3 + wr.s3.delete_objects(path=path) + +def test_parquet_overwrite_partition_cols(path, database, table, external_schema): + df = pd.DataFrame({"c0": [1, 2, 1, 2], "c1": [1, 2, 1, 2], "c2": [2, 1, 2, 1]}) -def test_to_parquet_nested_cast(database, table, path): - df = pd.DataFrame({"c0": [[1, 2, 3], [4, 5, 6]], "c1": [[], []], "c2": [{"a": 1, "b": 2}, {"a": 3, "b": 4}]}) paths = wr.s3.to_parquet( - df=df, + df=df, path=path, dataset=True, database=database, table=table, mode="overwrite", partition_cols=["c2"] + )["paths"] + wr.s3.wait_objects_exist(paths=paths, use_threads=False) + df = wr.athena.read_sql_table(table=table, database=database) + assert len(df.index) == 4 + assert len(df.columns) == 3 + assert df.c0.sum() == 6 + assert df.c1.sum() == 6 + assert df.c2.sum() == 6 + + paths = wr.s3.to_parquet( + df=df, path=path, dataset=True, database=database, table=table, mode="overwrite", partition_cols=["c1", "c2"] + )["paths"] + wr.s3.wait_objects_exist(paths=paths, use_threads=False) + df = wr.athena.read_sql_table(table=table, database=database) + assert len(df.index) == 4 + assert len(df.columns) == 3 + assert df.c0.sum() == 6 + assert df.c1.sum() == 6 + assert df.c2.sum() == 6 + + engine = wr.catalog.get_engine("aws-data-wrangler-redshift") + df = wr.db.read_sql_table(con=engine, table=table, schema=external_schema) + assert len(df.index) == 4 + assert len(df.columns) == 3 + assert df.c0.sum() == 6 + assert df.c1.sum() == 6 + assert df.c2.sum() == 6 + + +def test_catalog_parameters(bucket, database): + table = "test_catalog_parameters" + path = f"s3://{bucket}/{table}/" + wr.s3.delete_objects(path=path) + wr.catalog.delete_table_if_exists(database=database, table=table) + + wr.s3.to_parquet( + df=pd.DataFrame({"c0": [1, 2]}), path=path, dataset=True, database=database, table=table, - dtype={"c0": "array", "c1": "array", "c2": "struct"}, - )["paths"] - wr.s3.wait_objects_exist(paths=paths, use_threads=False) - df = pd.DataFrame({"c0": [[1, 2, 3], [4, 5, 6]], "c1": [["a"], ["b"]], "c2": [{"a": 1, "b": 2}, {"a": 3, "b": 4}]}) - paths = wr.s3.to_parquet(df=df, path=path, dataset=True, database=database, table=table)["paths"] + 
mode="overwrite", + parameters={"a": "1", "b": "2"}, + ) + pars = wr.catalog.get_table_parameters(database=database, table=table) + assert pars["a"] == "1" + assert pars["b"] == "2" + pars["a"] = "0" + pars["c"] = "3" + wr.catalog.upsert_table_parameters(parameters=pars, database=database, table=table) + pars = wr.catalog.get_table_parameters(database=database, table=table) + assert pars["a"] == "0" + assert pars["b"] == "2" + assert pars["c"] == "3" + wr.catalog.overwrite_table_parameters(parameters={"d": "4"}, database=database, table=table) + pars = wr.catalog.get_table_parameters(database=database, table=table) + assert pars.get("a") is None + assert pars.get("b") is None + assert pars.get("c") is None + assert pars["d"] == "4" + df = wr.athena.read_sql_table(table=table, database=database) + assert len(df.index) == 2 + assert len(df.columns) == 1 + assert df.c0.sum() == 3 + + wr.s3.to_parquet( + df=pd.DataFrame({"c0": [3, 4]}), + path=path, + dataset=True, + database=database, + table=table, + mode="append", + parameters={"e": "5"}, + ) + pars = wr.catalog.get_table_parameters(database=database, table=table) + assert pars.get("a") is None + assert pars.get("b") is None + assert pars.get("c") is None + assert pars["d"] == "4" + assert pars["e"] == "5" + df = wr.athena.read_sql_table(table=table, database=database) + assert len(df.index) == 4 + assert len(df.columns) == 1 + assert df.c0.sum() == 10 + + wr.s3.delete_objects(path=path) + wr.catalog.delete_table_if_exists(database=database, table=table) + + +def test_metadata_partitions(path): + path = f"{path}0.parquet" + df = pd.DataFrame({"c0": [0, 1, 2], "c1": ["3", "4", "5"], "c2": [6.0, 7.0, 8.0]}) + paths = wr.s3.to_parquet(df=df, path=path, dataset=False)["paths"] wr.s3.wait_objects_exist(paths=paths, use_threads=False) - df2 = wr.athena.read_sql_query(sql=f"SELECT c0, c2 FROM {table}", database=database) - assert len(df2.index) == 4 - assert len(df2.columns) == 2 + columns_types, partitions_types = wr.s3.read_parquet_metadata(path=path, dataset=False) + assert len(columns_types) == len(df.columns) + assert columns_types.get("c0") == "bigint" + assert columns_types.get("c1") == "string" + assert columns_types.get("c2") == "double" diff --git a/testing/test_awswrangler/test_data_lake2.py b/testing/test_awswrangler/test_data_lake2.py new file mode 100644 index 000000000..cbb78fb41 --- /dev/null +++ b/testing/test_awswrangler/test_data_lake2.py @@ -0,0 +1,425 @@ +import itertools +import logging + +import boto3 +import pandas as pd +import pytest + +import awswrangler as wr + +from ._utils import dt, extract_cloudformation_outputs, get_time_str_with_random_suffix, path_generator, ts + +logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s][%(name)s][%(funcName)s] %(message)s") +logging.getLogger("awswrangler").setLevel(logging.DEBUG) +logging.getLogger("botocore.credentials").setLevel(logging.CRITICAL) + + +@pytest.fixture(scope="module") +def cloudformation_outputs(): + yield extract_cloudformation_outputs() + + +@pytest.fixture(scope="module") +def region(cloudformation_outputs): + yield cloudformation_outputs["Region"] + + +@pytest.fixture(scope="module") +def database(cloudformation_outputs): + yield cloudformation_outputs["GlueDatabaseName"] + + +@pytest.fixture(scope="module") +def external_schema(cloudformation_outputs, database): + region = cloudformation_outputs.get("Region") + sql = f""" + CREATE EXTERNAL SCHEMA IF NOT EXISTS aws_data_wrangler_external FROM data catalog + DATABASE '{database}' + 
IAM_ROLE '{cloudformation_outputs["RedshiftRole"]}'
+    REGION '{region}';
+    """
+    engine = wr.catalog.get_engine(connection="aws-data-wrangler-redshift")
+    with engine.connect() as con:
+        con.execute(sql)
+    yield "aws_data_wrangler_external"
+
+
+@pytest.fixture(scope="function")
+def path(cloudformation_outputs):
+    yield from path_generator(cloudformation_outputs["BucketName"])
+
+
+@pytest.fixture(scope="function")
+def table(database):
+    name = f"tbl_{get_time_str_with_random_suffix()}"
+    print(f"Table name: {name}")
+    wr.catalog.delete_table_if_exists(database=database, table=name)
+    yield name
+    wr.catalog.delete_table_if_exists(database=database, table=name)
+
+
+@pytest.mark.parametrize("partition_cols", [None, ["c2"], ["c1", "c2"]])
+def test_metadata_partitions_dataset(path, partition_cols):
+    df = pd.DataFrame({"c0": [0, 1, 2], "c1": [3, 4, 5], "c2": [6, 7, 8]})
+    paths = wr.s3.to_parquet(df=df, path=path, dataset=True, partition_cols=partition_cols)["paths"]
+    wr.s3.wait_objects_exist(paths=paths, use_threads=False)
+    columns_types, partitions_types = wr.s3.read_parquet_metadata(path=path, dataset=True)
+    partitions_types = partitions_types if partitions_types is not None else {}
+    assert len(columns_types) + len(partitions_types) == len(df.columns)
+    assert columns_types.get("c0") == "bigint"
+    assert (columns_types.get("c1") == "bigint") or (partitions_types.get("c1") == "string")
+    assert (columns_types.get("c2") == "bigint") or (partitions_types.get("c2") == "string")
+
+
+@pytest.mark.parametrize("partition_cols", [None, ["c2"], ["c1", "c2"]])
+def test_store_metadata_partitions_dataset(database, table, path, partition_cols):
+    df = pd.DataFrame({"c0": [0, 1, 2], "c1": [3, 4, 5], "c2": [6, 7, 8]})
+    paths = wr.s3.to_parquet(df=df, path=path, dataset=True, partition_cols=partition_cols)["paths"]
+    wr.s3.wait_objects_exist(paths=paths, use_threads=False)
+    wr.s3.store_parquet_metadata(path=path, database=database, table=table, dataset=True)
+    df2 = wr.athena.read_sql_table(table=table, database=database)
+    assert len(df.index) == len(df2.index)
+    assert len(df.columns) == len(df2.columns)
+    assert df.c0.sum() == df2.c0.sum()
+    assert df.c1.sum() == df2.c1.astype(int).sum()
+    assert df.c2.sum() == df2.c2.astype(int).sum()
+
+
+def test_json_chunksize(path):
+    num_files = 10
+    df = pd.DataFrame({"id": [1, 2, 3], "value": ["foo", "boo", "bar"]})
+    paths = [f"{path}{i}.json" for i in range(num_files)]
+    for p in paths:
+        wr.s3.to_json(df, p, orient="records", lines=True)
+    wr.s3.wait_objects_exist(paths)
+    dfs = list(wr.s3.read_json(paths, lines=True, chunksize=1))
+    assert len(dfs) == (3 * num_files)
+    for d in dfs:
+        assert len(d.columns) == 2
+        assert d.id.iloc[0] in (1, 2, 3)
+        assert d.value.iloc[0] in ("foo", "boo", "bar")
+
+
+def test_parquet_cast_string(path):
+    df = pd.DataFrame({"id": [1, 2, 3], "value": ["foo", "boo", "bar"]})
+    path_file = f"{path}0.parquet"
+    wr.s3.to_parquet(df, path_file, dtype={"id": "string"})
+    wr.s3.wait_objects_exist([path_file])
+    df2 = wr.s3.read_parquet(path_file)
+    assert str(df2.id.dtypes) == "string"
+    df2["id"] = df2["id"].astype(int)
+    assert df.shape == df2.shape
+    for col, row in tuple(itertools.product(df.columns, range(3))):
+        assert df[col].iloc[row] == df2[col].iloc[row]
+
+
+@pytest.mark.parametrize("partition_cols", [None, ["c2"], ["value", "c2"]])
+def test_parquet_cast_string_dataset(path, partition_cols):
+    df = pd.DataFrame({"id": [1, 2, 3], "value": ["foo", "boo", "bar"], "c2": [4, 5, 6], "c3": [7.0, 8.0, 9.0]})
+    paths =
wr.s3.to_parquet( + df, path, dataset=True, partition_cols=partition_cols, dtype={"id": "string", "c3": "string"} + )["paths"] + wr.s3.wait_objects_exist(paths) + df2 = wr.s3.read_parquet(path, dataset=True).sort_values("id", ignore_index=True) + assert str(df2.id.dtypes) == "string" + assert str(df2.c3.dtypes) == "string" + df2["id"] = df2["id"].astype(int) + df2["c3"] = df2["c3"].astype(float) + assert df.shape == df2.shape + for col, row in tuple(itertools.product(df.columns, range(3))): + assert df[col].iloc[row] == df2[col].iloc[row] + + +@pytest.mark.parametrize("partition_cols", [None, ["c2"], ["c1", "c2"]]) +def test_store_metadata_partitions_sample_dataset(database, table, path, partition_cols): + num_files = 10 + df = pd.DataFrame({"c0": [0, 1, 2], "c1": [3, 4, 5], "c2": [6, 7, 8]}) + for _ in range(num_files): + paths = wr.s3.to_parquet(df=df, path=path, dataset=True, partition_cols=partition_cols)["paths"] + wr.s3.wait_objects_exist(paths=paths, use_threads=False) + wr.s3.store_parquet_metadata( + path=path, database=database, table=table, dtype={"c1": "bigint", "c2": "smallint"}, sampling=0.25, dataset=True + ) + df2 = wr.athena.read_sql_table(table=table, database=database) + assert len(df.index) * num_files == len(df2.index) + assert len(df.columns) == len(df2.columns) + assert df.c0.sum() * num_files == df2.c0.sum() + assert df.c1.sum() * num_files == df2.c1.sum() + assert df.c2.sum() * num_files == df2.c2.sum() + + +def test_athena_undefined_column(database): + with pytest.raises(wr.exceptions.InvalidArgumentValue): + wr.athena.read_sql_query("SELECT 1", database) + with pytest.raises(wr.exceptions.InvalidArgumentValue): + wr.athena.read_sql_query("SELECT NULL AS my_null", database) + + +def test_to_parquet_file_sanitize(path): + df = pd.DataFrame({"C0": [0, 1], "camelCase": [2, 3], "c**--2": [4, 5]}) + path_file = f"{path}0.parquet" + wr.s3.to_parquet(df, path_file) + wr.s3.wait_objects_exist([path_file]) + df2 = wr.s3.read_parquet(path_file) + assert df.shape == df2.shape + assert list(df2.columns) == ["c0", "camel_case", "c_2"] + assert df2.c0.sum() == 1 + assert df2.camel_case.sum() == 5 + assert df2.c_2.sum() == 9 + + +@pytest.mark.parametrize("partition_cols", [None, ["c1"], ["c2"], ["c1", "c2"], ["c2", "c1"]]) +def test_to_parquet_reverse_partitions(database, table, path, partition_cols): + df = pd.DataFrame({"c0": [0, 1, 2], "c1": [3, 4, 5], "c2": [6, 7, 8]}) + paths = wr.s3.to_parquet( + df=df, path=path, dataset=True, database=database, table=table, partition_cols=partition_cols + )["paths"] + wr.s3.wait_objects_exist(paths=paths, use_threads=False) + df2 = wr.athena.read_sql_table(table=table, database=database) + assert df.shape == df2.shape + assert df.c0.sum() == df2.c0.sum() + assert df.c1.sum() == df2.c1.sum() + assert df.c2.sum() == df2.c2.sum() + + +def test_to_parquet_nested_append(database, table, path): + df = pd.DataFrame( + { + "c0": [[1, 2, 3], [4, 5, 6]], + "c1": [[[1, 2], [3, 4]], [[5, 6], [7, 8]]], + "c2": [[["a", "b"], ["c", "d"]], [["e", "f"], ["g", "h"]]], + "c3": [[], [[[[[[[[1]]]]]]]]], + "c4": [{"a": 1}, {"a": 1}], + "c5": [{"a": {"b": {"c": [1, 2]}}}, {"a": {"b": {"c": [3, 4]}}}], + } + ) + paths = wr.s3.to_parquet(df=df, path=path, dataset=True, database=database, table=table)["paths"] + wr.s3.wait_objects_exist(paths=paths, use_threads=False) + df2 = wr.athena.read_sql_query(sql=f"SELECT c0, c1, c2, c4 FROM {table}", database=database) + assert len(df2.index) == 2 + assert len(df2.columns) == 4 + paths = wr.s3.to_parquet(df=df, 
+    paths = wr.s3.to_parquet(df=df, path=path, dataset=True, database=database, table=table)["paths"]
+    wr.s3.wait_objects_exist(paths=paths, use_threads=False)
+    df2 = wr.athena.read_sql_query(sql=f"SELECT c0, c1, c2, c4 FROM {table}", database=database)
+    assert len(df2.index) == 4
+    assert len(df2.columns) == 4
+
+
+def test_to_parquet_nested_cast(database, table, path):
+    df = pd.DataFrame({"c0": [[1, 2, 3], [4, 5, 6]], "c1": [[], []], "c2": [{"a": 1, "b": 2}, {"a": 3, "b": 4}]})
+    paths = wr.s3.to_parquet(
+        df=df,
+        path=path,
+        dataset=True,
+        database=database,
+        table=table,
+        dtype={"c0": "array", "c1": "array", "c2": "struct"},
+    )["paths"]
+    wr.s3.wait_objects_exist(paths=paths, use_threads=False)
+    df = pd.DataFrame({"c0": [[1, 2, 3], [4, 5, 6]], "c1": [["a"], ["b"]], "c2": [{"a": 1, "b": 2}, {"a": 3, "b": 4}]})
+    paths = wr.s3.to_parquet(df=df, path=path, dataset=True, database=database, table=table)["paths"]
+    wr.s3.wait_objects_exist(paths=paths, use_threads=False)
+    df2 = wr.athena.read_sql_query(sql=f"SELECT c0, c2 FROM {table}", database=database)
+    assert len(df2.index) == 4
+    assert len(df2.columns) == 2
+
+
+@pytest.mark.parametrize(
+    "encoding,strings,wrong_encoding,exception",
+    [
+        ("utf-8", ["漢字", "ãóú", "г, д, ж, з, к, л"], "ISO-8859-1", AssertionError),
+        ("ISO-8859-1", ["Ö, ö, Ü, ü", "ãóú", "øe"], "utf-8", UnicodeDecodeError),
+        ("ISO-8859-1", ["Ö, ö, Ü, ü", "ãóú", "øe"], None, UnicodeDecodeError),
+    ],
+)
+@pytest.mark.parametrize("line_terminator", ["\n", "\r"])
+def test_csv_encoding(path, encoding, strings, wrong_encoding, exception, line_terminator):
+    file_path = f"{path}0.csv"
+    df = pd.DataFrame({"c0": [1, 2, 3], "c1": strings})
+    wr.s3.to_csv(df, file_path, index=False, encoding=encoding, line_terminator=line_terminator)
+    wr.s3.wait_objects_exist(paths=[file_path])
+    df2 = wr.s3.read_csv(file_path, encoding=encoding, lineterminator=line_terminator)
+    assert df.equals(df2)
+    with pytest.raises(exception):
+        df2 = wr.s3.read_csv(file_path, encoding=wrong_encoding)
+        assert df.equals(df2)
+
+
+def test_to_parquet_file_dtype(path):
+    df = pd.DataFrame({"c0": [1.0, None, 2.0], "c1": [pd.NA, pd.NA, pd.NA]})
+    file_path = f"{path}0.parquet"
+    wr.s3.to_parquet(df, file_path, dtype={"c0": "bigint", "c1": "string"})
+    wr.s3.wait_objects_exist(paths=[file_path])
+    df2 = wr.s3.read_parquet(file_path)
+    assert df2.shape == df.shape
+    assert df2.c0.sum() == 3
+    assert str(df2.c0.dtype) == "Int64"
+    assert str(df2.c1.dtype) == "string"
+
+
+def test_to_parquet_projection_integer(database, table, path):
+    df = pd.DataFrame({"c0": [0, 1, 2], "c1": [0, 1, 2], "c2": [0, 100, 200], "c3": [0, 1, 2]})
+    paths = wr.s3.to_parquet(
+        df=df,
+        path=path,
+        dataset=True,
+        database=database,
+        table=table,
+        partition_cols=["c1", "c2", "c3"],
+        regular_partitions=False,
+        projection_enabled=True,
+        projection_types={"c1": "integer", "c2": "integer", "c3": "integer"},
+        projection_ranges={"c1": "0,2", "c2": "0,200", "c3": "0,2"},
+        projection_intervals={"c2": "100"},
+        projection_digits={"c3": "1"},
+    )["paths"]
+    wr.s3.wait_objects_exist(paths=paths, use_threads=False)
+    df2 = wr.athena.read_sql_table(table, database)
+    assert df.shape == df2.shape
+    assert df.c0.sum() == df2.c0.sum()
+    assert df.c1.sum() == df2.c1.sum()
+    assert df.c2.sum() == df2.c2.sum()
+    assert df.c3.sum() == df2.c3.sum()
+
+
+def test_to_parquet_projection_enum(database, table, path):
+    df = pd.DataFrame({"c0": [0, 1, 2], "c1": [1, 2, 3], "c2": ["foo", "boo", "bar"]})
+    paths = wr.s3.to_parquet(
+        df=df,
+        path=path,
+        dataset=True,
+        database=database,
+        table=table,
+        partition_cols=["c1", "c2"],
+        regular_partitions=False,
+        projection_enabled=True,
+        projection_types={"c1": "enum", "c2": "enum"},
+        projection_values={"c1": "1,2,3", "c2": "foo,boo,bar"},
+    )["paths"]
+    wr.s3.wait_objects_exist(paths=paths, use_threads=False)
+    df2 = wr.athena.read_sql_table(table, database)
+    assert df.shape == df2.shape
+    assert df.c0.sum() == df2.c0.sum()
+    assert df.c1.sum() == df2.c1.sum()
+
+
+def test_to_parquet_projection_date(database, table, path):
+    df = pd.DataFrame(
+        {
+            "c0": [0, 1, 2],
+            "c1": [dt("2020-01-01"), dt("2020-01-02"), dt("2020-01-03")],
+            "c2": [ts("2020-01-01 01:01:01.0"), ts("2020-01-01 01:01:02.0"), ts("2020-01-01 01:01:03.0")],
+        }
+    )
+    paths = wr.s3.to_parquet(
+        df=df,
+        path=path,
+        dataset=True,
+        database=database,
+        table=table,
+        partition_cols=["c1", "c2"],
+        regular_partitions=False,
+        projection_enabled=True,
+        projection_types={"c1": "date", "c2": "date"},
+        projection_ranges={"c1": "2020-01-01,2020-01-03", "c2": "2020-01-01 01:01:00,2020-01-01 01:01:03"},
+    )["paths"]
+    wr.s3.wait_objects_exist(paths=paths, use_threads=False)
+    df2 = wr.athena.read_sql_table(table, database)
+    print(df2)
+    assert df.shape == df2.shape
+    assert df.c0.sum() == df2.c0.sum()
+
+
+def test_to_parquet_projection_injected(database, table, path):
+    df = pd.DataFrame({"c0": [0, 1, 2], "c1": ["foo", "boo", "bar"], "c2": ["0", "1", "2"]})
+    paths = wr.s3.to_parquet(
+        df=df,
+        path=path,
+        dataset=True,
+        database=database,
+        table=table,
+        partition_cols=["c1", "c2"],
+        regular_partitions=False,
+        projection_enabled=True,
+        projection_types={"c1": "injected", "c2": "injected"},
+    )["paths"]
+    wr.s3.wait_objects_exist(paths=paths, use_threads=False)
+    df2 = wr.athena.read_sql_query(f"SELECT * FROM {table} WHERE c1='foo' AND c2='0'", database)
+    assert df2.shape == (1, 3)
+    assert df2.c0.iloc[0] == 0
+
+
+def test_read_parquet_filter_partitions(path):
+    df = pd.DataFrame({"c0": [0, 1, 2], "c1": [0, 1, 2], "c2": [0, 0, 1]})
+    paths = wr.s3.to_parquet(df, path, dataset=True, partition_cols=["c1", "c2"])["paths"]
+    wr.s3.wait_objects_exist(paths=paths, use_threads=False)
+    df2 = wr.s3.read_parquet(path, dataset=True, filters=[("c1", "==", "0")])
+    assert df2.shape == (1, 3)
+    assert df2.c0.iloc[0] == 0
+    assert df2.c1.iloc[0] == 0
+    assert df2.c2.iloc[0] == 0
+    df2 = wr.s3.read_parquet(path, dataset=True, filters=[("c1", "==", "1"), ("c2", "==", "0")])
+    assert df2.shape == (1, 3)
+    assert df2.c0.iloc[0] == 1
+    assert df2.c1.iloc[0] == 1
+    assert df2.c2.iloc[0] == 0
+    df2 = wr.s3.read_parquet(path, dataset=True, filters=[("c2", "==", "0")])
+    assert df2.shape == (2, 3)
+    assert df2.c0.astype(int).sum() == 1
+    assert df2.c1.astype(int).sum() == 1
+    assert df2.c2.astype(int).sum() == 0
+
+
+@pytest.mark.parametrize("use_threads", [True, False])
+@pytest.mark.parametrize("chunksize", [None, 1])
+def test_read_partitioned_json(path, use_threads, chunksize):
+    df = pd.DataFrame({"c0": [0, 1], "c1": ["foo", "boo"]})
+    paths = [f"{path}year={y}/month={m}/0.json" for y, m in [(2020, 1), (2020, 2), (2021, 1)]]
+    for p in paths:
+        wr.s3.to_json(df, p, orient="records", lines=True)
+    wr.s3.wait_objects_exist(paths, use_threads=False)
+    df2 = wr.s3.read_json(path, dataset=True, use_threads=use_threads, chunksize=chunksize)
+    if chunksize is None:
+        assert df2.shape == (6, 4)
+        assert df2.c0.sum() == 3
+    else:
+        for d in df2:
+            assert d.shape == (1, 4)
+
+
+@pytest.mark.parametrize("use_threads", [True, False])
+@pytest.mark.parametrize("chunksize", [None, 1])
+def test_read_partitioned_csv(path, use_threads, chunksize):
+    df = pd.DataFrame({"c0": [0, 1], "c1": ["foo", "boo"]})
+    paths = [f"{path}year={y}/month={m}/0.csv" for y, m in [(2020, 1), (2020, 2), (2021, 1)]]
+    for p in paths:
+        wr.s3.to_csv(df, p, index=False)
+    wr.s3.wait_objects_exist(paths, use_threads=False)
+    df2 = wr.s3.read_csv(path, dataset=True, use_threads=use_threads, chunksize=chunksize)
+    if chunksize is None:
+        assert df2.shape == (6, 4)
+        assert df2.c0.sum() == 3
+    else:
+        for d in df2:
+            assert d.shape == (1, 4)
+
+
+@pytest.mark.parametrize("use_threads", [True, False])
+@pytest.mark.parametrize("chunksize", [None, 1])
+def test_read_partitioned_fwf(path, use_threads, chunksize):
+    text = "0foo\n1boo"
+    client_s3 = boto3.client("s3")
+    paths = [f"{path}year={y}/month={m}/0.csv" for y, m in [(2020, 1), (2020, 2), (2021, 1)]]
+    for p in paths:
+        bucket, key = wr._utils.parse_path(p)
+        client_s3.put_object(Body=text, Bucket=bucket, Key=key)
+    wr.s3.wait_objects_exist(paths, use_threads=False)
+    df2 = wr.s3.read_fwf(
+        path, dataset=True, use_threads=use_threads, chunksize=chunksize, widths=[1, 3], names=["c0", "c1"]
+    )
+    if chunksize is None:
+        assert df2.shape == (6, 4)
+        assert df2.c0.sum() == 3
+    else:
+        for d in df2:
+            assert d.shape == (1, 4)
diff --git a/testing/test_awswrangler/test_db.py b/testing/test_awswrangler/test_db.py
index 1e07f8bf9..4ff1e68ed 100644
--- a/testing/test_awswrangler/test_db.py
+++ b/testing/test_awswrangler/test_db.py
@@ -9,7 +9,13 @@
 import awswrangler as wr

-from ._utils import CFN_VALID_STATUS, ensure_data_types, ensure_data_types_category, get_df, get_df_category
+from ._utils import (
+    ensure_data_types,
+    ensure_data_types_category,
+    extract_cloudformation_outputs,
+    get_df,
+    get_df_category,
+)

 logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s][%(name)s][%(funcName)s] %(message)s")
 logging.getLogger("awswrangler").setLevel(logging.DEBUG)
@@ -18,12 +24,7 @@
 @pytest.fixture(scope="module")
 def cloudformation_outputs():
-    response = boto3.client("cloudformation").describe_stacks(StackName="aws-data-wrangler")
-    stack = [x for x in response.get("Stacks") if x["StackStatus"] in CFN_VALID_STATUS][0]
-    outputs = {}
-    for output in stack.get("Outputs"):
-        outputs[output.get("OutputKey")] = output.get("OutputValue")
-    yield outputs
+    yield extract_cloudformation_outputs()


 @pytest.fixture(scope="module")
diff --git a/testing/test_awswrangler/test_emr.py b/testing/test_awswrangler/test_emr.py
index e414fc2e5..f67150f23 100644
--- a/testing/test_awswrangler/test_emr.py
+++ b/testing/test_awswrangler/test_emr.py
@@ -1,12 +1,11 @@
 import logging
 import time

-import boto3
 import pytest

 import awswrangler as wr

-from ._utils import CFN_VALID_STATUS
+from ._utils import extract_cloudformation_outputs

 logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s][%(name)s][%(funcName)s] %(message)s")
 logging.getLogger("awswrangler").setLevel(logging.DEBUG)
@@ -15,12 +14,7 @@
 @pytest.fixture(scope="module")
 def cloudformation_outputs():
-    response = boto3.client("cloudformation").describe_stacks(StackName="aws-data-wrangler")
-    stack = [x for x in response.get("Stacks") if x["StackStatus"] in CFN_VALID_STATUS][0]
-    outputs = {}
-    for output in stack.get("Outputs"):
-        outputs[output.get("OutputKey")] = output.get("OutputValue")
-    yield outputs
+    yield extract_cloudformation_outputs()


 @pytest.fixture(scope="module")
diff --git a/testing/test_awswrangler/test_metadata.py b/testing/test_awswrangler/test_metadata.py
index 3dff3f55a..c8f0bc067 100644
--- a/testing/test_awswrangler/test_metadata.py
+++ b/testing/test_awswrangler/test_metadata.py
@@ -2,7 +2,7 @@
 def test_metadata():
-    assert wr.__version__ == "1.3.0"
+    assert wr.__version__ == "1.4.0"
     assert wr.__title__ == "awswrangler"
     assert wr.__description__ == "Pandas on AWS."
     assert wr.__license__ == "Apache License 2.0"
diff --git a/testing/test_awswrangler/test_moto.py b/testing/test_awswrangler/test_moto.py
index 01bde208f..71367f730 100644
--- a/testing/test_awswrangler/test_moto.py
+++ b/testing/test_awswrangler/test_moto.py
@@ -1,11 +1,15 @@
+from unittest.mock import ANY
+
 import boto3
 import botocore
 import mock
 import moto
+import pandas as pd
 import pytest
 from botocore.exceptions import ClientError

 import awswrangler as wr
+from awswrangler.exceptions import EmptyDataFrame, InvalidArgumentCombination

 from ._utils import ensure_data_types, get_df_csv, get_df_list
@@ -217,6 +221,71 @@ def test_csv(s3):
     assert len(df.columns) == 10


+@mock.patch("pandas.read_csv")
+@mock.patch("s3fs.S3FileSystem.open")
+def test_read_csv_pass_pandas_arguments_and_encoding_succeed(mock_open, mock_read_csv, s3):
+    bucket = "bucket"
+    key = "foo/foo.csv"
+    path = "s3://{}/{}".format(bucket, key)
+    s3_object = s3.Object(bucket, key)
+    s3_object.put(Body=b"foo")
+
+    with pytest.raises(TypeError):
+        wr.s3.read_csv(path=path, encoding="ISO-8859-1", sep=",", lineterminator="\r\n")
+    mock_open.assert_called_with(path="s3://bucket/foo/foo.csv", mode="r", encoding="ISO-8859-1", newline="\r\n")
+    mock_read_csv.assert_called_with(ANY, compression=None, encoding="ISO-8859-1", sep=",", lineterminator="\r\n")
+
+
+def test_to_csv_invalid_argument_combination_raise_when_dataset_false_succeed(s3):
+    path = "s3://bucket/test.csv"
+    with pytest.raises(InvalidArgumentCombination):
+        wr.s3.to_csv(df=get_df_csv(), path=path, index=False, database="foo")
+
+    with pytest.raises(InvalidArgumentCombination):
+        wr.s3.to_csv(df=get_df_csv(), path=path, index=False, table="foo")
+
+    with pytest.raises(InvalidArgumentCombination):
+        wr.s3.to_csv(df=get_df_csv(), path=path, index=False, dataset=False, partition_cols=["par0", "par1"])
+
+    with pytest.raises(InvalidArgumentCombination):
+        wr.s3.to_csv(df=get_df_csv(), path=path, index=False, dataset=False, mode="append")
+
+    with pytest.raises(InvalidArgumentCombination):
+        wr.s3.to_csv(df=get_df_csv(), path=path, index=False, dataset=False, partition_cols=["par0", "par1"])
+
+    with pytest.raises(InvalidArgumentCombination):
+        wr.s3.to_csv(df=get_df_csv(), path=path, index=False, dataset=False, database="default", table="test")
+
+    with pytest.raises(InvalidArgumentCombination):
+        wr.s3.to_csv(df=get_df_csv(), path=path, index=False, dataset=False, description="raise exception")
+
+    with pytest.raises(InvalidArgumentCombination):
+        wr.s3.to_csv(df=get_df_csv(), path=path, index=False, dataset=False, parameters={"key": "value"})
+
+    with pytest.raises(InvalidArgumentCombination):
+        wr.s3.to_csv(df=get_df_csv(), path=path, index=False, dataset=False, columns_comments={"col0": "test"})
+
+
+def test_to_csv_valid_argument_combination_when_dataset_true_succeed(s3):
+    path = "s3://bucket/test.csv"
+    wr.s3.to_csv(df=get_df_csv(), path=path, index=False)
+    wr.s3.to_csv(df=get_df_csv(), path=path, index=False, dataset=True, partition_cols=["par0", "par1"])
+
+    wr.s3.to_csv(df=get_df_csv(), path=path, index=False, dataset=True, mode="append")
+
+    wr.s3.to_csv(df=get_df_csv(), path=path, index=False, dataset=True, description="raise exception")
+
+    wr.s3.to_csv(df=get_df_csv(), path=path, index=False, dataset=True, parameters={"key": "value"})
+
+    wr.s3.to_csv(df=get_df_csv(), path=path, index=False, dataset=True, columns_comments={"col0": "test"})
+
+
+def test_to_csv_data_empty_raise_succeed(s3):
+    path = "s3://bucket/test.csv"
+    with pytest.raises(EmptyDataFrame):
+        wr.s3.to_csv(df=pd.DataFrame(), path=path, index=False)
+
+
 def test_parquet(s3):
     path = "s3://bucket/test.parquet"
     wr.s3.to_parquet(df=get_df_list(), path=path, index=False, dataset=True, partition_cols=["par0", "par1"])
diff --git a/testing/validations.sh b/testing/validations.sh
index d32fc7808..db65119b2 100755
--- a/testing/validations.sh
+++ b/testing/validations.sh
@@ -7,8 +7,8 @@ cfn-flip -c -l -n cloudformation.yaml temp.yaml
 cfn-lint -t temp.yaml
 mv temp.yaml cloudformation.yaml
 pushd ..
+isort -rc awswrangler testing/test_awswrangler
 black --line-length 120 --target-version py36 awswrangler testing/test_awswrangler
-isort -rc --line-width 120 awswrangler testing/test_awswrangler
 pydocstyle awswrangler/ --add-ignore=D204,D403
 mypy awswrangler
 flake8 setup.py awswrangler testing/test_awswrangler
diff --git a/tutorials/01 - Introduction.ipynb b/tutorials/001 - Introduction.ipynb
similarity index 100%
rename from tutorials/01 - Introduction.ipynb
rename to tutorials/001 - Introduction.ipynb
diff --git a/tutorials/02 - Sessions.ipynb b/tutorials/002 - Sessions.ipynb
similarity index 100%
rename from tutorials/02 - Sessions.ipynb
rename to tutorials/002 - Sessions.ipynb
diff --git a/tutorials/03 - Amazon S3.ipynb b/tutorials/003 - Amazon S3.ipynb
similarity index 100%
rename from tutorials/03 - Amazon S3.ipynb
rename to tutorials/003 - Amazon S3.ipynb
diff --git a/tutorials/04 - Parquet Datasets.ipynb b/tutorials/004 - Parquet Datasets.ipynb
similarity index 100%
rename from tutorials/04 - Parquet Datasets.ipynb
rename to tutorials/004 - Parquet Datasets.ipynb
diff --git a/tutorials/05 - Glue Catalog.ipynb b/tutorials/005 - Glue Catalog.ipynb
similarity index 100%
rename from tutorials/05 - Glue Catalog.ipynb
rename to tutorials/005 - Glue Catalog.ipynb
diff --git a/tutorials/06 - Amazon Athena.ipynb b/tutorials/006 - Amazon Athena.ipynb
similarity index 100%
rename from tutorials/06 - Amazon Athena.ipynb
rename to tutorials/006 - Amazon Athena.ipynb
diff --git a/tutorials/07 - Redshift, MySQL, PostgreSQL.ipynb b/tutorials/007 - Redshift, MySQL, PostgreSQL.ipynb
similarity index 100%
rename from tutorials/07 - Redshift, MySQL, PostgreSQL.ipynb
rename to tutorials/007 - Redshift, MySQL, PostgreSQL.ipynb
diff --git a/tutorials/08 - Redshift - Copy & Unload.ipynb b/tutorials/008 - Redshift - Copy & Unload.ipynb
similarity index 100%
rename from tutorials/08 - Redshift - Copy & Unload.ipynb
rename to tutorials/008 - Redshift - Copy & Unload.ipynb
diff --git a/tutorials/09 - Redshift - Append, Overwrite, Upsert.ipynb b/tutorials/009 - Redshift - Append, Overwrite, Upsert.ipynb
similarity index 100%
rename from tutorials/09 - Redshift - Append, Overwrite, Upsert.ipynb
rename to tutorials/009 - Redshift - Append, Overwrite, Upsert.ipynb
diff --git a/tutorials/10 - Parquet Crawler.ipynb b/tutorials/010 - Parquet Crawler.ipynb
similarity index 100%
rename from tutorials/10 - Parquet Crawler.ipynb
rename to tutorials/010 - Parquet Crawler.ipynb
diff --git a/tutorials/11 - CSV Datasets.ipynb b/tutorials/011 - CSV Datasets.ipynb
similarity index 100%
rename from tutorials/11 - CSV Datasets.ipynb
rename to tutorials/011 - CSV Datasets.ipynb
diff --git a/tutorials/12 - CSV Crawler.ipynb b/tutorials/012 - CSV Crawler.ipynb
similarity index 99%
rename from tutorials/12 - CSV Crawler.ipynb
rename to tutorials/012 - CSV Crawler.ipynb
index d3e4bd710..68973a424 100644
--- a/tutorials/12 - CSV Crawler.ipynb
+++ b/tutorials/012 - CSV Crawler.ipynb
@@ -478,7 +478,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## You can also extract the metadata directly from the Catalog with you want"
+    "## You can also extract the metadata directly from the Catalog if you want"
    ]
   },
   {
@@ -691,17 +691,8 @@
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
    "version": "3.6.5"
-  },
-  "pycharm": {
-   "stem_cell": {
-    "cell_type": "raw",
-    "metadata": {
-     "collapsed": false
-    },
-    "source": []
-   }
   }
  },
 "nbformat": 4,
 "nbformat_minor": 4
-}
+}
\ No newline at end of file
diff --git a/tutorials/13 - Merging Datasets on S3.ipynb b/tutorials/013 - Merging Datasets on S3.ipynb
similarity index 100%
rename from tutorials/13 - Merging Datasets on S3.ipynb
rename to tutorials/013 - Merging Datasets on S3.ipynb
diff --git a/tutorials/14 - Schema Evolution.ipynb b/tutorials/014 - Schema Evolution.ipynb
similarity index 100%
rename from tutorials/14 - Schema Evolution.ipynb
rename to tutorials/014 - Schema Evolution.ipynb
diff --git a/tutorials/15 - EMR.ipynb b/tutorials/015 - EMR.ipynb
similarity index 100%
rename from tutorials/15 - EMR.ipynb
rename to tutorials/015 - EMR.ipynb
diff --git a/tutorials/16 - EMR & Docker.ipynb b/tutorials/016 - EMR & Docker.ipynb
similarity index 100%
rename from tutorials/16 - EMR & Docker.ipynb
rename to tutorials/016 - EMR & Docker.ipynb
diff --git a/tutorials/17 - Partition Projection.ipynb b/tutorials/017 - Partition Projection.ipynb
similarity index 100%
rename from tutorials/17 - Partition Projection.ipynb
rename to tutorials/017 - Partition Projection.ipynb