From c9b5f1a47361c350172616d019768473217c8c40 Mon Sep 17 00:00:00 2001 From: Maximilian Speicher Date: Sun, 21 Nov 2021 11:05:33 +0100 Subject: [PATCH 1/5] Add storage location template for glue --- awswrangler/catalog/_create.py | 29 +++++++++++++++++++++++++++++ awswrangler/s3/_write_parquet.py | 14 ++++++++++++++ awswrangler/s3/_write_text.py | 14 ++++++++++++++ tests/test_athena_projection.py | 26 ++++++++++++++++++++++++++ 4 files changed, 83 insertions(+) diff --git a/awswrangler/catalog/_create.py b/awswrangler/catalog/_create.py index 47fcd4e35..71960550d 100644 --- a/awswrangler/catalog/_create.py +++ b/awswrangler/catalog/_create.py @@ -42,6 +42,7 @@ def _create_table( # pylint: disable=too-many-branches,too-many-statements projection_values: Optional[Dict[str, str]], projection_intervals: Optional[Dict[str, str]], projection_digits: Optional[Dict[str, str]], + projection_storage_location_template: Optional[str], catalog_id: Optional[str], ) -> None: # Description @@ -95,6 +96,7 @@ def _create_table( # pylint: disable=too-many-branches,too-many-statements mode = _update_if_necessary( dic=table_input["Parameters"], key=f"projection.{k}.digits", value=str(v), mode=mode ) + mode = _update_if_necessary(table_input["Parameters"], key="storage.location.template", value=projection_storage_location_template, mode=mode) else: table_input["Parameters"]["projection.enabled"] = "false" @@ -232,6 +234,7 @@ def _create_parquet_table( projection_values: Optional[Dict[str, str]], projection_intervals: Optional[Dict[str, str]], projection_digits: Optional[Dict[str, str]], + projection_storage_location_template: Optional[str], boto3_session: Optional[boto3.Session], catalog_table_input: Optional[Dict[str, Any]], ) -> None: @@ -280,6 +283,7 @@ def _create_parquet_table( projection_values=projection_values, projection_intervals=projection_intervals, projection_digits=projection_digits, + projection_storage_location_template=projection_storage_location_template, catalog_id=catalog_id, ) @@ -309,6 +313,7 @@ def _create_csv_table( # pylint: disable=too-many-arguments projection_values: Optional[Dict[str, str]], projection_intervals: Optional[Dict[str, str]], projection_digits: Optional[Dict[str, str]], + projection_storage_location_template: Optional[str], catalog_table_input: Optional[Dict[str, Any]], catalog_id: Optional[str], ) -> None: @@ -353,6 +358,7 @@ def _create_csv_table( # pylint: disable=too-many-arguments projection_values=projection_values, projection_intervals=projection_intervals, projection_digits=projection_digits, + projection_storage_location_template=projection_storage_location_template, catalog_id=catalog_id, ) @@ -380,6 +386,7 @@ def _create_json_table( # pylint: disable=too-many-arguments projection_values: Optional[Dict[str, str]], projection_intervals: Optional[Dict[str, str]], projection_digits: Optional[Dict[str, str]], + projection_storage_location_template: Optional[str], catalog_table_input: Optional[Dict[str, Any]], catalog_id: Optional[str], ) -> None: @@ -422,6 +429,7 @@ def _create_json_table( # pylint: disable=too-many-arguments projection_values=projection_values, projection_intervals=projection_intervals, projection_digits=projection_digits, + projection_storage_location_template=projection_storage_location_template, catalog_id=catalog_id, ) @@ -613,6 +621,7 @@ def create_parquet_table( projection_values: Optional[Dict[str, str]] = None, projection_intervals: Optional[Dict[str, str]] = None, projection_digits: Optional[Dict[str, str]] = None, + projection_storage_location_template: Optional[str] = None, boto3_session: Optional[boto3.Session] = None, ) -> None: """Create a Parquet Table (Metadata Only) in the AWS Glue Catalog. @@ -673,6 +682,11 @@ def create_parquet_table( Dictionary of partitions names and Athena projections digits. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '2'}) + projection_storage_location_template: Optional[str] + Value which is allows Athena to properly map partition values if the S3 file locations do not follow + a typical `.../column=value/...` pattern. + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-setting-up.html + (e.g. s3://bucket/table_root/a=${a}/${b}/some_static_subdirectory/${c}/) boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. @@ -721,6 +735,7 @@ def create_parquet_table( projection_values=projection_values, projection_intervals=projection_intervals, projection_digits=projection_digits, + projection_storage_location_template=projection_storage_location_template, boto3_session=boto3_session, catalog_table_input=catalog_table_input, ) @@ -752,6 +767,7 @@ def create_csv_table( projection_values: Optional[Dict[str, str]] = None, projection_intervals: Optional[Dict[str, str]] = None, projection_digits: Optional[Dict[str, str]] = None, + projection_storage_location_template: Optional[str] = None, catalog_id: Optional[str] = None, ) -> None: r"""Create a CSV Table (Metadata Only) in the AWS Glue Catalog. @@ -825,6 +841,11 @@ def create_csv_table( Dictionary of partitions names and Athena projections digits. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '2'}) + projection_storage_location_template: Optional[str] + Value which is allows Athena to properly map partition values if the S3 file locations do not follow + a typical `.../column=value/...` pattern. + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-setting-up.html + (e.g. s3://bucket/table_root/a=${a}/${b}/some_static_subdirectory/${c}/) boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. catalog_id : str, optional @@ -877,6 +898,7 @@ def create_csv_table( projection_values=projection_values, projection_intervals=projection_intervals, projection_digits=projection_digits, + projection_storage_location_template=projection_storage_location_template, boto3_session=boto3_session, catalog_table_input=catalog_table_input, sep=sep, @@ -910,6 +932,7 @@ def create_json_table( projection_values: Optional[Dict[str, str]] = None, projection_intervals: Optional[Dict[str, str]] = None, projection_digits: Optional[Dict[str, str]] = None, + projection_storage_location_template: Optional[str] = None, catalog_id: Optional[str] = None, ) -> None: r"""Create a JSON Table (Metadata Only) in the AWS Glue Catalog. @@ -979,6 +1002,11 @@ def create_json_table( Dictionary of partitions names and Athena projections digits. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '2'}) + projection_storage_location_template: Optional[str] + Value which is allows Athena to properly map partition values if the S3 file locations do not follow + a typical `.../column=value/...` pattern. + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-setting-up.html + (e.g. s3://bucket/table_root/a=${a}/${b}/some_static_subdirectory/${c}/) boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. catalog_id : str, optional @@ -1030,6 +1058,7 @@ def create_json_table( projection_values=projection_values, projection_intervals=projection_intervals, projection_digits=projection_digits, + projection_storage_location_template=projection_storage_location_template, boto3_session=boto3_session, catalog_table_input=catalog_table_input, serde_library=serde_library, diff --git a/awswrangler/s3/_write_parquet.py b/awswrangler/s3/_write_parquet.py index 13b272901..4965ea40c 100644 --- a/awswrangler/s3/_write_parquet.py +++ b/awswrangler/s3/_write_parquet.py @@ -222,6 +222,7 @@ def to_parquet( # pylint: disable=too-many-arguments,too-many-locals projection_values: Optional[Dict[str, str]] = None, projection_intervals: Optional[Dict[str, str]] = None, projection_digits: Optional[Dict[str, str]] = None, + projection_storage_location_template: Optional[str] = None, catalog_id: Optional[str] = None, ) -> Dict[str, Union[List[str], Dict[str, List[str]]]]: """Write Parquet file or dataset on Amazon S3. @@ -353,6 +354,11 @@ def to_parquet( # pylint: disable=too-many-arguments,too-many-locals Dictionary of partitions names and Athena projections digits. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '2'}) + projection_storage_location_template: Optional[str] + Value which is allows Athena to properly map partition values if the S3 file locations do not follow + a typical `.../column=value/...` pattern. + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-setting-up.html + (e.g. s3://bucket/table_root/a=${a}/${b}/some_static_subdirectory/${c}/) catalog_id : str, optional The ID of the Data Catalog from which to retrieve Databases. If none is provided, the AWS account ID is used by default. @@ -606,6 +612,7 @@ def to_parquet( # pylint: disable=too-many-arguments,too-many-locals projection_values=projection_values, projection_intervals=projection_intervals, projection_digits=projection_digits, + projection_storage_location_template=projection_storage_location_template, catalog_id=catalog_id, catalog_table_input=catalog_table_input, ) @@ -659,6 +666,7 @@ def store_parquet_metadata( # pylint: disable=too-many-arguments projection_values: Optional[Dict[str, str]] = None, projection_intervals: Optional[Dict[str, str]] = None, projection_digits: Optional[Dict[str, str]] = None, + projection_storage_location_template: Optional[str] = None, s3_additional_kwargs: Optional[Dict[str, Any]] = None, boto3_session: Optional[boto3.Session] = None, ) -> Tuple[Dict[str, str], Optional[Dict[str, str]], Optional[Dict[str, List[str]]]]: @@ -762,6 +770,11 @@ def store_parquet_metadata( # pylint: disable=too-many-arguments Dictionary of partitions names and Athena projections digits. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '2'}) + projection_storage_location_template: Optional[str] + Value which is allows Athena to properly map partition values if the S3 file locations do not follow + a typical `.../column=value/...` pattern. + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-setting-up.html + (e.g. s3://bucket/table_root/a=${a}/${b}/some_static_subdirectory/${c}/) s3_additional_kwargs : Optional[Dict[str, Any]] Forwarded to botocore requests. e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'} @@ -829,6 +842,7 @@ def store_parquet_metadata( # pylint: disable=too-many-arguments projection_values=projection_values, projection_intervals=projection_intervals, projection_digits=projection_digits, + projection_storage_location_template=projection_storage_location_template, boto3_session=session, catalog_id=catalog_id, ) diff --git a/awswrangler/s3/_write_text.py b/awswrangler/s3/_write_text.py index 001cdbceb..f0ab83bb4 100644 --- a/awswrangler/s3/_write_text.py +++ b/awswrangler/s3/_write_text.py @@ -101,6 +101,7 @@ def to_csv( # pylint: disable=too-many-arguments,too-many-locals,too-many-state projection_values: Optional[Dict[str, str]] = None, projection_intervals: Optional[Dict[str, str]] = None, projection_digits: Optional[Dict[str, str]] = None, + projection_storage_location_template: Optional[str] = None, catalog_id: Optional[str] = None, **pandas_kwargs: Any, ) -> Dict[str, Union[List[str], Dict[str, List[str]]]]: @@ -230,6 +231,11 @@ def to_csv( # pylint: disable=too-many-arguments,too-many-locals,too-many-state Dictionary of partitions names and Athena projections digits. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '2'}) + projection_storage_location_template: Optional[str] + Value which is allows Athena to properly map partition values if the S3 file locations do not follow + a typical `.../column=value/...` pattern. + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-setting-up.html + (e.g. s3://bucket/table_root/a=${a}/${b}/some_static_subdirectory/${c}/) catalog_id : str, optional The ID of the Data Catalog from which to retrieve Databases. If none is provided, the AWS account ID is used by default. @@ -538,6 +544,7 @@ def to_csv( # pylint: disable=too-many-arguments,too-many-locals,too-many-state projection_values=projection_values, projection_intervals=projection_intervals, projection_digits=projection_digits, + projection_storage_location_template=projection_storage_location_template, catalog_table_input=catalog_table_input, catalog_id=catalog_id, compression=pandas_kwargs.get("compression"), @@ -602,6 +609,7 @@ def to_json( # pylint: disable=too-many-arguments,too-many-locals,too-many-stat projection_values: Optional[Dict[str, str]] = None, projection_intervals: Optional[Dict[str, str]] = None, projection_digits: Optional[Dict[str, str]] = None, + projection_storage_location_template: Optional[str] = None, catalog_id: Optional[str] = None, **pandas_kwargs: Any, ) -> Union[List[str], Dict[str, Union[List[str], Dict[str, List[str]]]]]: @@ -710,6 +718,11 @@ def to_json( # pylint: disable=too-many-arguments,too-many-locals,too-many-stat Dictionary of partitions names and Athena projections digits. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '2'}) + projection_storage_location_template: Optional[str] + Value which is allows Athena to properly map partition values if the S3 file locations do not follow + a typical `.../column=value/...` pattern. + https://docs.aws.amazon.com/athena/latest/ug/partition-projection-setting-up.html + (e.g. s3://bucket/table_root/a=${a}/${b}/some_static_subdirectory/${c}/) catalog_id : str, optional The ID of the Data Catalog from which to retrieve Databases. If none is provided, the AWS account ID is used by default. @@ -888,6 +901,7 @@ def to_json( # pylint: disable=too-many-arguments,too-many-locals,too-many-stat projection_values=projection_values, projection_intervals=projection_intervals, projection_digits=projection_digits, + projection_storage_location_template=projection_storage_location_template, catalog_table_input=catalog_table_input, catalog_id=catalog_id, compression=pandas_kwargs.get("compression"), diff --git a/tests/test_athena_projection.py b/tests/test_athena_projection.py index 7850daa75..6ee3d8ade 100644 --- a/tests/test_athena_projection.py +++ b/tests/test_athena_projection.py @@ -94,3 +94,29 @@ def test_to_parquet_projection_injected(glue_database, glue_table, path): df2 = wr.athena.read_sql_query(f"SELECT * FROM {glue_table} WHERE c1='foo' AND c2='0'", glue_database) assert df2.shape == (1, 3) assert df2.c0.iloc[0] == 0 + + +def test_to_parquet_storage_location(glue_database, glue_table, path): + df1 = pd.DataFrame({"c0": [0], "c1": ["foo"], "c2": ["0"]}) + df2 = pd.DataFrame({"c0": [1], "c1": ["foo"], "c2": ["1"]}) + df3 = pd.DataFrame({"c0": [2], "c1": ["boo"], "c2": ["2"]}) + df4 = pd.DataFrame({"c0": [3], "c1": ["boo"], "c2": ["3"]}) + + wr.s3.to_parquet(df=df1, path=f"{path}foo/0/") + wr.s3.to_parquet(df=df2, path=f"{path}foo/1/") + wr.s3.to_parquet(df=df3, path=f"{path}boo/2/") + wr.s3.to_parquet( + df=df4, + path=f"{path}boo/3/", + dataset=True, + database=glue_database, + table=glue_table, + regular_partitions=False, + projection_enabled=True, + projection_types={"c1": "injected", "c2": "injected"}, + projection_storage_location_template=f"{path}${{c1}}/${{c2}}" + ) + + df5 = wr.athena.read_sql_query(f"SELECT * FROM {glue_table} WHERE c1='foo' AND c2='0'", glue_database) + assert df5.shape == (1, 3) + assert df5.c0.iloc[0] == 0 From 846c5d4ac97ca6040b7543963f89b51338bc5a07 Mon Sep 17 00:00:00 2001 From: Maximilian Speicher Date: Sun, 21 Nov 2021 11:21:24 +0100 Subject: [PATCH 2/5] Linting --- awswrangler/catalog/_create.py | 7 ++++++- tests/test_athena_projection.py | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/awswrangler/catalog/_create.py b/awswrangler/catalog/_create.py index 71960550d..8df3feb59 100644 --- a/awswrangler/catalog/_create.py +++ b/awswrangler/catalog/_create.py @@ -96,7 +96,12 @@ def _create_table( # pylint: disable=too-many-branches,too-many-statements mode = _update_if_necessary( dic=table_input["Parameters"], key=f"projection.{k}.digits", value=str(v), mode=mode ) - mode = _update_if_necessary(table_input["Parameters"], key="storage.location.template", value=projection_storage_location_template, mode=mode) + mode = _update_if_necessary( + table_input["Parameters"], + key="storage.location.template", + value=projection_storage_location_template, + mode=mode, + ) else: table_input["Parameters"]["projection.enabled"] = "false" diff --git a/tests/test_athena_projection.py b/tests/test_athena_projection.py index 6ee3d8ade..897f6daa0 100644 --- a/tests/test_athena_projection.py +++ b/tests/test_athena_projection.py @@ -114,7 +114,7 @@ def test_to_parquet_storage_location(glue_database, glue_table, path): regular_partitions=False, projection_enabled=True, projection_types={"c1": "injected", "c2": "injected"}, - projection_storage_location_template=f"{path}${{c1}}/${{c2}}" + projection_storage_location_template=f"{path}${{c1}}/${{c2}}", ) df5 = wr.athena.read_sql_query(f"SELECT * FROM {glue_table} WHERE c1='foo' AND c2='0'", glue_database) From 900bf44513e4a8572a484cd4ed3f65edf6a4315e Mon Sep 17 00:00:00 2001 From: Maximilian Speicher Date: Sun, 21 Nov 2021 12:14:35 +0100 Subject: [PATCH 3/5] Remove storage location template for writing --- awswrangler/catalog/_create.py | 4 ++-- awswrangler/s3/_write_parquet.py | 15 +-------------- awswrangler/s3/_write_text.py | 16 ++-------------- tests/test_athena_projection.py | 19 +++++++++---------- 4 files changed, 14 insertions(+), 40 deletions(-) diff --git a/awswrangler/catalog/_create.py b/awswrangler/catalog/_create.py index 8df3feb59..adefbff1b 100644 --- a/awswrangler/catalog/_create.py +++ b/awswrangler/catalog/_create.py @@ -72,7 +72,7 @@ def _create_table( # pylint: disable=too-many-branches,too-many-statements projection_digits = {sanitize_column_name(k): v for k, v in projection_digits.items()} for k, v in projection_types.items(): dtype: Optional[str] = partitions_types.get(k) - if dtype is None: + if dtype is None and projection_storage_location_template is None: raise exceptions.InvalidArgumentCombination( f"Column {k} appears as projected column but not as partitioned column." ) @@ -747,7 +747,7 @@ def create_parquet_table( @apply_configs -def create_csv_table( +def create_csv_table( # pylint: disable=too-many-arguments database: str, table: str, path: str, diff --git a/awswrangler/s3/_write_parquet.py b/awswrangler/s3/_write_parquet.py index 4965ea40c..410db8490 100644 --- a/awswrangler/s3/_write_parquet.py +++ b/awswrangler/s3/_write_parquet.py @@ -222,7 +222,6 @@ def to_parquet( # pylint: disable=too-many-arguments,too-many-locals projection_values: Optional[Dict[str, str]] = None, projection_intervals: Optional[Dict[str, str]] = None, projection_digits: Optional[Dict[str, str]] = None, - projection_storage_location_template: Optional[str] = None, catalog_id: Optional[str] = None, ) -> Dict[str, Union[List[str], Dict[str, List[str]]]]: """Write Parquet file or dataset on Amazon S3. @@ -354,11 +353,6 @@ def to_parquet( # pylint: disable=too-many-arguments,too-many-locals Dictionary of partitions names and Athena projections digits. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '2'}) - projection_storage_location_template: Optional[str] - Value which is allows Athena to properly map partition values if the S3 file locations do not follow - a typical `.../column=value/...` pattern. - https://docs.aws.amazon.com/athena/latest/ug/partition-projection-setting-up.html - (e.g. s3://bucket/table_root/a=${a}/${b}/some_static_subdirectory/${c}/) catalog_id : str, optional The ID of the Data Catalog from which to retrieve Databases. If none is provided, the AWS account ID is used by default. @@ -612,7 +606,7 @@ def to_parquet( # pylint: disable=too-many-arguments,too-many-locals projection_values=projection_values, projection_intervals=projection_intervals, projection_digits=projection_digits, - projection_storage_location_template=projection_storage_location_template, + projection_storage_location_template=None, catalog_id=catalog_id, catalog_table_input=catalog_table_input, ) @@ -666,7 +660,6 @@ def store_parquet_metadata( # pylint: disable=too-many-arguments projection_values: Optional[Dict[str, str]] = None, projection_intervals: Optional[Dict[str, str]] = None, projection_digits: Optional[Dict[str, str]] = None, - projection_storage_location_template: Optional[str] = None, s3_additional_kwargs: Optional[Dict[str, Any]] = None, boto3_session: Optional[boto3.Session] = None, ) -> Tuple[Dict[str, str], Optional[Dict[str, str]], Optional[Dict[str, List[str]]]]: @@ -770,11 +763,6 @@ def store_parquet_metadata( # pylint: disable=too-many-arguments Dictionary of partitions names and Athena projections digits. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '2'}) - projection_storage_location_template: Optional[str] - Value which is allows Athena to properly map partition values if the S3 file locations do not follow - a typical `.../column=value/...` pattern. - https://docs.aws.amazon.com/athena/latest/ug/partition-projection-setting-up.html - (e.g. s3://bucket/table_root/a=${a}/${b}/some_static_subdirectory/${c}/) s3_additional_kwargs : Optional[Dict[str, Any]] Forwarded to botocore requests. e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'} @@ -842,7 +830,6 @@ def store_parquet_metadata( # pylint: disable=too-many-arguments projection_values=projection_values, projection_intervals=projection_intervals, projection_digits=projection_digits, - projection_storage_location_template=projection_storage_location_template, boto3_session=session, catalog_id=catalog_id, ) diff --git a/awswrangler/s3/_write_text.py b/awswrangler/s3/_write_text.py index f0ab83bb4..0f7ce4330 100644 --- a/awswrangler/s3/_write_text.py +++ b/awswrangler/s3/_write_text.py @@ -101,7 +101,6 @@ def to_csv( # pylint: disable=too-many-arguments,too-many-locals,too-many-state projection_values: Optional[Dict[str, str]] = None, projection_intervals: Optional[Dict[str, str]] = None, projection_digits: Optional[Dict[str, str]] = None, - projection_storage_location_template: Optional[str] = None, catalog_id: Optional[str] = None, **pandas_kwargs: Any, ) -> Dict[str, Union[List[str], Dict[str, List[str]]]]: @@ -231,11 +230,6 @@ def to_csv( # pylint: disable=too-many-arguments,too-many-locals,too-many-state Dictionary of partitions names and Athena projections digits. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '2'}) - projection_storage_location_template: Optional[str] - Value which is allows Athena to properly map partition values if the S3 file locations do not follow - a typical `.../column=value/...` pattern. - https://docs.aws.amazon.com/athena/latest/ug/partition-projection-setting-up.html - (e.g. s3://bucket/table_root/a=${a}/${b}/some_static_subdirectory/${c}/) catalog_id : str, optional The ID of the Data Catalog from which to retrieve Databases. If none is provided, the AWS account ID is used by default. @@ -544,7 +538,7 @@ def to_csv( # pylint: disable=too-many-arguments,too-many-locals,too-many-state projection_values=projection_values, projection_intervals=projection_intervals, projection_digits=projection_digits, - projection_storage_location_template=projection_storage_location_template, + projection_storage_location_template=None, catalog_table_input=catalog_table_input, catalog_id=catalog_id, compression=pandas_kwargs.get("compression"), @@ -609,7 +603,6 @@ def to_json( # pylint: disable=too-many-arguments,too-many-locals,too-many-stat projection_values: Optional[Dict[str, str]] = None, projection_intervals: Optional[Dict[str, str]] = None, projection_digits: Optional[Dict[str, str]] = None, - projection_storage_location_template: Optional[str] = None, catalog_id: Optional[str] = None, **pandas_kwargs: Any, ) -> Union[List[str], Dict[str, Union[List[str], Dict[str, List[str]]]]]: @@ -718,11 +711,6 @@ def to_json( # pylint: disable=too-many-arguments,too-many-locals,too-many-stat Dictionary of partitions names and Athena projections digits. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '2'}) - projection_storage_location_template: Optional[str] - Value which is allows Athena to properly map partition values if the S3 file locations do not follow - a typical `.../column=value/...` pattern. - https://docs.aws.amazon.com/athena/latest/ug/partition-projection-setting-up.html - (e.g. s3://bucket/table_root/a=${a}/${b}/some_static_subdirectory/${c}/) catalog_id : str, optional The ID of the Data Catalog from which to retrieve Databases. If none is provided, the AWS account ID is used by default. @@ -901,7 +889,7 @@ def to_json( # pylint: disable=too-many-arguments,too-many-locals,too-many-stat projection_values=projection_values, projection_intervals=projection_intervals, projection_digits=projection_digits, - projection_storage_location_template=projection_storage_location_template, + projection_storage_location_template=None, catalog_table_input=catalog_table_input, catalog_id=catalog_id, compression=pandas_kwargs.get("compression"), diff --git a/tests/test_athena_projection.py b/tests/test_athena_projection.py index 897f6daa0..abe200721 100644 --- a/tests/test_athena_projection.py +++ b/tests/test_athena_projection.py @@ -102,21 +102,20 @@ def test_to_parquet_storage_location(glue_database, glue_table, path): df3 = pd.DataFrame({"c0": [2], "c1": ["boo"], "c2": ["2"]}) df4 = pd.DataFrame({"c0": [3], "c1": ["boo"], "c2": ["3"]}) - wr.s3.to_parquet(df=df1, path=f"{path}foo/0/") - wr.s3.to_parquet(df=df2, path=f"{path}foo/1/") - wr.s3.to_parquet(df=df3, path=f"{path}boo/2/") - wr.s3.to_parquet( - df=df4, - path=f"{path}boo/3/", - dataset=True, + wr.s3.to_parquet(df=df1, path=f"{path}foo/0/file0.parquet") + wr.s3.to_parquet(df=df2, path=f"{path}foo/1/file1.parquet") + wr.s3.to_parquet(df=df3, path=f"{path}boo/2/file2.parquet") + wr.s3.to_parquet(df=df4, path=f"{path}boo/3/file3.parquet") + column_types, partitions_types = wr.catalog.extract_athena_types(df1) + wr.catalog.create_parquet_table( database=glue_database, table=glue_table, - regular_partitions=False, + path=path, + columns_types=column_types, projection_enabled=True, projection_types={"c1": "injected", "c2": "injected"}, projection_storage_location_template=f"{path}${{c1}}/${{c2}}", ) df5 = wr.athena.read_sql_query(f"SELECT * FROM {glue_table} WHERE c1='foo' AND c2='0'", glue_database) - assert df5.shape == (1, 3) - assert df5.c0.iloc[0] == 0 + pd.testing.assert_frame_equal(df1, df5, check_dtype=False) From acbb8a26a14ce5e0f9eb26a048740284961e2c6c Mon Sep 17 00:00:00 2001 From: Maximilian Speicher Date: Sun, 21 Nov 2021 12:35:11 +0100 Subject: [PATCH 4/5] Trigger CI From 38f0967cbdc7643a3b28ad761b3d628d5fc5859e Mon Sep 17 00:00:00 2001 From: Maximilian Speicher Date: Sun, 21 Nov 2021 13:03:09 +0100 Subject: [PATCH 5/5] Fix tests --- tests/test_config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_config.py b/tests/test_config.py index 86ea3eccc..318f0f807 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -126,6 +126,7 @@ def test_basics(path, glue_database, glue_table, workgroup0, workgroup1): def test_athena_cache_configuration(): + wr.config.max_remote_cache_entries = 50 wr.config.max_local_cache_entries = 20 assert wr.config.max_remote_cache_entries == 20