diff --git a/test_infra/stacks/base_stack.py b/test_infra/stacks/base_stack.py index 22a49957e..d7e7a9273 100644 --- a/test_infra/stacks/base_stack.py +++ b/test_infra/stacks/base_stack.py @@ -80,6 +80,18 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs: str) -> None: ], versioned=True, ) + self.bucket_access_point = s3.CfnAccessPoint( + self, + id="aws-sdk-pandas-access-point", + bucket=self.bucket.bucket_name, + bucket_account_id=self.account, + public_access_block_configuration=s3.CfnAccessPoint.PublicAccessBlockConfigurationProperty( + block_public_acls=True, + block_public_policy=True, + ignore_public_acls=True, + restrict_public_buckets=True, + ), + ) glue_data_quality_role = iam.Role( self, @@ -175,12 +187,23 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs: str) -> None: "BucketName", value=self.bucket.bucket_name, ) + CfnOutput( + self, + "BucketAccessPointArn", + value=self.bucket_access_point.attr_arn, + ) ssm.StringParameter( self, "SSM BucketName", parameter_name="/sdk-pandas/base/BucketName", string_value=self.bucket.bucket_name, ) + ssm.StringParameter( + self, + "SSM Bucket Access Point ARN", + parameter_name="/sdk-pandas/base/BucketAccessPointArn", + string_value=self.bucket_access_point.attr_arn, + ) CfnOutput(self, "GlueDatabaseName", value=glue_db.database_name) CfnOutput(self, "GlueDataQualityRole", value=glue_data_quality_role.role_arn) CfnOutput(self, "EMRServerlessExecutionRoleArn", value=emr_serverless_exec_role.role_arn) diff --git a/tests/conftest.py b/tests/conftest.py index 86d58862b..b57041087 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -34,6 +34,11 @@ def bucket(cloudformation_outputs): return cloudformation_outputs["BucketName"] +@pytest.fixture(scope="session") +def bucket_access_point(cloudformation_outputs): + return cloudformation_outputs["BucketAccessPointArn"] + + @pytest.fixture(scope="session") def glue_database(cloudformation_outputs): return 
cloudformation_outputs["GlueDatabaseName"] @@ -307,6 +312,11 @@ def path3(bucket): yield from path_generator(bucket) +@pytest.fixture(scope="function") +def access_point_path(bucket_access_point): + yield from path_generator(bucket_access_point) + + @pytest.fixture(scope="function") def redshift_table(): name = f"tbl_{get_time_str_with_random_suffix()}" diff --git a/tests/unit/test_s3_parquet.py b/tests/unit/test_s3_parquet.py index 18217605e..3f8a35947 100644 --- a/tests/unit/test_s3_parquet.py +++ b/tests/unit/test_s3_parquet.py @@ -992,3 +992,16 @@ def test_read_parquet_table_with_client_side_encryption( columns=columns, ) assert df_out.shape == (3, len(columns)) + + +@pytest.mark.xfail( + is_ray_modin, + raises=pa.lib.ArrowInvalid, + reason="Ray Modin cannot read from access point because PyArrow doesn't support it", +) +def test_read_from_access_point(access_point_path: str) -> None: + path = access_point_path + "test.parquet" + df = pd.DataFrame({"c0": [0, 1, 2], "c1": [0, 1, 2], "c2": [0, 0, 1]}) + wr.s3.to_parquet(df, path) + df_out = wr.s3.read_parquet(path) + assert df_out.shape == (3, 3)