Skip to content

Commit

Permalink
Merge branch 'main' into fix/issue-2710-iceberg-s3-output
Browse files Browse the repository at this point in the history
  • Loading branch information
jaidisido committed Mar 14, 2024
2 parents 3a27327 + 9e33f0b commit cbed803
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 0 deletions.
23 changes: 23 additions & 0 deletions test_infra/stacks/base_stack.py
Expand Up @@ -80,6 +80,18 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs: str) -> None:
],
versioned=True,
)
# S3 Access Point layered over the test bucket. Its ARN (attr_arn) is
# exported further down so integration tests can read/write through an
# access point instead of addressing the bucket by name.
self.bucket_access_point = s3.CfnAccessPoint(
self,
id="aws-sdk-pandas-access-point",
bucket=self.bucket.bucket_name,
# NOTE(review): self.account is the stack's own account — assumes the
# bucket lives in the same account as this stack; confirm if cross-account
# use is ever intended.
bucket_account_id=self.account,
# Block every form of public access on the access point itself,
# mirroring the usual S3 public-access-block hardening.
public_access_block_configuration=s3.CfnAccessPoint.PublicAccessBlockConfigurationProperty(
block_public_acls=True,
block_public_policy=True,
ignore_public_acls=True,
restrict_public_buckets=True,
),
)

glue_data_quality_role = iam.Role(
self,
Expand Down Expand Up @@ -175,12 +187,23 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs: str) -> None:
"BucketName",
value=self.bucket.bucket_name,
)
# CloudFormation output consumed by the test-suite's cloudformation_outputs
# fixture (key "BucketAccessPointArn" in tests/conftest.py).
CfnOutput(
self,
"BucketAccessPointArn",
value=self.bucket_access_point.attr_arn,
)
# Mirror the bucket name into SSM Parameter Store so non-CloudFormation
# consumers can resolve it by a well-known path.
ssm.StringParameter(
self,
"SSM BucketName",
parameter_name="/sdk-pandas/base/BucketName",
string_value=self.bucket.bucket_name,
)
# Same mirroring for the access-point ARN created above.
ssm.StringParameter(
self,
"SSM Bucket Access Point ARN",
parameter_name="/sdk-pandas/base/BucketAccessPointArn",
string_value=self.bucket_access_point.attr_arn,
)
# Remaining stack outputs; the referenced resources are defined earlier
# in this constructor (outside the visible hunk).
CfnOutput(self, "GlueDatabaseName", value=glue_db.database_name)
CfnOutput(self, "GlueDataQualityRole", value=glue_data_quality_role.role_arn)
CfnOutput(self, "EMRServerlessExecutionRoleArn", value=emr_serverless_exec_role.role_arn)
Expand Down
10 changes: 10 additions & 0 deletions tests/conftest.py
Expand Up @@ -34,6 +34,11 @@ def bucket(cloudformation_outputs):
return cloudformation_outputs["BucketName"]


@pytest.fixture(scope="session")
def bucket_access_point(cloudformation_outputs):
    """Session-scoped ARN of the base stack's S3 bucket access point.

    Reads the ``BucketAccessPointArn`` CloudFormation output exported by
    the base test-infra stack.
    """
    output_key = "BucketAccessPointArn"
    return cloudformation_outputs[output_key]


@pytest.fixture(scope="session")
def glue_database(cloudformation_outputs):
return cloudformation_outputs["GlueDatabaseName"]
Expand Down Expand Up @@ -307,6 +312,11 @@ def path3(bucket):
yield from path_generator(bucket)


@pytest.fixture(scope="function")
def access_point_path_path(bucket_access_point):
    """Per-test S3 path rooted at the bucket access-point ARN.

    Delegates setup/teardown to ``path_generator`` (defined earlier in this
    module), same as the plain bucket-based ``path`` fixtures.
    """
    # NOTE(review): fixture name carries a doubled "path" — likely a typo for
    # access_point_path. Left as-is because tests reference this exact name.
    for generated_path in path_generator(bucket_access_point):
        yield generated_path


@pytest.fixture(scope="function")
def redshift_table():
name = f"tbl_{get_time_str_with_random_suffix()}"
Expand Down
13 changes: 13 additions & 0 deletions tests/unit/test_s3_parquet.py
Expand Up @@ -992,3 +992,16 @@ def test_read_parquet_table_with_client_side_encryption(
columns=columns,
)
assert df_out.shape == (3, len(columns))


@pytest.mark.xfail(
    is_ray_modin,
    raises=pa.lib.ArrowInvalid,
    reason="Ray Modin cannot read from access point because PyArrow doesn't support it",
)
def test_read_from_access_point(access_point_path_path: str) -> None:
    """Round-trip a small frame to S3 via an access-point path and read it back."""
    path = f"{access_point_path_path}test.parquet"
    frame = pd.DataFrame({"c0": [0, 1, 2], "c1": [0, 1, 2], "c2": [0, 0, 1]})
    wr.s3.to_parquet(frame, path)
    round_tripped = wr.s3.read_parquet(path)
    # Expect the original 3 rows x 3 columns back unchanged in shape.
    assert round_tripped.shape == (3, 3)

0 comments on commit cbed803

Please sign in to comment.