Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 109 additions & 11 deletions native/core/src/parquet/objectstore/s3.rs
Original file line number Diff line number Diff line change
Expand Up @@ -169,19 +169,19 @@ fn extract_s3_config_options(
}

// Extract and handle path style access (virtual hosted style)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// Extract and handle path style access (virtual hosted style)
// Set 'virtual hosted style' config depending on the value of 'path style access'

I am not sure that it is more clear

let mut virtual_hosted_style_request = false;
let mut path_style_access = false;
if let Some(path_style) = get_config_trimmed(configs, bucket, "path.style.access") {
virtual_hosted_style_request = path_style.to_lowercase() == "true";
path_style_access = path_style.to_lowercase() == "true";
s3_configs.insert(
AmazonS3ConfigKey::VirtualHostedStyleRequest,
virtual_hosted_style_request.to_string(),
(!path_style_access).to_string(),
);
}

// Extract endpoint configuration and modify if virtual hosted style is enabled
// Extract endpoint configuration and modify if path style access is enabled
if let Some(endpoint) = get_config_trimmed(configs, bucket, "endpoint") {
let normalized_endpoint =
normalize_endpoint(endpoint, bucket, virtual_hosted_style_request);
normalize_endpoint(endpoint, bucket, path_style_access);
if let Some(endpoint) = normalized_endpoint {
s3_configs.insert(AmazonS3ConfigKey::Endpoint, endpoint);
}
Expand All @@ -202,7 +202,7 @@ fn extract_s3_config_options(
fn normalize_endpoint(
endpoint: &str,
bucket: &str,
virtual_hosted_style_request: bool,
force_path_style: bool,
) -> Option<String> {
if endpoint.is_empty() {
return None;
Expand All @@ -221,14 +221,31 @@ fn normalize_endpoint(
endpoint.to_string()
};

if virtual_hosted_style_request {
if force_path_style {
// Path-style: bucket in path
// Example: https://s3.oss-me-central-1.aliyuncs.com/my-bucket/key
if endpoint.ends_with("/") {
Some(format!("{endpoint}{bucket}"))
} else {
Some(format!("{endpoint}/{bucket}"))
}
} else {
Some(endpoint) // Avoid extra to_string() call since endpoint is already a String
// Virtual-hosted-style: bucket in subdomain
// For custom endpoints (non-AWS), we need to manually insert the bucket as a subdomain
// because object_store crate only does this automatically for AWS S3 endpoints.
// Example: https://my-bucket.s3.oss-me-central-1.aliyuncs.com/key

// Parse the endpoint to insert bucket as subdomain
if let Some(protocol_end) = endpoint.find("://") {
let protocol = &endpoint[..protocol_end + 3]; // "https://" or "http://"
let rest = &endpoint[protocol_end + 3..]; // "s3.oss-me-central-1.aliyuncs.com" or "s3.oss-me-central-1.aliyuncs.com/path"

// Insert bucket as first subdomain
Some(format!("{protocol}{bucket}.{rest}"))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

According to https://docs.aws.amazon.com/AmazonS3/latest/userguide/bucketnamingrules.html#general-purpose-bucket-names

Bucket names can consist only of lowercase letters, numbers, periods (.), and hyphens (-).

usage of periods (.) in the bucket name would create several subdomains. Is that supported ? Or a check should be added and reported somehow ?!

} else {
// Shouldn't happen since we add protocol above, but fallback
Some(format!("{bucket}.{endpoint}"))
}
}
}

Expand Down Expand Up @@ -1852,14 +1869,95 @@ mod tests {
);
}

#[test]
fn test_extract_s3_config_path_style_access() {
// Case 1: path.style.access is true -> VirtualHostedStyleRequest should be false
let mut configs = HashMap::new();
configs.insert("fs.s3a.path.style.access".to_string(), "true".to_string());
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
configs.insert("fs.s3a.path.style.access".to_string(), "true".to_string());
configs.insert("fs.s3a.path.style.access".to_string(), "tRue".to_string());

to test path_style_access = path_style.to_lowercase() == "true"; (line 174)
Or maybe add a separate test for this.

let s3_configs = extract_s3_config_options(&configs, "test-bucket");
assert_eq!(
s3_configs.get(&AmazonS3ConfigKey::VirtualHostedStyleRequest),
Some(&"false".to_string())
);

// Case 2: path.style.access is false -> VirtualHostedStyleRequest should be true
let mut configs = HashMap::new();
configs.insert("fs.s3a.path.style.access".to_string(), "false".to_string());
let s3_configs = extract_s3_config_options(&configs, "test-bucket");
assert_eq!(
s3_configs.get(&AmazonS3ConfigKey::VirtualHostedStyleRequest),
Some(&"true".to_string())
);

// Case 3: path.style.access is not set -> VirtualHostedStyleRequest should not be set
let configs = HashMap::new();
let s3_configs = extract_s3_config_options(&configs, "test-bucket");
assert_eq!(
s3_configs.get(&AmazonS3ConfigKey::VirtualHostedStyleRequest),
None
);
}


#[test]
fn test_normalize_endpoint() {
let bucket = "test-bucket";

// Case 1: force_path_style = true -> should append bucket
assert_eq!(
normalize_endpoint("http://localhost:9000", bucket, true),
Some("http://localhost:9000/test-bucket".to_string())
);
assert_eq!(
normalize_endpoint("http://localhost:9000/", bucket, true),
Some("http://localhost:9000/test-bucket".to_string())
);
assert_eq!(
normalize_endpoint("https://my-s3-compatible.com", bucket, true),
Some("https://my-s3-compatible.com/test-bucket".to_string())
);

// Case 2: force_path_style = false -> should insert bucket as subdomain
assert_eq!(
normalize_endpoint("http://localhost:9000", bucket, false),
Some("http://test-bucket.localhost:9000".to_string())
);
assert_eq!(
normalize_endpoint("https://my-s3-compatible.com", bucket, false),
Some("https://test-bucket.my-s3-compatible.com".to_string())
);
assert_eq!(
normalize_endpoint("https://s3.oss-me-central-1.aliyuncs.com", bucket, false),
Some("https://test-bucket.s3.oss-me-central-1.aliyuncs.com".to_string())
);

// Case 3: Default S3 endpoint -> should return None (ignored)
assert_eq!(
normalize_endpoint("s3.amazonaws.com", bucket, true),
None
);
assert_eq!(
normalize_endpoint("s3.amazonaws.com", bucket, false),
None
);

// Case 4: Empty endpoint -> should return None
assert_eq!(
normalize_endpoint("", bucket, true),
None
);
}

#[test]
fn test_extract_s3_config_custom_endpoint() {
// When path.style.access is not set, it defaults to false (virtual-hosted-style)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// When path.style.access is not set, it defaults to false (virtual-hosted-style)
// When path.style.access is not set, it defaults to false (i.e. virtual-hosted-style=true)

Is this what you mean ?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But https://github.com/apache/datafusion-comet/pull/2803/files#diff-e8053ba6b0c33f3c35806e207a14881282d9c8a83c2a8adeed55d6a91493fb78R1892 says Case 3: path.style.access is not set -> VirtualHostedStyleRequest should not be set

// So bucket should be inserted as subdomain
let cases = vec![
("custom.endpoint.com", "https://custom.endpoint.com"),
("https://custom.endpoint.com", "https://custom.endpoint.com"),
("custom.endpoint.com", "https://test-bucket.custom.endpoint.com"),
("https://custom.endpoint.com", "https://test-bucket.custom.endpoint.com"),
(
"https://custom.endpoint.com/path/to/resource",
"https://custom.endpoint.com/path/to/resource",
"https://test-bucket.custom.endpoint.com/path/to/resource",
),
];
for (endpoint, configured_endpoint) in cases {
Expand Down
Loading