diff --git a/native/core/src/parquet/objectstore/s3.rs b/native/core/src/parquet/objectstore/s3.rs index 6d8f011d0b..cddfd9387a 100644 --- a/native/core/src/parquet/objectstore/s3.rs +++ b/native/core/src/parquet/objectstore/s3.rs @@ -169,19 +169,19 @@ fn extract_s3_config_options( } // Extract and handle path style access (virtual hosted style) - let mut virtual_hosted_style_request = false; + let mut path_style_access = false; if let Some(path_style) = get_config_trimmed(configs, bucket, "path.style.access") { - virtual_hosted_style_request = path_style.to_lowercase() == "true"; + path_style_access = path_style.to_lowercase() == "true"; s3_configs.insert( AmazonS3ConfigKey::VirtualHostedStyleRequest, - virtual_hosted_style_request.to_string(), + (!path_style_access).to_string(), ); } - // Extract endpoint configuration and modify if virtual hosted style is enabled + // Extract endpoint configuration and modify if path style access is enabled if let Some(endpoint) = get_config_trimmed(configs, bucket, "endpoint") { let normalized_endpoint = - normalize_endpoint(endpoint, bucket, virtual_hosted_style_request); + normalize_endpoint(endpoint, bucket, path_style_access); if let Some(endpoint) = normalized_endpoint { s3_configs.insert(AmazonS3ConfigKey::Endpoint, endpoint); } @@ -202,7 +202,7 @@ fn extract_s3_config_options( fn normalize_endpoint( endpoint: &str, bucket: &str, - virtual_hosted_style_request: bool, + force_path_style: bool, ) -> Option { if endpoint.is_empty() { return None; @@ -221,14 +221,31 @@ fn normalize_endpoint( endpoint.to_string() }; - if virtual_hosted_style_request { + if force_path_style { + // Path-style: bucket in path + // Example: https://s3.oss-me-central-1.aliyuncs.com/my-bucket/key if endpoint.ends_with("/") { Some(format!("{endpoint}{bucket}")) } else { Some(format!("{endpoint}/{bucket}")) } } else { - Some(endpoint) // Avoid extra to_string() call since endpoint is already a String + // Virtual-hosted-style: bucket in subdomain + // For custom endpoints (non-AWS), we need to manually insert the bucket as a subdomain + // because object_store crate only does this automatically for AWS S3 endpoints. + // Example: https://my-bucket.s3.oss-me-central-1.aliyuncs.com/key + + // Parse the endpoint to insert bucket as subdomain + if let Some(protocol_end) = endpoint.find("://") { + let protocol = &endpoint[..protocol_end + 3]; // "https://" or "http://" + let rest = &endpoint[protocol_end + 3..]; // "s3.oss-me-central-1.aliyuncs.com" or "s3.oss-me-central-1.aliyuncs.com/path" + + // Insert bucket as first subdomain + Some(format!("{protocol}{bucket}.{rest}")) + } else { + // Shouldn't happen since we add protocol above, but fallback + Some(format!("{bucket}.{endpoint}")) + } } } @@ -1852,14 +1869,95 @@ mod tests { ); } + #[test] + fn test_extract_s3_config_path_style_access() { + // Case 1: path.style.access is true -> VirtualHostedStyleRequest should be false + let mut configs = HashMap::new(); + configs.insert("fs.s3a.path.style.access".to_string(), "true".to_string()); + let s3_configs = extract_s3_config_options(&configs, "test-bucket"); + assert_eq!( + s3_configs.get(&AmazonS3ConfigKey::VirtualHostedStyleRequest), + Some(&"false".to_string()) + ); + + // Case 2: path.style.access is false -> VirtualHostedStyleRequest should be true + let mut configs = HashMap::new(); + configs.insert("fs.s3a.path.style.access".to_string(), "false".to_string()); + let s3_configs = extract_s3_config_options(&configs, "test-bucket"); + assert_eq!( + s3_configs.get(&AmazonS3ConfigKey::VirtualHostedStyleRequest), + Some(&"true".to_string()) + ); + + // Case 3: path.style.access is not set -> VirtualHostedStyleRequest should not be set + let configs = HashMap::new(); + let s3_configs = extract_s3_config_options(&configs, "test-bucket"); + assert_eq!( + s3_configs.get(&AmazonS3ConfigKey::VirtualHostedStyleRequest), + None + ); + } + + + #[test] + fn test_normalize_endpoint() { + let bucket = "test-bucket"; + + // Case 1: force_path_style = true -> should append bucket + assert_eq!( + normalize_endpoint("http://localhost:9000", bucket, true), + Some("http://localhost:9000/test-bucket".to_string()) + ); + assert_eq!( + normalize_endpoint("http://localhost:9000/", bucket, true), + Some("http://localhost:9000/test-bucket".to_string()) + ); + assert_eq!( + normalize_endpoint("https://my-s3-compatible.com", bucket, true), + Some("https://my-s3-compatible.com/test-bucket".to_string()) + ); + + // Case 2: force_path_style = false -> should insert bucket as subdomain + assert_eq!( + normalize_endpoint("http://localhost:9000", bucket, false), + Some("http://test-bucket.localhost:9000".to_string()) + ); + assert_eq!( + normalize_endpoint("https://my-s3-compatible.com", bucket, false), + Some("https://test-bucket.my-s3-compatible.com".to_string()) + ); + assert_eq!( + normalize_endpoint("https://s3.oss-me-central-1.aliyuncs.com", bucket, false), + Some("https://test-bucket.s3.oss-me-central-1.aliyuncs.com".to_string()) + ); + + // Case 3: Default S3 endpoint -> should return None (ignored) + assert_eq!( + normalize_endpoint("s3.amazonaws.com", bucket, true), + None + ); + assert_eq!( + normalize_endpoint("s3.amazonaws.com", bucket, false), + None + ); + + // Case 4: Empty endpoint -> should return None + assert_eq!( + normalize_endpoint("", bucket, true), + None + ); + } + #[test] fn test_extract_s3_config_custom_endpoint() { + // When path.style.access is not set, it defaults to false (virtual-hosted-style) + // So bucket should be inserted as subdomain let cases = vec![ - ("custom.endpoint.com", "https://custom.endpoint.com"), - ("https://custom.endpoint.com", "https://custom.endpoint.com"), + ("custom.endpoint.com", "https://test-bucket.custom.endpoint.com"), + ("https://custom.endpoint.com", "https://test-bucket.custom.endpoint.com"), ( "https://custom.endpoint.com/path/to/resource", - "https://custom.endpoint.com/path/to/resource", + "https://test-bucket.custom.endpoint.com/path/to/resource", ), ]; for (endpoint, configured_endpoint) in cases {