From 0ee83dde0301f4b8614ccef190a66f156b7053f5 Mon Sep 17 00:00:00 2001 From: unvalley Date: Fri, 7 Oct 2022 14:19:57 +0900 Subject: [PATCH 1/5] fix: ListingTabUrl prefix decoding --- datafusion/core/Cargo.toml | 1 + datafusion/core/src/datasource/listing/url.rs | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 1ba9f769f3f0..114c0b6a526a 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -80,6 +80,7 @@ ordered-float = "3.0" parking_lot = "0.12" parquet = { version = "24.0.0", features = ["arrow", "async"] } paste = "^1.0" +percent-encoding = "2.2.0" pin-project-lite = "^0.2.7" pyo3 = { version = "0.17.1", optional = true } rand = "0.8" diff --git a/datafusion/core/src/datasource/listing/url.rs b/datafusion/core/src/datasource/listing/url.rs index 8676f2118728..18c922fdc135 100644 --- a/datafusion/core/src/datasource/listing/url.rs +++ b/datafusion/core/src/datasource/listing/url.rs @@ -24,6 +24,7 @@ use itertools::Itertools; use object_store::path::Path; use object_store::{ObjectMeta, ObjectStore}; use url::Url; +use percent_encoding; /// A parsed URL identifying files for a listing table, see [`ListingTableUrl::parse`] /// for more information on the supported expressions @@ -108,7 +109,8 @@ impl ListingTableUrl { /// Creates a new [`ListingTableUrl`] from a url and optional glob expression fn new(url: Url, glob: Option) -> Self { - let prefix = Path::parse(url.path()).expect("should be URL safe"); + let decoded_path = percent_encoding::percent_decode_str(url.path()).decode_utf8_lossy(); + let prefix = Path::parse(decoded_path).expect("should be URL safe"); Self { url, prefix, glob } } @@ -246,8 +248,12 @@ mod tests { let url = ListingTableUrl::parse("file:///foo").unwrap(); let child = Path::parse("/foob/bar").unwrap(); assert!(url.strip_prefix(&child).is_none()); + + let url = ListingTableUrl::parse("file:///with space/foo/bar").unwrap(); + assert_eq!(url.prefix.as_ref(), "with space/foo/bar"); } + #[test] fn test_prefix_s3() { let url = ListingTableUrl::parse("s3://bucket/foo/bar").unwrap(); From 92ef8be41226d8b67768043171bbb770b2ff26fe Mon Sep 17 00:00:00 2001 From: unvalley Date: Fri, 7 Oct 2022 14:26:31 +0900 Subject: [PATCH 2/5] chore: remove waste change --- datafusion/core/src/datasource/listing/url.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/datafusion/core/src/datasource/listing/url.rs b/datafusion/core/src/datasource/listing/url.rs index 18c922fdc135..68d58f6f3282 100644 --- a/datafusion/core/src/datasource/listing/url.rs +++ b/datafusion/core/src/datasource/listing/url.rs @@ -253,7 +253,6 @@ mod tests { assert_eq!(url.prefix.as_ref(), "with space/foo/bar"); } - #[test] fn test_prefix_s3() { let url = ListingTableUrl::parse("s3://bucket/foo/bar").unwrap(); From 65ba95edd68209e9eafc115f88816ead28b7dbcf Mon Sep 17 00:00:00 2001 From: unvalley Date: Fri, 7 Oct 2022 23:21:59 +0900 Subject: [PATCH 3/5] fix: use from instead of parse --- datafusion/core/src/datasource/listing/url.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/src/datasource/listing/url.rs b/datafusion/core/src/datasource/listing/url.rs index 68d58f6f3282..c69744d4e368 100644 --- a/datafusion/core/src/datasource/listing/url.rs +++ b/datafusion/core/src/datasource/listing/url.rs @@ -110,7 +110,7 @@ impl ListingTableUrl { /// Creates a new [`ListingTableUrl`] from a url and optional glob expression fn new(url: Url, glob: Option) -> Self { let decoded_path = percent_encoding::percent_decode_str(url.path()).decode_utf8_lossy(); - let prefix = Path::parse(decoded_path).expect("should be URL safe"); + let prefix = Path::from(decoded_path.as_ref()); Self { url, prefix, glob } } From 439f6dcd488a501ca37015169d55ddbeb307bddc Mon Sep 17 00:00:00 2001 From: unvalley Date: Fri, 7 Oct 2022 23:22:16 +0900 Subject: [PATCH 4/5] test: add test cases for prefix --- datafusion/core/src/datasource/listing/url.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/datafusion/core/src/datasource/listing/url.rs b/datafusion/core/src/datasource/listing/url.rs index c69744d4e368..166d9048c3ba 100644 --- a/datafusion/core/src/datasource/listing/url.rs +++ b/datafusion/core/src/datasource/listing/url.rs @@ -249,8 +249,14 @@ mod tests { let child = Path::parse("/foob/bar").unwrap(); assert!(url.strip_prefix(&child).is_none()); - let url = ListingTableUrl::parse("file:///with space/foo/bar").unwrap(); - assert_eq!(url.prefix.as_ref(), "with space/foo/bar"); + let url = ListingTableUrl::parse("file:///foo/ bar").unwrap(); + assert_eq!(url.prefix.as_ref(), "foo/ bar"); + + let url = ListingTableUrl::parse("file:///foo/bar?").unwrap(); + assert_eq!(url.prefix.as_ref(), "foo/bar"); + + let url = ListingTableUrl::parse("file:///foo/😺").unwrap(); + assert_eq!(url.prefix.as_ref(), "foo/%F0%9F%98%BA"); } #[test] From 7338da4cf05fb17929407d92b94b7397bc25eba7 Mon Sep 17 00:00:00 2001 From: unvalley Date: Tue, 11 Oct 2022 00:38:47 +0900 Subject: [PATCH 5/5] chore: cargo fmt --- datafusion/core/src/datasource/listing/url.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/datafusion/core/src/datasource/listing/url.rs b/datafusion/core/src/datasource/listing/url.rs index 166d9048c3ba..d1a527f23ac7 100644 --- a/datafusion/core/src/datasource/listing/url.rs +++ b/datafusion/core/src/datasource/listing/url.rs @@ -23,8 +23,8 @@ use glob::Pattern; use itertools::Itertools; use object_store::path::Path; use object_store::{ObjectMeta, ObjectStore}; -use url::Url; use percent_encoding; +use url::Url; /// A parsed URL identifying files for a listing table, see [`ListingTableUrl::parse`] /// for more information on the supported expressions @@ -109,7 +109,8 @@ impl ListingTableUrl { /// Creates a new [`ListingTableUrl`] from a url and optional glob expression fn new(url: Url, glob: Option) -> Self { - let decoded_path = percent_encoding::percent_decode_str(url.path()).decode_utf8_lossy(); + let decoded_path = + percent_encoding::percent_decode_str(url.path()).decode_utf8_lossy(); let prefix = Path::from(decoded_path.as_ref()); Self { url, prefix, glob } }